def run(self):
    # initialise counters for each TLD and legal-deposit criterion
    d_counts = {'total': {}}
    for key in ('.uk', '.scot', '.wales', '.cymru', '.london', 'not_uk',
                'uk_domain', 'uk_geoip', 'uk_postal_address',
                'via_correspondence', 'prof_judgement'):
        d_counts['total'][key] = 0
    i_wct_uids = 0
    a_orgs = []
    a_schedules = []

    # enable and start logging
    logger = logging.getLogger()
    logger.debug('Script initialized')

    # get counts
    self.process_frequent_exports(d_counts, a_schedules)
    ## i_wct_uids = get_ukwa_licensed_content(w3act_exporter, logger)
    #i_new_instances = self.calculate_instances()
    i_new_instances = 0
    #i_new_sips = self.calculate_sips()
    i_new_sips = 0

    # calculate organisations and schedules
    a_orgs = Counter(a_orgs)
    a_schedules = Counter(a_schedules)

    # output results
    self.output_results(d_counts, a_orgs, i_wct_uids, a_schedules, i_new_sips,
                        i_new_instances, self.a_ldls, logger)
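# Sketch of what the Counter() calls above produce: a_schedules ends up as a
# flat list with one entry per target per crawl frequency, and Counter
# collapses it into per-frequency totals for output_results(). (The values
# below are illustrative only.)
from collections import Counter

a_schedules = ['daily', 'daily', 'weekly', 'daily', 'monthly']
print(Counter(a_schedules))  # Counter({'daily': 3, 'weekly': 1, 'monthly': 1})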
def check_hash(path, file_hash):
    logger.debug("Checking file %s hash %s" % (path, file_hash))
    if len(file_hash) != 128:
        raise Exception("%s hash not 128 character length [%s]" % (path, len(file_hash)))
    if not all(c in string.hexdigits for c in file_hash):
        raise Exception("%s hash not all hex [%s]" % (path, file_hash))
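# A minimal sketch of the invariant check_hash() enforces: hashlib.sha512()
# always yields a 128-character hexadecimal digest, so anything shorter or
# containing non-hex characters indicates a truncated or corrupt hash file.
# (The payload is illustrative.)
import hashlib
import string

digest = hashlib.sha512(b"example payload").hexdigest()
assert len(digest) == 128
assert all(c in string.hexdigits for c in digest)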
def run(self):
    logger.debug("file %s to hash" % (self.path))
    t = luigi.LocalTarget(self.path)
    with t.open('r') as reader:
        file_hash = hashlib.sha512(reader.read()).hexdigest()
    # test hash
    CalculateLocalHash.check_hash(self.path, file_hash)
    with self.output().open('w') as f:
        f.write(file_hash)
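# Note: reader.read() above pulls the whole file into memory before hashing.
# For large WARCs a chunked digest keeps memory flat; a minimal sketch of that
# alternative (not the task's current behaviour; the path is illustrative):
import hashlib

def sha512_of_file(path, blocksize=1024 * 1024):
    h = hashlib.sha512()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(blocksize), b''):
            h.update(block)
    return h.hexdigest()

# e.g. sha512_of_file('/heritrix/output/example.warc.gz')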
def mdex_gov_uk_publications(self):
    # Start by grabbing the Link-rel-up header to refine the landing page url:
    # e.g. https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/497662/accidents-involving-illegal-alcohol-levels-2014.pdf
    # Link: <https://www.gov.uk/government/statistics/reported-road-casualties-in-great-britain-estimates-involving-illegal-alcohol-levels-2014>; rel="up"
    r = requests.head(url=self.doc_wb_url())
    if 'up' in r.links:
        lpu = r.links['up']
        self.doc["landing_page_url"] = lpu['url']
    # Grab the landing page URL as HTML
    logger.debug("Downloading and parsing: %s" % self.doc['landing_page_url'])
    r = requests.get(self.lp_wb_url())
    h = html.fromstring(r.content)
    # Extract the metadata:
    logger.debug('xpath/title %s' % h.xpath('//header//h1/text()'))
    self.doc['title'] = self._get0(h.xpath('//header//h1/text()'))
    self.doc['publication_date'] = self._get0(
        h.xpath("//aside[contains(@class, 'meta')]//time/@datetime"))[0:10]
    if self.doc['publication_date'] == '':
        self.doc.pop('publication_date')
    self.doc['publishers'] = h.xpath(
        "//aside[contains(@class, 'meta')]//a[contains(@class, 'organisation-link')]/text()")
    # Look through the landing page for links, and find the metadata section corresponding to the document:
    for a in h.xpath("//a"):
        if self.doc["document_url"] in urljoin(self.doc["landing_page_url"], a.attrib["href"]):
            if ("class" in a.getparent().getparent().attrib) and \
                    a.getparent().getparent().attrib["class"] == "attachment-details":
                div = a.getparent().getparent()
                # Process title, allowing document title metadata to override:
                lp_title = self._get0(div.xpath("./h2[@class='title']/a/text()"))
                if len(lp_title) > 0:
                    self.doc['title'] = lp_title
                # Process references
                refs = div.xpath("./p/span[@class='references']")
                # We also need to look out for Command and Act papers and match them by modifying the publisher list
                for ref in refs:
                    isbn = self._get0(ref.xpath("./span[@class='isbn']/text()"))
                    if len(isbn) > 0:
                        self.doc['isbn'] = isbn
                    if len(ref.xpath("./span[starts-with(text(), 'HC') or starts-with(text(), 'Cm') or starts-with(text(), 'CM')]")) > 0:
                        self.doc['publishers'] = ["Command and Act Papers"]
    if not self.doc['title']:
        raise Exception('Title extraction failed! Metadata extraction for this target should be reviewed.')
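# Sketch of the Link-header step above: requests parses Link headers into
# response.links, keyed by their rel value, so a gov.uk attachment served with
# `Link: <...>; rel="up"` exposes its landing page URL directly. (The URL is
# the example from the comment above; the result depends on the live response.)
import requests

r = requests.head('https://www.gov.uk/government/uploads/system/uploads/attachment_data/'
                  'file/497662/accidents-involving-illegal-alcohol-levels-2014.pdf')
if 'up' in r.links:
    print(r.links['up']['url'])  # the refined landing page URL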
def calculate_instances(self):
    i_new_instances = 0
    o_targets = self.client.list(
        "/data/wayback/cdx-index/")["FileStatuses"]["FileStatus"]
    for o_target in o_targets:
        o_instances = self.client.list(
            "/data/wayback/cdx-index/%s/" % o_target["pathSuffix"])["FileStatuses"]["FileStatus"]
        for o_instance in o_instances:
            i_mod = datetime.fromtimestamp(o_instance["modificationTime"] / 1000)
            # count instances modified within the last month
            if i_mod > (datetime.now() + relativedelta(months=-1)):
                i_new_instances += 1
    logger.debug('New instances = ' + str(i_new_instances))
    return i_new_instances
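# Sketch of the "modified in the last month" window used above: WebHDFS
# reports modificationTime in milliseconds since the epoch, and adding
# relativedelta(months=-1) to now() gives the cut-off one month back.
# (The timestamp value is illustrative.)
from datetime import datetime
from dateutil.relativedelta import relativedelta

cutoff = datetime.now() + relativedelta(months=-1)
modified = datetime.fromtimestamp(1456790400000 / 1000)  # example WebHDFS modificationTime
print(modified > cutoff)  # True only if modified within the last month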
def run(self):
    logger.debug("file %s to hash" % (self.path))
    # get hash for local or hdfs file
    t = self.input()
    client = luigi.contrib.hdfs.get_autoconfig_client(threading.local())
    # Having to side-step the first client as it seems to be buggy/use an old API - note also confused put()
    with client.client.read(str(t.path)) as reader:
        file_hash = hashlib.sha512(reader.read()).hexdigest()
    # test hash
    CalculateLocalHash.check_hash(self.path, file_hash)
    with self.output().open('w') as f:
        f.write(file_hash)
def calculate_sips(self):
    i_new_sips = 0
    o_dirs = self.client.list("/heritrix/sips/")
    logger.info(o_dirs)
    for o_dir in o_dirs:
        logger.info(o_dir)
        o_sips = self.client.list(
            "/heritrix/sips/%s/" % o_dir["pathSuffix"])["FileStatuses"]["FileStatus"]
        for o_sip in o_sips:
            i_mod = datetime.fromtimestamp(o_sip["modificationTime"] / 1000)
            # count SIPs modified within the last month
            if i_mod > (datetime.now() + relativedelta(months=-1)):
                i_new_sips += 1
    logger.debug('New SIPs = ' + str(i_new_sips))
    return i_new_sips
def get_ukwa_licensed_content(self, w3act_exporter, logger):
    i_wct_uids = 0
    logger.debug('Getting W3ACT export get_by_all')
    try:
        export_all = w3act_exporter.get_by_export("all")
        i_wct_uids = len(export_all)
        logger.debug('Size of get_by_all export ' + str(i_wct_uids))
    except Exception:
        logger.debug('get_by_all export failed')
        i_wct_uids = 'ERROR: stats.py script failed to export get_by_all from W3ACT'
    return i_wct_uids
def parse_crawl_log(self, logs):
    """
    Parses the crawl log to check the WARCs are present.
    :return:
    """
    # Set up remote connection:
    rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
    warcfiles = set()
    remote_log = luigi.contrib.ssh.RemoteTarget(logs[0], self.host)
    with remote_log.open('r') as f:
        for line in f:
            parts = re.split(" +", line, maxsplit=11)
            # Skip failed downloads:
            if parts[1] == '-' or parts[1] == '' or int(parts[1]) <= 0:
                if parts[1] == '':
                    logger.info("Skipping line with empty status! '%s' from log file '%s'" % (line, logs[0]))
                continue
            # Skip locally-resolved DNS records
            if parts[1] == "1001":
                logger.debug("Skipping finding WARC for locally-defined hostname: %s" % parts[3])
                continue
            # Attempt to parse JSON
            try:
                (annotations, line_json) = re.split("{", parts[11], maxsplit=1)
                line_json = "{%s" % line_json
                # logger.debug("LOG JSON: %s" % line_json)
                # logger.debug("LOG ANNOTATIONS: %s" % annotations)
                jmd = json.loads(line_json)
            except Exception as e:
                logger.info("LOG LINE: %s" % line)
                logger.info("LOG LINE part[11]: %s" % parts[11])
                logger.exception(e)
                raise e
            if 'warcFilename' in jmd:
                warcfiles.add(jmd['warcFilename'])
            elif 'warcPrefix' in jmd:
                for wren in remote_ls(LOCAL_WREN_FOLDER, "%s*.warc.gz*" % jmd['warcPrefix'], rf):
                    if wren.endswith('.open'):
                        wren = wren[:-5]
                    warcfiles.add(os.path.basename(wren))
                # Also check in case file has already been moved into output/warcs/{job}/{launch}:
                for wren in remote_ls(self.warc_file_path(), "%s*.warc.gz*" % jmd['warcPrefix'], rf):
                    warcfiles.add(os.path.basename(wren))
                # FIXME Also look on HDFS for matching files?
            else:
                logger.warning("No WARC file entry found for line: %s" % line)

    warcs = []
    viral = []
    for warcfile in warcfiles:
        if self._file_exists("%s/%s" % (self.viral_file_path(), warcfile), rf):
            logger.info("Found Viral WARC %s/%s" % (self.viral_file_path(), warcfile))
            viral.append("%s/%s" % (self.viral_file_path(), warcfile))
        elif self._file_exists("%s/%s" % (LOCAL_WREN_FOLDER, warcfile), rf):
            logger.info("Found WREN WARC %s" % warcfile)
            warcs.append("%s/%s" % (LOCAL_WREN_FOLDER, warcfile))
        elif self._file_exists("%s/%s" % (self.warc_file_path(), warcfile), rf):
            logger.info("Found WARC %s/%s" % (self.warc_file_path(), warcfile))
            warcs.append("%s/%s" % (self.warc_file_path(), warcfile))
        else:
            raise Exception("Cannot find WARC file %s" % warcfile)
    return warcs, viral
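# Sketch of the field layout the parser above assumes: splitting a crawl-log
# line on runs of spaces with maxsplit=11 leaves the fetch status in parts[1],
# the URL in parts[3], and the annotations plus appended JSON "extra info" in
# parts[11], from which the WARC filename is read. (The line below is made up
# purely to show that shape; real log lines will differ.)
import json
import re

line = ('2016-03-01T12:00:00.000Z 200 6173 http://example.co.uk/ L '
        'http://example.co.uk/robots.txt text/html #042 - sha1:EXAMPLE - - '
        'duplicate:digest{"warcFilename":"EXAMPLE-20160301-00000.warc.gz"}')
parts = re.split(" +", line, maxsplit=11)
annotations, line_json = re.split("{", parts[11], maxsplit=1)
jmd = json.loads("{%s" % line_json)
print(jmd.get('warcFilename'))  # EXAMPLE-20160301-00000.warc.gz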
def process_frequent_exports(self, d_counts, a_schedules):
    # for each crawl frequency
    for frequency in self.input():
        logger.debug('Reading W3ACT export for ' + frequency)
        freq_export = json.load(self.input()[frequency].open())

        # initialise per-frequency counters
        d_counts[frequency] = {}
        for key in ('.uk', '.scot', '.wales', '.cymru', '.london', 'not_uk',
                    'uk_domain', 'uk_geoip', 'uk_postal_address',
                    'via_correspondence', 'prof_judgement'):
            d_counts[frequency][key] = 0

        # for each frequency with collected data, count URL country codes
        if freq_export is None:
            logger.debug("None returned for " + frequency)
        else:
            for node in freq_export:
                #logger.info(node)
                a_schedules.append(frequency)  # This doesn't really make sense, I think?
                for url in [u["url"] for u in node["fieldUrls"]]:
                    netloc = urlparse(url).netloc
                    if netloc.endswith(".uk"):
                        d_counts[frequency]['.uk'] += 1
                        d_counts[frequency]['uk_domain'] += 1
                    elif netloc.endswith(".london"):
                        d_counts[frequency]['.london'] += 1
                        d_counts[frequency]['uk_domain'] += 1
                    elif netloc.endswith(".wales"):
                        d_counts[frequency]['.wales'] += 1
                        d_counts[frequency]['uk_domain'] += 1
                    elif netloc.endswith(".cymru"):
                        d_counts[frequency]['.cymru'] += 1
                        d_counts[frequency]['uk_domain'] += 1
                    elif netloc.endswith(".scot"):
                        d_counts[frequency]['.scot'] += 1
                        d_counts[frequency]['uk_domain'] += 1
                    else:
                        d_counts[frequency]['not_uk'] += 1
                if node["field_uk_hosting"]:
                    d_counts[frequency]['uk_geoip'] += 1
                if node["field_uk_postal_address"]:
                    d_counts[frequency]['uk_postal_address'] += 1
                if node["field_via_correspondence"]:
                    d_counts[frequency]['via_correspondence'] += 1
                if node["field_professional_judgement"]:
                    d_counts[frequency]['prof_judgement'] += 1

        # log frequency counts
        for subset in sorted(d_counts[frequency]):
            logger.debug("\t" + subset + " = " + str(d_counts[frequency][subset]))
        # accumulate total values
        for subset in d_counts[frequency]:
            d_counts['total'][subset] += d_counts[frequency][subset]

    # log count totals
    for subset in sorted(d_counts['total']):
        logger.info(subset + " = " + str(d_counts['total'][subset]))
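# Sketch of the URL classification above: only the hostname part
# (urlparse().netloc) is inspected, and any UK-tied suffix (.uk, .scot,
# .wales, .cymru, .london) also counts towards 'uk_domain'.
# (The example URLs are illustrative.)
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

for url in ['http://www.example.ac.uk/page', 'https://example.scot/', 'http://example.com/']:
    netloc = urlparse(url).netloc
    is_uk = netloc.endswith(('.uk', '.scot', '.wales', '.cymru', '.london'))
    print("%s -> %s" % (netloc, 'uk_domain' if is_uk else 'not_uk'))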