def requires(self):
    # FIXME Need to make sure this copes with crawl.log.TIMESTAMP etc. from failures.
    # Set up remote connection:
    rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
    # Look for checkpoints to package, and package them in the correct order:
    outputs = {}
    is_final = False
    for item_path in remote_ls("%s/logs/%s/%s" % (LOCAL_OUTPUT_FOLDER, self.job, self.launch_id), "crawl.log*", rf):
        item = os.path.basename(item_path)
        logger.info("ITEM %s" % item)
        if item == "crawl.log":
            is_final = True
            outputs["final"] = item
        elif item.endswith(".lck"):
            pass
        else:
            outputs[item[-14:]] = item
    output_list = sorted(outputs.keys())
    logger.info("Ordered by date: %s" % output_list)
    # Build up the list of stages, in order, and aggregate:
    aggregate = list()
    for key in output_list:
        aggregate.append(outputs[key])
        yield AggregateOutputs(self.host, self.job, self.launch_id, key, aggregate)
def run(self):
    # Sort inputs by checkpoint timestamp and merge into versioned lists:
    aggregate = {}
    if isinstance(self.input(), list):
        inputs = self.input()
    else:
        inputs = [self.input()]
    # Build up the aggregate:
    for input in inputs:
        logger.info("Reading %s" % input.path)
        item = json.load(input.open())
        for key in item.keys():
            if isinstance(item[key], list):
                current = aggregate.get(key, [])
                current.extend(item[key])
            elif isinstance(item[key], dict):
                current = aggregate.get(key, {})
                current.update(item[key])
            elif item[key]:
                current = item[key]
            aggregate[key] = current
    logger.info("Aggregate: %s" % aggregate)

    with self.output().open('w') as f:
        f.write('{}'.format(json.dumps(aggregate, indent=4)))
def watch_target(self, tid):
    target = {}
    target['watchedTarget'] = {}
    target['watchedTarget']['documentUrlScheme'] = ""
    logger.info("PUT %d %s" % (tid, json.dumps(target)))
    r = requests.put("%s/api/targets/%d" % (self.url, tid),
                     headers=self.up_headers, data=json.dumps(target))
    return r
def run(self):
    # Set up remote connection:
    rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
    logs = [self.get_crawl_log(rf)]
    start_date = self.file_start_date(logs)
    # Find the WARCs referenced from the crawl log:
    (warcs, viral) = self.parse_crawl_log(logs)
    # TODO Look for WARCs not spotted via the logs and add them in (ALSO allow this in the log parser)
    if self.stage == 'final':
        for item in remote_ls(self.warc_file_path(), "*.warc.gz", rf):
            if item not in warcs:
                logger.info("Found additional WARC: %s" % item)
                warcs.append(item)
        #
        for item in remote_ls(self.viral_file_path(), "*.warc.gz", rf):
            if item not in warcs:
                logger.info("Found additional Viral WARC: %s" % item)
                warcs.append(item)

    # TODO Get sha512 and ARK identifiers for WARCs now, and store in launch folder and thus the zip?
    # Loop over all the WARCs involved:
    i = 0
    hashes = {}
    for warc in warcs:
        # Do the hard work of hashing each WARC remotely:
        i += 1
        self.set_status_message("Progress: Hashing WARC %d of %s" % (i, len(warcs)))
        hash_output = yield CalculateRemoteHash(self.host, warc)
        with hash_output.open('r') as reader:
            sha = reader.read().rstrip('\n')
        hashes[warc] = sha
        # Report on progress...
        self.set_status_message("Progress: Hashed WARC %d of %s" % (i, len(warcs)))

    # Bundle logs and configuration data into a zip and upload it to HDFS:
    zips = [PackageLogs(self.host, self.job, self.launch_id, self.stage).output().path]

    # FIXME Need to mint and add in ARKs at this point:

    # Output the job package summary:
    job_output = {
        'job_id': self.job,
        'launch_id': self.launch_id,
        'start_date': start_date,
        'warcs': warcs,
        'viral': viral,
        'logs': logs,
        'zips': zips,
        'hashes': hashes
    }

    with self.output().open('w') as f:
        f.write('{}'.format(json.dumps(job_output, indent=4)))
def get(self, key, fn, duration):
    if key not in self \
            or self[key].timeStamp + self[key].duration < time.time():
        logger.info('adding new value for %s' % key)
        o = fn(key)
        self[key] = CachedItem(key, o, duration)
    else:
        logger.info('loading from cache for key %s' % key)
    return self[key].value
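# A minimal sketch of the cached record the get() method above assumes. The real
# CachedItem/CachedDict classes are not shown here; the attribute names
# ('timeStamp', 'duration', 'value') are inferred from the usage above, and the
# containing cache is evidently a dict subclass (cf. CachedDict() below).
import time

class CachedItem(object):
    def __init__(self, key, value, duration=60):
        self.key = key
        self.value = value
        self.duration = duration
        self.timeStamp = time.time()

    def __repr__(self):
        return '<CachedItem {%s:%s} expires at: %s>' % (
            self.key, self.value, self.timeStamp + self.duration)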
def run(self):
    # Require that the file is old:
    if True:  # FIXME placeholder: the age check is not implemented yet, so the file is always treated as too young.
        logger.info("But this file is too young to assume we're done: %s " % self.path)
        return PendingFile(self.job, self.launch_id, self.path)
    # Okay to move:
    return MoveToHdfs(self.job, self.launch_id, self.path, self.delete_local)
def post_target(self, url, title):
    target = {}
    target['field_urls'] = [url]
    target['title'] = title
    target['selector'] = 1
    target['field_scope'] = "root"
    target['field_depth'] = "CAPPED"
    target['field_ignore_robots_txt'] = False
    logger.info("POST %s" % (json.dumps(target)))
    r = requests.post("%s/api/targets" % self.url,
                      headers=self.up_headers, data=json.dumps(target))
    return r
def run_doc_mdex_test_extraction(self, url, lpu, src, title):
    logger.info("Looking at document URL: %s" % url)
    doc = {}
    doc['document_url'] = url
    doc['landing_page_url'] = lpu
    targets = json.load(self.input()['targets'].open('r'))
    doc = DocumentMDEx(targets, doc, src, null_if_no_target_found=False).mdex()
    logger.info(json.dumps(doc))
    if doc.get('title', None) != title:
        raise Exception("Wrong title found for this document! '%s' v '%s'" % (doc['title'], title))
def run(self):
    # Copy up to HDFS:
    client = luigi.contrib.hdfs.get_autoconfig_client(threading.local())
    logger.info("HDFS hash, pre: %s" % client.client.checksum(self.output().path))
    with open(self.path, 'r') as f:
        client.client.write(data=f, hdfs_path=self.output().path, overwrite=True)
    logger.info("HDFS hash, post: %s" % client.client.checksum(self.output().path))
def run(self):
    # Read the file in and write it to HDFS:
    with self.input().open('r') as reader:
        with self.output().open('w') as writer:
            logger.info("Uploading %s to %s" % (self.input().path, self.output().path))
            while True:
                chunk = reader.read(DEFAULT_BUFFER_SIZE)
                if not chunk:
                    break
                writer.write(chunk)
def run(self):
    open_path = "%s.open" % self.path
    if os.path.isfile(open_path) and not os.path.isfile(self.path):
        logger.info("Found an open file that needs closing: %s " % open_path)
        # Require that the file is old:
        if True:  # FIXME placeholder: the age check is not implemented yet, so the file is never actually closed here.
            logger.info("But this file is too young to close: %s " % open_path)
            return PendingFile(self.job, self.launch_id, self.path)
        # Closing it, as it's old enough:
        shutil.move(open_path, self.path)
def update_target_schedule(self, tid, frequency, start_date, end_date=None):
    target = {}
    target['field_crawl_frequency'] = frequency.upper()
    sd = dateutil.parser.parse(start_date)
    target['field_crawl_start_date'] = int(time.mktime(sd.timetuple()))
    if end_date:
        ed = dateutil.parser.parse(end_date)
        target['field_crawl_end_date'] = int(time.mktime(ed.timetuple()))
    else:
        target['field_crawl_end_date'] = 0
    logger.info("PUT %d %s" % (tid, json.dumps(target)))
    r = requests.put("%s/api/targets/%d" % (self.url, tid),
                     headers=self.up_headers, data=json.dumps(target))
    return r
def requires(self):
    logger.info("Looking in %s %s" % (self.job, self.launch_id))
    # Look in /heritrix/output/wren files and move them to the /warcs/ folder:
    tasks = []
    warc_glob = "%s/*-%s-%s-*.warc.gz" % (WREN_FOLDER, self.job, self.launch_id)
    logger.info("Looking for WREN outputs: %s" % warc_glob)
    for wren_item in glob.glob(warc_glob):
        tasks.append(MoveToWarcsFolder(self.job, self.launch_id, wren_item))
    yield tasks
    # Look in warcs and viral for WARCs e.g in /heritrix/output/{warcs|viral}/{job}/{launch_id}
    tasks = []
    for out_type in ['warcs', 'viral']:
        glob_path = "%s/%s/%s/%s/*.warc.gz" % (CRAWL_OUTPUT_FOLDER, self.job, self.launch_id, out_type)
        logger.info("GLOB:%s" % glob_path)
        for item in glob.glob(glob_path):
            logger.info("ITEM:%s" % item)
            tasks.append(MoveToHdfs(self.job, self.launch_id, item, self.delete_local))
    # Yield these as a group, so they can run in parallel:
    if len(tasks) > 0:
        yield tasks
    # And look for /heritrix/output/logs:
    tasks = []
    for log_item in glob.glob("%s/%s/%s/logs/*.log*" % (CRAWL_OUTPUT_FOLDER, self.job, self.launch_id)):
        if os.path.splitext(log_item)[1] == '.lck':
            continue
        elif os.path.splitext(log_item)[1] == '.log':
            # Only move files with the '.log' suffix if this job is no longer running:
            logger.info("Using MoveToHdfsIfOld for %s" % log_item)
            tasks.append(MoveToHdfsIfOld(self.job, self.launch_id, log_item, self.delete_local))
        else:
            tasks.append(MoveToHdfs(self.job, self.launch_id, log_item, self.delete_local))
    # Yield these as a group, so any MoveToHdfsIfOld tasks don't prevent MoveToHdfs from running:
    if len(tasks) > 0:
        yield tasks
def requires(self):
    h = webhdfs()
    logs = []
    for path in ["/heritrix/output/logs/dc0-%s" % self.dc_id,
                 "/heritrix/output/logs/dc1-%s" % self.dc_id,
                 "/heritrix/output/logs/dc2-%s" % self.dc_id,
                 "/heritrix/output/logs/dc3-%s" % self.dc_id]:
        for item in h.list(path):
            if item.startswith('crawl.log'):
                logs.append("%s/%s" % (path, item))
    logger.info("Found %i log files..." % len(logs))
    yield SummariseLogFiles(logs, 'dc', self.dc_id, True)
def run(self):
    logger.info(self.launch_id)
    is_final = False
    outputs = []
    for input in self.input():
        if input.path.endswith(".final"):
            is_final = True
        outputs.append(input.path)
    # only report complete success if...
    if is_final:
        with self.output().open('w') as f:
            f.write('{}'.format(json.dumps(outputs, indent=4)))
    else:
        yield CheckJobStopped(self.host, self.job, self.launch_id)
def calculate_sips(self):
    i_new_sips = 0
    o_dirs = self.client.list("/heritrix/sips/")
    logger.info(o_dirs)
    for o_dir in o_dirs:
        logger.info(o_dir)
        o_sips = self.client.list("/heritrix/sips/%s/" % o_dir["pathSuffix"])["FileStatuses"]["FileStatus"]
        for o_sip in o_sips:
            i_mod = datetime.fromtimestamp(o_sip["modificationTime"] / 1000)
            if i_mod > (datetime.now() + relativedelta(months=-1)):
                i_new_sips += 1
    logger.debug('New SIPs = ' + str(i_new_sips))
    return i_new_sips
def enumerate_launches(self):
    # Set up connection:
    rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
    # Look for jobs that need to be processed:
    for date in self.date_interval:
        for job_item in remote_ls(LOCAL_JOB_FOLDER, "*", rf):
            job = os.path.basename(job_item)
            if rf.isdir(job_item):
                launch_glob = "%s*" % date.strftime('%Y%m%d')
                logger.info("Looking for job launch folders matching %s" % launch_glob)
                for launch_item in remote_ls(job_item, launch_glob, rf):
                    logger.info("Found %s" % launch_item)
                    if rf.isdir(launch_item):
                        launch = os.path.basename(launch_item)
                        yield (self.host, job, launch)
def complete(self):
    # Read local:
    local = luigi.LocalTarget(path=self.source_path)
    with local.open('r') as reader:
        local_hash = hashlib.sha512(reader.read().encode('utf-8')).hexdigest()
        logger.info("LOCAL HASH: %s" % local_hash)
    # Read from HDFS:
    client = luigi.contrib.hdfs.WebHdfsClient()
    if not client.exists(self.target_path):
        return False
    with client.client.read(self.target_path) as reader:
        hdfs_hash = hashlib.sha512(reader.read()).hexdigest()
        logger.info("HDFS HASH: %s" % hdfs_hash)
    # If they match, we are good:
    return hdfs_hash == local_hash
def __init__(self, url, email, password):
    self.url = url.rstrip("/")
    loginUrl = "%s/login" % self.url
    logger.info("Logging into %s as %s " % (loginUrl, email))
    response = requests.post(loginUrl, data={"email": email, "password": password})
    if not response.history:
        logger.error("W3ACT Login failed!")
        raise Exception("W3ACT Login Failed!")
    self.cookie = response.history[0].headers["set-cookie"]
    self.get_headers = {
        "Cookie": self.cookie,
    }
    self.up_headers = {
        "Cookie": self.cookie,
        "Content-Type": "application/json"
    }
    self.ld_cache = CachedDict()
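# A minimal usage sketch for the client whose constructor is shown above. The
# class name 'W3act', the endpoint and the credentials are illustrative
# assumptions, not taken from the original source:
w3 = W3act("https://w3act.example.org/act", "user@example.org", "secret")  # hypothetical class name and details
# Create a new Target, then put it on a weekly crawl schedule:
r = w3.post_target("http://example.org/", "Example target")
tid = r.json()['id']  # assumes the API echoes back the new target's id
w3.update_target_schedule(tid, "weekly", "2017-01-01")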
def run(self):
    # Load the targets:
    with self.input().open() as f:
        all_targets = json.load(f)
    # Grab detailed target data:
    logger.info("Filtering detailed information for %i targets..." % len(all_targets))
    # Filter...
    targets = []
    for t in all_targets:
        if t['crawl_frequency'] is None:
            logger.warning("No crawl frequency set for %s" % t)
        elif t['crawl_frequency'].lower() == self.frequency.lower():
            targets.append(t)
    # Persist to disk:
    with self.output().open('w') as f:
        f.write('{}'.format(json.dumps(targets, indent=4)))
def run(self):
    # Read in sha512:
    with self.input()[0].open('r') as f:
        local_hash = f.readline()
    logger.info("Got local hash %s" % local_hash)
    # Re-download and get the hash:
    with self.input()[1].open('r') as f:
        hdfs_hash = f.readline()
    logger.info("Got HDFS hash %s" % hdfs_hash)
    if local_hash != hdfs_hash:
        raise Exception("Local & HDFS hashes do not match for %s" % self.path)
    # Otherwise, move to HDFS was good, so delete the local copy:
    if self.delete_local:
        os.remove(str(self.path))
    # ...and write out success:
    with self.output().open('w') as f:
        f.write(hdfs_hash)
def mdex_default(self):
    ''' Default extractor uses landing page for title etc.'''
    # Grab the landing page URL as HTML:
    r = requests.get(self.lp_wb_url())
    h = html.fromstring(r.content)
    h.make_links_absolute(self.doc["landing_page_url"])
    logger.info("Looking for links...")
    # Attempt to find the nearest prior header:
    for a in h.xpath("//a[@href]"):
        if self.doc["document_url"] in a.attrib["href"]:
            element = a
            # Try a preceding match:
            for hel in a.xpath("./preceding::*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6][1]"):
                # logger.info("EEE %s" % hel.text_content())
                logger.info("Preceding header %s" % hel.text_content())
                self.doc['title'] = hel.text_content().strip()
                return
            # Try recursing up the tree (I think this is superseded by the preceding-match logic above).
            while element.getparent() is not None:
                element = element.getparent()
                # logger.info("ELEMENT %s " % element)
                # logger.info("ELEMENT %s " % element.text_content())
                for hel in element.xpath(".//*[self::h2 or self::h3 or self::h4 or self::h5]"):
                    logger.info("header %s" % hel.text_content())
                    self.doc['title'] = hel.text_content().strip()
                    return
            self.doc['title'] = a.text_content()
            return
    # Extract a title from the first header, or failing that, the page title:
    self.doc['title'] = self._get0(h.xpath("//h1/text()")).strip()
    if not self.doc['title']:
        self.doc['title'] = self._get0(h.xpath("//title/text()")).strip()
def run(self):
    """Zips up all log/config. files and copies said archive to HDFS; finds the
    earliest timestamp in the logs."""
    # Set up remote connection:
    rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
    # Set up the output, first making sure the full path exists:
    with self.output().open('w') as f:
        f.write('')
    self.output().remove()
    # What to remove from the paths:
    chop = len(str(h3().local_prefix))
    with zipfile.ZipFile(self.output().path, 'w', allowZip64=True) as zipout:
        # Crawl log:
        for crawl_log in remote_ls("%s/logs/%s/%s" % (h3.local_output_folder, self.job, self.launch_id),
                                   "/crawl.log%s" % get_stage_suffix(self.stage), rf):
            logger.info("Found %s..." % os.path.basename(crawl_log))
            self.add_remote_file(zipout, crawl_log[chop:], crawl_log, rf)
        # Error log(s):
        for log in remote_ls("%s/logs/%s/%s" % (LOCAL_OUTPUT_FOLDER, self.job, self.launch_id),
                             "/*-errors.log%s" % get_stage_suffix(self.stage), rf):
            logger.info("Found %s..." % os.path.basename(log))
            self.add_remote_file(zipout, log[chop:], log, rf)
        # Job text files:
        for txt in remote_ls("%s/%s/%s" % (LOCAL_JOB_FOLDER, self.job, self.launch_id), "/*.txt", rf):
            self.add_remote_file(zipout, txt[chop:], txt, rf)
        # Job json files:
        for txt in remote_ls("%s/%s/%s" % (LOCAL_JOB_FOLDER, self.job, self.launch_id), "/*.json", rf):
            logger.info("Found %s..." % os.path.basename(txt))
            self.add_remote_file(zipout, txt[chop:], txt, rf)
        # Job crawler definition:
        cxml = "%s/%s/%s/crawler-beans.cxml" % (LOCAL_JOB_FOLDER, self.job, self.launch_id)
        if rf.exists(cxml):
            logger.info("Found config...")
            self.add_remote_file(zipout, cxml[chop:], cxml, rf)
        else:
            logger.error("Cannot find config.")
            raise Exception("Cannot find config.")
@staticmethod
def _file_exists(path, rf):
    """
    Checks whether the given file exists and has content - allowed to be '.open' at this point.

    Also checks on HDFS if there is no local file.

    :type path: str
    """
    # Note: takes no 'self' and is called as self._file_exists(...) elsewhere, so presumably a @staticmethod.
    logger.info("Looking for %s" % path)
    if rf.exists(path) and not rf.isdir(path):  # and rf.getsize(path) > 0:
        return True
    elif rf.exists("%s.open" % path) and not rf.isdir("%s.open" % path):  # and rf.getsize("%s.open" % path) > 0:
        return True
    else:
        try:
            if get_hdfs_target(path).exists():
                return True
        except luigi.contrib.hdfs.error.HDFSCliError as e:
            logger.error("Exception while checking HDFS.")
            logger.exception(e)
        return False
def find_watched_target_for(self, url, source, publishers):
    '''
    Given a URL and an array of publisher strings, determine which Watched Target to associate them with.
    '''
    # Find the list of Targets where a seed matches the given URL:
    surt = url_to_surt(url, host_only=True)
    matches = []
    for t in self.targets:
        if t['watched']:
            a_match = False
            for seed in t['seeds']:
                if surt.startswith(url_to_surt(seed, host_only=True)):
                    a_match = True
            if a_match:
                matches.append(t)

    # No matches:
    if len(matches) == 0:
        logger.error("No match found for url %s" % url)
        return None  # raise Exception("No matching target for url "+url)
    # If one match, assume that is the right Target:
    if len(matches) == 1:
        return int(matches[0]['id'])
    #
    # Else multiple matches, so need to disambiguate.
    #
    # Attempt to disambiguate based on source ONLY:
    if source is not None:
        for t in matches:
            for seed in t['seeds']:
                logger.info("Looking for source match '%s' against '%s' " % (source, seed))
                if seed == source:
                    # return int(t['id'])
                    logger.info("Found match source+seed but this is not enough to disambiguate longer crawls.")
                    break
    # Then attempt to disambiguate based on publisher
    # FIXME Make this a bit more forgiving of punctuation/minor differences
    title_matches = []
    for t in matches:
        for publisher in publishers:
            logger.info("Looking for publisher match '%s' in title '%s' " % (publisher, t['title']))
            if publisher and publisher.lower() in t['title'].lower():
                title_matches.append(t)
                break
    if len(title_matches) == 0:
        logger.warning("No matching title to associate with url %s " % url)
        return None  # raise Exception("No matching title to associate with url %s " % url)
    elif len(title_matches) == 1:
        return int(title_matches[0]['id'])
    else:
        logger.warning("Too many matching titles for %s" % url)
        for t in title_matches:
            logger.warning("Candidate: %d %s " % (t['id'], t['title']))
        logger.warning("Assuming first match is sufficient... (%s)" % title_matches[0]['title'])
        return int(title_matches[0]['id'])
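# A rough sketch of the kind of helper find_watched_target_for() relies on: it
# reduces a URL to a SURT-style, host-only key so seeds and URLs can be compared
# by prefix. This is an illustrative assumption, not the project's actual
# url_to_surt() implementation.
from urllib.parse import urlparse

def url_to_surt(url, host_only=False):
    parsed = urlparse(url)
    host = parsed.netloc.split(':')[0].lower()
    # Reverse the host components, e.g. 'www.example.co.uk' -> 'uk,co,example,www':
    surt_host = ','.join(reversed(host.split('.')))
    if host_only:
        return surt_host
    return "%s)%s" % (surt_host, parsed.path)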
@staticmethod
def uploader(local_path, hdfs_path):
    """
    Copy up to HDFS, making it suitably atomic by using a temporary filename during upload.

    Done as a static method to prevent accidental confusion of self.path/self.output().path etc.

    :return: None
    """
    # Set up the HDFS client:
    client = luigi.contrib.hdfs.get_autoconfig_client(threading.local())
    # Create the temporary file name:
    tmp_path = "%s.temp" % hdfs_path
    # Now upload the file, allowing overwrites as this is a temporary file and
    # simultaneous updates should not be possible:
    logger.info("Uploading as %s" % tmp_path)
    with open(local_path, 'r') as f:
        client.client.write(data=f, hdfs_path=tmp_path, overwrite=True)
    # Check if the destination file exists and raise an exception if so:
    if client.exists(hdfs_path):
        raise Exception("Path %s already exists! This should never happen!" % hdfs_path)
    # Move the uploaded file into the right place:
    client.client.rename(tmp_path, hdfs_path)
    # Give the namenode a moment to catch up with itself and then check it's there:
    # FIXME I suspect this is only needed for our ancient HDFS
    time.sleep(2)
    status = client.client.status(hdfs_path)
    # Log successful upload:
    logger.info("Upload completed for %s" % hdfs_path)
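# A minimal usage sketch for uploader() above, assuming it is exposed as a
# static method on a task class (the name 'UploadFileToHdfs' is hypothetical)
# and that the local and HDFS directory layouts mirror each other:
local_path = "/heritrix/output/warcs/frequent/20170101120000/example.warc.gz"  # illustrative path
hdfs_path = local_path  # same path on HDFS, by convention
# Note: uploader() raises if the destination already exists, so the copy is effectively write-once:
UploadFileToHdfs.uploader(local_path, hdfs_path)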
def get_crawl_log(self, rf):
    # First, parse the crawl log(s) and determine the WARC file names:
    logfilepath = "%s/logs/%s/%s/crawl.log%s" % (LOCAL_OUTPUT_FOLDER, self.job, self.launch_id,
                                                 get_stage_suffix(self.stage))
    logger.info("Looking for crawl logs stage: %s" % self.stage)
    logger.info("Looking for crawl logs: %s" % logfilepath)
    if rf.exists(logfilepath):
        logger.info("Found %s..." % os.path.basename(logfilepath))
        return logfilepath
    else:
        raise Exception("Log file '%s' not found!" % logfilepath)
def _get_json(self, url):
    js = None
    try:
        logger.info("Getting URL: %s" % url)
        r = requests.get(url, headers=self.get_headers)
        if r.status_code == 200:
            js = json.loads(r.content)
        else:
            logger.info(r.status_code)
            logger.info(r.text)
    except:
        logger.warning(str(sys.exc_info()[0]))
        logger.warning(str(traceback.format_exc()))
    return js
def enumerate_launches(self):
    # Look for jobs that need to be processed:
    for date in self.date_interval:
        logger.info("Looking at date %s" % date)
        for job_item in glob.glob("%s/*" % CRAWL_OUTPUT_FOLDER):
            job = os.path.basename(job_item)
            if os.path.isdir(job_item):
                launch_glob = "%s/%s*" % (job_item, date.strftime('%Y%m%d'))
                logger.info("Looking for job launch folders matching %s" % launch_glob)
                for launch_item in glob.glob(launch_glob):
                    logger.info("Found %s" % launch_item)
                    if os.path.isdir(launch_item):
                        launch = os.path.basename(launch_item)
                        yield (job, launch)
def mdex(self):
    '''
    Metadata extraction and target association.
    '''
    # Pass the document through a different extractor based on how the URL starts:
    try:
        if self.doc["document_url"].startswith("https://www.gov.uk/"):
            self.mdex_gov_uk_publications()
        elif self.doc["document_url"].startswith("http://www.ifs.org.uk/"):
            self.mdex_ifs_reports()
        else:
            self.mdex_default()
    except Exception as e:
        logger.error("Ignoring error during extraction for document %s and landing page %s" %
                     (self.doc['document_url'], self.doc['landing_page_url']))
        logger.exception(e)

    if 'title' not in self.doc or not self.doc['title']:
        logger.info("Falling back on default extraction logic...")
        self.mdex_default()

    logger.info("GOT %s" % self.doc)

    # Look up which Target this URL should be associated with:
    if self.targets and 'landing_page_url' in self.doc:
        logger.info("Looking for match for %s source %s and publishers '%s'" %
                    (self.doc['landing_page_url'], self.source, self.doc.get('publishers', [])))
        self.doc['target_id'] = self.find_watched_target_for(self.doc['landing_page_url'], self.source,
                                                             self.doc.get('publishers', []))

    # If there is no association, drop it:
    if not self.doc.get('target_id', None) and self.null_if_no_target_found:
        logger.critical("Failed to associate document with any target: %s" % self.doc)
        return None

    # If the publisher appears unambiguous, store it where it can be re-used:
    if len(self.doc.get('publishers', [])) == 1:
        self.doc['publisher'] = self.doc['publishers'][0]

    # Or return the modified version:
    return self.doc
def run(self):
    client = luigi.contrib.hdfs.WebHdfsClient()
    # Upload to a temporary file:
    temp_path = "%s.temp" % self.target_path
    logger.info("Uploading to %s" % temp_path)
    with open(str(self.source_path)) as f:
        client.client.write(hdfs_path=temp_path, data=f.read(), overwrite=self.overwrite)
    # Remove any existing file, if we're allowed to:
    if self.overwrite:
        if client.exists(self.target_path):
            logger.info("Removing %s..." % self.target_path)
            client.remove(self.target_path, skip_trash=True)
    # And rename into place:
    logger.info("Renaming to %s" % self.target_path)
    client.rename(temp_path, self.target_path)
    # Give the namenode a moment to catch up with itself and then check it's there:
    # FIXME I suspect this is only needed for our ancient HDFS
    time.sleep(10)
    status = client.client.status(self.target_path)
def parse_crawl_log(self, logs):
    """
    Parses the crawl log to check the WARCs are present.
    :return:
    """
    # Set up remote connection:
    rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
    warcfiles = set()
    remote_log = luigi.contrib.ssh.RemoteTarget(logs[0], self.host)
    with remote_log.open('r') as f:
        for line in f:
            parts = re.split(" +", line, maxsplit=11)
            # Skip failed downloads:
            if parts[1] == '-' or parts[1] == '' or int(parts[1]) <= 0:
                if parts[1] == '':
                    logger.info("Skipping line with empty status! '%s' from log file '%s'" % (line, logs[0]))
                continue
            # Skip locally-resolved DNS records:
            if parts[1] == "1001":
                logger.debug("Skipping finding WARC for locally-defined hostname: %s" % parts[3])
                continue
            # Attempt to parse JSON:
            try:
                (annotations, line_json) = re.split("{", parts[11], maxsplit=1)
                line_json = "{%s" % line_json
                # logger.debug("LOG JSON: %s" % line_json)
                # logger.debug("LOG ANNOTATIONS: %s" % annotations)
                jmd = json.loads(line_json)
            except Exception as e:
                logger.info("LOG LINE: %s" % line)
                logger.info("LOG LINE parts[11]: %s" % parts[11])
                logger.exception(e)
                raise e
            if 'warcFilename' in jmd:
                warcfiles.add(jmd['warcFilename'])
            elif 'warcPrefix' in jmd:
                for wren in remote_ls(LOCAL_WREN_FOLDER, "%s*.warc.gz*" % jmd['warcPrefix'], rf):
                    if wren.endswith('.open'):
                        wren = wren[:-5]
                    warcfiles.add(os.path.basename(wren))
                # Also check in case file has already been moved into output/warcs/{job}/{launch}:
                for wren in remote_ls(self.warc_file_path(), "%s*.warc.gz*" % jmd['warcPrefix'], rf):
                    warcfiles.add(os.path.basename(wren))
                # FIXME Also look on HDFS for matching files?
            else:
                logger.warning("No WARC file entry found for line: %s" % line)

    warcs = []
    viral = []
    for warcfile in warcfiles:
        if self._file_exists("%s/%s" % (self.viral_file_path(), warcfile), rf):
            logger.info("Found Viral WARC %s/%s" % (self.viral_file_path(), warcfile))
            viral.append("%s/%s" % (self.viral_file_path(), warcfile))
        elif self._file_exists("%s/%s" % (LOCAL_WREN_FOLDER, warcfile), rf):
            logger.info("Found WREN WARC %s" % warcfile)
            warcs.append("%s/%s" % (LOCAL_WREN_FOLDER, warcfile))
        elif self._file_exists("%s/%s" % (self.warc_file_path(), warcfile), rf):
            logger.info("Found WARC %s/%s" % (self.warc_file_path(), warcfile))
            warcs.append("%s/%s" % (self.warc_file_path(), warcfile))
        else:
            raise Exception("Cannot find WARC file %s" % warcfile)

    return warcs, viral
def _get_ld_export(self, frequency):
    qurl = "%s/api/crawl/feed/ld/%s" % (self.url, frequency)
    logger.info("Getting %s" % qurl)
    return self._get_json(qurl)
def get_json(self, path):
    path = path.lstrip("/")
    qurl = "%s/%s" % (self.url, path)
    logger.info("Getting %s" % qurl)
    return self._get_json(qurl)
def update_target_selector(self, tid, uid):
    target = {}
    target['selector'] = uid
    logger.info("PUT %d %s" % (tid, json.dumps(target)))
    r = requests.put("%s/api/targets/%d" % (self.url, tid),
                     headers=self.up_headers, data=json.dumps(target))
    return r
def output(self):
    t = get_hdfs_target(self.path)
    logger.info("Output is %s" % t.path)
    return t
def scan_job_launch(self, job, launch):
    logger.info("Looking at moving files for %s %s" % (job, launch))
    yield MoveFilesForLaunch(job, launch, self.delete_local)
def process_output(self, job, launch):
    logger.info("Processing %s/%s" % (job, launch))
    yield GenerateWarcStats(job, launch)
def requires(self):
    # Enumerate the jobs:
    for (job, launch) in self.enumerate_launches():
        logger.info("Processing %s/%s" % (job, launch))
        yield self.scan_job_launch(job, launch)