def output(self):
    outputs = []
    # Get HDFS client:
    client = luigi.contrib.hdfs.WebHdfsClient()
    # Find log files:
    parent_path = "/heritrix/output/%s/%s/logs" % (self.job, self.launch_id)
    for listed_item in client.listdir(parent_path):
        # Oddly, depending on the implementation, the listed path may be absolute or basename-only, so fix here:
        item = os.path.basename(listed_item)
        item_path = os.path.join(parent_path, item)
        if item.endswith(".lck"):
            logger.error("Lock file should not be present on HDFS! %s" % item_path)
            pass
        elif item.startswith("crawl.log"):
            outputs.append(luigi.contrib.hdfs.HdfsTarget(path=item_path, format=Plain))
            #logger.debug("Including %s" % item)
        else:
            pass
            #logger.debug("Skipping %s" % item)
    # Return the logs to be processed:
    return outputs
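# For context, a downstream Luigi task would typically consume the crawl.log
# HdfsTargets returned above via requires()/input(). A minimal sketch follows;
# the task names AnalyseCrawlLogs and ListCrawlLogs are hypothetical, used here
# only to illustrate the wiring, and are not part of this codebase.
import luigi

class AnalyseCrawlLogs(luigi.Task):
    job = luigi.Parameter()
    launch_id = luigi.Parameter()

    def requires(self):
        # Hypothetical upstream task whose output() is the list of crawl.log HdfsTargets above:
        return ListCrawlLogs(job=self.job, launch_id=self.launch_id)

    def run(self):
        for target in self.input():
            # Each item is a luigi.contrib.hdfs.HdfsTarget; stream its lines:
            with target.open('r') as f:
                for line in f:
                    pass  # e.g. parse timestamps or status codes here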
def find_watched_target_for(self, url, source, publishers):
    '''
    Given a URL, a source seed and an array of publisher strings, determine
    which Watched Target to associate them with.
    '''
    # Find the list of Targets where a seed matches the given URL:
    surt = url_to_surt(url, host_only=True)
    matches = []
    for t in self.targets:
        if t['watched']:
            a_match = False
            for seed in t['seeds']:
                if surt.startswith(url_to_surt(seed, host_only=True)):
                    a_match = True
            if a_match:
                matches.append(t)

    # No matches:
    if len(matches) == 0:
        logger.error("No match found for url %s" % url)
        return None
        # raise Exception("No matching target for url "+url)

    # If one match, assume that is the right Target:
    if len(matches) == 1:
        return int(matches[0]['id'])

    # Else multiple matches, so we need to disambiguate.

    # Attempt to disambiguate based on source ONLY:
    if source is not None:
        for t in matches:
            for seed in t['seeds']:
                logger.info("Looking for source match '%s' against '%s'" % (source, seed))
                if seed == source:
                    # return int(t['id'])
                    logger.info("Found source+seed match, but this is not enough to disambiguate longer crawls.")
                    break

    # Then attempt to disambiguate based on publisher.
    # FIXME Make this a bit more forgiving of punctuation/minor differences.
    title_matches = []
    for t in matches:
        for publisher in publishers:
            logger.info("Looking for publisher match '%s' in title '%s'" % (publisher, t['title']))
            if publisher and publisher.lower() in t['title'].lower():
                title_matches.append(t)
                break

    if len(title_matches) == 0:
        logger.warning("No matching title to associate with url %s" % url)
        return None
        # raise Exception("No matching title to associate with url %s" % url)
    elif len(title_matches) == 1:
        return int(title_matches[0]['id'])
    else:
        logger.warning("Too many matching titles for %s" % url)
        for t in title_matches:
            logger.warning("Candidate: %d %s" % (t['id'], t['title']))
        logger.warning("Assuming first match is sufficient... (%s)" % title_matches[0]['title'])
        return int(title_matches[0]['id'])
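# The seed matching above compares host-level SURT prefixes. As a rough
# illustration of what that comparison looks like, here is a toy stand-in for
# url_to_surt(url, host_only=True); it is an assumption for demonstration only,
# not the project's actual implementation.
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

def host_surt_sketch(url):
    # Reverse the host components into SURT ordering, ignoring the path
    # (mirroring host_only=True), e.g. "http://www.gov.uk/x" -> "uk,gov,www,)/"
    host = urlparse(url).netloc.split(':')[0].lower()
    return ','.join(reversed(host.split('.'))) + ',)/'

# A candidate URL matches a seed when its SURT starts with the seed's SURT prefix:
assert host_surt_sketch("http://www.gov.uk/government/publications").startswith(
    host_surt_sketch("http://www.gov.uk/"))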
def run(self):
    """Zips up all log/config files and copies said archive to HDFS; finds the earliest timestamp in the logs."""
    # Set up remote connection:
    rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
    # Set up the output, first making sure the full path exists:
    with self.output().open('w') as f:
        f.write('')
    self.output().remove()
    # What to remove from the paths:
    chop = len(str(h3().local_prefix))
    with zipfile.ZipFile(self.output().path, 'w', allowZip64=True) as zipout:
        # Crawl log:
        for crawl_log in remote_ls("%s/logs/%s/%s" % (h3().local_output_folder, self.job, self.launch_id),
                                   "/crawl.log%s" % get_stage_suffix(self.stage), rf):
            logger.info("Found %s..." % os.path.basename(crawl_log))
            self.add_remote_file(zipout, crawl_log[chop:], crawl_log, rf)
        # Error log(s):
        for log in remote_ls("%s/logs/%s/%s" % (LOCAL_OUTPUT_FOLDER, self.job, self.launch_id),
                             "/*-errors.log%s" % get_stage_suffix(self.stage), rf):
            logger.info("Found %s..." % os.path.basename(log))
            self.add_remote_file(zipout, log[chop:], log, rf)
        # Job text files:
        for txt in remote_ls("%s/%s/%s" % (LOCAL_JOB_FOLDER, self.job, self.launch_id), "/*.txt", rf):
            self.add_remote_file(zipout, txt[chop:], txt, rf)
        # Job JSON files:
        for txt in remote_ls("%s/%s/%s" % (LOCAL_JOB_FOLDER, self.job, self.launch_id), "/*.json", rf):
            logger.info("Found %s..." % os.path.basename(txt))
            self.add_remote_file(zipout, txt[chop:], txt, rf)
        # Job crawler definition:
        cxml = "%s/%s/%s/crawler-beans.cxml" % (LOCAL_JOB_FOLDER, self.job, self.launch_id)
        if rf.exists(cxml):
            logger.info("Found config...")
            self.add_remote_file(zipout, cxml[chop:], cxml, rf)
        else:
            logger.error("Cannot find config.")
            raise Exception("Cannot find config.")
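# The add_remote_file() helper used above isn't shown in this excerpt. One
# plausible sketch of such a method, assuming the task has a self.host
# attribute, is to stream each remote file over SSH and write it into the zip;
# this is an illustration, not the project's actual implementation.
def add_remote_file(self, zipout, zip_path, remote_path, rf):
    # rf is kept for signature compatibility; here the file is read via a RemoteTarget.
    target = luigi.contrib.ssh.RemoteTarget(remote_path, self.host)
    with target.open('r') as f:
        zipout.writestr(zip_path, f.read())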
def mdex(self):
    '''
    Metadata extraction and target association.
    '''
    # Pass the document through a different extractor based on how the URL starts:
    try:
        if self.doc["document_url"].startswith("https://www.gov.uk/"):
            self.mdex_gov_uk_publications()
        elif self.doc["document_url"].startswith("http://www.ifs.org.uk/"):
            self.mdex_ifs_reports()
        else:
            self.mdex_default()
    except Exception as e:
        logger.error("Ignoring error during extraction for document %s and landing page %s" %
                     (self.doc['document_url'], self.doc['landing_page_url']))
        logger.exception(e)

    if 'title' not in self.doc or not self.doc['title']:
        logger.info("Falling back on default extraction logic...")
        self.mdex_default()

    logger.info("GOT %s" % self.doc)

    # Look up which Target this URL should be associated with:
    if self.targets and 'landing_page_url' in self.doc:
        logger.info("Looking for match for %s source %s and publishers '%s'" %
                    (self.doc['landing_page_url'], self.source, self.doc.get('publishers', [])))
        self.doc['target_id'] = self.find_watched_target_for(
            self.doc['landing_page_url'], self.source, self.doc.get('publishers', []))

    # If there is no association, drop it:
    if not self.doc.get('target_id', None) and self.null_if_no_target_found:
        logger.critical("Failed to associate document with any target: %s" % self.doc)
        return None

    # If the publisher appears unambiguous, store it where it can be re-used:
    if len(self.doc.get('publishers', [])) == 1:
        self.doc['publisher'] = self.doc['publishers'][0]

    # Or return the modified version:
    return self.doc
def __init__(self, url, email, password):
    self.url = url.rstrip("/")
    loginUrl = "%s/login" % self.url
    logger.info("Logging into %s as %s" % (loginUrl, email))
    response = requests.post(loginUrl, data={"email": email, "password": password})
    if not response.history:
        logger.error("W3ACT Login failed!")
        raise Exception("W3ACT Login Failed!")
    self.cookie = response.history[0].headers["set-cookie"]
    self.get_headers = {
        "Cookie": self.cookie,
    }
    self.up_headers = {
        "Cookie": self.cookie,
        "Content-Type": "application/json"
    }
    self.ld_cache = CachedDict()
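# A brief usage sketch of the client above. The class name W3act and the
# /api/targets path are assumptions for illustration only; consult the W3ACT
# API documentation for the real class name and endpoints.
import requests

w3act = W3act("https://w3act.example.org/act", "user@example.org", "secret")
response = requests.get("%s/api/targets" % w3act.url, headers=w3act.get_headers)
response.raise_for_status()
targets = response.json()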
def _file_exists(path, rf):
    """
    Checks whether the given file exists and has content - allowed to be '.open' at this point.

    Also checks on HDFS if there is no local file.

    :type path: str
    """
    logger.info("Looking for %s" % path)
    if rf.exists(path) and not rf.isdir(path):  # and rf.getsize(path) > 0:
        return True
    elif rf.exists("%s.open" % path) and not rf.isdir("%s.open" % path):  # and rf.getsize("%s.open" % path) > 0:
        return True
    else:
        try:
            if get_hdfs_target(path).exists():
                return True
        except luigi.contrib.hdfs.error.HDFSCliError as e:
            logger.error("Exception while checking HDFS.")
            logger.exception(e)
        return False
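# The get_hdfs_target() helper used above isn't shown in this excerpt; a
# minimal sketch, assuming it simply wraps the path as a Luigi HDFS target:
def get_hdfs_target(path):
    # Wrap the HDFS path so .exists() can be checked via Luigi's HDFS client.
    return luigi.contrib.hdfs.HdfsTarget(path=path)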