def find_watched_target_for(self, url, source, publishers):
    '''
    Given a URL and an array of publisher strings, determine which
    Watched Target to associate them with.
    '''
    # Find the list of Targets where a seed matches the given URL:
    surt = url_to_surt(url, host_only=True)
    matches = []
    for t in self.targets:
        if t['watched']:
            a_match = False
            for seed in t['seeds']:
                if surt.startswith(url_to_surt(seed, host_only=True)):
                    a_match = True
            if a_match:
                matches.append(t)

    # No matches:
    if len(matches) == 0:
        logger.error("No match found for url %s" % url)
        return None  # raise Exception("No matching target for url " + url)

    # If one match, assume that is the right Target:
    if len(matches) == 1:
        return int(matches[0]['id'])

    #
    # Else multiple matches, so need to disambiguate.
    #
    # Attempt to disambiguate based on source ONLY:
    if source is not None:
        for t in matches:
            for seed in t['seeds']:
                logger.info("Looking for source match '%s' against '%s' " % (source, seed))
                if seed == source:
                    # return int(t['id'])
                    logger.info("Found match source+seed but this is not enough to disambiguate longer crawls.")
                    break

    # Then attempt to disambiguate based on publisher.
    # FIXME Make this a bit more forgiving of punctuation/minor differences.
    title_matches = []
    for t in matches:
        for publisher in publishers:
            logger.info("Looking for publisher match '%s' in title '%s' " % (publisher, t['title']))
            if publisher and publisher.lower() in t['title'].lower():
                title_matches.append(t)
                break

    if len(title_matches) == 0:
        logger.warning("No matching title to associate with url %s " % url)
        return None  # raise Exception("No matching title to associate with url %s " % url)
    elif len(title_matches) == 1:
        return int(title_matches[0]['id'])
    else:
        logger.warning("Too many matching titles for %s" % url)
        for t in title_matches:
            logger.warning("Candidate: %d %s " % (t['id'], t['title']))
        logger.warning("Assuming first match is sufficient... (%s)" % title_matches[0]['title'])
        return int(title_matches[0]['id'])
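# Data-shape sketch, for illustration only: find_watched_target_for() assumes
# each entry in self.targets is a Watched Target record carrying at least the
# 'id', 'title', 'watched' and 'seeds' keys used above. The values below are
# hypothetical. When two such Targets share a host, the host-only SURT match
# returns both of them, and the publisher strings (matched against 'title')
# are what finally disambiguates them.
example_targets = [
    {'id': 1, 'title': 'Example Publisher Reports', 'watched': True,
     'seeds': ['http://www.example.org/reports/']},
    {'id': 2, 'title': 'Example Publisher Blog', 'watched': True,
     'seeds': ['http://www.example.org/blog/']},
]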
def requires(self):
    watched_surts = self.load_watched_surts()
    # Then scan the logs for documents:
    line_count = 0
    with open(self.path, 'r') as f:
        for line in f:
            line_count += 1
            if line_count % 100 == 0:
                # Report progress via the status-message hook, if one is available:
                if callable(getattr(self, "set_status_message", None)):
                    self.set_status_message("Currently at line %i of file %s" % (line_count, self.path))
            # And yield tasks for each relevant document:
            (timestamp, status_code, content_length, url, hop_path, via, mime,
             thread, start_time_plus_duration, digest, source,
             annotations) = re.split(" +", line, maxsplit=11)
            # Skip non-downloads (anything that is not a 2xx response):
            if status_code == '-' or status_code == '' or int(status_code) // 100 != 2:
                continue
            # Check the URL and Content-Type:
            if "application/pdf" in mime:
                document_surt = url_to_surt(url)
                landing_page_surt = url_to_surt(via)
                for prefix in watched_surts:
                    # Are both URIs under the same watched SURT:
                    if document_surt.startswith(prefix) and landing_page_surt.startswith(prefix):
                        logger.info("Found document: %s" % line)
                        # Proceed to extract metadata and pass on to W3ACT:
                        doc = {}
                        doc['wayback_timestamp'] = start_time_plus_duration[:14]
                        doc['landing_page_url'] = via
                        doc['document_url'] = url
                        doc['filename'] = os.path.basename(urlparse(url).path)
                        doc['size'] = int(content_length)
                        # Add some more metadata to the output so we can work out where this came from later:
                        doc['job_name'] = self.job.name
                        doc['launch_id'] = self.launch_id
                        doc['source'] = source
                        logger.info("Found document: %s" % doc)
                        yield ExtractDocumentAndPost(self.job, self.launch_id, doc, source)
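# Illustration only: a made-up log line in the whitespace-separated,
# twelve-field layout (Heritrix crawl.log style) that requires() above appears
# to expect. The URLs, digest and thread id here are hypothetical.
import re

example_line = ("2016-01-01T12:00:00.000Z 200 52341"
                " http://www.example.org/docs/report.pdf LLE"
                " http://www.example.org/docs/ application/pdf #042"
                " 20160101120000000+1234 sha1:EXAMPLEDIGEST"
                " http://www.example.org/ -")
(timestamp, status_code, content_length, url, hop_path, via, mime, thread,
 start_time_plus_duration, digest, source,
 annotations) = re.split(" +", example_line, maxsplit=11)

# The 14-digit wayback_timestamp built above comes from the ninth field:
assert status_code == '200' and mime == 'application/pdf'
assert start_time_plus_duration[:14] == '20160101120000'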
def load_watched_surts(self):
    # First find the watched seeds list:
    with open("%s/%s/%s/watched-surts.txt" % (h3().local_job_folder, self.job.name, self.launch_id)) as reader:
        watched = [line.rstrip('\n') for line in reader]
    logger.info("WATCHED %s" % watched)
    # Convert to SURT form:
    watched_surts = set()
    for url in watched:
        watched_surts.add(url_to_surt(url))
    logger.info("WATCHED SURTS %s" % watched_surts)
    return watched_surts
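# Illustration only: url_to_surt comes from crawl.h3.utils and is not
# re-implemented here. A SURT (Sort-friendly URI Reordering Transform)
# reverses the host components so that plain string prefix matching groups
# URLs by domain. The values below are hypothetical, but the
# "uk,org,example,)/path" shape matches how these SURTs are used elsewhere in
# this code (e.g. the "http://(%s" prefix added when exporting them for Wayback).
example_watched_surts = {"uk,org,example,)/reports"}
example_document_surt = "uk,org,example,)/reports/2016/annual.pdf"
assert any(example_document_surt.startswith(p) for p in example_watched_surts)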
import argparse
import logging

from crawl.w3act.w3act import w3act
from crawl.h3.utils import url_to_surt

LOGGING_FORMAT = "[%(asctime)s] %(levelname)s: %(message)s"
logging.basicConfig(format=LOGGING_FORMAT, level=logging.DEBUG)
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Grab Open Access targets and output to a file in SURT form.')
    parser.add_argument('--act-url', dest='act_url', type=str,
                        default="https://www.webarchive.org.uk/act/",
                        help="ACT endpoint to use. [default: %(default)s]")
    parser.add_argument('--act-username', dest='act_username', type=str,
                        help="ACT username to use. [default: %(default)s]")
    parser.add_argument('--act-password', dest='act_password', type=str,
                        help="ACT password to use. [default: %(default)s]")
    parser.add_argument('output_file', metavar='output file', nargs='?',
                        default="/wayback/ldhosts.txt",
                        help="Output file to create, e.g. '/wayback/ldhosts.txt'.")
    args = parser.parse_args()

    w = w3act(args.act_url, args.act_username, args.act_password)
    items = w.get_oa_export("all")
    surts = ["http://(%s" % url_to_surt(u) for t in items for u in t["seeds"]]
    with open(args.output_file, "w") as o:
        o.write("\n".join(surts))
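# Note: the result is one SURT prefix per line, of the form
# "http://(uk,org,example,)/..." for each seed, written to a file such as
# /wayback/ldhosts.txt; presumably that file is then picked up by the Wayback
# service as its list of open-access hosts, though this is an assumption based
# on the default path rather than anything in the script itself.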