Example #1
    def find_watched_target_for(self, url, source, publishers):
        '''
        Given a URL and an array of publisher strings, determine which Watched Target to associate them with.
        '''
        # Find the list of Targets where a seed matches the given URL
        surt = url_to_surt(url, host_only=True)
        matches = []
        for t in self.targets:
            if t['watched']:
                a_match = False
                for seed in t['seeds']:
                    if surt.startswith(url_to_surt(seed, host_only=True)):
                        a_match = True
                if a_match:
                    matches.append(t)

        # No matches:
        if len(matches) == 0:
            logger.error("No match found for url %s" % url)
            return None
        # raise Exception("No matching target for url "+url)
        # If one match, assume that is the right Target:
        if len(matches) == 1:
            return int(matches[0]['id'])
        #
        # Else multiple matches, so need to disambiguate.
        #
        # Attempt to disambiguate based on source ONLY:
        if source is not None:
            for t in matches:
                for seed in t['seeds']:
                    logger.info("Looking for source match '%s' against '%s' " % (source, seed))
                    if seed == source:
                        # return int(t['id'])
                        logger.info("Found match source+seed but this is not enough to disambiguate longer crawls.")
                        break
        # Then attempt to disambiguate based on publisher
        # FIXME Make this a bit more forgiving of punctuation/minor differences
        title_matches = []
        for t in matches:
            for publisher in publishers:
                logger.info("Looking for publisher match '%s' in title '%s' " % (publisher, t['title']))
                if publisher and publisher.lower() in t['title'].lower():
                    title_matches.append(t)
                    break
        if len(title_matches) == 0:
            logger.warning("No matching title to associate with url %s " % url)
            return None
        # raise Exception("No matching title to associate with url %s " % url)
        elif len(title_matches) == 1:
            return int(title_matches[0]['id'])
        else:
            logger.warning("Too many matching titles for %s" % url)
            for t in title_matches:
                logger.warning("Candidate: %d %s " % (t['id'], t['title']))
            logger.warning("Assuming first match is sufficient... (%s)" % title_matches[0]['title'])
            return int(title_matches[0]['id'])
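A minimal sketch of how this method might be exercised, assuming it sits on a class whose `targets` list has already been loaded from W3ACT. The `TargetFinder` class name and the sample target dicts below are hypothetical, shown only to illustrate the expected shape of `self.targets` ('id', 'title', 'watched', 'seeds') and the disambiguation path:

class TargetFinder(object):
    # Hypothetical holder; in practice self.targets is populated from W3ACT.
    def __init__(self, targets):
        self.targets = targets

    # ... find_watched_target_for(self, url, source, publishers) as defined above ...

finder = TargetFinder([
    {'id': 1, 'title': 'Example Department', 'watched': True,
     'seeds': ['http://www.example.gov.uk/']},
    {'id': 2, 'title': 'Example Agency', 'watched': True,
     'seeds': ['http://www.example.gov.uk/agency/']},
])
# Both seeds share one host, so host-only SURT matching alone is ambiguous here;
# the publisher string is what should narrow the match down to target 2:
target_id = finder.find_watched_target_for(
    'http://www.example.gov.uk/agency/report.pdf',
    'http://www.example.gov.uk/agency/',
    ['Example Agency'])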
Example #2
 def requires(self):
     watched_surts = self.load_watched_surts()
     # Then scan the logs for documents:
     line_count = 0
     with open(self.path, 'r') as f:
         for line in f:
             line_count += 1
             if line_count % 100 == 0:
                 self.set_status_message = "Currently at line %i of file %s" % (
                     line_count, self.path)
             # And yield tasks for each relevant document:
             (timestamp, status_code, content_length, url, hop_path, via,
              mime, thread, start_time_plus_duration, hash, source,
              annotations) = re.split(" +", line, maxsplit=11)
             # Skip non-downloads:
             if status_code == '-' or status_code == '' or int(status_code) // 100 != 2:
                 continue
             # Check the URL and Content-Type:
             if "application/pdf" in mime:
                 for prefix in watched_surts:
                     document_surt = url_to_surt(url)
                     landing_page_surt = url_to_surt(via)
                     # Are both URIs under the same watched SURT:
                     if document_surt.startswith(
                             prefix) and landing_page_surt.startswith(
                                 prefix):
                         logger.info("Found document: %s" % line)
                         # Proceed to extract metadata and pass on to W3ACT:
                         doc = {}
                         doc['wayback_timestamp'] = start_time_plus_duration[:
                                                                             14]
                         doc['landing_page_url'] = via
                         doc['document_url'] = url
                         doc['filename'] = os.path.basename(
                             urlparse(url).path)
                         doc['size'] = int(content_length)
                         # Add some more metadata to the output so we can work out where this came from later:
                         doc['job_name'] = self.job.name
                         doc['launch_id'] = self.launch_id
                         doc['source'] = source
                         logger.info("Found document: %s" % doc)
                         yield ExtractDocumentAndPost(
                             self.job, self.launch_id, doc, source)
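The re.split(" +", line, maxsplit=11) call above assumes a Heritrix-style crawl-log line with twelve whitespace-separated columns. The log line below is fabricated, purely to illustrate how the fields unpack and where the 14-digit Wayback timestamp comes from:

import re

line = ("2016-01-01T12:00:00.000Z 200 123456 "
        "http://www.example.gov.uk/docs/report.pdf LL "
        "http://www.example.gov.uk/docs/ application/pdf "
        "#042 20160101120000000+1234 sha1:EXAMPLEDIGEST "
        "http://www.example.gov.uk/ -")

(timestamp, status_code, content_length, url, hop_path, via,
 mime, thread, start_time_plus_duration, hash, source,
 annotations) = re.split(" +", line, maxsplit=11)

print(status_code)                    # '200'
print(mime)                           # 'application/pdf'
print(start_time_plus_duration[:14])  # '20160101120000' -> doc['wayback_timestamp']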
Example #3
 def load_watched_surts(self):
     # First find the watched seeds list:
     with open("%s/%s/%s/watched-surts.txt" % (h3().local_job_folder, self.job.name, self.launch_id)) as reader:
         watched = [line.rstrip('\n') for line in reader]
         logger.info("WATCHED %s" % watched)
     # Convert to SURT form:
     watched_surts = set()
     for url in watched:
         watched_surts.add(url_to_surt(url))
     logger.info("WATCHED SURTS %s" % watched_surts)
     return watched_surts
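For orientation, SURTs rewrite the host so that plain string prefixes group URLs by domain and path. The exact strings produced by url_to_surt are specific to crawl.h3.utils, so the stand-in below only mimics the general idea and is not the real conversion:

from urllib.parse import urlparse  # 'from urlparse import urlparse' on Python 2

def toy_surt(url):
    # Simplified stand-in for url_to_surt: reverse the host labels so that
    # startswith() can answer "is this URL under that watched seed?".
    parts = urlparse(url)
    host = ",".join(reversed(parts.hostname.split(".")))
    return "%s)%s" % (host, parts.path)

watched_surts = {toy_surt("http://www.example.gov.uk/publications/")}
candidate = toy_surt("http://www.example.gov.uk/publications/2016/report.pdf")
print(any(candidate.startswith(prefix) for prefix in watched_surts))  # True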
Example #4
 def load_watched_surts(self):
     # First find the watched seeds list:
     with open("%s/%s/%s/watched-surts.txt" % (h3().local_job_folder, self.job.name, self.launch_id)) as reader:
         watched = [line.rstrip('\n') for line in reader]
         logger.info("WATCHED %s" % watched)
     # Convert to SURT form:
     watched_surts = set()
     for url in watched:
         watched_surts.add(url_to_surt(url))
     logger.info("WATCHED SURTS %s" % watched_surts)
     return watched_surts
Example #5
 def requires(self):
     watched_surts = self.load_watched_surts()
     # Then scan the logs for documents:
     line_count = 0
     with open(self.path, 'r') as f:
         for line in f:
             line_count += 1
             if line_count % 100 == 0:
                 self.set_status_message = "Currently at line %i of file %s" % (line_count, self.path)
             # And yield tasks for each relevant document:
             (timestamp, status_code, content_length, url, hop_path, via, mime,
              thread, start_time_plus_duration, hash, source, annotations) = re.split(" +", line, maxsplit=11)
             # Skip non-downloads:
             if status_code == '-' or status_code == '' or int(status_code) // 100 != 2:
                 continue
             # Check the URL and Content-Type:
             if "application/pdf" in mime:
                 # Convert both URIs once, then test them against each watched prefix:
                 document_surt = url_to_surt(url)
                 landing_page_surt = url_to_surt(via)
                 for prefix in watched_surts:
                     # Are both URIs under the same watched SURT:
                     if document_surt.startswith(prefix) and landing_page_surt.startswith(prefix):
                         logger.info("Found document: %s" % line)
                         # Proceed to extract metadata and pass on to W3ACT:
                         doc = {}
                         doc['wayback_timestamp'] = start_time_plus_duration[:14]
                         doc['landing_page_url'] = via
                         doc['document_url'] = url
                         doc['filename'] = os.path.basename(urlparse(url).path)
                         doc['size'] = int(content_length)
                         # Add some more metadata to the output so we can work out where this came from later:
                         doc['job_name'] = self.job.name
                         doc['launch_id'] = self.launch_id
                         doc['source'] = source
                         logger.info("Found document: %s" % doc)
                         yield ExtractDocumentAndPost(self.job, self.launch_id, doc, source)
Example #6
    def find_watched_target_for(self, url, source, publishers):
        '''
        Given a URL and an array of publisher strings, determine which Watched Target to associate them with.
        '''
        # Find the list of Targets where a seed matches the given URL
        surt = url_to_surt(url, host_only=True)
        matches = []
        for t in self.targets:
            if t['watched']:
                a_match = False
                for seed in t['seeds']:
                    if surt.startswith(url_to_surt(seed, host_only=True)):
                        a_match = True
                if a_match:
                    matches.append(t)

        # No matches:
        if len(matches) == 0:
            logger.error("No match found for url %s" % url)
            return None
        # raise Exception("No matching target for url "+url)
        # If one match, assume that is the right Target:
        if len(matches) == 1:
            return int(matches[0]['id'])
        #
        # Else multiple matches, so need to disambiguate.
        #
        # Attempt to disambiguate based on source ONLY:
        if source is not None:
            for t in matches:
                for seed in t['seeds']:
                    logger.info("Looking for source match '%s' against '%s' " %
                                (source, seed))
                    if seed == source:
                        # return int(t['id'])
                        logger.info(
                            "Found match source+seed but this is not enough to disambiguate longer crawls."
                        )
                        break
        # Then attempt to disambiguate based on publisher
        # FIXME Make this a bit more forgiving of punctuation/minor differences
        title_matches = []
        for t in matches:
            for publisher in publishers:
                logger.info("Looking for publisher match '%s' in title '%s' " %
                            (publisher, t['title']))
                if publisher and publisher.lower() in t['title'].lower():
                    title_matches.append(t)
                    break
        if len(title_matches) == 0:
            logger.warning("No matching title to associate with url %s " % url)
            return None
        # raise Exception("No matching title to associate with url %s " % url)
        elif len(title_matches) == 1:
            return int(title_matches[0]['id'])
        else:
            logger.warning("Too many matching titles for %s" % url)
            for t in title_matches:
                logger.warning("Candidate: %d %s " % (t['id'], t['title']))
            logger.warning("Assuming first match is sufficient... (%s)" %
                           title_matches[0]['title'])
            return int(title_matches[0]['id'])

import argparse
import logging

from crawl.w3act.w3act import w3act
from crawl.h3.utils import url_to_surt

LOGGING_FORMAT = "[%(asctime)s] %(levelname)s: %(message)s"
logging.basicConfig(format=LOGGING_FORMAT, level=logging.DEBUG)
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Grab Open Access targets and output to a file in SURT form.')
    parser.add_argument('--act-url', dest='act_url', type=str,
                        default="https://www.webarchive.org.uk/act/",
                        help="ACT endpoint to use. [default: %(default)s]")
    parser.add_argument('--act-username', dest='act_username', type=str,
                        help="ACT username to use. [default: %(default)s]")
    parser.add_argument('--act-password', dest='act_password', type=str,
                        help="ACT password to use. [default: %(default)s]")
    parser.add_argument('output_file', metavar='output file', default="/wayback/ldhosts.txt",
                        help="Output file to create, e.g. '/wayback/ldhosts.txt'.")

    args = parser.parse_args()

    w = w3act(args.act_url, args.act_username, args.act_password)
    items = w.get_oa_export("all")
    surts = ["http://(%s" % url_to_surt(u) for t in items for u in t["seeds"]]
    with open(args.output_file, "w") as o:
        o.write("\n".join(surts))
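Assuming the listing above is saved as a standalone script (the file name below is a placeholder), it could be invoked along these lines, writing one SURT per line to the output file:

python export_oa_surts.py --act-url https://www.webarchive.org.uk/act/ \
    --act-username curator@example.org --act-password secret /wayback/ldhosts.txt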