Example #1
    def output(self):
        outputs = []
        # Get HDFS client:
        client = luigi.contrib.hdfs.WebHdfsClient()
        # Find log files:
        parent_path = "/heritrix/output/%s/%s/logs" % (self.job,
                                                       self.launch_id)
        for listed_item in client.listdir(parent_path):
            # Oddly, depending on the implementation, the listed item may be
            # absolute or basename-only, so normalise it here:
            item = os.path.basename(listed_item)
            item_path = os.path.join(parent_path, item)
            if item.endswith(".lck"):
                logger.error("Lock file should not be present on HDFS! %s (%s)" %
                             (item, item_path))
            elif item.startswith("crawl.log"):
                outputs.append(
                    luigi.contrib.hdfs.HdfsTarget(path=item_path,
                                                  format=Plain))
                #logger.debug("Including %s" % item)
            else:
                pass
                #logger.debug("Skipping %s" % item)
        # Return the logs to be processed:
        return outputs
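A downstream Luigi task would consume these targets through requires()/input(). The following is a minimal sketch; CountCrawlLogLines and ListCrawlLogs are hypothetical names, the latter standing in for the task that defines the output() method above:

    import luigi

    class CountCrawlLogLines(luigi.Task):
        job = luigi.Parameter()
        launch_id = luigi.Parameter()

        def requires(self):
            # Hypothetical task wrapping the output() method shown above:
            return ListCrawlLogs(job=self.job, launch_id=self.launch_id)

        def run(self):
            total = 0
            # self.input() is the list of HdfsTargets returned by output():
            for target in self.input():
                with target.open('r') as f:
                    total += sum(1 for _ in f)
            print("Total crawl log lines: %d" % total)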
Example #2
    def find_watched_target_for(self, url, source, publishers):
        '''
        Given a URL and an array of publisher strings, determine which Watched Target to associate them with.
        '''
        # Find the list of Targets where a seed matches the given URL
        surt = url_to_surt(url, host_only=True)
        matches = []
        for t in self.targets:
            if t['watched']:
                a_match = False
                for seed in t['seeds']:
                    if surt.startswith(url_to_surt(seed, host_only=True)):
                        a_match = True
                        break
                if a_match:
                    matches.append(t)

        # No matches:
        if len(matches) == 0:
            logger.error("No match found for url %s" % url)
            return None
        # raise Exception("No matching target for url "+url)
        # If one match, assume that is the right Target:
        if len(matches) == 1:
            return int(matches[0]['id'])
        #
        # Else multiple matches, so need to disambiguate.
        #
        # Attempt to disambiguate based on source ONLY:
        if source is not None:
            for t in matches:
                for seed in t['seeds']:
                    logger.info("Looking for source match '%s' against '%s' " % (source, seed))
                    if seed == source:
                        # return int(t['id'])
                        logger.info("Found match source+seed but this is not enough to disambiguate longer crawls.")
                        break
        # Then attempt to disambiguate based on publisher
        # FIXME Make this a bit more forgiving of punctuation/minor differences
        title_matches = []
        for t in matches:
            for publisher in publishers:
                logger.info("Looking for publisher match '%s' in title '%s' " % (publisher, t['title']))
                if publisher and publisher.lower() in t['title'].lower():
                    title_matches.append(t)
                    break
        if len(title_matches) == 0:
            logger.warning("No matching title to associate with url %s " % url)
            return None
        # raise Exception("No matching title to associate with url %s " % url)
        elif len(title_matches) == 1:
            return int(title_matches[0]['id'])
        else:
            logger.warning("Too many matching titles for %s" % url)
            for t in title_matches:
                logger.warning("Candidate: %d %s " % (t['id'], t['title']))
            logger.warning("Assuming first match is sufficient... (%s)" % title_matches[0]['title'])
            return int(title_matches[0]['id'])
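The matching logic hinges on host-only SURT prefixes. Here is a self-contained sketch of the idea, using a simplified stand-in for the project's url_to_surt() helper:

    from urllib.parse import urlparse

    def url_to_surt(url, host_only=True):
        # Simplified stand-in: reverse the host labels, SURT-style.
        host = urlparse(url).hostname or ""
        return ",".join(reversed(host.split(".")))

    surt = url_to_surt("https://www.gov.uk/government/publications/a-report")
    print(surt)  # uk,gov,www
    # A seed on the same (or a parent) host yields a matching prefix:
    print(surt.startswith(url_to_surt("https://gov.uk/")))  # True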
Example #3
    def run(self):
        """Zips up all log/config files, copies the archive to HDFS, and finds
        the earliest timestamp in the logs."""
        # Set up remote connection:
        rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
        # Set up the output, first making sure the full path exists (writing
        # an empty file and then removing it creates any missing parent
        # directories):
        with self.output().open('w') as f:
            f.write('')
        self.output().remove()
        # What to remove from the paths:
        chop = len(str(h3().local_prefix))
        with zipfile.ZipFile(self.output().path, 'w',
                             allowZip64=True) as zipout:
            # Crawl log:
            for crawl_log in remote_ls(
                    "%s/logs/%s/%s" % (h3.local_output_folder, self.job,
                                       self.launch_id),
                    "/crawl.log%s" % get_stage_suffix(self.stage), rf):
                logger.info("Found %s..." % os.path.basename(crawl_log))
                self.add_remote_file(zipout, crawl_log[chop:], crawl_log, rf)
            # Error log(s)
            for log in remote_ls(
                    "%s/logs/%s/%s" % (LOCAL_OUTPUT_FOLDER, self.job,
                                       self.launch_id),
                    "/*-errors.log%s" % get_stage_suffix(self.stage), rf):
                logger.info("Found %s..." % os.path.basename(log))
                self.add_remote_file(zipout, log[chop:], log, rf)

            # Job text files
            for txt in remote_ls(
                    "%s/%s/%s" % (LOCAL_JOB_FOLDER, self.job, self.launch_id),
                    "/*.txt", rf):
                self.add_remote_file(zipout, txt[chop:], txt, rf)

            # Job json files:
            for txt in remote_ls(
                    "%s/%s/%s" % (LOCAL_JOB_FOLDER, self.job, self.launch_id),
                    "/*.json", rf):
                logger.info("Found %s..." % os.path.basename(txt))
                self.add_remote_file(zipout, txt[chop:], txt, rf)
            # Job crawler definition:
            cxml = "%s/%s/%s/crawler-beans.cxml" % (LOCAL_JOB_FOLDER, self.job,
                                                    self.launch_id)
            if rf.exists(cxml):
                logger.info("Found config...")
                self.add_remote_file(zipout, cxml[chop:], cxml, rf)
            else:
                logger.error("Cannot find config.")
                raise Exception("Cannot find config.")
Example #4
    def mdex(self):
        '''
        Metadata extraction and target association.
        '''
        # Pass the document through a different extractor based on how the URL starts.
        try:
            if (self.doc["document_url"].startswith("https://www.gov.uk/")):
                self.mdex_gov_uk_publications()
            elif (self.doc["document_url"].startswith("http://www.ifs.org.uk/")
                  ):
                self.mdex_ifs_reports()
            else:
                self.mdex_default()
        except Exception as e:
            logger.error(
                "Ignoring error during extraction for document %s and landing page %s"
                % (self.doc['document_url'], self.doc['landing_page_url']))
            logger.exception(e)

        if 'title' not in self.doc or not self.doc['title']:
            logger.info("Falling back on default extraction logic...")
            self.mdex_default()
            logger.info("GOT %s" % self.doc)

        # Look up which Target this URL should be associated with:
        if self.targets and 'landing_page_url' in self.doc:
            logger.info(
                "Looking for match for %s source %s and publishers '%s'" %
                (self.doc['landing_page_url'], self.source,
                 self.doc.get('publishers', [])))
            self.doc['target_id'] = self.find_watched_target_for(
                self.doc['landing_page_url'], self.source,
                self.doc.get('publishers', []))

        # If there is no association, drop it:
        if not self.doc.get('target_id',
                            None) and self.null_if_no_target_found:
            logger.critical(
                "Failed to associate document with any target: %s" % self.doc)
            return None

        # If the publisher appears unambiguous, store it where it can be re-used
        if len(self.doc.get('publishers', [])) == 1:
            self.doc['publisher'] = self.doc['publishers'][0]

        # Or return the modified version:
        return self.doc
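A hedged usage sketch follows; DocumentMdex and its constructor signature are assumptions inferred from the attributes the method uses (self.targets, self.doc, self.source, self.null_if_no_target_found):

    doc = {
        "document_url": "https://www.gov.uk/government/publications/a-report",
        "landing_page_url": "https://www.gov.uk/government/publications",
    }
    # DocumentMdex and `targets` are hypothetical names here:
    mdex = DocumentMdex(targets=targets, doc=doc, source=None,
                        null_if_no_target_found=True)
    enriched = mdex.mdex()
    if enriched is None:
        print("No Watched Target association; dropping document.")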
Example #5
    def __init__(self, url, email, password):
        self.url = url.rstrip("/")
        loginUrl = "%s/login" % self.url
        logger.info("Logging into %s as %s" % (loginUrl, email))
        response = requests.post(loginUrl, data={"email": email, "password": password})
        if not response.history:
            logger.error("W3ACT Login failed!")
            raise Exception("W3ACT Login Failed!")
        self.cookie = response.history[0].headers["set-cookie"]
        self.get_headers = {
            "Cookie": self.cookie,
        }
        self.up_headers = {
            "Cookie": self.cookie,
            "Content-Type": "application/json"
        }
        self.ld_cache = CachedDict()
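Usage would look something like this; the W3act class name and the /api/targets endpoint are illustrative assumptions, but the cookie replay follows directly from the headers built above:

    import requests

    w = W3act("https://w3act.example.org/act", "user@example.org", "secret")
    # Replay the session cookie captured at login on each API call:
    response = requests.get("%s/api/targets" % w.url, headers=w.get_headers)
    targets = response.json()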
Example #6
    @staticmethod
    def _file_exists(path, rf):
        """
        Checks whether the given file exists and has content; the file is
        allowed to still be '.open' at this point.

        Also checks on HDFS if there is no local file.

        :type path: str
        """
        logger.info("Looking for %s" % path)
        if rf.exists(path) and not rf.isdir(path):  # and rf.getsize(path) > 0:
            return True
        elif rf.exists("%s.open" % path) and not rf.isdir(
                "%s.open" % path):  # and rf.getsize("%s.open" % path) > 0:
            return True
        else:
            try:
                if get_hdfs_target(path).exists():
                    return True
            except luigi.contrib.hdfs.error.HDFSCliError as e:
                logger.error("Exception while checking HDFS.")
                logger.exception(e)
            return False
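A brief calling sketch (the task class, host, and path are illustrative; _file_exists() is assumed to be a @staticmethod on its task class):

    import luigi.contrib.ssh

    rf = luigi.contrib.ssh.RemoteFileSystem("crawler03.example.org")
    # CheckJobStopped is a hypothetical task class holding _file_exists():
    if CheckJobStopped._file_exists("/heritrix/output/warcs/example.warc.gz", rf):
        print("File (or its '.open' variant) is present.")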