Example #1
    def requires(self):
        # FIXME Need to make sure this copes with crawl.log.TIMESTAMP etc. from failures.
        # Set up remote connection:
        rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
        # Look for checkpoints to package, and package them in the correct order:
        outputs = {}
        is_final = False
        for item_path in remote_ls(
                "%s/logs/%s/%s" % (LOCAL_OUTPUT_FOLDER, self.job, self.launch_id),
                "crawl.log*", rf):
            item = os.path.basename(item_path)
            logger.info("ITEM %s" % item)
            if item == "crawl.log":
                is_final = True
                outputs["final"] = item
            elif item.endswith(".lck"):
                pass
            else:
                outputs[item[-14:]] = item

        output_list = sorted(outputs.keys())
        logger.info("Ordered by date: %s" % output_list)

        # Build up the list of stages, in order, and aggregate:
        aggregate = list()
        for key in output_list:
            aggregate.append(outputs[key])
            yield AggregateOutputs(self.host, self.job, self.launch_id, key,
                                   aggregate)
Example #2
    def run(self):
        # Sort inputs by checkpoint timestamp and merge into versioned lists:
        aggregate = {}
        if isinstance(self.input(), list):
            inputs = self.input()
        else:
            inputs = [self.input()]
        # Build up the aggregate:
        for input in inputs:
            logger.info("Reading %s" % input.path)
            item = json.load(input.open())
            for key in item.keys():
                if isinstance(item[key], list):
                    current = aggregate.get(key, [])
                    current.extend(item[key])
                elif isinstance(item[key], dict):
                    current = aggregate.get(key, {})
                    current.update(item[key])
                elif item[key]:
                    current = item[key]
                else:
                    # Skip empty values, so a stale 'current' from a previous key is not re-used:
                    continue
                aggregate[key] = current

        logger.info("Aggregate: %s" % aggregate)

        with self.output().open('w') as f:
            f.write('{}'.format(json.dumps(aggregate, indent=4)))
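
As a rough illustration of the merge rules above (lists are concatenated, dicts are merged, non-empty scalars overwrite, empty values are ignored), here is a minimal standalone sketch; the merge_items helper and the sample checkpoint data are hypothetical, not part of the original task.

import json

def merge_items(items):
    # Hypothetical helper mirroring the aggregation rules used in the task above.
    aggregate = {}
    for item in items:
        for key, value in item.items():
            if isinstance(value, list):
                current = aggregate.get(key, [])
                current.extend(value)
            elif isinstance(value, dict):
                current = aggregate.get(key, {})
                current.update(value)
            elif value:
                current = value
            else:
                continue
            aggregate[key] = current
    return aggregate

if __name__ == '__main__':
    checkpoint_1 = {'warcs': ['one.warc.gz'], 'hashes': {'one.warc.gz': 'abc'}, 'start_date': '2016-01-01'}
    checkpoint_2 = {'warcs': ['two.warc.gz'], 'hashes': {'two.warc.gz': 'def'}, 'start_date': ''}
    print(json.dumps(merge_items([checkpoint_1, checkpoint_2]), indent=4))
    # -> 'warcs' holds both files, 'hashes' holds both entries, 'start_date' keeps the non-empty value.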
Example #3
 def watch_target(self, tid):
     target = {}
     target['watchedTarget'] = {}
     target['watchedTarget']['documentUrlScheme'] = ""
     logger.info("PUT %d %s" % (tid, json.dumps(target)))
     r = requests.put("%s/api/targets/%d" % (self.url, tid), headers=self.up_headers, data=json.dumps(target))
     return r
Example #4
    def run(self):
        # Set up remote connection:
        rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
        logs = [self.get_crawl_log(rf)]
        start_date = self.file_start_date(logs)
        # Find the WARCs referenced from the crawl log:
        (warcs, viral) = self.parse_crawl_log(logs)
        # TODO Look for WARCs not spotted via the logs and add them in (ALSO allow this in the log parser)
        if self.stage == 'final':
            for item in remote_ls(self.warc_file_path(), "*.warc.gz", rf):
                if item not in warcs:
                    logger.info("Found additional WARC: %s" % item)
                    warcs.append(item)
            #
            for item in remote_ls(self.viral_file_path(), "*.warc.gz", rf):
                if item not in warcs:
                    logger.info("Found additional Viral WARC: %s" % item)
                    warcs.append(item)

        # TODO Get sha512 and ARK identifiers for WARCs now, and store in launch folder and thus the zip?
        # Loop over all the WARCs involved
        i = 0
        hashes = {}
        for warc in warcs:
            # do some hard work here
            i += 1
            self.set_status_message("Progress: Hashing WARC %d of %d" %
                                    (i, len(warcs)))
            hash_output = yield CalculateRemoteHash(self.host, warc)
            with hash_output.open('r') as reader:
                sha = reader.read().rstrip('\n')
            hashes[warc] = sha
            # Report on progress...
            self.set_status_message("Progress: Hashed WARC %d of %d" %
                                    (i, len(warcs)))

        # Bundle logs and configuration data into a zip and upload it to HDFS
        zips = [
            PackageLogs(self.host, self.job, self.launch_id,
                        self.stage).output().path
        ]

        # FIXME Need to mint and add in ARKs at this point:

        # Output the job package summary:
        job_output = {
            'job_id': self.job,
            'launch_id': self.launch_id,
            'start_date': start_date,
            'warcs': warcs,
            'viral': viral,
            'logs': logs,
            'zips': zips,
            'hashes': hashes
        }

        with self.output().open('w') as f:
            f.write('{}'.format(json.dumps(job_output, indent=4)))
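
The yield of CalculateRemoteHash inside run() above relies on luigi's dynamic dependencies: run() is suspended until the yielded task is complete, then resumes with that task's output target. A minimal sketch of the pattern, with hypothetical task names and /tmp outputs:

import luigi

class HashOneFile(luigi.Task):
    # Hypothetical stand-in for a task like CalculateRemoteHash.
    name = luigi.Parameter()

    def output(self):
        return luigi.LocalTarget("/tmp/%s.sha" % self.name)

    def run(self):
        with self.output().open('w') as f:
            f.write("fake-hash-of-%s" % self.name)

class HashManyFiles(luigi.Task):
    def output(self):
        return luigi.LocalTarget("/tmp/all-hashes.txt")

    def run(self):
        hashes = {}
        for name in ['one.warc.gz', 'two.warc.gz']:
            # Dynamic dependency: run() pauses here until HashOneFile is complete,
            # then resumes with its output target.
            hash_output = yield HashOneFile(name)
            with hash_output.open('r') as reader:
                hashes[name] = reader.read().strip()
        with self.output().open('w') as f:
            for name, sha in hashes.items():
                f.write("%s %s\n" % (name, sha))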
Example #5
    def get(self, key, fn, duration):
        if key not in self \
                or self[key].timeStamp + self[key].duration < time.time():
            logger.info('adding new value for %s' % key)
            o = fn(key)
            self[key] = CachedItem(key, o, duration)
        else:
            logger.info('loading from cache for key %s' % key)

        return self[key].value
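
CachedItem and the dict subclass this get() belongs to are not shown here; a self-contained sketch of the same expire-and-refresh pattern might look as follows (the class names echo the CachedItem/CachedDict names used in these examples, but the details are assumptions):

import time
import logging

logger = logging.getLogger(__name__)

class CachedItem(object):
    def __init__(self, key, value, duration):
        self.key = key
        self.value = value
        self.duration = duration
        self.timeStamp = time.time()

class CachedDict(dict):
    def get(self, key, fn, duration):
        # Recompute if the key is missing or the cached entry has expired:
        if key not in self \
                or self[key].timeStamp + self[key].duration < time.time():
            logger.info('adding new value for %s' % key)
            o = fn(key)
            self[key] = CachedItem(key, o, duration)
        else:
            logger.info('loading from cache for key %s' % key)
        return self[key].value

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    cache = CachedDict()
    print(cache.get('answer', lambda k: 42, duration=60))  # computed
    print(cache.get('answer', lambda k: 42, duration=60))  # served from the cache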
Example #6
 def run(self):
     # Require that the file is old:
     if True:
         logger.info(
             "But this file is too young to assume we're done: %s " %
             self.path)
         return PendingFile(self.job, self.launch_id, self.path)
     # Okay to move:
     return MoveToHdfs(self.job, self.launch_id, self.path,
                       self.delete_local)
Example #7
 def post_target(self, url, title):
     target = {}
     target['field_urls'] = [url]
     target['title'] = title
     target['selector'] = 1
     target['field_scope'] = "root"
     target['field_depth'] = "CAPPED"
     target['field_ignore_robots_txt'] = False
     logger.info("POST %s" % (json.dumps(target)))
     r = requests.post("%s/api/targets" % self.url, headers=self.up_headers, data=json.dumps(target))
     return r
Example #8
    def run_doc_mdex_test_extraction(self, url, lpu, src, title):

        logger.info("Looking at document URL: %s" % url)
        doc = {}
        doc['document_url'] = url
        doc['landing_page_url'] = lpu
        targets = json.load(self.input()['targets'].open('r'))
        doc = DocumentMDEx(targets, doc, src, null_if_no_target_found=False).mdex()
        logger.info(json.dumps(doc))
        if doc.get('title', None) != title:
            raise Exception("Wrong title found for this document! '%s' v '%s'" % (doc['title'], title))
Example #9
 def run(self):
     # Copy up to HDFS
     client = luigi.contrib.hdfs.get_autoconfig_client(threading.local())
     logger.info("HDFS hash, pre:  %s" %
                 client.client.checksum(self.output().path))
     with open(self.path, 'rb') as f:
         client.client.write(data=f,
                             hdfs_path=self.output().path,
                             overwrite=True)
     logger.info("HDFS hash, post:  %s" %
                 client.client.checksum(self.output().path))
Example #10
 def run(self):
     # Read the file in and write it to HDFS
     with self.input().open('r') as reader:
         with self.output().open('w') as writer:
             logger.info("Uploading %s to %s" %
                         (self.input().path, self.output().path))
             while True:
                 chunk = reader.read(DEFAULT_BUFFER_SIZE)
                 if not chunk:
                     break
                 writer.write(chunk)
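
DEFAULT_BUFFER_SIZE is not defined in this snippet; io.DEFAULT_BUFFER_SIZE would be a natural choice. The chunked copy itself can be sketched in isolation (copy_in_chunks and the /tmp paths below are illustrative only):

import io

DEFAULT_BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE  # assumption: any reasonable chunk size will do

def copy_in_chunks(reader, writer, chunk_size=DEFAULT_BUFFER_SIZE):
    # Stream from reader to writer without holding the whole file in memory:
    while True:
        chunk = reader.read(chunk_size)
        if not chunk:
            break
        writer.write(chunk)

if __name__ == '__main__':
    with open('/tmp/source.bin', 'wb') as f:
        f.write(b'x' * 1000000)
    with open('/tmp/source.bin', 'rb') as reader, open('/tmp/copy.bin', 'wb') as writer:
        copy_in_chunks(reader, writer)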
Example #11
 def run(self):
     open_path = "%s.open" % self.path
     if os.path.isfile(open_path) and not os.path.isfile(self.path):
         logger.info("Found an open file that needs closing: %s " %
                     open_path)
         # Require that the file is old:
         if True:
             logger.info("But this file is too young to close: %s " %
                         open_path)
             return PendingFile(self.job, self.launch_id, self.path)
         # Closing it, as it's old enough:
         shutil.move(open_path, self.path)
Example #12
 def update_target_schedule(self, tid, frequency, start_date, end_date=None):
     target = {}
     target['field_crawl_frequency'] = frequency.upper()
     sd = dateutil.parser.parse(start_date)
     target['field_crawl_start_date'] = int(time.mktime(sd.timetuple()))
     if end_date:
         ed = dateutil.parser.parse(end_date)
         target['field_crawl_end_date'] = int(time.mktime(ed.timetuple()))
     else:
         target['field_crawl_end_date'] = 0
     logger.info("PUT %d %s" % (tid, json.dumps(target)))
     r = requests.put("%s/api/targets/%d" % (self.url, tid), headers=self.up_headers, data=json.dumps(target))
     return r
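
The crawl schedule fields are sent as epoch seconds; the date conversion used above can be checked on its own (a small sketch, and the resulting number depends on the local timezone):

import time
import dateutil.parser

sd = dateutil.parser.parse("2016-04-01")
# The API appears to expect epoch seconds; mktime interprets the parsed date in local time:
print(int(time.mktime(sd.timetuple())))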
Example #13
    def run_doc_mdex_test_extraction(self, url, lpu, src, title):

        logger.info("Looking at document URL: %s" % url)
        doc = {}
        doc['document_url'] = url
        doc['landing_page_url'] = lpu
        targets = json.load(self.input()['targets'].open('r'))
        doc = DocumentMDEx(targets, doc, src,
                           null_if_no_target_found=False).mdex()
        logger.info(json.dumps(doc))
        if doc.get('title', None) != title:
            raise Exception(
                "Wrong title found for this document! '%s' v '%s'" %
                (doc['title'], title))
Example #14
    def requires(self):
        logger.info("Looking in %s %s" % (self.job, self.launch_id))
        # Look in /heritrix/output/wren files and move them to the /warcs/ folder:
        tasks = []
        warc_glob = "%s/*-%s-%s-*.warc.gz" % (WREN_FOLDER, self.job,
                                              self.launch_id)
        logger.info("Looking for WREN outputs: %s" % warc_glob)
        for wren_item in glob.glob(warc_glob):
            tasks.append(MoveToWarcsFolder(self.job, self.launch_id,
                                           wren_item))
        yield tasks

        # Look in warcs and viral for WARCs e.g in /heritrix/output/{warcs|viral}/{job}/{launch_id}
        tasks = []
        for out_type in ['warcs', 'viral']:
            glob_path = "%s/%s/%s/%s/*.warc.gz" % (
                CRAWL_OUTPUT_FOLDER, self.job, self.launch_id, out_type)
            logger.info("GLOB:%s" % glob_path)
            for item in glob.glob(glob_path):
                logger.info("ITEM:%s" % item)
                tasks.append(
                    MoveToHdfs(self.job, self.launch_id, item,
                               self.delete_local))
        # Yield these as a group, so they can run in parallel:
        if len(tasks) > 0:
            yield tasks

        # And look for /heritrix/output/logs:
        tasks = []
        for log_item in glob.glob(
                "%s/%s/%s/logs/*.log*" % (CRAWL_OUTPUT_FOLDER, self.job, self.launch_id)):
            if os.path.splitext(log_item)[1] == '.lck':
                continue
            elif os.path.splitext(log_item)[1] == '.log':
                # Only move files with the '.log' suffix if this job is no longer running:
                logger.info("Using MoveToHdfsIfOld for %s" % log_item)
                tasks.append(
                    MoveToHdfsIfOld(self.job, self.launch_id, log_item,
                                    self.delete_local))
            else:
                tasks.append(
                    MoveToHdfs(self.job, self.launch_id, log_item,
                               self.delete_local))
        # Yield these as a group, so any MoveToHdfsIfStopped jobs don't prevent MoveToHdfs from running
        if len(tasks) > 0:
            yield tasks
Example #15
 def requires(self):
     h = webhdfs()
     logs = []
     for path in [
             "/heritrix/output/logs/dc0-%s" % self.dc_id,
             "/heritrix/output/logs/dc1-%s" % self.dc_id,
             "/heritrix/output/logs/dc2-%s" % self.dc_id,
             "/heritrix/output/logs/dc3-%s" % self.dc_id
     ]:
         for item in h.list(path):
             if item.startswith('crawl.log'):
                 logs.append("%s/%s" % (path, item))
     print("Found %i log files..." % len(logs))
     logger.info("Found %i log files..." % len(logs))
     yield SummariseLogFiles(logs, 'dc', self.dc_id, True)
Example #16
    def run(self):
        logger.info(self.launch_id)
        is_final = False
        outputs = []
        for input in self.input():
            if input.path.endswith(".final"):
                is_final = True
            outputs.append(input.path)

        # only report complete success if...
        if is_final:
            with self.output().open('w') as f:
                f.write('{}'.format(json.dumps(outputs, indent=4)))
        else:
            yield CheckJobStopped(self.host, self.job, self.launch_id)
Example #17
 def calculate_sips(self):
     i_new_sips = 0
     o_dirs = self.client.list("/heritrix/sips/")
     logger.info(o_dirs)
     for o_dir in o_dirs:
         logger.info(o_dir)
         o_sips = self.client.list(
             "/heritrix/sips/%s/" %
             o_dir["pathSuffix"])["FileStatuses"]["FileStatus"]
         for o_sip in o_sips:
             i_mod = datetime.fromtimestamp(o_sip["modificationTime"] /
                                            1000)
             if i_mod > (datetime.now() + relativedelta(months=-1)):
                 i_new_sips += 1
     logger.debug('New SIPs = ' + str(i_new_sips))
     return i_new_sips
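
The "new SIP" test above counts entries modified within the last month, using dateutil's relativedelta so that month boundaries are handled properly (unlike a fixed 30-day timedelta). A small sketch of that comparison, with a hypothetical HDFS modification time in milliseconds:

from datetime import datetime
from dateutil.relativedelta import relativedelta

one_month_ago = datetime.now() + relativedelta(months=-1)

# HDFS FileStatus records report modificationTime in milliseconds since the epoch:
modification_time_ms = 1500000000000  # hypothetical value
modified = datetime.fromtimestamp(modification_time_ms / 1000)
print(modified > one_month_ago)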
Example #18
 def enumerate_launches(self):
     # Set up connection:
     rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
     # Look for jobs that need to be processed:
     for date in self.date_interval:
         for job_item in remote_ls(LOCAL_JOB_FOLDER, "*", rf):
             job = os.path.basename(job_item)
             if rf.isdir(job_item):
                 launch_glob = "%s*" % date.strftime('%Y%m%d')
                 logger.info("Looking for job launch folders matching %s" %
                             launch_glob)
                 for launch_item in remote_ls(job_item, launch_glob, rf):
                     logger.info("Found %s" % launch_item)
                     if rf.isdir(launch_item):
                         launch = os.path.basename(launch_item)
                         yield (self.host, job, launch)
Example #19
    def complete(self):
        # Read local:
        local = luigi.LocalTarget(path=self.source_path)
        with local.open('r') as reader:
            local_hash = hashlib.sha512(
                reader.read().encode('utf-8')).hexdigest()
            logger.info("LOCAL HASH: %s" % local_hash)
        # Read from HDFS
        client = luigi.contrib.hdfs.WebHdfsClient()
        if not client.exists(self.target_path):
            return False
        with client.client.read(self.target_path) as reader:
            hdfs_hash = hashlib.sha512(reader.read()).hexdigest()
            logger.info("HDFS HASH: %s" % hdfs_hash)

        # If they match, we are good:
        return hdfs_hash == local_hash
Example #20
 def __init__(self, url, email, password):
     self.url = url.rstrip("/")
     loginUrl = "%s/login" % self.url
     logger.info("Logging into %s as %s " % (loginUrl, email))
     response = requests.post(loginUrl, data={"email": email, "password": password})
     if not response.history:
         logger.error("W3ACT Login failed!")
         raise Exception("W3ACT Login Failed!")
     self.cookie = response.history[0].headers["set-cookie"]
     self.get_headers = {
         "Cookie": self.cookie,
     }
     self.up_headers = {
         "Cookie": self.cookie,
         "Content-Type": "application/json"
     }
     self.ld_cache = CachedDict()
Example #21
    def run(self):
        # Load the targets:
        with self.input().open() as f:
            all_targets = json.load(f)

        # Grab detailed target data:
        logger.info("Filtering detailed information for %i targets..." %
                    len(all_targets))

        # Filter...
        targets = []
        for t in all_targets:
            if t['crawl_frequency'] is None:
                logger.warning("No crawl frequency set for %s" % t)
            elif t['crawl_frequency'].lower() == self.frequency.lower():
                targets.append(t)

        # Persist to disk:
        with self.output().open('w') as f:
            f.write('{}'.format(json.dumps(targets, indent=4)))
Example #22
    def run(self):
        # Read in sha512
        with self.input()[0].open('r') as f:
            local_hash = f.readline()
        logger.info("Got local hash %s" % local_hash)
        # Re-download and get the hash
        with self.input()[1].open('r') as f:
            hdfs_hash = f.readline()
        logger.info("Got HDFS hash %s" % hdfs_hash)

        if local_hash != hdfs_hash:
            raise Exception("Local & HDFS hashes do not match for %s" %
                            self.path)

        # Otherwise, move to hdfs was good, so delete:
        if self.delete_local:
            os.remove(str(self.path))
        # and write out success
        with self.output().open('w') as f:
            f.write(hdfs_hash)
Example #23
 def mdex_default(self):
     ''' Default extractor uses landing page for title etc.'''
     # Grab the landing page URL as HTML
     r = requests.get(self.lp_wb_url())
     h = html.fromstring(r.content)
     h.make_links_absolute(self.doc["landing_page_url"])
     logger.info("Looking for links...")
     # Attempt to find the nearest prior header:
     for a in h.xpath("//a[@href]"):
         if self.doc["document_url"] in a.attrib["href"]:
             element = a
             # try a preceding match:
             for hel in a.xpath("./preceding::*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6][1]"):
                 # logger.info("EEE %s" % hel.text_content())
                 logger.info("Preceding header %s" % hel.text_content())
                 self.doc['title'] = hel.text_content().strip()
                 return
             # Try recursing up the tree (I think this is superseded by the preceding-match logic above).
             while element.getparent() is not None:
                 element = element.getparent()
                 #logger.info("ELEMENT %s " % element)
                 #logger.info("ELEMENT %s " % element.text_content())
                 for hel in element.xpath(".//*[self::h2 or self::h3 or self::h4 or self::h5]"):
                     logger.info("header %s" % hel.text_content())
                     self.doc['title'] = hel.text_content().strip()
                     return
             self.doc['title'] = a.text_content()
             return
     # Extract a title from the first header, or failing that, the page title:
     self.doc['title'] = self._get0(h.xpath("//h1/text()")).strip()
     if not self.doc['title']:
         self.doc['title'] = self._get0(h.xpath("//title/text()")).strip()
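
The title heuristic above leans on the XPath preceding:: axis to find the closest header before the link that points at the document. A minimal sketch against an inline HTML snippet (the markup is invented for illustration):

from lxml import html

page = html.fromstring("""
<html><body>
  <h2>Annual report 2016</h2>
  <p>Download the <a href="http://example.org/docs/report-2016.pdf">PDF</a>.</p>
</body></html>
""")

document_url = "http://example.org/docs/report-2016.pdf"
for a in page.xpath("//a[@href]"):
    if document_url in a.attrib["href"]:
        # On the reverse 'preceding' axis, [1] selects the nearest preceding header:
        headers = a.xpath("./preceding::*[self::h1 or self::h2 or self::h3][1]")
        if headers:
            print(headers[0].text_content().strip())  # -> Annual report 2016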
Example #24
    def run(self):
        """Zips up all log/config. files and copies said archive to HDFS; finds the
        earliest timestamp in the logs."""
        # Set up remote connection:
        rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
        # Set up the output, first making sure the full path exists:
        with self.output().open('w') as f:
            f.write('')
        self.output().remove()
        # What to remove from the paths:
        chop = len(str(h3().local_prefix))
        with zipfile.ZipFile(self.output().path, 'w',
                             allowZip64=True) as zipout:
            # Crawl log:
            for crawl_log in remote_ls(
                    "%s/logs/%s/%s" % (h3.local_output_folder, self.job, self.launch_id),
                    "/crawl.log%s" % get_stage_suffix(self.stage), rf):
                logger.info("Found %s..." % os.path.basename(crawl_log))
                self.add_remote_file(zipout, crawl_log[chop:], crawl_log, rf)
            # Error log(s)
            for log in remote_ls(
                    "%s/logs/%s/%s" % (LOCAL_OUTPUT_FOLDER, self.job, self.launch_id),
                    "/*-errors.log%s" % get_stage_suffix(self.stage), rf):
                logger.info("Found %s..." % os.path.basename(log))
                self.add_remote_file(zipout, log[chop:], log, rf)

            # Job text files
            for txt in remote_ls(
                    "%s/%s/%s" % (LOCAL_JOB_FOLDER, self.job, self.launch_id),
                    "/*.txt", rf):
                self.add_remote_file(zipout, txt[chop:], txt, rf)

            # Job json files:
            for txt in remote_ls(
                    "%s/%s/%s" % (LOCAL_JOB_FOLDER, self.job, self.launch_id),
                    "/*.json", rf):
                logger.info("Found %s..." % os.path.basename(txt))
                self.add_remote_file(zipout, txt[chop:], txt, rf)
            # Job crawler definition:
            cxml = "%s/%s/%s/crawler-beans.cxml" % (LOCAL_JOB_FOLDER, self.job,
                                                    self.launch_id)
            if rf.exists(cxml):
                logger.info("Found config...")
                self.add_remote_file(zipout, cxml[chop:], cxml, rf)
            else:
                logger.error("Cannot find config.")
                raise Exception("Cannot find config.")
Example #25
    def _file_exists(path, rf):
        """
        Checks whether the given file exists and has content - allowed to be '.open' at this point.

        Also checks on HDFS if there is no local file.

        :type path: str
        """
        logger.info("Looking for %s" % path)
        if rf.exists(path) and not rf.isdir(path):  # and rf.getsize(path) > 0:
            return True
        elif rf.exists("%s.open" % path) and not rf.isdir(
                "%s.open" % path):  # and rf.getsize("%s.open" % path) > 0:
            return True
        else:
            try:
                if get_hdfs_target(path).exists():
                    return True
            except luigi.contrib.hdfs.error.HDFSCliError as e:
                logger.error("Exception while checking HDFS.")
                logger.exception(e)
            return False
Example #26
    def find_watched_target_for(self, url, source, publishers):
        '''
        Given a URL and an array of publisher strings, determine which Watched Target to associate them with.
        '''
        # Find the list of Targets where a seed matches the given URL
        surt = url_to_surt(url, host_only=True)
        matches = []
        for t in self.targets:
            if t['watched']:
                a_match = False
                for seed in t['seeds']:
                    if surt.startswith(url_to_surt(seed, host_only=True)):
                        a_match = True
                if a_match:
                    matches.append(t)

        # No matches:
        if len(matches) == 0:
            logger.error("No match found for url %s" % url)
            return None
        # raise Exception("No matching target for url "+url)
        # If one match, assume that is the right Target:
        if len(matches) == 1:
            return int(matches[0]['id'])
        #
        # Else multiple matches, so need to disambiguate.
        #
        # Attempt to disambiguate based on source ONLY:
        if source is not None:
            for t in matches:
                for seed in t['seeds']:
                    logger.info("Looking for source match '%s' against '%s' " % (source, seed))
                    if seed == source:
                        # return int(t['id'])
                        logger.info("Found match source+seed but this is not enough to disambiguate longer crawls.")
                        break
        # Then attempt to disambiguate based on publisher
        # FIXME Make this a bit more forgiving of punctuation/minor differences
        title_matches = []
        for t in matches:
            for publisher in publishers:
                logger.info("Looking for publisher match '%s' in title '%s' " % (publisher, t['title']))
                if publisher and publisher.lower() in t['title'].lower():
                    title_matches.append(t)
                    break
        if len(title_matches) == 0:
            logger.warning("No matching title to associate with url %s " % url)
            return None
        # raise Exception("No matching title to associate with url %s " % url)
        elif len(title_matches) == 1:
            return int(title_matches[0]['id'])
        else:
            logger.warning("Too many matching titles for %s" % url)
            for t in title_matches:
                logger.warning("Candidate: %d %s " % (t['id'], t['title']))
            logger.warning("Assuming first match is sufficient... (%s)" % title_matches[0]['title'])
            return int(title_matches[0]['id'])
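
url_to_surt is not shown in these examples; with host_only=True it presumably reduces a URL to a reversed-host SURT prefix, so that a seed matches any URL on the same host or a sub-domain of it. A rough, hypothetical sketch of that idea only, not the real implementation:

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

def url_to_surt(url, host_only=False):
    # Hypothetical sketch: reverse the host components, SURT-style, dropping the path.
    host = urlparse(url).netloc.split(':')[0].lower()
    return "http://(" + ",".join(reversed(host.split("."))) + ","

print(url_to_surt("http://www.example.gov.uk/report", host_only=True))
# -> http://(uk,gov,example,www,
print(url_to_surt("http://example.gov.uk/", host_only=True))
# -> http://(uk,gov,example,
# The first starts with the second, so a seed on example.gov.uk matches the www URL.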
Example #27
    def uploader(local_path, hdfs_path):
        """
        Copy up to HDFS, making it suitably atomic by using a temporary filename during upload.

        Done as a static method to prevent accidental confusion of self.path/self.output().path etc.

        :return: None
        """
        # Set up the HDFS client:
        client = luigi.contrib.hdfs.get_autoconfig_client(threading.local())

        # Create the temporary file name:
        tmp_path = "%s.temp" % hdfs_path

        # Now upload the file, allowing overwrites as this is a temporary file and
        # simultaneous updates should not be possible:
        logger.info("Uploading as %s" % tmp_path)
        with open(local_path, 'rb') as f:
            client.client.write(data=f, hdfs_path=tmp_path, overwrite=True)

        # Check if the destination file exists and raise an exception if so:
        if client.exists(hdfs_path):
            raise Exception(
                "Path %s already exists! This should never happen!" %
                hdfs_path)

        # Move the uploaded file into the right place:
        client.client.rename(tmp_path, hdfs_path)

        # Give the namenode a moment to catch-up with itself and then check it's there:
        # FIXME I suspect this is only needed for our ancient HDFS
        time.sleep(2)
        status = client.client.status(hdfs_path)

        # Log successful upload:
        logger.info("Upload completed for %s" % hdfs_path)
Example #28
 def get_crawl_log(self, rf):
     # First, parse the crawl log(s) and determine the WARC file names:
     logfilepath = "%s/logs/%s/%s/crawl.log%s" % (
         LOCAL_OUTPUT_FOLDER, self.job, self.launch_id,
         get_stage_suffix(self.stage))
     logger.info("Looking for crawl logs stage: %s" % self.stage)
     logger.info("Looking for crawl logs: %s" % logfilepath)
     if rf.exists(logfilepath):
         logger.info("Found %s..." % os.path.basename(logfilepath))
         return logfilepath
     else:
         raise Exception("Log file '%s' not found!" % logfilepath)
Example #29
 def _get_json(self, url):
     js = None
     try:
         logger.info("Getting URL: %s" % url)
         r = requests.get(url, headers=self.get_headers)
         if r.status_code == 200:
             js = json.loads(r.content)
         else:
             logger.info(r.status_code)
             logger.info(r.text)
     except Exception:
         logger.warning(str(sys.exc_info()[0]))
         logger.warning(str(traceback.format_exc()))
     return js
Example #30
 def enumerate_launches(self):
     # Look for jobs that need to be processed:
     for date in self.date_interval:
         logger.info("Looking at date %s" % date)
         for job_item in glob.glob("%s/*" % CRAWL_OUTPUT_FOLDER):
             job = os.path.basename(job_item)
             if os.path.isdir(job_item):
                 launch_glob = "%s/%s*" % (job_item,
                                           date.strftime('%Y%m%d'))
                 logger.info("Looking for job launch folders matching %s" %
                             launch_glob)
                 for launch_item in glob.glob(launch_glob):
                     logger.info("Found %s" % launch_item)
                     if os.path.isdir(launch_item):
                         launch = os.path.basename(launch_item)
                         yield (job, launch)
Example #31
    def mdex(self):
        '''
        Metadata extraction and target association.
        '''
        # Pass the document through a different extractor based on how the URL starts.
        try:
            if (self.doc["document_url"].startswith("https://www.gov.uk/")):
                self.mdex_gov_uk_publications()
            elif (self.doc["document_url"].startswith("http://www.ifs.org.uk/")
                  ):
                self.mdex_ifs_reports()
            else:
                self.mdex_default()
        except Exception as e:
            logger.error(
                "Ignoring error during extraction for document %s and landing page %s"
                % (self.doc['document_url'], self.doc['landing_page_url']))
            logger.exception(e)

        if 'title' not in self.doc or not self.doc['title']:
            logger.info("Falling back on default extraction logic...")
            self.mdex_default()
            logger.info("GOT %s" % self.doc)

        # Look up which Target this URL should be associated with:
        if self.targets and 'landing_page_url' in self.doc:
            logger.info(
                "Looking for match for %s source %s and publishers '%s'" %
                (self.doc['landing_page_url'], self.source,
                 self.doc.get('publishers', [])))
            self.doc['target_id'] = self.find_watched_target_for(
                self.doc['landing_page_url'], self.source,
                self.doc.get('publishers', []))

        # If there is no association, drop it:
        if not self.doc.get('target_id',
                            None) and self.null_if_no_target_found:
            logger.critical(
                "Failed to associate document with any target: %s" % self.doc)
            return None

        # If the publisher appears unambiguous, store it where it can be re-used
        if len(self.doc.get('publishers', [])) == 1:
            self.doc['publisher'] = self.doc['publishers'][0]

        # Or return the modified version:
        return self.doc
Example #32
    def run(self):
        client = luigi.contrib.hdfs.WebHdfsClient()
        # Upload to temp file:
        temp_path = "%s.temp" % self.target_path
        logger.info("Uploading to %s" % temp_path)
        with open(str(self.source_path)) as f:
            client.client.write(hdfs_path=temp_path,
                                data=f.read(),
                                overwrite=self.overwrite)
        # Remove any existing file, if we're allowed to:
        if self.overwrite:
            if client.exists(self.target_path):
                logger.info("Removing %s..." % self.target_path)
                client.remove(self.target_path, skip_trash=True)
        # And rename
        logger.info("Renaming to %s" % self.target_path)
        client.rename(temp_path, self.target_path)

        # Give the namenode a moment to catch-up with itself and then check it's there:
        # FIXME I suspect this is only needed for our ancient HDFS
        time.sleep(10)
        status = client.client.status(self.target_path)
Example #33
    def mdex(self):
        '''
        Metadata extraction and target association.
        '''
        # Pass the document through a different extractor based on how the URL starts.
        try:
            if( self.doc["document_url"].startswith("https://www.gov.uk/")):
                self.mdex_gov_uk_publications()
            elif( self.doc["document_url"].startswith("http://www.ifs.org.uk/")):
                self.mdex_ifs_reports()
            else:
                self.mdex_default()
        except Exception as e:
            logger.error("Ignoring error during extraction for document %s and landing page %s" % (self.doc['document_url'], self.doc['landing_page_url']))
            logger.exception(e)

        if 'title' not in self.doc or not self.doc['title']:
            logger.info("Falling back on default extraction logic...")
            self.mdex_default()
            logger.info("GOT %s" % self.doc)

        # Look up which Target this URL should be associated with:
        if self.targets and 'landing_page_url' in self.doc:
            logger.info("Looking for match for %s source %s and publishers '%s'" % (self.doc['landing_page_url'], self.source, self.doc.get('publishers',[])))
            self.doc['target_id'] = self.find_watched_target_for(self.doc['landing_page_url'], self.source, self.doc.get('publishers', []))
        
        # If there is no association, drop it:
        if not self.doc.get('target_id', None) and self.null_if_no_target_found:
            logger.critical("Failed to associated document with any target: %s" % self.doc)
            return None

        # If the publisher appears unambiguous, store it where it can be re-used
        if len(self.doc.get('publishers', [])) == 1:
            self.doc['publisher'] = self.doc['publishers'][0]
            
        # Or return the modified version:
        return self.doc
Example #34
 def mdex_default(self):
     ''' Default extractor uses landing page for title etc.'''
     # Grab the landing page URL as HTML
     r = requests.get(self.lp_wb_url())
     h = html.fromstring(r.content)
     h.make_links_absolute(self.doc["landing_page_url"])
     logger.info("Looking for links...")
     # Attempt to find the nearest prior header:
     for a in h.xpath("//a[@href]"):
         if self.doc["document_url"] in a.attrib["href"]:
             element = a
             # try a preceding match:
             for hel in a.xpath(
                     "./preceding::*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6][1]"
             ):
                 # logger.info("EEE %s" % hel.text_content())
                 logger.info("Preceding header %s" % hel.text_content())
                 self.doc['title'] = hel.text_content().strip()
                 return
             # Try recursing up the tree (I think this is superseded by the preceding-match logic above).
             while element.getparent() is not None:
                 element = element.getparent()
                 #logger.info("ELEMENT %s " % element)
                 #logger.info("ELEMENT %s " % element.text_content())
                 for hel in element.xpath(
                         ".//*[self::h2 or self::h3 or self::h4 or self::h5]"
                 ):
                     logger.info("header %s" % hel.text_content())
                     self.doc['title'] = hel.text_content().strip()
                     return
             self.doc['title'] = a.text_content()
             return
     # Extract a title from the first header, or failing that, the page title:
     self.doc['title'] = self._get0(h.xpath("//h1/text()")).strip()
     if not self.doc['title']:
         self.doc['title'] = self._get0(h.xpath("//title/text()")).strip()
Example #35
    def parse_crawl_log(self, logs):
        """
        Parses the crawl log to check the WARCs are present.
        :return:
        """
        # Set up remote connection:
        rf = luigi.contrib.ssh.RemoteFileSystem(self.host)
        warcfiles = set()
        remote_log = luigi.contrib.ssh.RemoteTarget(logs[0], self.host)
        with remote_log.open('r') as f:
            for line in f:
                parts = re.split(" +", line, maxsplit=11)
                # Skip failed downloads:
                if parts[1] == '-' or parts[1] == '' or int(parts[1]) <= 0:
                    if parts[1] == '':
                        logger.info(
                            "Skipping line with empty status! '%s' from log file '%s'"
                            % (line, logs[0]))
                    continue
                # Skip locally-resolved DNS records
                if parts[1] == "1001":
                    logger.debug(
                        "Skipping finding WARC for locally-defined hostname: %s"
                        % parts[3])
                    continue
                # Attempt to parse JSON
                try:
                    (annotations, line_json) = re.split("{",
                                                        parts[11],
                                                        maxsplit=1)
                    line_json = "{%s" % line_json
                    # logger.debug("LOG JSON: %s" % line_json)
                    # logger.debug("LOG ANNOTATIONS: %s" % annotations)
                    jmd = json.loads(line_json)
                except Exception as e:
                    logger.info("LOG LINE: %s" % line)
                    logger.info("LOG LINE part[11]: %s" % parts[11])
                    logger.exception(e)
                    raise e
                if 'warcFilename' in jmd:
                    warcfiles.add(jmd['warcFilename'])
                elif 'warcPrefix' in jmd:
                    for wren in remote_ls(LOCAL_WREN_FOLDER,
                                          "%s*.warc.gz*" % jmd['warcPrefix'],
                                          rf):
                        if wren.endswith('.open'):
                            wren = wren[:-5]
                        warcfiles.add(os.path.basename(wren))
                    # Also check in case file has already been moved into output/warcs/{job}/{launch}:
                    for wren in remote_ls(self.warc_file_path(),
                                          "%s*.warc.gz*" % jmd['warcPrefix'],
                                          rf):
                        warcfiles.add(os.path.basename(wren))
                    # FIXME Also look on HDFS for matching files?
                else:
                    logger.warning("No WARC file entry found for line: %s" %
                                   line)

        warcs = []
        viral = []
        for warcfile in warcfiles:
            if self._file_exists("%s/%s" % (self.viral_file_path(), warcfile),
                                 rf):
                logger.info("Found Viral WARC %s/%s" %
                            (self.viral_file_path(), warcfile))
                viral.append("%s/%s" % (self.viral_file_path(), warcfile))
            elif self._file_exists("%s/%s" % (LOCAL_WREN_FOLDER, warcfile),
                                   rf):
                logger.info("Found WREN WARC %s" % warcfile)
                warcs.append("%s/%s" % (LOCAL_WREN_FOLDER, warcfile))
            elif self._file_exists("%s/%s" % (self.warc_file_path(), warcfile),
                                   rf):
                logger.info("Found WARC %s/%s" %
                            (self.warc_file_path(), warcfile))
                warcs.append("%s/%s" % (self.warc_file_path(), warcfile))
            else:
                raise Exception("Cannot file warc file %s" % warcfile)

        return warcs, viral
Example #36
    def find_watched_target_for(self, url, source, publishers):
        '''
        Given a URL and an array of publisher strings, determine which Watched Target to associate them with.
        '''
        # Find the list of Targets where a seed matches the given URL
        surt = url_to_surt(url, host_only=True)
        matches = []
        for t in self.targets:
            if t['watched']:
                a_match = False
                for seed in t['seeds']:
                    if surt.startswith(url_to_surt(seed, host_only=True)):
                        a_match = True
                if a_match:
                    matches.append(t)

        # No matches:
        if len(matches) == 0:
            logger.error("No match found for url %s" % url)
            return None
        # raise Exception("No matching target for url "+url)
        # If one match, assume that is the right Target:
        if len(matches) == 1:
            return int(matches[0]['id'])
        #
        # Else multiple matches, so need to disambiguate.
        #
        # Attempt to disambiguate based on source ONLY:
        if source is not None:
            for t in matches:
                for seed in t['seeds']:
                    logger.info("Looking for source match '%s' against '%s' " %
                                (source, seed))
                    if seed == source:
                        # return int(t['id'])
                        logger.info(
                            "Found match source+seed but this is not enough to disambiguate longer crawls."
                        )
                        break
        # Then attempt to disambiguate based on publisher
        # FIXME Make this a bit more forgiving of punctuation/minor differences
        title_matches = []
        for t in matches:
            for publisher in publishers:
                logger.info("Looking for publisher match '%s' in title '%s' " %
                            (publisher, t['title']))
                if publisher and publisher.lower() in t['title'].lower():
                    title_matches.append(t)
                    break
        if len(title_matches) == 0:
            logger.warning("No matching title to associate with url %s " % url)
            return None
        # raise Exception("No matching title to associate with url %s " % url)
        elif len(title_matches) == 1:
            return int(title_matches[0]['id'])
        else:
            logger.warning("Too many matching titles for %s" % url)
            for t in title_matches:
                logger.warning("Candidate: %d %s " % (t['id'], t['title']))
            logger.warning("Assuming first match is sufficient... (%s)" %
                           title_matches[0]['title'])
            return int(title_matches[0]['id'])
Example #37
 def _get_ld_export(self, frequency):
     qurl = "%s/api/crawl/feed/ld/%s" % (self.url, frequency)
     logger.info("Getting %s" % qurl)
     return self._get_json(qurl)
Example #38
 def get_json(self, path):
     path = path.lstrip("/")
     qurl = "%s/%s" % (self.url, path)
     logger.info("Getting %s" % qurl)
     return self._get_json(qurl)
Example #39
 def update_target_selector(self, tid, uid):
     target = {}
     target['selector'] = uid
     logger.info("PUT %d %s" % (tid, json.dumps(target)))
     r = requests.put("%s/api/targets/%d" % (self.url, tid), headers=self.up_headers, data=json.dumps(target))
     return r
Example #40
 def output(self):
     t = get_hdfs_target(self.path)
     logger.info("Output is %s" % t.path)
     return t
Example #41
 def scan_job_launch(self, job, launch):
     logger.info("Looking at moving files for %s %s" % (job, launch))
     yield MoveFilesForLaunch(job, launch, self.delete_local)
Example #42
 def process_output(self, job, launch):
     logger.info("Processing %s/%s" % (job, launch))
     yield GenerateWarcStats(job, launch)
Example #43
 def requires(self):
     # Enumerate the jobs:
     for (job, launch) in self.enumerate_launches():
         logger.info("Processing %s/%s" % (job, launch))
         yield self.scan_job_launch(job, launch)