コード例 #1
0
    def _iter_postings(self, quarter):
        """Iterate through raw Virginia postings that overlap a quarter

        Every key under ``self.prefix`` in the bucket named by
        ``self.bucket_name`` is downloaded to a temporary file and read
        with ``stream_json_file``.

        Args:
            quarter: Quarter identifier, as accepted by
                ``quarter_to_daterange``.

        Yields:
            Untransformed postings whose ``datePosted``..``dateExpires``
            span overlaps the quarter. Postings with an empty or null
            ``datePosted`` are skipped; an empty or null ``dateExpires``
            is treated as expiring on the posting date.
        """
        logging.info("Finding Virginia postings for %s", quarter)
        quarter_start, quarter_end = quarter_to_daterange(quarter)
        bucket = self.s3_conn.get_bucket(self.bucket_name)
        keylist = list(bucket.list(prefix=self.prefix, delimiter=''))
        for key in keylist:
            # NOTE(review): '.cache.json' keys are presumably cache
            # artifacts rather than posting data -- confirm.
            if key.name.endswith('.cache.json'):
                continue

            logging.info("Processing key %s", key.name)
            with tempfile.NamedTemporaryFile() as local_file:
                key.get_contents_to_file(local_file)
                # Rewind so the reader starts at the beginning of the
                # freshly written download.
                local_file.seek(0)
                for posting in stream_json_file(local_file):
                    date_posted = posting['datePosted']
                    # Truthiness covers both '' and a JSON null, which
                    # would make len() raise TypeError.
                    if not date_posted:
                        continue
                    listing_start = datetime.strptime(date_posted,
                                                      self.DATE_FORMAT)
                    date_expires = posting['dateExpires']
                    if date_expires:
                        listing_end = datetime.strptime(date_expires,
                                                        self.DATE_FORMAT)
                    else:
                        # No expiry recorded: treat as a one-day listing.
                        listing_end = listing_start
                    if overlaps(listing_start.date(), listing_end.date(),
                                quarter_start, quarter_end):
                        yield posting
コード例 #2
0
    def _iter_postings(self, quarter):
        """Iterate through raw postings for a given quarter

        Keys are listed under ``self.prefix + '/' + quarter``; each key is
        parsed as a single UTF-8 encoded JSON posting.

        Args:
            quarter (string): A quarter (in format 2015Q1)

        Yields:
            Untransformed job postings (dict), each with an added ``id``
            entry derived from its key name. Postings with an empty or
            null ``PositionStartDate`` are skipped; an empty or null
            ``PositionEndDate`` is treated as ending on the start date.
        """
        logging.info("Finding USAJobs postings for %s", quarter)
        quarter_start, quarter_end = quarter_to_daterange(quarter)
        bucket = self.s3_conn.get_bucket(self.bucket_name)
        full_prefix = self.prefix + '/' + quarter
        keylist = list(bucket.list(prefix=full_prefix, delimiter=''))
        for key in keylist:
            logging.info("Processing key %s", key.name)
            contents = key.get_contents_as_string()
            posting = json.loads(contents.decode('utf-8'))
            # The id is the second-to-last dot-separated piece of the key
            # name, i.e. whatever directly precedes the final extension.
            posting['id'] = key.name.split('.')[-2]
            start_date = posting['PositionStartDate']
            # Truthiness covers both '' and a JSON null, which would make
            # len() raise TypeError.
            if not start_date:
                continue
            listing_start = datetime.strptime(start_date, self.DATE_FORMAT)
            end_date = posting['PositionEndDate']
            if end_date:
                listing_end = datetime.strptime(end_date, self.DATE_FORMAT)
            else:
                # No end date recorded: treat as a one-day listing.
                listing_end = listing_start
            if overlaps(listing_start.date(), listing_end.date(),
                        quarter_start, quarter_end):
                yield posting
            else:
                logging.warning('Posting %s does not overlap with quarter %s',
                                posting['id'], quarter)
コード例 #3
0
 def _iter_postings(self, quarter):
     """Yield raw CareerBuilder postings that overlap a quarter

     Every key under ``self.prefix`` in the bucket named by
     ``self.bucket_name`` is downloaded to a temporary file and read with
     ``stream_json_file``. After each file, the number of overlapping
     postings and the total number read are logged.

     Args:
         quarter: Quarter identifier, as accepted by
             ``quarter_to_daterange``.

     Yields:
         Untransformed postings whose ``created``..``modified`` span
         overlaps the quarter.
     """
     logging.info("Finding CareerBuilder postings for %s", quarter)
     quarter_start, quarter_end = quarter_to_daterange(quarter)
     bucket = self.s3_conn.get_bucket(self.bucket_name)
     keys = list(bucket.list(prefix=self.prefix, delimiter=''))
     for key in keys:
         # Per-file counters, reported once the file has been consumed.
         matched = 0
         total_seen = 0
         logging.info('Processing key %s', key.name)
         with tempfile.NamedTemporaryFile() as tmp:
             key.get_contents_to_file(tmp, cb=log_download_progress)
             logging.info('Downloaded key %s for processing', key.name)
             # Rewind so the reader starts at the beginning of the download.
             tmp.seek(0)
             for total_seen, posting in enumerate(stream_json_file(tmp), 1):
                 created = datetime.strptime(posting['created'],
                                             self.DATE_FORMAT)
                 modified = datetime.strptime(posting['modified'],
                                              self.DATE_FORMAT)
                 if not overlaps(created.date(), modified.date(),
                                 quarter_start, quarter_end):
                     continue
                 matched += 1
                 yield posting
             logging.info('%s overlapping out of %s total in file',
                          matched, total_seen)