def aggregate_properties_for_quarter(
        quarter,
        grouping_properties,
        aggregate_properties,
        aggregate_functions,
        aggregations_path,
        aggregation_name):
    """Aggregate computed properties for a quarter and write the resulting CSV to S3

    Args:
        quarter (string)
        grouping_properties (list of JobPostingComputedProperty)
            Properties to form the primary key of the aggregation
        aggregate_properties (list of JobPostingComputedProperty)
            Properties to be aggregated over the primary key
        aggregate_functions (dict) A lookup of aggregate functions
            to be applied for each aggregate column
        aggregations_path (string) The base s3 path to store aggregations
        aggregation_name (string) The name of this particular aggregation

    Returns: (string) the S3 path the aggregation CSV was written to
    """
    start_date, end_date = quarter_to_daterange(quarter)
    # materialize the date generator once; the full list is passed downstream
    included_dates = list(dates_in_range(start_date, end_date))
    aggregation_df = aggregation_for_properties_and_dates(
        grouping_properties,
        aggregate_properties,
        aggregate_functions,
        included_dates
    )
    out_path = '/'.join(
        [aggregations_path, aggregation_name, quarter + '.csv'])
    fs = s3fs.S3FileSystem()
    # to_csv(None) returns the CSV as a string; encode it for the binary S3 handle
    with fs.open(out_path, 'wb') as f:
        f.write(aggregation_df.to_csv(None).encode())
    return out_path
def _iter_postings(self, quarter):
    """Iterate through raw Virginia postings for a given quarter

    Args:
        quarter (string): A quarter (in format 2015Q1)

    Yields:
        Untransformed job postings (dict)
    """
    logging.info("Finding Virginia postings for %s", quarter)
    quarter_start, quarter_end = quarter_to_daterange(quarter)
    bucket = self.s3_conn.get_bucket(self.bucket_name)
    keylist = list(bucket.list(prefix=self.prefix, delimiter=''))
    for key in keylist:
        # skip cache artifacts stored alongside the raw postings
        if key.name.endswith('.cache.json'):
            continue
        logging.info("Processing key %s", key.name)
        with tempfile.NamedTemporaryFile() as local_file:
            key.get_contents_to_file(local_file)
            local_file.seek(0)
            for posting in stream_json_file(local_file):
                # a posting without a start date cannot be placed in a quarter
                if not posting['datePosted']:
                    continue
                listing_start = datetime.strptime(
                    posting['datePosted'], self.DATE_FORMAT)
                # with no expiry, treat the listing as lasting a single day
                if not posting['dateExpires']:
                    listing_end = listing_start
                else:
                    listing_end = datetime.strptime(
                        posting['dateExpires'], self.DATE_FORMAT)
                if overlaps(listing_start.date(), listing_end.date(),
                            quarter_start, quarter_end):
                    yield posting
def _iter_postings(self, quarter):
    """Iterate through raw postings for a given quarter

    Args:
        quarter (string): A quarter (in format 2015Q1)

    Yields:
        Untransformed job postings (dict)
    """
    logging.info("Finding USAJobs postings for %s", quarter)
    quarter_start, quarter_end = quarter_to_daterange(quarter)
    bucket = self.s3_conn.get_bucket(self.bucket_name)
    full_prefix = self.prefix + '/' + quarter
    keylist = list(bucket.list(prefix=full_prefix, delimiter=''))
    for key in keylist:
        logging.info("Processing key %s", key.name)
        contents = key.get_contents_as_string()
        posting = json.loads(contents.decode('utf-8'))
        # derive the posting id from the key name, e.g. 'path/<id>.json'
        posting['id'] = key.name.split('.')[-2]
        # a posting without a start date cannot be placed in a quarter
        if not posting['PositionStartDate']:
            continue
        listing_start = datetime.strptime(posting['PositionStartDate'],
                                          self.DATE_FORMAT)
        # with no end date, treat the listing as lasting a single day
        if not posting['PositionEndDate']:
            listing_end = listing_start
        else:
            listing_end = datetime.strptime(posting['PositionEndDate'],
                                            self.DATE_FORMAT)
        if overlaps(listing_start.date(), listing_end.date(),
                    quarter_start, quarter_end):
            yield posting
        else:
            logging.warning('Posting %s does not overlap with quarter %s',
                            posting['id'], quarter)
def _iter_postings(self, quarter):
    """Iterate through raw CareerBuilder postings for a given quarter

    Args:
        quarter (string): A quarter (in format 2015Q1)

    Yields:
        Untransformed job postings (dict)
    """
    logging.info("Finding CareerBuilder postings for %s", quarter)
    quarter_start, quarter_end = quarter_to_daterange(quarter)
    bucket = self.s3_conn.get_bucket(self.bucket_name)
    keylist = list(bucket.list(prefix=self.prefix, delimiter=''))
    for key in keylist:
        # per-key counters, reported after each file is processed
        in_file = 0
        overlapping = 0
        logging.info('Processing key %s', key.name)
        with tempfile.NamedTemporaryFile() as local_file:
            key.get_contents_to_file(local_file, cb=log_download_progress)
            logging.info('Downloaded key %s for processing', key.name)
            local_file.seek(0)
            for posting in stream_json_file(local_file):
                in_file += 1
                # 'created'/'modified' bound the listing's active window
                # NOTE(review): assumes 'modified' approximates the listing
                # end date — confirm against the upstream data contract
                listing_start = datetime.strptime(posting['created'],
                                                  self.DATE_FORMAT)
                listing_end = datetime.strptime(posting['modified'],
                                                self.DATE_FORMAT)
                if overlaps(listing_start.date(), listing_end.date(),
                            quarter_start, quarter_end):
                    overlapping += 1
                    yield posting
        logging.info('%s overlapping out of %s total in file',
                     overlapping, in_file)