def define_partner_quarterly(main_dag_name):
    dag = DAG(
        dag_id='{}.partner_quarterly'.format(main_dag_name),
        default_args=default_args,
        schedule_interval='0 0 1 */3 *',
        max_active_runs=1
    )

    raw_jobs = config.get('raw_jobs_s3_paths', {})
    if not raw_jobs:
        return dag
    if 'VA' not in raw_jobs:
        return dag

    va_bucket, va_prefix = split_s3_path(raw_jobs['VA'])
    PartnerUpdateOperator(
        task_id='va_jobs_update',
        dag=dag,
        sources=[
            'http://opendata.cs.vt.edu/dataset/2002e48d-363e-40d1-9d1b-a134301126a7/resource/d40efa75-ed86-4a01-9854-90d27539d477/download/joblistings.merged.parsed.unique.grpbyyear.2010-2015.01.json',
            'http://opendata.cs.vt.edu/dataset/2002e48d-363e-40d1-9d1b-a134301126a7/resource/c7d4d3e6-61fb-4985-920d-7ac20732083d/download/joblistings.merged.parsed.unique.grpbyyear.2010-2015.02.json',
            'http://opendata.cs.vt.edu/dataset/2002e48d-363e-40d1-9d1b-a134301126a7/resource/638255b0-cd2f-4b34-8abb-cf46f075bdfd/download/joblistings.merged.parsed.unique.grpbyyear.2010-2015.03.json',
            'http://opendata.cs.vt.edu/dataset/2002e48d-363e-40d1-9d1b-a134301126a7/resource/7e14bb60-2474-420b-ae7b-62b195051f1f/download/joblistings.merged.parsed.unique.grpbyyear.2010-2015.04.json',
            'http://opendata.cs.vt.edu/dataset/b67a5b8e-679a-4442-a8c8-4bb55a4618d6/resource/62da570a-6970-46de-b206-dab067ba51eb/download/joblistings.merged.parsed.unique.grpbyyear.2016.json'
        ],
        output_bucket=va_bucket,
        output_prefix=va_prefix,
        cache_headers=['Content-Range'],
    )
    return dag
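Every example in this section calls split_s3_path to separate an S3 bucket name from the rest of the key prefix. That helper is not shown here; the following is a minimal sketch of what it likely looks like, inferred from how its return values are used (the real implementation may differ, for example by stripping an s3:// scheme):

# Assumed helper, reconstructed from its call sites; not the repository's verbatim code
def split_s3_path(s3_path):
    """Split a 'bucket/prefix/...' style path into (bucket_name, prefix)."""
    parts = s3_path.split('/')
    return parts[0], '/'.join(parts[1:])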
def save(self, s3_conn, s3_prefix):
    """Save stats to S3, including percentages

    Args:
        s3_conn (boto.Connection) - an s3 connection
        s3_prefix (str) - s3 path (including bucket) to save dataset stats
    """
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = s3_conn.get_bucket(bucket_name)
    for field_name, counts in self.accumulator.items():
        # csv.writer needs a text-mode buffer under Python 3
        output = StringIO()
        writer = csv.writer(output)
        for value, count in counts.most_common():
            writer.writerow([value, count])
        key = boto.s3.key.Key(
            bucket=bucket,
            name='{}/{}/{}/{}.csv'.format(
                prefix,
                self.directory,
                self.quarter,
                field_name
            )
        )
        logging.info('Writing stats to %s', key)
        output.seek(0)
        key.set_contents_from_string(output.getvalue())
def generate_job_postings_from_s3(
        s3_conn,
        s3_prefix: Text,
) -> JobPostingGeneratorType:
    """Stream all job listings from s3

    Args:
        s3_conn: a boto s3 connection
        s3_prefix: path to the job listings

    Yields:
        a dict for each job listing, parsed from its JSON representation.
            Refer to sample_job_listing.json for example structure
    """
    retrier = Retrying(
        retry_on_exception=retry_if_io_error,
        wait_exponential_multiplier=100,
        wait_exponential_max=100000
    )
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = s3_conn.get_bucket(bucket_name)
    keys = bucket.list(prefix=prefix)
    for key in keys:
        logging.info('Extracting job postings from key {}'.format(key.name))
        with BytesIO() as outfile:
            retrier.call(key.get_contents_to_file, outfile, cb=log_download_progress)
            outfile.seek(0)
            for line in outfile:
                yield json.loads(line.decode('utf-8'))
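A minimal usage sketch, assuming a boto (version 2) connection; the bucket and prefix below are placeholders, not paths from the project:

# Hypothetical usage: count postings under a prefix
import boto

s3_conn = boto.connect_s3()
postings = generate_job_postings_from_s3(s3_conn, 'my-bucket/job_postings/2015Q1')
total = sum(1 for _ in postings)  # each yielded item is an already-parsed posting dict
print('postings found:', total)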
def _save(self, s3_prefix):
    """Save stats to S3, including percentages"""
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = self.s3_conn.get_bucket(bucket_name)
    self._compute_percentages()
    self.stats['last_updated'] = datetime.now().isoformat()
    key = self._key(bucket, prefix)
    key.set_contents_from_string(json.dumps(self.stats))
def define_partner_etl(main_dag_name):
    dag = QuarterlySubDAG(main_dag_name, 'partner_etl')

    raw_jobs = config.get('raw_jobs_s3_paths', {})
    if not raw_jobs:
        return dag

    bucket, prefix = split_s3_path(config['job_postings']['s3_path'])

    partner_stats = {}
    for partner_id, s3_path in raw_jobs.items():
        importer_class = importers.get(partner_id, None)
        if not importer_class:
            logging.warning('Importer for %s not found, skipping', partner_id)
            continue
        input_bucket, input_prefix = split_s3_path(s3_path)
        etl = PartnerETLOperator(
            task_id='{}_etl'.format(partner_id),
            dag=dag,
            transformer_class=importer_class,
            output_bucket=bucket,
            output_prefix=prefix,
            partner_id=partner_id,
            passthrough_kwargs={
                'bucket_name': input_bucket,
                'prefix': input_prefix,
            }
        )
        partner_stats[partner_id] = PartnerStatsAggregateOperator(
            task_id='{}_partner_agg'.format(partner_id),
            dag=dag,
            partner_id=partner_id
        )
        partner_stats[partner_id].set_upstream(etl)

    global_stats = GlobalStatsAggregateOperator(task_id='global_agg', dag=dag)
    for partner_stats_instance in partner_stats.values():
        global_stats.set_upstream(partner_stats_instance)

    return dag
def quarterly_posting_stats(s3_conn, stats_s3_path):
    bucket_name, prefix = split_s3_path(stats_s3_path)
    bucket = s3_conn.get_bucket(bucket_name)
    total = Counter()
    for key in bucket.list(
            prefix='{}/{}'.format(prefix, DatasetStatsCounter.directory)):
        quarter = key.name[-6:]
        stats = json.loads(key.get_contents_as_string().decode('utf-8'))
        total[quarter] += stats['total']
    return total
def download_with_prefix(s3_conn, s3_prefix, out_directory):
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = s3_conn.get_bucket(bucket_name)
    out_filenames = []
    for key in bucket.list(prefix=prefix):
        leaf_name = key.name.split('/')[-1]
        out_filename = os.path.join(out_directory, leaf_name)
        key.get_contents_to_filename(out_filename)
        out_filenames.append(out_filename)
    return out_filenames
def partners(s3_conn, s3_prefix):
    partners_list = []
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = s3_conn.get_bucket(bucket_name)
    for key in bucket.list(prefix='{}/{}'.format(
            prefix,
            DatasetStatsAggregator.directory)):
        stats = json.loads(key.get_contents_as_string().decode('utf-8'))
        if stats['total'] > 0:
            partner_id = key.name.split('/')[-1].split('.')[0]
            partners_list.append(partner_id)
    return partners_list
def _save(self, s3_prefix):
    """Save stats to S3, including percentages"""
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = self.s3_conn.get_bucket(bucket_name)
    self._compute_percentages()
    self.stats['last_updated'] = datetime.now().isoformat()
    key = boto.s3.key.Key(
        bucket=bucket,
        name='{}/{}/{}.json'.format(prefix, self.directory, self.dataset_id)
    )
    key.set_contents_from_string(json.dumps(self.stats))
def __init__(self, s3_conn, s3_path, cache_dir):
    """
    Args:
        s3_conn: a boto s3 connection
        s3_path: path to the onet directory
        cache_dir: directory to cache files
    """
    self.s3_conn = s3_conn
    self.cache_dir = cache_dir
    self.s3_path = s3_path
    self.bucket_name, self.prefix = split_s3_path(self.s3_path)
def test_iterate(self):
    """Test that records from all files are properly returned or excluded
    according to the given date range

    This is explicitly testing edge cases; under normal operation
    each file will just contain records for the specified quarter
    """
    bucket_name, prefix = split_s3_path(self.s3_prefix)
    bucket = self.connection.create_bucket(bucket_name)
    mock_data = {
        '2014Q4.gz': [
            {'dateacquired': '2014-12-15 00:00:00'},
            {'dateacquired': '2014-11-15 00:00:00'},
            {'dateacquired': '2015-01-15 00:00:00'},
        ],
        '2015Q1.gz': [
            {'dateacquired': '2014-12-15 00:00:00'},
            {'dateacquired': '2014-11-15 00:00:00'},
            {'dateacquired': '2015-01-15 00:00:00'},
        ]
    }
    for keyname, rows in mock_data.items():
        key = boto.s3.key.Key(
            bucket=bucket,
            name='{}/{}'.format(prefix, keyname)
        )
        stream = BytesIO()
        gzipfile = gzip.GzipFile(fileobj=stream, mode='w')
        gzipfile.write(b'dateacquired\n')
        for row in rows:
            gzipfile.write(row['dateacquired'].encode('utf-8'))
            gzipfile.write(b'\n')
        gzipfile.close()
        stream.seek(0)
        key.set_contents_from_file(stream)

    self.assert_num_postings_for_quarter('2015Q1', 1)
    self.assert_num_postings_for_quarter('2014Q4', 2)
    self.assert_num_postings_for_quarter('2014Q1', 0)
def execute(self, context):
    conn = S3Hook()
    input_bucket, input_prefix = split_s3_path(config['output_tables']['s3_path'])
    key = conn.get_key(
        '{}/{}'.format(input_prefix, titles_filename),
        bucket_name=input_bucket
    )
    text = key.get_contents_as_string().decode('utf-8')
    reader = csv.DictReader(io.StringIO(text), delimiter='\t')
    JobTitlesMasterIndexer(
        s3_conn=conn.get_conn(),
        es_client=basic_client(),
        job_title_generator=reader,
        alias_name=config['normalizer']['titles_master_index_name']
    ).replace()
def save(self, s3_conn, s3_prefix):
    """Save stats to S3, including percentages

    Args:
        s3_conn (boto.Connection) - an s3 connection
        s3_prefix (str) - s3 path (including bucket) to save dataset stats
    """
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = s3_conn.get_bucket(bucket_name)
    self._compute_percentages()
    self.stats['last_updated'] = datetime.now().isoformat()
    key = boto.s3.key.Key(
        bucket=bucket,
        name='{}/{}/{}_{}'.format(
            prefix,
            self.directory,
            self.dataset_id,
            self.quarter
        )
    )
    key.set_contents_from_string(json.dumps(self.stats))
def job_postings(s3_conn, quarter, s3_path, source="all"):
    """
    Stream all job listings from s3 for a given quarter

    Args:
        s3_conn: a boto s3 connection
        quarter: a string representing a quarter (2015Q1)
        s3_path: path to the job listings
        source: a source name ("nlx", "va", "cb"), "all", or a list of source names

    Yields:
        string in json format representing the next job listing
            Refer to sample_job_listing.json for example structure
    """
    retrier = Retrying(
        retry_on_exception=retry_if_io_error,
        wait_exponential_multiplier=100,
        wait_exponential_max=100000
    )
    bucket_name, prefix = split_s3_path(s3_path)
    bucket = s3_conn.get_bucket(bucket_name)
    if isinstance(source, str):
        if source.lower() == "all":
            keys = bucket.list(prefix='{}/{}'.format(prefix, quarter))
        else:
            keys = bucket.list(prefix='{}/{}/{}_'.format(prefix, quarter, source.upper()))
    elif isinstance(source, list):
        keys = []
        for s in source:
            keys.append(bucket.list(prefix='{}/{}/{}_'.format(prefix, quarter, s.upper())))
        keys = chain(*keys)

    for key in keys:
        logging.info('Extracting job postings from key {}'.format(key.name))
        with BytesIO() as outfile:
            retrier.call(key.get_contents_to_file, outfile, cb=log_download_progress)
            outfile.seek(0)
            for line in outfile:
                yield line.decode('utf-8')
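A brief usage sketch covering the three forms the source argument can take, per the branches above; the bucket, prefix, and quarter are placeholders:

# Hypothetical usage; 'my-bucket/postings' is not a real path from the project
import json
import boto

s3_conn = boto.connect_s3()

all_postings = job_postings(s3_conn, '2015Q1', 'my-bucket/postings')                    # every source
nlx_postings = job_postings(s3_conn, '2015Q1', 'my-bucket/postings', source='nlx')      # one source
va_cb_postings = job_postings(s3_conn, '2015Q1', 'my-bucket/postings', source=['va', 'cb'])  # list of sources

for line in nlx_postings:
    posting = json.loads(line)  # unlike generate_job_postings_from_s3, this yields JSON strings, not dicts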
def add_s3_content(s3_conn, key_data):
    for path, data in key_data.items():
        bucket_name, key_name = split_s3_path(path)
        bucket = s3_conn.create_bucket(bucket_name)
        key = boto.s3.key.Key(bucket=bucket, name=key_name)
        key.set_contents_from_string(data)
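The create_bucket call suggests this is a test helper run against a mocked S3. A usage sketch under that assumption, using the moto library (older moto releases provide a mock_s3 decorator that works with boto 2); the bucket, key, and payload are placeholders:

# Hypothetical test setup; assumes moto's mock_s3 is available alongside boto 2
from moto import mock_s3
import boto

@mock_s3
def test_with_fake_postings():
    s3_conn = boto.connect_s3()
    add_s3_content(s3_conn, {
        'test-bucket/postings/2015Q1/example.json': '{"title": "Example Posting"}',
    })
    # ...exercise code that reads from 'test-bucket/postings' here...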
def _iterate_keys(self, s3_prefix):
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = self.s3_conn.get_bucket(bucket_name)
    for key in bucket.list(
            prefix='{}/quarterly/{}_'.format(prefix, self.dataset_id)):
        yield key
def _iterate_keys(self, s3_prefix):
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = self.s3_conn.get_bucket(bucket_name)
    for key in bucket.list(prefix='{}/dataset_summaries/'.format(prefix)):
        yield key
def _load(self, s3_prefix):
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = self.s3_conn.get_bucket(bucket_name)
    key = self._key(bucket, prefix)
    self.stats = json.loads(key.get_contents_as_string().decode('utf-8'))