def execute(self, context):
    def make_request(conn_id, endpoint, payload=None):
        return ChargifyHook(conn_id).run(endpoint, payload).json()

    output = []
    final_payload = {'per_page': 200, 'page': 1}
    for param in self.payload:
        final_payload[param] = self.payload[param]

    response = make_request(self.chargify_conn_id, self.endpoint, final_payload)
    while response:
        output.extend(response)
        final_payload['page'] += 1
        response = make_request(self.chargify_conn_id,
                                self.endpoint,
                                final_payload)
        logging.info('Retrieved: ' +
                     str(final_payload['per_page'] * final_payload['page']))

    output = [record[self.endpoint[:-1]] for record in output]
    output = '\n'.join([json.dumps(flatten(record)) for record in output])

    s3 = S3Hook(s3_conn_id=self.s3_conn_id)
    s3.load_string(string_data=output,
                   bucket_name=self.s3_bucket,
                   key=self.s3_key,
                   replace=True)
    s3.connection.close()
def execute(self, context):
    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    with tempfile.TemporaryDirectory() as temp_dir:
        job_postings_generator = job_postings_highmem(
            s3_conn,
            quarter,
            config['job_postings']['s3_path']
        )
        geo_querier = JobCBSAFromGeocodeQuerier(
            cbsa_results=S3CachedCBSAFinder(
                s3_conn=s3_conn,
                cache_s3_path=config['cbsa_lookup']['s3_path']
            ).all_cached_cbsa_results
        )
        logging.basicConfig(
            format='%(asctime)s %(process)d %(levelname)s: %(message)s')
        with Pool(processes=config['aggregation']['n_processes']) as pool:
            try:
                it = self.map(
                    pool=pool,
                    job_postings_generator=job_postings_generator,
                    geo_querier=geo_querier,
                    temp_dir=temp_dir
                )
                combined_agg = self.reduce(it)
            except Exception:
                logging.error("Child error: {}".format(traceback.format_exc()))
                raise
        self.save(combined_agg, quarter, s3_conn)
def execute(self, context):
    self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
    logging.info("Downloading S3 file")
    if self.wildcard_match:
        if not self.s3.check_for_wildcard_key(self.s3_key):
            raise AirflowException("No key matches {0}".format(self.s3_key))
        s3_key_object = self.s3.get_wildcard_key(self.s3_key)
    else:
        if not self.s3.check_for_key(self.s3_key):
            raise AirflowException(
                "The key {0} does not exist".format(self.s3_key))
        s3_key_object = self.s3.get_key(self.s3_key)
    with NamedTemporaryFile("w") as f:
        logging.info("Dumping S3 key {0} contents to local"
                     " file {1}".format(s3_key_object.key, f.name))
        s3_key_object.get_contents_to_file(f)
        f.flush()
        self.s3.connection.close()
        if not self.headers:
            logging.info("Loading file into Hive")
            self.hive.load_file(
                f.name,
                self.hive_table,
                field_dict=self.field_dict,
                create=self.create,
                partition=self.partition,
                delimiter=self.delimiter,
                recreate=self.recreate)
        else:
            with open(f.name, 'r') as tmpf:
                if self.check_headers:
                    header_l = tmpf.readline()
                    header_line = header_l.rstrip()
                    header_list = header_line.split(self.delimiter)
                    field_names = list(self.field_dict.keys())
                    test_field_match = [h1.lower() == h2.lower()
                                        for h1, h2
                                        in zip(header_list, field_names)]
                    if not all(test_field_match):
                        logging.warning("Headers do not match field names. "
                                        "File headers:\n {header_list}\n"
                                        "Field names: \n {field_names}\n"
                                        "".format(**locals()))
                        raise AirflowException("Headers do not match the "
                                               "field_dict keys")
                with NamedTemporaryFile("w") as f_no_headers:
                    tmpf.seek(0)
                    next(tmpf)
                    for line in tmpf:
                        f_no_headers.write(line)
                    f_no_headers.flush()
                    logging.info("Loading file without headers into Hive")
                    self.hive.load_file(
                        f_no_headers.name,
                        self.hive_table,
                        field_dict=self.field_dict,
                        create=self.create,
                        partition=self.partition,
                        delimiter=self.delimiter,
                        recreate=self.recreate)
def classify_common(job_postings, aggregator_constructor, temp_dir,
                    processed_folder, phase_indices, download_folder):
    s3_conn = S3Hook().get_conn()
    corpus_creator = SimpleCorpusCreator()
    title_cleaner = partial(title_clean, phase_indices=phase_indices)
    common_classifier = Classifier(
        s3_conn=s3_conn,
        classifier_id='ann_0614',
        classify_kwargs={'mode': 'common'},
        temporary_directory=download_folder,
    )
    job_aggregators = {
        'soc_code_common': SocCodeAggregator(
            corpus_creator=corpus_creator,
            occupation_classifier=common_classifier,
            output_count=2,
            output_total=True
        )
    }
    aggregator = aggregator_constructor(job_aggregators=job_aggregators,
                                        title_cleaner=title_cleaner)
    aggregator.process_postings(job_postings)
    # Detach the classifier and corpus creator before the aggregator is saved
    aggregator.job_aggregators['soc_code_common'].occupation_classifier = None
    aggregator.job_aggregators['soc_code_common'].corpus_creator = None
    return save(aggregator, temp_dir)
def execute(self, context):
    response = self.get_data()
    response.columns = response.columns.map(boa.constrict)
    json_data = json.loads(response.to_json(orient='records'))
    schema_map = self.schemaMapping(json_data[0])

    s3 = S3Hook(s3_conn_id=self.s3_conn_id)

    if self.s3_key.endswith('.json'):
        split = path.splitext(self.s3_key)
        schema_key = '{0}_schema{1}'.format(split[0], split[1])

    results = [
        dict([boa.constrict(k), v] for k, v in i.items())
        for i in json_data
    ]
    results = '\n'.join([json.dumps(i) for i in results])

    s3.load_string(string_data=str(schema_map),
                   bucket_name=self.s3_bucket,
                   key=schema_key,
                   replace=True)
    s3.load_string(string_data=results,
                   bucket_name=self.s3_bucket,
                   key=self.s3_key,
                   replace=True)
    s3.connection.close()
def execute(self, context):
    postgres_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    s3_hook = S3Hook(aws_conn_id=self.s3_conn_id)
    res = self.query_db(self.query, postgres_hook)
    res.seek(0)
    s3_hook.load_file_obj(res,
                          key="egress/sources.airflow.csv",
                          bucket_name="demo-bucket-temp-977338899",
                          replace=True)
    return True
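The query_db helper used above is not shown; the sketch below is one hypothetical way it could be implemented, assuming the intent is to stream the query result into an in-memory CSV buffer that load_file_obj can upload. The method body, the COPY-based approach, and the CSV options are assumptions, not part of the original operator.

# Hypothetical sketch of the query_db helper referenced above (assumed, not
# taken from the original operator).
import io

def query_db(self, query, postgres_hook):
    # In-memory buffer that behaves like a file object for load_file_obj
    buf = io.BytesIO()
    conn = postgres_hook.get_conn()
    try:
        with conn.cursor() as cur:
            # psycopg2's copy_expert streams COPY output straight into the buffer
            cur.copy_expert(
                "COPY ({0}) TO STDOUT WITH CSV HEADER".format(query), buf)
    finally:
        conn.close()
    return buf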
def execute(self, context):
    conn = S3Hook().get_conn()
    stats_aggregator = DatasetStatsAggregator(
        dataset_id=self.partner_id,
        s3_conn=conn
    )
    stats_aggregator.run(config['partner_stats']['s3_path'])
def transform_py(**kwargs):
    s3 = kwargs.get('s3_conn_id', None)
    s3_key = kwargs.get('templates_dict').get('s3_key', None)
    transformed_key = kwargs.get('templates_dict').get('transformed_key', None)
    s3_bucket = kwargs.get('s3_bucket', None)

    hook = S3Hook(s3)
    (hook.get_key(s3_key, bucket_name=s3_bucket)
         .get_contents_to_filename('temp.csv'))

    df = pd.read_csv('temp.csv')
    records = json.loads(df.to_json(orient='records'))
    del df

    records = [unflatten_list(record) for record in records]
    records = '\n'.join([json.dumps(record) for record in records])

    hook.load_string(string_data=records,
                     key=transformed_key,
                     bucket_name=s3_bucket,
                     replace=True)
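One plausible way to wire transform_py into a DAG is a PythonOperator that passes the connection id and bucket through op_kwargs and the templated keys through templates_dict. This is a minimal sketch; the task id, key patterns, bucket, and dag object are placeholders, not taken from the original code.

# Hypothetical task wiring for transform_py (all values are placeholders).
from airflow.operators.python_operator import PythonOperator

transform_task = PythonOperator(
    task_id='transform_py',
    python_callable=transform_py,
    op_kwargs={'s3_conn_id': 's3_default', 's3_bucket': 'my-bucket'},
    templates_dict={
        's3_key': 'raw/{{ ds }}.csv',
        'transformed_key': 'transformed/{{ ds }}.json',
    },
    provide_context=True,  # Airflow 1.x: merges templates_dict into **kwargs
    dag=dag,  # assumed to be defined at module level
)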
def outputManager(self, context, output, key, bucket):
    if output is None or len(output) == 0:
        if self.total_output_files == 0:
            logging.info("No records pulled from Hubspot.")

            downstream_tasks = context['task'].get_flat_relatives(upstream=False)

            logging.info('Skipping downstream tasks...')
            logging.debug("Downstream task_ids %s", downstream_tasks)

            if downstream_tasks:
                self.skip(context['dag_run'],
                          context['ti'].execution_date,
                          downstream_tasks)
    else:
        logging.info('Logging {0} to S3...'.format(key))

        output = [flatten(e) for e in output]
        output = '\n'.join([json.dumps({boa.constrict(k): v
                                        for k, v in i.items()})
                            for i in output])

        s3 = S3Hook(self.s3_conn_id)
        s3.load_string(
            string_data=str(output),
            key=key,
            bucket_name=bucket,
            replace=True
        )
        s3.connection.close()

        self.total_output_files += 1
def __init__(self,
             s3_key,
             field_dict,
             hive_table,
             delimiter=',',
             create=True,
             recreate=False,
             partition=None,
             headers=False,
             check_headers=False,
             s3_conn_id='s3_default',
             hive_cli_conn_id='hive_cli_default',
             *args, **kwargs):
    super(S3ToHiveTransfer, self).__init__(*args, **kwargs)
    self.s3_key = s3_key
    self.field_dict = field_dict
    self.hive_table = hive_table
    self.delimiter = delimiter
    self.create = create
    self.recreate = recreate
    self.partition = partition
    self.headers = headers
    self.check_headers = check_headers
    self.hive = HiveCliHook(hive_cli_conn_id=hive_cli_conn_id)
    self.s3 = S3Hook(s3_conn_id=s3_conn_id)
def skill_aggregate(job_postings, aggregator_constructor, temp_dir,
                    processed_folder, phase_indices, download_folder):
    title_cleaner = partial(title_clean, phase_indices=phase_indices)

    skills_filename = '{}/skills_master_table.tsv'.format(processed_folder)

    if not os.path.isfile(skills_filename):
        download(
            s3_conn=S3Hook().get_conn(),
            out_filename=skills_filename,
            s3_path=config['output_tables']['s3_path'] +
            '/skills_master_table.tsv'
        )
    corpus_creator = SimpleCorpusCreator()
    job_aggregators = {
        'onet_skills': OccupationScopedSkillAggregator(
            corpus_creator=corpus_creator,
            skill_extractor=OccupationScopedSkillExtractor(
                skills_filename=skills_filename),
            output_count=10
        )
    }
    aggregator = aggregator_constructor(job_aggregators=job_aggregators,
                                        title_cleaner=title_cleaner)
    aggregator.process_postings(job_postings)
    # Detach the extractor and corpus creator before the aggregator is saved
    aggregator.job_aggregators['onet_skills'].skill_extractor = None
    aggregator.job_aggregators['onet_skills'].corpus_creator = None
    return save(aggregator, temp_dir)
def __init__(self,
             source_s3_key,
             dest_s3_key,
             transform_script,
             source_s3_conn_id='s3_default',
             dest_s3_conn_id='s3_default',
             replace=False,
             *args, **kwargs):
    super(S3FileTransformOperator, self).__init__(*args, **kwargs)
    self.source_s3_key = source_s3_key
    self.source_s3_conn_id = source_s3_conn_id
    self.dest_s3_key = dest_s3_key
    self.dest_s3_conn_id = dest_s3_conn_id
    self.replace = replace
    self.transform_script = transform_script
    self.source_s3 = S3Hook(s3_conn_id=source_s3_conn_id)
    self.dest_s3 = S3Hook(s3_conn_id=dest_s3_conn_id)
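Given the constructor above, a task instantiation could look like the sketch below; the task id, keys, script path, connection ids, and dag object are placeholders rather than values from the original code.

# Hypothetical instantiation of S3FileTransformOperator (values are placeholders).
transform_file = S3FileTransformOperator(
    task_id='transform_file',
    source_s3_key='s3://source-bucket/raw/data.csv',
    dest_s3_key='s3://dest-bucket/clean/data.csv',
    transform_script='/usr/local/bin/transform.py',
    source_s3_conn_id='s3_default',
    dest_s3_conn_id='s3_default',
    replace=True,
    dag=dag,  # assumed to be defined at module level
)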
def execute(self, context):
    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    output_folder = config.get('output_folder', 'output')
    if not os.path.isdir(output_folder):
        os.mkdir(output_folder)
    merge(s3_conn, self.group_config_key, quarter, output_folder)
    merge(s3_conn, self.rollup_config_key, quarter, output_folder)
def execute(self, context):
    conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    NormalizeTopNIndexer(
        quarter=quarter,
        job_postings_generator=partial(
            job_postings,
            s3_path=config['job_postings']['s3_path']),
        job_titles_index=config['normalizer']['titles_master_index_name'],
        alias_name=config['normalizer']['es_index_name'],
        s3_conn=conn,
        es_client=basic_client()
    ).append()
def __init__(self):
    remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
    try:
        from airflow.hooks import S3Hook
        self.hook = S3Hook(remote_conn_id)
    except Exception:
        self.hook = None
        logging.error(
            'Could not create an S3Hook with connection id "{}". '
            'Please make sure that airflow[s3] is installed and '
            'the S3 connection exists.'.format(remote_conn_id))
def poke(self, context):
    self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
    bucket, key = self.s3.parse_s3_url(self.s3_key)
    full_url = 's3://' + bucket + '/' + key
    logging.info('Poking for key : {full_url}'.format(**locals()))
    if self.s3.check_for_key(key, bucket):
        return True
    raise AirflowException(
        'Not present -- retry. If this is a test, then run the '
        'dependent job to fix the S3 hook issue.'
    )
def upload_to_s3(file_name):
    # Instantiate the hook
    s3_hook = S3Hook(aws_conn_id=S3_CONN_ID)

    # Create the local file (swap in your own name here)
    sample_file = "{0}_file_{1}.txt".format(name, file_name)
    example_file = open(sample_file, "w+")
    example_file.write("Putting some data in for task {0}".format(file_name))
    example_file.close()

    # Upload it to S3
    s3_hook.load_file(sample_file,
                      'globetelecom/{0}'.format(sample_file),
                      bucket_name=BUCKET,
                      replace=True)
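A minimal sketch of how a callable like upload_to_s3 might be attached to a DAG, assuming the module-level S3_CONN_ID, BUCKET, and name variables and a dag object already exist; the task id and the op_args value are placeholders.

# Hypothetical task wiring for upload_to_s3 (task id and argument are placeholders).
from airflow.operators.python_operator import PythonOperator

upload_task = PythonOperator(
    task_id='upload_to_s3_0',
    python_callable=upload_to_s3,
    op_args=['0'],  # passed as the file_name argument
    dag=dag,
)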
def execute(self, context):
    response = (BambooHRHook(self.bamboo_conn_id)
                .run(self.company_name,
                     self.methodMapping(self.method),
                     self.payload)).text
    results = json.loads(response)

    s3 = S3Hook(s3_conn_id=self.s3_conn_id)

    if self.s3_key.endswith('.json'):
        split = path.splitext(self.s3_key)
        schema_key = '{0}_schema{1}'.format(split[0], split[1])
def execute(self, context):
    conn = S3Hook().get_conn()
    title_extractor = OnetTitleExtractor(
        onet_source=OnetCache(
            conn,
            cache_dir=config['onet']['cache_dir'],
            s3_path=config['onet']['s3_path'],
        ),
        output_filename=titles_filename,
        hash_function=md5
    )
    title_extractor.run()
    upload(conn, titles_filename, config['output_tables']['s3_path'])
def execute(self, context):
    conn = S3Hook().get_conn()
    skill_extractor = OnetSkillImportanceExtractor(
        onet_source=OnetCache(
            conn,
            cache_dir=config['onet']['cache_dir'],
            s3_path=config['onet']['s3_path'],
        ),
        output_filename=skill_importance_filename,
        hash_function=md5
    )
    skill_extractor.run()
    upload(conn, skill_importance_filename, config['output_tables']['s3_path'])
def execute(self, context):
    source_s3 = S3Hook(s3_conn_id=self.source_s3_conn_id)
    dest_s3 = S3Hook(s3_conn_id=self.dest_s3_conn_id)
    logging.info("Downloading source S3 file {0}"
                 "".format(self.source_s3_key))
    if not source_s3.check_for_key(self.source_s3_key):
        raise AirflowException("The source key {0} does not exist"
                               "".format(self.source_s3_key))
    source_s3_key_object = source_s3.get_key(self.source_s3_key)
    with NamedTemporaryFile("w") as f_source, NamedTemporaryFile("w") as f_dest:
        logging.info("Dumping S3 file {0} contents to local file {1}"
                     "".format(self.source_s3_key, f_source.name))
        source_s3_key_object.get_contents_to_file(f_source)
        f_source.flush()
        source_s3.connection.close()
        transform_script_process = subprocess.Popen(
            [self.transform_script, f_source.name, f_dest.name],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (transform_script_stdoutdata,
         transform_script_stderrdata) = transform_script_process.communicate()
        logging.info("Transform script stdout " + transform_script_stdoutdata)
        if transform_script_process.returncode > 0:
            raise AirflowException("Transform script failed " +
                                   transform_script_stderrdata)
        else:
            logging.info("Transform script successful. "
                         "Output temporarily located at {0}"
                         "".format(f_dest.name))
        logging.info("Uploading transformed file to S3")
        f_dest.flush()
        dest_s3.load_file(filename=f_dest.name,
                          key=self.dest_s3_key,
                          replace=self.replace)
        logging.info("Upload successful")
        dest_s3.connection.close()
def execute(self, context):
    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    job_label_filename = 'tmp/job_label_train_' + quarter + '.csv'
    with open(job_label_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter=',')
        job_postings_generator = job_postings(
            s3_conn, quarter, config['job_postings']['s3_path'])
        corpus_generator = JobCategoryCorpusCreator().label_corpora(
            job_postings_generator)
        for label in corpus_generator:
            writer.writerow([label])
    logging.info('Done labeling job categories to %s', job_label_filename)
def execute(self, context):
    conn = S3Hook().get_conn()
    execution_date = context['execution_date']
    quarter = datetime_to_quarter(execution_date)
    if quarter != datetime_to_quarter(datetime.now()):
        logging.warning('PartnerSnapshotOperator cannot be backfilled. Skipping')
        return
    updater = self.updater_class(**(self.passthrough_kwargs))
    postings = updater.deduplicated_postings()
    upload_dict(
        s3_conn=conn,
        s3_prefix=self.s3_prefix + '/' + quarter,
        data_to_sync=postings
    )
def execute(self, context):
    s3_conn = S3Hook().get_conn()
    geocoder = S3CachedGeocoder(
        s3_conn=s3_conn,
        cache_s3_path=config['geocoder']['s3_path']
    )
    finder = S3CachedCBSAFinder(
        s3_conn=s3_conn,
        cache_s3_path=config['cbsa_lookup']['s3_path']
    )
    logging.info('Finding all CBSAs')
    finder.find_all_cbsas_and_save(geocoder.all_cached_geocodes)
    logging.info('Done finding CBSAs')
def execute(self, context):
    conn = S3Hook()
    input_bucket, input_prefix = split_s3_path(config['output_tables']['s3_path'])
    key = conn.get_key(
        '{}/{}'.format(input_prefix, titles_filename),
        bucket_name=input_bucket
    )
    text = key.get_contents_as_string().decode('utf-8')
    reader = csv.DictReader(io.StringIO(text), delimiter='\t')
    JobTitlesMasterIndexer(
        s3_conn=conn.get_conn(),
        es_client=basic_client(),
        job_title_generator=reader,
        alias_name=config['normalizer']['titles_master_index_name']
    ).replace()
def execute(self, context):
    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    job_postings_generator = job_postings_highmem(
        s3_conn,
        quarter,
        config['job_postings']['s3_path']
    )
    geocoder = S3CachedGeocoder(
        s3_conn=s3_conn,
        cache_s3_path=config['geocoder']['s3_path']
    )
    logging.info('Starting geocoding')
    geocoder.geocode_job_postings_and_save(job_postings_generator)
    logging.info('Done geocoding')
def execute(self, context):
    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    job_vector_filename = 'tmp/job_features_train_' + quarter + '.csv'
    with open(job_vector_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter=',')
        job_postings_generator = job_postings(
            s3_conn, quarter, config['job_postings']['s3_path'])
        corpus_generator = Doc2VecGensimCorpusCreator().array_corpora(
            job_postings_generator)
        vectorized_job_generator = Doc2Vectorizer(
            model_name='gensim_doc2vec',
            path=config['job_vectorizer_cache']['s3_path'],
            s3_conn=s3_conn).vectorize(corpus_generator)
        for vector in vectorized_job_generator:
            writer.writerow(vector)
    logging.info('Done vectorizing job postings to %s', job_vector_filename)
def execute(self, context):
    conn = S3Hook().get_conn()
    bucket = conn.get_bucket(self.output_bucket)
    for url in self.sources:
        name = url.split('/')[-1]
        r = requests.get(url, stream=True)

        # Check the remote headers against the stored headers
        cache_dict = {k: r.headers[k] for k in self.cache_headers}
        cache_key = boto.s3.key.Key(
            bucket=bucket,
            name='{}/{}/.cache.json'.format(self.output_prefix, name)
        )
        if cache_key.exists():
            logging.info("Checking %s for updates", name)
            stored_cache = json.loads(cache_key.get_contents_as_string())
            if cache_dict == stored_cache:
                logging.info("Skipping %s", name)
                continue

        logging.info("Downloading %s", name)

        # Cached headers differ; delete all existing data for this source
        for key in bucket.list(
            prefix="{}/{}/".format(self.output_prefix, name)
        ):
            key.delete()
        cache_key.set_contents_from_string(json.dumps(cache_dict))

        for batch in Batch(r.iter_lines(), self.postings_per_file):
            key = boto.s3.key.Key(
                bucket=bucket,
                name='{}/{}/{}'.format(
                    self.output_prefix,
                    name,
                    str(uuid.uuid4()) + ".json"
                )
            )
            with tempfile.TemporaryFile() as f:
                for posting in batch:
                    f.write(posting)
                    f.write(b'\n')  # iter_lines() yields bytes; keep the file binary
                f.seek(0)
                key.set_contents_from_file(f)
def upload_to_s3(file_name, **context):
    execution_date = context['ds']

    # Instantiate the hook
    s3_hook = S3Hook(aws_conn_id=S3_CONN_ID)

    # Create the local file
    sample_file = "file_{0}_{1}.txt".format(file_name, execution_date)
    example_file = open(sample_file, "w+")
    example_file.write("Putting some data in for task {0}".format(file_name))
    example_file.close()

    # Upload it to S3
    s3_hook.load_file(sample_file,
                      'workshop/{0}'.format(sample_file),
                      bucket_name=BUCKET,
                      replace=True)
def execute(self, context):
    conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    stats_counter = DatasetStatsCounter(
        quarter=quarter,
        dataset_id=self.partner_id
    )
    transformer = self.transformer_class(
        s3_conn=conn,
        partner_id=self.partner_id,
        onet_cache=OnetCache(
            s3_conn=conn,
            cache_dir=config['onet']['cache_dir'],
            s3_path=config['onet']['s3_path'],
        ),
        **self.passthrough_kwargs
    )
    self.clear_old_postings(conn, quarter)
    for batch in Batch(
        transformer.postings(quarter, stats_counter),
        self.postings_per_file
    ):
        logging.info('Processing new batch')
        with tempfile.TemporaryFile(mode='w+') as f:
            for posting in batch:
                f.write(json.dumps(posting))
                f.write('\n')
            logging.debug('New batch written, commencing upload')
            bucket = conn.get_bucket(self.output_bucket)
            key = boto.s3.key.Key(
                bucket=bucket,
                name='{}/{}/{}_{}'.format(self.output_prefix,
                                          quarter,
                                          self.partner_id,
                                          uuid.uuid4())
            )
            f.seek(0)
            key.set_contents_from_string(f.read())
            logging.debug('Batch upload complete')
    stats_counter.save(
        s3_conn=conn,
        s3_prefix=config['partner_stats']['s3_path']
    )