Example #1
    def execute(self, context):
        def make_request(conn_id, endpoint, payload=None):
            return (ChargifyHook(conn_id).run(endpoint, payload).json())

        output = []
        final_payload = {'per_page': 200, 'page': 1}
        for param in self.payload:
            final_payload[param] = self.payload[param]

        response = make_request(self.chargify_conn_id, self.endpoint,
                                final_payload)
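        # Keep requesting the next page until the API returns an empty response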
        while response:
            output.extend(response)
            final_payload['page'] += 1
            response = make_request(self.chargify_conn_id, self.endpoint,
                                    final_payload)

            logging.info('Retrieved: ' + str(final_payload['per_page'] *
                                             final_payload['page']))

        output = [record[self.endpoint[:-1]] for record in output]
        output = '\n'.join([json.dumps(flatten(record)) for record in output])

        s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        s3.load_string(string_data=output,
                       bucket_name=self.s3_bucket,
                       key=self.s3_key,
                       replace=True)
        s3.connection.close()
Example #2
    def execute(self, context):
        s3_conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])

        with tempfile.TemporaryDirectory() as temp_dir:
            job_postings_generator = job_postings_highmem(
                s3_conn, quarter, config['job_postings']['s3_path'])
            geo_querier = JobCBSAFromGeocodeQuerier(
                cbsa_results=S3CachedCBSAFinder(
                    s3_conn=s3_conn,
                    cache_s3_path=config['cbsa_lookup']
                    ['s3_path']).all_cached_cbsa_results)

            logging.basicConfig(
                format='%(asctime)s %(process)d %(levelname)s: %(message)s')
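            # Map the quarter's postings across a worker pool, then reduce the per-worker results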
            with Pool(processes=config['aggregation']['n_processes']) as pool:
                try:
                    it = self.map(
                        pool=pool,
                        job_postings_generator=job_postings_generator,
                        geo_querier=geo_querier,
                        temp_dir=temp_dir)
                    combined_agg = self.reduce(it)
                except Exception as e:
                    logging.error("Child error: {}".format(
                        traceback.format_exc()))
                    raise
            self.save(combined_agg, quarter, s3_conn)
Example #3
 def execute(self, context):
     self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
     self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
     logging.info("Downloading S3 file")
     if self.wildcard_match:
         if not self.s3.check_for_wildcard_key(self.s3_key):
             raise AirflowException("No key matches {0}".format(self.s3_key))
         s3_key_object = self.s3.get_wildcard_key(self.s3_key)
     else:
         if not self.s3.check_for_key(self.s3_key):
             raise AirflowException(
                 "The key {0} does not exists".format(self.s3_key))
         s3_key_object = self.s3.get_key(self.s3_key)
     with NamedTemporaryFile("w") as f:
         logging.info("Dumping S3 key {0} contents to local"
                      " file {1}".format(s3_key_object.key, f.name))
         s3_key_object.get_contents_to_file(f)
         f.flush()
         self.s3.connection.close()
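         # If the file carries a header row, optionally validate it and strip it before loading into Hive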
         if not self.headers:
             logging.info("Loading file into Hive")
             self.hive.load_file(
                 f.name,
                 self.hive_table,
                 field_dict=self.field_dict,
                 create=self.create,
                 partition=self.partition,
                 delimiter=self.delimiter,
                 recreate=self.recreate)
         else:
             with open(f.name, 'r') as tmpf:
                 if self.check_headers:
                     header_l = tmpf.readline()
                     header_line = header_l.rstrip()
                     header_list = header_line.split(self.delimiter)
                     field_names = list(self.field_dict.keys())
                     test_field_match = [h1.lower() == h2.lower() for h1, h2
                                         in zip(header_list, field_names)]
                     if not all(test_field_match):
                         logging.warning("Headers do not match field names"
                                         "File headers:\n {header_list}\n"
                                         "Field names: \n {field_names}\n"
                                         "".format(**locals()))
                         raise AirflowException("Headers do not match the "
                                         "field_dict keys")
                 with NamedTemporaryFile("w") as f_no_headers:
                     tmpf.seek(0)
                     next(tmpf)
                     for line in tmpf:
                         f_no_headers.write(line)
                     f_no_headers.flush()
                     logging.info("Loading file without headers into Hive")
                     self.hive.load_file(
                         f_no_headers.name,
                         self.hive_table,
                         field_dict=self.field_dict,
                         create=self.create,
                         partition=self.partition,
                         delimiter=self.delimiter,
                         recreate=self.recreate)
Example #4
def classify_common(job_postings, aggregator_constructor, temp_dir,
                    processed_folder, phase_indices, download_folder):
    s3_conn = S3Hook().get_conn()
    corpus_creator = SimpleCorpusCreator()
    title_cleaner = partial(title_clean, phase_indices=phase_indices)

    common_classifier = Classifier(
        s3_conn=s3_conn,
        classifier_id='ann_0614',
        classify_kwargs={'mode': 'common'},
        temporary_directory=download_folder,
    )
    job_aggregators = {
        'soc_code_common':
        SocCodeAggregator(corpus_creator=corpus_creator,
                          occupation_classifier=common_classifier,
                          output_count=2,
                          output_total=True)
    }

    aggregator = aggregator_constructor(job_aggregators=job_aggregators,
                                        title_cleaner=title_cleaner)

    aggregator.process_postings(job_postings)
    aggregator.job_aggregators['soc_code_common'].occupation_classifier = None
    aggregator.job_aggregators['soc_code_common'].corpus_creator = None
    return save(
        aggregator,
        temp_dir,
    )
Example #5
    def execute(self, context):

        response = self.get_data()
        response.columns = response.columns.map(boa.constrict)

        json_data = json.loads(response.to_json(orient='records'))
        schema_map = self.schemaMapping(json_data[0])

        s3 = S3Hook(s3_conn_id=self.s3_conn_id)

        if self.s3_key.endswith('.json'):
            split = path.splitext(self.s3_key)
            schema_key = '{0}_schema{1}'.format(split[0], split[1])

        results = [
            dict([boa.constrict(k), v] for k, v in i.items())
            for i in json_data
        ]
        results = '\n'.join([json.dumps(i) for i in results])

        s3.load_string(string_data=str(schema_map),
                       bucket_name=self.s3_bucket,
                       key=schema_key,
                       replace=True)

        s3.load_string(string_data=results,
                       bucket_name=self.s3_bucket,
                       key=self.s3_key,
                       replace=True)
        s3.connection.close()
Example #6
 def execute(self, context):
     postgres_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
     s3_hook = S3Hook(aws_conn_id=self.s3_conn_id)
     res = self.query_db(self.query, postgres_hook)
     res.seek(0)
     s3_hook.load_file_obj(res, key="egress/sources.airflow.csv", bucket_name="demo-bucket-temp-977338899", replace=True)
     return True
Example #7
 def execute(self, context):
     conn = S3Hook().get_conn()
     stats_aggregator = DatasetStatsAggregator(
         dataset_id=self.partner_id,
         s3_conn=conn
     )
     stats_aggregator.run(config['partner_stats']['s3_path'])
Example #8
def transform_py(**kwargs):
    s3 = kwargs.get('s3_conn_id', None)
    s3_key = kwargs.get('templates_dict').get('s3_key', None)
    transformed_key = kwargs.get('templates_dict').get('transformed_key', None)

    s3_bucket = kwargs.get('s3_bucket', None)
    hook = S3Hook(s3)

    (hook.get_key(s3_key,
                  bucket_name=s3_bucket)
         .get_contents_to_filename('temp.csv'))

    df = pd.read_csv('temp.csv')

    records = json.loads(df.to_json(orient='records'))
    del df

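    # Re-nest the flattened column names and serialize the records as newline-delimited JSON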
    records = [unflatten_list(record) for record in records]

    records = '\n'.join([json.dumps(record) for record in records])

    hook.load_string(string_data=records,
                     key=transformed_key,
                     bucket_name=s3_bucket,
                     replace=True)
Example #9
    def outputManager(self, context, output, key, bucket):
        if output is None or len(output) == 0:
            if self.total_output_files == 0:
                logging.info("No records pulled from Hubspot.")

                downstream_tasks = context['task'].get_flat_relatives(upstream=False)

                logging.info('Skipping downstream tasks...')
                logging.debug("Downstream task_ids %s", downstream_tasks)

                if downstream_tasks:
                    self.skip(context['dag_run'],
                              context['ti'].execution_date,
                              downstream_tasks)
        else:
            logging.info('Logging {0} to S3...'.format(key))

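            # Flatten each record, normalize its keys, and serialize everything as newline-delimited JSON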
            output = [flatten(e) for e in output]
            output = '\n'.join([json.dumps({boa.constrict(k): v
                               for k, v in i.items()}) for i in output])

            s3 = S3Hook(self.s3_conn_id)
            s3.load_string(
                string_data=str(output),
                key=key,
                bucket_name=bucket,
                replace=True
            )
            s3.connection.close()

            self.total_output_files += 1
Example #10
 def __init__(self,
              s3_key,
              field_dict,
              hive_table,
              delimiter=',',
              create=True,
              recreate=False,
              partition=None,
              headers=False,
              check_headers=False,
              s3_conn_id='s3_default',
              hive_cli_conn_id='hive_cli_default',
              *args,
              **kwargs):
     super(S3ToHiveTransfer, self).__init__(*args, **kwargs)
     self.s3_key = s3_key
     self.field_dict = field_dict
     self.hive_table = hive_table
     self.delimiter = delimiter
     self.create = create
     self.recreate = recreate
     self.partition = partition
     self.headers = headers
     self.check_headers = check_headers
     self.hive = HiveCliHook(hive_cli_conn_id=hive_cli_conn_id)
     self.s3 = S3Hook(s3_conn_id=s3_conn_id)
Example #11
def skill_aggregate(job_postings, aggregator_constructor, temp_dir,
                    processed_folder, phase_indices, download_folder):
    title_cleaner = partial(title_clean, phase_indices=phase_indices)

    skills_filename = '{}/skills_master_table.tsv'\
        .format(processed_folder)

    if not os.path.isfile(skills_filename):
        download(s3_conn=S3Hook().get_conn(),
                 out_filename=skills_filename,
                 s3_path=config['output_tables']['s3_path'] +
                 '/skills_master_table.tsv')
    corpus_creator = SimpleCorpusCreator()
    job_aggregators = {
        'onet_skills':
        OccupationScopedSkillAggregator(
            corpus_creator=corpus_creator,
            skill_extractor=OccupationScopedSkillExtractor(
                skills_filename=skills_filename),
            output_count=10)
    }
    aggregator = aggregator_constructor(job_aggregators=job_aggregators,
                                        title_cleaner=title_cleaner)
    aggregator.process_postings(job_postings)
    aggregator.job_aggregators['onet_skills'].skill_extractor = None
    aggregator.job_aggregators['onet_skills'].corpus_creator = None
    return save(
        aggregator,
        temp_dir,
    )
Example #12
 def __init__(self,
              source_s3_key,
              dest_s3_key,
              transform_script,
              source_s3_conn_id='s3_default',
              dest_s3_conn_id='s3_default',
              replace=False,
              *args,
              **kwargs):
     super(S3FileTransformOperator, self).__init__(*args, **kwargs)
     self.source_s3_key = source_s3_key
     self.source_s3_conn_id = source_s3_conn_id
     self.dest_s3_key = dest_s3_key
     self.dest_s3_conn_id = dest_s3_conn_id
     self.replace = replace
     self.transform_script = transform_script
     self.source_s3 = S3Hook(s3_conn_id=source_s3_conn_id)
     self.dest_s3 = S3Hook(s3_conn_id=dest_s3_conn_id)
Example #13
    def execute(self, context):
        s3_conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])
        output_folder = config.get('output_folder', 'output')
        if not os.path.isdir(output_folder):
            os.mkdir(output_folder)

        merge(s3_conn, self.group_config_key, quarter, output_folder)
        merge(s3_conn, self.rollup_config_key, quarter, output_folder)
Example #14
 def execute(self, context):
     conn = S3Hook().get_conn()
     quarter = datetime_to_quarter(context['execution_date'])
     NormalizeTopNIndexer(
         quarter=quarter,
         job_postings_generator=partial(job_postings, s3_path=config['job_postings']['s3_path']),
         job_titles_index=config['normalizer']['titles_master_index_name'],
         alias_name=config['normalizer']['es_index_name'],
         s3_conn=conn,
         es_client=basic_client()
     ).append()
Example #15
 def __init__(self):
     remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
     try:
         from airflow.hooks import S3Hook
         self.hook = S3Hook(remote_conn_id)
     except:
         self.hook = None
         logging.error(
             'Could not create an S3Hook with connection id "{}". '
             'Please make sure that airflow[s3] is installed and '
             'the S3 connection exists.'.format(remote_conn_id))
Example #16
    def poke(self, context):
        self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        bucket, key = self.s3.parse_s3_url(self.s3_key)
        full_url = 's3://' + bucket + '/' + key

        logging.info('Poking for key : {full_url}'.format(**locals()))
        if self.s3.check_for_key(key, bucket):
            return True

        raise AirflowException(
            'Not present -- retry. If this is a test, then run the dependent job to fix the S3 hook issue.'
        )
Example #17
def upload_to_s3(file_name):

    # Instantiate the hook
    s3_hook = S3Hook(aws_conn_id=S3_CONN_ID)

    # Create a sample file (swap your own name in here)
    sample_file = "{0}_file_{1}.txt".format(name, file_name)
    example_file = open(sample_file, "w+")
    example_file.write("Putting some data in for task {0}".format(file_name))
    example_file.close()

    s3_hook.load_file(sample_file,
                      'globetelecom/{0}'.format(sample_file),
                      bucket_name=BUCKET,
                      replace=True)
Example #18
    def execute(self, context):
        response = (BambooHRHook(self.bamboo_conn_id)
                    .run(self.company_name,
                         self.methodMapping(self.method),
                         self.payload)).text

        results = json.loads(response)
        s3 = S3Hook(s3_conn_id=self.s3_conn_id)

        if self.s3_key.endswith('.json'):
            split = path.splitext(self.s3_key)
            schema_key = '{0}_schema{1}'.format(split[0], split[1])
Example #19
 def execute(self, context):
     conn = S3Hook().get_conn()
     title_extractor = OnetTitleExtractor(
         onet_source=OnetCache(
             conn,
             cache_dir=config['onet']['cache_dir'],
             s3_path=config['onet']['s3_path'],
         ),
         output_filename=titles_filename,
         hash_function=md5
     )
     title_extractor.run()
     upload(conn, titles_filename, config['output_tables']['s3_path'])
Example #20
 def execute(self, context):
     conn = S3Hook().get_conn()
     skill_extractor = OnetSkillImportanceExtractor(
         onet_source=OnetCache(
             conn,
             cache_dir=config['onet']['cache_dir'],
             s3_path=config['onet']['s3_path'],
         ),
         output_filename=skill_importance_filename,
         hash_function=md5
     )
     skill_extractor.run()
     upload(conn, skill_importance_filename, config['output_tables']['s3_path'])
Example #21
 def execute(self, context):
     source_s3 = S3Hook(s3_conn_id=self.source_s3_conn_id)
     dest_s3 = S3Hook(s3_conn_id=self.dest_s3_conn_id)
     logging.info("Downloading source S3 file {0}"
                  "".format(self.source_s3_key))
     if not source_s3.check_for_key(self.source_s3_key):
         raise AirflowException("The source key {0} does not exist"
                                "".format(self.source_s3_key))
     source_s3_key_object = source_s3.get_key(self.source_s3_key)
     with NamedTemporaryFile("w") as f_source, NamedTemporaryFile(
             "w") as f_dest:
         logging.info("Dumping S3 file {0} contents to local file {1}"
                      "".format(self.source_s3_key, f_source.name))
         source_s3_key_object.get_contents_to_file(f_source)
         f_source.flush()
         source_s3.connection.close()
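         # Run the transform script against the downloaded file, writing its output to a second temp file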
         transform_script_process = subprocess.Popen(
             [self.transform_script, f_source.name, f_dest.name],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE)
         (transform_script_stdoutdata, transform_script_stderrdata
          ) = transform_script_process.communicate()
         logging.info("Transform script stdout "
                      "" + transform_script_stdoutdata)
         if transform_script_process.returncode > 0:
             raise AirflowException("Transform script failed "
                                    "" + transform_script_stderrdata)
         else:
             logging.info("Transform script successful."
                          "Output temporarily located at {0}"
                          "".format(f_dest.name))
         logging.info("Uploading transformed file to S3")
         f_dest.flush()
         dest_s3.load_file(filename=f_dest.name,
                           key=self.dest_s3_key,
                           replace=self.replace)
         logging.info("Upload successful")
         dest_s3.connection.close()
Example #22
 def execute(self, context):
     s3_conn = S3Hook().get_conn()
     quarter = datetime_to_quarter(context['execution_date'])
     job_label_filename = 'tmp/job_label_train_' + quarter + '.csv'
     with open(job_label_filename, 'w') as outfile:
         writer = csv.writer(outfile, delimiter=',')
         job_postings_generator = job_postings(
             s3_conn, quarter, config['job_postings']['s3_path'])
         corpus_generator = JobCategoryCorpusCreator().label_corpora(
             job_postings_generator)
         for label in corpus_generator:
             writer.writerow([label])
     logging.info('Done labeling job categories to %s',
                  job_label_filename)
Example #23
 def execute(self, context):
     conn = S3Hook().get_conn()
     execution_date = context['execution_date']
     quarter = datetime_to_quarter(execution_date)
     if quarter != datetime_to_quarter(datetime.now()):
         logging.warning('PartnerSnapshotOperator cannot be backfilled. Skipping')
         return
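     # Pull and deduplicate the partner's postings, then sync them to S3 under the current quarter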
     updater = self.updater_class(**(self.passthrough_kwargs))
     postings = updater.deduplicated_postings()
     upload_dict(
         s3_conn=conn,
         s3_prefix=self.s3_prefix + '/' + quarter,
         data_to_sync=postings
     )
Example #24
    def execute(self, context):
        s3_conn = S3Hook().get_conn()

        geocoder = S3CachedGeocoder(
            s3_conn=s3_conn,
            cache_s3_path=config['geocoder']['s3_path']
        )
        finder = S3CachedCBSAFinder(
            s3_conn=s3_conn,
            cache_s3_path=config['cbsa_lookup']['s3_path']
        )
        logging.info('Finding all CBSAs')
        finder.find_all_cbsas_and_save(geocoder.all_cached_geocodes)
        logging.info('Done finding CBSAs')
Example #25
 def execute(self, context):
     conn = S3Hook()
     input_bucket, input_prefix = split_s3_path(config['output_tables']['s3_path'])
     key = conn.get_key(
         '{}/{}'.format(input_prefix, titles_filename),
         bucket_name=input_bucket
     )
     text = key.get_contents_as_string().decode('utf-8')
     reader = csv.DictReader(io.StringIO(text), delimiter='\t')
     JobTitlesMasterIndexer(
         s3_conn=conn.get_conn(),
         es_client=basic_client(),
         job_title_generator=reader,
         alias_name=config['normalizer']['titles_master_index_name']
     ).replace()
Example #26
    def execute(self, context):
        s3_conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])

        job_postings_generator = job_postings_highmem(
            s3_conn,
            quarter,
            config['job_postings']['s3_path']
        )

        geocoder = S3CachedGeocoder(
            s3_conn=s3_conn,
            cache_s3_path=config['geocoder']['s3_path']
        )
        logging.info('Starting geocoding')
        geocoder.geocode_job_postings_and_save(job_postings_generator)
        logging.info('Done geocoding')
Example #27
 def execute(self, context):
     s3_conn = S3Hook().get_conn()
     quarter = datetime_to_quarter(context['execution_date'])
     job_vector_filename = 'tmp/job_features_train_' + quarter + '.csv'
     with open(job_vector_filename, 'w') as outfile:
         writer = csv.writer(outfile, delimiter=',')
         job_postings_generator = job_postings(
             s3_conn, quarter, config['job_postings']['s3_path'])
         corpus_generator = Doc2VecGensimCorpusCreator().array_corpora(
             job_postings_generator)
         vectorized_job_generator = Doc2Vectorizer(
             model_name='gensim_doc2vec',
             path=config['job_vectorizer_cache']['s3_path'],
             s3_conn=s3_conn).vectorize(corpus_generator)
         for vector in vectorized_job_generator:
             writer.writerow(vector)
     logging.info('Done vectorizing job postings to %s',
                  job_vector_filename)
Example #28
    def execute(self, context):
        conn = S3Hook().get_conn()
        bucket = conn.get_bucket(self.output_bucket)

        for url in self.sources:
            name = url.split('/')[-1]
            r = requests.get(url, stream=True)

            # Check the remote headers against the stored headers
            cache_dict = {k: r.headers[k] for k in self.cache_headers}
            cache_key = boto.s3.key.Key(
                bucket=bucket,
                name='{}/{}/.cache.json'.format(self.output_prefix, name)
            )
            if cache_key.exists():
                logging.info("Checking %s for updates", name)
                stored_cache = json.loads(cache_key.get_contents_as_string())
                if cache_dict == stored_cache:
                    logging.info("Skipping %s", name)
                    continue
            logging.info("Downloading %s", name)
            # Cached headers differ; DELETE ALL EXISTING DATA
            for key in bucket.list(
                prefix="{}/{}/".format(self.output_prefix, name)
            ):
                key.delete()
            cache_key.set_contents_from_string(json.dumps(cache_dict))

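            # Split the streamed postings into fixed-size batches, each uploaded as its own S3 key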
            for batch in Batch(r.iter_lines(), self.postings_per_file):
                key = boto.s3.key.Key(
                    bucket=bucket,
                    name='{}/{}/{}'.format(
                        self.output_prefix,
                        name,
                        str(uuid.uuid4()) + ".json"
                    )
                )
                with tempfile.TemporaryFile() as f:
                    for posting in batch:
                        f.write(posting)
                        f.write('\n')
                    f.seek(0)
                    key.set_contents_from_file(f)
Example #29
def upload_to_s3(file_name, **context):

    # Instantiate the hook

    execution_date = context['ds']

    s3_hook = S3Hook(aws_conn_id=S3_CONN_ID)

    # Create file

    sample_file = "file_{0}_{1}.txt".format(file_name, execution_date)
    example_file = open(sample_file, "w+")
    example_file.write("Putting some data in for task {0}".format(file_name))
    example_file.close()

    s3_hook.load_file(sample_file,
                      'workshop/{0}'.format(sample_file),
                      bucket_name=BUCKET,
                      replace=True)
Example #30
    def execute(self, context):
        conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])
        stats_counter = DatasetStatsCounter(
            quarter=quarter,
            dataset_id=self.partner_id
        )
        transformer = self.transformer_class(
            s3_conn=conn,
            partner_id=self.partner_id,
            onet_cache=OnetCache(
                s3_conn=conn,
                cache_dir=config['onet']['cache_dir'],
                s3_path=config['onet']['s3_path'],
            ),
            **self.passthrough_kwargs
        )
        self.clear_old_postings(conn, quarter)
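        # Upload the transformed postings in batches, one temporary file per S3 key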
        for batch in Batch(
            transformer.postings(quarter, stats_counter),
            self.postings_per_file
        ):
            logging.info('Processing new batch')
            with tempfile.TemporaryFile(mode='w+') as f:
                for posting in batch:
                    f.write(json.dumps(posting))
                    f.write('\n')

                logging.debug('New batch written, commencing upload')
                bucket = conn.get_bucket(self.output_bucket)
                key = boto.s3.key.Key(
                    bucket=bucket,
                    name='{}/{}/{}_{}'.format(self.output_prefix, quarter,
                                              self.partner_id, uuid.uuid4())
                )
                f.seek(0)
                key.set_contents_from_string(f.read())
                logging.debug('Batch upload complete')
        stats_counter.save(
            s3_conn=conn,
            s3_prefix=config['partner_stats']['s3_path']
        )