Example 1
def run():
    PROJECT_ID = 'my-project'
    BUCKET = 'gs://my-bucket'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='class',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    sql = 'SELECT sid, cno, cname, credits, grade FROM college_staging.Class'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    takes_pcoll = query_results | 'Make Takes' >> beam.ParDo(MakeTakes())

    takes_pcoll | 'Log takes output' >> WriteToText(DIR_PATH +
                                                    'takes_output.txt')

    dataset_id = 'college_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Takes_Dataflow'
    schema_id = 'sid:STRING,cno:STRING,grade:STRING'

    takes_pcoll | 'Write takes to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    class_pcoll = query_results | 'Make Class' >> beam.ParDo(MakeClass())

    grouped_class_pcoll = class_pcoll | 'GroupByKey' >> beam.GroupByKey()

    grouped_class_pcoll | 'Log class groups' >> WriteToText(
        DIR_PATH + 'class_groups_output.txt')

    unique_class_pcoll = grouped_class_pcoll | 'Make Unique Class' >> beam.ParDo(
        MakeUniqueClass())

    unique_class_pcoll | 'Log class unique' >> WriteToText(
        DIR_PATH + 'class_unique_output.txt')

    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Class_Dataflow'
    schema_id = 'cno:STRING,cname:STRING,credits:INTEGER'

    unique_class_pcoll | 'Write class to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
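
Note: Example 1 (and Example 2 below) assumes user-defined DoFn classes (MakeTakes, MakeClass, MakeUniqueClass) that are not shown here. A minimal, purely hypothetical sketch of what a DoFn like MakeTakes could look like, assuming the rows read from BigQuery arrive as dicts with the queried columns (this is not the original implementation):

import apache_beam as beam

class MakeTakes(beam.DoFn):
    # Hypothetical sketch: project each BigQuery row onto the Takes schema
    # (sid:STRING, cno:STRING, grade:STRING) expected by WriteToBigQuery.
    def process(self, element):
        yield {
            'sid': element['sid'],
            'cno': element['cno'],
            'grade': element['grade'],
        }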
Example 2
def run():
    PROJECT_ID = 'my-project'
    BUCKET = 'gs://my-bucket/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    sql = 'SELECT sid, cno, cname, credits, grade FROM college_staging.Class limit 50'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    takes_pcoll = query_results | 'Make Takes' >> beam.ParDo(MakeTakes())

    takes_pcoll | 'Log takes output' >> WriteToText('takes_output.txt')

    dataset_id = 'college_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Takes_Beam'
    schema_id = 'sid:STRING,cno:STRING,grade:STRING'

    takes_pcoll | 'Write takes to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    class_pcoll = query_results | 'Make Class' >> beam.ParDo(MakeClass())

    grouped_class_pcoll = class_pcoll | 'GroupByKey' >> beam.GroupByKey()

    grouped_class_pcoll | 'Log class groups' >> WriteToText(
        'class_groups_output.txt')

    unique_class_pcoll = grouped_class_pcoll | 'Make Unique Class' >> beam.ParDo(
        MakeUniqueClass())

    unique_class_pcoll | 'Log class unique' >> WriteToText(
        'class_unique_output.txt')

    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Class_Beam'
    schema_id = 'cno:STRING,cname:STRING,credits:INTEGER'

    unique_class_pcoll | 'Write class to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        dest='input_topic',
        help='Input topic in the form projects/<project>/topics/<topic>')
    parser.add_argument('--output',
                        dest='output_file',
                        help='Output file where to write')
    parser.add_argument('--table', dest='table_name', help='BQ table name')
    parser.add_argument('--dataset', dest='dataset_id', help='BQ dataset')
    parser.add_argument('--project_id', dest='project_id', help='Project ID')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--project=<your-project>'])
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromPubSub(topic=known_args.input_topic)

        def str_to_dict(str_line):
            import pandas as pd
            df_rows = eval(str_line)
            pd.DataFrame.from_dict(df_rows)
            bq_rows = eval(re.sub(r'\[|\]', '', str_line.decode('utf-8')))
            logging.info(bq_rows)
            return bq_rows

        lines = lines | 'String to dict' >> beam.Map(str_to_dict)
        lines = lines | 'Output to BQ' >> WriteToBigQuery(
            table=known_args.table_name,
            dataset=known_args.dataset_id,
            project=known_args.project_id)
def main():
    # bq_source = BigQuerySource(query="""
    #                            SELECT created_at, text
    #                            FROM got_sentiment.got_tweets
    #                            """,
    #                            validate=False, coder=None,
    #                            use_standard_sql=True, flatten_results=True,
    #                            kms_key=None)

    # Removed attributes from ReadFromPubSub:
    #                              with_attributes=False,
    #                             timestamp_attribute='created_at'

    # Create the Pipeline with the specified options.
    with Pipeline(options=options) as p:
        results = (
            p | 'read_from_topic' >> ReadFromPubSub(topic=PUBSUB_TOPIC)
            | 'Window' >> WindowInto(window.FixedWindows(60))
            | 'Emit_needed_values' >> FlatMap(emit_values, entity_map)
            | 'Combine' >> CombinePerKey(EntityScoreCombine())
            | 'Add Window Timestamp' >> beam.ParDo(AddWindowTimestampFn())
            | 'FormatForWrite' >> Map(format_for_write)
            | 'Write' >> WriteToBigQuery('streaming_scores',
                                         dataset=BQ_DATASET,
                                         project=PROJECT_ID,
                                         create_disposition='CREATE_IF_NEEDED',
                                         write_disposition='WRITE_APPEND',
                                         batch_size=20))
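
Note: the main() pipeline above relies on helpers defined elsewhere (emit_values, entity_map, EntityScoreCombine, AddWindowTimestampFn, format_for_write). As a hedged illustration only, a DoFn that tags each combined (key, score) pair with the end of its fixed window might look roughly like this; the names and tuple layout are assumptions, not the original code:

import apache_beam as beam

class AddWindowTimestampFn(beam.DoFn):
    # Hypothetical sketch: attach the window end time to each (entity, score) pair.
    def process(self, element, window=beam.DoFn.WindowParam):
        entity, score = element  # output of CombinePerKey
        # window.end is a Beam Timestamp; convert it to an ISO-8601 string
        yield entity, score, window.end.to_utc_datetime().isoformat()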
Example 5
    def _run_pubsub_bq_pipeline(self, method, triggering_frequency=None):
        l = [i for i in range(self._SIZE)]

        matchers = [
            PipelineStateMatcher(PipelineState.RUNNING),
            BigqueryFullResultStreamingMatcher(project=self.project,
                                               query="SELECT number FROM %s" %
                                               self.output_table,
                                               data=[(i, ) for i in l])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*matchers),
            wait_until_finish_duration=self.WAIT_UNTIL_FINISH_DURATION,
            experiments='use_beam_bq_sink',
            streaming=True)

        def add_schema_info(element):
            yield {'number': element}

        messages = [str(i).encode('utf-8') for i in l]
        for message in messages:
            self.pub_client.publish(self.input_topic.name, message)

        with beam.Pipeline(argv=args) as p:
            messages_pcoll = (p
                              | ReadFromPubSub(subscription=self.input_sub.name)
                              | beam.ParDo(add_schema_info))
            _ = messages_pcoll | WriteToBigQuery(
                self.output_table,
                schema=self.SCHEMA,
                method=method,
                triggering_frequency=triggering_frequency)
Example 6
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      required=True,
                      help='Input Pub/Sub subscription to read from.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output BigQuery table to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  (p | 'read' >> ReadFromPubSub(subscription=known_args.input)
     | 'extract words' >> beam.FlatMap(extract_words)
     | 'transform to kv' >> beam.Map(lambda x: (x,1))
     | 'window per minute' >> beam.WindowInto(
                                window.FixedWindows(5),
                                trigger=trigger.AfterProcessingTime(delay=10),
                                accumulation_mode=trigger.AccumulationMode.DISCARDING)
     | 'group by words' >> beam.GroupByKey()
     | 'count ones' >> beam.Map(count_ones)
     | 'format for bq' >> beam.Map(format_for_bigquery)
     | 'write to bigquery' >> WriteToBigQuery(table=known_args.output))

  result = p.run()
  result.wait_until_finish()
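
Note: Example 6 (and the batch variant in Example 8) assumes helper functions (extract_words, count_ones, format_for_bigquery) that are not part of the snippet. A minimal sketch of what such helpers could look like, assuming the output table has word and count columns (illustrative only):

import re

def extract_words(line):
    # Hypothetical helper: split a text line (or Pub/Sub bytes payload) into lowercase words.
    if isinstance(line, bytes):
        line = line.decode('utf-8')
    return re.findall(r"[a-z']+", line.lower())

def count_ones(word_ones):
    # Hypothetical helper: after GroupByKey, sum the ones emitted for each word.
    word, ones = word_ones
    return word, sum(ones)

def format_for_bigquery(word_count):
    # Hypothetical helper: shape a (word, count) pair into a BigQuery row dict.
    word, count = word_count
    return {'word': word, 'count': count}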
Example 7
    def __init__(self,
                 temp_gcs_location,
                 table,
                 dataset=None,
                 project=None,
                 schema=None,
                 create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                 write_disposition=BigQueryDisposition.WRITE_EMPTY,
                 test_client=None,
                 temp_shards_per_day=0):

        self.temp_shards_per_day = temp_shards_per_day or DEFAULT_TEMP_SHARDS_PER_DAY

        # store table ref as a string so that it will pickle
        self.table_reference = decode_table_ref(table, dataset, project)
        self.project_id = project or self.table_reference.projectId

        # store schema as a dict so that it will pickle
        self.table_schema_dict = WriteToBigQuery.table_schema_to_dict(
            parse_table_schema(schema))
        self.create_disposition = create_disposition
        self.write_disposition = write_disposition

        super(BigQueryDatePartitionedSink,
              self).__init__(file_path_prefix=pp.join(temp_gcs_location,
                                                      'shard'),
                             file_name_suffix='.json')
Example 8
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output BigQuery table to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  (p | 'read' >> ReadFromText(known_args.input)
     | 'extract words' >> beam.FlatMap(extract_words)
     | 'transform to kv' >> beam.Map(lambda x: (x,1))
     | 'group by words' >> beam.GroupByKey()
     | 'count ones' >> beam.Map(count_ones)
     | 'format for bq' >> beam.Map(format_for_bigquery)
     | 'write to bigquery' >> WriteToBigQuery(table=known_args.output))

  result = p.run()
  result.wait_until_finish()
Example 9
def run():
    PROJECT_ID = 'my-project'
    BUCKET = 'gs://my-bucket'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='teacher',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    sql = 'SELECT tid, instructor, dept FROM college_staging.Teacher'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Format Name' >> beam.ParDo(FormatName())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output.txt')

    dataset_id = 'college_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Teacher_Dataflow'
    schema_id = 'tid:STRING,fname:STRING,lname:STRING,dept:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    sql = 'SELECT * FROM imdb_refined.Writers limit 250'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Split Writers' >> beam.ParDo(SplitWriters())

    out_pcoll | 'Log output' >> WriteToText('output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Writers_Beam'
    schema_id = 'tconst:STRING,writers:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT * FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Ownership GROUP BY occ_code, occ_title, ownership, naics_title, grp, tot_emp, emp_prse, h_mean, a_mean, mean_prse, a_pct10, a_pct25, a_median, a_pct75, a_pct90 HAVING count = 1) LIMIT 100"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Ownership' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText('output_ownership.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Ownership_Beam'

    schema_id = 'occ_code:STRING, occ_title:STRING, ownership:STRING, naics_title:STRING, grp:STRING, tot_emp:INTEGER, emp_prse:FLOAT, h_mean:FLOAT, a_mean:INTEGER, mean_prse:FLOAT, a_pct10:INTEGER, a_pct25:INTEGER, a_median:INTEGER, a_pct75:INTEGER, a_pct90:INTEGER'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic',
                        dest='topic',
                        default=default_topic)

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--project={}'.format(project), '--streaming', '--experiments=allow_non_updatable_job'])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # TODO
    schema = 'name:string,age:integer,timestamp:timestamp'

    def to_dict(element):
        return [yaml.safe_load(element)]

    def get_table(element):
        # table format is DATASET.TABLE
        table = element['table']
        element['timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        element.pop('table')
        return '{}:{}'.format(project, table)

    with beam.Pipeline(options=pipeline_options) as p:
        (p | "ReadTopic" >> beam.io.ReadFromPubSub(topic=known_args.topic)
         | "ToDict" >> beam.ParDo(to_dict)
         | "WriteToBQ" >> WriteToBigQuery(table=get_table, schema=schema,
                                          create_disposition=BigQueryDisposition.CREATE_NEVER,
                                          write_disposition=BigQueryDisposition.WRITE_APPEND))
Example 13
def run():
    PROJECT_ID = 'my-project'
    BUCKET = 'gs://my-bucket/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    sql = 'SELECT tid, instructor, dept FROM college_staging.Teacher limit 50'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Format Name' >> beam.ParDo(FormatName())

    out_pcoll | 'Log output' >> WriteToText('output.txt')

    dataset_id = 'college_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Teacher_Beam'
    schema_id = 'tid:STRING,fname:STRING,lname:STRING,dept:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** FIXING soc_code ****************************************************
    sql = "SELECT * FROM H_1B_refined.Occupation_fix_date WHERE length(soc_code) != 7 AND soc_code NOT LIKE '%-%' LIMIT 200"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Format Soc' >> beam.ParDo(FormatSocCode())

    out_pcoll | 'Log output' >> WriteToText('output_occupation.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Occupation_Beam'
    schema_id = 'job_title:STRING, employer_name:STRING, employer_city:STRING, employment_start_date:DATE, employment_end_date:DATE, soc_code:STRING, soc_title:STRING, prevailing_wage_YR:FLOAT, pw_wage_level:STRING, pw_wage_source:STRING, pw_wage_source_year:INTEGER, pw_wage_source_other:STRING, worksite_city:STRING, worksite_country:STRING, worksite_state:STRING, worksite_postal_code:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example 15
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT * FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Employer GROUP BY employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator HAVING count = 1) LIMIT 100"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Employer' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText('output_employer.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Employer_Beam'

    schema_id = 'employer_name:STRING, employer_address:STRING, employer_city:STRING, employer_state:STRING, employer_postal_code:STRING, employer_country:STRING, employer_province:STRING, h_1b_dependent:BOOLEAN, willful_violator:BOOLEAN'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT * FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Application GROUP BY CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE, VISA_CLASS, employer_name, employer_city HAVING count = 1) LIMIT 100"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Application' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText('output_application.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Application_Beam'

    schema_id = 'CASE_NUMBER:STRING, CASE_STATUS:STRING, CASE_SUBMITTED:DATE, DECESION_DATE:DATE, VISA_CLASS:STRING, employer_name:STRING, employer_city:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', help='Input file to process.')
    parser.add_argument('--output-file',
                        dest='output_file',
                        help='Output file to write results to.')
    parser.add_argument('--output-bq',
                        dest='output_bq',
                        help='Output BigQuery Table to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    #     pipeline_args.extend([
    #       '--runner=DirectRunner',
    # #      '--temp_location=/tmp/beam/tmp',
    #       '--job_name=parse-twitter-job',
    #     ])

    def transform_tweet_to_bq(data):
        d = data['data']
        hashtags = [
            h['tag'] for h in d.get('entities', {}).get('hashtags', {})
        ]
        places = data.get('includes', {}).get('places', {})
        place = places[0] if len(places) > 0 else {}
        return {
            'id': d['id'],
            'author_id': d['author_id'],
            'created_at': d['created_at'],
            'place_id': d.get('geo', {}).get('place_id', ''),
            'lang': d.get('lang', ''),
            'text': d['text'],
            'keywords': re.findall(r'[@#\w\']{6,}', d['text'], re.UNICODE),
            'place_name': place.get('full_name', ''),
            'place_country': place.get('country_code', ''),
            'hashtags': hashtags,
        }

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=pipeline_options) as p:
        output = (p
                  | 'Read' >> ReadFromText(known_args.input)
                  | 'FromJSON' >> beam.Map(json.loads)
                  | 'Transform' >> beam.Map(transform_tweet_to_bq))

        if known_args.output_file:
            unused = (output
                      | 'ToJSON' >> beam.Map(json.dumps)
                      | 'Write' >> WriteToText(known_args.output_file))

        if known_args.output_bq:
            unused = (output | 'LoadBigQuery' >> WriteToBigQuery(
                known_args.output_bq, method=WriteToBigQuery.Method.FILE_LOADS)
                      )
Example 18
def run():
    with beam.Pipeline(options=PipelineOptions(streaming=True)) as p:
        pc = (p | ReadFromPubSub(topic=get_topic_path())
              | beam.WindowInto(
                  window.FixedWindows(WINDOW_SIZE),
                  accumulation_mode=trigger.AccumulationMode.DISCARDING)
              | 'AddWindowInfo' >> beam.ParDo(add_window_info)
              | beam.CombinePerKey(sum)
              | beam.ParDo(prepare_element)
              | 'Print' >> beam.ParDo(print_fn)
              | WriteToBigQuery(BIGQUERY_TABLE_ID))
    def build(self, p):
        schema = [f"{c['name']}:{TYPES[c['type']]}" for c in self._columns]
        return (p
                | beam.Map(lambda rec: {k: str(v) for k, v in rec.items()})
                | WriteToBigQuery(
                    project=self._project,
                    dataset=self._dataset,
                    table=self._table,
                    schema=','.join(schema),
                    write_disposition=self._write_disposition,
                    custom_gcs_temp_location=self._gcs_temp_location))
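
Note: the build method above assumes a TYPES mapping and a self._columns list supplied elsewhere. Purely as an illustrative assumption, they might be shaped like this:

# Illustrative shapes only; the real mapping and column list live elsewhere in that project.
TYPES = {'string': 'STRING', 'integer': 'INTEGER', 'float': 'FLOAT'}
columns = [{'name': 'user_id', 'type': 'integer'}, {'name': 'email', 'type': 'string'}]
# ','.join(f"{c['name']}:{TYPES[c['type']]}" for c in columns) == 'user_id:INTEGER,email:STRING'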
Example 20
def parse_table_schema(schema):
    """
    Accepts a BigQuery tableschema as a string, dict (from json), or bigquery.TabelSchema and returns
    a bigquery.TableSchema

    String Format

    "[FIELD]:[DATA_TYPE],[FIELD]:[DATA_TYPE]"

    dict format

    {
      "fields": [
        {
          "name": "[FIELD]",
          "type": "[DATA_TYPE]"
        },
        {
          "name": "[FIELD]",
          "type": "[DATA_TYPE]"
        }
    ]}

    see https://cloud.google.com/bigquery/data-types
    see https://cloud.google.com/bigquery/docs/schemas#specifying_a_schema_file


    """
    if schema is None:
        return schema
    elif isinstance(schema, bq.TableSchema):
        return schema
    elif isinstance(schema, six.string_types):
        # try to parse json into dict
        try:
            schema = ujson.loads(schema)
        except ValueError:
            # not valid JSON; fall through and treat it as a schema string
            pass

    if isinstance(schema, six.string_types):
        # if it is still a string, it was not valid JSON; assume the '[FIELD]:[DATA_TYPE]' string form
        return WriteToBigQuery.get_table_schema_from_string(schema)
    elif isinstance(schema, dict):
        # either it came in as a dict or it got converted from json earlier
        return parse_table_schema_from_json(ujson.dumps(schema))
    else:
        raise TypeError('Unexpected schema argument: %s.' % schema)
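
A brief usage sketch of the parse_table_schema helper above; both calls should produce an equivalent bigquery.TableSchema (the field names are illustrative, not from the original project):

# String form and dict form of the same schema (illustrative field names).
schema_from_string = parse_table_schema('name:STRING,age:INTEGER')
schema_from_dict = parse_table_schema({
    'fields': [
        {'name': 'name', 'type': 'STRING'},
        {'name': 'age', 'type': 'INTEGER'},
    ]
})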
Example 21
def run():
    # set up location
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # use DataflowRunner instead of DirectRunner
    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='imdbcharacter',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # define the query against the imdb_refined dataset and configure the BigQuery source
    sql = 'SELECT * FROM imdb_refined.Characters'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    # read from BigQuery using the source configured above;
    # the query results become the input PCollection
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    # Use ParDo to call function on query results
    out_pcoll = query_results | 'Split characters' >> beam.ParDo(
        SplitCharacters())

    # write the results into text file
    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Characters_Dataflow'
    schema_id = 'tconst:STRING,nconst:STRING,characters:STRING'

    # write to BigQuery using the location set above
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    # run and display results after everything is finished
    result = p.run()
    result.wait_until_finish()
Example 22
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        dest='input_topic',
        help='Input topic in the form projects/<project>/topics/<topic>')
    parser.add_argument('--output',
                        dest='output_file',
                        help='Output file where to write')
    parser.add_argument('--table', dest='table_name', help='BQ table name')
    parser.add_argument('--dataset', dest='dataset_id', help='BQ dataset')
    parser.add_argument('--project_id', dest='project_id', help='Project ID')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--project=main-training-project', '--streaming'])
    """
    pipeline_args.extend(['--runner=DataflowRunner',
                             '--project=yourprojectid',
                             '--staging_location=gs://yourgsbucket',
                             '--temp_location=gs://yourgsbucket',
                         '--job_name=your-job-name'])
    """
    pipeline_options = PipelineOptions(pipeline_args)
    #pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromPubSub(topic=known_args.input_topic)

        def str_to_dict(str_line):
            import pandas as pd
            import nonpypimodule
            import changecommentfield

            df_rows = eval(str_line)
            pd.DataFrame.from_dict(df_rows)
            bq_rows = eval(re.sub(r'\[|\]', '', str_line.decode('utf-8')))
            bq_rows['post'] = nonpypimodule.return_sentence()
            bq_rows = changecommentfield.change_field(bq_rows)
            logging.info(bq_rows)
            return bq_rows

        lines = lines | 'String to dict' >> beam.Map(str_to_dict)
        lines = lines | 'Output to BQ' >> WriteToBigQuery(
            table=known_args.table_name,
            dataset=known_args.dataset_id,
            project=known_args.project_id)
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='occupation',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)
    '''
     sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS job_id, * FROM (SELECT job_title, emp.employer_id AS employer_id, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Occupation WHERE prevailing_wage_YR > 5000 GROUP BY job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code HAVING count = 1) AS occ JOIN H_1B_refined.Employer_Dataflow AS emp ON emp.employer_name = occ.employer_name AND emp.employer_city = occ.employer_city) AS t WHERE length(soc_code) >= 6 AND length(soc_code) <= 10 AND length(soc_code) != 8" '''

    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS job_id, job_title, employer_id, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM (SELECT  *, COUNT(*) AS count FROM(SELECT job_title, emp.employer_id AS employer_id, occ.employer_name, occ.employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM H_1B_refined.Occupation as occ JOIN H_1B_refined.Employer_Dataflow AS emp ON emp.employer_name = occ.employer_name AND emp.employer_city = occ.employer_city WHERE prevailing_wage_YR > 5000 AND length(soc_code) >= 6 AND length(soc_code) <= 10 AND length(soc_code) != 8) GROUP BY job_title, employer_id, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code HAVING count = 1) as t"

    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)
    '''
     out_pcoll_fix_date = query_results | 'Format Date' >> beam.ParDo(FormatDate())
        
     out_pcoll_fix_date | 'Log fix_date_output' >> WriteToText(DIR_PATH + 'output_occ_fix_date.txt')
     '''
    out_pcoll = query_results | 'Format Soc' >> beam.ParDo(FormatSocCode())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_occupation.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Occupation_Dataflow'

    schema_id = 'job_id:INTEGER, job_title:STRING, employer_id:INTEGER, soc_code:STRING, soc_title:STRING, prevailing_wage_YR:FLOAT, pw_wage_level:STRING, pw_wage_source:STRING, pw_wage_source_year:INTEGER, pw_wage_source_other:STRING, worksite_city:STRING, worksite_country:STRING, worksite_state:STRING, worksite_postal_code:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Occupation WHERE prevailing_wage_YR > 5000 AND length(soc_code) > 5 GROUP BY job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code HAVING count = 1) LIMIT 50"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll_no_dup = query_results | 'Format prevailing_wage_YR and Remove dups' >> beam.ParDo(
        NoDuplicates())

    out_pcoll_no_dup | 'Log no_dup_output' >> WriteToText(
        'output_occ_no_dup.txt')

    out_pcoll_fix_date = out_pcoll_no_dup | 'Format Date' >> beam.ParDo(
        FormatDate())

    out_pcoll_fix_date | 'Log fix_date_output' >> WriteToText(
        'output_occ_fix_date.txt')

    out_pcoll = out_pcoll_fix_date | 'Format Soc' >> beam.ParDo(
        FormatSocCode())

    out_pcoll | 'Log output' >> WriteToText('output_occupation.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Occupation_Beam'
    schema_id = 'job_title:STRING, employer_name:STRING, employer_city:STRING, employment_start_date:DATE, employment_end_date:DATE, soc_code:STRING, soc_title:STRING, prevailing_wage_YR:FLOAT, pw_wage_level:STRING, pw_wage_source:STRING, pw_wage_source_year:INTEGER, pw_wage_source_other:STRING, worksite_city:STRING, worksite_country:STRING, worksite_state:STRING, worksite_postal_code:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example 25
def run():
    pipeline_options = PipelineOptions()
    options = pipeline_options.view_as(SQLOptions)
    #options.view_as(SetupOptions).save_main_session = True
    #temp_location = options.view_as(GoogleCloudOptions).temp_location
    #print("Here!", temp_location)
    pipeline = beam.Pipeline(options=options)

    mysql_data = (pipeline | ReadFromSQL(host=options.host, port=options.port,
                                        username=options.username, password=options.password,
                                        database=options.database, query=options.query,
                                        #wrapper={'mssql': MSSQLWrapper, 'mysql': MySQLWrapper, 'postgres': PostgresWrapper}[options.db_type],
                                        wrapper=MSSQLWrapper,
                                        # wrapper=PostgresWrapper
                                        #
                                        )
                           #| 'Parse'   >> beam.Map(parse_json)
                           | 'Write to Table' >> WriteToBigQuery(
                               table=options.output_table,
                               schema='SCHEMA_AUTODETECT',
                               write_disposition=BigQueryDisposition.WRITE_TRUNCATE,
                               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
                 )

    #transformed_data = mysql_data | "Transform records" >> beam.Map(transform_records, 'user_id')
    # transformed_data | "insert into mysql" >> SQLWriter(options.host, options.port,
    #                                                     options.username, options.password,
    #                                                     options.database,
    #                                                     table='output',
    #                                                     wrapper=MySQLWrapper, autocommit=True, batch_size=500)
    # transformed_data | "insert into postgres" >> SQLWriter(options.host, 5432,
    #                                                        'postgres', options.password,
    #                                                        options.database, table=options.output_table,
    #                                                        wrapper=PostgresWrapper, autocommit=False, batch_size=500)
    #mysql_data | "Log records " >> beam.Map(log) | beam.io.WriteToText(options.output, num_shards=1, file_name_suffix=".json")

    pipeline.run().wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='application',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS case_id, * FROM (SELECT emp.employer_id, CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE AS DECISION_DATE, VISA_CLASS FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Application GROUP BY CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE, VISA_CLASS, employer_name, employer_city HAVING count = 1) AS app JOIN H_1B_refined.Employer_Dataflow AS emp ON emp.employer_name = app.employer_name AND emp.employer_city = app.employer_city) AS t"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Application Transformation' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH +
                                            'output_application.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Application_Dataflow'

    schema_id = 'case_id:INTEGER, employer_id:INTEGER, CASE_NUMBER:STRING, CASE_STATUS:STRING, CASE_SUBMITTED:DATE, DECISION_DATE:DATE, VISA_CLASS:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example 27
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output BigQuery table to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    (p | 'read' >> ReadFromText(known_args.input)
     | 'parse message' >> beam.Map(parse_message)
     | 'window per minute' >> beam.WindowInto(
         window.FixedWindows(1),
         trigger=trigger.DefaultTrigger(),
         accumulation_mode=trigger.AccumulationMode.DISCARDING)
     | 'group by id' >> beam.GroupByKey()
     | 'sum_value' >> beam.Map(sum_value)
     | 'format for bq' >> beam.Map(format_for_bigquery)
     | 'write to bigquery' >> WriteToBigQuery(
         table=known_args.output,
         schema='key:INTEGER,sum:FLOAT',
         create_disposition='CREATE_IF_NEEDED',
         write_disposition='WRITE_APPEND'))

    result = p.run()
    result.wait_until_finish()
Example 28
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='ownership',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) as ownership_id, * FROM (SELECT * FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Ownership GROUP BY occ_code, occ_title, ownership, naics_title, grp, tot_emp, emp_prse, h_mean, a_mean, mean_prse, a_pct10, a_pct25, a_median, a_pct75, a_pct90 HAVING count = 1)) as  t"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Ownership' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_ownership.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Ownership_Dataflow'

    schema_id = 'ownership_id:INTEGER, occ_code:STRING, occ_title:STRING, ownership:STRING, naics_title:STRING, grp:STRING, tot_emp:INTEGER, emp_prse:FLOAT, h_mean:FLOAT, a_mean:INTEGER, mean_prse:FLOAT, a_pct10:INTEGER, a_pct25:INTEGER, a_median:INTEGER, a_pct75:INTEGER, a_pct90:INTEGER'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example 29
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='employer',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS employer_id, * FROM (SELECT employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Employer GROUP BY employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator HAVING count = 1)) AS t"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Employer' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_employer.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Employer_Dataflow'

    schema_id = 'employer_id:INTEGER, employer_name:STRING, employer_address:STRING, employer_city:STRING, employer_state:STRING, employer_postal_code:STRING, employer_country:STRING, employer_province:STRING, h_1b_dependent:BOOLEAN, willful_violator:BOOLEAN'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example 30
def run():
    # set up location
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    # executed with DirectRunner
    p = beam.Pipeline('DirectRunner', options=opts)

    # define the query against the imdb_refined dataset and configure the BigQuery source
    sql = 'SELECT * FROM imdb_refined.Primary_Professions limit 250'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    # read from BigQuery using the source configured above;
    # the query results become the input PCollection
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    # Use ParDo to call function on query results
    out_pcoll = query_results | 'Split Primary Professions' >> beam.ParDo(
        SplitPrimaryProfessions())

    out_pcoll | 'Log output' >> WriteToText('output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Primary_Professions_Beam'
    schema_id = 'nconst:STRING,primaryProfession:STRING'

    # write to BigQuery using the location set above
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    # run and display results after everything is finished
    result = p.run()
    result.wait_until_finish()
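
Note: Example 30 assumes a SplitPrimaryProfessions DoFn defined elsewhere. Given the transform name and the nconst:STRING,primaryProfession:STRING output schema, a hypothetical sketch might fan out a comma-separated field into one row per profession (a guess at the intent, not the original code):

import apache_beam as beam

class SplitPrimaryProfessions(beam.DoFn):
    # Hypothetical sketch: emit one output row per profession in a comma-separated field.
    def process(self, element):
        for profession in (element.get('primaryProfession') or '').split(','):
            if profession.strip():
                yield {'nconst': element['nconst'], 'primaryProfession': profession.strip()}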