def run():
    PROJECT_ID = 'my-project'
    BUCKET = 'gs://my-bucket'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='class',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')
    p = beam.pipeline.Pipeline(options=options)

    sql = 'SELECT sid, cno, cname, credits, grade FROM college_staging.Class'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    takes_pcoll = query_results | 'Make Takes' >> beam.ParDo(MakeTakes())
    takes_pcoll | 'Log takes output' >> WriteToText(DIR_PATH + 'takes_output.txt')

    dataset_id = 'college_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Takes_Dataflow'
    schema_id = 'sid:STRING,cno:STRING,grade:STRING'
    takes_pcoll | 'Write takes to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    class_pcoll = query_results | 'Make Class' >> beam.ParDo(MakeClass())
    grouped_class_pcoll = class_pcoll | 'GroupByKey' >> beam.GroupByKey()
    grouped_class_pcoll | 'Log class groups' >> WriteToText(
        DIR_PATH + 'class_groups_output.txt')

    unique_class_pcoll = grouped_class_pcoll | 'Make Unique Class' >> beam.ParDo(
        MakeUniqueClass())
    unique_class_pcoll | 'Log class unique' >> WriteToText(
        DIR_PATH + 'class_unique_output.txt')

    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Class_Dataflow'
    schema_id = 'cno:STRING,cname:STRING,credits:INTEGER'
    unique_class_pcoll | 'Write class to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'my-project'
    BUCKET = 'gs://my-bucket/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)
    p = beam.Pipeline('DirectRunner', options=opts)

    sql = 'SELECT sid, cno, cname, credits, grade FROM college_staging.Class limit 50'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    takes_pcoll = query_results | 'Make Takes' >> beam.ParDo(MakeTakes())
    takes_pcoll | 'Log takes output' >> WriteToText('takes_output.txt')

    dataset_id = 'college_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Takes_Beam'
    schema_id = 'sid:STRING,cno:STRING,grade:STRING'
    takes_pcoll | 'Write takes to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    class_pcoll = query_results | 'Make Class' >> beam.ParDo(MakeClass())
    grouped_class_pcoll = class_pcoll | 'GroupByKey' >> beam.GroupByKey()
    grouped_class_pcoll | 'Log class groups' >> WriteToText(
        'class_groups_output.txt')

    unique_class_pcoll = grouped_class_pcoll | 'Make Unique Class' >> beam.ParDo(
        MakeUniqueClass())
    unique_class_pcoll | 'Log class unique' >> WriteToText(
        'class_unique_output.txt')

    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Class_Beam'
    schema_id = 'cno:STRING,cname:STRING,credits:INTEGER'
    unique_class_pcoll | 'Write class to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        dest='input_topic',
        help='Input topic in the form projects/<project>/topics/<topic>')
    parser.add_argument('--output', dest='output_file',
                        help='Output file where to write')
    parser.add_argument('--table', dest='table_name', help='BQ table name')
    parser.add_argument('--dataset', dest='dataset_id', help='BQ dataset')
    parser.add_argument('--project_id', dest='project_id', help='Project ID')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--project=<your-project>'])
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromPubSub(topic=known_args.input_topic)

        def str_to_dict(str_line):
            import pandas as pd
            df_rows = eval(str_line)
            pd.DataFrame.from_dict(df_rows)
            bq_rows = eval(re.sub(r'\[|\]', '', str_line.decode('utf-8')))
            logging.info(bq_rows)
            return bq_rows

        lines = lines | 'String to dict' >> beam.Map(str_to_dict)
        lines = lines | 'Output to BQ' >> WriteToBigQuery(
            table=known_args.table_name,
            dataset=known_args.dataset_id,
            project=known_args.project_id)
def main():
    # bq_source = BigQuerySource(query="""
    #     SELECT created_at, text
    #     FROM got_sentiment.got_tweets
    #     """,
    #     validate=False, coder=None,
    #     use_standard_sql=True, flatten_results=True,
    #     kms_key=None)

    # Removed attributes from ReadFromPubSub:
    #     with_attributes=False,
    #     timestamp_attribute='created_at'

    # Create the Pipeline with the specified options.
    with Pipeline(options=options) as p:
        results = (
            p
            | 'read_from_topic' >> ReadFromPubSub(topic=PUBSUB_TOPIC)
            | 'Window' >> WindowInto(window.FixedWindows(60))
            | 'Emit_needed_values' >> FlatMap(emit_values, entity_map)
            | 'Combine' >> CombinePerKey(EntityScoreCombine())
            | 'Add Window Timestamp' >> beam.ParDo(AddWindowTimestampFn())
            | 'FormatForWrite' >> Map(format_for_write)
            | 'Write' >> WriteToBigQuery('streaming_scores',
                                         dataset=BQ_DATASET,
                                         project=PROJECT_ID,
                                         create_disposition='CREATE_IF_NEEDED',
                                         write_disposition='WRITE_APPEND',
                                         batch_size=20))
def _run_pubsub_bq_pipeline(self, method, triggering_frequency=None):
    l = [i for i in range(self._SIZE)]
    matchers = [
        PipelineStateMatcher(PipelineState.RUNNING),
        BigqueryFullResultStreamingMatcher(
            project=self.project,
            query="SELECT number FROM %s" % self.output_table,
            data=[(i, ) for i in l])
    ]
    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*matchers),
        wait_until_finish_duration=self.WAIT_UNTIL_FINISH_DURATION,
        experiments='use_beam_bq_sink',
        streaming=True)

    def add_schema_info(element):
        yield {'number': element}

    messages = [str(i).encode('utf-8') for i in l]
    for message in messages:
        self.pub_client.publish(self.input_topic.name, message)

    with beam.Pipeline(argv=args) as p:
        messages = (p
                    | ReadFromPubSub(subscription=self.input_sub.name)
                    | beam.ParDo(add_schema_info))
        _ = messages | WriteToBigQuery(
            self.output_table,
            schema=self.SCHEMA,
            method=method,
            triggering_frequency=triggering_frequency)
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', required=True, help='Input Pub/Sub subscription to read from.') parser.add_argument('--output', dest='output', required=True, help='Output BigQuery table to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. (p | 'read' >> ReadFromPubSub(subscription=known_args.input) | 'extract words' >> beam.FlatMap(extract_words) | 'transform to kv' >> beam.Map(lambda x: (x,1)) | 'window per minute' >> beam.WindowInto( window.FixedWindows(5), trigger=trigger.AfterProcessingTime(delay=10), accumulation_mode=trigger.AccumulationMode.DISCARDING) | 'group by words' >> beam.GroupByKey() | 'count ones' >> beam.Map(count_ones) | 'format for bq' >> beam.Map(format_for_bigquery) | 'write to bigquery' >> WriteToBigQuery(table=known_args.output)) result = p.run() result.wait_until_finish()
def __init__(self,
             temp_gcs_location,
             table,
             dataset=None,
             project=None,
             schema=None,
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_EMPTY,
             test_client=None,
             temp_shards_per_day=0):
    self.temp_shards_per_day = temp_shards_per_day or DEFAULT_TEMP_SHARDS_PER_DAY
    # store table ref as a string so that it will pickle
    self.table_reference = decode_table_ref(table, dataset, project)
    self.project_id = project or self.table_reference.projectId
    # store schema as a dict so that it will pickle
    self.table_schema_dict = WriteToBigQuery.table_schema_to_dict(
        parse_table_schema(schema))
    self.create_disposition = create_disposition
    self.write_disposition = write_disposition
    super(BigQueryDatePartitionedSink, self).__init__(
        file_path_prefix=pp.join(temp_gcs_location, 'shard'),
        file_name_suffix='.json')
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output BigQuery table to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. (p | 'read' >> ReadFromText(known_args.input) | 'extract words' >> beam.FlatMap(extract_words) | 'transform to kv' >> beam.Map(lambda x: (x,1)) | 'group by words' >> beam.GroupByKey() | 'count ones' >> beam.Map(count_ones) | 'format for bq' >> beam.Map(format_for_bigquery) | 'write to bigquery' >> WriteToBigQuery(table=known_args.output)) result = p.run() result.wait_until_finish()
def run():
    PROJECT_ID = 'my-project'
    BUCKET = 'gs://my-bucket'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='teacher',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')
    p = beam.pipeline.Pipeline(options=options)

    sql = 'SELECT tid, instructor, dept FROM college_staging.Teacher'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Format Name' >> beam.ParDo(FormatName())
    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output.txt')

    dataset_id = 'college_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Teacher_Dataflow'
    schema_id = 'tid:STRING,fname:STRING,lname:STRING,dept:STRING'
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)
    p = beam.Pipeline('DirectRunner', options=opts)

    sql = 'SELECT * FROM imdb_refined.Writers limit 250'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Split Writers' >> beam.ParDo(SplitWriters())
    out_pcoll | 'Log output' >> WriteToText('output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Writers_Beam'
    schema_id = 'tconst:STRING,writers:STRING'
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run(): PROJECT_ID = 'acquired-rarity-288205' BUCKET = 'gs://ykdb_beam/temp' options = {'project': PROJECT_ID} opts = beam.pipeline.PipelineOptions(flags=[], **options) p = beam.Pipeline('DirectRunner', options=opts) # ***************************************** REMOVE DUPLICATES **************************************************** sql = "SELECT * FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Ownership GROUP BY occ_code, occ_title, ownership, naics_title, grp, tot_emp, emp_prse, h_mean, a_mean, mean_prse, a_pct10, a_pct25, a_median, a_pct75, a_pct90 HAVING count = 1) LIMIT 100" bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True, gcs_location=BUCKET) query_results = p | 'Read from BQ' >> beam.io.Read(bq_source) out_pcoll = query_results | 'Remove Dups Ownership' >> beam.ParDo( NoDuplicates()) out_pcoll | 'Log output' >> WriteToText('output_ownership.txt') # ***************************************** INSERT INTO BQ **************************************************** dataset_id = 'H_1B_refined' table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Ownership_Beam' schema_id = 'occ_code:STRING, occ_title:STRING, ownership:STRING, naics_title:STRING, grp:STRING, tot_emp:INTEGER, emp_prse:FLOAT, h_mean:FLOAT, a_mean:INTEGER, mean_prse:FLOAT, a_pct10:INTEGER, a_pct25:INTEGER, a_median:INTEGER, a_pct75:INTEGER, a_pct90:INTEGER' out_pcoll | 'Write to BQ' >> WriteToBigQuery( table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET) result = p.run() result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic', dest='topic', default=default_topic)
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--project={}'.format(project),
                          '--streaming',
                          '--experiments=allow_non_updatable_job'])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # TODO
    schema = 'name:string,age:integer,timestamp:timestamp'

    def to_dict(element):
        # SafeLoader added so the call works with PyYAML 5.1+/6.x and never
        # constructs arbitrary objects.
        return [yaml.load(element, Loader=yaml.SafeLoader)]

    def get_table(element):
        # table format is DATASET.TABLE
        table = element['table']
        element['timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        element.pop('table')
        return '{}:{}'.format(project, table)

    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | "ReadTopic" >> beam.io.ReadFromPubSub(topic=known_args.topic)
         | "ToDict" >> beam.ParDo(to_dict)
         | "WriteToBQ" >> WriteToBigQuery(
             table=get_table,
             schema=schema,
             create_disposition=BigQueryDisposition.CREATE_NEVER,
             write_disposition=BigQueryDisposition.WRITE_APPEND))
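# Hypothetical illustration (not part of the original snippet): with the
# callable table destination above, a published message such as
#     b"{'table': 'mydataset.mytable', 'name': 'Ann', 'age': 30}"
# is parsed into a dict by to_dict, then get_table() pops the 'table' key,
# stamps a 'timestamp', and routes the remaining row to
# '<project>:mydataset.mytable', which must already exist (CREATE_NEVER).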
def run():
    PROJECT_ID = 'my-project'
    BUCKET = 'gs://my-bucket/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)
    p = beam.Pipeline('DirectRunner', options=opts)

    sql = 'SELECT tid, instructor, dept FROM college_staging.Teacher limit 50'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Format Name' >> beam.ParDo(FormatName())
    out_pcoll | 'Log output' >> WriteToText('output.txt')

    dataset_id = 'college_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Teacher_Beam'
    schema_id = 'tid:STRING,fname:STRING,lname:STRING,dept:STRING'
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)
    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** FIXING soc_code ****************************************************
    sql = "SELECT * FROM H_1B_refined.Occupation_fix_date WHERE length(soc_code) != 7 AND soc_code NOT LIKE '%-%' LIMIT 200"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Format Soc' >> beam.ParDo(FormatSocCode())
    out_pcoll | 'Log output' >> WriteToText('output_occupation.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Occupation_Beam'
    schema_id = 'job_title:STRING, employer_name:STRING, employer_city:STRING, employment_start_date:DATE, employment_end_date:DATE, soc_code:STRING, soc_title:STRING, prevailing_wage_YR:FLOAT, pw_wage_level:STRING, pw_wage_source:STRING, pw_wage_source_year:INTEGER, pw_wage_source_other:STRING, worksite_city:STRING, worksite_country:STRING, worksite_state:STRING, worksite_postal_code:STRING'
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run(): PROJECT_ID = 'acquired-rarity-288205' BUCKET = 'gs://ykdb_beam/temp' options = {'project': PROJECT_ID} opts = beam.pipeline.PipelineOptions(flags=[], **options) p = beam.Pipeline('DirectRunner', options=opts) # ***************************************** REMOVE DUPLICATES **************************************************** sql = "SELECT * FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Employer GROUP BY employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator HAVING count = 1) LIMIT 100" bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True, gcs_location=BUCKET) query_results = p | 'Read from BQ' >> beam.io.Read(bq_source) out_pcoll = query_results | 'Remove Dups Employer' >> beam.ParDo( NoDuplicates()) out_pcoll | 'Log output' >> WriteToText('output_employer.txt') # ***************************************** INSERT INTO BQ **************************************************** dataset_id = 'H_1B_refined' table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Employer_Beam' schema_id = 'employer_name:STRING, employer_address:STRING, employer_city:STRING, employer_state:STRING, employer_postal_code:STRING, employer_country:STRING, employer_province:STRING, h_1b_dependent:BOOLEAN, willful_violator:BOOLEAN' out_pcoll | 'Write to BQ' >> WriteToBigQuery( table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET) result = p.run() result.wait_until_finish()
def run(): PROJECT_ID = 'acquired-rarity-288205' BUCKET = 'gs://ykdb_beam/temp' options = {'project': PROJECT_ID} opts = beam.pipeline.PipelineOptions(flags=[], **options) p = beam.Pipeline('DirectRunner', options=opts) # ***************************************** REMOVE DUPLICATES **************************************************** sql = "SELECT * FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Application GROUP BY CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE, VISA_CLASS, employer_name, employer_city HAVING count = 1) LIMIT 100" bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True, gcs_location=BUCKET) query_results = p | 'Read from BQ' >> beam.io.Read(bq_source) out_pcoll = query_results | 'Remove Dups Application' >> beam.ParDo( NoDuplicates()) out_pcoll | 'Log output' >> WriteToText('output_application.txt') # ***************************************** INSERT INTO BQ **************************************************** dataset_id = 'H_1B_refined' table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Application_Beam' schema_id = 'CASE_NUMBER:STRING, CASE_STATUS:STRING, CASE_SUBMITTED:DATE, DECESION_DATE:DATE, VISA_CLASS:STRING, employer_name:STRING, employer_city:STRING' out_pcoll | 'Write to BQ' >> WriteToBigQuery( table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET) result = p.run() result.wait_until_finish()
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', help='Input file to process.')
    parser.add_argument('--output-file',
                        dest='output_file',
                        help='Output file to write results to.')
    parser.add_argument('--output-bq',
                        dest='output_bq',
                        help='Output BigQuery Table to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # pipeline_args.extend([
    #     '--runner=DirectRunner',
    #     # '--temp_location=/tmp/beam/tmp',
    #     '--job_name=parse-twitter-job',
    # ])

    def transform_tweet_to_bq(data):
        d = data['data']
        hashtags = [h['tag'] for h in d.get('entities', {}).get('hashtags', {})]
        places = data.get('includes', {}).get('places', {})
        place = places[0] if len(places) > 0 else {}
        return {
            'id': d['id'],
            'author_id': d['author_id'],
            'created_at': d['created_at'],
            'place_id': d.get('geo', {}).get('place_id', ''),
            'lang': d.get('lang', ''),
            'text': d['text'],
            'keywords': re.findall(r'[@#\w\']{6,}', d['text'], re.UNICODE),
            'place_name': place.get('full_name', ''),
            'place_country': place.get('country_code', ''),
            'hashtags': hashtags,
        }

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:
        output = (p
                  | 'Read' >> ReadFromText(known_args.input)
                  | 'FromJSON' >> beam.Map(json.loads)
                  | 'Transform' >> beam.Map(transform_tweet_to_bq))
        if known_args.output_file:
            unused = (output
                      | 'ToJSON' >> beam.Map(json.dumps)
                      | 'Write' >> WriteToText(known_args.output_file))
        if known_args.output_bq:
            unused = (output
                      | 'LoadBigQuery' >> WriteToBigQuery(
                          known_args.output_bq,
                          method=WriteToBigQuery.Method.FILE_LOADS))
def run():
    with beam.Pipeline(options=PipelineOptions(streaming=True)) as p:
        pc = (p
              | ReadFromPubSub(topic=get_topic_path())
              | beam.WindowInto(
                  window.FixedWindows(WINDOW_SIZE),
                  accumulation_mode=trigger.AccumulationMode.DISCARDING)
              | 'AddWindowInfo' >> beam.ParDo(add_window_info)
              | beam.CombinePerKey(sum)
              | beam.ParDo(prepare_element)
              | 'Print' >> beam.ParDo(print_fn)
              | WriteToBigQuery(BIGQUERY_TABLE_ID))
def build(self, p):
    schema = [f"{c['name']}:{TYPES[c['type']]}" for c in self._columns]
    return (p
            | beam.Map(lambda rec: {k: str(v) for k, v in rec.items()})
            | WriteToBigQuery(
                project=self._project,
                dataset=self._dataset,
                table=self._table,
                schema=','.join(schema),
                write_disposition=self._write_disposition,
                custom_gcs_temp_location=self._gcs_temp_location))
def parse_table_schema(schema):
    """Accepts a BigQuery table schema as a string, dict (from JSON), or
    bigquery.TableSchema and returns a bigquery.TableSchema.

    String format:
        "[FIELD]:[DATA_TYPE],[FIELD]:[DATA_TYPE]"

    dict format:
        {
            "fields": [
                {"name": "[FIELD]", "type": "[DATA_TYPE]"},
                {"name": "[FIELD]", "type": "[DATA_TYPE]"}
            ]
        }

    See https://cloud.google.com/bigquery/data-types and
    https://cloud.google.com/bigquery/docs/schemas#specifying_a_schema_file
    """
    if schema is None:
        return schema
    elif isinstance(schema, bq.TableSchema):
        return schema
    elif isinstance(schema, six.string_types):
        # try to parse json into dict
        try:
            schema = ujson.loads(schema)
        except ValueError:
            pass

    if isinstance(schema, six.string_types):
        # if it is still a string, then it must not be JSON; assume it is the
        # "[FIELD]:[DATA_TYPE],..." string representation
        return WriteToBigQuery.get_table_schema_from_string(schema)
    elif isinstance(schema, dict):
        # either it came in as a dict or it was converted from JSON above
        return parse_table_schema_from_json(ujson.dumps(schema))
    else:
        raise TypeError('Unexpected schema argument: %s.' % schema)
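# A minimal usage sketch for parse_table_schema above; the field names are
# made up, and it assumes the same bq/six/ujson imports and Beam helpers the
# function itself relies on:
schema_from_string = parse_table_schema('name:STRING,age:INTEGER')
schema_from_dict = parse_table_schema(
    {'fields': [{'name': 'name', 'type': 'STRING'},
                {'name': 'age', 'type': 'INTEGER'}]})
# Both calls should yield an equivalent bigquery.TableSchema, which can then be
# passed as the schema argument of WriteToBigQuery.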
def run():
    # set up location
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # use DataflowRunner instead of DirectRunner
    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='imdbcharacter',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')
    p = beam.pipeline.Pipeline(options=options)

    # retrieve the data from the imdb_refined dataset and save this information (location)
    sql = 'SELECT * FROM imdb_refined.Characters'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    # use the previously saved information (location) and read from BigQuery;
    # query_results is now the input PCollection
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    # use ParDo to call a function on the query results
    out_pcoll = query_results | 'Split characters' >> beam.ParDo(
        SplitCharacters())

    # write the results into a text file
    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Characters_Dataflow'
    schema_id = 'tconst:STRING,nconst:STRING,characters:STRING'

    # write to BigQuery using the location set above
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    # run and display results after everything is finished
    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        dest='input_topic',
        help='Input topic in the form projects/<project>/topics/<topic>')
    parser.add_argument('--output', dest='output_file',
                        help='Output file where to write')
    parser.add_argument('--table', dest='table_name', help='BQ table name')
    parser.add_argument('--dataset', dest='dataset_id', help='BQ dataset')
    parser.add_argument('--project_id', dest='project_id', help='Project ID')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--project=main-training-project', '--streaming'])
    """
    pipeline_args.extend(['--runner=DataflowRunner',
                          '--project=yourprojectid',
                          '--staging_location=gs://yourgsbucket',
                          '--temp_location=gs://yourgsbucket',
                          '--job_name=your-job-name'])
    """
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromPubSub(topic=known_args.input_topic)

        def str_to_dict(str_line):
            import pandas as pd
            import nonpypimodule
            import changecommentfield
            df_rows = eval(str_line)
            pd.DataFrame.from_dict(df_rows)
            bq_rows = eval(re.sub(r'\[|\]', '', str_line.decode('utf-8')))
            bq_rows['post'] = nonpypimodule.return_sentence()
            bq_rows = changecommentfield.change_field(bq_rows)
            logging.info(bq_rows)
            return bq_rows

        lines = lines | 'String to dict' >> beam.Map(str_to_dict)
        lines = lines | 'Output to BQ' >> WriteToBigQuery(
            table=known_args.table_name,
            dataset=known_args.dataset_id,
            project=known_args.project_id)
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='occupation',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')
    p = beam.pipeline.Pipeline(options=options)

    '''
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS job_id, * FROM (SELECT job_title, emp.employer_id AS employer_id, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Occupation WHERE prevailing_wage_YR > 5000 GROUP BY job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code HAVING count = 1) AS occ JOIN H_1B_refined.Employer_Dataflow AS emp ON emp.employer_name = occ.employer_name AND emp.employer_city = occ.employer_city) AS t WHERE length(soc_code) >= 6 AND length(soc_code) <= 10 AND length(soc_code) != 8"
    '''
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS job_id, job_title, employer_id, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM (SELECT *, COUNT(*) AS count FROM(SELECT job_title, emp.employer_id AS employer_id, occ.employer_name, occ.employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM H_1B_refined.Occupation as occ JOIN H_1B_refined.Employer_Dataflow AS emp ON emp.employer_name = occ.employer_name AND emp.employer_city = occ.employer_city WHERE prevailing_wage_YR > 5000 AND length(soc_code) >= 6 AND length(soc_code) <= 10 AND length(soc_code) != 8) GROUP BY job_title, employer_id, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code HAVING count = 1) as t"

    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    '''
    out_pcoll_fix_date = query_results | 'Format Date' >> beam.ParDo(FormatDate())
    out_pcoll_fix_date | 'Log fix_date_output' >> WriteToText(DIR_PATH + 'output_occ_fix_date.txt')
    '''
    out_pcoll = query_results | 'Format Soc' >> beam.ParDo(FormatSocCode())
    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_occupation.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Occupation_Dataflow'
    schema_id = 'job_id:INTEGER, job_title:STRING, employer_id:INTEGER, soc_code:STRING, soc_title:STRING, prevailing_wage_YR:FLOAT, pw_wage_level:STRING, pw_wage_source:STRING, pw_wage_source_year:INTEGER, pw_wage_source_other:STRING, worksite_city:STRING, worksite_country:STRING, worksite_state:STRING, worksite_postal_code:STRING'
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run(): PROJECT_ID = 'acquired-rarity-288205' BUCKET = 'gs://ykdb_beam/temp' options = {'project': PROJECT_ID} opts = beam.pipeline.PipelineOptions(flags=[], **options) p = beam.Pipeline('DirectRunner', options=opts) # ***************************************** REMOVE DUPLICATES **************************************************** sql = "SELECT job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Occupation WHERE prevailing_wage_YR > 5000 AND length(soc_code) > 5 GROUP BY job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code HAVING count = 1) LIMIT 50" bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True, gcs_location=BUCKET) query_results = p | 'Read from BQ' >> beam.io.Read(bq_source) out_pcoll_no_dup = query_results | 'Format prevailing_wage_YR and Remove dups' >> beam.ParDo( NoDuplicates()) out_pcoll_no_dup | 'Log no_dup_output' >> WriteToText( 'output_occ_no_dup.txt') out_pcoll_fix_date = out_pcoll_no_dup | 'Format Date' >> beam.ParDo( FormatDate()) out_pcoll_fix_date | 'Log fix_date_output' >> WriteToText( 'output_occ_fix_date.txt') out_pcoll = out_pcoll_fix_date | 'Format Soc' >> beam.ParDo( FormatSocCode()) out_pcoll | 'Log output' >> WriteToText('output_occupation.txt') # ***************************************** INSERT INTO BQ **************************************************** dataset_id = 'H_1B_refined' table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Occupation_Beam' schema_id = 'job_title:STRING, employer_name:STRING, employer_city:STRING, employment_start_date:Date, employment_end_date:Date, soc_code:STRING, soc_title:STRING, prevailing_wage_YR:FLOAT, pw_wage_level:STRING, pw_wage_source:STRING, pw_wage_source_year:INTEGER, pw_wage_source_other:STRING, worksite_city:STRING, worksite_country:STRING, worksite_state:STRING, worksite_postal_code:STRING' out_pcoll | 'Write to BQ' >> WriteToBigQuery( table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET) result = p.run() result.wait_until_finish()
def run():
    pipeline_options = PipelineOptions()
    options = pipeline_options.view_as(SQLOptions)
    # options.view_as(SetupOptions).save_main_session = True
    # temp_location = options.view_as(GoogleCloudOptions).temp_location
    # print("Here!", temp_location)

    pipeline = beam.Pipeline(options=options)

    mysql_data = (
        pipeline
        | ReadFromSQL(host=options.host,
                      port=options.port,
                      username=options.username,
                      password=options.password,
                      database=options.database,
                      query=options.query,
                      # wrapper={'mssql': MSSQLWrapper, 'mysql': MySQLWrapper, 'postgres': PostgresWrapper}[options.db_type],
                      wrapper=MSSQLWrapper,
                      # wrapper=PostgresWrapper
                      )
        # | 'Parse' >> beam.Map(parse_json)
        | 'Write to Table' >> WriteToBigQuery(
            table=options.output_table,
            schema='SCHEMA_AUTODETECT',
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))

    # transformed_data = mysql_data | "Transform records" >> beam.Map(transform_records, 'user_id')
    # transformed_data | "insert into mysql" >> SQLWriter(options.host, options.port,
    #                                                     options.username, options.password,
    #                                                     options.database, table='output',
    #                                                     wrapper=MySQLWrapper, autocommit=True, batch_size=500)
    # transformed_data | "insert into postgres" >> SQLWriter(options.host, 5432,
    #                                                        'postgres', options.password,
    #                                                        options.database, table=options.output_table,
    #                                                        wrapper=PostgresWrapper, autocommit=False, batch_size=500)
    # mysql_data | "Log records " >> beam.Map(log) | beam.io.WriteToText(options.output, num_shards=1, file_name_suffix=".json")

    pipeline.run().wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='application',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')
    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS case_id, * FROM (SELECT emp.employer_id, CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE AS DECISION_DATE, VISA_CLASS FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Application GROUP BY CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE, VISA_CLASS, employer_name, employer_city HAVING count = 1) AS app JOIN H_1B_refined.Employer_Dataflow AS emp ON emp.employer_name = app.employer_name AND emp.employer_city = app.employer_city) AS t"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Application Transformation' >> beam.ParDo(
        NoDuplicates())
    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_application.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Application_Dataflow'
    schema_id = 'case_id:INTEGER, employer_id:INTEGER, CASE_NUMBER:STRING, CASE_STATUS:STRING, CASE_SUBMITTED:DATE, DECISION_DATE:DATE, VISA_CLASS:STRING'
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output BigQuery table to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as( SetupOptions).save_main_session = save_main_session p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. (p | 'read' >> ReadFromText(known_args.input) | 'parse message' >> beam.Map(parse_message) | 'window per minute' >> beam.WindowInto( window.FixedWindows(1), trigger=trigger.DefaultTrigger(), accumulation_mode=trigger.AccumulationMode.DISCARDING) | 'group by id' >> beam.GroupByKey() | 'sum_value' >> beam.Map(sum_value) | 'format for bq' >> beam.Map(format_for_bigquery) | 'write to bigquery' >> WriteToBigQuery( table=known_args.output, schema='key:INTEGER,sum:FLOAT', create_disposition='CREATE_IF_NEEDED', write_disposition='WRITE_APPEND')) result = p.run() result.wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='ownership',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')
    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) as ownership_id, * FROM (SELECT * FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Ownership GROUP BY occ_code, occ_title, ownership, naics_title, grp, tot_emp, emp_prse, h_mean, a_mean, mean_prse, a_pct10, a_pct25, a_median, a_pct75, a_pct90 HAVING count = 1)) as t"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Ownership' >> beam.ParDo(
        NoDuplicates())
    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_ownership.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Ownership_Dataflow'
    schema_id = 'ownership_id:INTEGER, occ_code:STRING, occ_title:STRING, ownership:STRING, naics_title:STRING, grp:STRING, tot_emp:INTEGER, emp_prse:FLOAT, h_mean:FLOAT, a_mean:INTEGER, mean_prse:FLOAT, a_pct10:INTEGER, a_pct25:INTEGER, a_median:INTEGER, a_pct75:INTEGER, a_pct90:INTEGER'
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='employer',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')
    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS employer_id, * FROM (SELECT employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Employer GROUP BY employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator HAVING count = 1)) AS t"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Employer' >> beam.ParDo(
        NoDuplicates())
    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_employer.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Employer_Dataflow'
    schema_id = 'employer_id:INTEGER, employer_name:STRING, employer_address:STRING, employer_city:STRING, employer_state:STRING, employer_postal_code:STRING, employer_country:STRING, employer_province:STRING, h_1b_dependent:BOOLEAN, willful_violator:BOOLEAN'
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run():
    # set up location
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)
    # executed with DirectRunner
    p = beam.Pipeline('DirectRunner', options=opts)

    # retrieve the data from the imdb_refined dataset and save this information (location)
    sql = 'SELECT * FROM imdb_refined.Primary_Professions limit 250'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    # use the previously saved information (location) and read from BigQuery;
    # query_results is now the input PCollection
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    # use ParDo to call a function on the query results
    out_pcoll = query_results | 'Split Primary Professions' >> beam.ParDo(
        SplitPrimaryProfessions())
    out_pcoll | 'Log output' >> WriteToText('output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Primary_Professions_Beam'
    schema_id = 'nconst:STRING,primaryProfession:STRING'

    # write to BigQuery using the location set above
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    # run and display results after everything is finished
    result = p.run()
    result.wait_until_finish()