def test_one_job_fails_all_jobs_fail(self): # If one of the import jobs fails, then other jobs must not be performed. # This is to avoid reinsertion of some records when a pipeline fails and # is rerun. output_table_1 = '%s%s' % (self.output_table, 1) output_table_2 = '%s%s' % (self.output_table, 2) self.bigquery_client.get_or_create_table( self.project, self.dataset_id, output_table_1.split('.')[1], bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA), None, None) self.bigquery_client.get_or_create_table( self.project, self.dataset_id, output_table_2.split('.')[1], bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2), None, None) pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT name, language FROM %s" % output_table_1, data=[]), BigqueryFullResultMatcher( project=self.project, query="SELECT name, foundation FROM %s" % output_table_2, data=[]) ] args = self.test_pipeline.get_full_options_as_args( experiments='use_beam_bq_sink') with self.assertRaises(Exception): # The pipeline below fails because neither a schema nor SCHEMA_AUTODETECT # are specified. with beam.Pipeline(argv=args) as p: input = p | beam.Create(_ELEMENTS) input2 = p | "Broken record" >> beam.Create(['language_broken_record']) input = (input, input2) | beam.Flatten() _ = ( input | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery( table=lambda x: (output_table_1 if 'language' in x else output_table_2), create_disposition=( beam.io.BigQueryDisposition.CREATE_IF_NEEDED), write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, temp_file_format=bigquery_tools.FileFormat.JSON)) hamcrest_assert(p, all_of(*pipeline_verifiers))
def run_bq_pipeline(argv=None):
  """Run the sample BigQuery pipeline.

  Args:
    argv: Arguments to the run function.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--query', required=True,
                      help='Query to process for the table.')
  parser.add_argument('--output', required=True,
                      help='Output BQ table to write results to.')
  parser.add_argument('--output_schema', dest='output_schema', required=True,
                      help='Schema for output BQ table.')
  parser.add_argument('--use_standard_sql', action='store_true',
                      dest='use_standard_sql',
                      help='Use Standard SQL syntax for the query.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  table_schema = parse_table_schema_from_json(known_args.output_schema)

  p = TestPipeline(options=PipelineOptions(pipeline_args))

  # pylint: disable=expression-not-assigned
  # pylint: disable=bad-continuation
  (p | 'read' >> beam.io.Read(beam.io.BigQuerySource(
      query=known_args.query,
      use_standard_sql=known_args.use_standard_sql))
   | 'write' >> beam.io.Write(beam.io.BigQuerySink(
       known_args.output,
       schema=table_schema,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)))

  result = p.run()
  result.wait_until_finish()
def test_parse_table_schema_from_json(self):
  string_field = bigquery.TableFieldSchema(
      name='s', type='STRING', mode='NULLABLE', description='s description')
  number_field = bigquery.TableFieldSchema(
      name='n', type='INTEGER', mode='REQUIRED', description='n description')
  record_field = bigquery.TableFieldSchema(
      name='r', type='RECORD', mode='REQUIRED', description='r description',
      fields=[string_field, number_field])
  expected_schema = bigquery.TableSchema(fields=[record_field])
  json_str = json.dumps({
      'fields': [{
          'name': 'r',
          'type': 'RECORD',
          'mode': 'REQUIRED',
          'description': 'r description',
          'fields': [{
              'name': 's',
              'type': 'STRING',
              'mode': 'NULLABLE',
              'description': 's description'
          }, {
              'name': 'n',
              'type': 'INTEGER',
              'mode': 'REQUIRED',
              'description': 'n description'
          }]
      }]
  })
  self.assertEqual(parse_table_schema_from_json(json_str), expected_schema)
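As a quick illustration of the round trip the test above exercises, a minimal sketch (assuming `apache-beam[gcp]` is installed; the field names are illustrative, not from the original test):

# Minimal sketch: parse a flat JSON schema and inspect the resulting TableSchema.
import json

from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json

schema_json = json.dumps({
    'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'age', 'type': 'INTEGER', 'mode': 'REQUIRED'},
    ]
})

table_schema = parse_table_schema_from_json(schema_json)
for field in table_schema.fields:
  print(field.name, field.type, field.mode)  # e.g. "name STRING NULLABLE"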
def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.project = self.test_pipeline.get_option('project') # Set up PubSub environment. from google.cloud import pubsub self.pub_client = pubsub.PublisherClient() self.pubsub_setup_client = PubSubSetupClient(project=self.project) self.input_topic = self.pubsub_setup_client.create_topic(INPUT_TOPIC) self.output_topic = self.pubsub_setup_client.create_topic(OUTPUT_TOPIC) self.input_sub = self.pubsub_setup_client.create_subscription( self.input_topic, INPUT_SUB) self.output_sub = self.pubsub_setup_client.create_subscription( self.output_topic, OUTPUT_SUB) # Set up BigQuery tables self.dataset_ref = utils.create_bq_dataset(self.project, OUTPUT_DATASET) self.bq_wrapper = BigQueryWrapper() table_schema = parse_table_schema_from_json(schemas.get_test_schema()) def _create_table(table_id, schema): return self.bq_wrapper.get_or_create_table( project_id=self.project, dataset_id=self.dataset_ref.dataset_id, table_id=table_id, schema=schema, create_disposition='CREATE_IF_NEEDED', write_disposition='WRITE_APPEND') self.table_ref = _create_table(OUTPUT_TABLE, table_schema)
def import_json_bq_schema():
  path = os.path.join(
      os.path.dirname(inspect.getfile(inspect.currentframe())),
      'mimic_cxr_bigquery_labels_schema.json')
  with open(path) as fp:
    return parse_table_schema_from_json(fp.read())
def _create_input_data(self):
  """Runs an additional pipeline which creates test data and waits for its
  completion.
  """
  SCHEMA = parse_table_schema_from_json(
      '{"fields": [{"name": "data", "type": "BYTES"}]}')

  def format_record(record):
    # SyntheticSource produces (key, value) pairs; only the value part is
    # needed here, base64-encoded so it can be written to a BYTES column.
    import base64
    return {'data': base64.b64encode(record[1])}

  with TestPipeline() as p:
    (  # pylint: disable=expression-not-assigned
        p
        | 'Produce rows' >> Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Format' >> Map(format_record)
        | 'Write to BigQuery' >> WriteToBigQuery(
            dataset=self.input_dataset,
            table=self.input_table,
            schema=SCHEMA,
            create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=BigQueryDisposition.WRITE_EMPTY))
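The base64 step above matters because BYTES values are carried as base64 text in JSON load files; a small, self-contained sketch of the encode/decode round trip (the payload is illustrative):

# Illustrative only: bytes must be base64-encoded before landing in a BYTES column
# via the JSON file format, and decode back losslessly.
import base64

raw = b'\x00\x01binary payload'
encoded = base64.b64encode(raw)          # ASCII-safe text, e.g. b'AAFi...'
assert base64.b64decode(encoded) == raw  # round-trips back to the original bytes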
def parse_method(self, string_input): """This method translates a single line of comma separated values to a dictionary which can be loaded into BigQuery. """ # Strip out return characters and quote characters. schema = parse_table_schema_from_json(self.schema_str) field_map = [f for f in schema.fields] # Use a CSV Reader which can handle quoted strings etc. reader = csv.reader(string_input.split('\n')) for csv_row in reader: month = '01' day = '01' year = csv_row[2] row = {} i = 0 for value in csv_row: if field_map[i].type == 'DATE': # Format the date to YYYY-MM-DD format which BigQuery # accepts. value = '-'.join((year, month, day)) row[field_map[i].name] = value i += 1 return row
def run_bq_pipeline(argv=None): """Run the sample BigQuery pipeline. Args: argv: Arguments to the run function. """ parser = argparse.ArgumentParser() parser.add_argument('--query', required=True, help='Query to process for the table.') parser.add_argument('--output', required=True, help='Output BQ table to write results to.') parser.add_argument('--output_schema', dest='output_schema', required=True, help='Schema for output BQ table.') parser.add_argument('--use_standard_sql', action='store_true', dest='use_standard_sql', help='Output BQ table to write results to.') parser.add_argument('--kms_key', default=None, help='Use this Cloud KMS key with BigQuery.') parser.add_argument('--native', default=False, action='store_true', help='Use NativeSources and Sinks.') known_args, pipeline_args = parser.parse_known_args(argv) table_schema = parse_table_schema_from_json(known_args.output_schema) kms_key = known_args.kms_key p = TestPipeline(options=PipelineOptions(pipeline_args)) # Note to future modifiers: Keep using BigQuerySource if known_args.native is # True. data = p | 'read' >> beam.io.Read( beam.io.BigQuerySource(query=known_args.query, use_standard_sql=known_args.use_standard_sql, kms_key=kms_key)) if known_args.native: _ = data | 'write' >> beam.io.Write( beam.io.BigQuerySink( known_args.output, schema=table_schema, create_disposition=beam.io.BigQueryDisposition. CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY, kms_key=kms_key)) else: _ = data | 'write' >> beam.io.WriteToBigQuery( known_args.output, schema=table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY, kms_key=kms_key) result = p.run() result.wait_until_finish()
def test_one_job_fails_all_jobs_fail(self): # If one of the import jobs fails, then other jobs must not be performed. # This is to avoid reinsertion of some records when a pipeline fails and # is rerun. output_table_1 = '%s%s' % (self.output_table, 1) output_table_2 = '%s%s' % (self.output_table, 2) self.bigquery_client.get_or_create_table( self.project, self.dataset_id, output_table_1.split('.')[1], bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA), None, None) self.bigquery_client.get_or_create_table( self.project, self.dataset_id, output_table_2.split('.')[1], bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2), None, None) pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT * FROM %s" % output_table_1, data=[]), BigqueryFullResultMatcher( project=self.project, query="SELECT * FROM %s" % output_table_2, data=[])] args = self.test_pipeline.get_full_options_as_args( experiments='use_beam_bq_sink') with self.assertRaises(Exception): with beam.Pipeline(argv=args) as p: input = p | beam.Create(_ELEMENTS) input2 = p | "Broken record" >> beam.Create(['language_broken_record']) input = (input, input2) | beam.Flatten() _ = (input | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery( table=lambda x: (output_table_1 if 'language' in x else output_table_2), create_disposition=( beam.io.BigQueryDisposition.CREATE_IF_NEEDED), write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) hamcrest_assert(p, all_of(*pipeline_verifiers))
def _validate_schema(self, expected_fields, actual_schema):
  super()._validate_schema(expected_fields, actual_schema)

  json_schema = schema_converter.convert_table_schema_to_json_bq_schema(
      actual_schema)
  # Beam expects the schema to be generated from a dict whose 'fields' item
  # is the list of columns, while the 'bq mk' command expects just the list
  # of fields.
  updated_json_schema = json.dumps({"fields": json.loads(json_schema)})
  schema_from_json = parse_table_schema_from_json(updated_json_schema)
  self.assertEqual(schema_from_json, actual_schema)
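The wrapping step above is easy to get wrong in either direction; a self-contained sketch of the same conversion with an illustrative two-column schema:

# Illustrative sketch: a bare 'bq mk'-style field list must be wrapped in a
# {"fields": [...]} object before Beam can parse it into a TableSchema.
import json

from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json

bq_mk_style = '[{"name": "id", "type": "STRING"}, {"name": "score", "type": "FLOAT"}]'
beam_style = json.dumps({"fields": json.loads(bq_mk_style)})

schema = parse_table_schema_from_json(beam_style)
print([f.name for f in schema.fields])  # ['id', 'score']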
def run(argv=None): """The main function which creates the pipeline and runs it.""" parser = argparse.ArgumentParser() # Here we add some specific command line arguments we expect. Specifically # we have the input file to load and the output table to write to. parser.add_argument( '--input', dest='input', required=False, help='Input file to read. This can be a local file or ' 'a file in a Google Storage Bucket.', # This example file contains a total of only 10 lines. # It is useful for developing on a small set of data default='gs://spls/gsp290/data_files/head_usa_names.csv') # This defaults to the temp dataset in your BigQuery project. You'll have # to create the temp dataset yourself using bq mk temp parser.add_argument('--output', dest='output', required=False, help='Output BQ table to write results to.', default='lake.usa_names_transformed') # Parse arguments from the command line. known_args, pipeline_args = parser.parse_known_args(argv) # DataTransformation is a class we built in this script to hold the logic for # transforming the file into a BigQuery table. data_ingestion = DataTransformation() # Initiate the pipeline using the pipeline arguments passed in from the # command line. This includes information like where Dataflow should # store temp files, and what the project id is. p = beam.Pipeline(options=PipelineOptions(pipeline_args)) schema = parse_table_schema_from_json(data_ingestion.schema_str) (p # Read the file. This is the source of the pipeline. All further # processing starts with lines read from the file. We use the input # argument from the command line. We also skip the first line which is a # header row. | 'Read From Text' >> beam.io.ReadFromText(known_args.input, skip_header_lines=1) # This stage of the pipeline translates from a CSV file single row # input as a string, to a dictionary object consumable by BigQuery. # It refers to a function we have written. This function will # be run in parallel on different workers using input from the # previous stage of the pipeline. | 'String to BigQuery Row' >> beam.Map(lambda s: data_ingestion.parse_method(s)) | 'Write to BigQuery' >> beam.io.Write( beam.io.BigQuerySink( # The table name is a required argument for the BigQuery sink. # In this case we use the value passed in from the command line. known_args.output, # Here we use the JSON schema read in from a JSON file. # Specifying the schema allows the API to create the table correctly if it does not yet exist. schema=schema, # Creates the table in BigQuery if it does not yet exist. create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, # Deletes all data in the BigQuery table before writing. write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))) p.run().wait_until_finish()
def get_table_schema(schema):
  """Transform the table schema into a bigquery.TableSchema instance.

  Args:
    schema: The schema to be used if the BigQuery table to write has to be
      created. This is a dictionary object created in the WriteToBigQuery
      transform.

  Returns:
    table_schema: The schema to be used if the BigQuery table to write has
      to be created but in the bigquery.TableSchema format.
  """
  if schema is None:
    return schema
  elif isinstance(schema, (str, unicode)):
    return bigquery_tools.parse_table_schema_from_json(schema)
  elif isinstance(schema, dict):
    return bigquery_tools.parse_table_schema_from_json(json.dumps(schema))
  else:
    raise TypeError('Unexpected schema argument: %s.' % schema)
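This helper predates Python 3, where `unicode` is undefined; a minimal Python 3 sketch of the same dispatch, shown for illustration only (not the library's implementation):

# Python 3 sketch of the dispatch above: 'str' alone covers the string case.
import json

from apache_beam.io.gcp import bigquery_tools


def get_table_schema_py3(schema):
  if schema is None:
    return schema
  elif isinstance(schema, str):
    return bigquery_tools.parse_table_schema_from_json(schema)
  elif isinstance(schema, dict):
    return bigquery_tools.parse_table_schema_from_json(json.dumps(schema))
  raise TypeError('Unexpected schema argument: %s.' % schema)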
def parse_method(self, string_input): """This method translates a single line of comma separated values to a dictionary which can be loaded into BigQuery. Args: string_input: A comma separated list of values in the form of state_abbreviation,gender,year,name,count_of_babies,dataset_created_date example string_input: KS,F,1923,Dorothy,654,11/28/2016 Returns: A dict mapping BigQuery column names as keys to the corresponding value parsed from string_input. In this example, the data is not transformed, and remains in the same format as the CSV. There are no date format transformations. example output: {'state': 'KS', 'gender': 'F', 'year': '1923-01-01', <- This is the BigQuery date format. 'name': 'Dorothy', 'number': '654', 'created_date': '11/28/2016' } """ # Strip out return characters and quote characters. schema = parse_table_schema_from_json(self.schema_str) field_map = [f for f in schema.fields] # Use a CSV Reader which can handle quoted strings etc. reader = csv.reader(string_input.split('\n')) for csv_row in reader: # Our source data only contains year, so default January 1st as the # month and day. month = '01' day = '01' # The year comes from our source data. year = csv_row[2] row = {} i = 0 # Iterate over the values from our csv file, applying any transformation logic. for value in csv_row: # If the schema indicates this field is a date format, we must # transform the date from the source data into a format that # BigQuery can understand. if field_map[i].type == 'DATE': # Format the date to YYYY-MM-DD format which BigQuery # accepts. value = '-'.join((year, month, day)) row[field_map[i].name] = value i += 1 return row
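For the example row in the docstring above, a self-contained sketch of the same transformation; the schema and column names are taken from the docstring, and the helper name is hypothetical:

# Illustrative sketch of the date handling described above: only the field
# whose schema type is DATE is rewritten to YYYY-MM-DD.
import csv
import json

from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json

SCHEMA_STR = json.dumps({'fields': [
    {'name': 'state', 'type': 'STRING'},
    {'name': 'gender', 'type': 'STRING'},
    {'name': 'year', 'type': 'DATE'},
    {'name': 'name', 'type': 'STRING'},
    {'name': 'number', 'type': 'STRING'},
    {'name': 'created_date', 'type': 'STRING'},
]})


def parse_line(line):
  fields = parse_table_schema_from_json(SCHEMA_STR).fields
  for csv_row in csv.reader([line]):
    year = csv_row[2]
    return {
        f.name: ('-'.join((year, '01', '01')) if f.type == 'DATE' else value)
        for f, value in zip(fields, csv_row)
    }


print(parse_line('KS,F,1923,Dorothy,654,11/28/2016'))
# {'state': 'KS', 'gender': 'F', 'year': '1923-01-01', 'name': 'Dorothy',
#  'number': '654', 'created_date': '11/28/2016'}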
def run_bq_pipeline(argv=None): """Run the sample BigQuery pipeline. Args: argv: Arguments to the run function. """ parser = argparse.ArgumentParser() parser.add_argument('--query', required=True, help='Query to process for the table.') parser.add_argument('--output', required=True, help='Output BQ table to write results to.') parser.add_argument('--output_schema', dest='output_schema', required=True, help='Schema for output BQ table.') parser.add_argument('--use_standard_sql', action='store_true', dest='use_standard_sql', help='Output BQ table to write results to.') parser.add_argument('--kms_key', default=None, help='Use this Cloud KMS key with BigQuery.') parser.add_argument('--bq_temp_location', default=None, help=('GCS bucket to use to store files for ' 'loading data into BigQuery.')) known_args, pipeline_args = parser.parse_known_args(argv) table_schema = parse_table_schema_from_json(known_args.output_schema) kms_key = known_args.kms_key p = TestPipeline(options=PipelineOptions(pipeline_args)) # pylint: disable=expression-not-assigned # pylint: disable=bad-continuation (p | 'read' >> beam.io.Read( beam.io.BigQuerySource(query=known_args.query, use_standard_sql=known_args.use_standard_sql, kms_key=kms_key)) | 'write' >> beam.io.WriteToBigQuery( known_args.output, schema=table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY, gs_location=known_args.bq_temp_location)) result = p.run() result.wait_until_finish()
def run(argv=None): known_args, pipeline_args = _parse_user_args(argv) options = get_pipeline_options(pipeline_args) # Load schema schema = '{"fields": ' + open(known_args.schema_path, "r").read() + '}' schema = parse_table_schema_from_json(schema) with beam.Pipeline(options=options) as p: # Get message from pubsub and split it by identifier formated_messages = ( p | "Read from PubSub" >> beam.io.ReadFromPubSub(known_args.topic) | "Windowing" >> beam.WindowInto(window.FixedWindows(30)) | "Decoder" >> beam.Map(lambda e: e.decode()) | "Split into List" >> beam.ParDo(SplitWords(","))) # Pipeline split: # 1. Write to FS # 2. Snooze for 10 sec, and change data locally # Write to FS writer_messages = ( formated_messages | "Write to FS" >> beam.ParDo(WriteToFS()) | "Get FS keys" >> beam.Map(lambda val: (val["uniqe_id"], val))) # Snooze for 10 sec, and change data locally do_something_that_takes_time = ( formated_messages | "Snooze For 10 Seconds" >> beam.ParDo(Snooze()) | "Add Data" >> beam.ParDo(ChangeData("changed!")) | "Get Update keys" >> beam.Map(lambda val: (val["uniqe_id"], val))) # Pipeline group by id and update data in FS after changed locally results = ((writer_messages, do_something_that_takes_time) | "Group by key" >> beam.CoGroupByKey() | "Update FS" >> beam.ParDo(UpdateToFS())) # Write updated data to Big Query (results | "Read Document From FS" >> beam.ParDo(ReadFromFS()) | "Format For BQ" >> beam.ParDo(FormatForBQ()) | "Write to BigQuery" >> beam.io.WriteToBigQuery("saar.messaging", schema=schema))
def parse_table_schema(schema):
  """
  Accepts a BigQuery table schema as a string, dict (from json), or
  bigquery.TableSchema, and returns a bigquery.TableSchema.

  String format: "[FIELD]:[DATA_TYPE],[FIELD]:[DATA_TYPE]"

  dict format:
  { "fields": [
      { "name": "[FIELD]", "type": "[DATA_TYPE]" },
      { "name": "[FIELD]", "type": "[DATA_TYPE]" }
  ]}

  see https://cloud.google.com/bigquery/data-types
  see https://cloud.google.com/bigquery/docs/schemas#specifying_a_schema_file
  """
  if schema is None:
    return schema
  elif isinstance(schema, bq.TableSchema):
    return schema
  elif isinstance(schema, six.string_types):
    # Try to parse json into a dict.
    try:
      schema = ujson.loads(schema)
    except ValueError:
      pass

  if isinstance(schema, six.string_types):
    # If it is still a string, then it must not be json. Assume it is the
    # compact string representation.
    return WriteToBigQuery.get_table_schema_from_string(schema)
  elif isinstance(schema, dict):
    # Either it came in as a dict or it got converted from json earlier.
    return parse_table_schema_from_json(ujson.dumps(schema))
  else:
    raise TypeError('Unexpected schema argument: %s.' % schema)
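A short usage sketch for the helper above, exercising both accepted shapes (the compact string form and the dict form); it assumes `parse_table_schema` is importable from the module it lives in, and the field names are illustrative:

# Illustrative only: both calls should yield a TableSchema with the same fields.
compact = 'name:STRING,score:FLOAT'
as_dict = {'fields': [{'name': 'name', 'type': 'STRING'},
                      {'name': 'score', 'type': 'FLOAT'}]}

schema_a = parse_table_schema(compact)
schema_b = parse_table_schema(as_dict)
print([f.name for f in schema_a.fields] == [f.name for f in schema_b.fields])  # True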
def test(self):
  SCHEMA = parse_table_schema_from_json(
      '{"fields": [{"name": "data", "type": "BYTES"}]}')

  def format_record(record):
    # SyntheticSource produces (key, value) pairs; only the value part is
    # needed here.
    return {'data': base64.b64encode(record[1])}

  # pylint: disable=expression-not-assigned
  (self.pipeline
   | 'ProduceRows' >> Read(
       SyntheticSource(self.parseTestPipelineOptions()))
   | 'Format' >> Map(format_record)
   | 'WriteToBigQuery' >> WriteToBigQuery(
       self.output_dataset + '.' + self.output_table,
       schema=SCHEMA,
       create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=BigQueryDisposition.WRITE_EMPTY))
def run(argv=None): """The main function which creates the pipeline and runs it.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', required=False, help='Input file to read', default='gs://linux-etl/data_files/head_usa_names.csv') parser.add_argument('--output', dest='output', required=False, help='Output BQ table to write results to.', default='lake.usa_names_transformed') # Parse arguments from the command line. known_args, pipeline_args = parser.parse_known_args(argv) # DataTransformation is a class we built in this script to hold the logic for # transforming the file into a BigQuery table. data_ingestion = DataTransformation() # Initiate the pipeline using the pipeline arguments passed in from the # command line. p = beam.Pipeline(options=PipelineOptions(pipeline_args)) schema = parse_table_schema_from_json(data_ingestion.schema_str) (p | 'Read From Text' >> beam.io.ReadFromText(known_args.input, skip_header_lines=1) | 'String to BigQuery Row' >> beam.Map(lambda s: data_ingestion.parse_method(s)) | 'Write to BigQuery' >> beam.io.Write( beam.io.BigQuerySink( known_args.output, schema=schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))) p.run().wait_until_finish()
def test(self):
  SCHEMA = parse_table_schema_from_json(
      '{"fields": [{"name": "data", "type": "BYTES"}]}')

  def format_record(record):
    # SyntheticSource produces (key, value) pairs; only the value part is
    # needed here.
    return {'data': base64.b64encode(record[1])}

  (  # pylint: disable=expression-not-assigned
      self.pipeline
      | 'Produce rows' >> Read(
          SyntheticSource(self.parse_synthetic_source_options()))
      | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
      | 'Format' >> Map(format_record)
      | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
      | 'Write to BigQuery' >> WriteToBigQuery(
          dataset=self.output_dataset,
          table=self.output_table,
          schema=SCHEMA,
          create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
def expand(self, pcoll):
  from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json
  import json

  schema = None
  if self.schema:
    schema = parse_table_schema_from_json(json.dumps(self.schema))

  out = pcoll | io.Write(
      io.BigQuerySink(
          self.table_reference.tableId,
          self.table_reference.datasetId,
          self.table_reference.projectId,
          schema,
          self.create_disposition,
          self.write_disposition,
          kms_key=self.kms_key))

  # WriteToBigQuery can have different outputs depending on whether it runs
  # in batch or streaming mode. This retrieves the output keys from the node
  # and replaces them here to be consistent.
  return {key: out for key in self.outputs}
def test_multiple_destinations_transform(self): output_table_1 = '%s%s' % (self.output_table, 1) output_table_2 = '%s%s' % (self.output_table, 2) output_table_3 = '%s%s' % (self.output_table, 3) output_table_4 = '%s%s' % (self.output_table, 4) schema1 = bigquery.WriteToBigQuery.get_dict_table_schema( bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA)) schema2 = bigquery.WriteToBigQuery.get_dict_table_schema( bigquery_tools.parse_table_schema_from_json( self.BIG_QUERY_SCHEMA_2)) schema_kv_pairs = [ (output_table_1, schema1), (output_table_2, schema2), (output_table_3, schema1), (output_table_4, schema2) ] pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT name, language FROM %s" % output_table_1, data=[(d['name'], d['language']) for d in _ELEMENTS if 'language' in d]), BigqueryFullResultMatcher( project=self.project, query="SELECT name, foundation FROM %s" % output_table_2, data=[(d['name'], d['foundation']) for d in _ELEMENTS if 'foundation' in d]), BigqueryFullResultMatcher( project=self.project, query="SELECT name, language FROM %s" % output_table_3, data=[(d['name'], d['language']) for d in _ELEMENTS if 'language' in d]), BigqueryFullResultMatcher( project=self.project, query="SELECT name, foundation FROM %s" % output_table_4, data=[(d['name'], d['foundation']) for d in _ELEMENTS if 'foundation' in d]) ] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=all_of(*pipeline_verifiers), experiments='use_beam_bq_sink') with beam.Pipeline(argv=args) as p: input = p | beam.Create(_ELEMENTS) schema_map_pcv = beam.pvalue.AsDict( p | "MakeSchemas" >> beam.Create(schema_kv_pairs)) table_record_pcv = beam.pvalue.AsDict( p | "MakeTables" >> beam.Create([('table1', output_table_1), ('table2', output_table_2)])) # Get all input in same machine input = (input | beam.Map(lambda x: (None, x)) | beam.GroupByKey() | beam.FlatMap(lambda elm: elm[1])) _ = ( input | "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery( table=lambda x, tables: (tables['table1'] if 'language' in x else tables['table2']), table_side_inputs=(table_record_pcv, ), schema=lambda dest, schema_map: schema_map.get(dest, None), schema_side_inputs=(schema_map_pcv, ), create_disposition=beam.io.BigQueryDisposition. CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) _ = (input | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery( table=lambda x: (output_table_3 if 'language' in x else output_table_4), schema=lambda dest, schema_map: schema_map.get(dest, None), schema_side_inputs=(schema_map_pcv, ), create_disposition=beam.io.BigQueryDisposition. CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY, max_file_size=20, max_files_per_bundle=-1))
def parse_table_schema_from_json(schema_string):
  return bigquery_tools.parse_table_schema_from_json(schema_string)
def run(args, pipeline_args=None): """Executes Pipeline. :param args: :param pipeline_args: :return: """ """Build and run the pipeline.""" # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args, streaming=True, save_main_session=True) pipeline_options.view_as(StandardOptions).runner = args.runner # Run on Cloud DataFlow by default google_cloud_options = pipeline_options.view_as(GoogleCloudOptions) google_cloud_options.project = PROJECT_ID google_cloud_options.job_name = 'pubsub-api-bigquery' google_cloud_options.staging_location = args.staging_location google_cloud_options.temp_location = args.temp_location google_cloud_options.region = args.region p = beam.Pipeline(options=pipeline_options) lines = p | 'read in tweets' >> beam.io.ReadFromPubSub( topic=args.input_topic, with_attributes=False, id_label='tweet_id') # TODO: Change to PubSub id. # Window them, and batch them into batches. (Not too large) output_tweets = ( lines | 'assign window key' >> beam.WindowInto( window.FixedWindows(args.window_size)) | 'batch into n batches' >> BatchElements( min_batch_size=args.min_batch_size, max_batch_size=args.max_batch_size) | 'predict sentiment' >> beam.FlatMap(lambda messages: prediction_helper(messages))) # Make explicit BQ schema for output tables: bq_schema_json = { "fields": [ { "name": "id", "type": "STRING" }, { "name": "text", "type": "STRING" }, { "name": "user_id", "type": "STRING" }, { "name": "sentiment", "type": "FLOAT" }, { "name": "posted_at", "type": "TIMESTAMP" }, { "name": "favorite_count", "type": "INTEGER" }, { "name": "retweet_count", "type": "INTEGER" }, { "name": "media", "type": "STRING" }, ] } bq_schema = parse_table_schema_from_json(json.dumps(bq_schema_json)) # Write to BigQuery output_tweets | 'store twitter posts' >> beam.io.WriteToBigQuery( table=args.bigquery_table, dataset=args.bigquery_dataset, schema=bq_schema, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, project=PROJECT_ID) result = p.run() result.wait_until_finish()
def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=True,
                        help='Input path')
    parser.add_argument('--output', dest='output', required=True,
                        help='Output file to write results to.')
    # parser.add_argument('--host',
    #                     dest='host',
    #                     required=False,
    #                     help='Database host')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    table_schema = parse_table_schema_from_json(json.dumps(_FIELDS))

    # additional_bq_parameters = {
    #     'timePartitioning': {
    #         'type': 'DAY',
    #         'field': 'orderdate'}}

    # Script to connect to the database
    # db_config = DBConfig(drivername='postgresql',
    #                      username='******',
    #                      password='******',
    #                      database='perfectorder',
    #                      host=known_args.host,
    #                      port=5432)

    ####### FILTERS EXAMPLES #######################
    #
    # today = date.today()
    # filters = "orders_order.due_date >= TO_DATE('{}', 'YYYY-MM-DD')".format(today)
    #
    # filters = "orders_order.customer_id = 3 AND orders_order.due_date >= TO_DATE('{}', 'YYYY-MM-DD')".format(today)
    #
    #####################################################################################################################

    p = beam.Pipeline(options=pipeline_options)

    #########################################################
    #
    # GENERAL DATA
    #########################################################
    _region = f"{known_args.input}dss_region.csv"
    _region_columns = ['r_regionkey', 'r_name', 'r_comment']

    logging.info('Reading region data..')
    pregion = (
        p
        | 'Reading region data' >> beam.io.ReadFromText(
            _region, skip_header_lines=1)
        | 'Mapping region data to Json' >> beam.ParDo(
            Split(columns=_region_columns))
        | 'Mapping region values' >> beam.Map(
            lambda element: (element['r_regionkey'], element['r_name'])))

    _nation = f"{known_args.input}dss_nation.csv"
    _nation_columns = ['nationkey', 'nation_name', 'regionkey', 'n_comment']

    logging.info('Reading nation data..')
    pnation = (
        p
        | 'Reading nation data' >> beam.io.ReadFromText(
            _nation, skip_header_lines=1)
        | 'Mapping nation data to Json' >> beam.ParDo(
            Split(columns=_nation_columns))
        # | 'Getting Region Name' >> ApplyMap('region', 'n_regionkey', pregion)
        # | 'Mapping nation values' >> beam.Map(lambda element: {
        #     'nationkey': element['n_nationkey'],
        #     'nation': element['n_name'],
        #     'region': element['region'],
        # })
    )

    #########################################################
    #
    # CUSTOMER DATA
    #########################################################
    _customer = f"{known_args.input}dss_customer.csv"
    _customer_columns = [
        'custkey', 'customer_name', 'customer_addres', 'nationkey', 'phone',
        'acctbal', 'mktsegment', 'comment'
    ]

    logging.info('Reading customer data..')
    pcustomers = (
        p
        | 'Reading customer data' >> beam.io.ReadFromText(
            _customer, skip_header_lines=1)
        | 'Reshuffling customer data to be parallel' >> beam.Reshuffle()
        | 'Mapping customer data to Json' >> beam.ParDo(
            Split(columns=_customer_columns)))

    ## Enrich Customer Data
    logging.info('Enrich Customer Data..')
    pipeline_dict = {'customer': pcustomers, 'nation': pnation}
    pcustomer_nation = (
        pipeline_dict
        | 'Join Customer with Nations' >> LeftJoin(
            'customer', pcustomers, 'nation', pnation, 'nationkey')
        | 'Getting Region Name to Customer' >> ApplyMap(
            'customer_region', 'regionkey', pregion)
        | 'Mapping customers values' >> beam.Map(
            lambda element: {
                'custkey': element['custkey'],
                'customer_name': element['customer_name'],
                'customer_addres': element['customer_addres'],
                'mktsegment': element['mktsegment'],
                'customer_nation': element['nation_name'],
                'customer_region': element['customer_region']
            }))

    #########################################################
    #
    # ORDER DATA
    #########################################################
    logging.info('Reading order data..')
    _order_colums = [
        'orderkey', 'custkey', 'orderstatus', 'totalprice', 'orderdate',
        'orderpriority', 'clerk', 'shippriority', 'comment'
    ]
    _order = f"{known_args.input}dss_order.csv"
    porder = (
        p
        | 'Reading order data' >> beam.io.ReadFromText(
            _order, skip_header_lines=1)
        | 'Reshuffling order data to be parallel' >> beam.Reshuffle()
        | 'Mapping order data to Json' >> beam.ParDo(
            Split(columns=_order_colums))
        | 'cleaning unncessary fields from order' >> beam.Map(
            lambda element: {
                'orderkey': element['orderkey'],
                'custkey': element['custkey'],
                'orderstatus': element['orderstatus'],
                'totalprice': element['totalprice'],
                'orderdate': datetime.strptime(
                    element['orderdate'], '"%Y-%m-%d"').strftime('%Y-%m-%d'),
                'orderpriority': element['orderpriority'],
                'shippriority': element['shippriority']
            }))

    logging.info('Join order data with customer data')
    pipeline_dict = {'orders': porder, 'customers': pcustomer_nation}
    porder_customer = (
        pipeline_dict
        | 'Join Order with Customer' >> LeftJoin(
            'orders', porder, 'customers', pcustomer_nation, 'custkey'))

    #########################################################
    #
    # ITEMS DATA
    # NESTED FIELDS
    #########################################################

    #########################################################
    #
    # SUPPLIER DATA
    #########################################################
    logging.info('Reading Supplier data..')
    _supplier_colums = [
        'suppkey', 'supplier_name', 'supplier_address', 'nationkey', 'phone',
        'acctbal', 's_comment'
    ]
    _supplier = f"{known_args.input}dss_supplier.csv"
    psupplier = (
        p
        | 'Reading supplier data' >> beam.io.ReadFromText(
            _supplier, skip_header_lines=1)
        | 'Mapping supplier data to Json' >> beam.ParDo(
            Split(columns=_supplier_colums)))

    ## Enrich Supplier Data
    logging.info('Enrich Supplier Data..')
    pipeline_dict = {'supplier': psupplier, 'supplier_nation': pnation}
    psupplier_nation = (
        pipeline_dict
        | 'Join Supplier with Nations' >> LeftJoin(
            'supplier', psupplier, 'supplier_nation', pnation, 'nationkey')
        | 'Getting Region Name to Supplier' >> ApplyMap(
            'supplier_region', 'regionkey', pregion)
        | 'Mapping supplier fields' >> beam.Map(
            lambda element: {
                'suppkey': element['suppkey'],
                'supplier_name': element['supplier_name'],
                'supplier_address': element['supplier_address'],
                'supplier_nation': element['nation_name'],
                'supplier_region': element['supplier_region']
            }))

    #########################################################
    #
    # PRODUCT DATA
    #########################################################
    logging.info('Reading Product data..')
    _product_colums = [
        'partkey', 'product_name', 'product_manufacture', 'product_brand',
        'product_type', 'product_size', 'product_container', 'retailprice',
        'product_comment'
    ]
    _product = f"{known_args.input}dss_part.csv"
    pproduct = (
        p
        | 'Reading product data' >> beam.io.ReadFromText(
            _product, skip_header_lines=1)
        | 'Mapping product data to Json' >> beam.ParDo(
            Split(columns=_product_colums))
        | 'Product mapping values' >> beam.Map(
            lambda element: {
                'partkey': element['partkey'],
                'product_name': element['product_name'],
                'product_manufacture': element['product_manufacture'],
                'product_brand': element['product_brand'],
                'product_type': element['product_type'],
                'product_size': element['product_size'],
                'product_container': element['product_container'],
                'retailprice': element['retailprice']
            }))

    #########################################################
    #
    # PRODUCT AVAILABILITY BY SUPPLIER
    #########################################################
    logging.info('Reading Product Availability data..')
    _psupp_colums = [
        'partkey', 'suppkey', 'availqty', 'supplycost', 'ps_comment'
    ]
    _psupp = f"{known_args.input}dss_partsupp.csv"
    ppsupp = (
        p
        | 'Reading product Availability data' >> beam.io.ReadFromText(
            _psupp, skip_header_lines=1)
        | 'Mapping product Availability data to Json' >> beam.ParDo(
            Split(columns=_psupp_colums))
        | 'Creating Complex Key for Product and Supplier' >> beam.Map(
            lambda element: {
                'ckey': "{}|{}".format(element['partkey'], element['suppkey']),
                'availqty': element['availqty'],
                'supplycost': element['supplycost']
            }))

    #########################################################
    #
    # ITEMS DATA
    #########################################################
    logging.info('Reading items data..')
    _items_colums = [
        'orderkey', 'partkey', 'suppkey', 'l_linenumber', 'l_quantity',
        'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag',
        'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate',
        'l_shipinstruct', 'l_shipmode', 'l_comment'
    ]
    _items = f"{known_args.input}dss_lineitem.csv"
    pitems = (
        p
        | 'Reading items data' >> beam.io.ReadFromText(
            _items, skip_header_lines=1)
        | 'Reshuffling items data to be parallel' >> beam.Reshuffle()
        | 'Mapping items data to Json' >> beam.ParDo(
            Split(columns=_items_colums))
        | 'Mapping items fields' >> beam.Map(
            lambda element: {
                'ckey': "{}|{}".format(element['partkey'], element['suppkey']),
                'orderkey': element['orderkey'],
                'partkey': element['partkey'],
                'suppkey': element['suppkey'],
                'linenumber': element['l_linenumber'],
                'quantity': element['l_quantity'],
                'extendedprice': element['l_extendedprice'],
                'discount': element['l_discount'],
                'tax': element['l_tax'],
                'returnflag': element['l_returnflag'],
                'linestatus': element['l_linestatus'],
                'shipdate': datetime.strptime(
                    element['l_shipdate'], '"%Y-%m-%d"').strftime('%Y-%m-%d'),
                'commitdate': datetime.strptime(
                    element['l_commitdate'], '"%Y-%m-%d"').strftime('%Y-%m-%d'),
                'receiptdate': datetime.strptime(
                    element['l_receiptdate'], '"%Y-%m-%d"').strftime('%Y-%m-%d'),
                'delay': (datetime.strptime(element['l_commitdate'], '"%Y-%m-%d"') -
                          datetime.strptime(element['l_receiptdate'], '"%Y-%m-%d"')).days,
                'shipinstruct': element['l_shipinstruct'],
                'shipmode': element['l_shipmode']
            }))

    ## Enrich Items Data
    logging.info('Enrich Items Data..')
    pipeline_dict = {'item': pitems, 'product': pproduct}
    pitems_product = (
        pipeline_dict
        | 'Join Items with Product' >> LeftJoin(
            'item', pitems, 'product', pproduct, 'partkey'))

    ## Enrich Items Data
    logging.info('Enrich Items Data..')
    pipeline_dict = {'item': pitems_product, 'supplier': psupplier_nation}
    pitems_supp = (
        pipeline_dict
        | 'Join items with Supplier' >> LeftJoin(
            'item', pitems_product, 'supplier', psupplier_nation, 'suppkey'))

    # Enrich Items Data
    logging.info('Enrich Items Data..')
    pipeline_dict = {'item': pitems_supp, 'avail': ppsupp}
    pitems_availability = (
        pipeline_dict
        | 'Join items with avail' >> LeftJoin(
            'item', pitems_supp, 'avail', ppsupp, 'ckey'))

    #########################################################
    #
    # ADD ITEMS TO ORDERS AND WRITE
    #########################################################
    pipeline_dict = {'orders': porder_customer, 'items': pitems_availability}
    results = (
        pipeline_dict
        | 'Join Order with Items' >> JoinNested(
            'orders', porder_customer, 'items', pitems_availability,
            'orderkey')
        # | 'Writing to BQ' >> beam.io.WriteToText(known_args.output)
        | 'Writing to BQ' >> beam.io.WriteToBigQuery(
            known_args.output,
            'santodigital',
            'perfect-order-api',
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    p.run().wait_until_finish()
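LeftJoin and ApplyMap in the pipeline above are project-specific composite transforms rather than Beam built-ins; a minimal, self-contained sketch of the kind of CoGroupByKey-based left join such a transform typically wraps (all names and sample records are illustrative):

# Illustrative sketch of a CoGroupByKey-based left join, runnable with the
# DirectRunner. Every left record is kept; the first matching right record,
# if any, is merged in.
import apache_beam as beam


def _left_join(joined):
    _, grouped = joined
    rights = list(grouped['right']) or [{}]
    for left in grouped['left']:
        yield {**left, **rights[0]}


with beam.Pipeline() as p:
    orders = p | 'orders' >> beam.Create([
        {'custkey': '1', 'orderkey': 'A'},
        {'custkey': '2', 'orderkey': 'B'},
    ])
    customers = p | 'customers' >> beam.Create([
        {'custkey': '1', 'customer_name': 'Alice'},
    ])

    _ = (
        {
            'left': orders | 'key orders' >> beam.Map(lambda e: (e['custkey'], e)),
            'right': customers | 'key customers' >> beam.Map(lambda e: (e['custkey'], e)),
        }
        | beam.CoGroupByKey()
        | beam.FlatMap(_left_join)
        | beam.Map(print))  # order B passes through without customer fields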
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=True,
        help='Input file to read. This can be a local file or '
        'a file in a Google Storage Bucket.')
    # This defaults to the lake dataset in your BigQuery project. You'll have
    # to create the lake dataset yourself using this command:
    # bq mk lake
    parser.add_argument('--output', dest='output', required=True,
                        help='Output BQ table to write results to.')
    parser.add_argument('--temp_bucket', dest='temp_bucket', required=True,
                        help='temp bucket name.')
    parser.add_argument('--credential', dest='credential', required=True,
                        help='credential json key.')
    parser.add_argument('--schema', dest='schema_string', required=True,
                        help='data schema json format.')
    parser.add_argument('--skip_json_lines', dest='skip_json_lines', type=int,
                        required=False, help='skip csv lines.', default=0)

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args += [
        "--runner=DataflowRunner",
        "--save_main_session",
        # "--staging_location=gs://%s/staging" % (known_args.temp_bucket),
        "--temp_location=gs://%s/temp" % (known_args.temp_bucket)
    ]
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = known_args.credential

    #schema_string='{"fields":[{"name":"usage","type":"record","fields":[{"name":"cpu","type":"STRING"},{"name":"mem","type":"STRING"}]},{"name":"_proc_PID_io","type":"record","fields":[{"name":"syscw","type":"INTEGER","mode":"repeated"},{"name":"cancelled_write_bytes","type":"INTEGER","mode":"repeated"},{"name":"wchar","type":"INTEGER","mode":"repeated"},{"name":"syscr","type":"INTEGER","mode":"repeated"},{"name":"read_bytes","type":"INTEGER","mode":"repeated"},{"name":"rchar","type":"INTEGER","mode":"repeated"},{"name":"write_bytes","type":"INTEGER","mode":"repeated"}]},{"name":"_proc_PID_stat","type":"record","fields":[{"name":"ds_agent","type":"STRING","mode":"repeated"}]},{"name":"_proc_PID_status","type":"record","fields":[{"name":"ShdPnd","type":"INTEGER","mode":"repeated"},{"name":"CapInh","type":"INTEGER","mode":"repeated"},{"name":"Cpus_allowed_list","type":"STRING","mode":"repeated"},{"name":"SigBlk","type":"INTEGER","mode":"repeated"},{"name":"State","type":"STRING","mode":"repeated"},{"name":"TracerPid","type":"INTEGER","mode":"repeated"},{"name":"FDSize","type":"INTEGER","mode":"repeated"},{"name":"VmRSS","type":"INTEGER","mode":"repeated"},{"name":"Gid","type":"INTEGER","mode":"repeated"},{"name":"CapBnd","type":"STRING","mode":"repeated"},{"name":"Utrace","type":"INTEGER","mode":"repeated"},{"name":"VmExe","type":"INTEGER","mode":"repeated"},{"name":"Pid","type":"INTEGER","mode":"repeated"},{"name":"SigIgn","type":"INTEGER","mode":"repeated"},{"name":"Groups","type":"INTEGER","mode":"repeated"},{"name":"Name","type":"STRING","mode":"repeated"},{"name":"Uid","type":"INTEGER","mode":"repeated"},{"name":"VmSwap","type":"INTEGER","mode":"repeated"},{"name":"SigCgt","type":"STRING","mode":"repeated"},{"name":"VmStk","type":"INTEGER","mode":"repeated"},{"name":"VmPeak","type":"INTEGER","mode":"repeated"},{"name":"VmData","type":"INTEGER","mode":"repeated"},{"name":"nonvoluntary_ctxt_switches","type":"INTEGER","mode":"repeated"},{"name":"voluntary_ctxt_switches","type":"INTEGER","mode":"repeated"},{"name":"Mems_allowed_list","type":"STRING","mode":"repeated"},{"name":"Mems_allowed","type":"STRING","mode":"repeated"},{"name":"SigQ","type":"STRING","mode":"repeated"},{"name":"Tgid","type":"INTEGER","mode":"repeated"},{"name":"Cpus_allowed","type":"STRING","mode":"repeated"},{"name":"CapEff","type":"STRING","mode":"repeated"},{"name":"VmLck","type":"INTEGER","mode":"repeated"},{"name":"VmPTE","type":"INTEGER","mode":"repeated"},{"name":"VmSize","type":"INTEGER","mode":"repeated"},{"name":"CapPrm","type":"STRING","mode":"repeated"},{"name":"PPid","type":"INTEGER","mode":"repeated"},{"name":"SigPnd","type":"INTEGER","mode":"repeated"},{"name":"Threads","type":"INTEGER","mode":"repeated"},{"name":"VmHWM","type":"INTEGER","mode":"repeated"},{"name":"VmLib","type":"INTEGER","mode":"repeated"}]}]}'
    #schema = parse_table_schema_from_json(known_args.schema_string)
    schema = parse_table_schema_from_json('{"fields":%s}' % (known_args.schema_string))

    # DataIngestion is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_ingestion = DataIngestion()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line. This includes information such as the project ID and
    # where Dataflow should store temp files.
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    (p
     | 'Read from a File' >> beam.io.ReadFromText(
         known_args.input, skip_header_lines=known_args.skip_json_lines)
     # This stage of the pipeline translates from a CSV file single row
     # input as a string, to a dictionary object consumable by BigQuery.
     # It refers to a function we have written. This function will
     # be run in parallel on different workers using input from the
     # previous stage of the pipeline.
     | 'String To BigQuery Row' >> beam.Map(lambda s: data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             known_args.output,
             # Here we use the schema parsed from the JSON string passed on
             # the command line.
             schema=schema,
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             # Appends to the BigQuery table if it already contains data.
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))

    p.run().wait_until_finish()
def run_bq_pipeline(argv=None): """Run the sample BigQuery pipeline. Args: argv: Arguments to the run function. """ parser = argparse.ArgumentParser() parser.add_argument('--query', required=True, help='Query to process for the table.') parser.add_argument('--output', required=True, help='Output BQ table to write results to.') parser.add_argument('--output_schema', dest='output_schema', required=True, help='Schema for output BQ table.') parser.add_argument('--use_standard_sql', action='store_true', dest='use_standard_sql', help='Output BQ table to write results to.') parser.add_argument('--kms_key', default=None, help='Use this Cloud KMS key with BigQuery.') parser.add_argument('--native', default=False, action='store_true', help='Use NativeSources and Sinks.') parser.add_argument('--use_json_exports', default=False, action='store_true', help='Use JSON as the file format for exports.') known_args, pipeline_args = parser.parse_known_args(argv) table_schema = parse_table_schema_from_json(known_args.output_schema) kms_key = known_args.kms_key options = PipelineOptions(pipeline_args) p = TestPipeline(options=options) # Note to future modifiers: Keep using BigQuerySource if known_args.native is # True. if known_args.native: data = p | 'read' >> beam.io.Read( beam.io.BigQuerySource( query=known_args.query, use_standard_sql=known_args.use_standard_sql, kms_key=kms_key)) else: data = p | 'read' >> beam.io.gcp.bigquery.ReadFromBigQuery( query=known_args.query, project=options.view_as(GoogleCloudOptions).project, use_standard_sql=known_args.use_standard_sql, use_json_exports=known_args.use_json_exports, kms_key=kms_key) temp_file_format = ('NEWLINE_DELIMITED_JSON' if known_args.use_json_exports else 'AVRO') _ = data | 'write' >> beam.io.WriteToBigQuery( known_args.output, schema=table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY, temp_file_format=temp_file_format, kms_key=kms_key) result = p.run() result.wait_until_finish()
"CORE_LB_ZoneEntityFR": "STRING", "CORE_LB_ZoneEntityStatus": "STRING", "CORE_LB_Order": "INTEGER", "CORE_ID_ZoneSourceId": "STRING", "CORE_DT_LastMod": "DATETIME", "CORE_DT_RecordCreationDate": "DATETIME", "CORE_DT_RecordModificationDate": "DATETIME", "CORE_FL_IsDeleted": "INTEGER", "CORE_FL_Latitude": "FLOAT", "CORE_FL_Longitude": "FLOAT" } mapping_list = [{"name": k, "type": mapping[k]} for k in mapping.keys()] return json.JSONEncoder(sort_keys=True).encode({"fields": mapping_list}) table_schema = parse_table_schema_from_json(make_sink_schema()) source = BigQuerySource( query= "SELECT ROW_NUMBER() over(order by Code) as CORE_Id_ZoneId, Parent as CORE_LB_ZoneParentCode, Level as CORE_LB_Level, Code as CORE_LB_ZoneCode, EntityEN as CORE_LB_ZoneEntityEN, EntityFR as CORE_LB_ZoneEntityFR, EntityStatus as CORE_LB_ZoneEntityStatus, `Order` as CORE_LB_Order, Id as CORE_ID_ZoneSourceId, LastMod as CORE_DT_LastMod, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted, geo.Latitude as CORE_FL_Latitude, geo.Longitude as CORE_FL_Longitude FROM `studied-client-307710.SMT_STG.SecondAxis` ax left outer join `studied-client-307710.SMT_STG.Geographic_Coordinates` geo on geo.SubZone = ax.EntityEN", use_standard_sql=True) # you can also use SQL queries #source = BigQuerySource(source_table_spec) target = BigQuerySink(sink_table_spec, schema=table_schema) #target = beam.io.WriteToText("output.txt") def run(argv=None): parser = argparse.ArgumentParser() known_args, pipeline_args = parser.parse_known_args(argv) with beam.Pipeline(argv=pipeline_args) as p: raw_values = (
def parse_table_schema_from_json(schema_string):
  import warnings
  warnings.warn("This function is deprecated and will be permanently moved "
                "to the bigquery_tools module in a future version of beam")
  return bigquery_tools.parse_table_schema_from_json(schema_string)
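A small usage sketch for a deprecation shim like the one above, showing how a caller (or test) can confirm the warning is actually emitted; the shim is redefined locally here so the example stays self-contained, and its message is illustrative:

# Self-contained illustration of catching a deprecation warning from a shim.
import warnings


def deprecated_parse(schema_string):
  warnings.warn("parse_table_schema_from_json has moved to bigquery_tools")
  return schema_string  # stand-in for the real delegation


with warnings.catch_warnings(record=True) as caught:
  warnings.simplefilter("always")
  deprecated_parse('{"fields": []}')

assert any("moved to bigquery_tools" in str(w.message) for w in caught)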
def run_pipeline(pipeline_options, known_args): p = beam.Pipeline(options=pipeline_options) lines = p | "read in tweets" >> beam.io.ReadFromPubSub( subscription=known_args.input_subscription, with_attributes=False # id_label="tweet_id" # not for direct runner ) output_tweets = ( lines | 'add window key' >> beam.WindowInto( window.FixedWindows(10)) # 10 seconds | 'batch messages' >> BatchElements(min_batch_size=2, max_batch_size=50) | 'predict sentiment' >> beam.FlatMap(lambda messages: predict_sentiment(messages))) bq_schema_tweets = json.dumps({ "fields": [{ "name": "id", "type": "STRING" }, { "name": "time_stamp", "type": "TIMESTAMP" }, { "name": "text", "type": "STRING" }, { "name": "username", "type": "STRING" }, { "name": "sentiment", "type": "INTEGER" }, { "name": "sentiment_score", "type": "FLOAT" }, { "name": "sentiment_magnitude", "type": "FLOAT" }, { "name": "language", "type": "STRING" }, { "name": "n_followers", "type": "INTEGER" }] }) output_tweets | 'write to BQ' >> beam.io.WriteToBigQuery( table=os.getenv('BQ_TABLE'), dataset=os.getenv('BQ_DATASET'), schema=parse_table_schema_from_json(bq_schema_tweets), write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, project=os.getenv('GC_PROJECT')) output_batch = ( lines | 'add window key 2' >> beam.WindowInto(window.FixedWindows( 1 * 60)) # 1 minute | 'batch messages 2' >> BatchElements(min_batch_size=10) | 'analyze in batch' >> beam.Map(lambda messages: analyze_batch(messages))) bq_schema_batches = json.dumps({ "fields": [{ "name": "time_stamp", "type": "TIMESTAMP" }, { "name": "batch_size", "type": "INTEGER" }, { "name": "top_words", "type": "STRING" }, { "name": "top_languages", "type": "STRING" }, { "name": "avg_num_words", "type": "FLOAT" }, { "name": "avg_num_characters", "type": "FLOAT" }, { "name": "avg_sentiment_score", "type": "FLOAT" }] }) output_batch | 'write to BQ 2' >> beam.io.WriteToBigQuery( table=os.getenv('BQ_TABLE_BATCH'), dataset=os.getenv('BQ_DATASET'), schema=parse_table_schema_from_json(bq_schema_batches), write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, project=os.getenv('GC_PROJECT')) return p.run()
        new_x = pd.DataFrame.from_dict(element, orient="index").T.fillna(0)
        weight = self.model.predict(new_x.iloc[:, :8])[0]
        yield {
            'guid': element['guid'],
            'weight': weight,
            'time': str(element['time'])
        }


schema = parse_table_schema_from_json(
    json.dumps({
        'fields': [{
            'name': 'guid', 'type': 'STRING'
        }, {
            'name': 'weight', 'type': 'FLOAT64'
        }, {
            'name': 'time', 'type': 'STRING'
        }]
    }))


class CreateEntityDoFn(beam.DoFn):
    def process(self, element):
        key = Key(['natality-guid', element['guid']])
        entity = Entity(key)
        entity.set_properties({
            'weight': element['weight'],
            'time': element['time']
def run(argv=None):
    """Main entry point."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_a', dest='input_a', required=False,
                        help='Input file',
                        default='gs://dotz-exam/raw/price_quote.csv')
    parser.add_argument('--input_b', dest='input_b', required=False,
                        help='Input file',
                        default='gs://dotz-exam/raw/bill_of_materials.csv')
    parser.add_argument('--input_c', dest='input_c', required=False,
                        help='Input file',
                        default='gs://dotz-exam/raw/comp_boss.csv')
    parser.add_argument('--output_table', dest='output_table', required=False,
                        help='Output BQ table',
                        default='price_bill_comp')

    known_args, pipeline_args = parser.parse_known_args(argv)

    data_ingestion = DataIngestion()
    price_key = 'tube_assembly_id'
    bill_key = 'tube_assembly_id'

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(
        data_ingestion.price_bill_comp_schema)

    price_data = (
        p
        | 'price quote - Read text file' >> beam.io.ReadFromText(
            known_args.input_a, skip_header_lines=1)
        | 'price quote - Convert to dict' >> beam.ParDo(price_quote_to_dict()))
    # | 'price quote - Filter by id' >> beam.ParDo(filter_by_key(), 'tube_assembly_id', 'TA-11583'))

    bill_data = (
        p
        | 'bill - Read text file' >> beam.io.ReadFromText(
            known_args.input_b, skip_header_lines=1)
        | 'bill - Convert to dict' >> beam.ParDo(bill_to_dict()))
    # | 'bill - Filter by id' >> beam.ParDo(filter_by_key(), 'tube_assembly_id', 'TA-11583'))

    price_bill_data = (
        {'price': price_data, 'bill': bill_data}
        | 'Left join {0} and {1} on {2}'.format(
            'price', 'bill', 'tube_assembly_id') >> LeftJoin(
                price_data, bill_data, 'price', 'bill', 'tube_assembly_id'))

    comp_data1 = (
        p
        | 'comp1 - Read text file' >> beam.io.ReadFromText(
            known_args.input_c, skip_header_lines=1)
        | 'comp1 - Convert to dict' >> beam.ParDo(comp1_to_dict()))

    # comp_data2 = (p
    #               | 'comp2 - Read text file' >> beam.io.ReadFromText(known_args.input_c, skip_header_lines=1)
    #               | 'comp2 - Convert to dict' >> beam.ParDo(comp2_to_dict()))

    price_bill_comp_data1 = (
        {'price_bill': price_bill_data, 'comp1': comp_data1}
        | 'Left join {0} and {1} on {2}'.format(
            'price_bill', 'comp1', 'component_id_1') >> LeftJoin(
                price_data, bill_data, 'price_bill', 'comp1', 'component_id_1')
        | 'Comp1 - Salvar' >> beam.io.WriteToText('./tmp/', 'comp1')
        | 'Salvar no GCS' >> beam.io.WriteToText(
            file_path_prefix='gs://dotz-exam/work/',
            file_name_suffix='.json',
            append_trailing_newlines=True))

    # TODO: Join in the remaining component information
    # price_bill_comp_data2 = ({'price_bill_comp_data1': price_bill_comp_data1, 'comp2': comp_data2}
    #                          | 'Left join {0} and {1} on {2}'.format('price_bill_comp_data1', 'comp2', 'component_id_2')
    #                          >> LeftJoin(price_bill_comp_data1, comp_data2, 'price_bill_comp_data1', 'comp2', 'component_id_2')
    #                          | 'Comp2 - Salvar' >> beam.io.WriteToText('./tmp/', 'comp2')
    #                          )

    p.run().wait_until_finish()