def _create_input_data(self):
    """Runs an additional pipeline that creates test data and waits for its
    completion."""
    SCHEMA = parse_table_schema_from_json(
        '{"fields": [{"name": "data", "type": "BYTES"}]}')

    def format_record(record):
        # The Synthetic Source returns each record as a (key, value) pair,
        # so keep only the value part.
        import base64
        return {'data': base64.b64encode(record[1])}

    with TestPipeline() as p:
        (  # pylint: disable=expression-not-assigned
            p
            | 'Produce rows' >> Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Format' >> Map(format_record)
            | 'Write to BigQuery' >> WriteToBigQuery(
                dataset=self.input_dataset,
                table=self.input_table,
                schema=SCHEMA,
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=BigQueryDisposition.WRITE_EMPTY))
def main(argv):
    """Main entry point."""
    # Define and parse command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        type=str,
        default='',
        help='Path to the data file(s) containing game data.')
    parser.add_argument(
        '--output_dataset',
        type=str,
        default='',
        help='The BigQuery dataset name where to write all the data.')
    parser.add_argument(
        '--output_table_name',
        type=str,
        default='',
        help='The BigQuery table name where to write all the data.')
    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)

    # Create and run the pipeline.
    with beam.Pipeline(options=options) as p:
        (p
         | 'ReadInputText' >> beam.io.ReadFromText(args.input)
         | 'ParseGameEvent' >> ParDo(ParseEventFn())
         | 'ExtractUserScore' >> ExtractAndSumScore()
         | 'FormatUserScoreSums' >> ParDo(FormatUserScoreSumsFn())
         | 'WriteTeamScoreSums' >> WriteToBigQuery(
             args.output_table_name,
             args.output_dataset,
             options.get_all_options().get('project'),
             table_schema()))
def run(argv=None):
    # Argument parser.
    parser = argparse.ArgumentParser()

    # Pipeline options, google_cloud_options.
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read from a Pub/Sub source and a CSV source.
    p1 = (p
          | 'read from topic' >> beam.io.ReadFromPubSub(
              topic='projects/PROJECT_ID/topics/TOPIC_NAME_1')
          | 'convert to dict' >> beam.Map(lambda x: json.loads(x)))
    p2 = (p
          | 'read from csv files' >> ReadFromText(
              'gs://bucket_name/historical/files*.csv')
          | 'split' >> beam.Map(lambda x: x.split(','))
          | 'format to dict' >> beam.Map(lambda x: {"id": x[0], "name": x[1]}))

    input_rec = (p1, p2) | 'flatten' >> beam.Flatten()

    # Write the messages to BigQuery.
    output_rec = input_rec | 'write to BigQuery' >> WriteToBigQuery(
        'PROJECT_ID:DATASET_ID.TABLE_NAME',
        schema='id:INTEGER, name:STRING',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    # Argument parser.
    parser = argparse.ArgumentParser()

    # Pipeline options, google_cloud_options.
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    p1 = (p
          | 'trigger from pubsub' >> beam.io.ReadFromPubSub(
              topic='projects/PROJECT_ID/topics/TOPIC_NAME_1')
          | 'convert msg to dict' >> beam.Map(lambda x: json.loads(x))
          | 'extract filename' >> beam.Map(
              lambda x: 'gs://{}/{}'.format(x['bucket'], x['name']))
          | 'read file' >> ReadAllFromText()
          | 'split' >> beam.Map(lambda x: x.split(','))
          | 'format to dict' >> beam.Map(lambda x: {"id": x[0], "name": x[1]}))

    # Write the messages to BigQuery.
    output_rec = p1 | 'write to BigQuery' >> WriteToBigQuery(
        'PROJECT_ID:DATASET_ID.TABLE_NAME',
        schema='id:INTEGER, name:STRING',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    # Argument parser.
    parser = argparse.ArgumentParser()

    # Pipeline options, google_cloud_options.
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Create two PCollections by reading from two different Pub/Sub topics.
    p1 = p | 'read from topic 1' >> beam.io.ReadFromPubSub(
        topic='projects/PROJECT_ID/topics/TOPIC_NAME_1')
    p2 = p | 'read from topic 2' >> beam.io.ReadFromPubSub(
        topic='projects/PROJECT_ID/topics/TOPIC_NAME_2')

    # Merge the two PCollections.
    merged = (p1, p2) | 'merge sources' >> beam.Flatten()

    # Convert each message to a dict.
    rec_dict = merged | 'convert to dict' >> beam.Map(lambda x: json.loads(x))

    # Write the messages to BigQuery.
    rec_dict | 'write to BigQuery' >> WriteToBigQuery(
        'PROJECT_ID:DATASET_ID.TABLE_NAME',
        schema='id:INTEGER, name:STRING',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        dest="input",
        default="gs://airflow-training-knab-asv/land_registry_price_paid_uk/*/*.json",
        help="Input file to process.",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(
        [
            "--runner=DataflowRunner",
            "--project=gdd-990fd90d0db6efbabdc6b70f1c",
            "--staging_location=gs://airflow-training-knab-asv/dataflow-staging",
            "--temp_location=gs://airflow-training-knab-asv/dataflow-temp",
            "--job_name=gcs-gzcomp-to-bq1",
        ]
    )
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        (
            p
            | "ReadFromGCS" >> ReadFromText(known_args.input, coder=JsonCoder())
            | WriteToBigQuery(
                "result_table",
                dataset="result_dataset",
                project="gdd-990fd90d0db6efbabdc6b70f1c",
                schema="city:string, county:string, district:string, "
                       "duration:string, locality:string, newly_built:boolean, "
                       "paon:string, postcode:string, ppd_category_type:string, "
                       "price:numeric, property_type:string, record_status:string, "
                       "saon:string, street:string, transaction:string, "
                       "transfer_date:numeric",
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            )
        )
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('input_topic', type=str,
                        help="Input Pub/Sub topic name.")
    parser.add_argument(
        'output_table',
        type=str,
        help="Output BigQuery table name. Example: project.db.name")
    parser.add_argument('--model_project', type=str,
                        help="Google Project ID with model.")
    parser.add_argument('--model_name', type=str,
                        help="Name of the Google AI Platform model.")
    parser.add_argument('--model_region', type=str,
                        help="AI Platform region name.")
    parser.add_argument('--model_version', type=str,
                        help="AI Platform model version.")
    known_args, pipeline_args = parser.parse_known_args(argv)

    _topic_comp = known_args.input_topic.split('/')
    if (len(_topic_comp) != 4 or _topic_comp[0] != 'projects'
            or _topic_comp[2] != 'topics'):
        raise ValueError("Topic name has an inappropriate format.")
    if len(known_args.output_table.split('.')) != 2:
        raise ValueError("Table name has an inappropriate format.")

    inf_args = [
        known_args.model_project, known_args.model_name,
        known_args.model_region, known_args.model_version
    ]
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = Pipeline(options=options)
    _ = (p
         | 'read from pub/sub' >> ReadFromPubSub(
             known_args.input_topic).with_output_types(bytes)
         | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
         | 'convert to dict' >> Map(json.loads)
         | 'pre processing' >> PreProcessing()
         | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
         | 'format message' >> Map(formatter)
         | 'write to BQ' >> WriteToBigQuery(
             table=known_args.output_table,
             schema=build_bq_schema(),
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_APPEND))

    if os.environ.get('DEPLOY'):
        # Use p.run() instead of the `with Pipeline() as p` context manager
        # because the process needs to exit after submitting the job.
        p.run()
    else:
        p.run().wait_until_finish()
def run(argv=None): """ Main entry point, define and run the pipeline """ parser = argparse.ArgumentParser( description='Run Apache Beam to process the logs') parser.add_argument('--input', dest='input', help='Input file to process') parser.add_argument('--output', dest='output', help='Output file to write results to') parser.add_argument( '--input_subscription', dest='input_subscription', help=('Input PubSub subscription of the form ' '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."')) parser.add_argument( '--output_table', dest='output_table', help=('BigQuery Table to write results to, with the form ' '<PROJECT>:<DATASET>.<TABLE>')) known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as(StandardOptions).streaming = True print('pipeline options:', pipeline_options) # Specification for table in BigQuery table_spec = args.output_table table_schema = 'host:STRING, utc_timestamp:TIMESTAMP, action:STRING, uri:STRING, protocol:STRING, status:STRING, size:INTEGER' with beam.Pipeline(options=pipeline_options) as p: # Read the text file[pattern] into a PCollection. if known_args.input_subscription: lines = (p | ReadFromPubSub(subscription=known_args.input_subscription ).with_output_types(bytes)) else: lines = (p | ReadFromText(known_args.input, coder=coders.BytesCoder())) output = (lines | 'parse_filter' >> beam.ParDo(ParseAndFilterDoFn())) # | 'parse' >> (beam.Map(parse_one_record))) # output | WriteToText(known_args.output) output | WriteToBigQuery( table_spec, schema=table_schema, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
def run(argv=None): """Pipeline for reading data from a PubSub topic, redacting the data using Cloud DLP and writing the results to BigQuery""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', help='PubSub topic to read from.') parser.add_argument( '--output', dest='output', help= 'BigQuery output dataset and table name in the format dataset.tablename' ) known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: if 'streaming' in p.options.display_data(): # Read in the CSV file lines = (p | 'ReadFromPubSub' >> ReadFromPubSub( topic=known_args.input).with_output_types(bytes) | 'DecodeMessage' >> beam.Map(lambda x: x.decode('utf-8')) | 'ParseMessage' >> beam.ParDo(ParsePubSubMessageFn())) else: # Read in the CSV file lines = (p | 'ReadFromGCS' >> ReadFromText(known_args.input) | 'ParseFileFn' >> beam.ParDo(ParseFileFn())) # Redact PII from the 'text' column. redacted_rows = ( lines | 'IdentifyAndRedactText' >> IdentifyAndRedactText( p.options.display_data()['project'], ['ALL_BASIC'])) # Format rows and write to BigQuery. (redacted_rows | 'MapToTableRows' >> beam.Map(lambda row: { 'id': row['id'], 'text': row['text'] }) | 'WriteToBigQuery' >> WriteToBigQuery( known_args.output, schema='id:INTEGER, text:STRING', project=p.options.display_data()['project'], create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        dest="input",
        help="Input file to process.",
    )
    parser.add_argument(
        "--table",
        dest="table",
        help="Destination BigQuery table",
    )
    parser.add_argument(
        "--dataset",
        dest="dataset",
        help="Destination BigQuery dataset",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(
        [
            "--staging_location=gs://airflow-emgsilva/dataflow-staging",
            "--temp_location=gs://airflow-emgsilva/dataflow-temp",
        ]
    )
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        (
            p
            | "ReadFromGCS" >> ReadFromText(known_args.input, coder=JsonCoder())
            | WriteToBigQuery(
                known_args.table,
                dataset=known_args.dataset,
                schema="city:string, "
                       "county:string, "
                       "district:string, "
                       "duration:string, "
                       "locality:string, "
                       "newly_built:boolean, "
                       "paon:string, "
                       "postcode:string, "
                       "ppd_category_type:string, "
                       "price:numeric, "
                       "property_type:string, "
                       "record_status:string, "
                       "saon:string, "
                       "street:string, "
                       "transaction:string, "
                       "transfer_date:numeric",
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED)
        )
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--schema_registry',
        dest='schema_registry',
        default='http://127.0.0.1:8081',
        help='Schema registry endpoint. Defaults to local endpoint.')
    parser.add_argument('--failed-bq-inserts',
                        dest='failed_bq_inserts',
                        required=True,
                        help='Bucket for writing failed inserts')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--job_name=dbz-test-example',
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    pipeline_options.view_as(StandardOptions).streaming = True

    project_id = 'crafty-apex-264713'
    kafka_topic = 'dbserver1.inventory.customers'
    pubsub_topic = f'projects/{project_id}/topics/{kafka_topic}'

    with beam.Pipeline(options=pipeline_options) as p:
        bq = (
            p
            | 'Read from PubSub' >> ReadFromPubSub(topic=pubsub_topic)
            | '2 Second Window' >> beam.WindowInto(window.FixedWindows(2))
            | 'Avro to Row' >> beam.FlatMap(
                avro_to_row(known_args.schema_registry))
            # | 'Write to File' >> beam.io.WriteToText('args.output')
            | 'Write to BigQuery' >> WriteToBigQuery(
                'crafty-apex-264713:inventory.customers',
                schema='id:INT64,'
                       'first_name:STRING,'
                       'last_name:STRING,'
                       'email:STRING,'
                       '__op:STRING,'
                       '__source_ts_ms:INT64,'
                       '__lsn:INT64',
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=BigQueryDisposition.WRITE_APPEND))
        # Can't get this to run in Dataflow - it causes a job graph that is
        # not updatable. In the direct runner I can't get it to emit any errors.
def main(argv):
    """Main entry point."""
    # Define and parse command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        type=str,
        default='',
        help='Path to the data file(s) containing game data '
             '(use either this parameter or --topic but not both).')
    parser.add_argument(
        '--topic',
        type=str,
        default='',
        help='Topic to subscribe to '
             '(use either this parameter or --input but not both).')
    parser.add_argument(
        '--output_dataset',
        type=str,
        default='',
        help='The BigQuery dataset name where to write all the data.')
    parser.add_argument(
        '--output_table_name',
        type=str,
        default='',
        help='The BigQuery table name where to write all the data.')
    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        (p
         | 'ReadGameEvents' >> ReadGameEvents(args)
         | 'WindowedTeamScore' >> WindowedTeamScore(30)
         | 'FormatTeamScoreSums' >> ParDo(
             FormatTeamScoreSumsFn(
                 (args.topic is not None) and (args.topic != "")))
         | 'WriteTeamScoreSums' >> WriteToBigQuery(
             args.output_table_name,
             args.output_dataset,
             options.get_all_options().get("project"),
             table_schema(),
             BigQueryDisposition.CREATE_IF_NEEDED,
             BigQueryDisposition.WRITE_APPEND))
def run():
    transform_events_options = PipelineOptions().view_as(TransformEventsOptions)
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    p = beam.Pipeline(options=pipeline_options)

    incoming_topic = str(transform_events_options.incoming_topic)
    logging.info("Incoming topic for events = {}".format(incoming_topic))
    output_table = str(transform_events_options.output_table)
    logging.info("Output table for transformed events: {}".format(output_table))

    _ = (p
         | 'Read events from PubSub' >> ReadStringsFromPubSub(incoming_topic)
         | 'Transform PubSub events' >> beam.ParDo(TransformEvents())
         | 'Write to BigQuery' >> WriteToBigQuery(table=output_table))

    p.run()
def get_replacement_transform(self, ptransform):
    # Imported here to avoid circular dependencies.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam import io

    class WriteToBigQuery(io.WriteToBigQuery):
        override = True

        def __init__(self, transform, outputs):
            self.transform = transform
            self.outputs = outputs

        def __getattr__(self, name):
            """Returns the given attribute from the parent.

            This allows this transform to act like a WriteToBigQuery transform
            without having to construct a new WriteToBigQuery transform.
            """
            return self.transform.__getattribute__(name)

        def expand(self, pcoll):
            from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json
            import json

            schema = None
            if self.schema:
                schema = parse_table_schema_from_json(json.dumps(self.schema))

            out = pcoll | io.Write(
                io.BigQuerySink(
                    self.table_reference.tableId,
                    self.table_reference.datasetId,
                    self.table_reference.projectId,
                    schema,
                    self.create_disposition,
                    self.write_disposition,
                    kms_key=self.kms_key))

            # The WriteToBigQuery can have different outputs depending on
            # whether it is Batch or Streaming. This retrieves the output keys
            # from the node and replaces them here to be consistent.
            return {key: out for key in self.outputs}

    return WriteToBigQuery(ptransform, self.outputs)
def run(argv=None): """Pipeline for reading data from a PubSub topic or a Cloud Storage bucket, redacting the data using Cloud DLP and writing the results to BigQuery""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', help='PubSub topic to read from.') parser.add_argument('--output', dest='output', help='BigQuery output dataset and table name in the format dataset.tablename') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: lines = (p # 1. Read in the file from PubSub. | 'ReadFromPubSub' >> ReadFromPubSub() # 2. Process the JSON message from PubSub | 'ParseMessage' ) average = (lines | 'ApplyWindow' ) # 3. For each Key, sum up the values # 4. Format the as Python dictionaries for writing to BigQuery (lines # 4. Format the as Python dictionaries for writing to BigQuery | 'ConvertToDictionary' # 5. Write the output to BigQuery | 'WriteToBigQuery' >> WriteToBigQuery( known_args.output, schema='id:INTEGER, total:INTEGER', create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND ))
def run(argv=None): """Pipeline for reading data from a PubSub topic or a Cloud Storage bucket, redacting the data using Cloud DLP and writing the results to BigQuery""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', help='PubSub topic to read from.') parser.add_argument( '--output', dest='output', help= 'BigQuery output dataset and table name in the format dataset.tablename' ) known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: # Read in the CSV file lines = (p | 'ReadFromPubSub' >> ReadFromPubSub( topic=known_args.input).with_output_types(bytes) | 'DecodeMessage' >> beam.Map(lambda x: x.decode('utf-8')) | 'ParseMessage' >> beam.ParDo(ParsePubSubMessageFn())) windows = (lines | 'WindowInto' >> beam.WindowInto(FixedWindows(30, 0)) | 'SumValues' >> beam.CombinePerKey(sum)) # Format rows and write to BigQuery. (windows | 'ConvertToDictionary' >> beam.Map(lambda row: { 'id': row[0], 'total': row[1] }) | 'WriteToBigQuery' >> WriteToBigQuery( known_args.output, schema='id:INTEGER, total:INTEGER', create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def test(self):
    SCHEMA = parse_table_schema_from_json(
        '{"fields": [{"name": "data", "type": "BYTES"}]}')

    def format_record(record):
        # The Synthetic Source returns each record as a (key, value) pair,
        # so keep only the value part.
        return {'data': base64.b64encode(record[1])}

    # pylint: disable=expression-not-assigned
    (self.pipeline
     | 'ProduceRows' >> Read(SyntheticSource(self.parseTestPipelineOptions()))
     | 'Format' >> Map(format_record)
     | 'WriteToBigQuery' >> WriteToBigQuery(
         self.output_dataset + '.' + self.output_table,
         schema=SCHEMA,
         create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=BigQueryDisposition.WRITE_EMPTY))
def run(argv=None, save_main_session=True):
    pipeline_args = []
    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectRunner',
        # '--runner=DataflowRunner',
        # CHANGE 3/5: Your project ID is required in order to run your
        # pipeline on the Google Cloud Dataflow Service.
        '--project=dragon-test-270305',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging
        # local files.
        '--staging_location=gs://duysdf/',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://duysdf/temp',
        '--job_name=backend_log_dataflow_to_bigquery',
        '--streaming',
        '--region=asia-southeast1',
        '--max_num_workers=1',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:
        elements = (p | "CreatePCollectionData" >> beam.Create([
            json.dumps({'_Dest': 'T3', 'X': 'AAA', 'Y': 123}),
            json.dumps({'_Dest': 'T4', 'X': 'BBB', 'Y': 456}),
        ]))

        # Output multiple PCollections; yield TaggedOutput('T3', element)
        # goes to the T3 PCollection.
        processed_tagged_log = elements | "multiplex-pcoll" >> beam.ParDo(
            ParseBackendLog()).with_outputs(*g_backend_tables)

        for key in g_backend_tables:
            processed_tagged_log[key] | "WriteBQ_%s" % key >> WriteToBigQuery(
                table=key,
                dataset=g_dataset,
                project=g_project_id,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default= '/home/shravan/Desktop/gcp_files/2020-10-02-11-34-19-EA6C5E314B70B157', help='Input file to process.') parser.add_argument('--output', dest='output', required=False, default='output', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as( SetupOptions).save_main_session = save_main_session with beam.Pipeline(options=pipeline_options) as p: #obj = Utility() table_spec = bigquery.TableReference(projectId='justlikethat-294122', datasetId='log_analysis', tableId='quotes') table_schema = 'source:STRING, quote:STRING' data_ingestion = dataingestion() (p | 'Read from a File' >> beam.io.ReadFromText(known_args.input) | 'String To BigQuery Row' >> beam.Map(lambda s: data_ingestion.parse_method(s)) | 'Write to BigQuery' >> WriteToBigQuery( table_spec, schema='source:STRING, quote:STRING', create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)) """
def main(argv): """Main entry point""" # Define and parse command line arguments parser = argparse.ArgumentParser() parser.add_argument('--input', type=str, default='', help='Path to the data file(s) containing game data.') parser.add_argument('--output_dataset', type=str, default='', help='The BigQuery dataset name where to write all the data.') parser.add_argument('--output_table_name', type=str, default='', help='The BigQuery table name where to write all the data.') args, pipeline_args = parser.parse_known_args(argv) options = PipelineOptions(pipeline_args) options.view_as(SetupOptions).save_main_session = True # Create and run the pipeline with beam.Pipeline(options=options) as p: (p | 'ReadInputText' >> beam.io.ReadFromText(args.input) | 'ParseGameEvent' >> ParDo(ParseEventFn()) | 'AddEventTimestamps' >> beam.Map(lambda element: TimestampedValue(element, element['timestamp'])) | 'WindowedTeamScore' >> WindowedTeamScore(3600000) # 1 hour = 3600 seconds = 3600000 milliseconds | 'FormatTeamScoreSums' >> ParDo(FormatTeamScoreSumsFn()) | 'WriteTeamScoreSums' >> WriteToBigQuery( args.output_table_name, args.output_dataset, options.get_all_options().get("project"), table_schema() ) )
def test(self):
    SCHEMA = parse_table_schema_from_json(
        '{"fields": [{"name": "data", "type": "BYTES"}]}')

    def format_record(record):
        # The Synthetic Source returns each record as a (key, value) pair,
        # so keep only the value part.
        return {'data': base64.b64encode(record[1])}

    (  # pylint: disable=expression-not-assigned
        self.pipeline
        | 'Produce rows' >> Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Format' >> Map(format_record)
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Write to BigQuery' >> WriteToBigQuery(
            dataset=self.output_dataset,
            table=self.output_table,
            schema=SCHEMA,
            create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE))
def run_pipeline(source, target):
    header = get_header(source)
    fields = header.split(CSV_DELIMITER)
    (bq_schema, schema) = get_schema(target)

    input_path = 'gs://dotz-hiring-datalake/raw/{}.csv'.format(source)
    output_path = 'gs://dotz-hiring-datalake/processed/{}.json/part'.format(target)

    pipeline_args = [
        '--job_name={}-{}'.format(target, str(time.time()).replace('.', '-')),
        '--input={}'.format(input_path),
        '--output={}'.format(output_path)
    ]
    pipeline_args.extend(BASE_PIPELINE_ARGS)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        lines = pipeline | ReadFromText(input_path)

        # Not the most elegant way to remove a CSV header.
        lines = lines | 'RemoveHeader' >> beam.Filter(lambda line: line != header)

        objs = lines | 'CSV2JSON' >> beam.Map(csv2json(fields))
        proc_objs = objs | 'ProcessJSONs' >> beam.Map(process(schema))
        filtered_proc_objs = proc_objs | 'FilterEmpties' >> beam.Filter(lambda x: x)

        dumped_objs = filtered_proc_objs | 'DumpJSONs' >> beam.Map(json.dumps)
        dumped_objs | WriteToText(output_path)

        filtered_proc_objs | WriteToBigQuery(
            'dotz-hiring:tubulation.{}'.format(target),
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=BigQueryDisposition.CREATE_NEVER)
def run(argv=None): """Pipeline for reading data from a Cloud Storage bucket and writing the results to BigQuery""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', help='File to read in.') parser.add_argument( '--output', dest='output', help= 'BigQuery output dataset and table name in the format dataset.tablename' ) known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: user_request_header = [ 'id', 'first_name', 'last_name', 'email', 'gender' ] input_rows = ( p | 'ReadFile' >> ReadFromText(known_args.input, skip_header_lines=1) | 'ParseFile' >> beam.ParDo(ParseFileFn(user_request_header))) (input_rows | 'WriteToBigQuery' >> WriteToBigQuery( known_args.output, schema= 'id:INTEGER, first_name:STRING, last_name:STRING, email:STRING, gender:STRING', create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) p.run().wait_until_finish()
def run(argv=None): """Main entry point. It defines and runs the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://meetup-batch-processing/input/googleplaystore.csv', help='Input file to process.') parser.add_argument( '--output', dest='output', default='gs://meetup-batch-processing/output/googleplaystore.csv', help='Output file to process.') parser.add_argument( '--table-output', dest='table_output', default= 'meetup-hands-on-gcp-2019:googleplaystore_batch_dataflow.play_store', help='Bigquery table name for output.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) with beam.Pipeline(options=pipeline_options) as pipeline: raw_lines = pipeline | 'ReadFromCsv' >> ReadFromText( known_args.input, skip_header_lines=1) lines = raw_lines | 'processCsv' >> beam.ParDo(ProcessCSV()) output = lines | 'parseRecord' >> beam.ParDo(ParseRecord()) output | 'writeBigQuery' >> WriteToBigQuery( known_args.table_output, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER) logging.info('Finished.')
def run(argv=None): """Pipeline for reading data from a Cloud Storage bucket and writing the results to BigQuery""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', help='File to read in.') parser.add_argument('--output', dest='output', help='BigQuery output dataset and table name in the format dataset.tablename') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: retail_header = ['department', 'value'] input_rows = (p | 'ReadFile' >> ReadFromText(known_args.input, skip_header_lines=1) | 'ParseFile' >> beam.ParDo(ParseFileFn(retail_header)) | 'CreateKVPairs' >> beam.Map(lambda x: (x['department'], float(x['value']))) | 'SumValues' >> beam.CombinePerKey(sum) | 'Format' >> beam.Map(lambda x: {'department': x[0], 'value': float(x[1])}) ) (input_rows | 'WriteToBigQuery' >> WriteToBigQuery( known_args.output, schema='department:STRING, value:FLOAT', create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE) ) p.run().wait_until_finish()
def run(argv=None): """Pipeline for reading data from a Cloud Storage bucket and writing the results to BigQuery""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', help='File to read in.') parser.add_argument( '--output', dest='output', help= 'BigQuery output dataset and table name in the format dataset.tablename' ) known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: # 1. Read in the file from Google Cloud Storage. Hint: remember there is a header line in the CSV. input_rows = p | 'ReadFile' >> ReadFromText() # 2. Convert the rows into Key, Value pairs. Hint: use tuples # 3. For each Key, sum up the values. Hint: CombinePerKey(sum) # 4. Format the as Python dictionaries for writing to BigQuery # 5. Write the output to BigQuery (input_rows | 'WriteToBigQuery' >> WriteToBigQuery( known_args.output, schema='department:STRING, value:FLOAT', create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)) p.run().wait_until_finish()
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--topic',
        type=str,
        default='',
        help='Topic to subscribe to '
             '(use either this parameter or --input but not both).')
    parser.add_argument(
        '--output_dataset',
        type=str,
        default='',
        help='The BigQuery dataset name where to write all the data.')
    parser.add_argument(
        '--output_table_name',
        type=str,
        default='',
        help='The BigQuery table name where to write all the data.')
    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        (p
         | 'ReadMessages' >> ReadFromPubSub(args.topic)
         | 'FormatRecord' >> beam.Map(lambda element: {"data": element})
         # | 'PrintBeforeInsert' >> beam.Map(lambda record: print(str(record)))
         | 'WriteDataElementBQ' >> WriteToBigQuery(
             args.output_table_name,
             args.output_dataset,
             options.get_all_options().get("project"),
             table_schema(),
             BigQueryDisposition.CREATE_IF_NEEDED,
             BigQueryDisposition.WRITE_APPEND))
| "Decode and format json" >> beam.Map(lambda x: json.loads(x))) order_product = messages | "Extract id" >> beam.Map(lambda x: (x.get("order_id"), x)) def group_products(order_products): order_id, products = order_products output = {"order_id": str(order_id), "product": []} logging.info("order_id: {}".format(str(order_id))) for product in products: output["device_id"] = product.pop("device_id") product.pop("order_id") output["product"] = output["product"] + [product] return output orders = (order_product | beam.WindowInto(window.Sessions(500)) | "Group by order" >> beam.GroupByKey() | "Join orders" >> beam.Map(group_products)) # output = (orders # | "Format orders" >> beam.Map(format_orders)) orders | WriteToBigQuery( args.table, args.dataset, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND) p.run()
def run(argv=None):
    import random
    import datetime

    currentDT = datetime.datetime.now()

    # Your GCP Project ID and GCS locations are passed through as part of
    # your Dataflow job command.

    # BigQuery output info.
    dataset = 'lab_dev'
    table = 'play_by_play'

    # Dataflow job name (don't edit).
    job_name = 'play-by-play-{}'.format(currentDT.strftime("%Y-%m-%d-%H-%M-%S"))

    filepath = 'gs://cloud-training-demos/ncaa/next-bootcamp/2018-19/play_by_play/*'

    pipeline_args = [
        # Change these.
        '--runner=DataflowRunner',
        '--project={}'.format(argv['project_id']),
        '--dataset={}'.format(dataset),
        '--table={}'.format(table),
        '--staging_location={}'.format(argv['staging']),
        '--temp_location={}'.format(argv['temp_location']),
        '--num_workers=5',
        '--max_num_workers=20',
        '--region={}'.format(argv['region']),
        '--job_name={}'.format(job_name)
    ]
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        files = p | ReadFromText(filepath)
        keyed = files | 'Key' >> beam.Map(lambda x: (random.randint(1, 101), x))
        grouped = keyed | 'GBK' >> beam.GroupByKey()
        flattened = grouped | 'Expand' >> beam.FlatMap(lambda x: x[1])
        to_insert = flattened | 'Format' >> beam.ParDo(Format())
        # to_insert | beam.ParDo(Check())

        # Build the BigQuery table schema from the play_by_play_schema dict.
        table_schema = bigquery.TableSchema()
        for col, col_type in play_by_play_schema.items():
            this_schema = bigquery.TableFieldSchema()
            this_schema.name = col
            this_schema.type = col_type
            this_schema.mode = 'nullable'
            table_schema.fields.append(this_schema)

        to_insert | WriteToBigQuery(
            table='{_project_}:{_dataset_}.{_table_}'.format(
                _dataset_=dataset,
                _project_=argv['project_id'],
                _table_=table),
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', required=True, help='Input file to process.') parser.add_argument('--config', dest='config', required=True, help='Configuration file with bigquery project settings and date range.') parser.add_argument('--oauth_file', dest='oauth_file', required=True, help='File to authorize process.') parser.add_argument('--schema', dest='schema', required=True, help='File with schema of table.') known_args, pipeline_args = parser.parse_known_args(argv) os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = known_args.oauth_file data = {} if os.path.isfile(known_args.schema): with open(known_args.schema) as file: data = json.load(file) else: print('Missing configuration file' + known_args.schema) return schema = data['schema'] data = {} if os.path.isfile(known_args.config): with open(known_args.config) as file: data = json.load(file) else: print('Missing configuration file' + known_args.schema) return project = data['project_id'] dataset = data['dataset_id'] table = data['table'] date_start = datetime.datetime.strptime(data['start_date'], '%Y-%m-%d').date() date_end = datetime.datetime.strptime(data['end_date'], '%Y-%m-%d').date() keys = schema.replace(':STRING','').replace(':INTEGER','').replace(':FLOAT','').split(',') pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) lines = p | 'readFromGCS' >> ReadFromText(known_args.input, skip_header_lines=1).with_output_types(unicode) while date_end - date_start >= datetime.timedelta(days=0): date = date_start.strftime('%Y-%m-%d') date_start += datetime.timedelta(days=1) output = lines | 'splitCSV_'+date >> beam.ParDo(Split(date, keys)) output | 'writeToBQ_'+date >> WriteToBigQuery(table=table+'_'+date.replace('-',''), dataset=dataset, project=project, schema=schema, create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=BigQueryDisposition.WRITE_APPEND) result = p.run() result.wait_until_finish()