def generate_cooccur_schema():
    """BigQuery schema for the word co-occurrence table."""
    json_str = json.dumps({
        'fields': [
            {'name': 'w1', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'w2', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'count', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'log_weight', 'type': 'FLOAT', 'mode': 'NULLABLE'},
            {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
        ]
    })
    return parse_table_schema_from_json(json_str)
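# A minimal sketch of the JSON-string-to-TableSchema round trip that recurs
# throughout these snippets. The table name 'my_dataset.word_cooccur' and the
# `rows` PCollection are illustrative placeholders, not from the snippet above.
import json

import apache_beam as beam
from apache_beam.io.gcp.bigquery import parse_table_schema_from_json

# parse_table_schema_from_json takes a JSON string with a top-level 'fields'
# list and returns a TableSchema object that the BigQuery sinks accept.
cooccur_schema = generate_cooccur_schema()
# Typical use with a write transform:
# rows | beam.io.WriteToBigQuery('my_dataset.word_cooccur',
#                                schema=cooccur_schema)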
def parse_method(self, string_input):
    """Translates a single line of comma separated values to a dictionary
    which can be loaded into BigQuery.

    Args:
        string_input: A comma separated string in the form name,email.

    Returns:
        A dict mapping the BigQuery column names 'name' and 'email' to the
        corresponding values parsed from string_input.
    """
    # Strip out carriage return, newline and quote characters.
    values = re.split(",", re.sub('\r\n', '', re.sub(u'"', '', string_input)))
    row = dict(zip(('name', 'email'), values))
    return row
def run_bq_pipeline(argv=None):
    """Run the sample BigQuery pipeline.

    Args:
        argv: Arguments to the run function.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--query', required=True,
                        help='Query to process for the table.')
    parser.add_argument('--output', required=True,
                        help='Output BQ table to write results to.')
    parser.add_argument('--output_schema', dest='output_schema', required=True,
                        help='Schema for output BQ table.')
    parser.add_argument('--use_standard_sql', action='store_true',
                        dest='use_standard_sql',
                        help='Use standard SQL syntax for the query.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    table_schema = parse_table_schema_from_json(known_args.output_schema)

    p = TestPipeline(options=PipelineOptions(pipeline_args))
    # pylint: disable=expression-not-assigned
    (p
     | 'read' >> beam.io.Read(beam.io.BigQuerySource(
         query=known_args.query,
         use_standard_sql=known_args.use_standard_sql))
     | 'write' >> beam.io.Write(beam.io.BigQuerySink(
         known_args.output,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

    result = p.run()
    result.wait_until_finish()
def get_bq_schema(self):
    """Parses the JSON-serialized self.schema into a TableSchema for the
    BQ API."""
    return beam_bigquery.parse_table_schema_from_json(
        json.dumps(self.schema))
def get_bq_schema(self):
    """Serializes self.schema to JSON and parses it into a TableSchema for
    the BQ API."""
    return beam_bigquery.parse_table_schema_from_json(
        json.dumps(self.schema))
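# In both get_bq_schema variants above, the helper simply round-trips
# self.schema through JSON, so self.schema must already mirror BigQuery's
# JSON schema layout. A hypothetical constructor showing that assumed shape
# (class and field names here are placeholders, not from the source):
class SchemaHolder(object):
    def __init__(self):
        # A plain dict with a 'fields' list, so json.dumps(self.schema)
        # yields exactly what parse_table_schema_from_json expects.
        self.schema = {
            'fields': [
                {'name': 'id', 'type': 'STRING', 'mode': 'REQUIRED'},
                {'name': 'amount', 'type': 'FLOAT', 'mode': 'NULLABLE'},
            ]
        }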
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        default='gs://python-dataflow-example/data_files/head_usa_names.csv')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake_gig2.usa_names_transformed')
    known_args, pipeline_args = parser.parse_known_args(argv)

    data_ingestion = DataTransformation()
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    (p
     | 'Read From Text' >> beam.io.ReadFromText(known_args.input,
                                                skip_header_lines=1)
     | 'String to BigQuery Row' >>
     beam.Map(lambda s: data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.WriteToBigQuery(
             known_args.output,
             schema=schema,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
def parse_method(self, string_input):
    """This method translates a single line of comma separated values to a
    dictionary which can be loaded into BigQuery.

    Args:
        string_input: A comma separated list of values in the form of
            state_abbreviation,gender,year,name,count_of_babies,dataset_created_date
            example string_input: KS,F,1923,Dorothy,654,11/28/2016

    Returns:
        A dict mapping BigQuery column names as keys to the corresponding
        value parsed from string_input. Values for DATE columns are rewritten
        into BigQuery's YYYY-MM-DD format; all other values remain in the same
        format as the CSV.

        example output:
        {'state': 'KS',
         'gender': 'F',
         'year': '1923-01-01',  <- This is the BigQuery date format.
         'name': 'Dorothy',
         'number': '654',
         'created_date': '11/28/2016'}
    """
    schema = bigquery.parse_table_schema_from_json(self.schema_str)
    field_map = [f for f in schema.fields]

    # Use a CSV Reader which can handle quoted strings etc.
    reader = csv.reader(string_input.split('\n'))
    for csv_row in reader:
        if sys.version_info.major < 3:
            values = [x.decode('utf8') for x in csv_row]
        else:
            values = csv_row
        # Our source data only contains year, so default January 1st as the
        # month and day.
        month = u'01'
        day = u'01'
        # The year comes from our source data.
        year = values[2]

        row = {}
        i = 0
        # Iterate over the values from our csv file, applying any
        # transformation logic.
        for value in values:
            # If the schema indicates this field is a date format, we must
            # transform the date from the source data into a format that
            # BigQuery can understand.
            if field_map[i].type == 'DATE':
                # Format the date to YYYY-MM-DD format which BigQuery accepts.
                value = u'-'.join((year, month, day))
            row[field_map[i].name] = value
            i += 1
        return row
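# A hypothetical invocation of the method above on the docstring's example
# line, assuming the instance's schema_str describes the USA-names columns:
ingestion = DataTransformation()
row = ingestion.parse_method(u'KS,F,1923,Dorothy,654,11/28/2016')
# Expected result, with the DATE column rewritten to YYYY-MM-DD:
# {'state': u'KS', 'gender': u'F', 'year': u'1923-01-01',
#  'name': u'Dorothy', 'number': u'654', 'created_date': u'11/28/2016'}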
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect. Specifically
    # we have the input file to load and the output table to write to.
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # It is useful for developing on a small set of data.
        default='gs://python-dataflow-example/data_files/head_usa_names.csv')
    # This defaults to the lake dataset in your BigQuery project. You'll have
    # to create the lake dataset yourself using: bq mk lake
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.usa_names_transformed')
    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataTransformation is a class we built in this script to hold the logic
    # for transforming the file into a BigQuery table.
    data_ingestion = DataTransformation()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line. This includes information like where Dataflow should
    # store temp files, and what the project id is.
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    (p
     # Read the file. This is the source of the pipeline. All further
     # processing starts with lines read from the file. We use the input
     # argument from the command line. We also skip the first line which is a
     # header row.
     | 'Read From Text' >> beam.io.ReadFromText(known_args.input,
                                                skip_header_lines=1)
     # This stage of the pipeline translates from a CSV file single row
     # input as a string, to a dictionary object consumable by BigQuery.
     # It refers to a function we have written. This function will
     # be run in parallel on different workers using input from the
     # previous stage of the pipeline.
     | 'String to BigQuery Row' >>
     beam.Map(lambda s: data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             known_args.output,
             # Here we use the JSON schema read in from a JSON file.
             # Specifying the schema allows the API to create the table
             # correctly if it does not yet exist.
             schema=schema,
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             # Deletes all data in the BigQuery table before writing.
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
def parse_method(self, string_input):
    """This method translates a single line of comma separated values to a
    dictionary which can be loaded into BigQuery.

    Args:
        string_input: A comma separated list of values.

    Returns:
        A dict mapping BigQuery column names as keys to the corresponding
        value parsed from string_input. The data is not transformed and
        remains in the same format as the CSV file.

        output form:
        {'field name_1': 'value_1',
         'field name_2': 'value_2',
         ...}
    """
    schema = parse_table_schema_from_json(self.schema_str)
    schema_2 = parse_table_schema_from_json(self.schema_str_2)  # parsed but unused here

    # Field mapping.
    field_map = [f for f in schema.fields]

    # Force UTF-8 default encoding (Python 2 only).
    reload(sys)
    sys.setdefaultencoding("utf-8")

    # Use a CSV reader that deals with quoted strings.
    reader = csv.reader(string_input.split('\n'))
    for csv_row in reader:
        values = [x.decode('utf8') for x in csv_row]
        # Store in a dictionary, iterating over the values from the csv file.
        row = {}
        i = 0
        for value in values:
            row[field_map[i].name] = value
            i += 1
        return row
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    # Replace gft-app-294621 with your project id.
    parser.add_argument('--input_topic',
                        dest='input_topic',
                        default='projects/gft-app-294621/topics/audio_topic',
                        help='Input Pub/Sub topic.')
    # Replace gft-app-294621 with your project id.
    parser.add_argument('--input_subscription',
                        dest='input_subscription',
                        default='projects/gft-app-294621/subscriptions/audio_sub',
                        help='Input Subscription')
    parser.add_argument('--output_table',
                        dest='output_table',
                        default='gft-app-294621:audios_dataset.classified_data',
                        help=('Output BigQuery table for results specified '
                              'as: PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'gft-app-294621'
    google_cloud_options.job_name = 'gft_app'
    # Add your bucket if you want to execute on Dataflow.
    google_cloud_options.staging_location = 'gs://audio_app/binaries'
    google_cloud_options.temp_location = 'gs://audio_app/temp'
    google_cloud_options.region = 'europe-west1'
    pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
    pipeline_options.view_as(StandardOptions).streaming = True
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    p = beam.Pipeline(options=pipeline_options)

    data = (p | 'Read from PubSub' >> beam.io.ReadFromPubSub(
        subscription=known_args.input_subscription))
    processed_data = (data | beam.ParDo(preprocessAudio()))
    classified_data = (processed_data | beam.ParDo(audioClassifier()))

    table_schema = parse_table_schema_from_json(
        json.dumps(json.load(open("schema.json"))["schema"]))

    classified_data | 'Write to Big Query' >> beam.io.WriteToBigQuery(
        known_args.output_table,
        schema=table_schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    result = p.run()
    result.wait_until_finish()
def generate_wc_schema():
    """BigQuery schema for the word count table."""
    json_str = json.dumps({
        'fields': [
            {'name': 'word', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'percent', 'type': 'FLOAT', 'mode': 'NULLABLE'},
            {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
        ]
    })
    return parse_table_schema_from_json(json_str)
def test_parse_table_schema_from_json(self):
    string_field = bigquery.TableFieldSchema(
        name='s', type='STRING', mode='NULLABLE', description='s description')
    number_field = bigquery.TableFieldSchema(
        name='n', type='INTEGER', mode='REQUIRED', description='n description')
    record_field = bigquery.TableFieldSchema(
        name='r', type='RECORD', mode='REQUIRED', description='r description',
        fields=[string_field, number_field])
    expected_schema = bigquery.TableSchema(fields=[record_field])
    json_str = json.dumps({'fields': [
        {'name': 'r', 'type': 'RECORD', 'mode': 'REQUIRED',
         'description': 'r description',
         'fields': [
             {'name': 's', 'type': 'STRING', 'mode': 'NULLABLE',
              'description': 's description'},
             {'name': 'n', 'type': 'INTEGER', 'mode': 'REQUIRED',
              'description': 'n description'}]}]})
    self.assertEqual(parse_table_schema_from_json(json_str), expected_schema)
def generate_url_schema():
    """BigQuery schema for the urls count table."""
    json_str = json.dumps({
        'fields': [
            {'name': 'url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'count', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
        ]
    })
    return parse_table_schema_from_json(json_str)
def parse_line(self, element):
    import csv
    from apache_beam.io.gcp.bigquery import parse_table_schema_from_json

    schema = parse_table_schema_from_json(self.schema_str)
    field_map = [f for f in schema.fields]

    for line in csv.reader(element.split('\n'),
                           quotechar='"',
                           delimiter='\t',
                           quoting=csv.QUOTE_ALL,
                           skipinitialspace=True):
        row = {}
        i = 0
        values = [v for v in line]
        for value in values:
            row[field_map[i].name.encode('utf-8')] = value
            i += 1
        return row
def get_bq_schema(self, env, schema_file):
    if env == 'dev':
        with open(schema_file) as f:
            data = f.read()
        schema_str = '{"fields": ' + data + '}'
    elif env == 'test':
        from google.cloud import storage
        client = storage.Client()
        bucket = client.get_bucket(self.gcs_bucket)  # e.g. 'saledata-deba'
        blob = bucket.get_blob(
            self.schema_folder_gs + '/'
            + schema_file)  # e.g. 'schema/BreadBasketSchema.json'
        schema_str = ('{"fields": '
                      + blob.download_as_string().decode("utf-8") + '}')
    schema = parse_table_schema_from_json(schema_str)
    return schema
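# A hypothetical call sketch; the wrapping in '{"fields": ...}' above implies
# the schema file holds a bare JSON array of field objects, e.g.
# [{"name": "item", "type": "STRING", "mode": "NULLABLE"}, ...].
# The owning class name here is a placeholder, not from the source.
ingestion = DataIngestion()
schema = ingestion.get_bq_schema(env='dev',
                                 schema_file='BreadBasketSchema.json')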
def parse_method(self, string_input):
    schema = parse_table_schema_from_json(self.schema_str)
    field_map = [f for f in schema.fields]
    reader = csv.reader(string_input.split('\n'))
    for csv_row in reader:
        values = [x for x in csv_row]
        month = u'01'
        day = u'01'
        year = values[2]
        row = {}
        i = 0
        for value in values:
            if field_map[i].type == 'DATE':
                value = u'-'.join((year, month, day))
            row[field_map[i].name] = value
            i += 1
        return row
def parse_method(self, string_input):
    schema = parse_table_schema_from_json(self.schema_str)
    field_map = [f for f in schema.fields]
    reader = csv.reader(string_input.split('\n'))
    for csv_row in reader:
        values = [x for x in csv_row]
        # Our source data only contains year, so default January 1st as the
        # month and day.
        month = u'01'
        day = u'01'
        # The year comes from our source data.
        year = values[2]
        row = {}
        i = 0
        for value in values:
            if field_map[i].type == 'DATE':
                value = u'-'.join((year, month, day))
            row[field_map[i].name] = value
            i += 1
        return row
def run(argv=None):
    user_options = PipelineOptions().view_as(UserOptions)
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True
    data_ingestion = DataTransformation()
    p = beam.Pipeline(options=pipeline_options)
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    (p
     | 'Read From Text' >> beam.io.ReadFromText(user_options.input,
                                                skip_header_lines=1)
     | 'String to BigQuery Row' >>
     beam.Map(lambda s: data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.WriteToBigQuery(
             user_options.output,
             schema=schema,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
def run(argv=None):
    print('running')
    p = beam.Pipeline(options=PipelineOptions())

    lines = (p | beam.Create([
        {"id": "some random name", "value": "i dont know"},
        {"id": "id2", "value": "whatever man"},
    ]))

    schema_str = '{"fields": ' + json.dumps(SCHEMA_OBJ) + '}'
    schema = parse_table_schema_from_json(schema_str)
    output_destination = '%s.%s' % (BQ_DATASET, BQ_TABLE)

    (lines | 'Write lines to BigQuery' >> beam.io.WriteToBigQuery(
        output_destination,
        schema=schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))

    p.run().wait_until_finish()
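# Hypothetical module-level constants assumed by the snippet above (the
# dataset and table names are placeholders). SCHEMA_OBJ is a bare list of
# field dicts, hence the '{"fields": ...}' wrapping before parsing.
SCHEMA_OBJ = [
    {"name": "id", "type": "STRING", "mode": "NULLABLE"},
    {"name": "value", "type": "STRING", "mode": "NULLABLE"},
]
BQ_DATASET = 'my_dataset'  # placeholder
BQ_TABLE = 'my_table'  # placeholder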
def parse_method(self, string_input):
    """This method translates a single line of comma separated values to a
    dictionary which can be loaded into BigQuery.

    Args:
        string_input: A comma separated list of values.

    Returns:
        A dict mapping BigQuery column names as keys to the corresponding
        value parsed from string_input.
    """
    # Strip out return characters and quote characters.
    schema = parse_table_schema_from_json(self.schema_str)
    field_map = [f for f in schema.fields]

    # Force UTF-8 default encoding (Python 2 only).
    reload(sys)
    sys.setdefaultencoding("utf-8")

    # Use a CSV Reader which can handle quoted strings etc.
    reader = csv.reader(string_input.split('\n'))
    for csv_row in reader:
        values = [x.decode('utf8') for x in csv_row]
        row = {}
        i = 0
        for value in values:
            row[field_map[i].name] = value
            i += 1
        return row
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.
    # This defaults the output table in your BigQuery project. You'll have
    # to create the lake dataset yourself using: bq mk lake
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.orders_denormalized_cogroupbykey')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataLakeToDataMartCGBK is a class we built in this script to hold the
    # logic for transforming the file into a BigQuery table. It also contains
    # an example of using CoGroupByKey.
    data_lake_to_data_mart = DataLakeToDataMartCGBK()

    schema = parse_table_schema_from_json(data_lake_to_data_mart.schema_str)
    pipeline = beam.Pipeline(options=PipelineOptions(pipeline_args))

    # This query returns details about the account, normalized into a
    # different table. We will be joining the data in to the main orders
    # dataset in order to create a denormalized table.
    account_details_source = (
        pipeline
        | 'Read Account Details from BigQuery ' >> beam.io.Read(
            beam.io.BigQuerySource(query="""
                SELECT
                  acct_number,
                  acct_company_name,
                  acct_group_name,
                  acct_name,
                  acct_org_name,
                  address,
                  city,
                  state,
                  zip_code,
                  country
                FROM
                  `python-dataflow-example.example_data.account`
            """, use_standard_sql=True))
        # This next stage of the pipeline maps the acct_number to a single
        # row of results from BigQuery. Mapping this way helps Dataflow move
        # your data around to different workers. When later stages of the
        # pipeline run, all results from a given account number will run on
        # one worker.
        | 'Map Account to Account Details' >> beam.Map(
            lambda row: (row['acct_number'], row)))

    orders_query = data_lake_to_data_mart.get_orders_query()
    # Read the orders from BigQuery. This is the source of the pipeline. All
    # further processing starts with rows read from the query results here.
    orders = (
        pipeline
        | 'Read Orders from BigQuery ' >> beam.io.Read(
            beam.io.BigQuerySource(query=orders_query, use_standard_sql=True))
        # As above, map the acct_number to a single row of results so that
        # all results from a given account number land on one worker.
        | 'Map Account to Order Details' >> beam.Map(
            lambda row: (row['acct_number'], row)))

    # CoGroupByKey allows us to arrange the results together by key.
    # Both "orders" and "account_details" are maps of
    # acct_number -> "Row of results from BigQuery".
    # The mapping is done in the above code using beam.Map().
    result = {'orders': orders, 'account_details': account_details_source} | \
        beam.CoGroupByKey()

    # The add_account_details function is responsible for defining how to
    # join the two datasets. It passes the results of CoGroupByKey, which
    # groups the data from the same key in each dataset together in the same
    # worker.
    joined = result | beam.FlatMap(data_lake_to_data_mart.add_account_details)
    joined | 'Write Data to BigQuery' >> beam.io.Write(
        beam.io.BigQuerySink(
            # The table name is a required argument for the BigQuery sink.
            # In this case we use the value passed in from the command line.
            known_args.output,
            # Here we use the JSON schema read in from a JSON file.
            # Specifying the schema allows the API to create the table
            # correctly if it does not yet exist.
            schema=schema,
            # Creates the table in BigQuery if it does not yet exist.
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            # Deletes all data in the BigQuery table before writing.
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
    pipeline.run().wait_until_finish()
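# A sketch of the element shape add_account_details receives from
# CoGroupByKey; account numbers and fields here are illustrative:
# ('ACCT-001',
#  {'orders': [{'acct_number': 'ACCT-001', 'order_id': 1}, ...],
#   'account_details': [{'acct_number': 'ACCT-001', 'acct_name': '...'}]})
# A hypothetical join mirroring what the method is described as doing
# (the real implementation lives in DataLakeToDataMartCGBK):
def add_account_details(element):
    acct_number, grouped = element
    details = grouped['account_details']
    for order in grouped['orders']:
        if details:
            # Enrich each order with the fields of its (single) account row.
            order.update(details[0])
        yield order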
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_identifier',
                        required=True,
                        help='Used to identify LTV model and append to '
                             'table and file names',
                        default='by_sample_id')  # or: filter_campaign
    parser.add_argument('--stage-bucket',
                        dest='stage_bucket',
                        required=False,
                        help='Staging bucket to use',
                        default='ltv-dataflow')
    parser.add_argument('--data-bucket',
                        dest='data_bucket',
                        required=False,
                        help='Data bucket to use',
                        default='telemetry-to-gcp')
    parser.add_argument('--load_bq',
                        required=False,
                        help='True/False to load summary/details to bq',
                        default=False)
    parser.add_argument('--estimate_model',
                        required=False,
                        help='True/False to estimate model params',
                        default=False)
    parser.add_argument('--calculate_model',
                        required=False,
                        help='True/False to calculate ltv',
                        default=False)
    parser.add_argument('--calculate_stats',
                        required=False,
                        help='True/False to calculate stats',
                        default=False)
    # To be merged into calculate_stats once the wildcard handling is
    # figured out.
    parser.add_argument('--upload_stats',
                        required=False,
                        help='True/False to upload stats',
                        default=False)
    parser.add_argument('--send_output',
                        required=False,
                        help='True/False to send ltv and aggr files to '
                             'Marketing GCP',
                        default=False)
    parser.add_argument('--delete_data',
                        required=False,
                        help='True/False to delete input data and BQ data',
                        default=False)

    start = time.clock()

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)
    logging.info('running beam_calc for model: ' + known_args.model_identifier)

    # Read in the output bq table schema.
    bg_out_schema = ''
    schema_file = 'gs://{}/templates/input/calc_output_schema.json'.format(
        known_args.stage_bucket)
    with gcs.open(schema_file) as f:
        data = f.read()
        # Wrapping the schema in "fields" is required for the BigQuery API.
        bg_out_schema = '{"fields": ' + data + '}'
        schema = parse_table_schema_from_json(bg_out_schema)
        # Note: parse_table_schema_from_json is deprecated since 2.11.0;
        # use bigquery_tools.parse_table_schema_from_json instead.

    # Estimate the LTV model.
    min_sample_size = calculate_min_sample_size()
    # estimate_model(min_sample_size)

    pipeline_options = PipelineOptions(pipeline_args)
    p = beam.Pipeline(options=pipeline_options)

    # To be replaced with a direct telemetry bq table to Dataflow input?
    input_file_dir = ('gs://' + known_args.data_bucket + '/clv/'
                      + known_args.model_identifier.replace('_', '-')
                      + '/{0}/*.parquet')

    # Load parquet files into bq, overwrite. (We may not need this if we can
    # join datasets in Telemetry -- although no, we need the random
    # selection.) Use a dummy singleton DoFn if there is no Parquet reader.
    # It is much faster to directly load summary and details to bq - we need
    # them in bq for a quick join (not sure how fast a join is as a
    # pcollection).
    if known_args.load_bq:
        logging.info('loading bq')
        load_data_bq('ltv', 'summary_' + known_args.model_identifier,
                     input_file_dir.format('summary'))
        load_data_bq('ltv', 'details_' + known_args.model_identifier,
                     input_file_dir.format('details'))

    # Run this to estimate LTV model parameters.
    if known_args.estimate_model:
        logging.info('estimate model parameters: '
                     + known_args.estimate_model)
        serial_dummy = (
            p
            | 'Read' >> beam.Create(['serial_dummy'])
            | 'Estimate Lifetimes Model' >> beam.ParDo(
                ltv_calculate.EstimateLTVFn(
                    min_sample_size,
                    output_bucket='gs://ltv-dataflow-dev/tmp/',
                    model_tag=known_args.model_identifier)))
        p.run().wait_until_finish()

    # Run estimation first before moving on to calculation.
    # This has to be run on Dataflow or it will not upload to bq.
    if known_args.calculate_model:
        logging.info('calculate ltv')
        data_query = ('SELECT * FROM ltv.summary_'
                      + known_args.model_identifier)
        # Optionally sample: + ' ORDER BY RAND() LIMIT {}'.format(100)
        (p
         | 'Read Orders from BigQuery ' >> beam.io.Read(
             beam.io.BigQuerySource(query=data_query, use_standard_sql=True))
         | 'Apply Lifetimes Model' >> beam.ParDo(
             ltv_calculate.CalcLTVFn(
                 dill.load(gcs.open(
                     'gs://' + known_args.stage_bucket + '/tmp/bgf_'
                     + known_args.model_identifier + '.pkl', 'rb')),
                 dill.load(gcs.open(
                     'gs://' + known_args.stage_bucket + '/tmp/ggf_'
                     + known_args.model_identifier + '.pkl', 'rb'))))
         | 'Write Data to BigQuery' >> beam.io.WriteToBigQuery(
             'ltv.calc_' + known_args.model_identifier,
             schema=schema,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
        p.run().wait_until_finish()

    # Calculate aggregate statistics.
    if known_args.calculate_stats:
        logging.info('aggregate ltv')
        qry_outlier_stats = ("""SELECT count(distinct(client_id)) ct,
            avg(historical_searches) avg, stddev(historical_searches) std
            FROM ltv.summary_{0}
            WHERE client_id not in
                (SELECT client_id FROM ltv.details_{0}
                 WHERE default_search_engine='google-nocodes')""").format(
            known_args.model_identifier)
        query_job = bq_client.query(qry_outlier_stats)
        # No need to go through query_job.result().
        outlier_stats = query_job.to_dataframe()
        ct = outlier_stats['ct'][0]
        if ct > 0:
            mu = outlier_stats['avg'][0]
            sigma = outlier_stats['std'][0]
            outliers_upper = str(mu + 2.5 * sigma)
            outliers_lower = str(mu - 2.5 * sigma)
        # Any way to specify a computation/memory-intensive machine here?
        if known_args.model_identifier == 'filter_campaign':
            sid_list = ['']
        else:
            sid_list = ['10', '33', '53', '89']

        for sid in sid_list:
            if known_args.model_identifier == 'filter_campaign':
                sid_qry = ''
                assert sid == ''
            else:
                sid_qry = 'AND l.sample_id=' + sid

            # Each aggregation runs as its own branch from a serial dummy
            # input; all the DoFns share the same constructor signature, so
            # they can be driven from one list.
            aggr_dofns = [
                ('Age', ltv_aggregate.AggrCustomerAgeFn),
                ('E10', ltv_aggregate.AggrE10ActivityGroupLocaleFn),
                ('Geo', ltv_aggregate.AggrGlobalGeoUserStatusFn),
                ('City', ltv_aggregate.AggrCityFn),
                ('ECB', ltv_aggregate.AggrEngineChannelBrowserFn),
                ('Mem', ltv_aggregate.AggrMemoryFn),
                ('Attr1', ltv_aggregate.AggrAttributes1Fn),
                ('Attr2', ltv_aggregate.AggrAttributes2Fn),
                ('Attr3', ltv_aggregate.AggrAttributes3Fn),
                ('OS1', ltv_aggregate.AggrOS1Fn),
                ('OS2', ltv_aggregate.AggrOS2Fn),
                ('SCD', ltv_aggregate.AggrSyncConfiguredDesktopFn),
                ('SCM', ltv_aggregate.AggrSyncConfiguredMobileFn),
                ('BC', ltv_aggregate.AggrBookmarksCountFn),
            ]
            for tag, dofn in aggr_dofns:
                (p
                 | 'Read_' + tag + sid >> beam.Create(
                     ['another_serial_dummy_' + tag.lower() + sid])
                 | dofn.__name__ + sid >> beam.ParDo(
                     dofn(model_tag=known_args.model_identifier,
                          outliers_lower=outliers_lower,
                          outliers_upper=outliers_upper,
                          sample_id_qry=sid_qry,
                          sid=sid,
                          output_folder=known_args.stage_bucket)))

        # Compute ltv statistics. Quantile computation reference:
        # https://stackoverflow.com/questions/46827512/efficient-algorithm-for-computing-quantiles-in-terabytes-dataset
        p.run().wait_until_finish()

    if known_args.upload_stats:
        # Delete anything from the bq aggr table before running aggr stats.
        qry_truncate = ('DELETE from ltv.aggr_{} WHERE True').format(
            known_args.model_identifier)
        bq_client.query(qry_truncate)

        # Load any files in ltv-dataflow/tmp containing self.model_tag, and
        # delete them.
        dataset_ref = bq_client.dataset('ltv')
        job_config = bigquery.LoadJobConfig()
        job_config.autodetect = True
        job_config.skip_leading_rows = 1
        # The source format defaults to CSV, so the line below is optional.
        job_config.source_format = bigquery.SourceFormat.CSV
        uri = ('gs://' + known_args.stage_bucket + '/tmp/'
               + known_args.model_identifier + '/aggr_*.csv')
        try:
            load_job = bq_client.load_table_from_uri(
                uri,
                dataset_ref.table('aggr_' + known_args.model_identifier),
                job_config=job_config)  # API request
            print("Starting job {}".format(load_job.job_id))
            load_job.result()  # Waits for table load to complete.
            print("Job finished.")
            destination_table = bq_client.get_table(
                dataset_ref.table('aggr_' + known_args.model_identifier))
            print("Loaded {} rows.".format(destination_table.num_rows))
        except Exception:
            logging.info('uri: ' + uri)

    # Push files to Marketing GCP in a GCF?
    # Output parquet files.
    if known_args.send_output:
        # Tested: Dataflow can write parquet files, so we can output files
        # in parquet format.
        calc_query = ('SELECT client_id, sample_id FROM ltv.summary_{} '
                      'LIMIT 10').format(known_args.model_identifier)
        # TODO: differentiate by sample id.
        sid = ''
        calc_fn = ('gs://' + known_args.stage_bucket + '/output/ltv_calc_'
                   + known_args.model_identifier + sid)
        (p
         | 'Read Orders from BigQuery ' >> beam.io.Read(
             beam.io.BigQuerySource(query=calc_query, use_standard_sql=True))
         | 'Write Data to Parquet' >> beam.io.WriteToParquet(
             calc_fn,
             pa.schema([('client_id', pa.string()),
                        ('sample_id', pa.int32())])))
        # https://arrow.apache.org/docs/python/data.html#type-metadata
        p.run().wait_until_finish()
        # If the file exists, push (move) it to marketing, else throw an
        # error. If all files are in the output dir, clean up bq and put
        # _SUCCESS in the dir, which should trigger a GCF.

    # Delete intermediary data from bq (and gcs?).
    if known_args.delete_data:
        # Delete everything in the bq ltv tables.
        for table in ('summary', 'details', 'calc', 'aggr'):
            qry_truncate = ('DELETE from ltv.{}_{} WHERE True').format(
                table, known_args.model_identifier)
            bq_client.query(qry_truncate)
        # Delete gcs data files?

    logging.info('ltv.run() runtime: ' + str(time.clock() - start))
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=False,
                        help='Input file to read. This can be a local file or '
                             'a file in a Google Storage Bucket.',
                        default='./data/AB_NYC_2019.csv')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='./direct_run_output/result')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)
    data_ingestion = DataIngestion()
    # schema = parse_table_schema_from_json(data_ingestion.schema_str)

    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
        lines = p | 'Read from a File' >> beam.io.ReadFromText(
            known_args.input, skip_header_lines=1)
        csv_lines = (
            lines
            | 'String To BigQuery Row' >>
            beam.Map(lambda s: data_ingestion.parse_method(s)))
        original_output = (
            csv_lines
            | 'Write to Text' >> beam.io.WriteToText(
                known_args.output + '1', file_name_suffix='.csv'))

        # Count the listings in each neighbourhood.
        def count_listings(neighbourhood_listings):
            (neighbourhood, listings) = neighbourhood_listings
            n = str(sum(listings))
            return {
                u'neighbourhood': neighbourhood,
                u'count_listings': unicode(n)
            }

        transformed_lines = (
            csv_lines
            | 'Projected' >> beam.Map(
                lambda row: {
                    f: row[f]
                    for f in ['neighbourhood',
                              'calculated_host_listings_count']
                })
            | 'Key-Value' >> beam.Map(
                lambda row: (row['neighbourhood'],
                             int(row['calculated_host_listings_count'])))
            | 'Group' >> beam.GroupByKey()
            | 'Count' >> beam.Map(count_listings))
        transformed_output = (
            transformed_lines
            | 'Write' >> beam.io.WriteToText(known_args.output + '2',
                                             file_name_suffix='.csv'))
def run(argv=None):
    """The main function that builds the pipeline and runs a batch
    processing."""
    # Add the specific command line arguments we expect: the input file to
    # read and the output table to import into BigQuery.
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=False,
                        help='Input file to read. A local file or '
                             'a file in a Google Storage Bucket.',
                        default='gs://takehom-data-k/AB_NYC_2019.csv')
    # First, create the dataset with the command: bq mk AB_NYC_2019
    # This defaults to the project dataset in BigQuery, in the form
    # dataset.table.
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BigQuery table to import results to.',
                        default='AB_NYC_2019.AB_NYC_2019')

    # Parse args from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Use the DataProcess class to hold the processing logic and transform
    # the file into a BigQuery table.
    data_process = DataProcess()

    # Define the output schemas from the json files.
    schema = parse_table_schema_from_json(
        data_process.schema_str)  # output schema for raw data
    schema_2 = parse_table_schema_from_json(
        data_process.schema_str_2)  # output schema for transformed data

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line. Example:
    # python dataflow_pipeline.py --project=$PROJECT \
    #     --runner=DataflowRunner \
    #     --staging_location=gs://$PROJECT/temp --temp_location gs://$PROJECT/temp \
    #     --input gs://$PROJECT/AB_NYC_2019.csv \
    #     --save_main_session
    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
        # Read the source file of the pipeline; skip the first row (the
        # header) and start with the lines read from the file.
        lines = p | 'Read from a csv File' >> beam.io.ReadFromText(
            known_args.input, skip_header_lines=1)

        # 1. First pipeline flow: raw data input and output.
        # This stage translates each single csv row (a string) into a
        # dictionary object acceptable by BigQuery, via the helper function,
        # which runs in parallel on input from the previous stage.
        csv_lines = (lines
                     | 'String To BigQuery Row' >>
                     beam.Map(lambda s: data_process.parse_method(s)))

        # The output stage imports into a BigQuery table, taking the
        # previous stage of the pipeline.
        original_output = (
            csv_lines
            | 'Write to BigQuery - Raw Data' >> beam.io.Write(
                beam.io.BigQuerySink(
                    # The table name is a required argument for the BigQuery
                    # sink; this is the value passed in from the command line.
                    known_args.output + '_raw',
                    # The schema comes from our schema file, in the form
                    # fieldName:fieldType.
                    schema=schema,
                    # Creates the table in BigQuery.
                    create_disposition=beam.io.BigQueryDisposition.
                    CREATE_IF_NEEDED,
                    # Before writing, deletes all data in the BigQuery table.
                    write_disposition=beam.io.BigQueryDisposition.
                    WRITE_TRUNCATE)))

        # 2. Second pipeline flow: a group-by with counts over two fields.
        # Helper function to count the number of listings in each
        # neighbourhood, returned as a dictionary of unicode strings; our
        # schema file handles the assigned data types when importing into
        # BigQuery.
        def count_listings(neighbourhood_listings):
            (neighbourhood, listings) = neighbourhood_listings
            n = str(sum(listings))
            return {
                u'neighbourhood': neighbourhood,
                u'count_listings': unicode(n)
            }

        # This stage takes the dictionary objects from the previous pipeline,
        # projects the fields we are interested in, creates a key-value pair
        # for aggregation, groups by the 'neighbourhood' field with the
        # 'calculated_host_listings_count' field, and then uses the helper
        # function to count the host listings by summing up the values in
        # each 'neighbourhood' group.
        transformed_lines = (
            csv_lines
            | 'Projected to desired fields' >> beam.Map(
                lambda row: {
                    f: row[f]
                    for f in ['neighbourhood',
                              'calculated_host_listings_count']
                })
            | 'Key-Value pair' >> beam.Map(
                lambda row: (row['neighbourhood'],
                             int(row['calculated_host_listings_count'])))
            | 'Group' >> beam.GroupByKey()
            | 'Count' >> beam.Map(count_listings))

        # The second output stage imports the aggregated and transformed
        # output into a BigQuery table.
        transformed_output = (
            transformed_lines
            | 'Write to BigQuery - Transformed Data' >> beam.io.Write(
                beam.io.BigQuerySink(
                    known_args.output + '_transform',
                    schema=schema_2,
                    create_disposition=beam.io.BigQueryDisposition.
                    CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.
                    WRITE_TRUNCATE)))
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect. Specifically
    # we have the input file to load and the output table to write to.
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
             'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # Useful for quickly debugging on a small set of data.
        default='gs://python-dataflow-example/data_files/head_usa_names.csv')
    # The output defaults to the lake dataset in your BigQuery project.
    # You'll have to create the lake dataset yourself using this command:
    # bq mk lake
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.usa_names_enriched')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataIngestion is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_ingestion = DataIngestion()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line. This includes information like where Dataflow should
    # store temp files, and what the project id is.
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    # This function adds in a full state name by looking up the
    # full name in the short_to_long_name_map. The short_to_long_name_map
    # comes from a read from BigQuery in the next few lines.
    def add_full_state_name(row, short_to_long_name_map):
        row['state_full_name'] = short_to_long_name_map[row['state']]
        return row

    # This is a second source of data. The source is from BigQuery.
    # This will come into our pipeline as a side input.
    read_query = """
    SELECT
      state_name,
      state_abbreviation
    FROM
      `python-dataflow-example.example_data.state_abbreviations`"""

    state_abbreviations = (
        p
        | 'Read from BigQuery' >> beam.io.Read(
            beam.io.BigQuerySource(query=read_query, use_standard_sql=True))
        # We must create a python tuple of key to value pairs here in order
        # to use the data as a side input. Dataflow will use the keys to
        # distribute the work to the correct worker.
        | 'Abbreviation to Full Name' >> beam.Map(
            lambda row: (row['state_abbreviation'], row['state_name'])))

    (p
     # Read the file. This is the source of the pipeline. All further
     # processing starts with lines read from the file. We use the input
     # argument from the command line. We also skip the first line which is
     # a header row.
     | 'Read From Text' >> beam.io.ReadFromText(known_args.input,
                                                skip_header_lines=1)
     # Translates from the raw string data in the CSV to a dictionary.
     # The dictionary is keyed by column names with the values being the
     # values we want to store in BigQuery.
     | 'String to BigQuery Row' >>
     beam.Map(lambda s: data_ingestion.parse_method(s))
     # Here we pass in a side input, which is data that comes from outside
     # our CSV source. The side input contains a map of states to their full
     # name.
     | 'Join Data' >> beam.Map(add_full_state_name,
                               AsDict(state_abbreviations))
     # This is the final stage of the pipeline, where we define the
     # destination of the data. In this case we are writing to BigQuery.
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             known_args.output,
             # Here we use the JSON schema read in from a JSON file.
             # Specifying the schema allows the API to create the table
             # correctly if it does not yet exist.
             schema=schema,
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             # Deletes all data in the BigQuery table before writing.
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
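# A sketch of what the side input delivers to add_full_state_name (defined
# in run() above); the state values here are illustrative.
# AsDict(state_abbreviations) materializes the (abbreviation, full-name)
# tuples into a plain dict on each worker, so the join is a dict lookup:
short_to_long_name_map = {'KS': 'Kansas', 'CA': 'California'}
row = {'state': 'KS', 'name': 'Dorothy'}
row = add_full_state_name(row, short_to_long_name_map)
# row -> {'state': 'KS', 'name': 'Dorothy', 'state_full_name': 'Kansas'}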
def track_events():
    import ujson
    from datetime import datetime
    from uuid import uuid4

    from conf import conf
    from settings.settings import (keys, appsflier_fields, events_schema,
                                   send_event_schema)
    from utils.functions import (validate_version, set_properties,
                                 validate_appsflyer, send_appsflyer_event)
    from utils.redis_client import RedisClient

    project = conf.PROJECT
    redis_client = RedisClient()

    def transform(message):
        event = ujson.loads(message)
        # if event.get("platform") in ("Android", "iOS"):
        #     appsflyer_version = validate_version(
        #         event.get("appVersion"),
        #         keys.get("appsflyer_version_{}".format(
        #             event.get("platform"))))
        # else:
        #     appsflyer_version = False
        # event["send_appsflyer"] = validate_appsflyer(
        #     event, keys.get("appsflyer_events"), appsflyer_version)
        event["send_appsflyer"] = False
        return [event]

    def send_appsflyer(event):
        event_af = ujson.loads(ujson.dumps(event))
        if (not event_af.get("send_appsflyer")) or project == "merlin-qa":
            return []
        try:
            send_appsflyer_event(event_af, appsflier_fields)
            return [{
                "type": "apps",
                "idEvent": event_af.get("idEvent"),
                "createdAt": "{:%Y-%m-%d %H:%M:%S.%f}".format(datetime.now()),
                "sent": True
            }]
        except Exception as ex:
            logging.error(ex.message)
            return [{
                "type": "apps",
                "idEvent": event_af.get("idEvent"),
                "createdAt": "{:%Y-%m-%d %H:%M:%S.%f}".format(datetime.now()),
                "sent": False,
                "error": ex.message
            }]

    def transform_bigquery(event):
        from google.cloud import datastore
        event_bq = ujson.loads(ujson.dumps(event))
        event_bq = set_properties(event_bq)
        if "idJob" in event_bq:
            try:
                job_key = datastore.Key.from_legacy_urlsafe(
                    urlsafe=event_bq.get("idJob"))
                event_bq["idJob"] = job_key.name
            except Exception:
                pass
        event_bq.pop("appsflyer", None)
        event_bq.pop("send_appsflyer", None)
        return [event_bq]

    def select_table(bq_event):
        if bq_event.get("eventName") not in keys.get("expensive_events"):
            return [bq_event]
        return []

    def select_table_expensive(bq_event):
        if bq_event.get("eventName") in keys.get("expensive_events"):
            return [bq_event]
        return []

    # Define pipeline.
    storage_bucket = "gs://{}/dataflow".format(conf.BUCKET_NAME)
    pipeline = beam.Pipeline(
        runner=conf.RUNNER,
        argv=[
            "--project", project,
            "--subnetwork", "regions/us-central1/subnetworks/default",
            "--staging_location", "{}/staging_location".format(storage_bucket),
            "--temp_location", "{}/temp".format(storage_bucket),
            "--output", "{}/output".format(storage_bucket),
            "--setup_file", "./setup.py",
            "--job_name", "events-sync-{}".format(str(uuid4())),
            "--batch",
            "--worker_machine_type", "n1-standard-2"
        ])

    # PTransforms.
    # Read the pending messages (pulled from Redis) into the pipeline.
    events = pipeline | "Read pubsub messages" >> beam.Create(
        redis_client.receive_messages())
    events = events | "Transform message" >> beam.FlatMap(
        lambda message: transform(message))

    # Send event to appsflyer (disabled).
    # apps_events = events | "Send to appsflyer" >> beam.FlatMap(
    #     lambda event: send_appsflyer(event))

    bigquery_events = events | "Transform to bigquery" >> beam.FlatMap(
        lambda event: transform_bigquery(event))
    bigquery_events_normal = (
        bigquery_events
        | "Select event to bigquery" >> beam.FlatMap(
            lambda event: select_table(event)))
    bigquery_events_expensive = (
        bigquery_events
        | "Select event expensive to bigquery" >> beam.FlatMap(
            lambda event: select_table_expensive(event)))

    from apache_beam.io.gcp.bigquery import parse_table_schema_from_json
    bigquery_events_schema = parse_table_schema_from_json(
        ujson.dumps(events_schema))

    bigquery_path = "{}:merlin_events.Event".format(project)
    bigquery_events_normal | "Write events on bigquery" >> beam.io.WriteToBigQuery(
        bigquery_path,
        schema=bigquery_events_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)

    bigquery_path = "{}:merlin_events.Event_expensive".format(project)
    bigquery_events_expensive | "Write events expensive on bigquery" >> beam.io.WriteToBigQuery(
        bigquery_path,
        schema=bigquery_events_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)

    bigquery_events_schema = parse_table_schema_from_json(
        ujson.dumps(send_event_schema))
    bigquery_path = "{}:merlin_events.SendEvent".format(project)
    # apps_events | "Write sent apps events on bigquery" >> beam.io.WriteToBigQuery(
    #     bigquery_path,
    #     schema=bigquery_events_schema,
    #     write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
    #     create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)

    result = pipeline.run()
    if conf.ENV == "LOCAL":
        result.wait_until_finish()
def run(argv=None):
    # Main pipeline run def.
    # Make explicit BQ schemas for the output tables; this variant keeps the
    # sentiment fields commented out.
    bigqueryschema_json = (
        '{"fields": ['
        '{"name":"id","type":"STRING"},'
        '{"name":"text","type":"NUMERIC"},'
        '{"name":"user_id","type":"STRING"},'
        # '{"name":"sentiment","type":"FLOAT"},'
        '{"name":"posted_at","type":"TIMESTAMP"}'
        ']}')
    bigqueryschema = parse_table_schema_from_json(bigqueryschema_json)

    bigqueryschema_mean_json = (
        '{"fields": ['
        '{"name":"posted_at","type":"TIMESTAMP"}'
        # ,'{"name":"sentiment","type":"FLOAT"}'
        ']}')
    bigqueryschema_mean = parse_table_schema_from_json(
        bigqueryschema_mean_json)

    # Build and run the pipeline.
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'),
        default="projects/YOUR_PROJECT/subscriptions/YOUR_SUBSCRIPTION")
class ApplyDoFn(beam.DoFn):
    # Reconstructed context: the original snippet begins mid-method. The
    # surrounding calls (orient='index', .transpose(), ParDo(ApplyDoFn()))
    # imply a DoFn whose process method turns the element dict into a
    # single-row DataFrame before scoring it with a preloaded model.
    def process(self, element):
        new_x = pd.DataFrame.from_dict(
            element, orient='index').transpose().fillna(0)
        weight = self._model.predict(new_x.iloc[:, 1:8])[0]
        return [{
            'guid': element['guid'],
            'weight': weight,
            'time': str(element['time'])
        }]


schema = parse_table_schema_from_json(
    json.dumps({
        'fields': [
            {'name': 'guid', 'type': 'STRING'},
            {'name': 'weight', 'type': 'FLOAT64'},
            {'name': 'time', 'type': 'STRING'},
        ]
    }))

parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)

p = beam.Pipeline(options=pipeline_options)
data = p | 'Read from Bigquery' >> beam.io.Read(
    beam.io.BigQuerySource(query=query, use_standard_sql=True))
scored = data | 'Apply Model' >> beam.ParDo(ApplyDoFn())
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.
    # This defaults the output table in your BigQuery project. You'll have
    # to create the lake dataset yourself using: bq mk lake
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.orders_denormalized_sideinput')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataLakeToDataMart is a class we built in this script to hold the logic
    # for transforming the file into a BigQuery table.
    data_lake_to_data_mart = DataLakeToDataMart()

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_lake_to_data_mart.schema_str)

    # This query returns details about the account, normalized into a
    # different table. We will be joining the data in to the main orders
    # dataset in order to create a denormalized table.
    account_details_source = (
        p
        | 'Read Account Details from BigQuery ' >> beam.io.Read(
            beam.io.BigQuerySource(query="""
                SELECT
                  acct_number,
                  acct_company_name,
                  acct_group_name,
                  acct_name,
                  acct_org_name,
                  address,
                  city,
                  state,
                  zip_code,
                  country
                FROM
                  `qwiklabs-resources.python_dataflow_example.account`""",
                use_standard_sql=True))
        # This next stage of the pipeline maps the acct_number to a single
        # row of results from BigQuery. Mapping this way helps Dataflow move
        # your data around to different workers. When later stages of the
        # pipeline run, all results from a given account number will run on
        # one worker.
        | 'Account Details' >> beam.Map(
            lambda row: (row['acct_number'], row)))

    orders_query = data_lake_to_data_mart.get_orders_query()
    (p
     # Read the orders from BigQuery. This is the source of the pipeline.
     # All further processing starts with rows read from the query results
     # here.
     | 'Read Orders from BigQuery ' >> beam.io.Read(
         beam.io.BigQuerySource(query=orders_query, use_standard_sql=True))
     # Here we pass in a side input, which is data that comes from outside
     # our main source. The side input contains a map of account numbers to
     # the full account details.
     | 'Join Data with sideInput' >> beam.Map(
         data_lake_to_data_mart.add_account_details,
         AsDict(account_details_source))
     # This is the final stage of the pipeline, where we define the
     # destination of the data. In this case we are writing to BigQuery.
     | 'Write Data to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             known_args.output,
             # Here we use the JSON schema read in from a JSON file.
             # Specifying the schema allows the API to create the table
             # correctly if it does not yet exist.
             schema=schema,
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             # Deletes all data in the BigQuery table before writing.
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
def run(argv=None):
    # Main pipeline run def.
    # Make explicit BQ schemas for the output tables.
    bigqueryschema_json = (
        '{"fields": ['
        '{"name":"id","type":"STRING"},'
        '{"name":"text","type":"NUMERIC"},'
        '{"name":"user_id","type":"STRING"},'
        '{"name":"sentiment","type":"FLOAT"},'
        '{"name":"posted_at","type":"TIMESTAMP"}'
        ']}')
    bigqueryschema = parse_table_schema_from_json(bigqueryschema_json)

    bigqueryschema_mean_json = (
        '{"fields": ['
        '{"name":"posted_at","type":"TIMESTAMP"},'
        '{"name":"sentiment","type":"FLOAT"}'
        ']}')
    bigqueryschema_mean = parse_table_schema_from_json(
        bigqueryschema_mean_json)

    # Build and run the pipeline.
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'),
        default="projects/YOUR_PROJECT/subscriptions/YOUR_SUBSCRIPTION")
    group.add_argument(
        '--input_topic',
        help=('Input PubSub topic of the form '
              '"projects/<PROJECT>/topics/<TOPIC>."'),
        default="projects/YOUR_PROJECT/topics/YOUR_TOPIC")
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    # Run on Cloud Dataflow by default.
    pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'YOUR_PROJECT'
    google_cloud_options.staging_location = 'gs://YOUR_BEAM_STAGING_BUCKET/staging'
    google_cloud_options.temp_location = 'gs://YOUR_BEAM_STAGING_BUCKET/temp'
    google_cloud_options.region = 'europe-west1'

    p = beam.Pipeline(options=pipeline_options)

    # Read from PubSub into a PCollection.
    if known_args.input_subscription:
        lines = p | "read in tweets" >> beam.io.ReadFromPubSub(
            subscription=known_args.input_subscription,
            with_attributes=False,
            id_label="tweet_id")
    else:
        lines = p | "read in tweets" >> beam.io.ReadFromPubSub(
            topic=known_args.input_topic,
            with_attributes=False,
            id_label="tweet_id")

    # Window them, and batch them into batches of 50 (not too large).
    output_tweets = (
        lines
        | 'assign window key' >> beam.WindowInto(window.FixedWindows(10))
        | 'batch into n batches' >> BatchElements(min_batch_size=49,
                                                  max_batch_size=50)
        | 'predict sentiment' >> beam.FlatMap(
            lambda messages: estimate(messages)))

    # Write to Bigquery.
    output_tweets | 'store twitter posts' >> beam.io.WriteToBigQuery(
        table="YOUR_TABLE",
        dataset="YOUR_DATASET",
        schema=bigqueryschema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        project="YOUR_PROJECT")

    # Average out and log the mean value.
    (output_tweets
     | 'pair with key' >> beam.Map(lambda x: (1, x))
     | 'group by key' >> beam.GroupByKey()
     | 'aggregate and format' >> beam.Map(aggregate_format)
     | 'store aggregated sentiment' >> beam.io.WriteToBigQuery(
         table="YOUR_TABLE",
         dataset="YOUR_DATASET",
         schema=bigqueryschema_mean,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         project="YOUR_PROJECT"))

    result = p.run()
    result.wait_until_finish()