def run(argv=None): """ This funciton parses the command line arguments and runs the Beam Pipeline. Args: argv: list containing the commandline arguments for this call of the script. """ schema_inferred = False data_args, pipeline_args = parse_data_generator_args(argv) data_args, schema_inferred = fetch_schema(data_args, schema_inferred) pipeline_options = PipelineOptions(pipeline_args) temp_location = pipeline_options.display_data()['temp_location'] temp_blob = write_n_line_file_to_gcs( pipeline_options.display_data()['project'], temp_location, data_args.num_records) data_gen = data_generator_from_data_args(data_args) # Initiate the pipeline using the pipeline arguments passed in from the # command line. This includes information including where Dataflow should # store temp files, and what the project id is and what runner to use. p = beam.Pipeline(options=pipeline_options) (p # Read the file we created with num_records newlines. | 'Read file with num_records lines' >> beam.io.ReadFromText(temp_location + '/temp_num_records.txt') # Use our instance of our custom DataGenerator Class to generate 1 fake datum # with the appropriate schema for each element in the PColleciton created above. | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen)) | 'Write to BigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery( # The table name is a required argument for the BigQuery sink. # In this case we use the value passed in from the command line. data_args.output_bq_table, schema=None if schema_inferred else data_gen.get_bq_schema_string(), # Creates the table in BigQuery if it does not yet exist. create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=data_gen.write_disp, # Use the max recommended batch size. batch_size=500)) p.run().wait_until_finish() # Manually clean up of temp_num_records.txt because it will be outside this job's # directory and Dataflow will not remove it for us. temp_blob.delete()
def run(argv=None): """ This function parses the command line arguments and runs the Beam Pipeline. Args: argv: list containing the commandline arguments for this call of the script. """ # Keeps track if schema was inferred by input or ouput table. schema_inferred = False data_args, pipeline_args = parse_data_generator_args(argv) data_args, schema_inferred = fetch_schema(data_args, schema_inferred) pipeline_options = PipelineOptions(pipeline_args) temp_location = pipeline_options.display_data()['temp_location'] temp_blob = write_n_line_file_to_gcs( pipeline_options.display_data()['project'], temp_location, data_args.num_records) data_gen = DataGenerator(bq_schema_filename=data_args.schema_file, input_bq_table=data_args.input_bq_table, p_null=data_args.p_null, n_keys=data_args.n_keys, min_date=data_args.min_date, max_date=data_args.max_date, only_pos=data_args.only_pos, max_int=data_args.max_int, max_float=data_args.max_float, float_precision=data_args.float_precision, write_disp=data_args.write_disp, key_skew=data_args.key_skew, primary_key_cols=data_args.primary_key_cols) # Initiate the pipeline using the pipeline arguments passed in from the # command line. This includes information including where Dataflow should # store temp files, and what the project id is and what runner to use. p = beam.Pipeline(options=pipeline_options) rows = ( p # Read the file we created with num_records newlines. | 'Read file with num_records lines' >> beam.io.ReadFromText( os.path.join('gs://', temp_blob.bucket.name, temp_blob.name)) # Use our instance of our custom DataGenerator Class to generate 1 fake # datum with the appropriate schema for each element in the PColleciton # created above. | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen)) | 'Parse Json Strings' >> beam.FlatMap(lambda row: [json.loads(row)])) if data_args.primary_key_cols: for key in data_args.primary_key_cols.split(','): rows |= 'Enforcing primary key: {}'.format( key) >> EnforcePrimaryKeys(key) if data_args.csv_schema_order: (rows | 'Order fields for CSV writing.' >> beam.FlatMap( lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))]) | 'Write to GCS' >> beam.io.textio.WriteToText( file_path_prefix=data_args.output_prefix, file_name_suffix='.csv') ) if data_args.avro_schema_file: fastavro_avsc = fastavro.schema.load_schema(data_args.avro_schema_file) (rows # Need to convert time stamps from strings to timestamp-micros | 'Fix date and time Types for Avro.' >> beam.FlatMap(lambda row: fix_record_for_avro(row, fastavro_avsc)) | 'Write to Avro.' >> beam.io.avroio.WriteToAvro( file_path_prefix=data_args.output_prefix, codec='null', file_name_suffix='.avro', use_fastavro=True, schema=fastavro_avsc)) if data_args.write_to_parquet: with open(data_args.schema_file, 'r') as infile: str_schema = json.load(infile) pa_schema = get_pyarrow_translated_schema(str_schema) (rows | 'Fix data and time Types for Parquet.' >> beam.FlatMap(lambda row: fix_record_for_parquet(row, str_schema)) | 'Write to Parquet.' >> beam.io.WriteToParquet( file_path_prefix=data_args.output_prefix, codec='null', file_name_suffix='.parquet', schema=pa_schema)) if data_args.output_bq_table: (rows | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery( # The table name is a required argument for the BigQuery sink. # In this case we use the value passed in from the command # line. data_args.output_bq_table, schema=None if schema_inferred else data_gen.get_bq_schema(), # Creates the table in BigQuery if it does not yet exist. 
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=data_gen.write_disp, # Use the max recommended batch size. batch_size=500)) p.run().wait_until_finish() # Manually clean up of temp_num_records.txt because it will be outside this # job's directory and Dataflow will not remove it for us. temp_blob.delete()
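# A hedged sketch of the FakeRowGen DoFn used by both run() variants above;
# the real class is defined elsewhere in this repo. It is sketched against the
# second variant's contract, where the 'Parse Json Strings' step json.loads()
# each output; the `generate_fake` method name is an assumption.
import json

import apache_beam as beam


class FakeRowGen(beam.DoFn):
    """Emits one fake record (as a JSON string) per input element."""

    def __init__(self, data_gen):
        self.data_gen = data_gen

    def process(self, element):
        # One fake row per input element, shaped by the DataGenerator's schema.
        yield json.dumps(self.data_gen.generate_fake())
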
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--core_host', dest='core_host', type=str)
    parser.add_argument('--core_port', dest='core_port', type=int)
    parser.add_argument('--core_username', dest='core_username', type=str)
    parser.add_argument('--core_password', dest='core_password', type=str)
    parser.add_argument('--core_database', dest='core_database', type=str)
    parser.add_argument('--remittances_host', dest='remittances_host', type=str)
    parser.add_argument('--remittances_port', dest='remittances_port', type=int)
    parser.add_argument('--remittances_username',
                        dest='remittances_username',
                        type=str)
    parser.add_argument('--remittances_password',
                        dest='remittances_password',
                        type=str)
    parser.add_argument('--remittances_database',
                        dest='remittances_database',
                        type=str)
    parser.add_argument('--auth_uri', dest='auth_uri', type=str)
    parser.add_argument('--auth_database', dest='auth_database', type=str)
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    project_id = pipeline_options.display_data()['project']

    with beam.Pipeline(options=pipeline_options) as p:
        # Reads from the data sources.
        read_core = (
            p
            | 'ReadCore' >> ReadFromDB(
                source_config=SourceConfiguration(
                    drivername='postgresql+psycopg2',
                    host=known_args.core_host,
                    port=known_args.core_port,
                    username=known_args.core_username,
                    password=known_args.core_password,
                    database=known_args.core_database,
                    create_if_missing=False),
                table_name='transaction',
                query='SELECT * FROM transaction')
            | 'FilterTransactions' >> beam.Filter(filter_transaction))

        read_auth = (
            p
            | 'Read Users' >> ReadFromMongoDB(uri=known_args.auth_uri,
                                              db=known_args.auth_database,
                                              coll='users')
            | 'TransformUsers' >> beam.ParDo(TransformUser())
            | 'FilterUsers' >> beam.Filter(filter_user))

        read_remittances = (
            p
            | 'ReadRemittances' >> ReadFromMySQL(
                query='SELECT * FROM valiu_remittances.remittance',
                host=known_args.remittances_host,
                database=known_args.remittances_database,
                user=known_args.remittances_username,
                password=known_args.remittances_password,
                port=known_args.remittances_port,
                splitter=splitters.NoSplitter())
            | 'TransformRemittances' >> beam.ParDo(TransformRemittance())
            | 'FilterRemittances' >> beam.Filter(filter_remittance))

        # Merges.
        merged_transactions = (
            (read_core, read_auth)
            | 'MergeTransactionsUsers' >> LeftJoin('id2', 'id', 'user_'))

        merged_remittances = (
            (read_remittances, read_auth)
            | 'MergeRemittanceUsers' >> MergeRemittancesUsers())

        # Writes to BigQuery.
        write_users = (
            read_auth
            | 'WriteUsers' >> beam.io.WriteToBigQuery(
                table='auth_users',
                dataset='auth',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=users_table_schema,
                additional_bq_parameters=users_table_partitioning))

        write_remittances = (
            merged_remittances
            | 'WriteRemittances' >> beam.io.WriteToBigQuery(
                table='remittances_movements',
                dataset='remittances',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=remittances_table_schema,
                additional_bq_parameters=remittances_table_partitioning))

        write_transactions_cash_in = (
            merged_transactions
            | 'FilterCashIn' >> beam.Filter(filter_currency_operation, 'COP',
                                            'USDv')
            | 'CleanTransactionsCashIn' >> beam.ParDo(
                TransformTransaction('cash_in'))
            | 'WriteCashIn' >> beam.io.WriteToBigQuery(
                table='core_cash_in',
                dataset='core',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=transactions_table_schema,
                additional_bq_parameters=transactions_table_partitioning))

        write_transactions_cash_out = (
            merged_transactions
            | 'FilterCashOut' >> beam.Filter(filter_currency_operation, 'USDv',
                                             'VES')
            | 'CleanTransactionsCashOut' >> beam.ParDo(
                TransformTransaction('cash_out'))
            | 'WriteCashOut' >> beam.io.WriteToBigQuery(
                table='core_cash_out',
                dataset='core',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=transactions_table_schema,
                additional_bq_parameters=transactions_table_partitioning))

        write_transactions_p2p = (
            merged_transactions
            | 'FilterP2P' >> beam.Filter(filter_currency_operation, 'USDv',
                                         'USDv')
            | 'CleanTransactionsP2P' >> beam.ParDo(TransformTransaction('p2p'))
            | 'WriteP2P' >> beam.io.WriteToBigQuery(
                table='core_p2p',
                dataset='core',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=transactions_table_schema,
                additional_bq_parameters=transactions_table_partitioning))

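# A minimal sketch of filter_currency_operation, inferred from its use above:
# beam.Filter(fn, 'COP', 'USDv') passes the extra positional arguments to fn
# after each element. The 'source_currency'/'target_currency' field names are
# assumptions for illustration only.
def filter_currency_operation(transaction, source, target):
    # Keep only transactions converting `source` currency into `target`.
    return (transaction.get('source_currency') == source
            and transaction.get('target_currency') == target)
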
logging.info('running locally on DirectRunner')
argv = [
    '--runner', 'DirectRunner',
    '--staging_location', os.path.join(args.data_dir, 'staging'),
    '--temp_location', os.path.join(args.data_dir, 'temp'),
    # See https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/
    # for more details.
    '--setup_file', os.path.join(CURRENT_DIR, 'setup.py'),
]
options = PipelineOptions(flags=argv)

t1 = time.time()
with tft_beam.Context(temp_dir=options.display_data()['temp_location']):
    pipeline = beam.Pipeline(options=options)

    # When training we want all the data.
    if args.mode == "train":
        logging.info("TRAINING")
        if args.run_cloud:
            source = (
                pipeline
                | 'Read BQ table' >> beam.io.Read(
                    beam.io.gcp.bigquery.BigQuerySource(
                        query=BQQuery.train_query, use_standard_sql=True)))
        else:
            source = (
                pipeline
                | 'Read local JSON' >> beam.io.ReadFromText(
                    os.path.join(args.data_dir, 'bq_sample.json'))
                | 'Parse JSON' >> MapAndFilterErrors(json.loads))
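
# A sketch of the MapAndFilterErrors transform used above, modeled on the
# pattern in the tensorflow_transform examples: apply `fn` to every element
# and drop (while counting) any element for which `fn` raises. The repo's
# actual implementation may differ in details.
import apache_beam as beam
from apache_beam.metrics import Metrics


class MapAndFilterErrors(beam.PTransform):
    """Like beam.Map(fn), but silently filters out elements where fn fails."""

    class _MapAndFilterErrorsDoFn(beam.DoFn):
        def __init__(self, fn):
            self._fn = fn
            self._bad_elements = Metrics.counter('MapAndFilterErrors',
                                                 'bad_elements')

        def process(self, element):
            try:
                yield self._fn(element)
            except Exception:  # pylint: disable=broad-except
                # Count rather than crash on malformed input lines.
                self._bad_elements.inc(1)

    def __init__(self, fn):
        self._fn = fn

    def expand(self, pcoll):
        return pcoll | beam.ParDo(self._MapAndFilterErrorsDoFn(self._fn))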