def run(argv=None):
    """
    This function parses the command line arguments and runs the Beam Pipeline.

    Args:
        argv: list containing the command line arguments for this call of the
            script.
    """
    # Keeps track of whether the schema was inferred from the input or output
    # table.
    schema_inferred = False

    data_args, pipeline_args = parse_data_generator_args(argv)
    data_args, schema_inferred = fetch_schema(data_args, schema_inferred)
    pipeline_options = PipelineOptions(pipeline_args)

    temp_location = pipeline_options.display_data()['temp_location']
    temp_blob = write_n_line_file_to_gcs(
        pipeline_options.display_data()['project'], temp_location,
        data_args.num_records)

    data_gen = DataGenerator(bq_schema_filename=data_args.schema_file,
                             input_bq_table=data_args.input_bq_table,
                             p_null=data_args.p_null,
                             n_keys=data_args.n_keys,
                             min_date=data_args.min_date,
                             max_date=data_args.max_date,
                             only_pos=data_args.only_pos,
                             max_int=data_args.max_int,
                             max_float=data_args.max_float,
                             float_precision=data_args.float_precision,
                             write_disp=data_args.write_disp,
                             primary_key_cols=data_args.primary_key_cols,
                             key_skew=data_args.key_skew)

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line. These include where Dataflow should store temp files,
    # the project id, and which runner to use.
    p = beam.Pipeline(options=pipeline_options)

    rows = (
        p
        # Read the file we created with num_records newlines.
        | 'Read file with num_records lines' >> beam.io.ReadFromText(
            os.path.join('gs://', temp_blob.bucket.name, temp_blob.name))

        # Use our instance of our custom DataGenerator class to generate 1
        # fake datum with the appropriate schema for each element in the
        # PCollection created above.
        | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen))
        | 'Parse Json Strings' >> beam.FlatMap(lambda row: [json.loads(row)]))

    if data_args.primary_key_cols:
        for key in data_args.primary_key_cols.split(','):
            rows |= 'Enforcing primary key: {}'.format(
                key) >> EnforcePrimaryKeys(key)

    if data_args.csv_schema_order:
        (rows
         | 'Order fields for CSV writing.' >> beam.FlatMap(
             lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))])
         | 'Write to GCS' >> beam.io.textio.WriteToText(
             file_path_prefix=data_args.output_prefix,
             file_name_suffix='.csv'))

    if data_args.avro_schema_file:
        fastavro_avsc = fastavro.schema.load_schema(data_args.avro_schema_file)

        (rows
         # Convert timestamps from strings to timestamp-micros.
         | 'Fix date and time Types for Avro.' >> beam.FlatMap(
             lambda row: fix_record_for_avro(row, fastavro_avsc))
         | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.avro',
             use_fastavro=True,
             schema=fastavro_avsc))

    if data_args.write_to_parquet:
        with open(data_args.schema_file, 'r') as infile:
            str_schema = json.load(infile)
        pa_schema = get_pyarrow_translated_schema(str_schema)
        (rows
         | 'Fix date and time Types for Parquet.' >> beam.FlatMap(
             lambda row: fix_record_for_parquet(row, str_schema))
         | 'Write to Parquet.' >> beam.io.WriteToParquet(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.parquet',
             schema=pa_schema))

    if data_args.output_bq_table:
        (rows
         | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             data_args.output_bq_table,
             schema=None if schema_inferred else data_gen.get_bq_schema(),
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=data_gen.write_disp,
             # Use the max recommended batch size.
             batch_size=500))

    p.run().wait_until_finish()

    # Manually clean up temp_num_records.txt because it lives outside this
    # job's directory and Dataflow will not remove it for us.
    temp_blob.delete()
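# A minimal usage sketch (not part of the original module) showing how run()
# might be invoked with an explicit argv list. The flag names below are
# assumptions inferred from the data_args attributes used above;
# parse_data_generator_args defines the authoritative spelling, and the real
# module may already provide its own entry point.
if __name__ == '__main__':
    run([
        '--schema_file=gs://my-bucket/schemas/my_table.json',  # assumed flag
        '--num_records=1000',                                  # assumed flag
        '--output_prefix=gs://my-bucket/data/my_table',        # assumed flag
        # Standard Beam options consumed by PipelineOptions.
        '--project=my-project',
        '--temp_location=gs://my-bucket/temp',
        '--runner=DirectRunner',
    ])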
def test_get_pyarrow_translated_schema(self):
    string_input_schema = {
        "fields": [{
            "type": "STRING",
            "name": "string1",
            "mode": "REQUIRED"
        }, {
            "type": "NUMERIC",
            "name": "numeric1",
            "mode": "NULLABLE"
        }, {
            "type": "INTEGER",
            "name": "integer1",
            "mode": "REQUIRED"
        }, {
            "type": "FLOAT",
            "name": "float1",
            "mode": "NULLABLE"
        }, {
            "type": "BOOLEAN",
            "name": "boolean1",
            "mode": "REQUIRED"
        }, {
            "type": "TIMESTAMP",
            "name": "timestamp1",
            "mode": "REQUIRED"
        }, {
            "type": "DATE",
            "name": "date1",
            "mode": "REQUIRED"
        }, {
            "type": "TIME",
            "name": "time1",
            "mode": "REQUIRED"
        }, {
            "type": "DATETIME",
            "name": "datetime1",
            "mode": "REQUIRED"
        }, {
            "type": "RECORD",
            "name": "record1",
            "mode": "REPEATED",
            "fields": [{
                "type": "BOOLEAN",
                "name": "boolean1",
                "mode": "REQUIRED"
            }, {
                "type": "TIMESTAMP",
                "name": "timestamp1",
                "mode": "REQUIRED"
            }]
        }]
    }
    expected_pa_schema = pa.schema([
        pa.field(name='string1',
                 type=pa.string()
                 # nullable=False
                 ),
        pa.field(name='numeric1',
                 type=pa.int64()
                 # nullable=True
                 ),
        pa.field(name='integer1',
                 type=pa.int64()
                 # nullable=False
                 ),
        pa.field(name='float1',
                 type=pa.float64()
                 # nullable=True
                 ),
        pa.field(name='boolean1',
                 type=pa.bool_()
                 # nullable=False
                 ),
        pa.field(name='timestamp1',
                 type=pa.timestamp('us')
                 # nullable=False
                 ),
        pa.field(name='date1',
                 type=pa.date32()
                 # nullable=False
                 ),
        pa.field(name='time1',
                 type=pa.time64('us')
                 # nullable=False
                 ),
        pa.field(name='datetime1',
                 type=pa.timestamp('us')
                 # nullable=False
                 ),
        pa.field(name='record1',
                 type=pa.list_(
                     pa.struct([
                         pa.field(name='boolean1',
                                  type=pa.bool_()
                                  # nullable=False
                                  ),
                         pa.field(name='timestamp1',
                                  type=pa.timestamp('us')
                                  # nullable=False
                                  )
                     ])))
    ])

    pyarrow_schema = get_pyarrow_translated_schema(string_input_schema)
    self.assertEqual(pyarrow_schema, expected_pa_schema)
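# A minimal sketch (not part of the original tests) of exercising a pyarrow
# schema like expected_pa_schema above outside of Beam: build a one-row table
# for a small subset of the fields and write it to Parquet. The function name,
# sample values, and output path are illustrative assumptions.
def write_sample_parquet_sketch(output_path='/tmp/sample.parquet'):
    import datetime

    import pyarrow as pa
    import pyarrow.parquet as pq

    # Subset of the translated schema used in the test above.
    subset_schema = pa.schema([
        pa.field(name='string1', type=pa.string()),
        pa.field(name='integer1', type=pa.int64()),
        pa.field(name='timestamp1', type=pa.timestamp('us')),
    ])
    # from_pydict converts the columns to the declared schema, so a mismatch
    # between translated types and generated values surfaces here.
    table = pa.Table.from_pydict(
        {
            'string1': ['a'],
            'integer1': [1],
            'timestamp1': [datetime.datetime(2020, 1, 1, 12, 0, 0)],
        },
        schema=subset_schema)
    pq.write_table(table, output_path)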