Code Example #1
# NOTE: helper functions and classes used below (parse_data_generator_args,
# fetch_schema, write_n_line_file_to_gcs, DataGenerator, FakeRowGen,
# EnforcePrimaryKeys, dict_to_csv, fix_record_for_avro, fix_record_for_parquet,
# get_pyarrow_translated_schema) are assumed to be defined elsewhere in this
# module or package.
import json
import os

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import fastavro.schema


def run(argv=None):
    """
    Parses the command-line arguments and runs the Beam pipeline.

    Args:
        argv: list containing the command-line arguments for this call of the
         script.
    """
    # Keeps track of whether the schema was inferred from the input or output
    # table.
    schema_inferred = False

    data_args, pipeline_args = parse_data_generator_args(argv)
    data_args, schema_inferred = fetch_schema(data_args, schema_inferred)
    pipeline_options = PipelineOptions(pipeline_args)

    temp_location = pipeline_options.display_data()['temp_location']
    temp_blob = write_n_line_file_to_gcs(
        pipeline_options.display_data()['project'], temp_location,
        data_args.num_records)

    data_gen = DataGenerator(bq_schema_filename=data_args.schema_file,
                             input_bq_table=data_args.input_bq_table,
                             p_null=data_args.p_null,
                             n_keys=data_args.n_keys,
                             min_date=data_args.min_date,
                             max_date=data_args.max_date,
                             only_pos=data_args.only_pos,
                             max_int=data_args.max_int,
                             max_float=data_args.max_float,
                             float_precision=data_args.float_precision,
                             write_disp=data_args.write_disp,
                             key_skew=data_args.key_skew,
                             primary_key_cols=data_args.primary_key_cols)

    # Initialize the pipeline using the pipeline arguments passed in from the
    # command line. These include where Dataflow should store temp files,
    # the project ID, and which runner to use.
    p = beam.Pipeline(options=pipeline_options)

    rows = (
        p
        # Read the file we created with num_records newlines.
        | 'Read file with num_records lines' >> beam.io.ReadFromText(
            os.path.join('gs://', temp_blob.bucket.name, temp_blob.name))

        # Use our instance of the custom DataGenerator class to generate one
        # fake datum with the appropriate schema for each element in the
        # PCollection created above.
        | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen))
        | 'Parse Json Strings' >> beam.FlatMap(lambda row: [json.loads(row)]))

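    # Optionally apply EnforcePrimaryKeys once per column listed in
    # data_args.primary_key_cols (a comma-separated string).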
    if data_args.primary_key_cols:
        for key in data_args.primary_key_cols.split(','):
            rows |= 'Enforcing primary key: {}'.format(
                key) >> EnforcePrimaryKeys(key)

    if data_args.csv_schema_order:
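        # Reorder each record's fields to match the requested CSV column order
        # and write the results to GCS as .csv shards.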
        (rows
         | 'Order fields for CSV writing.' >> beam.FlatMap(
             lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))])
         | 'Write to GCS' >> beam.io.textio.WriteToText(
             file_path_prefix=data_args.output_prefix, file_name_suffix='.csv')
         )

    if data_args.avro_schema_file:
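        # Load the Avro schema (.avsc) with fastavro so the record fixer and
        # the Avro sink share the same schema.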
        fastavro_avsc = fastavro.schema.load_schema(data_args.avro_schema_file)

        (rows
         # Need to convert timestamps from strings to timestamp-micros.
         | 'Fix date and time Types for Avro.' >>
         beam.FlatMap(lambda row: fix_record_for_avro(row, fastavro_avsc))
         | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.avro',
             use_fastavro=True,
             schema=fastavro_avsc))

    if data_args.write_to_parquet:
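        # Load the BigQuery JSON schema and translate it into an equivalent
        # pyarrow schema for the Parquet sink.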
        with open(data_args.schema_file, 'r') as infile:
            str_schema = json.load(infile)
        pa_schema = get_pyarrow_translated_schema(str_schema)
        (rows
         | 'Fix date and time Types for Parquet.' >>
         beam.FlatMap(lambda row: fix_record_for_parquet(row, str_schema))
         | 'Write to Parquet.' >> beam.io.WriteToParquet(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.parquet',
             schema=pa_schema))

    if data_args.output_bq_table:
        (rows
         | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command
             # line.
             data_args.output_bq_table,
             schema=None if schema_inferred else data_gen.get_bq_schema(),
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=data_gen.write_disp,
             # Use the max recommended batch size.
             batch_size=500))

    p.run().wait_until_finish()

    # Manually clean up temp_num_records.txt because it is outside this job's
    # directory and Dataflow will not remove it for us.
    temp_blob.delete()
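
The pipeline above applies EnforcePrimaryKeys once per configured key column, but that transform is defined elsewhere in the generator package. As a rough illustration only, a composite transform that keeps a single row per key value could look like the following sketch; the behavior (deduplication by key) is an assumption based solely on how the transform is used in run(), not necessarily how the real transform is implemented.

import apache_beam as beam


class EnforcePrimaryKeys(beam.PTransform):
    """Hypothetical sketch: keep one row per value of `key` (assumed behavior)."""

    def __init__(self, key):
        self.key = key

    def expand(self, pcoll):
        return (
            pcoll
            # Key each row dict by the value of the primary-key column.
            | 'Key by {}'.format(self.key) >> beam.Map(
                lambda row: (row[self.key], row))
            # Collect all rows that share the same key value.
            | 'Group by {}'.format(self.key) >> beam.GroupByKey()
            # Emit a single row per key so the column ends up unique.
            | 'Take one per {}'.format(self.key) >> beam.Map(
                lambda kv: list(kv[1])[0]))
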
Code Example #2
    def test_get_pyarrow_translated_schema(self):
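        """BigQuery JSON schema fields should map to the expected pyarrow types."""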

        string_input_schema = {
            "fields": [{
                "type": "STRING",
                "name": "string1",
                "mode": "REQUIRED"
            }, {
                "type": "NUMERIC",
                "name": "numeric1",
                "mode": "NULLABLE"
            }, {
                "type": "INTEGER",
                "name": "integer1",
                "mode": "REQUIRED"
            }, {
                "type": "FLOAT",
                "name": "float1",
                "mode": "NULLABLE"
            }, {
                "type": "BOOLEAN",
                "name": "boolean1",
                "mode": "REQUIRED"
            }, {
                "type": "TIMESTAMP",
                "name": "timestamp1",
                "mode": "REQUIRED"
            }, {
                "type": "DATE",
                "name": "date1",
                "mode": "REQUIRED"
            }, {
                "type": "TIME",
                "name": "time1",
                "mode": "REQUIRED"
            }, {
                "type": "DATETIME",
                "name": "datetime1",
                "mode": "REQUIRED"
            }, {
                "type":
                "RECORD",
                "name":
                "record1",
                "mode":
                "REPEATED",
                "fields": [{
                    "type": "BOOLEAN",
                    "name": "boolean1",
                    "mode": "REQUIRED"
                }, {
                    "type": "TIMESTAMP",
                    "name": "timestamp1",
                    "mode": "REQUIRED"
                }]
            }]
        }
        expected_pa_schema = pa.schema([
            pa.field(name='string1', type=pa.string()),            # nullable=False
            pa.field(name='numeric1', type=pa.int64()),            # nullable=True
            pa.field(name='integer1', type=pa.int64()),            # nullable=False
            pa.field(name='float1', type=pa.float64()),            # nullable=True
            pa.field(name='boolean1', type=pa.bool_()),            # nullable=False
            pa.field(name='timestamp1', type=pa.timestamp('us')),  # nullable=False
            pa.field(name='date1', type=pa.date32()),              # nullable=False
            pa.field(name='time1', type=pa.time64('us')),          # nullable=False
            pa.field(name='datetime1', type=pa.timestamp('us')),   # nullable=False
            pa.field(
                name='record1',
                type=pa.list_(
                    pa.struct([
                        pa.field(name='boolean1', type=pa.bool_()),           # nullable=False
                        pa.field(name='timestamp1', type=pa.timestamp('us'))  # nullable=False
                    ])))
        ])

        pyarrow_schema = get_pyarrow_translated_schema(string_input_schema)
        self.assertEqual(pyarrow_schema, expected_pa_schema)
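
For reference, the type mapping this test expects can be read off directly from expected_pa_schema. A minimal sketch of get_pyarrow_translated_schema consistent with that mapping is shown below; it is reconstructed from the test alone and is not necessarily identical to the production implementation (which may handle NUMERIC, nullability, and error cases differently).

import pyarrow as pa

# BigQuery type -> pyarrow type, as implied by the expected schema above.
_BQ_TO_PA = {
    'STRING': pa.string(),
    'NUMERIC': pa.int64(),
    'INTEGER': pa.int64(),
    'FLOAT': pa.float64(),
    'BOOLEAN': pa.bool_(),
    'TIMESTAMP': pa.timestamp('us'),
    'DATE': pa.date32(),
    'TIME': pa.time64('us'),
    'DATETIME': pa.timestamp('us'),
}


def get_pyarrow_translated_schema(bq_schema):
    """Translate a BigQuery JSON schema dict into a pyarrow.Schema (sketch)."""

    def to_field(field):
        if field['type'] == 'RECORD':
            # A RECORD becomes a struct of its child fields; REPEATED records
            # become a list of such structs.
            struct_type = pa.struct([to_field(f) for f in field['fields']])
            pa_type = (pa.list_(struct_type)
                       if field['mode'] == 'REPEATED' else struct_type)
        else:
            pa_type = _BQ_TO_PA[field['type']]
        return pa.field(field['name'], pa_type)

    return pa.schema([to_field(f) for f in bq_schema['fields']])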