def run(argv=None): """Test Avro IO (backed by fastavro or Apache Avro) on a simple pipeline that transforms bitcoin transactions""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://beam-avro-test/bitcoin/txns/*', help='Input file(s) to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') parser.add_argument('--compress', dest='compress', required=False, action='store_true', help='When set, compress the output data') parser.add_argument('--fastavro', dest='use_fastavro', required=False, action='store_true', help='When set, use fastavro for Avro I/O') opts, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the avro file[pattern] into a PCollection. records = \ p | 'read' >> ReadFromAvro(opts.input, use_fastavro=opts.use_fastavro) measured = records | 'scan' >> beam.ParDo(BitcoinTxnCountDoFn()) # pylint: disable=expression-not-assigned measured | 'write' >> \ WriteToAvro( opts.output, schema=SCHEMA, codec=('deflate' if opts.compress else 'null'), use_fastavro=opts.use_fastavro ) result = p.run() result.wait_until_finish() # Do not query metrics when creating a template which doesn't run if (not hasattr(result, 'has_job') # direct runner or result.has_job): # not just a template creation metrics = result.metrics().query() for counter in metrics['counters']: logging.info("Counter: %s", counter) for dist in metrics['distributions']: logging.info("Distribution: %s", dist)
def run():
    argv = [
        '--project={0}'.format(PROJECT),
        '--staging_location=gs://{0}/staging/'.format(BUCKET),
        '--temp_location=gs://{0}/staging/'.format(BUCKET),
        '--runner=DataflowRunner'
    ]

    # Apache Beam pipeline: read Avro records from GCS and load them into a
    # BigQuery table, running on the Dataflow runner.
    p = beam.Pipeline(argv=argv)

    (p
     | 'ReadAvroFromGCS' >> ReadFromAvro('gs://dataflow-excercise/test-dataset.avro')
     | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
         '{0}:apache_beam.avro_dataflow2'.format(PROJECT),
         schema=table_schema)
    )

    p.run()
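# A sketch of the context the snippet above relies on. PROJECT, BUCKET and
# table_schema are referenced but not defined in the snippet; the values
# below are hypothetical placeholders, not taken from the original.
import apache_beam as beam
from apache_beam.io.avroio import ReadFromAvro

PROJECT = 'my-gcp-project'     # hypothetical GCP project id
BUCKET = 'my-staging-bucket'   # hypothetical GCS bucket for staging/temp files

# Hypothetical BigQuery schema string; the real schema must match the fields
# of the Avro records in the input file.
table_schema = 'txn_id:STRING,amount:FLOAT,timestamp:TIMESTAMP'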