def run(argv=None): """ If you want to run just this file for rapid development, change runner to 'DirectRunner' and add GCS paths for --input and --avro_output, e.g. python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/ """ known_args, pipeline_options, avro_schema = generate_args( job_name='parking-meters-dataflow', bucket='{}_parking'.format(os.environ['GCS_PREFIX']), argv=argv, schema_name='parking_meters' ) with beam.Pipeline(options=pipeline_options) as p: field_name_swaps = [("longitude", "long"), ("latitude", "lat")] type_changes = [("long", "float"), ("lat", "float")] lines = p | ReadFromText(known_args.input, coder=JsonCoder()) load = ( lines | beam.ParDo(SwapFieldNames(field_name_swaps)) | beam.ParDo(ChangeDataTypes(type_changes)) | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
def run(argv=None): """ If you want to run just this file for rapid development, pass the arg '-r DirectRunner' and add GCS paths for --input and --avro_output, e.g. python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/ -r DirectRunner """ known_args, pipeline_options, avro_schema = generate_args( job_name='wprdc-fire-dataflow', bucket='{}_ems_fire'.format(os.environ['GCS_PREFIX']), argv=argv, schema_name='ems_calls') with beam.Pipeline(options=pipeline_options) as p: field_name_swaps = [("census_block_group_center__x", "long"), ("census_block_group_center__y", "lat")] lines = p | ReadFromText(known_args.input, coder=JsonCoder()) load = (lines | beam.ParDo(SwapFieldNames(field_name_swaps)) | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='computronix-domi-permits',
        bucket='{}_computronix'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='domi_permits_computronix')

    with beam.Pipeline(options=pipeline_options) as p:

        field_name_swaps = [('PERMITTYPEPERMITTYPE', 'PERMITTYPE'),
                            ('TYPEOFWORKDESCRIPTION', 'WORKTYPE'),
                            ('APPLICANTCUSTOMFORMATTEDNAME', 'APPLICANTNAME'),
                            ('ALLCONTRACTORSNAME', 'CONTRACTORNAME'),
                            ('SPECIALPERMITINSTRUCTIONS', 'SPECIALINSTRUCTIONS'),
                            ('STATUSDESCRIPTION', 'STATUS')]

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(ParseNestedFields())
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | beam.ParDo(GeocodeAddress())
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
def run(argv=None): """Test Avro IO (backed by fastavro or Apache Avro) on a simple pipeline that transforms bitcoin transactions""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://beam-avro-test/bitcoin/txns/*', help='Input file(s) to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') parser.add_argument('--compress', dest='compress', required=False, action='store_true', help='When set, compress the output data') parser.add_argument('--fastavro', dest='use_fastavro', required=False, action='store_true', help='When set, use fastavro for Avro I/O') opts, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the avro file[pattern] into a PCollection. records = \ p | 'read' >> ReadFromAvro(opts.input, use_fastavro=opts.use_fastavro) measured = records | 'scan' >> beam.ParDo(BitcoinTxnCountDoFn()) # pylint: disable=expression-not-assigned measured | 'write' >> \ WriteToAvro( opts.output, schema=SCHEMA, codec=('deflate' if opts.compress else 'null'), use_fastavro=opts.use_fastavro ) result = p.run() result.wait_until_finish() # Do not query metrics when creating a template which doesn't run if (not hasattr(result, 'has_job') # direct runner or result.has_job): # not just a template creation metrics = result.metrics().query() for counter in metrics['counters']: logging.info("Counter: %s", counter) for dist in metrics['distributions']: logging.info("Distribution: %s", dist)
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='accela-permits',
        bucket='{}_accela'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='accela_permits'
    )

    with beam.Pipeline(options=pipeline_options) as p:

        exclude_fields = [
            'module',
            'serviceProviderCode',
            'undistributedCost',
            'totalJobCost',
            'recordClass',
            'reportedChannel',
            'closedByDepartment',
            'estimatedProductionUnit',
            'actualProductionUnit',
            'createdByCloning',
            'closedByUser',
            'trackingId',
            'initiatedProduct',
            'createdBy',
            'value',
            'balance',
            'booking',
            'infraction',
            'misdemeanor',
            'offenseWitnessed',
            'defendantSignature',
            'parcels',
            'id',
            'statusDate',
            'jobValue',
            'reportedDate'
        ]

        address_field = 'address'

        field_name_swaps = [
            ('customId', 'id'),
            ('totalPay', 'total_paid')
        ]

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
            lines
            | beam.ParDo(FilterInvalidRecord())
            | beam.ParDo(FilterFields(exclude_fields))
            | beam.ParDo(ParseNestedFields())
            | beam.ParDo(GeocodeAddress(address_field))
            | beam.ParDo(SwapFieldNames(field_name_swaps))
            | beam.ParDo(ColumnsCamelToSnakeCase())
            | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='twilio-311-dataflow',
        bucket='{}_twilio'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='twilio_reports'
    )

    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
            lines
            | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='firearms-dataflow',
        bucket='{}_firearm_seizures'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='firearm_seizures'
    )

    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, skip_header_lines=1)

        load = (
            lines
            | beam.ParDo(ConvertToDicts())
            | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='computronix-trades-dataflow',
        bucket='{}_computronix'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='trade_licenses_computronix')

    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(FormatColumnNames())
                | beam.ParDo(ConvertTypes())
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_311/requests/{}/{}/{}_requests.json'.format(
            os.environ['GCS_PREFIX'],
            dt.strftime('%Y'),
            dt.strftime('%m').lower(),
            dt.strftime("%Y-%m-%d")),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_311/requests/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'],
            dt.strftime('%Y'),
            dt.strftime('%m').lower(),
            dt.strftime("%Y-%m-%d")),
        help='Output directory to write avro files.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on on-prem network when route is opened
    # Use runner=DataflowRunner to run in GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('qalert-requests-dataflow',
                      '{}_311'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('City_of_Pittsburgh_QAlert_Requests')

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(GetStatus())
                | beam.ParDo(CleanLatLong())
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
def run(argv=None):

    known_args, pipeline_options, avro_schema = dataflow_utils.generate_args(
        job_name='comm-ctr-attendance-dataflow',
        bucket='{}_community_centers'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='community_center_attendance')

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=dataflow_utils.JsonCoder())

        load = (lines
                | beam.ParDo(ColumnsCamelToSnakeCase())
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_firearm_seizures/{}/{}/{}_firearm_seizures.csv'.format(
            os.environ['GCS_PREFIX'],
            dt.strftime('%Y'),
            dt.strftime('%m').lower(),
            dt.strftime("%Y-%m-%d")),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_firearm_seizures/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'],
            dt.strftime('%Y'),
            dt.strftime('%m').lower(),
            dt.strftime("%Y-%m-%d")),
        help='Output directory to write avro files.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on on-prem network when route is opened
    # Use runner=DataflowRunner to run in GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('firearms-dataflow',
                      '{}_firearm_seizures'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('firearm_seizures')

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, skip_header_lines=1)

        load = (lines
                | beam.ParDo(ConvertToDicts())
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='registered-businesses-dataflow',
        bucket='{}_finance'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='registered_businesses')

    with beam.Pipeline(options=pipeline_options) as p:

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(ColumnsToLowerCase())
                | beam.ParDo(ParseAddress())
                | beam.ParDo(NormalizeAddress(StaticValueProvider(str, 'address_full')))
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
def run(argv=None): """ If you want to run just this file for rapid development, change runner to 'DirectRunner' and add GCS paths for --input and --avro_output, e.g. python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/ """ known_args, pipeline_options, avro_schema = generate_args( job_name='parking-transactions-dataflow', bucket='{}_parking'.format(os.environ['GCS_PREFIX']), argv=argv, schema_name='parking_transactions' ) with beam.Pipeline(options=pipeline_options) as p: lines = p | ReadFromText(known_args.input, coder=JsonCoder()) load = ( lines | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='police-blotter-30-day-dataflow',
        bucket='{}_police'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='30_day_police_blotter')

    with beam.Pipeline(options=pipeline_options) as p:

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        data_type_changes = [('CCR', 'int'), ('TRACT', 'int')]
        field_name_swaps = [('PK', 'id')]

        load = (lines
                | beam.ParDo(CleanPKs())
                | beam.ParDo(ChangeDataTypes(data_type_changes))
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro',
                              use_fastavro=True))
def run(argv=None): """ If you want to run just this file for rapid development, add the arg '-r DirectRunner' and add GCS paths for --input and --avro_output, e.g. python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/ -r DirectRunner """ known_args, pipeline_options, avro_schema = generate_args( job_name='qalert-requests-dataflow', bucket='{}_qalert'.format(os.environ['GCS_PREFIX']), argv=argv, schema_name='City_of_Pittsburgh_QAlert_Requests') with beam.Pipeline(options=pipeline_options) as p: date_conversions = [('lastActionUnix', 'lastAction'), ('addDateUnix', 'createDate')] field_name_swaps = [('addDateUnix', 'createDateUnix'), ('status', 'statusCode'), ('latitude', 'lat'), ('longitude', 'long'), ('master', 'masterRequestId'), ('typeId', 'requestTypeId'), ('typeName', 'requestType')] lines = p | ReadFromText(known_args.input, coder=JsonCoder()) load = (lines | beam.ParDo(GetDateStrings(date_conversions)) | beam.ParDo(SwapFieldNames(field_name_swaps)) | beam.ParDo(GetStatus()) | beam.ParDo(GetClosedDate()) | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
def run(argv=None): """Main entry point""" parser = argparse.ArgumentParser() # parser.add_argument('--project', type=str, required=False, help='project') parser.add_argument( '--records', dest='records', type=int, # default='gs://dataflow-samples/shakespeare/kinglear.txt', default='10', # gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt help='Number of records to be generate') parser.add_argument('--output', dest='output', required=False, default='./', help='Output file to write results to.') # Parse arguments from the command line. known_args, pipeline_args = parser.parse_known_args(argv) # Store the CLI arguments to variables # project_id = known_args.project # Setup the dataflow pipeline options pipeline_options = PipelineOptions(pipeline_args) # pipeline_options.view_as(SetupOptions).save_main_session = True # google_cloud_options = pipeline_options.view_as(GoogleCloudOptions) # google_cloud_options.project = project_id save_main_session = True pipeline_options.view_as( SetupOptions).save_main_session = save_main_session # SCHEMA_STRING = ''' # {"namespace": "example.avro", # "type": "record", # "name": "User", # "fields": [ # {"name": "ACNO", "type": "int"}, # {"name": "PRIN_BAL", "type": "int"}, # {"name": "FEE_ANT", "default": null, "type": ["null", "double"]}, # {"name": "GENDER", "default": null, "type": ["null", {"logicalType": "char", "type": "string", "maxLength": 1}]} # ] # } # ''' SCHEMA = { "namespace": "example.avro", "type": "record", "name": "User", "fields": [{ "name": "ACNO", "type": [ "null", { "logicalType": "char", "type": "string", "maxLength": 20 } ] }, { "name": "FIELD_1", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_2", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }] } # {"name": "GENDER', "type": "string"} # {"name": "FEE_ANT", "type": "long"} # p = beam.Pipeline(options=pipeline_options) rec_cnt = known_args.records with beam.Pipeline(options=pipeline_options) as p: left_pcol_name = 'p1' file = p | 'read_source' >> beam.io.ReadFromAvro( "./data/account_id_schema_new.avro") p1 = file | beam.Map(lambda x: { 'ACNO': x['ACNO'], 'FIELD_1': x["FIELD_1"] }) p2 = file | beam.Map(lambda x: { 'ACNO': x['ACNO'], 'FIELD_2': x["FIELD_2"] }) P1_1 = p1 | "write" >> beam.io.WriteToText('./data.csv') P2_2 = p2 | "write2" >> beam.io.WriteToText('./data2.csv') right_pcol_name = 'p2' join_keys = { left_pcol_name: [ 'ACNO' # 't1_col_B' ], right_pcol_name: [ 'ACNO' # 't2_col_B' ] } pipelines_dictionary = {left_pcol_name: p1, right_pcol_name: p2} test_pipeline = pipelines_dictionary | 'left join' >> Join( left_pcol_name=left_pcol_name, left_pcol=p1, right_pcol_name=right_pcol_name, right_pcol=p2, join_type='left', join_keys=join_keys) print(type(test_pipeline)) test_pipeline | "print" >> beam.io.WriteToText('./test.csv') compressIdc = True use_fastavro = True # test_pipeline | 'write_fastavro' >> WriteToAvro( known_args.output, # '/tmp/dataflow/{}/{}'.format( # 'demo', 'output'), # parse_schema(json.loads(SCHEMA_STRING)), parse_schema(SCHEMA), use_fastavro=use_fastavro, file_name_suffix='.avro', codec=('deflate' if compressIdc else 'null'), ) result = p.run() result.wait_until_finish()
def run(argv=None): """Main entry point""" parser = argparse.ArgumentParser() # parser.add_argument('--project', type=str, required=False, help='project') parser.add_argument( '--records', dest='records', type=int, # default='gs://dataflow-samples/shakespeare/kinglear.txt', default='10', # gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt help='Number of records to be generate') parser.add_argument('--output', dest='output', required=False, default='./', help='Output file to write results to.') # Parse arguments from the command line. known_args, pipeline_args = parser.parse_known_args(argv) # Store the CLI arguments to variables # project_id = known_args.project # Setup the dataflow pipeline options pipeline_options = PipelineOptions(pipeline_args) # pipeline_options.view_as(SetupOptions).save_main_session = True # google_cloud_options = pipeline_options.view_as(GoogleCloudOptions) # google_cloud_options.project = project_id save_main_session = True pipeline_options.view_as( SetupOptions).save_main_session = save_main_session print(pipeline_args) SCHEMA = { "namespace": "example.avro", "type": "record", "name": "User", "fields": [{ "name": "ACNO", "type": [ "null", { "logicalType": "char", "type": "string", "maxLength": 20 } ] }, { "name": "NUM_OF_MTHS_PD_30", "type": ["null", 'int', 'string'] }, { "name": "FIELD_1", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_2", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_3", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_4", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_5", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_6", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_7", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_8", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_9", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_10", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }] } rec_cnt = known_args.records with beam.Pipeline(options=pipeline_options) as p: left_pcol_name = 'p1' file = p | 'read_source' >> beam.io.ReadFromAvro( "./data/Curr_account.avro") | beam.Distinct() file2 = p | 'read_source2' >> beam.io.ReadFromAvro( "./data/Prev_account.avro") p1 = file | 'filter fields' >> beam.Filter( lambda x: int(x['NUM_OF_MTHS_PD_30']) >= 0) p2 = file2 | 'filter fields2' >> beam.Filter( lambda x: int(x['NUM_OF_MTHS_PD_30']) >= 0) # P1_1 = p1 | "write" >> beam.io.WriteToText('./data.csv') # P2_2 = p2 | "write2" >> beam.io.WriteToText('./data2.csv') right_pcol_name = 'p2' join_keys = { left_pcol_name: [ 'ACNO' # 't1_col_B' ], right_pcol_name: [ 'ACNO' # 't2_col_B' ] } pipelines_dictionary = {left_pcol_name: p1, right_pcol_name: p2} test_pipeline = pipelines_dictionary | 'left join' >> Join( left_pcol_name=left_pcol_name, left_pcol=p1, right_pcol_name=right_pcol_name, right_pcol=p2, join_type='left', join_keys=join_keys) test_pipeline | 'add 1 to NUM_OF_MTHS_PD_30' >> beam.Map( add_one) | "write4" >> beam.io.WriteToText('./data4.csv') print(type(test_pipeline)) compressIdc = True use_fastavro = True # test_pipeline | 'write_fastavro' >> 
WriteToAvro( known_args.output, parse_schema(SCHEMA), use_fastavro=use_fastavro, file_name_suffix='.avro', codec=('deflate' if compressIdc else 'null'), ) result = p.run() result.wait_until_finish()
def run(argv=None): """Main entry point""" parser = argparse.ArgumentParser() parser.add_argument('--project', default='mvp-project-273913', type=str, required=False, help='project') parser.add_argument('--job_name', default='rpm', type=str) parser.add_argument('--worker_node', default='n1-standard-4') parser.add_argument('--temp_location', default='gs://zz_michael/dataflow_s/tmp') parser.add_argument('--location', default='gcs') parser.add_argument('--region', default='asia-east1') parser.add_argument('--staging_location', default='gs://zz_michael/dataflow_s/stage') parser.add_argument( '--output', required=False, default= 'gs://zz_michael/dataflow_s/RPM/output/account_id_schema_output.avro', help='Output file to write results to.') parser.add_argument( '--input', default='gs://zz_michael/dataflow_s/RPM/Curr_account.avro', help='input file to write results to.') parser.add_argument( '--input2', default='gs://zz_michael/dataflow_s/RPM/Prev_account.avro', help='input file to write results to.') # Parse arguments from the command line. # known_args, pipeline_args = parser.parse_known_args(argv) args = parser.parse_args() dataflow_options = [ '--project=%s' % (args.project), '--job_name=%s' % (args.job_name), '--temp_location=%s' % (args.temp_location), '--worker_machine_type=%s' % (args.worker_node), '--region=%s' % (args.region) ] dataflow_options.append('--staging_location=%s' % (args.staging_location)) options = PipelineOptions(dataflow_options) gcloud_options = options.view_as(GoogleCloudOptions) options.view_as(StandardOptions).runner = "dataflow" table_schema = { 'fields': [{ "name": "ACNO", "type": 'INTEGER', 'mode': 'NULLABLE' }, { "name": "NUM_OF_MTHS_PD_30", "type": 'INTEGER', 'mode': 'NULLABLE' }, { "name": "FIELD_1", "type": 'FLOAT', 'mode': 'NULLABLE' }, { "name": "FIELD_2", "type": 'FLOAT', 'mode': 'NULLABLE' }, { "name": "FIELD_3", "type": 'FLOAT', 'mode': 'NULLABLE' }, { "name": "FIELD_4", "type": 'FLOAT', 'mode': 'NULLABLE' }, { "name": "FIELD_5", "type": 'FLOAT', 'mode': 'NULLABLE' }, { "name": "FIELD_6", "type": 'FLOAT', 'mode': 'NULLABLE' }, { "name": "FIELD_7", "type": 'FLOAT', 'mode': 'NULLABLE' }, { "name": "FIELD_8", "type": 'FLOAT', 'mode': 'NULLABLE' }, { "name": "FIELD_9", "type": 'FLOAT', 'mode': 'NULLABLE' }, { "name": "FIELD_10", "type": 'FLOAT', 'mode': 'NULLABLE' }] } SCHEMA = { "namespace": "example.avro", "type": "record", "name": "User", "fields": [{ "name": "ACNO", "type": [ "null", { "logicalType": "char", "type": "string", "maxLength": 20 } ] }, { "name": "NUM_OF_MTHS_PD_30", "type": ["null", 'int', 'string'] }, { "name": "FIELD_1", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_2", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_3", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_4", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_5", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_6", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_7", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_8", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_9", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_10", 
"type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }] } with beam.Pipeline(options=options) as p: left_pcol_name = 'p1' Curr_Month = p | 'read_Curr_Month' >> beam.io.ReadFromAvro(args.input) Prev_Month = p | 'read_Prev_Month' >> beam.io.ReadFromAvro(args.input2) p1 = Curr_Month | 'select fields from Curr_Month' >> beam.Filter( lambda x: int(x['NUM_OF_MTHS_PD_30']) >= 0) p2 = Prev_Month | 'select fields2 from Prev_Month' >> beam.Filter( lambda x: int(x['NUM_OF_MTHS_PD_30']) >= 0) right_pcol_name = 'p2' join_keys = { left_pcol_name: [ 'ACNO' # 't1_col_B' ], right_pcol_name: [ 'ACNO' # 't2_col_B' ] } joinkey_dict = {left_pcol_name: p1, right_pcol_name: p2} joined_data = joinkey_dict | 'left join' >> Join( left_pcol_name=left_pcol_name, left_pcol=p1, right_pcol_name=right_pcol_name, right_pcol=p2, join_type='left', join_keys=join_keys) derived_result = joined_data | 'Transform (add 1 to fileld)' >> beam.Map( cycle_dlqn) print(type(joined_data)) compressIdc = True use_fastavro = True compressIdc = True use_fastavro = True if args.location == "bigquery": derived_result | beam.io.WriteToBigQuery( table_spec, schema=table_schema, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED ) else: derived_result | 'Write out Stage' >> WriteToAvro( args.output, parse_schema(SCHEMA), use_fastavro=use_fastavro, file_name_suffix='.avro', codec=('deflate' if compressIdc else 'null'), ) result = p.run() result.wait_until_finish()
def run(argv=None): """Main entry point""" parser = argparse.ArgumentParser() # parser.add_argument('--project', type=str, required=False, help='project') parser.add_argument( '--records', dest='records', type=int, # default='gs://dataflow-samples/shakespeare/kinglear.txt', default='10', # gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt help='Number of records to be generate') parser.add_argument('--output', dest='output', required=False, default='/tmp/dataflow/demo/output', help='Output file to write results to.') # Parse arguments from the command line. known_args, pipeline_args = parser.parse_known_args(argv) # Store the CLI arguments to variables # project_id = known_args.project # Setup the dataflow pipeline options pipeline_options = PipelineOptions(pipeline_args) # pipeline_options.view_as(SetupOptions).save_main_session = True # google_cloud_options = pipeline_options.view_as(GoogleCloudOptions) # google_cloud_options.project = project_id save_main_session = True pipeline_options.view_as( SetupOptions).save_main_session = save_main_session # SCHEMA_STRING = ''' # {"namespace": "example.avro", # "type": "record", # "name": "User", # "fields": [ # {"name": "ACNO", "type": "int"}, # {"name": "PRIN_BAL", "type": "int"}, # {"name": "FEE_ANT", "default": null, "type": ["null", "double"]}, # {"name": "GENDER", "default": null, "type": ["null", {"logicalType": "char", "type": "string", "maxLength": 1}]} # ] # } # ''' SCHEMA = { "namespace": "example.avro", "type": "record", "name": "User", "fields": [{ "name": "ACNO", "type": "int" }, { "name": "PRIN_BAL", "type": "int" }, { "name": "FEE_ANT", "default": 'null', "type": ["null", "double"] }, { "name": "GENDER", "default": 'null', "type": [ "null", { "logicalType": "char", "type": "string", "maxLength": 1 } ] }] } # {"name": "GENDER', "type": "string"} # {"name": "FEE_ANT", "type": "long"} # p = beam.Pipeline(options=pipeline_options) rec_cnt = known_args.records with beam.Pipeline(options=pipeline_options) as p: left_pcol_name = 'p1' p1 = p | 'Create source data' >> beam.Create( [{ 'ACNO': i + 1, 'PRIN_BAL': i + 1, 'GENDER1': 'Y', 'GENDER': random.choice(['Y', 'N']), } for i in range(rec_cnt)]) right_pcol_name = 'p2' p2 = p | 'Create join data' >> beam.Create( [{ 'ACNO': i + 1, 'FEE_ANT': random.random() * 100000000, } for i in range(rec_cnt)]) join_keys = { left_pcol_name: [ 'ACNO' # 't1_col_B' ], right_pcol_name: [ 'ACNO' # 't2_col_B' ] } pipelines_dictionary = {left_pcol_name: p1, right_pcol_name: p2} test_pipeline = pipelines_dictionary | 'left join' >> Join( left_pcol_name=left_pcol_name, left_pcol=p1, right_pcol_name=right_pcol_name, right_pcol=p2, join_type='left', join_keys=join_keys) # test_pipeline | "print" >> beam.Map(printfn) compressIdc = True use_fastavro = True test_pipeline | 'write_fastavro' >> WriteToAvro( known_args.output, # '/tmp/dataflow/{}/{}'.format( # 'demo', 'output'), # parse_schema(json.loads(SCHEMA_STRING)), parse_schema(SCHEMA), use_fastavro=use_fastavro, file_name_suffix='.avro', codec=('deflate' if compressIdc else 'null'), )
def run(known_args, pipeline_args):
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    logging.getLogger().setLevel(logging.INFO)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (lines
              | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_text(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    # Format the counts into a PCollection of dictionaries.
    def format_dict(word_count):
        (word, count) = word_count
        row = dict(zip(HEADER, [word, count]))
        return row

    if known_args.format == 'text':
        output = counts | 'format text' >> beam.Map(format_text)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write text' >> WriteToText(known_args.output)

    elif known_args.format == 'avro':
        output = counts | 'format avro' >> beam.Map(format_dict)

        schema = avro.schema.parse(json.dumps(AVRO_SCHEMA))

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write avro' >> WriteToAvro(
            file_path_prefix=known_args.output,
            schema=schema,
            codec=DEFAULT_CODEC)

    else:
        output = counts | 'format parquet' >> beam.Map(format_dict)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write parquet' >> WriteToParquet(
            file_path_prefix=known_args.output,
            schema=PARQUET_SCHEMA,
            codec=DEFAULT_CODEC)

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d', empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d', word_lengths_dist.result.mean)
def test_avro_it(self):
    num_records = self.test_pipeline.get_option('records')
    num_records = int(num_records) if num_records else 1000000

    # Seed a `PCollection` with indices that will each be FlatMap'd into
    # `batch_size` records, to avoid having a too-large list in memory at
    # the outset
    batch_size = self.test_pipeline.get_option('batch-size')
    batch_size = int(batch_size) if batch_size else 10000

    # pylint: disable=range-builtin-not-iterating
    batches = range(int(num_records / batch_size))

    def batch_indices(start):
        # pylint: disable=range-builtin-not-iterating
        return range(start * batch_size, (start + 1) * batch_size)

    # A `PCollection` with `num_records` avro records
    records_pcoll = \
        self.test_pipeline \
        | 'create-batches' >> Create(batches) \
        | 'expand-batches' >> FlatMap(batch_indices) \
        | 'create-records' >> Map(record)

    fastavro_output = '/'.join([self.output, 'fastavro'])
    avro_output = '/'.join([self.output, 'avro'])

    self.addCleanup(delete_files, [self.output + '*'])

    # pylint: disable=expression-not-assigned
    records_pcoll \
        | 'write_fastavro' >> WriteToAvro(
            fastavro_output,
            self.SCHEMA,
            use_fastavro=True
        )

    # pylint: disable=expression-not-assigned
    records_pcoll \
        | 'write_avro' >> WriteToAvro(
            avro_output,
            self.SCHEMA,
            use_fastavro=False
        )

    result = self.test_pipeline.run()
    result.wait_until_finish()
    assert result.state == PipelineState.DONE

    fastavro_read_pipeline = TestPipeline(is_integration_test=True)

    fastavro_records = \
        fastavro_read_pipeline \
        | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
        | 'read-fastavro' >> ReadAllFromAvro(use_fastavro=True) \
        | Map(lambda rec: (rec['number'], rec))

    avro_records = \
        fastavro_read_pipeline \
        | 'create-avro' >> Create(['%s*' % avro_output]) \
        | 'read-avro' >> ReadAllFromAvro(use_fastavro=False) \
        | Map(lambda rec: (rec['number'], rec))

    def check(elem):
        v = elem[1]

        def assertEqual(l, r):
            if l != r:
                raise BeamAssertException('Assertion failed: %s == %s' % (l, r))

        assertEqual(v.keys(), ['avro', 'fastavro'])
        avro_values = v['avro']
        fastavro_values = v['fastavro']
        assertEqual(avro_values, fastavro_values)
        assertEqual(len(avro_values), 1)

    # pylint: disable=expression-not-assigned
    {
        'avro': avro_records,
        'fastavro': fastavro_records
    } \
        | CoGroupByKey() \
        | Map(check)

    fastavro_read_pipeline.run().wait_until_finish()
    assert result.state == PipelineState.DONE
def test_avro_it(self):
    num_records = self.test_pipeline.get_option('records')
    num_records = int(num_records) if num_records else 1000000
    fastavro_output = '/'.join([self.output, 'fastavro'])

    # Seed a `PCollection` with indices that will each be FlatMap'd into
    # `batch_size` records, to avoid having a too-large list in memory at
    # the outset
    batch_size = self.test_pipeline.get_option('batch-size')
    batch_size = int(batch_size) if batch_size else 10000

    # pylint: disable=bad-option-value
    batches = range(int(num_records / batch_size))

    def batch_indices(start):
        # pylint: disable=bad-option-value
        return range(start * batch_size, (start + 1) * batch_size)

    # A `PCollection` with `num_records` avro records
    records_pcoll = \
        self.test_pipeline \
        | 'create-batches' >> Create(batches) \
        | 'expand-batches' >> FlatMap(batch_indices) \
        | 'create-records' >> Map(record)

    # pylint: disable=expression-not-assigned
    records_pcoll \
        | 'write_fastavro' >> WriteToAvro(
            fastavro_output,
            parse_schema(json.loads(self.SCHEMA_STRING)),
        )

    result = self.test_pipeline.run()
    result.wait_until_finish()

    fastavro_pcoll = self.test_pipeline \
        | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
        | 'read-fastavro' >> ReadAllFromAvro()

    mapped_fastavro_pcoll = fastavro_pcoll | "map_fastavro" >> Map(lambda x: (x['number'], x))
    mapped_record_pcoll = records_pcoll | "map_record" >> Map(lambda x: (x['number'], x))

    def validate_record(elem):
        v = elem[1]

        def assertEqual(l, r):
            if l != r:
                raise BeamAssertException('Assertion failed: %s == %s' % (l, r))

        assertEqual(sorted(v.keys()), ['fastavro', 'record_pcoll'])
        record_pcoll_values = v['record_pcoll']
        fastavro_values = v['fastavro']
        assertEqual(record_pcoll_values, fastavro_values)
        assertEqual(len(record_pcoll_values), 1)

    {
        "record_pcoll": mapped_record_pcoll,
        "fastavro": mapped_fastavro_pcoll
    } | CoGroupByKey() | Map(validate_record)

    result = self.test_pipeline.run()
    result.wait_until_finish()

    self.addCleanup(delete_files, [self.output])
    assert result.state == PipelineState.DONE
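
# `record` and the class-level SCHEMA / SCHEMA_STRING used by the two test_avro_it variants above are
# defined elsewhere in their test modules (one parses the string with the avro library, the other with
# fastavro.parse_schema). A purely hypothetical sketch; only the 'number' field, which the tests key and
# join on, comes from the snippets, and the rest is invented for illustration:

SCHEMA_STRING = '''
{"namespace": "example.avro",
 "type": "record",
 "name": "TestRecord",
 "fields": [
     {"name": "number", "type": "long"},
     {"name": "label", "type": "string"}
 ]}
'''


def record(i):
    # One record per index; 'number' must be unique so the read-back join yields exactly one element per key.
    return {'number': i, 'label': 'label-%d' % i}
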
def run(argv=None): """Main entry point""" parser = argparse.ArgumentParser() parser.add_argument('--project', default='query-11', type=str, required=False, help='project') parser.add_argument('--job_name', default='basel3', type=str) parser.add_argument('--temp_location', default='gs://dataflow_s/tmp') parser.add_argument('--region', default='us-central1') parser.add_argument('--staging_location', default='gs://dataflow_s/stage') parser.add_argument( '--records', dest='records', type=int, # default='gs://dataflow-samples/shakespeare/kinglear.txt', default='10', # gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt help='Number of records to be generate') parser.add_argument( '--output', required=False, default='gs://dataflow_s/RPM/account_id_schema_output.avro', help='Output file to write results to.') parser.add_argument( '--input', default='gs://dataflow_s/RPM/account_id_schema_new.avro', help='input file to write results to.') # Parse arguments from the command line. # known_args, pipeline_args = parser.parse_known_args(argv) args = parser.parse_args() dataflow_options = [ '--project=%s' % (args.project), '--job_name=%s' % (args.job_name), '--temp_location=%s' % (args.temp_location), '--region=%s' % (args.region) ] dataflow_options.append('--staging_location=%s' % (args.staging_location)) options = PipelineOptions(dataflow_options) gcloud_options = options.view_as(GoogleCloudOptions) # options.view_as(StandardOptions).runner = "dataflow" input_filename = args.input output_filename = args.output SCHEMA = { "namespace": "example.avro", "type": "record", "name": "User", "fields": [{ "name": "ACNO", "type": [ "null", { "logicalType": "char", "type": "string", "maxLength": 20 } ] }, { "name": "FIELD_1", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_2", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_3", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_4", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_5", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_6", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_7", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_8", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_9", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_10", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }] } rec_cnt = args.records with beam.Pipeline(options=options) as p: left_pcol_name = 'p1' file = p | 'read_source' >> beam.io.ReadFromAvro( 'gs://dataflow_s/RPM/account_id_schema_new.avro') p1 = file | beam.Map(lambda x: { 'ACNO': x['ACNO'], 'FIELD_1': x["FIELD_1"] }) p2 = file | beam.Map(lambda x: { 'ACNO': x['ACNO'], 'FIELD_2': x["FIELD_2"] }) # P1_1 = p1 | "write" >> beam.io.WriteToText('./data.csv') # P2_2 = p2 | "write2" >> beam.io.WriteToText('./data2.csv') right_pcol_name = 'p2' join_keys = { left_pcol_name: [ 'ACNO' # 't1_col_B' ], right_pcol_name: [ 'ACNO' # 't2_col_B' ] } pipelines_dictionary = {left_pcol_name: p1, right_pcol_name: p2} test_pipeline = pipelines_dictionary | 'left join' >> Join( left_pcol_name=left_pcol_name, left_pcol=p1, right_pcol_name=right_pcol_name, 
right_pcol=p2, join_type='left', join_keys=join_keys) print(type(test_pipeline)) compressIdc = True use_fastavro = True # test_pipeline | 'write_fastavro' >> WriteToAvro( args.output, parse_schema(SCHEMA), use_fastavro=use_fastavro, file_name_suffix='.avro', codec=('deflate' if compressIdc else 'null'), ) result = p.run() result.wait_until_finish()
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') parser.add_argument('--format', dest='format', default='text', help='Supported output file formats: %s.' % FORMATS) known_args, pipeline_args = parser.parse_known_args(argv) if known_args.format not in FORMATS: raise ValueError('--format should be one of: %s' % FORMATS) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | 'read' >> ReadFromText(known_args.input) # Count the occurrences of each word. def count_ones(word_ones): (word, ones) = word_ones return (word, sum(ones)) counts = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(count_ones)) # Format the counts into a PCollection of strings. def format_text(word_count): (word, count) = word_count return '%s: %d' % (word, count) # Format the counts into a PCollection of dictionary strings. def format_dict(word_count): (word, count) = word_count row = dict(zip(HEADER, [word, count])) return row if known_args.format == 'text': output = counts | 'format text' >> beam.Map(format_text) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write text' >> WriteToText(known_args.output) elif known_args.format == 'avro': output = counts | 'format avro' >> beam.Map(format_dict) schema = avro.schema.parse(json.dumps(AVRO_SCHEMA)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write avro' >> WriteToAvro( file_path_prefix=known_args.output, schema=schema, codec=DEFAULT_CODEC) else: output = counts | 'format parquet' >> beam.Map(format_dict) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write parquet' >> WriteToParquet( file_path_prefix=known_args.output, schema=PARQUET_SCHEMA, codec=DEFAULT_CODEC) result = p.run() result.wait_until_finish() # Do not query metrics when creating a template which doesn't run if (not hasattr(result, 'has_job') # direct runner or result.has_job): # not just a template creation empty_lines_filter = MetricsFilter().with_name('empty_lines') query_result = result.metrics().query(empty_lines_filter) if query_result['counters']: empty_lines_counter = query_result['counters'][0] logging.info('number of empty lines: %d', empty_lines_counter.result) word_lengths_filter = MetricsFilter().with_name('word_len_dist') query_result = result.metrics().query(word_lengths_filter) if query_result['distributions']: word_lengths_dist = query_result['distributions'][0] logging.info('average word length: %d', word_lengths_dist.result.mean)