def run(argv=None):
    """
    If you want to run just this file for rapid development, change runner to 'DirectRunner' and add
    GCS paths for --input and --avro_output, e.g.
    python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json
    --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/
    """

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='parking-meters-dataflow',
        bucket='{}_parking'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='parking_meters'
    )

    with beam.Pipeline(options=pipeline_options) as p:

        field_name_swaps = [("longitude", "long"), ("latitude", "lat")]
        type_changes = [("long", "float"), ("lat", "float")]
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | beam.ParDo(ChangeDataTypes(type_changes))
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
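
SwapFieldNames and ChangeDataTypes are helper DoFns imported from the project's shared dataflow utilities and are not shown on this page. A minimal sketch of what they might look like, assuming each record arrives as a plain dict (inferred from how they are called above, not the actual implementation):

import apache_beam as beam


class SwapFieldNames(beam.DoFn):
    """Rename dict keys according to (old_name, new_name) pairs."""

    def __init__(self, name_changes):
        self.name_changes = name_changes

    def process(self, datum):
        for old_name, new_name in self.name_changes:
            if old_name in datum:
                datum[new_name] = datum.pop(old_name)
        yield datum


class ChangeDataTypes(beam.DoFn):
    """Cast dict values according to (field_name, type_name) pairs."""

    _CASTS = {'float': float, 'int': int, 'str': str}

    def __init__(self, type_changes):
        self.type_changes = type_changes

    def process(self, datum):
        for field, type_name in self.type_changes:
            if datum.get(field) is not None:
                datum[field] = self._CASTS[type_name](datum[field])
        yield datum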
Example #2
def run(argv=None):
    """
    If you want to run just this file for rapid development, pass the arg '-r DirectRunner' and add
    GCS paths for --input and --avro_output, e.g.
    python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json
    --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/ -r DirectRunner
    """

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='wprdc-fire-dataflow',
        bucket='{}_ems_fire'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='ems_calls')

    with beam.Pipeline(options=pipeline_options) as p:

        field_name_swaps = [("census_block_group_center__x", "long"),
                            ("census_block_group_center__y", "lat")]
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
Example #3
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='computronix-domi-permits',
        bucket='{}_computronix'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='domi_permits_computronix')

    with beam.Pipeline(options=pipeline_options) as p:

        field_name_swaps = [('PERMITTYPEPERMITTYPE', 'PERMITTYPE'),
                            ('TYPEOFWORKDESCRIPTION', 'WORKTYPE'),
                            ('APPLICANTCUSTOMFORMATTEDNAME', 'APPLICANTNAME'),
                            ('ALLCONTRACTORSNAME', 'CONTRACTORNAME'),
                            ('SPECIALPERMITINSTRUCTIONS',
                             'SPECIALINSTRUCTIONS'),
                            ('STATUSDESCRIPTION', 'STATUS')]

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(ParseNestedFields())
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | beam.ParDo(GeocodeAddress())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
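
ParseNestedFields and GeocodeAddress are also project helpers that are only referenced here. The geocoding step depends on an external service, but a rough, illustrative sketch of a one-level flattening DoFn (the real transform depends on the shape of the Computronix payload) could look like:

import apache_beam as beam


class ParseNestedFields(beam.DoFn):
    """Flatten one level of nested dicts into top-level keys (illustrative only)."""

    def process(self, datum):
        flat = {}
        for key, value in datum.items():
            if isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    # Hypothetical naming convention; the real field names differ.
                    flat['{}{}'.format(key, sub_key).upper()] = sub_value
            else:
                flat[key] = value
        yield flat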
Example #4
def run(argv=None):
    """Test Avro IO (backed by fastavro or Apache Avro) on a simple pipeline
  that transforms bitcoin transactions"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://beam-avro-test/bitcoin/txns/*',
                        help='Input file(s) to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    parser.add_argument('--compress',
                        dest='compress',
                        required=False,
                        action='store_true',
                        help='When set, compress the output data')
    parser.add_argument('--fastavro',
                        dest='use_fastavro',
                        required=False,
                        action='store_true',
                        help='When set, use fastavro for Avro I/O')

    opts, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the avro file[pattern] into a PCollection.
    records = \
        p | 'read' >> ReadFromAvro(opts.input, use_fastavro=opts.use_fastavro)

    measured = records | 'scan' >> beam.ParDo(BitcoinTxnCountDoFn())

    # pylint: disable=expression-not-assigned
    measured | 'write' >> \
        WriteToAvro(
            opts.output,
            schema=SCHEMA,
            codec=('deflate' if opts.compress else 'null'),
            use_fastavro=opts.use_fastavro
        )

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        metrics = result.metrics().query()

        for counter in metrics['counters']:
            logging.info("Counter: %s", counter)

        for dist in metrics['distributions']:
            logging.info("Distribution: %s", dist)
Example #5
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='accela-permits',
        bucket='{}_accela'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='accela_permits'
    )

    with beam.Pipeline(options=pipeline_options) as p:

        exclude_fields = [
            'module',
            'serviceProviderCode',
            'undistributedCost',
            'totalJobCost',
            'recordClass',
            'reportedChannel',
            'closedByDepartment',
            'estimatedProductionUnit',
            'actualProductionUnit',
            'createdByCloning',
            'closedByUser',
            'trackingId',
            'initiatedProduct',
            'createdBy',
            'value',
            'balance',
            'booking',
            'infraction',
            'misdemeanor',
            'offenseWitnessed',
            'defendantSignature',
            'parcels',
            'id',
            'statusDate',
            'jobValue',
            'reportedDate'
        ]

        address_field = 'address'

        field_name_swaps = [
            ('customId', 'id'),
            ('totalPay', 'total_paid')
        ]

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | beam.ParDo(FilterInvalidRecord())
                | beam.ParDo(FilterFields(exclude_fields))
                | beam.ParDo(ParseNestedFields())
                | beam.ParDo(GeocodeAddress(address_field))
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | beam.ParDo(ColumnsCamelToSnakeCase())
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
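
FilterFields is another shared DoFn that is only called here. A minimal sketch consistent with the exclude_fields list above, assuming its job is simply to drop the listed keys:

import apache_beam as beam


class FilterFields(beam.DoFn):
    """Drop the excluded keys from each record dict."""

    def __init__(self, exclude_fields):
        self.exclude_fields = set(exclude_fields)

    def process(self, datum):
        yield {k: v for k, v in datum.items() if k not in self.exclude_fields}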
Example #6
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='twilio-311-dataflow',
        bucket='{}_twilio'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='twilio_reports'
    )

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
Example #7
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='firearms-dataflow',
        bucket='{}_firearm_seizures'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='firearm_seizures'
    )

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, skip_header_lines=1)

        load = (
                lines
                | beam.ParDo(ConvertToDicts())
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
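
ConvertToDicts is not shown on this page. Because the CSV header row is skipped with skip_header_lines=1, the DoFn presumably maps each comma-separated line onto a fixed column list. A rough sketch (the column names below are placeholders, not the real firearm_seizures schema):

import apache_beam as beam


class ConvertToDicts(beam.DoFn):
    """Turn one comma-separated CSV line into a dict."""

    # Placeholder column names; the real pipeline would use the Avro schema's fields.
    COLUMNS = ['column_1', 'column_2', 'column_3']

    def process(self, line):
        values = [v.strip() or None for v in line.split(',')]
        yield dict(zip(self.COLUMNS, values))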
def run(argv=None):
    known_args, pipeline_options, avro_schema = generate_args(
        job_name='computronix-trades-dataflow',
        bucket='{}_computronix'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='trade_licenses_computronix')

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(FormatColumnNames())
                | beam.ParDo(ConvertTypes())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
Example #9
def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_311/requests/{}/{}/{}_requests.json'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_311/requests/avro_output/{}/{}/{}/avro_output'.format(
            os.environ['GCS_PREFIX'], dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Output directory to write avro files.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on on-prem network when route is opened
    # Use runner=DataflowRunner to run in GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('qalert-requests-dataflow',
                      '{}_311'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('City_of_Pittsburgh_QAlert_Requests')

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(GetStatus())
                | beam.ParDo(CleanLatLong())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
Example #10
def run(argv=None):

    known_args, pipeline_options, avro_schema = dataflow_utils.generate_args(
        job_name='comm-ctr-attendance-dataflow',
        bucket='{}_community_centers'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='community_center_attendance')

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input,
                                 coder=dataflow_utils.JsonCoder())

        load = (lines
                | beam.ParDo(ColumnsCamelToSnakeCase())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
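
ColumnsCamelToSnakeCase comes from dataflow_utils as well. A minimal sketch, assuming the keys are simple camelCase strings:

import re

import apache_beam as beam


class ColumnsCamelToSnakeCase(beam.DoFn):
    """Rewrite every key from camelCase to snake_case (e.g. centerName -> center_name)."""

    @staticmethod
    def _camel_to_snake(name):
        return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()

    def process(self, datum):
        yield {self._camel_to_snake(k): v for k, v in datum.items()}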
Example #11
def run(argv=None):
    dt = datetime.now()
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        default='gs://{}_firearm_seizures/{}/{}/{}_firearm_seizures.csv'.
        format(os.environ['GCS_PREFIX'], dt.strftime('%Y'),
               dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Input file to process.')
    parser.add_argument(
        '--avro_output',
        dest='avro_output',
        default='gs://{}_firearm_seizures/avro_output/{}/{}/{}/avro_output'.
        format(os.environ['GCS_PREFIX'], dt.strftime('%Y'),
               dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d")),
        help='Output directory to write avro files.')

    known_args, pipeline_args = parser.parse_known_args(argv)

    # TODO: run on on-prem network when route is opened
    # Use runner=DataflowRunner to run in GCP environment, DirectRunner to run locally
    pipeline_args.extend(
        generate_args('firearms-dataflow',
                      '{}_firearm_seizures'.format(os.environ['GCS_PREFIX']),
                      'DirectRunner'))

    avro_schema = get_schema('firearm_seizures')

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, skip_header_lines=1)

        load = (lines
                | beam.ParDo(ConvertToDicts())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
Example #12
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='registered-businesses-dataflow',
        bucket='{}_finance'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='registered_businesses')

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(ColumnsToLowerCase())
                | beam.ParDo(ParseAddress())
                | beam.ParDo(
                    NormalizeAddress(StaticValueProvider(str, 'address_full')))
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
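
NormalizeAddress receives a StaticValueProvider rather than a plain string, which suggests this pipeline is built to run as a template. The actual normalization logic is not shown; the sketch below only illustrates the ValueProvider pattern (resolving the field name with .get() inside process), with a stand-in normalization:

import apache_beam as beam
from apache_beam.options.value_provider import StaticValueProvider


class NormalizeAddress(beam.DoFn):
    """Illustrative: resolve the target field from a ValueProvider at runtime."""

    def __init__(self, address_field):
        # address_field is a ValueProvider, e.g. StaticValueProvider(str, 'address_full')
        self.address_field = address_field

    def process(self, datum):
        field = self.address_field.get()
        if datum.get(field):
            # Stand-in normalization: collapse whitespace and upper-case the address.
            datum[field] = ' '.join(datum[field].split()).upper()
        yield datum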
def run(argv=None):
    """
    If you want to run just this file for rapid development, change runner to 'DirectRunner' and add
    GCS paths for --input and --avro_output, e.g.
    python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json
    --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/
    """

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='parking-transactions-dataflow',
        bucket='{}_parking'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='parking_transactions'
    )

    with beam.Pipeline(options=pipeline_options) as p:

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
Example #14
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='police-blotter-30-day-dataflow',
        bucket='{}_police'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='30_day_police_blotter')

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        data_type_changes = [('CCR', 'int'), ('TRACT', 'int')]
        field_name_swaps = [('PK', 'id')]

        load = (lines
                | beam.ParDo(CleanPKs())
                | beam.ParDo(ChangeDataTypes(data_type_changes))
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
Example #15
def run(argv=None):
    """
    If you want to run just this file for rapid development, add the arg '-r DirectRunner' and add
    GCS paths for --input and --avro_output, e.g.
    python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json
    --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/ -r DirectRunner
    """

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='qalert-requests-dataflow',
        bucket='{}_qalert'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='City_of_Pittsburgh_QAlert_Requests')

    with beam.Pipeline(options=pipeline_options) as p:

        date_conversions = [('lastActionUnix', 'lastAction'),
                            ('addDateUnix', 'createDate')]
        field_name_swaps = [('addDateUnix', 'createDateUnix'),
                            ('status', 'statusCode'), ('latitude', 'lat'),
                            ('longitude', 'long'),
                            ('master', 'masterRequestId'),
                            ('typeId', 'requestTypeId'),
                            ('typeName', 'requestType')]

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(GetDateStrings(date_conversions))
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | beam.ParDo(GetStatus())
                | beam.ParDo(GetClosedDate())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
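
GetDateStrings converts the QAlert Unix timestamps into readable date strings. A sketch under the assumption that the pairs are (unix_field, string_field) and that UTC formatting is acceptable (the real helper may use the city's local timezone):

from datetime import datetime, timezone

import apache_beam as beam


class GetDateStrings(beam.DoFn):
    """For each (unix_field, string_field) pair, add a formatted timestamp string."""

    def __init__(self, date_conversions):
        self.date_conversions = date_conversions

    def process(self, datum):
        for unix_field, string_field in self.date_conversions:
            if datum.get(unix_field):
                datum[string_field] = datetime.fromtimestamp(
                    int(datum[unix_field]),
                    tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
            else:
                datum[string_field] = None
        yield datum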
Example #16
def run(argv=None):
    """Main entry point"""
    parser = argparse.ArgumentParser()
    # parser.add_argument('--project', type=str, required=False, help='project')
    parser.add_argument(
        '--records',
        dest='records',
        type=int,
        default=10,
        help='Number of records to be generated')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        default='./',
                        help='Output file to write results to.')
    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Store the CLI arguments to variables
    # project_id = known_args.project

    # Setup the dataflow pipeline options
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = True
    # google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    # google_cloud_options.project = project_id

    save_main_session = True
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    # SCHEMA_STRING = '''
    # {"namespace": "example.avro",
    # "type": "record",
    # "name": "User",
    # "fields": [
    #     {"name": "ACNO", "type": "int"},
    #     {"name": "PRIN_BAL", "type": "int"},
    #     {"name": "FEE_ANT", "default": null, "type": ["null", "double"]},
    #     {"name": "GENDER",  "default": null, "type": ["null", {"logicalType": "char", "type": "string", "maxLength": 1}]}

    # ]
    # }
    # '''

    SCHEMA = {
        "namespace": "example.avro",
        "type": "record",
        "name": "User",
        "fields": [
            {"name": "ACNO",
             "type": ["null", {"logicalType": "char", "type": "string",
                               "maxLength": 20}]},
            {"name": "FIELD_1",
             "type": ["null", {"logicalType": "char", "type": "float",
                               "maxLength": 20}]},
            {"name": "FIELD_2",
             "type": ["null", {"logicalType": "char", "type": "float",
                               "maxLength": 20}]},
        ]
    }

    # {"name": "GENDER', "type": "string"}

    # {"name": "FEE_ANT", "type": "long"}

    # p = beam.Pipeline(options=pipeline_options)
    rec_cnt = known_args.records
    with beam.Pipeline(options=pipeline_options) as p:
        left_pcol_name = 'p1'
        file = p | 'read_source' >> beam.io.ReadFromAvro(
            "./data/account_id_schema_new.avro")
        p1 = file | beam.Map(lambda x: {
            'ACNO': x['ACNO'],
            'FIELD_1': x["FIELD_1"]
        })
        p2 = file | beam.Map(lambda x: {
            'ACNO': x['ACNO'],
            'FIELD_2': x["FIELD_2"]
        })

        P1_1 = p1 | "write" >> beam.io.WriteToText('./data.csv')
        P2_2 = p2 | "write2" >> beam.io.WriteToText('./data2.csv')

        right_pcol_name = 'p2'

        join_keys = {
            left_pcol_name: [
                'ACNO'
                # 't1_col_B'
            ],
            right_pcol_name: [
                'ACNO'
                # 't2_col_B'
            ]
        }

        pipelines_dictionary = {left_pcol_name: p1, right_pcol_name: p2}
        test_pipeline = pipelines_dictionary | 'left join' >> Join(
            left_pcol_name=left_pcol_name,
            left_pcol=p1,
            right_pcol_name=right_pcol_name,
            right_pcol=p2,
            join_type='left',
            join_keys=join_keys)
        print(type(test_pipeline))
        test_pipeline | "print" >> beam.io.WriteToText('./test.csv')

        compressIdc = True
        use_fastavro = True
        #

        test_pipeline | 'write_fastavro' >> WriteToAvro(
            known_args.output,
            # '/tmp/dataflow/{}/{}'.format(
            #     'demo', 'output'),
            # parse_schema(json.loads(SCHEMA_STRING)),
            parse_schema(SCHEMA),
            use_fastavro=use_fastavro,
            file_name_suffix='.avro',
            codec=('deflate' if compressIdc else 'null'),
        )
    # The 'with' block above already ran the pipeline and waited for it to finish.
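
The Join transform used above appears to come from an external helper library rather than Beam itself. For reference, the same left join on ACNO can be expressed with Beam's built-in CoGroupByKey, roughly like this:

import apache_beam as beam


def left_join_on_acno(left_pcol, right_pcol):
    """Left-join two PCollections of dicts on ACNO using CoGroupByKey."""

    def merge(element):
        _, grouped = element
        rights = list(grouped['right']) or [{}]  # keep left rows with no match
        for left in grouped['left']:
            for right in rights:
                merged = dict(left)
                merged.update(right)
                yield merged

    return (
        {'left': left_pcol | 'key_left' >> beam.Map(lambda x: (x['ACNO'], x)),
         'right': right_pcol | 'key_right' >> beam.Map(lambda x: (x['ACNO'], x))}
        | 'cogroup' >> beam.CoGroupByKey()
        | 'merge' >> beam.FlatMap(merge))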
Example #17
def run(argv=None):
    """Main entry point"""
    parser = argparse.ArgumentParser()
    # parser.add_argument('--project', type=str, required=False, help='project')
    parser.add_argument(
        '--records',
        dest='records',
        type=int,
        default=10,
        help='Number of records to be generated')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        default='./',
                        help='Output file to write results to.')
    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Store the CLI arguments to variables
    # project_id = known_args.project

    # Setup the dataflow pipeline options
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = True
    # google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    # google_cloud_options.project = project_id

    save_main_session = True
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    print(pipeline_args)

    SCHEMA = {
        "namespace": "example.avro",
        "type": "record",
        "name": "User",
        "fields": [
            {"name": "ACNO",
             "type": ["null", {"logicalType": "char", "type": "string",
                               "maxLength": 20}]},
            {"name": "NUM_OF_MTHS_PD_30", "type": ["null", "int", "string"]},
        ] + [
            {"name": "FIELD_%d" % i,
             "type": ["null", {"logicalType": "char", "type": "float",
                               "maxLength": 20}]}
            for i in range(1, 11)
        ]
    }

    rec_cnt = known_args.records
    with beam.Pipeline(options=pipeline_options) as p:
        left_pcol_name = 'p1'
        file = p | 'read_source' >> beam.io.ReadFromAvro(
            "./data/Curr_account.avro") | beam.Distinct()
        file2 = p | 'read_source2' >> beam.io.ReadFromAvro(
            "./data/Prev_account.avro")
        p1 = file | 'filter fields' >> beam.Filter(
            lambda x: int(x['NUM_OF_MTHS_PD_30']) >= 0)
        p2 = file2 | 'filter fields2' >> beam.Filter(
            lambda x: int(x['NUM_OF_MTHS_PD_30']) >= 0)
        # P1_1 = p1 | "write" >> beam.io.WriteToText('./data.csv')
        # P2_2 = p2 | "write2" >> beam.io.WriteToText('./data2.csv')

        right_pcol_name = 'p2'

        join_keys = {
            left_pcol_name: [
                'ACNO'
                # 't1_col_B'
            ],
            right_pcol_name: [
                'ACNO'
                # 't2_col_B'
            ]
        }

        pipelines_dictionary = {left_pcol_name: p1, right_pcol_name: p2}
        test_pipeline = pipelines_dictionary | 'left join' >> Join(
            left_pcol_name=left_pcol_name,
            left_pcol=p1,
            right_pcol_name=right_pcol_name,
            right_pcol=p2,
            join_type='left',
            join_keys=join_keys)
        test_pipeline | 'add 1 to NUM_OF_MTHS_PD_30' >> beam.Map(
            add_one) | "write4" >> beam.io.WriteToText('./data4.csv')
        print(type(test_pipeline))
        compressIdc = True
        use_fastavro = True
        #

        test_pipeline | 'write_fastavro' >> WriteToAvro(
            known_args.output,
            parse_schema(SCHEMA),
            use_fastavro=use_fastavro,
            file_name_suffix='.avro',
            codec=('deflate' if compressIdc else 'null'),
        )
    # The 'with' block above already ran the pipeline and waited for it to finish.
Example #18
def run(argv=None):
    """Main entry point"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--project',
                        default='mvp-project-273913',
                        type=str,
                        required=False,
                        help='project')
    parser.add_argument('--job_name', default='rpm', type=str)
    parser.add_argument('--worker_node', default='n1-standard-4')
    parser.add_argument('--temp_location',
                        default='gs://zz_michael/dataflow_s/tmp')
    parser.add_argument('--location', default='gcs')
    parser.add_argument('--region', default='asia-east1')
    parser.add_argument('--staging_location',
                        default='gs://zz_michael/dataflow_s/stage')
    parser.add_argument(
        '--output',
        required=False,
        default=
        'gs://zz_michael/dataflow_s/RPM/output/account_id_schema_output.avro',
        help='Output file to write results to.')
    parser.add_argument(
        '--input',
        default='gs://zz_michael/dataflow_s/RPM/Curr_account.avro',
        help='Input file to process.')
    parser.add_argument(
        '--input2',
        default='gs://zz_michael/dataflow_s/RPM/Prev_account.avro',
        help='Second input file to process.')
    # Parse arguments from the command line.
    # known_args, pipeline_args = parser.parse_known_args(argv)
    args = parser.parse_args()

    dataflow_options = [
        '--project=%s' % (args.project),
        '--job_name=%s' % (args.job_name),
        '--temp_location=%s' % (args.temp_location),
        '--worker_machine_type=%s' % (args.worker_node),
        '--region=%s' % (args.region)
    ]

    dataflow_options.append('--staging_location=%s' % (args.staging_location))
    options = PipelineOptions(dataflow_options)
    gcloud_options = options.view_as(GoogleCloudOptions)

    options.view_as(StandardOptions).runner = "DataflowRunner"

    table_schema = {
        'fields': [
            {'name': 'ACNO', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'NUM_OF_MTHS_PD_30', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        ] + [
            {'name': 'FIELD_%d' % i, 'type': 'FLOAT', 'mode': 'NULLABLE'}
            for i in range(1, 11)
        ]
    }
    SCHEMA = {
        "namespace": "example.avro",
        "type": "record",
        "name": "User",
        "fields": [
            {"name": "ACNO",
             "type": ["null", {"logicalType": "char", "type": "string",
                               "maxLength": 20}]},
            {"name": "NUM_OF_MTHS_PD_30", "type": ["null", "int", "string"]},
        ] + [
            {"name": "FIELD_%d" % i,
             "type": ["null", {"logicalType": "char", "type": "float",
                               "maxLength": 20}]}
            for i in range(1, 11)
        ]
    }

    with beam.Pipeline(options=options) as p:
        left_pcol_name = 'p1'
        Curr_Month = p | 'read_Curr_Month' >> beam.io.ReadFromAvro(args.input)
        Prev_Month = p | 'read_Prev_Month' >> beam.io.ReadFromAvro(args.input2)
        p1 = Curr_Month | 'select fields from Curr_Month' >> beam.Filter(
            lambda x: int(x['NUM_OF_MTHS_PD_30']) >= 0)
        p2 = Prev_Month | 'select fields2 from Prev_Month' >> beam.Filter(
            lambda x: int(x['NUM_OF_MTHS_PD_30']) >= 0)
        right_pcol_name = 'p2'

        join_keys = {
            left_pcol_name: [
                'ACNO'
                # 't1_col_B'
            ],
            right_pcol_name: [
                'ACNO'
                # 't2_col_B'
            ]
        }

        joinkey_dict = {left_pcol_name: p1, right_pcol_name: p2}
        joined_data = joinkey_dict | 'left join' >> Join(
            left_pcol_name=left_pcol_name,
            left_pcol=p1,
            right_pcol_name=right_pcol_name,
            right_pcol=p2,
            join_type='left',
            join_keys=join_keys)
        derived_result = joined_data | 'Transform (add 1 to field)' >> beam.Map(
            cycle_dlqn)
        print(type(joined_data))
        compressIdc = True
        use_fastavro = True

        if args.location == "bigquery":
            derived_result | beam.io.WriteToBigQuery(
                table_spec,
                schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
            )
        else:
            derived_result | 'Write out Stage' >> WriteToAvro(
                args.output,
                parse_schema(SCHEMA),
                use_fastavro=use_fastavro,
                file_name_suffix='.avro',
                codec=('deflate' if compressIdc else 'null'),
            )

    # The 'with' block above already ran the pipeline and waited for it to finish.
Example #19
def run(argv=None):
    """Main entry point"""
    parser = argparse.ArgumentParser()
    # parser.add_argument('--project', type=str, required=False, help='project')
    parser.add_argument(
        '--records',
        dest='records',
        type=int,
        default=10,
        help='Number of records to be generated')
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        default='/tmp/dataflow/demo/output',
                        help='Output file to write results to.')
    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Store the CLI arguments to variables
    # project_id = known_args.project

    # Setup the dataflow pipeline options
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = True
    # google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    # google_cloud_options.project = project_id

    save_main_session = True
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    # SCHEMA_STRING = '''
    # {"namespace": "example.avro",
    # "type": "record",
    # "name": "User",
    # "fields": [
    #     {"name": "ACNO", "type": "int"},
    #     {"name": "PRIN_BAL", "type": "int"},
    #     {"name": "FEE_ANT", "default": null, "type": ["null", "double"]},
    #     {"name": "GENDER",  "default": null, "type": ["null", {"logicalType": "char", "type": "string", "maxLength": 1}]}

    # ]
    # }
    # '''

    SCHEMA = {
        "namespace": "example.avro",
        "type": "record",
        "name": "User",
        "fields": [
            {"name": "ACNO", "type": "int"},
            {"name": "PRIN_BAL", "type": "int"},
            {"name": "FEE_ANT", "default": None, "type": ["null", "double"]},
            {"name": "GENDER", "default": None,
             "type": ["null", {"logicalType": "char", "type": "string",
                               "maxLength": 1}]},
        ]
    }

    # {"name": "GENDER', "type": "string"}

    # {"name": "FEE_ANT", "type": "long"}

    # p = beam.Pipeline(options=pipeline_options)
    rec_cnt = known_args.records
    with beam.Pipeline(options=pipeline_options) as p:
        left_pcol_name = 'p1'
        p1 = p | 'Create source data' >> beam.Create(
            [{
                'ACNO': i + 1,
                'PRIN_BAL': i + 1,
                'GENDER1': 'Y',
                'GENDER': random.choice(['Y', 'N']),
            } for i in range(rec_cnt)])

        right_pcol_name = 'p2'
        p2 = p | 'Create join data' >> beam.Create(
            [{
                'ACNO': i + 1,
                'FEE_ANT': random.random() * 100000000,
            } for i in range(rec_cnt)])

        join_keys = {
            left_pcol_name: [
                'ACNO'
                # 't1_col_B'
            ],
            right_pcol_name: [
                'ACNO'
                # 't2_col_B'
            ]
        }

        pipelines_dictionary = {left_pcol_name: p1, right_pcol_name: p2}
        test_pipeline = pipelines_dictionary | 'left join' >> Join(
            left_pcol_name=left_pcol_name,
            left_pcol=p1,
            right_pcol_name=right_pcol_name,
            right_pcol=p2,
            join_type='left',
            join_keys=join_keys)

        # test_pipeline | "print" >> beam.Map(printfn)

        compressIdc = True
        use_fastavro = True

        test_pipeline | 'write_fastavro' >> WriteToAvro(
            known_args.output,
            # '/tmp/dataflow/{}/{}'.format(
            #     'demo', 'output'),
            # parse_schema(json.loads(SCHEMA_STRING)),
            parse_schema(SCHEMA),
            use_fastavro=use_fastavro,
            file_name_suffix='.avro',
            codec=('deflate' if compressIdc else 'null'),
        )
Example #20
def run(known_args, pipeline_args):
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    logging.getLogger().setLevel(logging.INFO)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (lines | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_text(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    # Format the counts into a PCollection of dictionaries.

    def format_dict(word_count):
        (word, count) = word_count
        row = dict(zip(HEADER, [word, count]))
        return row

    if known_args.format == 'text':
        output = counts | 'format text' >> beam.Map(format_text)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write text' >> WriteToText(known_args.output)
    elif known_args.format == 'avro':
        output = counts | 'format avro' >> beam.Map(format_dict)

        schema = avro.schema.parse(json.dumps(AVRO_SCHEMA))

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write avro' >> WriteToAvro(
            file_path_prefix=known_args.output,
            schema=schema,
            codec=DEFAULT_CODEC)
    else:
        output = counts | 'format parquet' >> beam.Map(format_dict)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write parquet' >> WriteToParquet(
            file_path_prefix=known_args.output,
            schema=PARQUET_SCHEMA,
            codec=DEFAULT_CODEC)

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)
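
WordExtractingDoFn and the metrics it registers are defined elsewhere. A sketch consistent with the 'empty_lines' counter and 'word_len_dist' distribution queried above, following the standard Beam wordcount pattern:

import re

import apache_beam as beam
from apache_beam.metrics import Metrics


class WordExtractingDoFn(beam.DoFn):
    """Split a line into words while recording the metrics queried above."""

    def __init__(self):
        self.empty_line_counter = Metrics.counter('main', 'empty_lines')
        self.word_length_dist = Metrics.distribution('main', 'word_len_dist')

    def process(self, element):
        text = element.strip()
        if not text:
            self.empty_line_counter.inc()
        for word in re.findall(r"[\w']+", text):
            self.word_length_dist.update(len(word))
            yield word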
Example #21
    def test_avro_it(self):
        num_records = self.test_pipeline.get_option('records')
        num_records = int(num_records) if num_records else 1000000

        # Seed a `PCollection` with indices that will each be FlatMap'd into
        # `batch_size` records, to avoid having a too-large list in memory at
        # the outset
        batch_size = self.test_pipeline.get_option('batch-size')
        batch_size = int(batch_size) if batch_size else 10000

        # pylint: disable=range-builtin-not-iterating
        batches = range(int(num_records / batch_size))

        def batch_indices(start):
            # pylint: disable=range-builtin-not-iterating
            return range(start * batch_size, (start + 1) * batch_size)

        # A `PCollection` with `num_records` avro records
        records_pcoll = \
            self.test_pipeline \
            | 'create-batches' >> Create(batches) \
            | 'expand-batches' >> FlatMap(batch_indices) \
            | 'create-records' >> Map(record)

        fastavro_output = '/'.join([self.output, 'fastavro'])
        avro_output = '/'.join([self.output, 'avro'])

        self.addCleanup(delete_files, [self.output + '*'])

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_fastavro' >> WriteToAvro(
            fastavro_output,
            self.SCHEMA,
            use_fastavro=True
        )

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_avro' >> WriteToAvro(
            avro_output,
            self.SCHEMA,
            use_fastavro=False
        )

        result = self.test_pipeline.run()
        result.wait_until_finish()
        assert result.state == PipelineState.DONE

        fastavro_read_pipeline = TestPipeline(is_integration_test=True)

        fastavro_records = \
            fastavro_read_pipeline \
            | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
            | 'read-fastavro' >> ReadAllFromAvro(use_fastavro=True) \
            | Map(lambda rec: (rec['number'], rec))

        avro_records = \
            fastavro_read_pipeline \
            | 'create-avro' >> Create(['%s*' % avro_output]) \
            | 'read-avro' >> ReadAllFromAvro(use_fastavro=False) \
            | Map(lambda rec: (rec['number'], rec))

        def check(elem):
            v = elem[1]

            def assertEqual(l, r):
                if l != r:
                    raise BeamAssertException('Assertion failed: %s == %s' %
                                              (l, r))

            assertEqual(v.keys(), ['avro', 'fastavro'])
            avro_values = v['avro']
            fastavro_values = v['fastavro']
            assertEqual(avro_values, fastavro_values)
            assertEqual(len(avro_values), 1)

        # pylint: disable=expression-not-assigned
        {
            'avro': avro_records,
            'fastavro': fastavro_records
        } \
        | CoGroupByKey() \
        | Map(check)

        fastavro_read_pipeline.run().wait_until_finish()
        assert result.state == PipelineState.DONE
Example #22
    def test_avro_it(self):
        num_records = self.test_pipeline.get_option('records')
        num_records = int(num_records) if num_records else 1000000
        fastavro_output = '/'.join([self.output, 'fastavro'])

        # Seed a `PCollection` with indices that will each be FlatMap'd into
        # `batch_size` records, to avoid having a too-large list in memory at
        # the outset
        batch_size = self.test_pipeline.get_option('batch-size')
        batch_size = int(batch_size) if batch_size else 10000

        # pylint: disable=bad-option-value
        batches = range(int(num_records / batch_size))

        def batch_indices(start):
            # pylint: disable=bad-option-value
            return range(start * batch_size, (start + 1) * batch_size)

        # A `PCollection` with `num_records` avro records
        records_pcoll = \
            self.test_pipeline \
            | 'create-batches' >> Create(batches) \
            | 'expand-batches' >> FlatMap(batch_indices) \
            | 'create-records' >> Map(record)

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_fastavro' >> WriteToAvro(
            fastavro_output,
            parse_schema(json.loads(self.SCHEMA_STRING)),
        )
        result = self.test_pipeline.run()
        result.wait_until_finish()
        fastavro_pcoll = self.test_pipeline \
                         | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
                         | 'read-fastavro' >> ReadAllFromAvro()

        mapped_fastavro_pcoll = fastavro_pcoll | "map_fastavro" >> Map(
            lambda x: (x['number'], x))
        mapped_record_pcoll = records_pcoll | "map_record" >> Map(
            lambda x: (x['number'], x))

        def validate_record(elem):
            v = elem[1]

            def assertEqual(l, r):
                if l != r:
                    raise BeamAssertException('Assertion failed: %s == %s' %
                                              (l, r))

            assertEqual(sorted(v.keys()), ['fastavro', 'record_pcoll'])
            record_pcoll_values = v['record_pcoll']
            fastavro_values = v['fastavro']
            assertEqual(record_pcoll_values, fastavro_values)
            assertEqual(len(record_pcoll_values), 1)

        {
            "record_pcoll": mapped_record_pcoll,
            "fastavro": mapped_fastavro_pcoll
        } | CoGroupByKey() | Map(validate_record)

        result = self.test_pipeline.run()
        result.wait_until_finish()

        self.addCleanup(delete_files, [self.output])
        assert result.state == PipelineState.DONE
Example #23
def run(argv=None):
    """Main entry point"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--project',
                        default='query-11',
                        type=str,
                        required=False,
                        help='project')
    parser.add_argument('--job_name', default='basel3', type=str)
    parser.add_argument('--temp_location', default='gs://dataflow_s/tmp')
    parser.add_argument('--region', default='us-central1')
    parser.add_argument('--staging_location', default='gs://dataflow_s/stage')
    parser.add_argument(
        '--records',
        dest='records',
        type=int,
        default=10,
        help='Number of records to be generated')
    parser.add_argument(
        '--output',
        required=False,
        default='gs://dataflow_s/RPM/account_id_schema_output.avro',
        help='Output file to write results to.')
    parser.add_argument(
        '--input',
        default='gs://dataflow_s/RPM/account_id_schema_new.avro',
        help='Input file to process.')
    # Parse arguments from the command line.
    # known_args, pipeline_args = parser.parse_known_args(argv)
    args = parser.parse_args()

    dataflow_options = [
        '--project=%s' % (args.project),
        '--job_name=%s' % (args.job_name),
        '--temp_location=%s' % (args.temp_location),
        '--region=%s' % (args.region)
    ]

    dataflow_options.append('--staging_location=%s' % (args.staging_location))
    options = PipelineOptions(dataflow_options)
    gcloud_options = options.view_as(GoogleCloudOptions)
    #
    options.view_as(StandardOptions).runner = "DataflowRunner"

    input_filename = args.input
    output_filename = args.output

    SCHEMA = {
        "namespace": "example.avro",
        "type": "record",
        "name": "User",
        "fields": [
            {"name": "ACNO",
             "type": ["null", {"logicalType": "char", "type": "string",
                               "maxLength": 20}]},
        ] + [
            {"name": "FIELD_%d" % i,
             "type": ["null", {"logicalType": "char", "type": "float",
                               "maxLength": 20}]}
            for i in range(1, 11)
        ]
    }

    rec_cnt = args.records
    with beam.Pipeline(options=options) as p:
        left_pcol_name = 'p1'
        file = p | 'read_source' >> beam.io.ReadFromAvro(input_filename)
        p1 = file | beam.Map(lambda x: {
            'ACNO': x['ACNO'],
            'FIELD_1': x["FIELD_1"]
        })
        p2 = file | beam.Map(lambda x: {
            'ACNO': x['ACNO'],
            'FIELD_2': x["FIELD_2"]
        })

        # P1_1 = p1 | "write" >> beam.io.WriteToText('./data.csv')
        # P2_2 = p2 | "write2" >> beam.io.WriteToText('./data2.csv')

        right_pcol_name = 'p2'

        join_keys = {
            left_pcol_name: [
                'ACNO'
                # 't1_col_B'
            ],
            right_pcol_name: [
                'ACNO'
                # 't2_col_B'
            ]
        }

        pipelines_dictionary = {left_pcol_name: p1, right_pcol_name: p2}
        test_pipeline = pipelines_dictionary | 'left join' >> Join(
            left_pcol_name=left_pcol_name,
            left_pcol=p1,
            right_pcol_name=right_pcol_name,
            right_pcol=p2,
            join_type='left',
            join_keys=join_keys)
        print(type(test_pipeline))

        compressIdc = True
        use_fastavro = True
        #

        test_pipeline | 'write_fastavro' >> WriteToAvro(
            args.output,
            parse_schema(SCHEMA),
            use_fastavro=use_fastavro,
            file_name_suffix='.avro',
            codec=('deflate' if compressIdc else 'null'),
        )
    # The 'with' block above already ran the pipeline and waited for it to finish.
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    parser.add_argument('--format',
                        dest='format',
                        default='text',
                        help='Supported output file formats: %s.' % FORMATS)
    known_args, pipeline_args = parser.parse_known_args(argv)

    if known_args.format not in FORMATS:
        raise ValueError('--format should be one of: %s' % FORMATS)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_text(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    # Format the counts into a PCollection of dictionaries.

    def format_dict(word_count):
        (word, count) = word_count
        row = dict(zip(HEADER, [word, count]))
        return row

    if known_args.format == 'text':
        output = counts | 'format text' >> beam.Map(format_text)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write text' >> WriteToText(known_args.output)
    elif known_args.format == 'avro':
        output = counts | 'format avro' >> beam.Map(format_dict)

        schema = avro.schema.parse(json.dumps(AVRO_SCHEMA))

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write avro' >> WriteToAvro(
            file_path_prefix=known_args.output,
            schema=schema,
            codec=DEFAULT_CODEC)
    else:
        output = counts | 'format parquet' >> beam.Map(format_dict)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write parquet' >> WriteToParquet(
            file_path_prefix=known_args.output,
            schema=PARQUET_SCHEMA,
            codec=DEFAULT_CODEC)

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)