Example 1
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='computronix-domi-permits',
        bucket='{}_computronix'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='domi_permits_computronix')

    with beam.Pipeline(options=pipeline_options) as p:

        field_name_swaps = [('PERMITTYPEPERMITTYPE', 'PERMITTYPE'),
                            ('TYPEOFWORKDESCRIPTION', 'WORKTYPE'),
                            ('APPLICANTCUSTOMFORMATTEDNAME', 'APPLICANTNAME'),
                            ('ALLCONTRACTORSNAME', 'CONTRACTORNAME'),
                            ('SPECIALPERMITINSTRUCTIONS',
                             'SPECIALINSTRUCTIONS'),
                            ('STATUSDESCRIPTION', 'STATUS')]

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(ParseNestedFields())
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | beam.ParDo(GeocodeAddress())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
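These run() functions are excerpts, so the names they rely on (generate_args, JsonCoder, the custom DoFns, WriteToAvro) are imported elsewhere in each file. A minimal set of imports a snippet like Example 1 would need might look like the sketch below; the dataflow_utils module path is an assumption based on how Example 8 qualifies generate_args and JsonCoder, and the DoFn locations are likewise guesses.

import os

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io.avroio import WriteToAvro

# Project-local helpers (assumed module path): argument parsing, a coder that
# decodes each JSON line into a dict, and the custom DoFns used in the pipeline.
from dataflow_utils import (generate_args, JsonCoder, ParseNestedFields,
                            SwapFieldNames, GeocodeAddress)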
Example 2
def run(argv=None):
    """
    If you want to run just this file for rapid development, pass the arg '-r DirectRunner' and add
    GCS paths for --input and --avro_output, e.g.
    python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json
    --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/ -r DirectRunner
    """

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='wprdc-fire-dataflow',
        bucket='{}_ems_fire'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='ems_calls')

    with beam.Pipeline(options=pipeline_options) as p:

        field_name_swaps = [("census_block_group_center__x", "long"),
                            ("census_block_group_center__y", "lat")]
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
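SwapFieldNames appears in most of these pipelines but its body is never shown. A minimal sketch of such a field-renaming DoFn, assuming each element is a dict produced by JsonCoder and that the swaps are (old_name, new_name) pairs:

import apache_beam as beam


class SwapFieldNames(beam.DoFn):
    """Rename dict keys according to (old_name, new_name) pairs (illustrative sketch)."""

    def __init__(self, field_name_swaps):
        self.field_name_swaps = field_name_swaps

    def process(self, datum):
        for old_name, new_name in self.field_name_swaps:
            if old_name in datum:
                datum[new_name] = datum.pop(old_name)
        yield datum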
Example 3
def run(argv=None):
    """
    If you want to run just this file for rapid development, change runner to 'DirectRunner' and add
    GCS paths for --input and --avro_output, e.g.
    python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json
    --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/
    """

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='parking-meters-dataflow',
        bucket='{}_parking'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='parking_meters'
    )

    with beam.Pipeline(options=pipeline_options) as p:

        field_name_swaps = [("longitude", "long"), ("latitude", "lat")]
        type_changes = [("long", "float"), ("lat", "float")]
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | beam.ParDo(ChangeDataTypes(type_changes))
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
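ChangeDataTypes receives (field, target_type) pairs such as ('long', 'float'). A plausible sketch of what it does, assuming dict elements and type names given as strings:

import apache_beam as beam


class ChangeDataTypes(beam.DoFn):
    """Cast named fields to a target type named by a string (illustrative sketch)."""

    _CASTS = {'int': int, 'float': float, 'str': str}

    def __init__(self, type_changes):
        self.type_changes = type_changes  # list of (field_name, type_name)

    def process(self, datum):
        for field_name, type_name in self.type_changes:
            if datum.get(field_name) is not None:
                datum[field_name] = self._CASTS[type_name](datum[field_name])
        yield datum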
Example 4
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='accela-permits',
        bucket='{}_accela'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='accela_permits'
    )

    with beam.Pipeline(options=pipeline_options) as p:

        exclude_fields = [
            'module',
            'serviceProviderCode',
            'undistributedCost',
            'totalJobCost',
            'recordClass',
            'reportedChannel',
            'closedByDepartment',
            'estimatedProductionUnit',
            'actualProductionUnit',
            'createdByCloning',
            'closedByUser',
            'trackingId',
            'initiatedProduct',
            'createdBy',
            'value',
            'balance',
            'booking',
            'infraction',
            'misdemeanor',
            'offenseWitnessed',
            'defendantSignature',
            'parcels',
            'id',
            'statusDate',
            'jobValue',
            'reportedDate'
        ]

        address_field = 'address'

        field_name_swaps = [
            ('customId', 'id'),
            ('totalPay', 'total_paid')
        ]

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | beam.ParDo(FilterInvalidRecord())
                | beam.ParDo(FilterFields(exclude_fields))
                | beam.ParDo(ParseNestedFields())
                | beam.ParDo(GeocodeAddress(address_field))
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | beam.ParDo(ColumnsCamelToSnakeCase())
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
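Example 4 finishes by converting the Accela field names from camelCase to snake_case. ColumnsCamelToSnakeCase is not shown; one straightforward way to write such a DoFn, assuming dict elements, is:

import re

import apache_beam as beam


class ColumnsCamelToSnakeCase(beam.DoFn):
    """Rewrite every key from camelCase to snake_case, e.g. 'totalPay' -> 'total_pay' (sketch)."""

    def process(self, datum):
        yield {re.sub(r'(?<!^)(?=[A-Z])', '_', key).lower(): value
               for key, value in datum.items()}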
Example 5
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='twilio-311-dataflow',
        bucket='{}_twilio'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='twilio_reports'
    )

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
Example 6
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='firearms-dataflow',
        bucket='{}_firearm_seizures'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='firearm_seizures'
    )

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, skip_header_lines=1)

        load = (
                lines
                | beam.ParDo(ConvertToDicts())
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
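Example 6 is the only pipeline here that reads plain CSV rather than newline-delimited JSON (note skip_header_lines=1 and the absence of JsonCoder), so ConvertToDicts presumably pairs each comma-separated line with the field names of the target schema. A rough sketch with entirely hypothetical column names:

import apache_beam as beam


class ConvertToDicts(beam.DoFn):
    """Turn one comma-separated line into a dict keyed by column name (sketch)."""

    # Hypothetical column list; the real names come from the firearm_seizures Avro schema.
    COLUMNS = ['address', 'zip', 'total_count']

    def process(self, line):
        yield dict(zip(self.COLUMNS, line.split(',')))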
Example 7
def run(argv=None):
    known_args, pipeline_options, avro_schema = generate_args(
        job_name='computronix-trades-dataflow',
        bucket='{}_computronix'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='trade_licenses_computronix')

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(FormatColumnNames())
                | beam.ParDo(ConvertTypes())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
Example 8
def run(argv=None):

    known_args, pipeline_options, avro_schema = dataflow_utils.generate_args(
        job_name='comm-ctr-attendance-dataflow',
        bucket='{}_community_centers'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='community_center_attendance')

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input,
                                 coder=dataflow_utils.JsonCoder())

        load = (lines
                | beam.ParDo(ColumnsCamelToSnakeCase())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
Example 9
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='registered-businesses-dataflow',
        bucket='{}_finance'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='registered_businesses')

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(ColumnsToLowerCase())
                | beam.ParDo(ParseAddress())
                | beam.ParDo(
                    NormalizeAddress(StaticValueProvider(str, 'address_full')))
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
Example 10
def run(argv=None):
    """
    If you want to run just this file for rapid development, change runner to 'DirectRunner' and add
    GCS paths for --input and --avro_output, e.g.
    python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json
    --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/
    """

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='parking-transactions-dataflow',
        bucket='{}_parking'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='parking_transactions'
    )

    with beam.Pipeline(options=pipeline_options) as p:

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (
                lines
                | WriteToAvro(known_args.avro_output, schema=avro_schema, file_name_suffix='.avro', use_fastavro=True))
Example 11
def run(argv=None):

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='police-blotter-30-day-dataflow',
        bucket='{}_police'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='30_day_police_blotter')

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        data_type_changes = [('CCR', 'int'), ('TRACT', 'int')]
        field_name_swaps = [('PK', 'id')]

        load = (lines
                | beam.ParDo(CleanPKs())
                | beam.ParDo(ChangeDataTypes(data_type_changes))
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
Example 12
def run(argv=None):
    """
    If you want to run just this file for rapid development, pass the arg '-r DirectRunner' and add
    GCS paths for --input and --avro_output, e.g.
    python qalert_requests_dataflow.py --input gs://pghpa_test_qalert/requests/2020/06/2020-06-17_requests.json
    --avro_output gs://pghpa_test_qalert/requests/avro_output/2020/06/2020-06-17/ -r DirectRunner
    """

    known_args, pipeline_options, avro_schema = generate_args(
        job_name='qalert-requests-dataflow',
        bucket='{}_qalert'.format(os.environ['GCS_PREFIX']),
        argv=argv,
        schema_name='City_of_Pittsburgh_QAlert_Requests')

    with beam.Pipeline(options=pipeline_options) as p:

        date_conversions = [('lastActionUnix', 'lastAction'),
                            ('addDateUnix', 'createDate')]
        field_name_swaps = [('addDateUnix', 'createDateUnix'),
                            ('status', 'statusCode'), ('latitude', 'lat'),
                            ('longitude', 'long'),
                            ('master', 'masterRequestId'),
                            ('typeId', 'requestTypeId'),
                            ('typeName', 'requestType')]

        lines = p | ReadFromText(known_args.input, coder=JsonCoder())

        load = (lines
                | beam.ParDo(GetDateStrings(date_conversions))
                | beam.ParDo(SwapFieldNames(field_name_swaps))
                | beam.ParDo(GetStatus())
                | beam.ParDo(GetClosedDate())
                | WriteToAvro(known_args.avro_output,
                              schema=avro_schema,
                              file_name_suffix='.avro',
                              use_fastavro=True))
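GetDateStrings takes (unix_field, string_field) pairs, so it presumably derives a human-readable timestamp from each epoch field before the names are swapped. A minimal sketch, assuming dict elements; the output format and timezone here are assumptions:

from datetime import datetime

import apache_beam as beam


class GetDateStrings(beam.DoFn):
    """Add a formatted date string derived from a unix-epoch field (illustrative sketch)."""

    def __init__(self, date_conversions):
        self.date_conversions = date_conversions  # list of (unix_field, string_field)

    def process(self, datum):
        for unix_field, string_field in self.date_conversions:
            if datum.get(unix_field) is not None:
                datum[string_field] = datetime.utcfromtimestamp(
                    int(datum[unix_field])).strftime('%Y-%m-%d %H:%M:%S')
        yield datum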