Beispiel #1
0
            'state': state
        }
        yield d


# Core senate-member pipeline: read the exported CSV, clean up each
# politician's name fields, and tag records that fail validation so only
# clean politicians continue on toward BigQuery.
senate_rows = p | 'Read from CSV' >> beam.io.ReadFromText(
    '{0}/tmp/senate_members/*.csv'.format(os.path.expanduser('~')),
    skip_header_lines=1)
senate_rows = senate_rows | 'Split Values' >> beam.ParDo(SplitFn())
# | 'Isolate Attributes' >> beam.ParDo(pt.IsolateAttrFn())
senate_rows = senate_rows | 'Scrub First Name' >> beam.ParDo(
    pt.ScrubFnameFn(), keep_suffix=True)
senate_rows = senate_rows | 'Fix Nicknames' >> beam.ParDo(
    pt.FixNicknameFn(), n_tbl=nickname_tbl, keep_nickname=True)
senate_rows = senate_rows | 'Scrub Last Name' >> beam.ParDo(pt.ScrubLnameFn())
senate_rows = senate_rows | 'Fix Nones' >> beam.ParDo(pt.FixNoneFn())
pol = senate_rows | 'Tag Errors' >> beam.ParDo(
    pt.TagErrorsFn()).with_outputs('error_tag')

# The tagged output carries records that failed cleaning; the main (None)
# output carries the politicians that passed every scrub step.
error_pols = pol.error_tag
clean_pols = pol[None]

# A new Politician is published only if they are not already contained in the Politicians table. If they are new,
# they will be properly uploaded; if they are not new, they will be ignored by this pipeline.
# new_pol = (
#         clean_pols
#         | 'Filter Existing Pols' >> beam.ParDo(pt.NewPolsOnlyFn(), pol_tbl=pols_tbl)
#         | 'Filter Pol Keys' >> beam.ParDo(pt.FilterKeysFn(), attr_lst=pol_attr_lst)
#         | 'Write Pol to BQ' >> beam.io.WriteToBigQuery(
#             table=pol_spec,
Beispiel #2
0
                'last_name': last_name,
                'party': party,
                'state': state,
                'district': district
            }
            yield d

# Runs the main part of the house-member pipeline. Errors will be tagged;
# clean politicians will continue on to BQ.
# NOTE(review): the gs:// bucket segment is built from
# os.path.expanduser('~'), which yields a *local* home-directory path, not
# a bucket name — compare the senate pipeline, which reads a plain local
# path without the gs:// scheme. Confirm the intended source location.
pol = (
        p
        | 'Read from CSV' >> beam.io.ReadFromText('gs://{0}/tmp/house_members/*.csv'.format(os.path.expanduser('~')),
                                                  skip_header_lines=1)
        | 'Split Values' >> beam.ParDo(SplitFn())
        # | 'Isolate Attributes' >> beam.ParDo(pt.IsolateAttrFn())
        | 'Scrub First Name' >> beam.ParDo(pt.ScrubFnameFn(), keep_suffix=True)
        | 'Fix Nicknames' >> beam.ParDo(pt.FixNicknameFn(), n_tbl=nickname_tbl, keep_nickname=True)
        | 'Scrub Last Name' >> beam.ParDo(pt.ScrubLnameFn())
        # presumably maps raw state values through state_tbl — TODO confirm
        # StateMapFn semantics (this step does not exist in the senate pipeline).
        | 'Map States' >> beam.ParDo(StateMapFn(), tbl=state_tbl)
        | 'Fix Nones' >> beam.ParDo(pt.FixNoneFn())
        | 'Tag Errors' >> beam.ParDo(pt.TagErrorsFn()).with_outputs('error_tag'))

# The tagged output carries records that failed cleaning; the main (None)
# output carries the clean politicians.
error_pols = pol.error_tag
clean_pols = pol[None]

# A new Politician is published only if they are not already contained in the Politicians table. If they are new,
# they will be properly uploaded; if they are not new, they will be ignored by this pipeline.
new_pol = (clean_pols
           | 'Filter Existing Pols' >> beam.ParDo(pt.NewPolsOnlyFn(), pol_tbl=pols_tbl)
           | 'Filter Keys' >> beam.ParDo(pt.FilterKeysFn(), attr_lst=pol_attr_lst)
           | 'Write Pol to BQ' >> beam.io.WriteToBigQuery(
            table=pol_spec,
Beispiel #3
0
house_tbl = [{str(k):str(v) for (k,v) in d.items()} for d in house_tbl]


# Runs the main part of the pipeline. Errors will be tagged; good votes will
# continue on to BQ.
vote = (
    p
    # NOTE(review): this first transform has no 'Label' >> prefix, unlike
    # every other step in the chain — Beam will auto-generate a name for it.
    | beam.io.gcp.pubsub.ReadFromPubSub(
        topic = None,
        subscription = 'projects/{0}/subscriptions/{1}'
            .format(project_id, subscription_name),
        with_attributes = True)
    | 'Isolate Attributes' >> beam.ParDo(pt.IsolateAttrFn())
    # Coerces 'vote_cast' to int (per the int_lst argument).
    | 'Fix Value Types' >> beam.ParDo(pt.FixTypesFn(), int_lst=['vote_cast'])
    # house_tbl is the str-normalized house-member lookup built above.
    | 'Fix House First Names' >> beam.ParDo(pt.FixHouseFirstNameFn(), tbl=house_tbl)
    | 'Scrub First Name' >> beam.ParDo(pt.ScrubFnameFn())
    | 'Fix Nicknames' >> beam.ParDo(pt.FixNicknameFn(), n_tbl=nickname_tbl)
    | 'Scrub Last Name' >> beam.ParDo(pt.ScrubLnameFn())
    | 'Fix Nones' >> beam.ParDo(pt.FixNoneFn())
    | 'Tag Errors' >> beam.ParDo(pt.TagErrorsFn()).with_outputs('error_tag'))

# Separates the clean and error votes for proper processing: the tagged
# output holds failures, the main (None) output holds clean votes.
error_votes = vote.error_tag
clean_votes = vote[None]

# The cleaned elements will be sent to the proper BQ table for storage.
(clean_votes
    | 'Filter Keys' >> beam.ParDo(pt.FilterKeysFn(), attr_lst=attributes_lst)
    | 'Write to BQ' >> beam.io.WriteToBigQuery(
        table = bq_spec,
        write_disposition = beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition = beam.io.BigQueryDisposition.CREATE_NEVER
# Materialize the nickname lookup table from the finished BigQuery job,
# converting each result row into a plain dict with str keys and values.
nickname_tbl = [
    {str(key): str(val) for key, val in row.items()}
    for row in query_job.result()
]

# Cosponsor pipeline: pull cosponsor messages off the PubSub subscription,
# scrub the sponsor name fields, and tag records that fail validation.
cosponsor = (
    p
    | 'Read from PubSub' >> beam.io.gcp.pubsub.ReadFromPubSub(
        topic=None,
        subscription='projects/{0}/subscriptions/{1}'.format(
            project_id, subscription_name),
        with_attributes=True)
    | 'Isolate Attributes' >> beam.ParDo(pt.IsolateAttrFn())
    | 'Normalize Attributes' >> beam.ParDo(NormalizeAttributesFn())
    | 'Scrub First Name' >> beam.ParDo(pt.ScrubFnameFn(), keep_suffix=False)
    | 'Fix Nicknames' >> beam.ParDo(
        pt.FixNicknameFn(), n_tbl=nickname_tbl, keep_nickname=False)
    | 'Scrub Last Name' >> beam.ParDo(pt.ScrubLnameFn())
    # NOTE(review): 'Revert Attributes' is disabled, so elements leave this
    # pipeline still in normalized form — confirm downstream expects that.
    # | 'Revert Attributes' >> beam.ParDo(RevertAttributesFn())
    | 'Fix Nones' >> beam.ParDo(pt.FixNoneFn())
    | 'Tag Errors' >> beam.ParDo(pt.TagErrorsFn()).with_outputs('error_tag'))

# Separates the clean and error cosponsors for proper processing.
error_cosponsors = cosponsor.error_tag
clean_cosponsors = cosponsor[None]

# Clean cosponsors are filtered to the allowed attributes and appended to BQ.
# NOTE(review): the destination is bill_spec — confirm cosponsors are meant
# to share the bills table spec rather than a cosponsor-specific one.
(clean_cosponsors
 | 'Filter Keys' >> beam.ParDo(pt.FilterKeysFn(), attr_lst=attributes_lst)
 | 'Write to BQ' >> beam.io.WriteToBigQuery(
     table=bill_spec,
     write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
     create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER))
Beispiel #5
0
            yield element


# Core bill pipeline: pull bill messages off the PubSub subscription,
# normalize and scrub the name attributes, then tag records that fail
# validation so only good bills continue on toward BigQuery.
bill_msgs = p | 'Read from PubSub' >> beam.io.gcp.pubsub.ReadFromPubSub(
    topic=None,
    subscription='projects/{0}/subscriptions/{1}'.format(
        project_id, subscription_name),
    with_attributes=True)
bill_msgs = bill_msgs | 'Isolate Attributes' >> beam.ParDo(pt.IsolateAttrFn())
bill_msgs = bill_msgs | 'Normalize Attributes' >> beam.ParDo(
    NormalizeAttributesFn())
bill_msgs = bill_msgs | 'Scrub First Name' >> beam.ParDo(
    pt.ScrubFnameFn(), keep_suffix=False)
bill_msgs = bill_msgs | 'Fix Nicknames' >> beam.ParDo(
    pt.FixNicknameFn(), n_tbl=n_tbl_ex, keep_nickname=False)
bill_msgs = bill_msgs | 'Scrub Last Name' >> beam.ParDo(pt.ScrubLnameFn())
bill_msgs = bill_msgs | 'Revert Attributes' >> beam.ParDo(RevertAttributesFn())
bill_msgs = bill_msgs | 'Fix Nones' >> beam.ParDo(pt.FixNoneFn())
bill = bill_msgs | 'Tag Errors' >> beam.ParDo(
    pt.TagErrorsFn()).with_outputs('error_tag')

# Split tagged failures off from the clean main (None) output.
error_bills = bill.error_tag
clean_bills = bill[None]

# The cleaned elements will be sent to the proper BQ table for storage.
(clean_bills
 | 'Filter Keys' >> beam.ParDo(pt.FilterKeysFn(), attr_lst=attributes_lst)
 | 'Write to BQ' >> beam.io.WriteToBigQuery(
     table=bill_spec,
     write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,