Example #1
    def _row_subset(self, item):
        subset = asarray(item[0])  # materialize
        partition_row_subsets = self._copartition(subset,
                                                  self.partition_row_counts)
        new_partition_row_counts = self._partition_row_counts(
            partition_row_subsets)
        new_shape = (builtins.sum(new_partition_row_counts), self.shape[1])

        # Beam doesn't have a direct equivalent of Spark's zip function, so we use a side input and join here
        # See https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py#L1295
        subset_pcollection = self.pipeline | gensym(
            "partition_row_subsets") >> beam.Create(
                enumerate(partition_row_subsets))

        def join_row_with_subset(index_row, subset_dict):
            index, row = index_row
            return index, row[subset_dict[index], :]

        new_pcollection = self.pcollection | gensym("row_subset") >> beam.Map(
            join_row_with_subset, AsDict(subset_pcollection))

        return self._new(
            pcollection=new_pcollection,
            shape=new_shape,
            partition_row_counts=new_partition_row_counts,
        )
Example #2
def run(argv=None):
    parser = argparse.ArgumentParser()

    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    offer_stat_pipeline_options = pipeline_options.view_as(OfferStatPipelineOptions)

    p = beam.Pipeline(options=pipeline_options)

    users = p | "Read users" >> beam.io.Read(beam.io.BigQuerySource(table=offer_stat_pipeline_options.users_bq_table, flatten_results=False)) \
            | beam.Map(lambda user_row: (user_row['account_id'], user_row['country']))

    account_offers = p | "Read account offers" >> beam.io.Read(beam.io.BigQuerySource(table=offer_stat_pipeline_options.account_offers_bq_table, flatten_results=False)) \
                     | beam.Map(lambda row: (row['account_id'], row))

    offers = p | "Read offers" >> beam.io.Read(beam.io.BigQuerySource(table=offer_stat_pipeline_options.offers_bq_table, flatten_results=False)) \
        | beam.Map(lambda row: (row['offer_id'], row['offer_name']))

    ({'users': users, 'account_offers': account_offers} | beam.CoGroupByKey()) \
        | beam.ParDo(UserCountryMerger()) \
        | beam.Map(merge_offer_name, offers=AsDict(offers)) \
        | beam.Map(lambda enriched_offer: ((enriched_offer['offer_name'], enriched_offer['user_country']), enriched_offer)) \
        | beam.combiners.Count.PerKey() \
        | 'Map to BQ row' >> beam.Map(convert_to_bq_row) \
        | 'Writing offers to BQ' >> beam.io.WriteToBigQuery(table=offer_stat_pipeline_options.offer_stat_bq_table,
                                                        create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                                                        write_disposition=BigQueryDisposition.WRITE_TRUNCATE,
                                                        schema='offer_name:STRING,user_country:STRING,count:INTEGER')

    result = p.run()
    result.wait_until_finish()
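
UserCountryMerger and merge_offer_name are not defined in this snippet. A plausible sketch, inferred only from how they are wired into the pipeline (the field handling is an assumption, not the original code):

import apache_beam as beam

class UserCountryMerger(beam.DoFn):
    def process(self, element):
        # element is (account_id, {'users': [country, ...], 'account_offers': [row, ...]})
        _, grouped = element
        for offer_row in grouped['account_offers']:
            for country in grouped['users']:
                yield dict(offer_row, user_country=country)

def merge_offer_name(enriched_offer, offers):
    # offers is the AsDict side input mapping offer_id -> offer_name.
    enriched_offer['offer_name'] = offers.get(enriched_offer['offer_id'])
    return enriched_offer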
Example #3
    def run_test_pipeline(self, person_id: int, sentence: SentenceType,
                          us_mo_sentence_status_rows: List[Dict[str, str]],
                          expected_sentence: SentenceType):
        """Runs a test pipeline to test ConvertSentenceToStateSpecificType and checks the output against expected."""
        test_pipeline = TestPipeline()

        us_mo_sentence_statuses = (test_pipeline
                                   | 'Create MO sentence statuses' >>
                                   beam.Create(us_mo_sentence_status_rows))

        sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'sentence_external_id'))

        # Group the sentence status tuples by sentence_external_id
        us_mo_sentence_statuses_by_sentence = (
            sentence_status_rankings_as_kv |
            'Group the sentence status ranking tuples by sentence_external_id'
            >> beam.GroupByKey())

        output = (
            test_pipeline
            | beam.Create([(person_id, sentence)])
            | 'Convert sentence' >> beam.ParDo(
                entity_hydration_utils.ConvertSentenceToStateSpecificType(),
                AsDict(us_mo_sentence_statuses_by_sentence)))

        # Expect no change
        expected_output = [(person_id, expected_sentence)]

        assert_that(output,
                    self.convert_sentence_output_is_valid(expected_output))

        test_pipeline.run()
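
ConvertDictToKVTuple appears throughout these examples. Its observable contract is simple: emit each dict row keyed on a named field so it can feed GroupByKey, CoGroupByKey, or AsDict. A sketch consistent with the call sites above (the real implementation may differ):

import apache_beam as beam

class ConvertDictToKVTuple(beam.DoFn):
    def process(self, element, key_field):
        # element is a dict row; key_field is passed as a ParDo side argument.
        yield element[key_field], element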
Example #4
def run(argv=None):
    parser = argparse.ArgumentParser()

    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    offer_stat_pipeline_options = pipeline_options.view_as(
        OfferStatPipelineOptions)

    p = beam.Pipeline(options=pipeline_options)

    users = p | "Read users" >> beam.io.Read(beam.io.BigQuerySource(table=offer_stat_pipeline_options.users_bq_table, flatten_results=False)) \
            | beam.Map(lambda user_row: (user_row['account_id'], user_row['country']))

    account_offers = p | "Read account offers" >> beam.io.Read(beam.io.BigQuerySource(table=offer_stat_pipeline_options.account_offers_bq_table, flatten_results=False)) \
                     | beam.Map(lambda row: (row['account_id'], row))

    offers = p | "Read offers" >> beam.io.Read(beam.io.BigQuerySource(table=offer_stat_pipeline_options.offers_bq_table, flatten_results=False)) \
        | beam.Map(lambda row: (row['offer_id'], row['offer_name']))

    ({'users': users, 'account_offers': account_offers} | beam.CoGroupByKey()) \
        | beam.ParDo(UserCountryMerger()) \
        | beam.Map(merge_offer_name, offers=AsDict(offers)) \
        | beam.ParDo(beam_util.LoggerDoFn())

    result = p.run()
    result.wait_until_finish()
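
beam_util.LoggerDoFn is not shown either; given its position as the terminal step, it is presumably a pass-through that logs each element. A sketch under that assumption:

import logging
import apache_beam as beam

class LoggerDoFn(beam.DoFn):
    def process(self, element):
        logging.info(element)  # log and pass the element through unchanged
        yield element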
Example #5
    def test_pcollectionview_not_recreated(self):
        pipeline = Pipeline('DirectRunner')
        value = pipeline | 'create1' >> Create([1, 2, 3])
        value2 = pipeline | 'create2' >> Create([(1, 1), (2, 2), (3, 3)])
        value3 = pipeline | 'create3' >> Create([(1, 1), (2, 2), (3, 3)])
        self.assertEqual(AsSingleton(value), AsSingleton(value))
        self.assertEqual(AsSingleton('new', value, default_value=1),
                         AsSingleton('new', value, default_value=1))
        self.assertNotEqual(AsSingleton(value),
                            AsSingleton('new', value, default_value=1))
        self.assertEqual(AsIter(value), AsIter(value))
        self.assertEqual(AsList(value), AsList(value))
        self.assertEqual(AsDict(value2), AsDict(value2))

        self.assertNotEqual(AsSingleton(value), AsSingleton(value2))
        self.assertNotEqual(AsIter(value), AsIter(value2))
        self.assertNotEqual(AsList(value), AsList(value2))
        self.assertNotEqual(AsDict(value2), AsDict(value3))
Example #6
def run():
    main_query = """SELECT
          CAST(a.HSN_ACCT_NUM AS INT64) AS CUSTOMER_ID,
          CAST(a.ROW_CREATED_DATE AS TIMESTAMP) AS SOURCE_CREATE_DT,
          a.PRIMARY_PHONE_NUM AS PRIMARY_PHONE_NUMBER,
          CAST(SUBSTR(a.PRIMARY_PHONE_NUM,1,3) AS INT64) AS DERIVED_AREA_CODE,
          CASE WHEN LENGTH(a.BILL_SHIP_ADDR_SYNC_FLAG) = 0 THEN 0
          ELSE CAST(a.BILL_SHIP_ADDR_SYNC_FLAG AS INT64) 
          END AS BILLSHIP_ADDR_SYNC_IND,
          a.GUEST_CUSTOMER_FLAG AS GUEST_CODE,
          a.GUID AS DIGITAL_CUSTOMER_ID,
          a.MARKET_PLACE_ID AS MARKET_PLACE_ID,
          a.MARKET_PLACE_CUSTID AS MARKET_PLACE_CUST_ID,
          (srg_key.MAX_VALUE_KEY + ROW_NUMBER() OVER()) AS CUSTOMER_KEY,
          '999999999999' AS PRIMARY_ADDRESS_KEY,
          '999999999999' AS BILLING_ADDRESS_KEY,
          'CLIC' AS ETL_SOURCE_SYSTEM,
          '0' AS PURGED_IND,
          '0' AS MERGED_IND,
          '0' AS TEST_CUSTOMER_IND,
          '0' AS BLOCK_CURRENT_IND,
          '0' AS BLOCK_LIFETIME_IND, 
          '999999999999' AS IDCENTRIC_INDIVIDUAL_ID,
          '999999999999' AS IDCENTRIC_HOUSEHOLD_ID,
          '999999999999' AS IDCENTRIC_ADDRESS_ID,
          '0' AS VOID_IND,
          CAST(FORMAT_DATETIME('%Y%m%d%H%M%S', CURRENT_DATETIME()) AS INT64) AS UPD_BATCH_NBR,
          CAST(FORMAT_DATETIME('%Y%m%d%H%M%S', CURRENT_DATETIME()) AS INT64) AS INS_BATCH_NBR,
          '999999999999' AS MKTADDR_ADDRESS_KEY,
          '0' AS PRIVACY_IND
        FROM
          `automatic-asset-253215.STAGE.STG_CLIC_CUSTSORGEXT` a,
          `automatic-asset-253215.STAGE.STG_CLIC_SURROGKEYS` srg_key
          WHERE srg_key.TABLE_NAME = "IM_CUSTOMER_ATTRIBUTE_REF"
          """

    lookup_query = """SELECT DISTINCT CUSTOMER_ID FROM `automatic-asset-253215.CORE.IM_CUSTOMER_ATTRIBUTE_REF`"""

    IM_data = (
        p | 'Read Cust_Ids From IM' >> beam.io.Read(
            beam.io.BigQuerySource(query=lookup_query, use_standard_sql=True))
        | 'Get Cust_Ids ' >> beam.Map(lambda row: (row['CUSTOMER_ID'], row)))

    cust_org_data = (
        p | 'Read from custorgext' >> beam.io.Read(
            beam.io.BigQuerySource(query=main_query, use_standard_sql=True))
        | 'Lookup' >> beam.Map(lookup, AsDict(IM_data))
        | 'Filter' >> beam.ParDo(filter_out_nones)
        | 'Insert: Unmatched Records' >> beam.io.WriteToBigQuery(
            output_table,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER)
        #| 'Updating Surrogate_Key' >> beam.ParDo(updateSrgKey)
    )

    p.run().wait_until_finish()
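
lookup and filter_out_nones are defined elsewhere. Given that the write step is labeled 'Insert: Unmatched Records', a plausible reading is that lookup drops rows whose CUSTOMER_ID already exists in IM (this is an assumption, not the original code):

def lookup(row, im_data):
    # im_data is the AsDict side input of existing IM CUSTOMER_IDs.
    return None if row['CUSTOMER_ID'] in im_data else row

def filter_out_nones(element):
    # Used with beam.ParDo: drop the Nones produced by the lookup step.
    if element is not None:
        yield element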
Example #7
def get_enriched_events(salesevent: beam.pvalue.PCollection,
                        sideinput_collections: Dict[str, beam.pvalue.PCollection]) \
        -> beam.pvalue.PCollection:
    """Gets enriched events by:
        a) Calling a transform that combines the primary event with the
           corresponding side input values
        b) Grouping events by a dummy key to combine all events in a window
           into one shard
        c) Discarding the dummy key

     Args:
        salesevent: Event representing a sales transaction
        sideinput_collections: Set of side input collections
    """
    # yapf: disable
    return (salesevent
             | "Enrich event" >> beam.Map(transforms.enrich_event,
                                       AsDict(sideinput_collections["bonuspoints"]),
                                       AsDict(sideinput_collections["discountpct"]),
                                       AsDict(sideinput_collections["category"]))
             | "Group events by dummy Key" >> beam.GroupByKey()
             | "Discard dummy Key" >> beam.Values()
          )
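
transforms.enrich_event is not included in this snippet. Sketched below under the assumptions implied by the docstring: it decorates each sale with values from the three AsDict side inputs and attaches a dummy key for the later GroupByKey (all field names are hypothetical):

def enrich_event(event, bonuspoints, discountpct, category):
    enriched = dict(event,
                    bonuspoints=bonuspoints.get(event.get('customer_id')),
                    discountpct=discountpct.get(event.get('product_id')),
                    category=category.get(event.get('product_id')))
    return 'dummy', enriched  # keyed so one window collapses into one shard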
Example #8
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect. Specifically,
    # we have the input file to load and the output table to write to.
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read.  This can be a local file or '
        'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # Useful for quickly debugging on a small set of data
        default='gs://python-dataflow-example/data_files/head_usa_names.csv')
    # The output defaults to the lake dataset in your BigQuery project.  You'll have
    # to create the lake dataset yourself using this command:
    # bq mk lake
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.usa_names_enriched')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataIngestion is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_ingestion = DataIngestion()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information like where Dataflow should store
    #  temp files, and what the project id is
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    # This function adds in a full state name by looking up the
    # full name in the short_to_long_name_map.  The short_to_long_name_map
    # comes from a read from BigQuery in the next few lines
    def add_full_state_name(row, short_to_long_name_map):
        row['state_full_name'] = short_to_long_name_map[row['state']]
        return row

    # This is a second source of data.  The source is from BigQuery.
    # This will come into our pipeline as a side input.

    read_query = """
    SELECT
    state_name,
    state_abbreviation
    FROM
    `python-dataflow-example.example_data.state_abbreviations`"""

    state_abbreviations = (
        p
        | 'Read from BigQuery' >> beam.io.Read(
            beam.io.BigQuerySource(query=read_query, use_standard_sql=True))
        # We must create a python tuple of key to value pairs here in order to
        # use the data as a side input.  Dataflow will use the keys to distribute the
        # work to the correct worker.
        | 'Abbreviation to Full Name' >>
        beam.Map(lambda row: (row['state_abbreviation'], row['state_name'])))

    (p
     # Read the file.  This is the source of the pipeline.  All further
     # processing starts with lines read from the file.  We use the input
     # argument from the command line.  We also skip the first line which is
     # a header row.
     | 'Read From Text' >> beam.io.ReadFromText(known_args.input,
                                                skip_header_lines=1)
     # Translates from the raw string data in the CSV to a dictionary.
     # The dictionary is keyed by column names with the values being the values
     # we want to store in BigQuery.
     | 'String to BigQuery Row' >>
     beam.Map(lambda s: data_ingestion.parse_method(s))
     # Here we pass in a side input, which is data that comes from outside our
     # CSV source.  The side input contains a map of states to their full name.
     |
     'Join Data' >> beam.Map(add_full_state_name, AsDict(state_abbreviations))
     # This is the final stage of the pipeline, where we define the destination
     #  of the data.  In this case we are writing to BigQuery.
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             known_args.output,
             # Here we use the JSON schema read in from a JSON file.
             # Specifying the schema allows the API to create the table correctly if it does not yet exist.
             schema=schema,
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             # Deletes all data in the BigQuery table before writing.
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
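
DataIngestion.parse_method is referenced but not shown. Per the comments, it turns a raw CSV line into a dict keyed by column name; a sketch along those lines (the column list is an assumption):

import csv

class DataIngestion:
    def parse_method(self, string_input):
        # Split one CSV line and zip it with the schema's column names.
        values = next(csv.reader([string_input]))
        field_names = ['state', 'gender', 'year', 'name', 'number', 'created_date']
        return dict(zip(field_names, values))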
Example #9
def run(argv=None):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, pipeline_args = parse_arguments(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = all_pipeline_options['project'] + '.' + \
                        known_args.reference_input

    with beam.Pipeline(argv=pipeline_args) as p:
        # Get StatePersons
        persons = (p
                   | 'Load Persons' >> BuildRootEntity(
                       dataset=query_dataset,
                       data_dict=None,
                       root_schema_class=schema.StatePerson,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field='person_id',
                       build_related_entities=True))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p
            | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=query_dataset,
                data_dict=None,
                root_schema_class=schema.StateIncarcerationPeriod,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionViolations
        supervision_violations = \
            (p
             | 'Load SupervisionViolations' >>
             BuildRootEntity(
                 dataset=query_dataset,
                 data_dict=None,
                 root_schema_class=schema.StateSupervisionViolation,
                 root_entity_class=entities.StateSupervisionViolation,
                 unifying_id_field='person_id',
                 build_related_entities=True
             ))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = \
            (p
             | 'Load SupervisionViolationResponses' >>
             BuildRootEntity(
                 dataset=query_dataset,
                 data_dict=None,
                 root_schema_class=schema.StateSupervisionViolationResponse,
                 root_entity_class=entities.StateSupervisionViolationResponse,
                 unifying_id_field='person_id',
                 build_related_entities=True
             ))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            } | 'Group StateSupervisionViolationResponses to '
            'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
            'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {
                'person':
                persons,
                'incarceration_periods':
                incarceration_periods_with_source_violations
            }
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey())

        # Bring in the table that associates people and their county of
        # residence
        person_id_to_county_query = \
            f"SELECT * FROM " \
            f"`{reference_dataset}.persons_to_recent_county_of_residence`"

        person_id_to_county_kv = (
            p
            | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Identify ReleaseEvents from the StatePerson's
        # StateIncarcerationPeriods
        person_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >> beam.ParDo(
                ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)))

        # Get dimensions to include and methodologies to use
        inclusions, methodologies = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get recidivism metrics
        recidivism_metrics = (
            person_events
            | 'Get Recidivism Metrics' >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options, inclusions=inclusions))

        filter_metrics_kwargs = {'methodologies': methodologies}

        # Filter out unneeded metrics
        final_recidivism_metrics = (
            recidivism_metrics
            | 'Filter out unwanted metrics' >> beam.ParDo(
                FilterMetrics(), **filter_metrics_kwargs))

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            final_recidivism_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivismMetricWritableDict()).with_outputs(
                    'rates', 'counts', 'liberties'))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table = known_args.output + '.recidivism_rate_metrics'
        counts_table = known_args.output + '.recidivism_count_metrics'
        liberty_table = known_args.output + '.recidivism_liberty_metrics'

        _ = (writable_metrics.rates
             | f"Write rate metrics to BQ table: {rates_table}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.counts
             | f"Write count metrics to BQ table: {counts_table}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.liberties
             | f"Write liberty metrics to BQ table: {liberty_table}" >>
             beam.io.WriteToBigQuery(
                 table=liberty_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
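
RecidivismMetricWritableDict fans metrics out to three tagged outputs ('rates', 'counts', 'liberties'), which is why the write stages can address writable_metrics.rates and friends. A generic sketch of that tagged-output pattern (the routing field is hypothetical, not the original logic):

import apache_beam as beam
from apache_beam import pvalue

class RouteMetrics(beam.DoFn):
    def process(self, metric):
        # 'kind' stands in for however the real DoFn tells metric types apart.
        yield pvalue.TaggedOutput(metric['kind'], metric)

# Usage: results = metrics | beam.ParDo(RouteMetrics()).with_outputs(
#     'rates', 'counts', 'liberties')
# Then results.rates, results.counts, results.liberties are separate PCollections.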
Example #10
    def testClassifyProgramAssignments_NoAssessments(self):
        """Tests the ClassifyProgramAssignments DoFn."""
        fake_person_id = 12345

        fake_person = entities.StatePerson.new_with_defaults(
            person_id=fake_person_id,
            gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT)

        program_assignment = entities.StateProgramAssignment.new_with_defaults(
            state_code='US_CA',
            program_id='PG3',
            referral_date=date(2009, 10, 3))

        supervision_period = \
            entities.StateSupervisionPeriod.new_with_defaults(
                supervision_period_id=111,
                status=StateSupervisionPeriodStatus.TERMINATED,
                state_code='UT',
                start_date=date(2008, 3, 5),
                termination_date=date(2010, 5, 19),
                termination_reason=
                StateSupervisionPeriodTerminationReason.DISCHARGE,
                supervision_type=StateSupervisionType.PAROLE
            )

        person_periods = {
            'person': [fake_person],
            'program_assignments': [program_assignment],
            'assessments': [],
            'supervision_periods': [supervision_period]
        }

        program_event = ProgramReferralEvent(
            state_code=program_assignment.state_code,
            program_id=program_assignment.program_id,
            event_date=program_assignment.referral_date,
            supervision_type=supervision_period.supervision_type,
            supervising_officer_external_id='OFFICER0009',
            supervising_district_external_id='10')

        correct_output = [(fake_person, [program_event])]

        test_pipeline = TestPipeline()

        supervision_period_to_agent_map = {
            'agent_id': 1010,
            'agent_external_id': 'OFFICER0009',
            'district_external_id': '10',
            'supervision_period_id': supervision_period.supervision_period_id
        }

        supervision_period_to_agent_associations = (
            test_pipeline
            | 'Create SupervisionPeriod to Agent table' >> beam.Create(
                [supervision_period_to_agent_map]))

        supervision_periods_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert SupervisionPeriod to Agent table to KV tuples' >>
            beam.ParDo(pipeline.ConvertDictToKVTuple(),
                       'supervision_period_id'))

        output = (test_pipeline
                  | beam.Create([(fake_person_id, person_periods)])
                  | 'Identify Program Events' >> beam.ParDo(
                      pipeline.ClassifyProgramAssignments(),
                      AsDict(supervision_periods_to_agent_associations_as_kv)))

        assert_that(output, equal_to(correct_output))

        test_pipeline.run()
Example #11
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_apache_beam_pipeline_options = apache_beam_pipeline_options.get_all_options()

    query_dataset = all_apache_beam_pipeline_options[
        'project'] + '.' + data_input
    reference_dataset = all_apache_beam_pipeline_options[
        'project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load StatePersons' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set))

        # Get StateSentenceGroups
        sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load StateIncarcerationSentences' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load StateSupervisionSentences' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = f"SELECT * FROM `{reference_dataset}.us_mo_sentence_statuses`"

            us_mo_sentence_statuses = (
                p |
                "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                           use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p |
                f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        supervision_sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            supervision_sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'supervision_sentences':
                sentences_converted.supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Group each StatePerson with their related entities
        person_and_sentence_groups = (
            {
                'person': persons,
                'sentence_groups': sentence_groups_with_hydrated_sentences
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = \
            f"SELECT * FROM `{reference_dataset}.persons_to_recent_county_of_residence`"

        person_id_to_county_kv = (
            p | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_events = (
            person_and_sentence_groups
            | 'Classify Incarceration Events' >> beam.ParDo(
                ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_apache_beam_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_events
            | 'Get Incarceration Metrics' >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics | 'Convert to dict to be written to BQ' >>
            beam.ParDo(IncarcerationMetricWritableDict()).with_outputs(
                'admissions', 'populations', 'releases'))

        # Write the metrics to the output tables in BigQuery
        admissions_table = output + '.incarceration_admission_metrics'

        population_table = output + '.incarceration_population_metrics'

        releases_table = output + '.incarceration_release_metrics'

        _ = (writable_metrics.admissions
             | f"Write admission metrics to BQ table: {admissions_table}" >>
             beam.io.WriteToBigQuery(
                 table=admissions_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.populations
             | f"Write population metrics to BQ table: {population_table}" >>
             beam.io.WriteToBigQuery(
                 table=population_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.releases
             | f"Write release metrics to BQ table: {releases_table}" >>
             beam.io.WriteToBigQuery(
                 table=releases_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
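
SetSentencesOnSentenceGroup consumes the CoGroupByKey result and attaches the hydrated sentences to their groups. A sketch of that hydration step, assuming each sentence carries a sentence_group_id linking it back to its group (the matching logic is an assumption):

import apache_beam as beam

class SetSentencesOnSentenceGroup(beam.DoFn):
    def process(self, element):
        person_id, grouped = element
        incarceration = list(grouped['incarceration_sentences'])
        supervision = list(grouped['supervision_sentences'])
        for group in grouped['sentence_groups']:
            group.incarceration_sentences = [
                s for s in incarceration
                if s.sentence_group_id == group.sentence_group_id]
            group.supervision_sentences = [
                s for s in supervision
                if s.sentence_group_id == group.sentence_group_id]
            yield person_id, group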
Example #12
def run(argv=None):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, pipeline_args = parse_arguments(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = all_pipeline_options['project'] + '.' + \
        known_args.reference_input

    with beam.Pipeline(argv=pipeline_args) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            data_dict=None,
            root_schema_class=schema.StatePerson,
            root_entity_class=entities.StatePerson,
            unifying_id_field='person_id',
            build_related_entities=True))

        # Get StateProgramAssignments
        program_assignments = (
            p | 'Load Program Assignments' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateProgramAssignment,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            data_dict=None,
            root_schema_class=schema.StateAssessment,
            root_entity_class=entities.StateAssessment,
            unifying_id_field='person_id',
            build_related_entities=False))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionPeriod,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field='person_id',
                build_related_entities=False))

        supervision_period_to_agent_association_query = \
            f"SELECT * FROM `{reference_dataset}.supervision_period_to_agent_association`"

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(
                    query=supervision_period_to_agent_association_query,
                    use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        # Group each StatePerson with their other entities
        persons_entities = (
            {
                'person': persons,
                'program_assignments': program_assignments,
                'assessments': assessments,
                'supervision_periods': supervision_periods
            }
            | 'Group StatePerson to StateProgramAssignments and' >>
            beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get dimensions to include and methodologies to use
        inclusions, _ = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # The number of months to limit the monthly calculation output to
        calculation_month_limit = known_args.calculation_month_limit

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get program metrics
        program_metrics = (
            person_program_events | 'Get Program Metrics' >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=inclusions,
                calculation_month_limit=calculation_month_limit))

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                ProgramMetricWritableDict()).with_outputs('referrals'))

        # Write the metrics to the output tables in BigQuery
        referrals_table = known_args.output + '.program_referral_metrics'

        _ = (writable_metrics.referrals
             | f"Write referral metrics to BQ table: {referrals_table}" >>
             beam.io.WriteToBigQuery(
                 table=referrals_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
Example #13
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.
    # This defaults the output table in your BigQuery project; you'll have
    # to create the target dataset yourself using bq mk.
    parser.add_argument('--output', dest='output', required=False,
                        help='Output BQ table to write results to.',
                        default='lake.orders_denormalized_sideinput')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataLakeToDataMart is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_lake_to_data_mart = DataLakeToDataMart()

    # Build one pipeline; the side input below must come from the same
    # pipeline as the main source.
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_lake_to_data_mart.schema_str)

    # This query returns details about the account, normalized into a
    # different table.  We will join this data into the main orders dataset
    # to create a denormalized table.
    account_details_source = (
        p
        | 'Read Account Details from BigQuery ' >> beam.io.Read(
            beam.io.BigQuerySource(query="""
                SELECT
                  acct_number,
                  acct_company_name,
                  acct_group_name,
                  acct_name,
                  acct_org_name,
                  address,
                  city,
                  state,
                  zip_code,
                  country
                FROM
                  `qwiklabs-resources.python_dataflow_example.account`""",
                                   use_standard_sql=True))
        # This next stage of the pipeline maps the acct_number to a single row of
        # results from BigQuery.  Mapping this way helps Dataflow move your data around
        # to different workers.  When later stages of the pipeline run, all results from
        # a given account number will run on one worker.
        | 'Account Details' >> beam.Map(
            lambda row: (row['acct_number'], row)))

    orders_query = data_lake_to_data_mart.get_orders_query()
    (p
     # Read the orders from BigQuery.  This is the source of the pipeline.  All further
     # processing starts with rows read from the query results here.
     | 'Read Orders from BigQuery ' >> beam.io.Read(
        beam.io.BigQuerySource(query=orders_query, use_standard_sql=True))
     # Here we pass in a side input, which is data that comes from outside our
     # main source.  The side input contains a map of account numbers to their details.
     | 'Join Data with sideInput' >> beam.Map(data_lake_to_data_mart.add_account_details, AsDict(
        account_details_source))
     # This is the final stage of the pipeline, where we define the destination
     # of the data.  In this case we are writing to BigQuery.
     | 'Write Data to BigQuery' >> beam.io.Write(
        beam.io.BigQuerySink(
            # The table name is a required argument for the BigQuery sink.
            # In this case we use the value passed in from the command line.
            known_args.output,
            # Here we use the JSON schema read in from a JSON file.
            # Specifying the schema allows the API to create the table correctly if it does not yet exist.
            schema=schema,
            # Creates the table in BigQuery if it does not yet exist.
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            # Deletes all data in the BigQuery table before writing.
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish()
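
DataLakeToDataMart.add_account_details is the join step that consumes the AsDict side input. A sketch consistent with the comments above (the field-merging behavior is assumed):

class DataLakeToDataMart:
    def add_account_details(self, row, account_details):
        # account_details is the AsDict side input: acct_number -> account row.
        account = account_details.get(row['acct_number'])
        if account:
            row.update(account)
        return row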
Example #14
    def testIncarcerationPipeline(self):
        fake_person_id = 12345

        fake_person = schema.StatePerson(
            person_id=fake_person_id,
            gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT)

        persons_data = [normalized_database_base_dict(fake_person)]

        race_1 = schema.StatePersonRace(person_race_id=111,
                                        state_code='CA',
                                        race=Race.BLACK,
                                        person_id=fake_person_id)

        race_2 = schema.StatePersonRace(person_race_id=111,
                                        state_code='ND',
                                        race=Race.WHITE,
                                        person_id=fake_person_id)

        races_data = normalized_database_base_dict_list([race_1, race_2])

        ethnicity = schema.StatePersonEthnicity(person_ethnicity_id=111,
                                                state_code='CA',
                                                ethnicity=Ethnicity.HISPANIC,
                                                person_id=fake_person_id)

        ethnicity_data = normalized_database_base_dict_list([ethnicity])

        sentence_group = schema.StateSentenceGroup(sentence_group_id=111,
                                                   person_id=fake_person_id)

        initial_incarceration = schema.StateIncarcerationPeriod(
            incarceration_period_id=1111,
            status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
            state_code='CA',
            county_code='124',
            facility='San Quentin',
            facility_security_level=StateIncarcerationFacilitySecurityLevel.
            MAXIMUM,
            admission_reason=StateIncarcerationPeriodAdmissionReason.
            NEW_ADMISSION,
            projected_release_reason=StateIncarcerationPeriodReleaseReason.
            CONDITIONAL_RELEASE,
            admission_date=date(2008, 11, 20),
            release_date=date(2010, 12, 4),
            release_reason=StateIncarcerationPeriodReleaseReason.
            SENTENCE_SERVED,
            person_id=fake_person_id,
        )

        first_reincarceration = schema.StateIncarcerationPeriod(
            incarceration_period_id=2222,
            status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
            state_code='CA',
            county_code='124',
            facility='San Quentin',
            facility_security_level=StateIncarcerationFacilitySecurityLevel.
            MAXIMUM,
            admission_reason=StateIncarcerationPeriodAdmissionReason.
            NEW_ADMISSION,
            projected_release_reason=StateIncarcerationPeriodReleaseReason.
            CONDITIONAL_RELEASE,
            admission_date=date(2011, 4, 5),
            release_date=date(2014, 4, 14),
            release_reason=StateIncarcerationPeriodReleaseReason.
            SENTENCE_SERVED,
            person_id=fake_person_id)

        subsequent_reincarceration = schema.StateIncarcerationPeriod(
            incarceration_period_id=3333,
            status=StateIncarcerationPeriodStatus.IN_CUSTODY,
            state_code='CA',
            county_code='124',
            facility='San Quentin',
            facility_security_level=StateIncarcerationFacilitySecurityLevel.
            MAXIMUM,
            admission_reason=StateIncarcerationPeriodAdmissionReason.
            NEW_ADMISSION,
            projected_release_reason=StateIncarcerationPeriodReleaseReason.
            CONDITIONAL_RELEASE,
            admission_date=date(2017, 1, 4),
            person_id=fake_person_id)

        incarceration_sentence = schema.StateIncarcerationSentence(
            incarceration_sentence_id=1111,
            sentence_group_id=sentence_group.sentence_group_id,
            incarceration_periods=[
                initial_incarceration, first_reincarceration,
                subsequent_reincarceration
            ],
            person_id=fake_person_id)

        supervision_sentence = schema.StateSupervisionSentence(
            supervision_sentence_id=123, person_id=fake_person_id)

        sentence_group.incarceration_sentences = [incarceration_sentence]

        sentence_group_data = [normalized_database_base_dict(sentence_group)]

        incarceration_sentence_data = [
            normalized_database_base_dict(incarceration_sentence)
        ]

        supervision_sentence_data = [
            normalized_database_base_dict(supervision_sentence)
        ]

        incarceration_periods_data = [
            normalized_database_base_dict(initial_incarceration),
            normalized_database_base_dict(first_reincarceration),
            normalized_database_base_dict(subsequent_reincarceration)
        ]

        state_incarceration_sentence_incarceration_period_association = [
            {
                'incarceration_period_id':
                initial_incarceration.incarceration_period_id,
                'incarceration_sentence_id':
                incarceration_sentence.incarceration_sentence_id,
            },
            {
                'incarceration_period_id':
                first_reincarceration.incarceration_period_id,
                'incarceration_sentence_id':
                incarceration_sentence.incarceration_sentence_id,
            },
            {
                'incarceration_period_id':
                subsequent_reincarceration.incarceration_period_id,
                'incarceration_sentence_id':
                incarceration_sentence.incarceration_sentence_id,
            },
        ]

        data_dict = {
            schema.StatePerson.__tablename__:
            persons_data,
            schema.StatePersonRace.__tablename__:
            races_data,
            schema.StatePersonEthnicity.__tablename__:
            ethnicity_data,
            schema.StateSentenceGroup.__tablename__:
            sentence_group_data,
            schema.StateIncarcerationSentence.__tablename__:
            incarceration_sentence_data,
            schema.StateSupervisionSentence.__tablename__:
            supervision_sentence_data,
            schema.StateIncarcerationPeriod.__tablename__:
            incarceration_periods_data,
            schema.state_incarceration_sentence_incarceration_period_association_table.name:
            state_incarceration_sentence_incarceration_period_association,
            schema.state_supervision_sentence_incarceration_period_association_table.name:
            [{}]
        }

        test_pipeline = TestPipeline()

        # Get StatePersons
        persons = (test_pipeline
                   | 'Load Persons' >> extractor_utils.BuildRootEntity(
                       dataset=None,
                       data_dict=data_dict,
                       root_schema_class=schema.StatePerson,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field='person_id',
                       build_related_entities=True))

        # Get StateSentenceGroups
        sentence_groups = (
            test_pipeline
            | 'Load StateSentenceGroups' >> extractor_utils.BuildRootEntity(
                dataset=None,
                data_dict=data_dict,
                root_schema_class=schema.StateSentenceGroup,
                root_entity_class=entities.StateSentenceGroup,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            test_pipeline | 'Load StateIncarcerationSentences' >>
            extractor_utils.BuildRootEntity(
                dataset=None,
                data_dict=data_dict,
                root_schema_class=schema.StateIncarcerationSentence,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionSentences
        supervision_sentences = (
            test_pipeline | 'Load StateSupervisionSentences' >>
            extractor_utils.BuildRootEntity(
                dataset=None,
                data_dict=data_dict,
                root_schema_class=schema.StateSupervisionSentence,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field='person_id',
                build_related_entities=True))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Group each StatePerson with their related entities
        person_and_sentence_groups = (
            {
                'person': persons,
                'sentence_groups': sentence_groups_with_hydrated_sentences
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Identify IncarcerationEvents from the StatePerson's
        # StateIncarcerationPeriods
        fake_person_id_to_county_query_result = [{
            'person_id':
            fake_person_id,
            'county_of_residence':
            _COUNTY_OF_RESIDENCE
        }]
        person_id_to_county_kv = (
            test_pipeline
            | "Read person id to county associations from BigQuery" >>
            beam.Create(fake_person_id_to_county_query_result)
            |
            "Convert to KV" >> beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        person_events = (person_and_sentence_groups
                         | 'Classify Incarceration Events' >> beam.ParDo(
                             pipeline.ClassifyIncarcerationEvents(),
                             AsDict(person_id_to_county_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = PipelineOptions().get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_events
            | 'Get Incarceration Metrics' >> pipeline.GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=ALL_INCLUSIONS_DICT,
                calculation_month_limit=-1))

        assert_that(incarceration_metrics,
                    AssertMatchers.validate_metric_type())

        test_pipeline.run()
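
Note: ConvertDictToKVTuple is used throughout these examples but never defined in this listing. Judging from the call beam.ParDo(ConvertDictToKVTuple(), 'person_id'), it presumably keys each row dict on the named field; a minimal sketch under that assumption:

import apache_beam as beam

class ConvertDictToKVTuple(beam.DoFn):
    """Hypothetical sketch: key each row dict on the given field name."""
    def process(self, element, key_field):
        # The extra positional argument given to beam.ParDo() arrives here;
        # emitting (key, row) lets CoGroupByKey and AsDict consume the output.
        yield element[key_field], element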
Example #15
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_view_input: str, static_reference_input: str, output: str,
        calculation_month_count: int, metric_types: List[str],
        state_code: Optional[str], calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options['project']

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateProgramAssignments
        program_assignments = (
            p | 'Load Program Assignments' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        supervision_period_to_agent_associations_as_kv = (
            p | 'Load supervision_period_to_agent_associations_as_kv' >>
            ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key='supervision_period_id',
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set))

        state_race_ethnicity_population_counts = (
            p | 'Load state_race_ethnicity_population_counts' >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id='state_race_ethnicity_population_counts',
                state_code_filter=state_code,
                person_id_filter_set=None))
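
        # ImportTable / ImportTableAsKVTuples are project-specific transforms
        # that are not shown in this listing. Presumably the KV variant reads
        # the table and re-keys each row, roughly:
        #     rows | beam.Map(lambda row: (row[table_key], row))
        # so that AsDict() can later view the output as a lookup dict.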

        # Group each StatePerson with their other entities
        persons_entities = (
            {
                'person': persons,
                'program_assignments': program_assignments,
                'assessments': assessments,
                'supervision_periods': supervision_periods
            }
            | 'Group StatePerson to StateProgramAssignments and related entities'
            >> beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_program_events_with_metadata = (
            {
                'person_events': person_program_events,
                'person_metadata': person_metadata
            }
            | 'Group ProgramEvents with person-level metadata' >>
            beam.CoGroupByKey()
            | 'Organize StatePerson, PersonMetadata and ProgramEvents for calculations' >>
            beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get program metrics
        program_metrics = (
            person_program_events_with_metadata | 'Get Program Metrics' >>
            GetProgramMetrics(pipeline_options=all_pipeline_options,
                              metric_types=metric_types_set,
                              calculation_end_month=calculation_end_month,
                              calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivizMetricWritableDict()).with_outputs(
                    ProgramMetricType.PROGRAM_PARTICIPATION.value,
                    ProgramMetricType.PROGRAM_REFERRAL.value))

        # Write the metrics to the output tables in BigQuery
        referrals_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ProgramReferralMetric)
        participation_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ProgramParticipationMetric)

        _ = (writable_metrics.PROGRAM_REFERRAL
             | f"Write referral metrics to BQ table: {referrals_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=referrals_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.PROGRAM_PARTICIPATION
             | f"Write participation metrics to BQ table: {participation_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=participation_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
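
The with_outputs() call above declares one tagged output per metric type, and each tag is then read back as an attribute (writable_metrics.PROGRAM_REFERRAL). A DoFn feeds those tags by yielding pvalue.TaggedOutput; a minimal sketch of such routing follows (the real RecidivizMetricWritableDict is not shown in this listing, and to_bq_row() is a hypothetical helper):

import apache_beam as beam
from apache_beam import pvalue

class RouteMetricByType(beam.DoFn):
    """Hypothetical sketch: route each metric to a tag named after its type."""
    def process(self, metric):
        # Tag the element so with_outputs() can split the PCollection by tag.
        yield pvalue.TaggedOutput(metric.metric_type.value, metric.to_bq_row())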
Example #16
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set))

        # Get StateProgramAssignments
        program_assignments = (
            p | 'Load Program Assignments' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        supervision_period_to_agent_association_query = \
            f"SELECT * FROM `{reference_dataset}.supervision_period_to_agent_association`"

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(
                    query=supervision_period_to_agent_association_query,
                    use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        # Group each StatePerson with their other entities
        persons_entities = (
            {
                'person': persons,
                'program_assignments': program_assignments,
                'assessments': assessments,
                'supervision_periods': supervision_periods
            }
            | 'Group StatePerson to StateProgramAssignments and related entities'
            >> beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get program metrics
        program_metrics = (
            person_program_events | 'Get Program Metrics' >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                ProgramMetricWritableDict()).with_outputs('referrals'))

        # Write the metrics to the output tables in BigQuery
        referrals_table = output + '.program_referral_metrics'

        _ = (writable_metrics.referrals
             | f"Write referral metrics to BQ table: {referrals_table}" >>
             beam.io.WriteToBigQuery(
                 table=referrals_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
Example #17
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_view_input: str,
        static_reference_input: str,
        output: str,
        metric_types: List[str],
        state_code: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options['project']

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p
                   | 'Load Persons' >>
                   BuildRootEntity(dataset=input_dataset, root_entity_class=entities.StatePerson,
                                   unifying_id_field=entities.StatePerson.get_class_id_name(),
                                   build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set,
                                   state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (p
                                 | 'Load IncarcerationPeriods' >>
                                 BuildRootEntity(dataset=input_dataset,
                                                 root_entity_class=entities.StateIncarcerationPeriod,
                                                 unifying_id_field=entities.StatePerson.get_class_id_name(),
                                                 build_related_entities=True,
                                                 unifying_id_field_filter_set=person_id_filter_set,
                                                 state_code=state_code
                                                 ))

        # Get StateSupervisionViolations
        supervision_violations = \
            (p
             | 'Load SupervisionViolations' >>
             BuildRootEntity(dataset=input_dataset, root_entity_class=entities.StateSupervisionViolation,
                             unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True,
                             unifying_id_field_filter_set=person_id_filter_set,
                             state_code=state_code
                             ))

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = \
            (p
             | 'Load SupervisionViolationResponses' >>
             BuildRootEntity(dataset=input_dataset, root_entity_class=entities.StateSupervisionViolationResponse,
                             unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True,
                             unifying_id_field_filter_set=person_id_filter_set,
                             state_code=state_code
                             ))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {'violations': supervision_violations,
             'violation_responses': supervision_violation_responses
             } | 'Group StateSupervisionViolationResponses to '
                 'StateSupervisionViolations' >>
            beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >>
            beam.ParDo(SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {'incarceration_periods': incarceration_periods,
             'violation_responses':
                 violation_responses_with_hydrated_violations}
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >>
            beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >>
            beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {'person': persons,
             'incarceration_periods':
                 incarceration_periods_with_source_violations}
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey()
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (p | 'Load person_id_to_county_kv' >> ImportTableAsKVTuples(
            dataset_id=reference_dataset,
            table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            table_key='person_id',
            state_code_filter=state_code,
            person_id_filter_set=person_id_filter_set
        ))

        state_race_ethnicity_population_counts = (
            p | 'Load state_race_ethnicity_population_counts' >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id='state_race_ethnicity_population_counts',
                state_code_filter=state_code,
                person_id_filter_set=None))

        # Identify ReleaseEvents events from the StatePerson's StateIncarcerationPeriods
        person_release_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >>
            beam.ParDo(ClassifyReleaseEvents(), AsDict(person_id_to_county_kv))
        )

        person_metadata = (persons
                           | "Build the person_metadata dictionary" >>
                           beam.ParDo(BuildPersonMetadata(),
                                      AsList(state_race_ethnicity_population_counts)))

        person_release_events_with_metadata = (
            {
                'person_events': person_release_events,
                'person_metadata': person_metadata
            }
            | 'Group ReleaseEvents with person-level metadata' >> beam.CoGroupByKey()
            | 'Organize StatePerson, PersonMetadata and ReleaseEvents for calculations' >>
            beam.ParDo(ExtractPersonReleaseEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (person_release_events_with_metadata
                              | 'Get Recidivism Metrics' >>
                              GetRecidivismMetrics(
                                  pipeline_options=all_pipeline_options,
                                  metric_types=metric_types_set))

        if person_id_filter_set:
            logging.warning("Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (recidivism_metrics
                            | 'Convert to dict to be written to BQ' >>
                            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                                ReincarcerationRecidivismMetricType.REINCARCERATION_RATE.value,
                                ReincarcerationRecidivismMetricType.REINCARCERATION_COUNT.value
                            ))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES.get(ReincarcerationRecidivismRateMetric)
        counts_table_id = DATAFLOW_METRICS_TO_TABLES.get(ReincarcerationRecidivismCountMetric)

        _ = (writable_metrics.REINCARCERATION_RATE
             | f"Write rate metrics to BQ table: {rates_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS
             ))

        _ = (writable_metrics.REINCARCERATION_COUNT
             | f"Write count metrics to BQ table: {counts_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS
             ))
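
Example #17 passes reference data through two different side-input views: AsDict for keyed lookups inside ClassifyReleaseEvents, and AsList for iterating the whole collection inside BuildPersonMetadata. A self-contained sketch of the three common views (the data here is illustrative):

import apache_beam as beam
from apache_beam.pvalue import AsDict, AsList, AsSingleton

with beam.Pipeline() as p:
    kv = p | 'kv' >> beam.Create([('a', 1), ('b', 2)])
    nums = p | 'nums' >> beam.Create([10, 20])
    one = p | 'one' >> beam.Create([100])
    _ = (p | 'main' >> beam.Create(['a', 'b'])
         | beam.Map(lambda k, d, lst, s: (k, d[k], sorted(lst), s),
                    AsDict(kv), AsList(nums), AsSingleton(one))
         | beam.Map(print))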
Example #18
def process_datastore_tweets(project, dataset, pipeline_options):
    """Creates a pipeline that reads tweets from Cloud Datastore from the last
  N days. The pipeline finds the top most-used words, the top most-tweeted
  URLs, ranks word co-occurrences by an 'interestingness' metric (similar to
  on tf* idf).
  """
    ts = str(datetime.datetime.utcnow())
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    query = make_query('Tweet')

    # Read entities from Cloud Datastore into a PCollection.
    lines = (p
             | 'read from datastore' >> ReadFromDatastore(project, query, None))

    global_count = AsSingleton(
        lines
        | 'global count' >> beam.combiners.Count.Globally())

    # Count the occurrences of each word.
    percents = (lines
                | 'split' >>
                (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
                | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                | 'group' >> beam.GroupByKey()
                | 'count' >> beam.Map(lambda kv: (kv[0], sum(kv[1])))
                | 'in tweets percent' >> beam.Map(
                    lambda kv, gc: (kv[0], float(kv[1]) / gc), global_count))
    top_percents = (
        percents
        | 'top 500' >> combiners.Top.Of(500, lambda x, y: x[1] < y[1]))
    # Count the occurrences of each expanded url in the tweets
    url_counts = (
        lines
        | 'geturls' >>
        (beam.ParDo(URLExtractingDoFn()).with_output_types(str))
        | 'urls_pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'urls_group' >> beam.GroupByKey()
        | 'urls_count' >> beam.Map(lambda kv: (kv[0], sum(kv[1])))
        | 'urls top 300' >> combiners.Top.Of(300, lambda x, y: x[1] < y[1]))

    # Define some inline helper functions.

    def join_cinfo(cooccur, percents):
        """Calculate a co-occurence ranking."""
        import math

        word1 = cooccur[0][0]
        word2 = cooccur[0][1]
        try:
            word1_percent = percents[word1]
            weight1 = 1 / word1_percent
            word2_percent = percents[word2]
            weight2 = 1 / word2_percent
            return (cooccur[0], cooccur[1],
                    cooccur[1] * math.log(min(weight1, weight2)))
        except (KeyError, ZeroDivisionError):
            return 0
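
    # For a concrete sense of join_cinfo's ranking (illustrative numbers):
    # percents = {'beam': 0.10, 'flink': 0.02} gives weights 1/0.10 = 10.0 and
    # 1/0.02 = 50.0, so a pair seen together 5 times scores
    # 5 * math.log(min(10.0, 50.0)) = 5 * ln(10), about 11.51; the min() caps
    # the score at the rarity of the pair's more common word.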

    def generate_cooccur_schema():
        """BigQuery schema for the word co-occurrence table."""
        json_str = json.dumps({
            'fields': [{
                'name': 'w1',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'w2',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'count',
                'type': 'INTEGER',
                'mode': 'NULLABLE'
            }, {
                'name': 'log_weight',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            }, {
                'name': 'ts',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }]
        })
        return parse_table_schema_from_json(json_str)

    def generate_url_schema():
        """BigQuery schema for the urls count table."""
        json_str = json.dumps({
            'fields': [{
                'name': 'url',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'count',
                'type': 'INTEGER',
                'mode': 'NULLABLE'
            }, {
                'name': 'ts',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }]
        })
        return parse_table_schema_from_json(json_str)

    def generate_wc_schema():
        """BigQuery schema for the word count table."""
        json_str = json.dumps({
            'fields': [{
                'name': 'word',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'percent',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            }, {
                'name': 'ts',
                'type': 'TIMESTAMP',
                'mode': 'NULLABLE'
            }]
        })
        return parse_table_schema_from_json(json_str)
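
    # (A compact alternative accepted by Beam's BigQuery sinks, as used
    # elsewhere in this listing, is a schema string such as
    # 'word:STRING,percent:FLOAT,ts:TIMESTAMP'; the JSON form above is mainly
    # useful when field modes need to be set explicitly.)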

    # Now build the rest of the pipeline.
    # Calculate the word co-occurence scores.
    cooccur_rankings = (
        lines
        | 'getcooccur' >> (beam.ParDo(CoOccurExtractingDoFn()))
        | 'co_pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'co_group' >> beam.GroupByKey()
        | 'co_count' >> beam.Map(lambda kv: (kv[0], sum(kv[1])))
        | 'weights' >> beam.Map(join_cinfo, AsDict(percents))
        | 'co top 300' >> combiners.Top.Of(300, lambda x, y: x[2] < y[2]))

    # Format the counts into a PCollection of strings.
    wc_records = top_percents | 'format' >> beam.FlatMap(
        lambda x: [{
            'word': xx[0],
            'percent': xx[1],
            'ts': ts
        } for xx in x])

    url_records = url_counts | 'urls_format' >> beam.FlatMap(
        lambda x: [{
            'url': xx[0],
            'count': xx[1],
            'ts': ts
        } for xx in x])

    co_records = cooccur_rankings | 'co_format' >> beam.FlatMap(
        lambda x: [{
            'w1': xx[0][0],
            'w2': xx[0][1],
            'count': xx[1],
            'log_weight': xx[2],
            'ts': ts
        } for xx in x])

    # Write the results to three BigQuery tables.
    wc_records | 'wc_write_bq' >> beam.io.Write(
        beam.io.BigQuerySink(
            '%s:%s.word_counts' % (project, dataset),
            schema=generate_wc_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    url_records | 'urls_write_bq' >> beam.io.Write(
        beam.io.BigQuerySink(
            '%s:%s.urls' % (project, dataset),
            schema=generate_url_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    co_records | 'co_write_bq' >> beam.io.Write(
        beam.io.BigQuerySink(
            '%s:%s.word_cooccur' % (project, dataset),
            schema=generate_cooccur_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    # Actually run the pipeline.
    return p.run()
Example #19
    def testClassifyProgramAssignments(self):
        """Tests the ClassifyProgramAssignments DoFn."""
        fake_person_id = 12345

        fake_person = entities.StatePerson.new_with_defaults(
            person_id=fake_person_id, gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT
        )

        program_assignment = entities.StateProgramAssignment.new_with_defaults(
            state_code='US_XX',
            program_id='PG3',
            program_location_id='XYZ',
            referral_date=date(2009, 10, 3),
            participation_status=StateProgramAssignmentParticipationStatus.IN_PROGRESS,
            start_date=date(2009, 10, 19)
        )

        assessment = entities.StateAssessment.new_with_defaults(
            state_code='US_XX',
            assessment_type=StateAssessmentType.ORAS,
            assessment_score=33,
            assessment_date=date(2009, 7, 10)
        )

        supervision_period = \
            entities.StateSupervisionPeriod.new_with_defaults(
                supervision_period_id=111,
                status=StateSupervisionPeriodStatus.TERMINATED,
                state_code='US_XX',
                start_date=date(2008, 3, 5),
                supervision_type=StateSupervisionType.PAROLE
            )

        person_periods = {'person': [fake_person],
                          'program_assignments': [program_assignment],
                          'assessments': [assessment],
                          'supervision_periods': [supervision_period]
                          }

        program_events = [ProgramReferralEvent(
            state_code=program_assignment.state_code,
            program_id=program_assignment.program_id,
            event_date=program_assignment.referral_date,
            participation_status=program_assignment.participation_status,
            assessment_score=33,
            assessment_type=StateAssessmentType.ORAS,
            supervision_type=supervision_period.supervision_type,
            supervising_officer_external_id='OFFICER0009',
            supervising_district_external_id='10'
        ), ProgramParticipationEvent(
            state_code=program_assignment.state_code,
            program_id=program_assignment.program_id,
            program_location_id=program_assignment.program_location_id,
            event_date=date.today(),
            is_first_day_in_program=True,
            supervision_type=supervision_period.supervision_type
        )]

        correct_output = [(fake_person.person_id, (fake_person, program_events))]

        test_pipeline = TestPipeline()

        supervision_period_to_agent_map = {
            'agent_id': 1010,
            'agent_external_id': 'OFFICER0009',
            'district_external_id': '10',
            'supervision_period_id':
                supervision_period.supervision_period_id
        }

        supervision_period_to_agent_associations = (
            test_pipeline
            | 'Create SupervisionPeriod to Agent table' >>
            beam.Create([supervision_period_to_agent_map])
        )

        supervision_periods_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations |
            'Convert SupervisionPeriod to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(),
                       'supervision_period_id')
        )

        output = (test_pipeline
                  | beam.Create([(fake_person_id,
                                  person_periods)])
                  | 'Identify Program Events' >>
                  beam.ParDo(
                      pipeline.ClassifyProgramAssignments(),
                      AsDict(supervision_periods_to_agent_associations_as_kv))
                  )

        assert_that(output, equal_to(correct_output))

        test_pipeline.run()
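
The test above wires a realistic pipeline just to exercise one DoFn. The same AsDict side-input pattern can be checked with a much smaller self-contained test, sketched here with illustrative data:

import apache_beam as beam
from apache_beam.pvalue import AsDict
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

def test_asdict_side_input():
    with TestPipeline() as p:
        lookup = p | 'lookup' >> beam.Create([('a', 1), ('b', 2)])
        main = p | 'main' >> beam.Create(['a', 'b'])
        # Look each key up in the dict view of the side input.
        joined = main | beam.Map(lambda k, d: (k, d[k]), AsDict(lookup))
        assert_that(joined, equal_to([('a', 1), ('b', 2)]))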
Example #20
def run(argv=None):
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, pipeline_args = parse_arguments(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = all_pipeline_options['project'] + '.' + \
        known_args.reference_input

    with beam.Pipeline(argv=pipeline_args) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            data_dict=None,
            root_schema_class=schema.StatePerson,
            root_entity_class=entities.StatePerson,
            unifying_id_field='person_id',
            build_related_entities=True))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateIncarcerationPeriod,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionViolations
        supervision_violations = (
            p | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionViolation,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field='person_id',
                build_related_entities=True))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionViolationResponse,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load SupervisionSentences' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionSentence,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load IncarcerationSentences' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateIncarcerationSentence,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionPeriod,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            data_dict=None,
            root_schema_class=schema.StateAssessment,
            root_entity_class=entities.StateAssessment,
            unifying_id_field='person_id',
            build_related_entities=False))

        # Bring in the table that associates StateSupervisionViolationResponses to information about StateAgents
        ssvr_to_agent_association_query = f"SELECT * FROM `{reference_dataset}.ssvr_to_agent_association`"

        ssvr_to_agent_associations = (
            p | "Read SSVR to Agent table from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=ssvr_to_agent_association_query,
                                       use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the
        # supervision_violation_response_id column as the key
        ssvr_agent_associations_as_kv = (
            ssvr_to_agent_associations
            | 'Convert SSVR to Agent table to KV tuples' >> beam.ParDo(
                ConvertDictToKVTuple(), 'supervision_violation_response_id'))

        supervision_period_to_agent_association_query = f"SELECT * FROM `{reference_dataset}." \
                                                        f"supervision_period_to_agent_association`"

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(
                    query=supervision_period_to_agent_association_query,
                    use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            } | 'Group StateSupervisionViolationResponses to '
            'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
            'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods and StateSupervisionSentences
        person_periods_and_sentences = (
            {
                'person': persons,
                'assessments': assessments,
                'incarceration_periods':
                incarceration_periods_with_source_violations,
                'supervision_periods': supervision_periods,
                'supervision_sentences': supervision_sentences,
                'incarceration_sentences': incarceration_sentences,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StatePerson to all entities' >> beam.CoGroupByKey())

        # The state_code to run calculations on
        state_code = known_args.state_code

        identifier_options = {'state_code': state_code}

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (
            person_periods_and_sentences
            | 'Get SupervisionTimeBuckets' >> beam.ParDo(
                ClassifySupervisionTimeBuckets(),
                AsDict(ssvr_agent_associations_as_kv),
                AsDict(supervision_period_to_agent_associations_as_kv),
                **identifier_options))
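
        # Extra positional arguments to beam.ParDo (the two AsDict side inputs)
        # and the **identifier_options kwargs are forwarded to the DoFn's
        # process() method, so ClassifySupervisionTimeBuckets presumably has a
        # signature roughly like (hypothetical):
        #     def process(self, element, ssvr_agent_map, sp_agent_map,
        #                 state_code=None):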

        # Get dimensions to include and methodologies to use
        inclusions, _ = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_type = known_args.metric_type

        # The number of months to limit the monthly calculation output to
        calculation_month_limit = known_args.calculation_month_limit

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets
            | 'Get Supervision Metrics' >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=inclusions,
                metric_type=metric_type,
                calculation_month_limit=calculation_month_limit))

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics | 'Convert to dict to be written to BQ' >>
            beam.ParDo(SupervisionMetricWritableDict()).with_outputs(
                'populations', 'revocations', 'successes',
                'assessment_changes', 'revocation_analyses',
                'revocation_violation_type_analyses'))

        # Write the metrics to the output tables in BigQuery
        populations_table = known_args.output + '.supervision_population_metrics'

        revocations_table = known_args.output + '.supervision_revocation_metrics'

        successes_table = known_args.output + '.supervision_success_metrics'

        assessment_changes_table = known_args.output + '.terminated_supervision_assessment_score_change_metrics'

        revocation_analysis_table = known_args.output + '.supervision_revocation_analysis_metrics'

        revocation_violation_type_analysis_table = known_args.output + \
            '.supervision_revocation_violation_type_analysis_metrics'

        _ = (writable_metrics.populations
             | f"Write population metrics to BQ table: {populations_table}" >>
             beam.io.WriteToBigQuery(
                 table=populations_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.revocations
             | f"Write revocation metrics to BQ table: {revocations_table}" >>
             beam.io.WriteToBigQuery(
                 table=revocations_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.successes
             | f"Write success metrics to BQ table: {successes_table}" >>
             beam.io.WriteToBigQuery(
                 table=successes_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.assessment_changes
             | f"Write assessment change metrics to BQ table: {assessment_changes_table}" >>
             beam.io.WriteToBigQuery(
                 table=assessment_changes_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.revocation_analyses
             | f"Write revocation analyses metrics to BQ table: {revocation_analysis_table}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_analysis_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.revocation_violation_type_analyses
             | f"Write revocation violation type analyses metrics to BQ table: "
               f"{revocation_violation_type_analysis_table}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_violation_type_analysis_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
Example #21
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, metric_types: List[str],
        state_code: Optional[str], person_filter_ids: Optional[List[int]]):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (
            p
            | 'Load Persons' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StatePerson,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p
            | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = \
            (p
             | 'Load SupervisionViolations' >>
             BuildRootEntity(dataset=query_dataset, root_entity_class=entities.StateSupervisionViolation,
                             unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True,
                             unifying_id_field_filter_set=person_id_filter_set,
                             state_code=state_code
                             ))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = \
            (p
             | 'Load SupervisionViolationResponses' >>
             BuildRootEntity(dataset=query_dataset, root_entity_class=entities.StateSupervisionViolationResponse,
                             unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True,
                             unifying_id_field_filter_set=person_id_filter_set,
                             state_code=state_code
                             ))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            } | 'Group StateSupervisionViolationResponses to '
            'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
            'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {
                'person':
                persons,
                'incarceration_periods':
                incarceration_periods_with_source_violations
            }
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey())

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = select_all_by_person_query(
            reference_dataset,
            PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            # TODO(3602): Once we put state_code on StatePerson objects, we can update the
            # persons_to_recent_county_of_residence query to have a state_code field, allowing us to also filter the
            # output by state_code.
            state_code_filter=None,
            person_id_filter_set=person_id_filter_set)

        person_id_to_county_kv = (
            p
            | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Identify ReleaseEvents events from the StatePerson's
        # StateIncarcerationPeriods
        person_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >> beam.ParDo(
                ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_events
            | 'Get Recidivism Metrics' >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            recidivism_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivismMetricWritableDict()).with_outputs(
                    'rates', 'counts'))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismRateMetric)
        counts_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismCountMetric)

        _ = (writable_metrics.rates
             | f"Write rate metrics to BQ table: {rates_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.counts
             | f"Write count metrics to BQ table: {counts_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
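
The pipelines in these examples repeatedly turn BigQuery result rows into (key, row) tuples with a ConvertDictToKVTuple DoFn before grouping them or materializing them as AsDict side inputs. The real DoFn is not shown in this snippet; a minimal sketch, assuming each element is a plain dict and the key field name is passed as a positional argument to process():

import apache_beam as beam

class ConvertDictToKVTuple(beam.DoFn):
    """Sketch: emit (row[key_field], row) for each input dict."""

    def process(self, element, key_field):
        # Skip rows that lack the key field instead of raising.
        if key_field in element:
            yield element[key_field], element
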
Example #22
0
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_apache_beam_pipeline_options = apache_beam_pipeline_options.get_all_options(
    )

    query_dataset = all_apache_beam_pipeline_options[
        'project'] + '.' + data_input
    reference_dataset = all_apache_beam_pipeline_options[
        'project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load StatePersons' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set))

        # Get StateSentenceGroups
        sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load StateIncarcerationSentences' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load StateSupervisionSentences' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code, person_id_filter_set)

            us_mo_sentence_statuses = (
                p |
                "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                           use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p |
                f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'supervision_sentences':
                sentences_converted.supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = select_all_by_person_query(
            reference_dataset,
            PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            # TODO(3602): Once we put state_code on StatePerson objects, we can update the
            # persons_to_recent_county_of_residence query to have a state_code field, allowing us to also filter the
            # output by state_code.
            state_code_filter=None,
            person_id_filter_set=person_id_filter_set)

        person_id_to_county_kv = (
            p | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Bring in the judicial districts associated with incarceration_periods
        ip_to_judicial_district_query = select_all_by_person_query(
            reference_dataset,
            INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
            state_code, person_id_filter_set)

        ip_to_judicial_district_kv = (
            p |
            "Read incarceration_period to judicial_district associations from BigQuery"
            >> beam.io.Read(
                beam.io.BigQuerySource(query=ip_to_judicial_district_query,
                                       use_standard_sql=True))
            |
            "Convert incarceration_period to judicial_district association table to KV"
            >> beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person':
                persons,
                'sentence_groups':
                sentence_groups_with_hydrated_sentences,
                'incarceration_period_judicial_district_association':
                ip_to_judicial_district_kv
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_events = (
            person_entities | 'Classify Incarceration Events' >> beam.ParDo(
                ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_events
            | 'Get Incarceration Metrics' >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                IncarcerationMetricType.INCARCERATION_ADMISSION.value,
                IncarcerationMetricType.INCARCERATION_POPULATION.value,
                IncarcerationMetricType.INCARCERATION_RELEASE.value))

        # Write the metrics to the output tables in BigQuery
        admissions_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationAdmissionMetric)
        population_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationPopulationMetric)
        releases_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationReleaseMetric)

        _ = (writable_metrics.INCARCERATION_ADMISSION
             | f"Write admission metrics to BQ table: {admissions_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=admissions_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_POPULATION
             | f"Write population metrics to BQ table: {population_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=population_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_RELEASE
             | f"Write release metrics to BQ table: {releases_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=releases_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
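
The metric-writing stage above relies on with_outputs() to split one ParDo into several tagged PCollections, each addressed later as an attribute (writable_metrics.INCARCERATION_ADMISSION and so on). A self-contained sketch of the same pattern, with illustrative names that are not from the pipeline:

import apache_beam as beam

class SplitByParity(beam.DoFn):
    def process(self, n):
        # Route each element to the 'even' or 'odd' tagged output.
        tag = 'even' if n % 2 == 0 else 'odd'
        yield beam.pvalue.TaggedOutput(tag, n)

with beam.Pipeline() as p:
    split = (p
             | beam.Create([1, 2, 3, 4])
             | beam.ParDo(SplitByParity()).with_outputs('even', 'odd'))
    _ = split.even | 'print evens' >> beam.Map(print)
    _ = split.odd | 'print odds' >> beam.Map(print)
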
Example #23
0
    def run_test_pipeline(
            self,
            dataset: str,
            fake_supervision_period_id: int,
            unifying_id_field_filter_set: Optional[Set[int]] = None,
            metric_types_filter: Optional[Set[str]] = None):
        """Runs a test version of the program pipeline."""
        test_pipeline = TestPipeline()

        # Get StatePersons
        persons = (
            test_pipeline
            | 'Load Persons' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StatePerson,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True))

        # Get StateProgramAssignments
        program_assignments = (
            test_pipeline
            | 'Load Program Assignments' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateAssessments
        assessments = (
            test_pipeline
            | 'Load Assessments' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateAssessment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateSupervisionPeriods
        supervision_periods = (
            test_pipeline
            | 'Load SupervisionPeriods' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        supervision_period_to_agent_map = {
            'agent_id': 1010,
            'agent_external_id': 'OFFICER0009',
            'district_external_id': '10',
            'supervision_period_id': fake_supervision_period_id
        }

        supervision_period_to_agent_associations = (
            test_pipeline
            | 'Create SupervisionPeriod to Agent table' >> beam.Create(
                [supervision_period_to_agent_map]))

        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert SupervisionPeriod to Agent table to KV tuples' >>
            beam.ParDo(pipeline.ConvertDictToKVTuple(),
                       'supervision_period_id'))

        # Group each StatePerson with their other entities
        persons_entities = (
            {
                'person': persons,
                'program_assignments': program_assignments,
                'assessments': assessments,
                'supervision_periods': supervision_periods
            }
            | 'Group StatePerson to StateProgramAssignments and '
            'related entities' >> beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's
        # StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                pipeline.ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = PipelineOptions().get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        metric_types = metric_types_filter if metric_types_filter else {'ALL'}

        # Get program metrics
        program_metrics = (
            person_program_events
            | 'Get Program Metrics' >>  # type: ignore
            pipeline.GetProgramMetrics(pipeline_options=all_pipeline_options,
                                       metric_types=metric_types,
                                       calculation_end_month=None,
                                       calculation_month_count=-1))

        assert_that(program_metrics, AssertMatchers.validate_pipeline_test())

        test_pipeline.run()
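
ClassifyProgramAssignments above consumes the agent associations through AsDict, which materializes a keyed PCollection into an ordinary Python dict available inside process(). A toy illustration of the mechanism (the names here are ours, not the pipeline's):

import apache_beam as beam
from apache_beam.pvalue import AsDict

class LookupAgent(beam.DoFn):
    def process(self, period_id, agent_by_period):
        # agent_by_period is a plain dict built from the side input.
        yield period_id, agent_by_period.get(period_id, 'UNASSIGNED')

with beam.Pipeline() as p:
    agents = p | 'agents' >> beam.Create([(9, 'OFFICER0009')])
    periods = p | 'periods' >> beam.Create([9, 10])
    _ = (periods
         | beam.ParDo(LookupAgent(), AsDict(agents))
         | beam.Map(print))
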
Example #24
0
    def run_test_pipeline(
            fake_person_id: int,
            state_code: str,
            dataset: str,
            expected_metric_types: Set[IncarcerationMetricType],
            allow_empty: bool = False,
            unifying_id_field_filter_set: Optional[Set[int]] = None,
            metric_types_filter: Optional[Set[str]] = None):
        """Runs a test version of the incarceration pipeline."""
        test_pipeline = TestPipeline()

        # Get StatePersons
        persons = (
            test_pipeline
            | 'Load Persons' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StatePerson,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True))

        # Get StateSentenceGroups
        sentence_groups = (
            test_pipeline
            | 'Load StateSentenceGroups' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateSentenceGroup,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            test_pipeline
            | 'Load StateIncarcerationSentences' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateSupervisionSentences
        supervision_sentences = (
            test_pipeline | 'Load StateSupervisionSentences' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        us_mo_sentence_status_rows: List[Dict[str, Any]] = [{
            'person_id': fake_person_id,
            'sentence_external_id': 'XXX',
            'sentence_status_external_id': 'YYY',
            'status_code': 'ZZZ',
            'status_date': 'not_a_date',
            'status_description': 'XYZ'
        }]

        us_mo_sentence_statuses = (test_pipeline
                                   | 'Create MO sentence statuses' >>
                                   beam.Create(us_mo_sentence_status_rows))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'supervision_sentences':
                sentences_converted.supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Create a fake person_id to county-of-residence association row
        fake_person_id_to_county_query_result = [{
            'person_id': fake_person_id,
            'county_of_residence': _COUNTY_OF_RESIDENCE
        }]
        person_id_to_county_kv = (
            test_pipeline
            | "Read person id to county associations from BigQuery" >>
            beam.Create(fake_person_id_to_county_query_result)
            | "Convert person_id to counties to KV" >> beam.ParDo(
                ConvertDictToKVTuple(), 'person_id'))

        incarceration_period_judicial_district_association_row = {
            'person_id': fake_person_id,
            'incarceration_period_id': 123,
            'judicial_district_code': 'NW'
        }

        ip_to_judicial_district_kv = (
            test_pipeline
            |
            "Read incarceration_period to judicial_district associations from BigQuery"
            >> beam.Create(
                [incarceration_period_judicial_district_association_row])
            | "Convert ips to judicial districts to KV" >> beam.ParDo(
                ConvertDictToKVTuple(), 'person_id'))

        state_race_ethnicity_population_count = {
            'state_code': state_code,
            'race_or_ethnicity': 'BLACK',
            'population_count': 1,
            'representation_priority': 1
        }

        state_race_ethnicity_population_counts = (
            test_pipeline
            | 'Create state_race_ethnicity_population_count table' >>
            beam.Create([state_race_ethnicity_population_count]))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person':
                persons,
                'sentence_groups':
                sentence_groups_with_hydrated_sentences,
                'incarceration_period_judicial_district_association':
                ip_to_judicial_district_kv
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_incarceration_events = (
            person_entities | 'Classify Incarceration Events' >> beam.ParDo(
                pipeline.ClassifyIncarcerationEvents(),
                AsDict(person_id_to_county_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_incarceration_events_with_metadata = (
            {
                'person_events': person_incarceration_events,
                'person_metadata': person_metadata
            }
            | 'Group IncarcerationEvents with person-level metadata' >>
            beam.CoGroupByKey()
            |
            'Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations'
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = PipelineOptions().get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        metric_types = metric_types_filter if metric_types_filter else {'ALL'}

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_incarceration_events_with_metadata
            | 'Get Incarceration Metrics' >>  # type: ignore
            pipeline.GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types,
                calculation_end_month=None,
                calculation_month_count=-1))

        assert_that(
            incarceration_metrics,
            AssertMatchers.validate_metric_type(allow_empty=allow_empty),
            'Assert that all metrics are of the expected type.')

        assert_that(
            incarceration_metrics,
            AssertMatchers.validate_pipeline_test(expected_metric_types),
            'Assert the type of metrics produced are expected')

        test_pipeline.run()
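
Several steps above join keyed PCollections with CoGroupByKey, which emits one (key, {tag: iterable}) pair per key. A small sketch of the output shape on toy data:

import apache_beam as beam

with beam.Pipeline() as p:
    sentences = p | 'sentences' >> beam.Create([(12345, 'sentence_a')])
    statuses = p | 'statuses' >> beam.Create([(12345, 'status_x'),
                                              (12345, 'status_y')])
    # Prints roughly:
    # (12345, {'sentences': ['sentence_a'],
    #          'statuses': ['status_x', 'status_y']})
    _ = ({'sentences': sentences, 'statuses': statuses}
         | beam.CoGroupByKey()
         | beam.Map(print))
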
Example #25
0
    def testClassifyProgramAssignments_NoSupervision(self):
        """Tests the ClassifyProgramAssignments DoFn."""
        fake_person_id = 12345

        fake_person = entities.StatePerson.new_with_defaults(
            person_id=fake_person_id,
            gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT)

        program_assignment = entities.StateProgramAssignment.new_with_defaults(
            state_code='US_CA',
            program_id='PG3',
            referral_date=date(2009, 10, 3))

        assessment = entities.StateAssessment.new_with_defaults(
            state_code='US_CA',
            assessment_type=StateAssessmentType.ORAS,
            assessment_score=33,
            assessment_date=date(2009, 7, 10))

        person_periods = {
            'person': [fake_person],
            'program_assignments': [program_assignment],
            'assessments': [assessment],
            'supervision_periods': []
        }

        program_event = ProgramReferralEvent(
            state_code=program_assignment.state_code,
            program_id=program_assignment.program_id,
            event_date=program_assignment.referral_date,
            assessment_score=33,
            assessment_type=StateAssessmentType.ORAS,
        )

        correct_output = [(fake_person, [program_event])]

        test_pipeline = TestPipeline()

        supervision_period_to_agent_map = {'fake': 'map'}

        supervision_period_to_agent_associations = (
            test_pipeline
            | 'Create SupervisionPeriod to Agent table' >> beam.Create(
                [supervision_period_to_agent_map]))

        supervision_periods_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert SupervisionPeriod to Agent table to KV tuples' >>
            beam.ParDo(pipeline.ConvertDictToKVTuple(),
                       'supervision_period_id'))

        output = (test_pipeline
                  | beam.Create([(fake_person_id, person_periods)])
                  | 'Identify Program Events' >> beam.ParDo(
                      pipeline.ClassifyProgramAssignments(),
                      AsDict(supervision_periods_to_agent_associations_as_kv)))

        assert_that(output, equal_to(correct_output))

        test_pipeline.run()
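
The test above exercises a DoFn end to end with TestPipeline, assert_that and equal_to; the assertion is itself a pipeline step and runs when the pipeline executes. A minimal standalone version of the pattern:

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as p:
    doubled = p | beam.Create([1, 2]) | beam.Map(lambda x: x * 2)
    # Checked during pipeline execution; a mismatch fails the test.
    assert_that(doubled, equal_to([2, 4]))
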
Example #26
0
    def testClassifyIncarcerationEvents(self):
        """Tests the ClassifyIncarcerationEvents DoFn."""
        fake_person_id = 12345

        fake_person = StatePerson.new_with_defaults(
            state_code='US_XX',
            person_id=fake_person_id,
            gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT)

        incarceration_period = StateIncarcerationPeriod.new_with_defaults(
            incarceration_period_id=1111,
            incarceration_type=StateIncarcerationType.STATE_PRISON,
            status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
            state_code='TX',
            facility='PRISON XX',
            admission_date=date(2010, 11, 20),
            admission_reason=StateIncarcerationPeriodAdmissionReason.
            PROBATION_REVOCATION,
            release_date=date(2010, 11, 21),
            release_reason=StateIncarcerationPeriodReleaseReason.
            SENTENCE_SERVED,
            specialized_purpose_for_incarceration=
            StateSpecializedPurposeForIncarceration.PAROLE_BOARD_HOLD)

        incarceration_sentence = StateIncarcerationSentence.new_with_defaults(
            incarceration_sentence_id=123,
            incarceration_periods=[incarceration_period],
            start_date=date(2009, 2, 9),
            charges=[
                StateCharge.new_with_defaults(ncic_code='5699',
                                              statute='30A123',
                                              offense_date=date(2009, 1, 9))
            ])

        sentence_group = StateSentenceGroup.new_with_defaults(
            sentence_group_id=123,
            incarceration_sentences=[incarceration_sentence])

        incarceration_sentence.sentence_group = sentence_group

        incarceration_period.incarceration_sentences = [incarceration_sentence]

        fake_person_id_to_county_query_result = [{
            'person_id': fake_person_id,
            'county_of_residence': _COUNTY_OF_RESIDENCE
        }]

        fake_incarceration_period_judicial_district_association_result = {
            'person_id': fake_person_id,
            'incarceration_period_id': 123,
            'judicial_district_code': 'NW'
        }

        incarceration_events = [
            IncarcerationStayEvent(
                admission_reason=incarceration_period.admission_reason,
                admission_reason_raw_text=incarceration_period.
                admission_reason_raw_text,
                supervision_type_at_admission=
                StateSupervisionPeriodSupervisionType.PROBATION,
                state_code=incarceration_period.state_code,
                event_date=incarceration_period.admission_date,
                facility=incarceration_period.facility,
                county_of_residence=_COUNTY_OF_RESIDENCE,
                most_serious_offense_ncic_code='5699',
                most_serious_offense_statute='30A123',
                specialized_purpose_for_incarceration=
                StateSpecializedPurposeForIncarceration.PAROLE_BOARD_HOLD),
            IncarcerationAdmissionEvent(
                state_code=incarceration_period.state_code,
                event_date=incarceration_period.admission_date,
                facility=incarceration_period.facility,
                county_of_residence=_COUNTY_OF_RESIDENCE,
                admission_reason=incarceration_period.admission_reason,
                admission_reason_raw_text=incarceration_period.
                admission_reason_raw_text,
                supervision_type_at_admission=
                StateSupervisionPeriodSupervisionType.PROBATION,
                specialized_purpose_for_incarceration=
                StateSpecializedPurposeForIncarceration.PAROLE_BOARD_HOLD),
            IncarcerationReleaseEvent(
                state_code=incarceration_period.state_code,
                event_date=incarceration_period.release_date,
                facility=incarceration_period.facility,
                county_of_residence=_COUNTY_OF_RESIDENCE,
                release_reason=incarceration_period.release_reason,
                admission_reason=incarceration_period.admission_reason,
                total_days_incarcerated=(
                    incarceration_period.release_date -
                    incarceration_period.admission_date).days,
                purpose_for_incarceration=
                StateSpecializedPurposeForIncarceration.PAROLE_BOARD_HOLD)
        ]

        correct_output = [
            (fake_person_id, (fake_person, incarceration_events))
        ]

        test_pipeline = TestPipeline()

        person_id_to_county_kv = (
            test_pipeline
            | "Read person id to county associations from BigQuery" >>
            beam.Create(fake_person_id_to_county_query_result)
            |
            "Convert to KV" >> beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        person_entities = {
            'person': [fake_person],
            'sentence_groups': [sentence_group],
            'incarceration_period_judicial_district_association':
            [fake_incarceration_period_judicial_district_association_result]
        }

        output = (test_pipeline
                  | beam.Create([(fake_person_id, person_entities)])
                  | 'Identify Incarceration Events' >> beam.ParDo(
                      pipeline.ClassifyIncarcerationEvents(),
                      AsDict(person_id_to_county_kv)))

        assert_that(output, equal_to(correct_output))

        test_pipeline.run()
Example #27
0
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (
            p | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load SupervisionSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load IncarcerationSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        supervision_contacts = (
            p | 'Load StateSupervisionContacts' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionContact,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Bring in the table that associates StateSupervisionViolationResponses to information about StateAgents
        ssvr_to_agent_association_query = select_all_by_person_query(
            reference_dataset, SSVR_TO_AGENT_ASSOCIATION_VIEW_NAME, state_code,
            person_id_filter_set)

        ssvr_to_agent_associations = (
            p | "Read SSVR to Agent table from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=ssvr_to_agent_association_query,
                                       use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the
        # supervision_violation_response_id column as the key
        ssvr_agent_associations_as_kv = (
            ssvr_to_agent_associations
            | 'Convert SSVR to Agent table to KV tuples' >> beam.ParDo(
                ConvertDictToKVTuple(), 'supervision_violation_response_id'))

        supervision_period_to_agent_association_query = select_all_by_person_query(
            reference_dataset,
            SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME, state_code,
            person_id_filter_set)

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(
                    query=supervision_period_to_agent_association_query,
                    use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code, person_id_filter_set)

            us_mo_sentence_statuses = (
                p |
                "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                           use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p |
                f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_query = select_all_by_person_query(
            reference_dataset,
            SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
            state_code, person_id_filter_set)

        sp_to_judicial_district_kv = (
            p |
            "Read supervision_period to judicial_district associations from BigQuery"
            >> beam.io.Read(
                beam.io.BigQuerySource(query=sp_to_judicial_district_query,
                                       use_standard_sql=True))
            |
            "Convert supervision_period to judicial_district association table to KV"
            >> beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            } | 'Group StateSupervisionViolationResponses to '
            'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
            'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person':
                persons,
                'assessments':
                assessments,
                'incarceration_periods':
                incarceration_periods_with_source_violations,
                'supervision_periods':
                supervision_periods,
                'supervision_sentences':
                sentences_converted.supervision_sentences,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'violation_responses':
                violation_responses_with_hydrated_violations,
                'supervision_contacts':
                supervision_contacts,
                'supervision_period_judicial_district_association':
                sp_to_judicial_district_kv
            }
            | 'Group StatePerson to all entities' >> beam.CoGroupByKey())

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (
            person_entities
            | 'Get SupervisionTimeBuckets' >> beam.ParDo(
                ClassifySupervisionTimeBuckets(),
                AsDict(ssvr_agent_associations_as_kv),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets
            | 'Get Supervision Metrics' >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))
        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.
                SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.
                SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value))

        # Write the metrics to the output tables in BigQuery
        terminations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionTerminationMetric)
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionCaseComplianceMetric)
        populations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionPopulationMetric)
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationMetric)
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationAnalysisMetric)
        revocation_violation_type_analysis_table_id = \
            DATAFLOW_METRICS_TO_TABLES.get(SupervisionRevocationViolationTypeAnalysisMetric)
        successes_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionSuccessMetric)
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SuccessfulSupervisionSentenceDaysServedMetric)

        _ = (writable_metrics.SUPERVISION_POPULATION
             | f"Write population metrics to BQ table: {populations_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=populations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION
             | f"Write revocation metrics to BQ table: {revocations_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=revocations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESS
             | f"Write success metrics to BQ table: {successes_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successes_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED
             | f"Write supervision successful sentence length metrics to BQ"
             f" table: {successful_sentence_lengths_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successful_sentence_lengths_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_TERMINATION
             |
             f"Write termination metrics to BQ table: {terminations_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=terminations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (
            writable_metrics.SUPERVISION_REVOCATION_ANALYSIS
            |
            f"Write revocation analyses metrics to BQ table: {revocation_analysis_table_id}"
            >> beam.io.WriteToBigQuery(
                table=revocation_analysis_table_id,
                dataset=output,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS
             |
             f"Write revocation violation type analyses metrics to BQ table: "
             f"{revocation_violation_type_analysis_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_violation_type_analysis_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_COMPLIANCE
             | f"Write compliance metrics to BQ table: {compliance_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=compliance_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
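
Every metric family above is written with an identical WriteToBigQuery configuration: never create the table, append rows, and load via batch files. A small helper capturing that shared configuration (the helper name is ours, not the pipeline's):

import apache_beam as beam

def append_via_file_loads(table_id: str, dataset: str):
    """Append-only writes into a pre-created BigQuery table."""
    return beam.io.WriteToBigQuery(
        table=table_id,
        dataset=dataset,
        create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        method=beam.io.WriteToBigQuery.Method.FILE_LOADS)
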
Example #28
0
def run():
    address_scd = """SELECT * FROM `automatic-asset-253215.CORE.IM_CUSTOMER_ADDRESS_SCD`"""

    addr_old_data_update = """SELECT
  CAST(a.HSN_ACCT_NUM AS INT64) AS CUSTOMER_ID,
  a.ADDRESS_NAME AS ADDR_NAME,
  'CLIC' AS ETL_SOURCE_SYSTEM,
  a.FILE_SET_DATE AS ETL_END_EFFECTIVE_DT,
  '0' AS ETL_CURRENT_IND,
  CAST(FORMAT_DATETIME('%Y%m%d%H%M%S', CURRENT_DATETIME()) AS INT64) AS UPD_BATCH_NBR
FROM
  `automatic-asset-253215.STAGE.STG_CLIC_CUSTADDRORG` a"""

    primary_pipeline_1 = 'addrs_SCD'
    addrs_SCD = p | 'Target Table' >> beam.io.Read(
        beam.io.BigQuerySource(query=address_scd, use_standard_sql=True))

    common_key = {'CUSTOMER_ID', 'ADDR_NAME', 'ETL_SOURCE_SYSTEM'}

    join_pipeline_1 = 'addrorg_old_update_tb'
    addrorg_old_update_tb = p | 'addrorg Table' >> beam.io.Read(
        beam.io.BigQuerySource(query=addr_old_data_update,
                               use_standard_sql=True))

    pipelines_dictionary_2 = {
        primary_pipeline_1: addrs_SCD,
        join_pipeline_1: addrorg_old_update_tb
    }

    updated_old_data = (pipelines_dictionary_2
                        | 'Updating addrs Fields' >> LeftJoin(
                            primary_pipeline_1, addrs_SCD, join_pipeline_1,
                            addrorg_old_update_tb, common_key))

    address_scd_query = """SELECT
  (srg_key.MAX_VALUE_KEY + ROW_NUMBER() OVER()) AS CUSTOMER_ADDRESS_KEY,
  '' AS CUSTOMER_KEY,
  CAST(a.HSN_ACCT_NUM AS INT64) AS CUSTOMER_ID,
  a.ADDRESS_NAME AS ADDR_NAME,
  'CLIC' AS ETL_SOURCE_SYSTEM,
  CAST(a.ROW_CREATED_DATE AS TIMESTAMP) AS SOURCE_CREATE_DT,
  a.ADDRESS_LINE_1 AS ADDR_LINE1_TXT,
  a.ADDRESS_LINE_2 AS ADDR_LINE2_TXT,
  a.CITY AS CITY_NAME,
  a.STATE AS STATE_CODE,
  a.COUNTRY AS COUNTRY_CODE,
  SUBSTR(a.ZIP_CODE,1,5) AS POSTAL_ZIP,
  SUBSTR(a.ZIP_CODE,6,9) AS POSTAL_ZIP4,
  CASE WHEN a.DISABLE_CLEANSING_FLAG = 'N' THEN 1
          ELSE 0 
          END AS ADDR_CLEANSING_IND,
  CASE WHEN a.FRAUD_BAD_ACCT_FLAG = 'Y' THEN 1
          ELSE 0 
          END AS ADDR_FRAUD_IND,
  CASE WHEN a.AGENT_VERIFIED_ADDRESS = 'Y' THEN 1
          ELSE 0 
          END AS ADDR_QAS_VERIFIED_IND,
  a.ADDRESS_TYPE_CODE AS ADDR_TYPE_CODE,
  a.SHIP_TO_FIRST_NAME AS SHIPTO_FIRST_NAME,
  a.SHIP_TO_LAST_NAME AS SHIPTO_LAST_NAME,
  TIMESTAMP_ADD(a.FILE_SET_DATE, INTERVAL 1 DAY) AS ETL_BEGIN_EFFECTIVE_DT,
  CAST('2099-12-31 00:00:00' AS TIMESTAMP) AS ETL_END_EFFECTIVE_DT,
  '1' AS ETL_CURRENT_IND,
  '2' AS ETL_VERSION_NBR, --should be a sequential number
  '0' AS VOID_IND,
  CAST(FORMAT_DATETIME('%Y%m%d%H%M%S',
    CURRENT_DATETIME()) AS INT64) AS UPD_BATCH_NBR,
  CAST(FORMAT_DATETIME('%Y%m%d%H%M%S',
    CURRENT_DATETIME()) AS INT64) AS INS_BATCH_NBR,
  '0' AS Privacy_Ind
FROM
  `automatic-asset-253215.STAGE.STG_CLIC_CUSTADDRORG` a,
  `automatic-asset-253215.STAGE.STG_CLIC_SURROGKEYS` srg_key
          WHERE srg_key.TABLE_NAME = "IM_CUSTOMER_ADDRESS_SCD"
   """

    Attribute_ref_query = """SELECT
    CUSTOMER_KEY,
    CUSTOMER_ID
  FROM
    `automatic-asset-253215.CORE.IM_CUSTOMER_ATTRIBUTE_REF` b"""

    lookup_data = (
        addrs_SCD
        | 'Get Cust_Ids' >> beam.Map(lambda row: (
            str(row['CUSTOMER_ID']) + row['ADDR_NAME'] + row['ETL_SOURCE_SYSTEM'],
            row)))

    primary_pipeline_2 = 'new_ins_data'
    new_ins_data = (p | 'Read from custorgext' >> beam.io.Read(
        beam.io.BigQuerySource(query=address_scd_query, use_standard_sql=True))
                    | 'Lookup' >> beam.Map(lookup, AsDict(lookup_data))
                    | 'Filter' >> beam.ParDo(filter_out_nones))

    join_pipeline_2 = 'attribute_ref_table'
    attribute_ref_table = (p | 'Read From Attribute Ref Table' >> beam.io.Read(
        beam.io.BigQuerySource(query=Attribute_ref_query,
                               use_standard_sql=True)))

    common_key = 'CUSTOMER_ID'
    pipelines_dictionary = {
        primary_pipeline_2: new_ins_data,
        join_pipeline_2: attribute_ref_table
    }

    new_ins_data_2 = (pipelines_dictionary
                      | 'Left join' >> LeftJoin2(
                          primary_pipeline_2, new_ins_data, join_pipeline_2,
                          attribute_ref_table, common_key)
                      | 'Filter Nulls' >> beam.Filter(filter_null))

    ((updated_old_data, new_ins_data_2)
     | 'Merge PCollections' >> beam.Flatten()
     | 'Write to IM_CUSTOMER_ADDRESS_SCD' >> beam.io.WriteToBigQuery(
         output_table,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
         create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER))

    p.run().wait_until_finish()
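
# --- Editor's note: `p`, `output_table`, `LeftJoin`, `LeftJoin2`, `lookup`,
# `filter_out_nones` and `filter_null` are defined elsewhere in the original
# module (`p` and `output_table` appear to be module-level globals). Below is
# a minimal sketch, under those assumptions, of how a CoGroupByKey-based left
# join transform like `LeftJoin` could be written; it is an illustration, not
# the original implementation.

import apache_beam as beam


class LeftJoinSketch(beam.PTransform):
    """Left-joins two dict-shaped PCollections on the given key field(s)."""

    def __init__(self, primary_name, primary, join_name, join, keys):
        # `primary` and `join` mirror the call sites above but are not needed
        # here, since expand() receives the {name: pcollection} dict directly.
        super().__init__()
        self.primary_name = primary_name
        self.join_name = join_name
        self.keys = sorted(keys) if isinstance(keys, set) else [keys]

    def expand(self, pcolls):
        def key_by(row):
            return tuple(str(row[k]) for k in self.keys), row

        def merge(kv):
            _, grouped = kv
            join_rows = list(grouped[self.join_name])
            for left in grouped[self.primary_name]:
                # Fold the first matching right-side row into the left row;
                # unmatched left rows pass through unchanged (left join).
                yield {**left, **join_rows[0]} if join_rows else left

        return ({name: pcoll | f'Key {name}' >> beam.Map(key_by)
                 for name, pcoll in pcolls.items()}
                | 'CoGroup' >> beam.CoGroupByKey()
                | 'Merge rows' >> beam.FlatMap(merge))
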
def run(argv=None):
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-basics',
                        dest='input_basics',
                        required=True,
                        help='Input movie base file to process.')
    parser.add_argument('--input-ratings',
                        dest='input_ratings',
                        required=True,
                        help='Input rating file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Optionally cap the number of DirectRunner workers (left disabled here)
    from apache_beam.options.pipeline_options import DirectOptions
    # pipeline_options.view_as(DirectOptions).direct_num_workers = 4

    # Columns
    columns_title_basic = [
        'tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
        'startYear', 'endYear', 'runtimeMinutes', 'genres'
    ]
    columns_ratings = ['tconst', 'averageRating', 'numVotes']
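
    # Editor's note: ParseCsv, CleanData, FilterBasicData and FilterRatingData
    # are DoFns defined elsewhere in the original module. As a hedged sketch of
    # what the parsing step could look like (IMDB dumps are tab-separated), a
    # hypothetical parser keyed by the column lists above:
    class ParseCsvSketch(beam.DoFn):
        """Splits a delimited line and zips it with the configured columns."""

        def __init__(self, columns, delimiter='\t'):
            super().__init__()
            self.columns = columns
            self.delimiter = delimiter

        def process(self, line):
            values = line.split(self.delimiter)
            # Drop malformed rows rather than emitting partial dicts.
            if len(values) == len(self.columns):
                yield dict(zip(self.columns, values))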

    # Create the pipeline
    with beam.Pipeline(options=pipeline_options) as p:
        basic_data = (
            p | 'Read data' >> beam.io.ReadFromText(known_args.input_basics,
                                                    skip_header_lines=1)
            | 'Parse CSV' >> beam.ParDo(ParseCsv(columns_title_basic))
            | 'Clean data' >> beam.ParDo(
                CleanData(bool_cols=['isAdult'],
                          int_cols=['startYear', 'endYear', 'runtimeMinutes']))
            | 'Filter data' >> beam.ParDo(FilterBasicData()))

        rating_data = (
            p | 'Read data (Details)' >> beam.io.ReadFromText(
                known_args.input_ratings, skip_header_lines=1)
            | 'Parse CSV (Details)' >> beam.ParDo(ParseCsv(columns_ratings))
            | 'Clean data (Details)' >> beam.ParDo(
                CleanData(int_cols=['numVotes'], float_cols=['averageRating']))
            | 'Filter data (Details)' >> beam.ParDo(FilterRatingData()))

        # Key each rating by tconst so the collection can be passed as a dict side input
        rating_data_d = (rating_data | beam.Map(lambda d: (d['tconst'], d)))
        # Join the PCollections
        joined_dicts = (
            basic_data
            | 'Join' >> beam.ParDo(JoinRatings(), AsDict(rating_data_d)))

        # Write to disk
        joined_dicts | 'write' >> beam.io.WriteToText(known_args.output)

        # No explicit p.run() is needed here: exiting the `with` block runs the
        # pipeline and waits for it to finish; an explicit run() inside the
        # block would execute the pipeline twice.
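
# --- Editor's note: JoinRatings is defined elsewhere in the original module.
# A minimal, hypothetical sketch of the side-input join it performs: the
# ratings keyed by tconst arrive as the AsDict side input and are merged into
# each basics row.

import apache_beam as beam


class JoinRatingsSketch(beam.DoFn):
    """Merges the matching rating row, if any, into each title row."""

    def process(self, row, ratings_by_tconst):
        # ratings_by_tconst is the materialized AsDict side input
        rating = ratings_by_tconst.get(row['tconst'])
        if rating is not None:
            yield {**row, **rating}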
Example #30
0
def run(argv=None):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at the start of the pipeline. This is necessary because the
    # BuildRootEntity function tries to access attributes of relationship properties on the SQLAlchemy
    # root_schema_class before they have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their attributes can be successfully
    # accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, pipeline_args = parse_arguments(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = (all_pipeline_options['project'] + '.' +
                         known_args.reference_input)

    with beam.Pipeline(argv=pipeline_args) as p:
        # Get StatePersons
        persons = (p | 'Load StatePersons' >> BuildRootEntity(
            dataset=query_dataset,
            data_dict=None,
            root_schema_class=schema.StatePerson,
            root_entity_class=entities.StatePerson,
            unifying_id_field='person_id',
            build_related_entities=True))

        # Get StateSentenceGroups
        sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity(
            dataset=query_dataset,
            data_dict=None,
            root_schema_class=schema.StateSentenceGroup,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field='person_id',
            build_related_entities=True))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load StateIncarcerationSentences' >> BuildRootEntity(
                dataset=query_dataset,
                data_dict=None,
                root_schema_class=schema.StateIncarcerationSentence,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load StateSupervisionSentences' >> BuildRootEntity(
                dataset=query_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionSentence,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field='person_id',
                build_related_entities=True))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Group each StatePerson with their related entities
        person_and_sentence_groups = (
            {
                'person': persons,
                'sentence_groups': sentence_groups_with_hydrated_sentences
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = \
            f"SELECT * FROM `{reference_dataset}.persons_to_recent_county_of_residence`"

        person_id_to_county_kv = (
            p | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_events = (
            person_and_sentence_groups
            | 'Classify Incarceration Events' >> beam.ParDo(
                ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv)))

        # Get dimensions to include and methodologies to use
        inclusions, _ = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # The number of months to limit the monthly calculation output to
        calculation_month_limit = known_args.calculation_month_limit

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_events
            | 'Get Incarceration Metrics' >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=inclusions,
                calculation_month_limit=calculation_month_limit))

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics | 'Convert to dict to be written to BQ' >>
            beam.ParDo(IncarcerationMetricWritableDict()).with_outputs(
                'admissions', 'populations', 'releases'))

        # Write the metrics to the output tables in BigQuery
        admissions_table = known_args.output + '.incarceration_admission_metrics'

        population_table = known_args.output + '.incarceration_population_metrics'

        releases_table = known_args.output + '.incarceration_release_metrics'

        _ = (writable_metrics.admissions
             | f"Write admission metrics to BQ table: {admissions_table}" >>
             beam.io.WriteToBigQuery(
                 table=admissions_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.populations
             | f"Write population metrics to BQ table: {population_table}" >>
             beam.io.WriteToBigQuery(
                 table=population_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.releases
             | f"Write release metrics to BQ table: {releases_table}" >>
             beam.io.WriteToBigQuery(
                 table=releases_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
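
# --- Editor's note: BuildRootEntity, ConvertDictToKVTuple,
# SetSentencesOnSentenceGroup, ClassifyIncarcerationEvents,
# GetIncarcerationMetrics and IncarcerationMetricWritableDict are defined
# elsewhere in this codebase. As a hedged sketch, assuming BigQuery rows
# arrive as dicts and the extra ParDo argument names the key field,
# ConvertDictToKVTuple could look like:

import apache_beam as beam


class ConvertDictToKVTupleSketch(beam.DoFn):
    """Re-emits each row dict as a (key_field_value, row) key-value tuple."""

    def process(self, element, key_field):
        # e.g. key_field='person_id', as passed at the call site above
        yield element[key_field], element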