Example #1
    def test_select_all_state_code_and_ids_filter(self):
        expected_query = (
            "SELECT * FROM `project-id.my_dataset.TABLE_WHERE_DATA_IS` "
            "WHERE state_code IN ('US_XX') AND person_id IN (1234)")

        self.assertEqual(
            expected_query,
            select_all_by_person_query(
                self.dataset,
                self.table_id,
                state_code_filter="US_XX",
                person_id_filter_set={1234},
            ),
        )

        expected_query = (
            "SELECT * FROM `project-id.my_dataset.TABLE_WHERE_DATA_IS` "
            "WHERE state_code IN ('US_XX') AND field_name IN (1234, 56)")
        self.assertEqual(
            expected_query,
            select_all_query(
                self.dataset,
                self.table_id,
                state_code_filter="US_XX",
                unifying_id_field="field_name",
                unifying_id_field_filter_set={1234, 56},
            ),
        )
Example #2
    def expand(self, pipeline: Pipeline):
        # Bring in the table from BigQuery
        table_query = select_all_by_person_query(
            self.dataset_id, self.table_id, self.state_code_filter, self.person_id_filter_set)

        table_contents = (pipeline
                          | f"Read {self.dataset_id}.{self.table_id} table from BigQuery" >>
                          ReadFromBigQuery(query=table_query))

        return table_contents
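The expand() above is the body of a composite PTransform. A hedged sketch of the wrapper class it presumably sits in (the name ImportTable and its constructor arguments are inferred from Example #12, not shown in this snippet; ReadFromBigQuery and select_all_by_person_query are assumed to come from the surrounding codebase):

import apache_beam as beam


class ImportTable(beam.PTransform):
    """Hypothetical composite transform wrapping the expand() shown above."""

    def __init__(self, dataset_id, table_id, state_code_filter,
                 person_id_filter_set):
        super().__init__()
        # These attributes are exactly the ones expand() reads.
        self.dataset_id = dataset_id
        self.table_id = table_id
        self.state_code_filter = state_code_filter
        self.person_id_filter_set = person_id_filter_set

With such a wrapper the load composes like any built-in transform, e.g. rows = p | 'Load table' >> ImportTable('my_dataset', 'TABLE_WHERE_DATA_IS', 'US_XX', {1234}).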
Example #3
    def test_select_all_with_state_code_filter_only(self):
        expected_query = 'SELECT * FROM `project-id.my_dataset.TABLE_WHERE_DATA_IS` WHERE state_code IN (\'US_XX\')'

        self.assertEqual(
            expected_query,
            select_all_by_person_query(self.dataset,
                                       self.table_id,
                                       state_code_filter='US_XX',
                                       person_id_filter_set=None))

        self.assertEqual(
            expected_query,
            select_all_query(self.dataset,
                             self.table_id,
                             state_code_filter='US_XX',
                             unifying_id_field='field_name',
                             unifying_id_field_filter_set=None))
Example #4
    def test_simple_select_all_no_filters(self):
        expected_query = 'SELECT * FROM `project-id.my_dataset.TABLE_WHERE_DATA_IS`'

        self.assertEqual(
            expected_query,
            select_all_by_person_query(self.dataset,
                                       self.table_id,
                                       state_code_filter=None,
                                       person_id_filter_set=None))

        self.assertEqual(
            expected_query,
            select_all_query(self.dataset,
                             self.table_id,
                             state_code_filter=None,
                             unifying_id_field='field_name',
                             unifying_id_field_filter_set=None))
Example #5
    def test_select_all_with_ids_filter_only(self):
        expected_query = 'SELECT * FROM `project-id.my_dataset.TABLE_WHERE_DATA_IS` WHERE person_id IN (1234)'

        self.assertEqual(
            expected_query,
            select_all_by_person_query(self.dataset,
                                       self.table_id,
                                       state_code_filter=None,
                                       person_id_filter_set={1234}))

        expected_query = 'SELECT * FROM `project-id.my_dataset.TABLE_WHERE_DATA_IS` WHERE field_name IN (1234)'
        self.assertEqual(
            expected_query,
            select_all_query(self.dataset,
                             self.table_id,
                             state_code_filter=None,
                             unifying_id_field='field_name',
                             unifying_id_field_filter_set={1234}))
Example #6
    def test_select_all_state_code_and_ids_filter(self):
        expected_query = (
            'SELECT * FROM `project-id.my_dataset.TABLE_WHERE_DATA_IS` '
            'WHERE person_id IN (1234) AND state_code IN (\'US_XX\')')

        self.assertEqual(
            expected_query,
            select_all_by_person_query(self.dataset,
                                       self.table_id,
                                       state_code_filter='US_XX',
                                       person_id_filter_set={1234}))

        expected_query = (
            'SELECT * FROM `project-id.my_dataset.TABLE_WHERE_DATA_IS` '
            'WHERE field_name IN (1234, 56) AND state_code IN (\'US_XX\')')
        self.assertEqual(
            expected_query,
            select_all_query(self.dataset,
                             self.table_id,
                             state_code_filter='US_XX',
                             unifying_id_field='field_name',
                             unifying_id_field_filter_set={1234, 56}))
Example #7
    def test_select_all_with_state_code_filter_only(self):
        expected_query = "SELECT * FROM `project-id.my_dataset.TABLE_WHERE_DATA_IS` WHERE state_code IN ('US_XX')"

        self.assertEqual(
            expected_query,
            select_all_by_person_query(
                self.dataset,
                self.table_id,
                state_code_filter="US_XX",
                person_id_filter_set=None,
            ),
        )

        self.assertEqual(
            expected_query,
            select_all_query(
                self.dataset,
                self.table_id,
                state_code_filter="US_XX",
                unifying_id_field="field_name",
                unifying_id_field_filter_set=None,
            ),
        )
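All of the test variants above reference self.dataset and self.table_id without showing the fixture. Judging from the expected query strings, the setUp is presumably along these lines (class name and values inferred from the assertions, not shown in the source):

import unittest


class TestSelectQueries(unittest.TestCase):
    """Hypothetical fixture implied by the expected query strings."""

    def setUp(self):
        # `project-id.my_dataset.TABLE_WHERE_DATA_IS` in the expected
        # strings decomposes as f'{self.dataset}.{self.table_id}'.
        self.dataset = 'project-id.my_dataset'
        self.table_id = 'TABLE_WHERE_DATA_IS'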
Example #8
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (
            p | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load SupervisionSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load IncarcerationSentences' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        supervision_contacts = (
            p | 'Load StateSupervisionContacts' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionContact,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Bring in the table that associates StateSupervisionViolationResponses to information about StateAgents
        ssvr_to_agent_association_query = select_all_by_person_query(
            reference_dataset, SSVR_TO_AGENT_ASSOCIATION_VIEW_NAME, state_code,
            person_id_filter_set)

        ssvr_to_agent_associations = (
            p | "Read SSVR to Agent table from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=ssvr_to_agent_association_query,
                                       use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the
        # supervision_violation_response_id column as the key
        ssvr_agent_associations_as_kv = (
            ssvr_to_agent_associations
            | 'Convert SSVR to Agent table to KV tuples' >> beam.ParDo(
                ConvertDictToKVTuple(), 'supervision_violation_response_id'))

        supervision_period_to_agent_association_query = select_all_by_person_query(
            reference_dataset,
            SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME, state_code,
            person_id_filter_set)

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(
                    query=supervision_period_to_agent_association_query,
                    use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code, person_id_filter_set)

            us_mo_sentence_statuses = (
                p |
                "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                           use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p |
                f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_query = select_all_by_person_query(
            reference_dataset,
            SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
            state_code, person_id_filter_set)

        sp_to_judicial_district_kv = (
            p |
            "Read supervision_period to judicial_district associations from BigQuery"
            >> beam.io.Read(
                beam.io.BigQuerySource(query=sp_to_judicial_district_query,
                                       use_standard_sql=True))
            |
            "Convert supervision_period to judicial_district association table to KV"
            >> beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            } | 'Group StateSupervisionViolationResponses to '
            'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
            'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person':
                persons,
                'assessments':
                assessments,
                'incarceration_periods':
                incarceration_periods_with_source_violations,
                'supervision_periods':
                supervision_periods,
                'supervision_sentences':
                sentences_converted.supervision_sentences,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'violation_responses':
                violation_responses_with_hydrated_violations,
                'supervision_contacts':
                supervision_contacts,
                'supervision_period_judicial_district_association':
                sp_to_judicial_district_kv
            }
            | 'Group StatePerson to all entities' >> beam.CoGroupByKey())

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (
            person_entities
            | 'Get SupervisionTimeBuckets' >> beam.ParDo(
                ClassifySupervisionTimeBuckets(),
                AsDict(ssvr_agent_associations_as_kv),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets
            | 'Get Supervision Metrics' >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))
        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivizMetricWritableDict()).with_outputs(
                    SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                    SupervisionMetricType.SUPERVISION_POPULATION.value,
                    SupervisionMetricType.SUPERVISION_REVOCATION.value,
                    SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                    SupervisionMetricType.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS.value,
                    SupervisionMetricType.SUPERVISION_SUCCESS.value,
                    SupervisionMetricType.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                    SupervisionMetricType.SUPERVISION_TERMINATION.value))

        # Write the metrics to the output tables in BigQuery
        terminations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionTerminationMetric)
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionCaseComplianceMetric)
        populations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionPopulationMetric)
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationMetric)
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationAnalysisMetric)
        revocation_violation_type_analysis_table_id = \
            DATAFLOW_METRICS_TO_TABLES.get(SupervisionRevocationViolationTypeAnalysisMetric)
        successes_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionSuccessMetric)
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SuccessfulSupervisionSentenceDaysServedMetric)

        _ = (writable_metrics.SUPERVISION_POPULATION
             | f"Write population metrics to BQ table: {populations_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=populations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION
             | f"Write revocation metrics to BQ table: {revocations_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=revocations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESS
             | f"Write success metrics to BQ table: {successes_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successes_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED
             | f"Write supervision successful sentence length metrics to BQ"
             f" table: {successful_sentence_lengths_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=successful_sentence_lengths_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_TERMINATION
             |
             f"Write termination metrics to BQ table: {terminations_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=terminations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (
            writable_metrics.SUPERVISION_REVOCATION_ANALYSIS
            |
            f"Write revocation analyses metrics to BQ table: {revocation_analysis_table_id}"
            >> beam.io.WriteToBigQuery(
                table=revocation_analysis_table_id,
                dataset=output,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS
             |
             f"Write revocation violation type analyses metrics to BQ table: "
             f"{revocation_violation_type_analysis_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_violation_type_analysis_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_COMPLIANCE
             | f"Write compliance metrics to BQ table: {compliance_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=compliance_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
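Several steps in this pipeline re-key BigQuery row dicts with ConvertDictToKVTuple ahead of a CoGroupByKey. A minimal sketch of such a DoFn, written from how it is invoked here rather than from the project's actual implementation:

import apache_beam as beam


class ConvertDictToKVTuple(beam.DoFn):
    """Hypothetical DoFn converting a row dict to a (key, row) tuple."""

    def process(self, element, key_field):
        # key_field is the positional side argument passed to beam.ParDo
        # above, e.g. 'supervision_violation_response_id' or 'person_id'.
        if element.get(key_field) is None:
            return  # skip rows missing the key instead of failing downstream
        yield element[key_field], element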
Example #9
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, metric_types: List[str],
        state_code: Optional[str], person_filter_ids: Optional[List[int]]):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (
            p
            | 'Load Persons' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StatePerson,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p
            | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (
            p
            | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p
            | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            } | 'Group StateSupervisionViolationResponses to '
            'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
            'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {
                'person':
                persons,
                'incarceration_periods':
                incarceration_periods_with_source_violations
            }
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey())

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = select_all_by_person_query(
            reference_dataset,
            PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            # TODO(3602): Once we put state_code on StatePerson objects, we can update the
            # persons_to_recent_county_of_residence query to have a state_code field, allowing us to also filter the
            # output by state_code.
            state_code_filter=None,
            person_id_filter_set=person_id_filter_set)

        person_id_to_county_kv = (
            p
            | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Identify ReleaseEvents events from the StatePerson's
        # StateIncarcerationPeriods
        person_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >> beam.ParDo(
                ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get recidivism metrics
        recidivism_metrics = (
            person_events
            | 'Get Recidivism Metrics' >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            recidivism_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivismMetricWritableDict()).with_outputs(
                    'rates', 'counts'))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismRateMetric)
        counts_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ReincarcerationRecidivismCountMetric)

        _ = (writable_metrics.rates
             | f"Write rate metrics to BQ table: {rates_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.counts
             | f"Write count metrics to BQ table: {counts_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
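Entry points like this run() are typically driven from a small CLI shim that splits pipeline-specific flags from Beam's own options. A hedged sketch of such a driver for the recidivism pipeline above (the argument names mirror run()'s signature; the actual CLI wiring is assumed, not shown in the source):

import argparse

from apache_beam.options.pipeline_options import PipelineOptions


def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_input', required=True)
    parser.add_argument('--reference_input', required=True)
    parser.add_argument('--output', required=True)
    parser.add_argument('--metric_types', nargs='+', default=['ALL'])
    parser.add_argument('--state_code', default=None)
    parser.add_argument('--person_filter_ids', type=int, nargs='+',
                        default=None)
    known_args, remaining_argv = parser.parse_known_args(argv)

    # Everything argparse did not consume becomes Beam pipeline options
    # (runner, project, temp_location, ...).
    run(PipelineOptions(remaining_argv),
        data_input=known_args.data_input,
        reference_input=known_args.reference_input,
        output=known_args.output,
        metric_types=known_args.metric_types,
        state_code=known_args.state_code,
        person_filter_ids=known_args.person_filter_ids)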
Example #10
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_apache_beam_pipeline_options = apache_beam_pipeline_options.get_all_options()

    query_dataset = all_apache_beam_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_apache_beam_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load StatePersons' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set))

        # Get StateSentenceGroups
        sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p | 'Load StateIncarcerationSentences' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p | 'Load StateSupervisionSentences' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code, person_id_filter_set)

            us_mo_sentence_statuses = (
                p |
                "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                           use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p |
                f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        supervision_sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            supervision_sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'supervision_sentences':
                sentences_converted.supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = select_all_by_person_query(
            reference_dataset,
            PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
            # TODO(3602): Once we put state_code on StatePerson objects, we can update the
            # persons_to_recent_county_of_residence query to have a state_code field, allowing us to also filter the
            # output by state_code.
            state_code_filter=None,
            person_id_filter_set=person_id_filter_set)

        person_id_to_county_kv = (
            p | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query,
                                       use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Bring in the judicial districts associated with incarceration_periods
        ip_to_judicial_district_query = select_all_by_person_query(
            reference_dataset,
            INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
            state_code, person_id_filter_set)

        ip_to_judicial_district_kv = (
            p |
            "Read incarceration_period to judicial_district associations from BigQuery"
            >> beam.io.Read(
                beam.io.BigQuerySource(query=ip_to_judicial_district_query,
                                       use_standard_sql=True))
            |
            "Convert incarceration_period to judicial_district association table to KV"
            >> beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person':
                persons,
                'sentence_groups':
                sentence_groups_with_hydrated_sentences,
                'incarceration_period_judicial_district_association':
                ip_to_judicial_district_kv
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Identify IncarcerationEvents events from the StatePerson's StateIncarcerationPeriods
        person_events = (
            person_entities | 'Classify Incarceration Events' >> beam.ParDo(
                ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_events
            | 'Get Incarceration Metrics' >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics | 'Convert to dict to be written to BQ' >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                IncarcerationMetricType.INCARCERATION_ADMISSION.value,
                IncarcerationMetricType.INCARCERATION_POPULATION.value,
                IncarcerationMetricType.INCARCERATION_RELEASE.value))

        # Write the metrics to the output tables in BigQuery
        admissions_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationAdmissionMetric)
        population_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationPopulationMetric)
        releases_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            IncarcerationReleaseMetric)

        _ = (writable_metrics.INCARCERATION_ADMISSION
             | f"Write admission metrics to BQ table: {admissions_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=admissions_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_POPULATION
             | f"Write population metrics to BQ table: {population_table_id}"
             >> beam.io.WriteToBigQuery(
                 table=population_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.INCARCERATION_RELEASE
             | f"Write release metrics to BQ table: {releases_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=releases_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
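The ConvertSentencesToStateSpecificType steps above fan their output into named streams via with_outputs. In generic Beam terms, multi-output ParDo works as below (an illustration of the mechanism with a made-up DoFn, not the project's code):

import apache_beam as beam
from apache_beam import pvalue


class SplitByKind(beam.DoFn):
    """Toy DoFn routing each element to one of two named output tags."""

    def process(self, element):
        kind, payload = element
        if kind == 'incarceration':
            yield pvalue.TaggedOutput('incarceration_sentences', payload)
        else:
            yield pvalue.TaggedOutput('supervision_sentences', payload)


with beam.Pipeline() as p:
    split = (p
             | beam.Create([('incarceration', 1), ('supervision', 2)])
             | beam.ParDo(SplitByKind()).with_outputs(
                 'incarceration_sentences', 'supervision_sentences'))
    # Each tagged stream is addressable as an attribute, exactly as
    # sentences_converted.incarceration_sentences is used above.
    _ = split.incarceration_sentences | 'print inc' >> beam.Map(print)
    _ = split.supervision_sentences | 'print sup' >> beam.Map(print)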
Example #11
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(
        person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set))

        # Get StateProgramAssignments
        program_assignments = (
            p | 'Load Program Assignments' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        supervision_period_to_agent_association_query = select_all_by_person_query(
            reference_dataset,
            SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME, state_code,
            person_id_filter_set)

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(
                beam.io.BigQuerySource(
                    query=supervision_period_to_agent_association_query,
                    use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        # Group each StatePerson with their other entities
        persons_entities = (
            {
                'person': persons,
                'program_assignments': program_assignments,
                'assessments': assessments,
                'supervision_periods': supervision_periods
            }
            | 'Group StatePerson to StateProgramAssignments and'
            >> beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get program metrics
        program_metrics = (
            person_program_events | 'Get Program Metrics' >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                ProgramMetricWritableDict()).with_outputs(
                    'participation', 'referrals'))

        # Write the metrics to the output tables in BigQuery
        referrals_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ProgramReferralMetric)
        participation_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ProgramParticipationMetric)

        _ = (writable_metrics.referrals
             | f"Write referral metrics to BQ table: {referrals_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=referrals_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (
            writable_metrics.participation |
            f"Write participation metrics to BQ table: {participation_table_id}"
            >> beam.io.WriteToBigQuery(
                table=participation_table_id,
                dataset=output,
                create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
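Every grouping step in these pipelines has the same CoGroupByKey shape: a dict of keyed PCollections goes in, and one (key, {name: iterable-of-values}) element comes out per key. In miniature:

import apache_beam as beam

with beam.Pipeline() as p:
    persons = p | 'persons' >> beam.Create([(1, 'person-1')])
    assessments = p | 'assessments' >> beam.Create(
        [(1, 'assessment-a'), (1, 'assessment-b')])
    grouped = ({'person': persons, 'assessments': assessments}
               | beam.CoGroupByKey())
    # Emits roughly: (1, {'person': ['person-1'],
    #                     'assessments': ['assessment-a', 'assessment-b']})
    _ = grouped | beam.Map(print)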
Example #12
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = p | "Load StatePersons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSentenceGroups
        sentence_groups = p | "Load StateSentenceGroups" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p
            | "Load StateIncarcerationSentences"
            >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )
        )

        # Get StateSupervisionSentences
        supervision_sentences = p | "Load StateSupervisionSentences" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        if state_code == "US_MO":
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set,
            )

            us_mo_sentence_statuses = (
                p
                | "Read MO sentence status table from BigQuery"
                >> ReadFromBigQuery(query=us_mo_sentence_status_query)
            )
        else:
            us_mo_sentence_statuses = (
                p
                | f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([])
            )

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples"
            >> beam.ParDo(ConvertDictToKVTuple(), "person_id")
        )

        supervision_sentences_and_statuses = (
            {
                "incarceration_sentences": incarceration_sentences,
                "supervision_sentences": supervision_sentences,
                "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
            }
            | "Group sentences to the sentence statuses for that person"
            >> beam.CoGroupByKey()
        )
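        # After the CoGroupByKey, each element is keyed by person_id:
        # (person_id, {"incarceration_sentences": [...], "supervision_sentences": [...],
        #              "sentence_statuses": [...]})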

        sentences_converted = (
            supervision_sentences_and_statuses
            | "Convert to state-specific sentences"
            >> beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs(
                "incarceration_sentences", "supervision_sentences"
            )
        )

        sentences_and_sentence_groups = {
            "sentence_groups": sentence_groups,
            "incarceration_sentences": sentences_converted.incarceration_sentences,
            "supervision_sentences": sentences_converted.supervision_sentences,
        } | "Group sentences to sentence groups" >> beam.CoGroupByKey()

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | "Set hydrated sentences on sentence groups"
            >> beam.ParDo(SetSentencesOnSentenceGroup())
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            p
            | "Load person_id_to_county_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        ip_to_judicial_district_kv = (
            p
            | "Load ip_to_judicial_district_kv"
            >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )
        )

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts"
            >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            )
        )

        # Group each StatePerson with their related entities
        person_entities = {
            "person": persons,
            "sentence_groups": sentence_groups_with_hydrated_sentences,
            "incarceration_period_judicial_district_association": ip_to_judicial_district_kv,
            "persons_to_recent_county_of_residence": person_id_to_county_kv,
        } | "Group StatePerson to SentenceGroups" >> beam.CoGroupByKey()

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_incarceration_events = (
            person_entities
            | "Classify Incarceration Events"
            >> beam.ParDo(ClassifyIncarcerationEvents())
        )

        person_metadata = (
            persons
            | "Build the person_metadata dictionary"
            >> beam.ParDo(
                BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts)
            )
        )

        person_incarceration_events_with_metadata = (
            {
                "person_events": person_incarceration_events,
                "person_metadata": person_metadata,
            }
            | "Group IncarcerationEvents with person-level metadata"
            >> beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata())
        )

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the types of metrics to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_incarceration_events_with_metadata
            | "Get Incarceration Metrics"
            >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            )
        )

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics
            | "Convert to dict to be written to BQ"
            >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                IncarcerationMetricType.INCARCERATION_ADMISSION.value,
                IncarcerationMetricType.INCARCERATION_POPULATION.value,
                IncarcerationMetricType.INCARCERATION_RELEASE.value,
            )
        )

        # Write the metrics to the output tables in BigQuery
        admissions_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationAdmissionMetric]
        population_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationPopulationMetric]
        releases_table_id = DATAFLOW_METRICS_TO_TABLES[IncarcerationReleaseMetric]

        _ = (
            writable_metrics.INCARCERATION_ADMISSION
            | f"Write admission metrics to BQ table: {admissions_table_id}"
            >> WriteAppendToBigQuery(
                output_table=admissions_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.INCARCERATION_POPULATION
            | f"Write population metrics to BQ table: {population_table_id}"
            >> WriteAppendToBigQuery(
                output_table=population_table_id,
                output_dataset=output,
            )
        )

        _ = (
            writable_metrics.INCARCERATION_RELEASE
            | f"Write release metrics to BQ table: {releases_table_id}"
            >> WriteAppendToBigQuery(
                output_table=releases_table_id,
                output_dataset=output,
            )
        )
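
The pipelines above repeatedly key BigQuery row dicts by person_id (via ConvertDictToKVTuple) so that CoGroupByKey can join them with the hydrated entities. As a rough illustration only, such a DoFn could be sketched as follows; the class name is taken from the pipeline code above, but the body is an assumption, not the actual implementation:

import apache_beam as beam


class ConvertDictToKVTuple(beam.DoFn):
    """Minimal sketch: keys each BigQuery row dict by the given field."""

    def process(self, element, key_field):
        # Drop rows missing the key so downstream CoGroupByKey steps
        # never see a None key.
        if element.get(key_field) is None:
            return
        yield element[key_field], element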
Example #13
    def execute_pipeline(
        self,
        pipeline: beam.Pipeline,
        all_pipeline_options: Dict[str, Any],
        state_code: str,
        input_dataset: str,
        reference_dataset: str,
        static_reference_dataset: str,
        metric_types: List[str],
        person_id_filter_set: Optional[Set[int]],
        calculation_month_count: int = -1,
        calculation_end_month: Optional[str] = None,
    ) -> beam.Pipeline:
        persons = pipeline | "Load StatePersons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSentenceGroups
        sentence_groups = pipeline | "Load StateSentenceGroups" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            pipeline
            | "Load StateIncarcerationSentences" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        # Get StateSupervisionSentences
        supervision_sentences = (
            pipeline
            | "Load StateSupervisionSentences" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        # Get StateSupervisionPeriods
        supervision_periods = (
            pipeline
            | "Load StateSupervisionPeriods" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        # Get StateAssessments
        assessments = pipeline | "Load Assessments" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionViolations
        supervision_violations = (
            pipeline
            | "Load SupervisionViolations" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            pipeline
            | "Load SupervisionViolationResponses" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        if state_code == "US_MO":
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set,
            )

            us_mo_sentence_statuses = (
                pipeline
                | "Read MO sentence status table from BigQuery" >>
                ReadFromBigQuery(query=us_mo_sentence_status_query))
        else:
            us_mo_sentence_statuses = (
                pipeline
                | f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples" >>
            beam.ParDo(ConvertDictToKVTuple(), "person_id"))

        supervision_sentences_and_statuses = (
            {
                "incarceration_sentences": incarceration_sentences,
                "supervision_sentences": supervision_sentences,
                "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
            }
            | "Group sentences to the sentence statuses for that person" >>
            beam.CoGroupByKey())

        sentences_converted = (
            supervision_sentences_and_statuses
            | "Convert to state-specific sentences" >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    "incarceration_sentences", "supervision_sentences"))

        # Set hydrated supervision periods on the corresponding incarceration sentences
        incarceration_sentences_with_hydrated_sps = (
            {
                "supervision_periods": supervision_periods,
                "sentences": sentences_converted.incarceration_sentences,
            }
            | "Group supervision periods to incarceration sentences" >>
            beam.CoGroupByKey()
            | "Set hydrated supervision periods on incarceration sentences" >>
            beam.ParDo(SetSupervisionPeriodsOnSentences()))

        # Set hydrated supervision periods on the corresponding supervision sentences
        supervision_sentences_with_hydrated_sps = (
            {
                "supervision_periods": supervision_periods,
                "sentences": sentences_converted.supervision_sentences,
            }
            | "Group supervision periods to supervision sentences" >>
            beam.CoGroupByKey()
            | "Set hydrated supervision periods on supervision sentences" >>
            beam.ParDo(SetSupervisionPeriodsOnSentences()))
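        # Both hydration passes reuse the same SetSupervisionPeriodsOnSentences
        # DoFn; only the tagged sentence collection fed into each CoGroupByKey
        # differs.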

        sentences_and_sentence_groups = {
            "sentence_groups": sentence_groups,
            "incarceration_sentences":
            incarceration_sentences_with_hydrated_sps,
            "supervision_sentences": supervision_sentences_with_hydrated_sps,
        } | "Group sentences to sentence groups" >> beam.CoGroupByKey()

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | "Set hydrated sentences on sentence groups" >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Bring in the table that associates people and their county of residence
        person_id_to_county_kv = (
            pipeline
            | "Load person_id_to_county_kv" >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            ))

        ip_to_judicial_district_kv = (
            pipeline
            | "Load ip_to_judicial_district_kv" >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            ))

        supervision_period_to_agent_associations_as_kv = (
            pipeline
            | "Load supervision_period_to_agent_associations_as_kv" >>
            ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            ))

        state_race_ethnicity_population_counts = (
            pipeline
            | "Load state_race_ethnicity_population_counts" >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            ))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to "
            "StateSupervisionViolations" >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolations on "
            "the StateSupervisionViolationResponses" >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group each StatePerson with their related entities
        person_entities = {
            "person": persons,
            "assessments": assessments,
            "sentence_groups": sentence_groups_with_hydrated_sentences,
            "violation_responses":
            violation_responses_with_hydrated_violations,
            "incarceration_period_judicial_district_association":
            ip_to_judicial_district_kv,
            "supervision_period_to_agent_association":
            supervision_period_to_agent_associations_as_kv,
            "persons_to_recent_county_of_residence": person_id_to_county_kv,
        } | "Group StatePerson to SentenceGroups" >> beam.CoGroupByKey()

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_incarceration_events = (
            person_entities
            | "Classify Incarceration Events" >> beam.ParDo(
                ClassifyEvents(), identifier=self.pipeline_config.identifier))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                state_race_ethnicity_population_counts=AsList(
                    state_race_ethnicity_population_counts),
            ))

        person_incarceration_events_with_metadata = (
            {
                "person_events": person_incarceration_events,
                "person_metadata": person_metadata,
            }
            | "Group IncarcerationEvents with person-level metadata" >>
            beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get the types of metrics to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_incarceration_events_with_metadata
            | "Get Incarceration Metrics" >> GetMetrics(
                pipeline_options=all_pipeline_options,
                pipeline_config=self.pipeline_config,
                metric_types_to_include=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            ))

        return incarceration_metrics
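
Unlike the run() functions in the surrounding examples, execute_pipeline returns the metrics PCollection instead of writing it, so the caller owns the pipeline and the sinks. A minimal, hypothetical driver (the dataset names and the final transform are placeholders for illustration, not part of the original code):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def run_and_count(runner, options: PipelineOptions, state_code: str) -> None:
    # `runner` is assumed to be the object exposing execute_pipeline above.
    all_options = options.get_all_options()
    with beam.Pipeline(options=options) as p:
        metrics = runner.execute_pipeline(
            p,
            all_pipeline_options=all_options,
            state_code=state_code,
            input_dataset="project-id.state",
            reference_dataset="project-id.reference_views",
            static_reference_dataset="project-id.static_reference",
            metric_types=["ALL"],
            person_id_filter_set=None,
        )
        # A real caller would attach BigQuery sinks here instead.
        _ = metrics | "Count metrics" >> beam.combiners.Count.Globally()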
Example #14
def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
) -> None:
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options["project"]

    if project_id is None:
        raise ValueError(
            f"No project set in pipeline options: {all_pipeline_options}")

    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    input_dataset = project_id + "." + data_input
    reference_dataset = project_id + "." + reference_view_input
    static_reference_dataset = project_id + "." + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = p | "Load Persons" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationPeriods
        incarceration_periods = p | "Load IncarcerationPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionViolations
        supervision_violations = p | "Load SupervisionViolations" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # TODO(#2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p
            | "Load SupervisionViolationResponses" >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            ))

        # Get StateSupervisionSentences
        supervision_sentences = p | "Load SupervisionSentences" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateIncarcerationSentences
        incarceration_sentences = p | "Load IncarcerationSentences" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateSupervisionPeriods
        supervision_periods = p | "Load SupervisionPeriods" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        # Get StateAssessments
        assessments = p | "Load Assessments" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        supervision_contacts = p | "Load StateSupervisionContacts" >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionContact,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

        supervision_period_to_agent_associations_as_kv = (
            p
            | "Load supervision_period_to_agent_associations_as_kv" >>
            ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            ))

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_kv = (
            p
            | "Load sp_to_judicial_district_kv" >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
                table_key="person_id",
            ))

        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts" >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            ))

        if state_code == "US_MO":
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset,
                US_MO_SENTENCE_STATUSES_VIEW_NAME,
                state_code,
                person_id_filter_set,
            )

            us_mo_sentence_statuses = (
                p
                | "Read MO sentence status table from BigQuery" >>
                ReadFromBigQuery(query=us_mo_sentence_status_query))
        else:
            us_mo_sentence_statuses = (
                p
                | f"Generate empty MO statuses list for non-MO state run: {state_code} "
                >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples" >>
            beam.ParDo(ConvertDictToKVTuple(), "person_id"))

        sentences_and_statuses = (
            {
                "incarceration_sentences": incarceration_sentences,
                "supervision_sentences": supervision_sentences,
                "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
            }
            | "Group sentences to the sentence statuses for that person" >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | "Convert to state-specific sentences" >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    "incarceration_sentences", "supervision_sentences"))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                "violations": supervision_violations,
                "violation_responses": supervision_violation_responses,
            }
            | "Group StateSupervisionViolationResponses to "
            "StateSupervisionViolations" >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolations on "
            "the StateSupervisionViolationResponses" >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                "incarceration_periods":
                incarceration_periods,
                "violation_responses":
                violation_responses_with_hydrated_violations,
            }
            | "Group StateIncarcerationPeriods to "
            "StateSupervisionViolationResponses" >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | "Set hydrated StateSupervisionViolationResponses on "
            "the StateIncarcerationPeriods" >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))
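        # The hydration order matters: violations are set on their responses
        # first, so the responses attached to incarceration periods here carry
        # fully hydrated violations.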

        # Group each StatePerson with their related entities
        person_entities = {
            "person":
            persons,
            "assessments":
            assessments,
            "incarceration_periods":
            incarceration_periods_with_source_violations,
            "supervision_periods":
            supervision_periods,
            "supervision_sentences":
            sentences_converted.supervision_sentences,
            "incarceration_sentences":
            sentences_converted.incarceration_sentences,
            "violation_responses":
            violation_responses_with_hydrated_violations,
            "supervision_contacts":
            supervision_contacts,
            "supervision_period_judicial_district_association":
            sp_to_judicial_district_kv,
            "supervision_period_to_agent_association":
            supervision_period_to_agent_associations_as_kv,
        } | "Group StatePerson to all entities" >> beam.CoGroupByKey()

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (person_entities
                               | "Get SupervisionTimeBuckets" >> beam.ParDo(
                                   ClassifySupervisionTimeBuckets()))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_time_buckets_with_metadata = (
            {
                "person_events": person_time_buckets,
                "person_metadata": person_metadata
            }
            | "Group SupervisionTimeBuckets with person-level metadata" >>
            beam.CoGroupByKey()
            | "Organize StatePerson, PersonMetadata and SupervisionTimeBuckets for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the types of metrics to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S.%f")
        all_pipeline_options["job_timestamp"] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets_with_metadata
            | "Get Supervision Metrics" >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            ))
        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics
            | "Convert to dict to be written to BQ" >>
            beam.ParDo(RecidivizMetricWritableDict()).with_outputs(
                SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                SupervisionMetricType.SUPERVISION_POPULATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION.value,
                SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                SupervisionMetricType.SUPERVISION_START.value,
                SupervisionMetricType.SUPERVISION_SUCCESS.value,
                SupervisionMetricType.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                SupervisionMetricType.SUPERVISION_TERMINATION.value,
                SupervisionMetricType.SUPERVISION_OUT_OF_STATE_POPULATION.value,
                SupervisionMetricType.SUPERVISION_DOWNGRADE.value,
            ))
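        # Each tagged output above becomes an attribute on writable_metrics
        # (e.g. writable_metrics.SUPERVISION_POPULATION) and is written to its
        # own table below.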

        terminations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionTerminationMetric]
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionCaseComplianceMetric]
        populations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionPopulationMetric]
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionRevocationMetric]
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionRevocationAnalysisMetric]
        successes_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionSuccessMetric]
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES[
            SuccessfulSupervisionSentenceDaysServedMetric]
        supervision_starts_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionStartMetric]
        out_of_state_populations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionOutOfStatePopulationMetric]
        supervision_downgrade_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionDowngradeMetric]

        _ = (writable_metrics.SUPERVISION_POPULATION
             | f"Write population metrics to BQ table: {populations_table_id}"
             >> WriteAppendToBigQuery(
                 output_table=populations_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_OUT_OF_STATE_POPULATION
             | f"Write out of state population metrics to BQ table: "
             f"{out_of_state_populations_table_id}" >> WriteAppendToBigQuery(
                 output_table=out_of_state_populations_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_REVOCATION
             | f"Write revocation metrics to BQ table: {revocations_table_id}"
             >> WriteAppendToBigQuery(
                 output_table=revocations_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_SUCCESS
             | f"Write success metrics to BQ table: {successes_table_id}" >>
             WriteAppendToBigQuery(
                 output_table=successes_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED
             | f"Write supervision successful sentence length metrics to BQ"
             f" table: {successful_sentence_lengths_table_id}" >>
             WriteAppendToBigQuery(
                 output_table=successful_sentence_lengths_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_TERMINATION
             | f"Write termination metrics to BQ table: {terminations_table_id}"
             >> WriteAppendToBigQuery(
                 output_table=terminations_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_REVOCATION_ANALYSIS
             | f"Write revocation analyses metrics to BQ table: "
             f"{revocation_analysis_table_id}" >> WriteAppendToBigQuery(
                 output_table=revocation_analysis_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_COMPLIANCE
             | f"Write compliance metrics to BQ table: {compliance_table_id}"
             >> WriteAppendToBigQuery(
                 output_table=compliance_table_id,
                 output_dataset=output,
             ))

        _ = (writable_metrics.SUPERVISION_START
             | f"Write start metrics to BQ table: {supervision_starts_table_id}"
             >> WriteAppendToBigQuery(
                 output_table=supervision_starts_table_id,
                 output_dataset=output,
             ))

        _ = (
            writable_metrics.SUPERVISION_DOWNGRADE
            | f"Write downgrade metrics to BQ table: {supervision_downgrade_table_id}"
            >> WriteAppendToBigQuery(
                output_table=supervision_downgrade_table_id,
                output_dataset=output,
            ))
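
All three examples funnel their writes through WriteAppendToBigQuery. Assuming it is a thin wrapper around Apache Beam's built-in BigQuery sink with an append-only write disposition, a sketch might look like this (the dispositions are assumptions, not the wrapper's confirmed behavior):

import apache_beam as beam


class WriteAppendToBigQuery(beam.PTransform):
    """Sketch: append rows to an existing BigQuery table."""

    def __init__(self, output_table: str, output_dataset: str):
        super().__init__()
        self.output_table = output_table
        self.output_dataset = output_dataset

    def expand(self, pcoll):
        return pcoll | beam.io.WriteToBigQuery(
            table=self.output_table,
            dataset=self.output_dataset,
            # Append to (never truncate or recreate) the metric tables.
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
        )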