Exemple #1
0
 def setUp(self) -> None:
     """Create the fake BigQuery source factory and patch the assessment types lookup."""
     self.fake_bq_source_factory = FakeReadFromBigQueryFactory()

     # Calls to the patched lookup return a list containing only ORAS.
     assessment_types_target = (
         'recidiviz.calculator.pipeline.program.identifier.assessment_utils.'
         '_assessment_types_of_class_for_state')
     self.assessment_types_patcher = mock.patch(assessment_types_target)
     self.mock_assessment_types = self.assessment_types_patcher.start()
     self.mock_assessment_types.return_value = [StateAssessmentType.ORAS]
 def setUp(self) -> None:
     """Create the fake BigQuery source/sink factories and patch the violation delegate lookup."""
     self.fake_bq_source_factory = FakeReadFromBigQueryFactory()
     self.fake_bq_sink_factory = FakeWriteToBigQueryFactory(
         FakeWriteToBigQuery)

     # Calls to the patched lookup return a UsXxViolationDelegate instance.
     violation_delegate_target = (
         "recidiviz.calculator.pipeline.violation.identifier.get_state_specific_violation_delegate"
     )
     self.violation_delegate_patcher = mock.patch(violation_delegate_target)
     self.mock_violation_delegate = self.violation_delegate_patcher.start()
     self.mock_violation_delegate.return_value = UsXxViolationDelegate()
Exemple #3
0
 def setUp(self) -> None:
     """Create a fresh fake BigQuery source factory for each test."""
     self.fake_bq_source_factory = FakeReadFromBigQueryFactory()
Exemple #4
0
class TestProgramPipeline(unittest.TestCase):
    """Tests the entire program pipeline."""
    def setUp(self) -> None:
        """Creates the fake BigQuery source factory used by every test."""
        self.fake_bq_source_factory = FakeReadFromBigQueryFactory()

    @staticmethod
    def build_data_dict(fake_person_id: int, fake_supervision_period_id: int):
        """Assembles the fake table contents for a basic run of the pipeline.

        Args:
            fake_person_id: person_id given to the StatePerson row and to every
                other row built here.
            fake_supervision_period_id: supervision_period_id given to the
                single StateSupervisionPeriod row.

        Returns:
            A dict keyed by table name whose values are lists of normalized
            row dicts (empty lists for the tables that get no rows).
        """
        person = schema.StatePerson(
            person_id=fake_person_id,
            gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT,
        )
        person_rows = [normalized_database_base_dict(person)]

        # Two race rows attached to the same person.
        black_race = schema.StatePersonRace(
            person_race_id=111,
            state_code='CA',
            race=Race.BLACK,
            person_id=fake_person_id,
        )
        white_race = schema.StatePersonRace(
            person_race_id=111,
            state_code='ND',
            race=Race.WHITE,
            person_id=fake_person_id,
        )
        race_rows = normalized_database_base_dict_list([black_race, white_race])

        hispanic_ethnicity = schema.StatePersonEthnicity(
            person_ethnicity_id=111,
            state_code='CA',
            ethnicity=Ethnicity.HISPANIC,
            person_id=fake_person_id,
        )
        ethnicity_rows = normalized_database_base_dict_list([hispanic_ethnicity])

        referral = schema.StateProgramAssignment(
            state_code='CA',
            program_assignment_id=123,
            referral_date=date(2015, 5, 10),
            person_id=fake_person_id,
        )

        lsir_assessment = schema.StateAssessment(
            assessment_id=298374,
            assessment_date=date(2015, 3, 19),
            assessment_type='LSIR',
            person_id=fake_person_id,
        )

        probation_period = schema.StateSupervisionPeriod(
            supervision_period_id=fake_supervision_period_id,
            state_code='CA',
            county_code='124',
            start_date=date(2015, 3, 14),
            termination_date=date(2016, 12, 29),
            supervision_type=StateSupervisionType.PROBATION,
            person_id=fake_person_id,
        )

        program_assignment_rows = [normalized_database_base_dict(referral)]
        assessment_rows = [normalized_database_base_dict(lsir_assessment)]
        supervision_period_rows = [
            normalized_database_base_dict(probation_period)
        ]

        violation_response = \
            database_test_utils.generate_test_supervision_violation_response(
                fake_person_id)
        violation_response_rows = [
            normalized_database_base_dict(violation_response)
        ]

        return {
            schema.StatePerson.__tablename__: person_rows,
            schema.StatePersonRace.__tablename__: race_rows,
            schema.StatePersonEthnicity.__tablename__: ethnicity_rows,
            schema.StateSupervisionViolationResponse.__tablename__:
            violation_response_rows,
            schema.StateSupervisionPeriod.__tablename__:
            supervision_period_rows,
            schema.StateProgramAssignment.__tablename__:
            program_assignment_rows,
            schema.StateAssessment.__tablename__: assessment_rows,
            schema.StatePersonExternalId.__tablename__: [],
            schema.StatePersonAlias.__tablename__: [],
            schema.StateSentenceGroup.__tablename__: [],
        }

    def testProgramPipeline(self):
        """Runs the program pipeline against the basic single-person data dict."""
        person_id = 12345
        supervision_period_id = 12345
        dataset = 'recidiviz-123.state'

        table_data = self.build_data_dict(person_id, supervision_period_id)
        fake_bq_source = \
            self.fake_bq_source_factory.create_fake_bq_source_constructor(
                dataset, table_data)

        # Swap the BigQuery reader used by extractor_utils for the fake source.
        with patch(
                'recidiviz.calculator.pipeline.utils.extractor_utils.ReadFromBigQuery',
                fake_bq_source):
            self.run_test_pipeline(dataset, supervision_period_id)

    def testProgramPipelineWithFilterSet(self):
        """Runs the program pipeline with a unifying id filter set that
        contains only the fake person's id."""
        person_id = 12345
        supervision_period_id = 12345
        dataset = 'recidiviz-123.state'

        table_data = self.build_data_dict(person_id, supervision_period_id)
        fake_bq_source = \
            self.fake_bq_source_factory.create_fake_bq_source_constructor(
                dataset, table_data)

        # Swap the BigQuery reader used by extractor_utils for the fake source.
        with patch(
                'recidiviz.calculator.pipeline.utils.extractor_utils.ReadFromBigQuery',
                fake_bq_source):
            self.run_test_pipeline(
                dataset,
                supervision_period_id,
                unifying_id_field_filter_set={person_id})

    def run_test_pipeline(
            self,
            dataset: str,
            fake_supervision_period_id: int,
            unifying_id_field_filter_set: Optional[Set[int]] = None,
            metric_types_filter: Optional[Set[str]] = None):
        """Runs a test version of the program pipeline.

        Builds a TestPipeline that loads StatePerson, StateProgramAssignment,
        StateAssessment and StateSupervisionPeriod root entities through
        extractor_utils.BuildRootEntity, groups them with CoGroupByKey,
        classifies the program assignments, computes program metrics and
        validates the result with AssertMatchers.validate_pipeline_test().

        Args:
            dataset: Dataset name handed to every BuildRootEntity load.
            fake_supervision_period_id: supervision_period_id placed in the
                single supervision-period-to-agent association row.
            unifying_id_field_filter_set: Optional filter set forwarded to the
                program assignment, assessment and supervision period loads.
            metric_types_filter: Optional set of metric types to pass to
                GetProgramMetrics; when empty or None, {'ALL'} is used.
        """
        test_pipeline = TestPipeline()

        # Get StatePersons
        # NOTE(review): unlike the loads below, this one does not receive
        # unifying_id_field_filter_set -- confirm that is intentional.
        persons = (
            test_pipeline
            | 'Load Persons' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StatePerson,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True))

        # Get StateProgramAssignments
        program_assignments = (
            test_pipeline
            | 'Load Program Assignments' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateAssessments
        assessments = (
            test_pipeline
            | 'Load Assessments' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateAssessment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateSupervisionPeriods
        supervision_periods = (
            test_pipeline
            | 'Load SupervisionPeriods' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Single hand-built association row linking the fake supervision
        # period to an agent.
        supervision_period_to_agent_map = {
            'agent_id': 1010,
            'agent_external_id': 'OFFICER0009',
            'district_external_id': '10',
            'supervision_period_id': fake_supervision_period_id
        }

        supervision_period_to_agent_associations = (
            test_pipeline
            | 'Create SupervisionPeriod to Agent table' >> beam.Create(
                [supervision_period_to_agent_map]))

        # Convert the association rows to KV tuples using the
        # 'supervision_period_id' field.
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert SupervisionPeriod to Agent table to KV tuples' >>
            beam.ParDo(pipeline.ConvertDictToKVTuple(),
                       'supervision_period_id'))

        # Group each StatePerson with their other entities
        # NOTE(review): the step label below ends in "and", which looks
        # truncated; it is a runtime string so it is left untouched here.
        persons_entities = ({
            'person': persons,
            'program_assignments': program_assignments,
            'assessments': assessments,
            'supervision_periods': supervision_periods
        }
                            |
                            'Group StatePerson to StateProgramAssignments and'
                            >> beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's
        # StateProgramAssignments
        # The agent associations are supplied as a dict side input.
        person_program_events = (
            persons_entities
            | beam.ParDo(
                pipeline.ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = PipelineOptions().get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Fall back to the 'ALL' metric type when no filter was supplied.
        metric_types = metric_types_filter if metric_types_filter else {'ALL'}

        # Get program metrics
        program_metrics = (
            person_program_events
            | 'Get Program Metrics' >>  # type: ignore
            pipeline.GetProgramMetrics(pipeline_options=all_pipeline_options,
                                       metric_types=metric_types,
                                       calculation_end_month=None,
                                       calculation_month_count=-1))

        # Register the output check before running the pipeline.
        assert_that(program_metrics, AssertMatchers.validate_pipeline_test())

        test_pipeline.run()

    def testProgramPipelineNoReferrals(self):
        """Runs the program pipeline when the only program assignment belongs
        to a second person.

        The first person gets race, ethnicity, assessment, supervision period
        and violation response rows but no program assignment.
        """
        person_without_referral_id = 12345
        person_with_referral_id = 9876

        person_without_referral = schema.StatePerson(
            person_id=person_without_referral_id,
            gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT,
        )
        person_with_referral = schema.StatePerson(
            person_id=person_with_referral_id,
            gender=Gender.MALE,
            birthdate=date(1974, 3, 12),
            residency_status=ResidencyStatus.PERMANENT,
        )
        person_rows = normalized_database_base_dict_list(
            [person_without_referral, person_with_referral])

        # Both race rows belong to the first person.
        black_race = schema.StatePersonRace(
            person_race_id=111,
            state_code='CA',
            race=Race.BLACK,
            person_id=person_without_referral_id,
        )
        white_race = schema.StatePersonRace(
            person_race_id=111,
            state_code='ND',
            race=Race.WHITE,
            person_id=person_without_referral_id,
        )
        race_rows = normalized_database_base_dict_list([black_race, white_race])

        hispanic_ethnicity = schema.StatePersonEthnicity(
            person_ethnicity_id=111,
            state_code='CA',
            ethnicity=Ethnicity.HISPANIC,
            person_id=person_without_referral_id,
        )
        ethnicity_rows = normalized_database_base_dict_list([hispanic_ethnicity])

        # The only program assignment is attached to the second person.
        referral = schema.StateProgramAssignment(
            state_code='CA',
            program_assignment_id=123,
            referral_date=date(2015, 5, 10),
            person_id=person_with_referral_id,
        )

        lsir_assessment = schema.StateAssessment(
            assessment_id=298374,
            assessment_date=date(2015, 3, 19),
            assessment_type='LSIR',
            person_id=person_without_referral_id,
        )

        supervision_period = schema.StateSupervisionPeriod(
            supervision_period_id=1111,
            state_code='CA',
            county_code='124',
            start_date=date(2015, 3, 14),
            termination_date=date(2016, 12, 29),
            supervision_type=StateSupervisionType.PROBATION,
            person_id=person_without_referral_id,
        )

        program_assignment_rows = [normalized_database_base_dict(referral)]
        assessment_rows = [normalized_database_base_dict(lsir_assessment)]
        supervision_period_rows = [
            normalized_database_base_dict(supervision_period)
        ]

        violation_response = \
            database_test_utils.generate_test_supervision_violation_response(
                person_without_referral_id)
        violation_response_rows = [
            normalized_database_base_dict(violation_response)
        ]

        table_data = {
            schema.StatePerson.__tablename__: person_rows,
            schema.StatePersonRace.__tablename__: race_rows,
            schema.StatePersonEthnicity.__tablename__: ethnicity_rows,
            schema.StateSupervisionViolationResponse.__tablename__:
            violation_response_rows,
            schema.StateSupervisionPeriod.__tablename__:
            supervision_period_rows,
            schema.StateProgramAssignment.__tablename__:
            program_assignment_rows,
            schema.StateAssessment.__tablename__: assessment_rows,
            schema.StatePersonExternalId.__tablename__: [],
            schema.StatePersonAlias.__tablename__: [],
            schema.StateSentenceGroup.__tablename__: [],
        }

        dataset = 'recidiviz-123.state'
        fake_bq_source = \
            self.fake_bq_source_factory.create_fake_bq_source_constructor(
                dataset, table_data)

        # Swap the BigQuery reader used by extractor_utils for the fake source.
        with patch(
                'recidiviz.calculator.pipeline.utils.extractor_utils.ReadFromBigQuery',
                fake_bq_source):
            self.run_test_pipeline(dataset,
                                   supervision_period.supervision_period_id)
Exemple #5
0
class TestProgramPipeline(unittest.TestCase):
    """Tests the entire program pipeline."""

    def setUp(self) -> None:
        """Creates the fake BigQuery factories and starts the assessment types patch."""
        self.fake_bq_source_factory = FakeReadFromBigQueryFactory()
        self.fake_bq_sink_factory = FakeWriteToBigQueryFactory(FakeWriteToBigQuery)

        # Calls to the patched lookup return a list containing only ORAS.
        assessment_types_target = (
            "recidiviz.calculator.pipeline.program.identifier.assessment_utils."
            "_assessment_types_of_class_for_state"
        )
        self.assessment_types_patcher = mock.patch(assessment_types_target)
        self.mock_assessment_types = self.assessment_types_patcher.start()
        self.mock_assessment_types.return_value = [StateAssessmentType.ORAS]

    def tearDown(self) -> None:
        """Stops the assessment types patch started in setUp."""
        self.assessment_types_patcher.stop()

    @staticmethod
    def build_data_dict(fake_person_id: int, fake_supervision_period_id: int):
        """Assembles the fake table contents for a basic run of the pipeline.

        Args:
            fake_person_id: person_id given to the StatePerson row and to every
                other row built here that carries a person_id.
            fake_supervision_period_id: supervision_period_id given to the
                StateSupervisionPeriod row and to the agent association row.

        Returns:
            A dict keyed by table name whose values are lists of row dicts
            (empty lists for the tables that get no rows).
        """
        person = schema.StatePerson(
            state_code="US_XX",
            person_id=fake_person_id,
            gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT,
        )
        person_rows = [normalized_database_base_dict(person)]

        # Two race rows attached to the same person.
        black_race = schema.StatePersonRace(
            person_race_id=111,
            state_code="US_XX",
            race=Race.BLACK,
            person_id=fake_person_id,
        )
        white_race = schema.StatePersonRace(
            person_race_id=111,
            state_code="US_XX",
            race=Race.WHITE,
            person_id=fake_person_id,
        )
        race_rows = normalized_database_base_dict_list([black_race, white_race])

        hispanic_ethnicity = schema.StatePersonEthnicity(
            person_ethnicity_id=111,
            state_code="US_XX",
            ethnicity=Ethnicity.HISPANIC,
            person_id=fake_person_id,
        )
        ethnicity_rows = normalized_database_base_dict_list([hispanic_ethnicity])

        referral = schema.StateProgramAssignment(
            state_code="US_XX",
            program_assignment_id=123,
            referral_date=date(2015, 5, 10),
            person_id=fake_person_id,
            participation_status=StateProgramAssignmentParticipationStatus.IN_PROGRESS,
        )

        lsir_assessment = schema.StateAssessment(
            assessment_id=298374,
            state_code="US_XX",
            assessment_date=date(2015, 3, 19),
            assessment_type="LSIR",
            person_id=fake_person_id,
        )

        probation_period = schema.StateSupervisionPeriod(
            supervision_period_id=fake_supervision_period_id,
            state_code="US_XX",
            county_code="124",
            start_date=date(2015, 3, 14),
            termination_date=date(2016, 12, 29),
            supervision_type=StateSupervisionType.PROBATION,
            person_id=fake_person_id,
            status=StateSupervisionPeriodStatus.PRESENT_WITHOUT_INFO,
        )

        program_assignment_rows = [normalized_database_base_dict(referral)]
        assessment_rows = [normalized_database_base_dict(lsir_assessment)]
        supervision_period_rows = [normalized_database_base_dict(probation_period)]

        violation_response = (
            database_test_utils.generate_test_supervision_violation_response(
                fake_person_id
            )
        )
        violation_response_rows = [normalized_database_base_dict(violation_response)]

        # Hand-written rows for the two tables that have no schema class here.
        agent_association_rows = [
            {
                "agent_id": 1010,
                "person_id": fake_person_id,
                "state_code": "US_XX",
                "agent_external_id": "OFFICER0009",
                "supervision_period_id": fake_supervision_period_id,
            }
        ]
        population_count_rows = [
            {
                "state_code": "US_XX",
                "race_or_ethnicity": "BLACK",
                "population_count": 1,
                "representation_priority": 1,
            }
        ]

        return {
            schema.StatePerson.__tablename__: person_rows,
            schema.StatePersonRace.__tablename__: race_rows,
            schema.StatePersonEthnicity.__tablename__: ethnicity_rows,
            schema.StateSupervisionViolationResponse.__tablename__: violation_response_rows,
            schema.StateSupervisionPeriod.__tablename__: supervision_period_rows,
            schema.StateProgramAssignment.__tablename__: program_assignment_rows,
            schema.StateAssessment.__tablename__: assessment_rows,
            schema.StatePersonExternalId.__tablename__: [],
            schema.StatePersonAlias.__tablename__: [],
            schema.StateSentenceGroup.__tablename__: [],
            "supervision_period_to_agent_association": agent_association_rows,
            "state_race_ethnicity_population_counts": population_count_rows,
        }

    def testProgramPipeline(self):
        """Runs the program pipeline against the basic single-person data dict."""
        person_id = 12345
        supervision_period_id = 12345
        table_data = self.build_data_dict(person_id, supervision_period_id)

        self.run_test_pipeline("recidiviz-123.state", table_data)

    def testProgramPipelineWithFilterSet(self):
        """Runs the program pipeline with a unifying id filter set that
        contains only the fake person's id."""
        person_id = 12345
        supervision_period_id = 12345
        table_data = self.build_data_dict(person_id, supervision_period_id)

        self.run_test_pipeline(
            "recidiviz-123.state",
            table_data,
            unifying_id_field_filter_set={person_id},
        )

    def run_test_pipeline(
        self,
        dataset: str,
        data_dict: DataTablesDict,
        unifying_id_field_filter_set: Optional[Set[int]] = None,
        metric_types_filter: Optional[Set[str]] = None,
    ):
        """Runs a test version of the program pipeline.

        Args:
            dataset: Dataset name given to the fake BigQuery source and sink
                constructors and to the pipeline run.
            data_dict: Table contents served by the fake BigQuery source.
            unifying_id_field_filter_set: Optional filter set forwarded to the
                pipeline run.
            metric_types_filter: Optional metric type filter forwarded to the
                pipeline run.
        """
        fake_source_constructor = (
            self.fake_bq_source_factory.create_fake_bq_source_constructor(
                dataset, data_dict
            )
        )

        # The fake sink is told to expect only PROGRAM_REFERRAL metrics.
        fake_sink_constructor = (
            self.fake_bq_sink_factory.create_fake_bq_sink_constructor(
                dataset,
                expected_output_metric_types={ProgramMetricType.PROGRAM_REFERRAL},
            )
        )

        # Delegates to the module-level run_test_pipeline helper, not this method.
        run_test_pipeline(
            pipeline_module=pipeline,
            state_code="US_XX",
            dataset=dataset,
            read_from_bq_constructor=fake_source_constructor,
            write_to_bq_constructor=fake_sink_constructor,
            unifying_id_field_filter_set=unifying_id_field_filter_set,
            metric_types_filter=metric_types_filter,
        )

    def testProgramPipelineNoReferrals(self):
        """Runs the program pipeline when the only program assignment belongs
        to a second person.

        The first person gets race, ethnicity, assessment, supervision period,
        violation response and agent association rows but no program
        assignment.
        """
        person_without_referral_id = 12345
        person_with_referral_id = 9876

        person_without_referral = schema.StatePerson(
            state_code="US_XX",
            person_id=person_without_referral_id,
            gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT,
        )
        person_with_referral = schema.StatePerson(
            state_code="US_XX",
            person_id=person_with_referral_id,
            gender=Gender.MALE,
            birthdate=date(1974, 3, 12),
            residency_status=ResidencyStatus.PERMANENT,
        )
        person_rows = normalized_database_base_dict_list(
            [person_without_referral, person_with_referral]
        )

        # Both race rows belong to the first person.
        black_race = schema.StatePersonRace(
            person_race_id=111,
            state_code="US_XX",
            race=Race.BLACK,
            person_id=person_without_referral_id,
        )
        white_race = schema.StatePersonRace(
            person_race_id=111,
            state_code="US_XX",
            race=Race.WHITE,
            person_id=person_without_referral_id,
        )
        race_rows = normalized_database_base_dict_list([black_race, white_race])

        hispanic_ethnicity = schema.StatePersonEthnicity(
            person_ethnicity_id=111,
            state_code="US_XX",
            ethnicity=Ethnicity.HISPANIC,
            person_id=person_without_referral_id,
        )
        ethnicity_rows = normalized_database_base_dict_list([hispanic_ethnicity])

        # The only program assignment is attached to the second person.
        referral = schema.StateProgramAssignment(
            state_code="US_XX",
            program_assignment_id=123,
            referral_date=date(2015, 5, 10),
            person_id=person_with_referral_id,
            participation_status=StateProgramAssignmentParticipationStatus.DENIED,
        )

        lsir_assessment = schema.StateAssessment(
            assessment_id=298374,
            state_code="US_XX",
            assessment_date=date(2015, 3, 19),
            assessment_type="LSIR",
            person_id=person_without_referral_id,
        )

        supervision_period = schema.StateSupervisionPeriod(
            supervision_period_id=1111,
            state_code="US_XX",
            county_code="124",
            start_date=date(2015, 3, 14),
            termination_date=date(2016, 12, 29),
            supervision_type=StateSupervisionType.PROBATION,
            person_id=person_without_referral_id,
            status=StateSupervisionPeriodStatus.PRESENT_WITHOUT_INFO,
        )

        program_assignment_rows = [normalized_database_base_dict(referral)]
        assessment_rows = [normalized_database_base_dict(lsir_assessment)]
        supervision_period_rows = [normalized_database_base_dict(supervision_period)]

        violation_response = (
            database_test_utils.generate_test_supervision_violation_response(
                person_without_referral_id
            )
        )
        violation_response_rows = [normalized_database_base_dict(violation_response)]

        # Hand-written rows for the two tables that have no schema class here.
        agent_association_rows = [
            {
                "agent_id": 1010,
                "person_id": person_without_referral_id,
                "state_code": "US_XX",
                "agent_external_id": "OFFICER0009",
                "supervision_period_id": supervision_period.supervision_period_id,
            }
        ]
        population_count_rows = [
            {
                "state_code": "US_XX",
                "race_or_ethnicity": "BLACK",
                "population_count": 1,
                "representation_priority": 1,
            }
        ]

        table_data = {
            schema.StatePerson.__tablename__: person_rows,
            schema.StatePersonRace.__tablename__: race_rows,
            schema.StatePersonEthnicity.__tablename__: ethnicity_rows,
            schema.StateSupervisionViolationResponse.__tablename__: violation_response_rows,
            schema.StateSupervisionPeriod.__tablename__: supervision_period_rows,
            schema.StateProgramAssignment.__tablename__: program_assignment_rows,
            schema.StateAssessment.__tablename__: assessment_rows,
            schema.StatePersonExternalId.__tablename__: [],
            schema.StatePersonAlias.__tablename__: [],
            schema.StateSentenceGroup.__tablename__: [],
            "supervision_period_to_agent_association": agent_association_rows,
            "state_race_ethnicity_population_counts": population_count_rows,
        }

        self.run_test_pipeline("recidiviz-123.state", table_data)
Exemple #6
0
class TestIncarcerationPipeline(unittest.TestCase):
    """Tests the entire incarceration pipeline."""
    def setUp(self) -> None:
        """Create a fresh fake BigQuery source factory for the test."""
        source_factory = FakeReadFromBigQueryFactory()
        self.fake_bq_source_factory = source_factory

    @staticmethod
    def _default_data_dict():
        """Returns a data_dict mapping each table name listed below to an empty list.

        A new dict with new lists is built on every call, so a caller can
        replace or mutate entries without affecting other callers.
        """
        table_names = (
            schema.StatePerson.__tablename__,
            schema.StatePersonRace.__tablename__,
            schema.StatePersonEthnicity.__tablename__,
            schema.StateSentenceGroup.__tablename__,
            schema.StateIncarcerationSentence.__tablename__,
            schema.StateSupervisionSentence.__tablename__,
            schema.StateIncarcerationPeriod.__tablename__,
            schema.state_incarceration_sentence_incarceration_period_association_table.name,
            schema.state_supervision_sentence_incarceration_period_association_table.name,
            schema.StatePersonExternalId.__tablename__,
            schema.StatePersonAlias.__tablename__,
            schema.StateAssessment.__tablename__,
            schema.StateProgramAssignment.__tablename__,
            schema.StateFine.__tablename__,
            schema.StateCharge.__tablename__,
            schema.StateSupervisionPeriod.__tablename__,
            schema.StateEarlyDischarge.__tablename__,
            schema.state_charge_incarceration_sentence_association_table.name,
            schema.state_charge_supervision_sentence_association_table.name,
            schema.state_incarceration_sentence_supervision_period_association_table.name,
            schema.state_supervision_sentence_supervision_period_association_table.name,
        )
        return {table_name: [] for table_name in table_names}

    def build_incarceration_pipeline_data_dict(self,
                                               fake_person_id: int,
                                               state_code: str = 'US_XX'):
        """Builds a data_dict for a basic run of the pipeline.

        The fixture holds one person with two race rows and one ethnicity row,
        a sentence group, a supervision sentence, and an incarceration sentence
        that is linked to three incarceration periods (two with a release date,
        one without).

        Args:
            fake_person_id: person_id set on the person and on every other
                entity built here.
            state_code: state_code set on every entity built here except the
                sentence group, which is created without one.

        Returns:
            The result of _default_data_dict() with the tables populated here
            replaced by their rows.
        """
        person = schema.StatePerson(
            state_code=state_code,
            person_id=fake_person_id,
            gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT)

        # NOTE(review): both race rows share person_race_id 111 -- confirm the
        # duplicate id is intended.
        races = [
            schema.StatePersonRace(person_race_id=111,
                                   state_code=state_code,
                                   race=race,
                                   person_id=fake_person_id)
            for race in (Race.BLACK, Race.WHITE)
        ]

        ethnicities = [
            schema.StatePersonEthnicity(person_ethnicity_id=111,
                                        state_code=state_code,
                                        ethnicity=Ethnicity.HISPANIC,
                                        person_id=fake_person_id)
        ]

        group = schema.StateSentenceGroup(sentence_group_id=111,
                                          person_id=fake_person_id)

        # Fields that are identical on all three incarceration periods.
        common_period_fields = {
            'incarceration_type':
            StateIncarcerationType.STATE_PRISON,
            'state_code':
            state_code,
            'county_code':
            '124',
            'facility':
            'San Quentin',
            'facility_security_level':
            StateIncarcerationFacilitySecurityLevel.MAXIMUM,
            'admission_reason':
            StateIncarcerationPeriodAdmissionReason.NEW_ADMISSION,
            'projected_release_reason':
            StateIncarcerationPeriodReleaseReason.CONDITIONAL_RELEASE,
            'person_id':
            fake_person_id,
        }

        periods = [
            # Initial incarceration.
            schema.StateIncarcerationPeriod(
                incarceration_period_id=1111,
                status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
                admission_date=date(2008, 11, 20),
                release_date=date(2010, 12, 4),
                release_reason=StateIncarcerationPeriodReleaseReason.
                SENTENCE_SERVED,
                **common_period_fields),
            # First reincarceration.
            schema.StateIncarcerationPeriod(
                incarceration_period_id=2222,
                status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
                admission_date=date(2011, 4, 5),
                release_date=date(2014, 4, 14),
                release_reason=StateIncarcerationPeriodReleaseReason.
                SENTENCE_SERVED,
                **common_period_fields),
            # Subsequent reincarceration: no release date or release reason.
            schema.StateIncarcerationPeriod(
                incarceration_period_id=3333,
                status=StateIncarcerationPeriodStatus.IN_CUSTODY,
                admission_date=date(2017, 1, 4),
                **common_period_fields),
        ]

        prison_sentence = schema.StateIncarcerationSentence(
            incarceration_sentence_id=1111,
            state_code=state_code,
            sentence_group_id=group.sentence_group_id,
            incarceration_periods=list(periods),
            person_id=fake_person_id)

        probation_sentence = schema.StateSupervisionSentence(
            supervision_sentence_id=123,
            state_code=state_code,
            person_id=fake_person_id)

        # Attach the incarceration sentence before the group is normalized.
        group.incarceration_sentences = [prison_sentence]

        # One association row per incarceration period, all pointing at the
        # single incarceration sentence.
        sentence_period_links = [{
            'incarceration_period_id':
            period.incarceration_period_id,
            'incarceration_sentence_id':
            prison_sentence.incarceration_sentence_id,
        } for period in periods]

        populated_tables = {
            schema.StatePerson.__tablename__:
            [normalized_database_base_dict(person)],
            schema.StatePersonRace.__tablename__:
            normalized_database_base_dict_list(races),
            schema.StatePersonEthnicity.__tablename__:
            normalized_database_base_dict_list(ethnicities),
            schema.StateSentenceGroup.__tablename__:
            [normalized_database_base_dict(group)],
            schema.StateIncarcerationSentence.__tablename__:
            [normalized_database_base_dict(prison_sentence)],
            schema.StateSupervisionSentence.__tablename__:
            [normalized_database_base_dict(probation_sentence)],
            schema.StateIncarcerationPeriod.__tablename__:
            [normalized_database_base_dict(period) for period in periods],
            schema.state_incarceration_sentence_incarceration_period_association_table.name:
            sentence_period_links,
        }

        result = self._default_data_dict()
        result.update(populated_tables)
        return result

    def testIncarcerationPipeline(self):
        """Runs the pipeline on the basic fixture, expecting ALL_METRIC_TYPES_SET."""
        person_id = 12345
        dataset = 'recidiviz-123.state'
        fixture = self.build_incarceration_pipeline_data_dict(
            fake_person_id=person_id)

        # Reads from BigQuery in extractor_utils are served from the fixture.
        fake_source = self.fake_bq_source_factory.create_fake_bq_source_constructor(
            dataset, fixture)
        read_target = 'recidiviz.calculator.pipeline.utils.extractor_utils.ReadFromBigQuery'

        with patch(read_target, fake_source):
            self.run_test_pipeline(
                person_id,
                _STATE_CODE,
                dataset,
                expected_metric_types=ALL_METRIC_TYPES_SET)

    def testIncarcerationPipelineFilterMetrics(self):
        """Runs the pipeline with a metric type filter limited to admissions."""
        person_id = 12345
        dataset = 'recidiviz-123.state'
        fixture = self.build_incarceration_pipeline_data_dict(
            fake_person_id=person_id)

        admission = IncarcerationMetricType.INCARCERATION_ADMISSION
        # The filter is expressed as enum values; the expectation as enum members.
        expected_types = {admission}
        type_filter = {admission.value}

        fake_source = self.fake_bq_source_factory.create_fake_bq_source_constructor(
            dataset, fixture)
        read_target = 'recidiviz.calculator.pipeline.utils.extractor_utils.ReadFromBigQuery'

        with patch(read_target, fake_source):
            self.run_test_pipeline(
                person_id,
                _STATE_CODE,
                dataset,
                expected_metric_types=expected_types,
                metric_types_filter=type_filter)

    def testIncarcerationPipelineUsMo(self):
        """Runs the pipeline on the basic fixture built with state code US_MO."""
        person_id = 12345
        us_mo = 'US_MO'
        dataset = 'recidiviz-123.state'
        fixture = self.build_incarceration_pipeline_data_dict(
            fake_person_id=person_id, state_code=us_mo)

        fake_source = self.fake_bq_source_factory.create_fake_bq_source_constructor(
            dataset, fixture)
        read_target = 'recidiviz.calculator.pipeline.utils.extractor_utils.ReadFromBigQuery'

        with patch(read_target, fake_source):
            self.run_test_pipeline(
                person_id,
                us_mo,
                dataset,
                expected_metric_types=ALL_METRIC_TYPES_SET)

    def testIncarcerationPipelineWithFilterSet(self):
        """Runs the pipeline with the person's id passed as the id filter set."""
        person_id = 12345
        # NOTE(review): 'recidivz' looks like a typo of 'recidiviz'; the same
        # string is handed to both the fake source and the pipeline, so it is
        # kept unchanged here.
        dataset = 'recidivz-staging.state'
        fixture = self.build_incarceration_pipeline_data_dict(
            fake_person_id=person_id)

        fake_source = self.fake_bq_source_factory.create_fake_bq_source_constructor(
            dataset, fixture)
        read_target = 'recidiviz.calculator.pipeline.utils.extractor_utils.ReadFromBigQuery'

        with patch(read_target, fake_source):
            self.run_test_pipeline(
                person_id,
                _STATE_CODE,
                dataset,
                unifying_id_field_filter_set={person_id},
                expected_metric_types=ALL_METRIC_TYPES_SET)

    # TODO(#4375): Update tests to run actual pipeline code and only mock BQ I/O
    @staticmethod
    def run_test_pipeline(
            fake_person_id: int,
            state_code: str,
            dataset: str,
            expected_metric_types: Set[IncarcerationMetricType],
            allow_empty: bool = False,
            unifying_id_field_filter_set: Optional[Set[int]] = None,
            metric_types_filter: Optional[Set[str]] = None) -> None:
        """Runs a test version of the incarceration pipeline.

        Builds the Beam graph by hand on a TestPipeline: entities are loaded
        with extractor_utils.BuildRootEntity, while the MO sentence statuses,
        the person-to-county mapping, the judicial district association and the
        race/ethnicity population counts are hard-coded rows created in-line.
        Two assertions are registered on the resulting metrics before the
        pipeline is run.

        Args:
            fake_person_id: person_id written into the hard-coded sentence
                status, county and judicial district rows.
            state_code: state_code written into the hard-coded
                race/ethnicity population count row.
            dataset: Dataset passed to every BuildRootEntity load.
            expected_metric_types: Passed to
                AssertMatchers.validate_pipeline_test.
            allow_empty: Passed to AssertMatchers.validate_metric_type.
            unifying_id_field_filter_set: Passed to the sentence group,
                incarceration sentence and supervision sentence loads.
            metric_types_filter: Metric types handed to
                GetIncarcerationMetrics; when empty or None, {'ALL'} is used.
        """
        test_pipeline = TestPipeline()

        # Get StatePersons
        # NOTE(review): unlike the loads below, this one is not given
        # unifying_id_field_filter_set -- confirm that is intended.
        persons = (
            test_pipeline
            | 'Load Persons' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StatePerson,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True))

        # Get StateSentenceGroups
        sentence_groups = (
            test_pipeline
            | 'Load StateSentenceGroups' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateSentenceGroup,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            test_pipeline
            | 'Load StateIncarcerationSentences' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # Get StateSupervisionSentences
        supervision_sentences = (
            test_pipeline | 'Load StateSupervisionSentences' >>  # type: ignore
            extractor_utils.BuildRootEntity(
                dataset=dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=unifying_id_field_filter_set))

        # A single hard-coded sentence status row for the fake person; it is
        # created for every run, whatever state_code is passed in.
        us_mo_sentence_status_rows: List[Dict[str, Any]] = [{
            'person_id':
            fake_person_id,
            'sentence_external_id':
            'XXX',
            'sentence_status_external_id':
            'YYY',
            'status_code':
            'ZZZ',
            'status_date':
            'not_a_date',
            'status_description':
            'XYZ'
        }]

        us_mo_sentence_statuses = (test_pipeline
                                   | 'Create MO sentence statuses' >>
                                   beam.Create(us_mo_sentence_status_rows))

        # Key the status rows by person_id so they can be co-grouped below.
        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        # The conversion step emits two tagged outputs, one per sentence type.
        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences':
                sentences_converted.incarceration_sentences,
                'supervision_sentences':
                sentences_converted.supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Hard-coded person_id -> county_of_residence row, standing in for the
        # result of a BigQuery query
        fake_person_id_to_county_query_result = [{
            'person_id':
            fake_person_id,
            'county_of_residence':
            _COUNTY_OF_RESIDENCE
        }]
        person_id_to_county_kv = (
            test_pipeline
            | "Read person id to county associations from BigQuery" >>
            beam.Create(fake_person_id_to_county_query_result)
            | "Convert person_id to counties to KV" >> beam.ParDo(
                ConvertDictToKVTuple(), 'person_id'))

        # Hard-coded judicial district row; incarceration_period_id 123 is a
        # fixed value that is not derived from any argument.
        incarceration_period_judicial_district_association_row = \
            {'person_id': fake_person_id, 'incarceration_period_id': 123, 'judicial_district_code': 'NW'}

        ip_to_judicial_district_kv = (
            test_pipeline
            |
            "Read incarceration_period to judicial_district associations from BigQuery"
            >> beam.Create(
                [incarceration_period_judicial_district_association_row])
            | "Convert ips to judicial districts to KV" >> beam.ParDo(
                ConvertDictToKVTuple(), 'person_id'))

        # Hard-coded population count row, later passed to BuildPersonMetadata
        # as a side input.
        state_race_ethnicity_population_count = {
            'state_code': state_code,
            'race_or_ethnicity': 'BLACK',
            'population_count': 1,
            'representation_priority': 1
        }

        state_race_ethnicity_population_counts = (
            test_pipeline
            | 'Create state_race_ethnicity_population_count table' >>
            beam.Create([state_race_ethnicity_population_count]))

        # Group each StatePerson with their related entities
        person_entities = (
            {
                'person':
                persons,
                'sentence_groups':
                sentence_groups_with_hydrated_sentences,
                'incarceration_period_judicial_district_association':
                ip_to_judicial_district_kv
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Identify IncarcerationEvents events from the StatePerson's StateIncarcerationPeriods
        person_incarceration_events = (
            person_entities | 'Classify Incarceration Events' >> beam.ParDo(
                pipeline.ClassifyIncarcerationEvents(),
                AsDict(person_id_to_county_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_incarceration_events_with_metadata = (
            {
                'person_events': person_incarceration_events,
                'person_metadata': person_metadata
            }
            | 'Group IncarcerationEvents with person-level metadata' >>
            beam.CoGroupByKey()
            |
            'Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations'
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = PipelineOptions().get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # An empty or missing filter falls back to the {'ALL'} set.
        metric_types = metric_types_filter if metric_types_filter else {'ALL'}

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_incarceration_events_with_metadata
            | 'Get Incarceration Metrics' >>  # type: ignore
            pipeline.GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types,
                calculation_end_month=None,
                calculation_month_count=-1))

        # Both assertions are attached to the graph here, before
        # test_pipeline.run() is called below.
        assert_that(
            incarceration_metrics,
            AssertMatchers.validate_metric_type(allow_empty=allow_empty),
            'Assert that all metrics are of the expected type.')

        assert_that(
            incarceration_metrics,
            AssertMatchers.validate_pipeline_test(expected_metric_types),
            'Assert the type of metrics produced are expected')

        test_pipeline.run()

    def build_incarceration_pipeline_data_dict_no_incarceration(
            self, fake_person_id: int):
        """Builds a data_dict with two persons, one of whom has no incarceration data.

        The sentence group, both sentences and the single incarceration period
        all carry fake_person_id; the second person (person_id 6789) only has a
        StatePerson row.

        Args:
            fake_person_id: person_id of the first person and of every
                non-person entity built here.

        Returns:
            The result of _default_data_dict() with the tables populated here
            replaced by their rows.
        """
        second_person_id = 6789

        people = [
            schema.StatePerson(
                state_code='US_XX',
                person_id=fake_person_id,
                gender=Gender.MALE,
                birthdate=date(1970, 1, 1),
                residency_status=ResidencyStatus.PERMANENT),
            schema.StatePerson(
                state_code='US_XX',
                person_id=second_person_id,
                gender=Gender.FEMALE,
                birthdate=date(1990, 1, 1),
                residency_status=ResidencyStatus.PERMANENT),
        ]
        person_rows = [normalized_database_base_dict(each) for each in people]

        group = schema.StateSentenceGroup(sentence_group_id=111,
                                          person_id=fake_person_id)

        period = schema.StateIncarcerationPeriod(
            incarceration_period_id=1111,
            status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
            state_code='US_XX',
            county_code='124',
            facility='San Quentin',
            facility_security_level=StateIncarcerationFacilitySecurityLevel.
            MAXIMUM,
            admission_reason=StateIncarcerationPeriodAdmissionReason.
            NEW_ADMISSION,
            projected_release_reason=StateIncarcerationPeriodReleaseReason.
            CONDITIONAL_RELEASE,
            admission_date=date(2008, 11, 20),
            release_date=date(2010, 12, 4),
            release_reason=StateIncarcerationPeriodReleaseReason.
            SENTENCE_SERVED,
            person_id=fake_person_id)

        prison_sentence = schema.StateIncarcerationSentence(
            incarceration_sentence_id=1111,
            sentence_group_id=group.sentence_group_id,
            incarceration_periods=[period],
            person_id=fake_person_id)

        probation_sentence = schema.StateSupervisionSentence(
            supervision_sentence_id=123, person_id=fake_person_id)

        # Attach the incarceration sentence before the group is normalized.
        group.incarceration_sentences = [prison_sentence]

        # Single association row linking the period to the sentence.
        sentence_period_links = [{
            'incarceration_period_id':
            period.incarceration_period_id,
            'incarceration_sentence_id':
            prison_sentence.incarceration_sentence_id,
        }]

        populated_tables = {
            schema.StatePerson.__tablename__:
            person_rows,
            schema.StateSentenceGroup.__tablename__:
            [normalized_database_base_dict(group)],
            schema.StateIncarcerationSentence.__tablename__:
            [normalized_database_base_dict(prison_sentence)],
            schema.StateSupervisionSentence.__tablename__:
            [normalized_database_base_dict(probation_sentence)],
            schema.StateIncarcerationPeriod.__tablename__:
            [normalized_database_base_dict(period)],
            schema.state_incarceration_sentence_incarceration_period_association_table.name:
            sentence_period_links,
        }

        result = self._default_data_dict()
        result.update(populated_tables)
        return result

    def testIncarcerationPipelineNoIncarceration(self):
        """Tests the incarceration pipeline when a person doesn't have any
        incarceration periods.

        Runs with an empty set of expected metric types and allow_empty=True.
        """
        person_id = 12345
        dataset = 'recidiviz-123.state'
        fixture = self.build_incarceration_pipeline_data_dict_no_incarceration(
            person_id)

        fake_source = self.fake_bq_source_factory.create_fake_bq_source_constructor(
            dataset, fixture)
        read_target = 'recidiviz.calculator.pipeline.utils.extractor_utils.ReadFromBigQuery'

        with patch(read_target, fake_source):
            self.run_test_pipeline(
                person_id,
                _STATE_CODE,
                dataset,
                expected_metric_types=set(),
                allow_empty=True)
class TestViolationPipeline(unittest.TestCase):
    """Tests the entire violation pipeline."""
    def setUp(self) -> None:
        """Create the fake BigQuery factories and patch the violation delegate lookup.

        get_state_specific_violation_delegate in the violation identifier
        module is replaced by a mock that returns a UsXxViolationDelegate.
        The patcher is kept on the instance so it can be stopped later.
        """
        self.fake_bq_source_factory = FakeReadFromBigQueryFactory()
        sink_factory = FakeWriteToBigQueryFactory(FakeWriteToBigQuery)
        self.fake_bq_sink_factory = sink_factory

        delegate_patcher = mock.patch(
            "recidiviz.calculator.pipeline.violation.identifier.get_state_specific_violation_delegate"
        )
        self.violation_delegate_patcher = delegate_patcher
        self.mock_violation_delegate = delegate_patcher.start()
        self.mock_violation_delegate.return_value = UsXxViolationDelegate()

    def tearDown(self) -> None:
        """Stop the violation delegate patcher stored on the instance."""
        delegate_patcher = self.violation_delegate_patcher
        delegate_patcher.stop()

    @staticmethod
    def build_data_dict(
            fake_person_id: int,
            fake_supervision_violation_id: int) -> Dict[str, List[Any]]:
        """Builds a data_dict for a basic run of the pipeline.

        The fixture holds one US_XX person with two race rows and one ethnicity
        row, a single supervision violation with one violation type entry, and
        one violation response that carries a single decision entry.

        Args:
            fake_person_id: person_id set on every entity built here.
            fake_supervision_violation_id: Id of the violation; also written
                onto the violation type entry and both response objects.

        Returns:
            A mapping from table name to that table's list of row dicts.
        """
        us_xx = "US_XX"
        response_id = 1234
        response_date = date(2021, 1, 4)

        person = schema.StatePerson(
            state_code=us_xx,
            person_id=fake_person_id,
            gender=Gender.FEMALE,
            birthdate=date(1985, 2, 1),
        )

        # NOTE(review): both race rows share person_race_id 111 -- confirm the
        # duplicate id is intended.
        races = [
            schema.StatePersonRace(
                person_race_id=111,
                state_code=us_xx,
                race=race,
                person_id=fake_person_id,
            ) for race in (Race.ASIAN, Race.AMERICAN_INDIAN_ALASKAN_NATIVE)
        ]

        ethnicities = [
            schema.StatePersonEthnicity(
                person_ethnicity_id=111,
                state_code=us_xx,
                ethnicity=Ethnicity.NOT_HISPANIC,
                person_id=fake_person_id,
            )
        ]

        type_entry = schema.StateSupervisionViolationTypeEntry(
            state_code=us_xx,
            violation_type=StateSupervisionViolationType.FELONY,
            person_id=fake_person_id,
        )

        # Response without decisions: it is attached to the violation object,
        # but it is not the response written to the response table below.
        bare_response = schema.StateSupervisionViolationResponse(
            state_code=us_xx,
            supervision_violation_response_id=response_id,
            response_type=entities.StateSupervisionViolationResponseType.
            VIOLATION_REPORT,
            response_date=response_date,
            is_draft=False,
            person_id=fake_person_id,
        )

        violation = schema.StateSupervisionViolation(
            state_code=us_xx,
            supervision_violation_id=fake_supervision_violation_id,
            violation_date=date(2021, 1, 1),
            is_violent=False,
            is_sex_offense=False,
            supervision_violation_types=[type_entry],
            supervision_violation_responses=[bare_response],
            person_id=fake_person_id,
        )
        bare_response.supervision_violation_id = fake_supervision_violation_id
        type_entry.supervision_violation_id = fake_supervision_violation_id

        decision_entry = schema.StateSupervisionViolationResponseDecisionEntry(
            state_code=us_xx,
            decision=StateSupervisionViolationResponseDecision.
            SHOCK_INCARCERATION,
            person_id=fake_person_id,
            supervision_violation_response_decision_entry_id=234,
            supervision_violation_response_id=response_id,
        )

        # Same response id as bare_response, this time with the decision
        # entry attached; this is the object written to the response table.
        full_response = schema.StateSupervisionViolationResponse(
            state_code=us_xx,
            supervision_violation_response_id=response_id,
            response_type=entities.StateSupervisionViolationResponseType.
            VIOLATION_REPORT,
            response_date=response_date,
            is_draft=False,
            supervision_violation_response_decisions=[decision_entry],
            person_id=fake_person_id,
        )
        full_response.supervision_violation_id = fake_supervision_violation_id

        population_counts = [{
            "state_code": us_xx,
            "race_or_ethnicity": "ASIAN",
            "population_count": 1,
            "representation_priority": 1,
        }]

        tables: Dict[str, List[Any]] = {}
        tables[schema.StatePerson.__tablename__] = [
            normalized_database_base_dict(person)
        ]
        tables[schema.StatePersonRace.__tablename__] = (
            normalized_database_base_dict_list(races))
        tables[schema.StatePersonEthnicity.__tablename__] = (
            normalized_database_base_dict_list(ethnicities))
        tables[schema.StateSupervisionViolation.__tablename__] = [
            normalized_database_base_dict(violation)
        ]
        tables[schema.StateSupervisionViolationResponse.__tablename__] = [
            normalized_database_base_dict(full_response)
        ]
        tables[schema.StateSupervisionViolationTypeEntry.__tablename__] = [
            normalized_database_base_dict(type_entry)
        ]
        tables[schema.StateSupervisionViolatedConditionEntry.__tablename__] = []
        tables[schema.StateSupervisionViolationResponseDecisionEntry.
               __tablename__] = [normalized_database_base_dict(decision_entry)]
        tables[schema.StatePersonExternalId.__tablename__] = []
        tables[schema.StatePersonAlias.__tablename__] = []
        tables[schema.StateAssessment.__tablename__] = []
        tables[schema.StateProgramAssignment.__tablename__] = []
        tables[schema.StateSentenceGroup.__tablename__] = []
        tables["state_race_ethnicity_population_counts"] = population_counts
        return tables

    def run_test_pipeline(
        self,
        dataset: str,
        data_dict: DataTablesDict,
        expected_metric_types: Set[ViolationMetricType],
        unifying_id_field_filter_set: Optional[Set[int]] = None,
        metric_types_filter: Optional[Set[str]] = None,
    ) -> None:
        """Runs a test version of the violation pipeline.

        Builds a fake BigQuery source from data_dict and a fake BigQuery sink
        that is given expected_metric_types, then hands both to the
        module-level run_test_pipeline helper with state code US_XX.

        Args:
            dataset: Dataset name given to both fake constructors and the run.
            data_dict: Table rows served by the fake BigQuery source.
            expected_metric_types: Passed to the fake sink as
                expected_output_metric_types.
            unifying_id_field_filter_set: Forwarded unchanged to the helper.
            metric_types_filter: Forwarded unchanged to the helper.
        """
        source_factory = self.fake_bq_source_factory
        fake_source = source_factory.create_fake_bq_source_constructor(
            dataset, data_dict)

        sink_factory = self.fake_bq_sink_factory
        fake_sink = sink_factory.create_fake_bq_sink_constructor(
            dataset, expected_output_metric_types=expected_metric_types)

        # This name resolves to the module-level helper, not to this method.
        run_test_pipeline(
            pipeline=ViolationPipeline(),
            state_code="US_XX",
            dataset=dataset,
            read_from_bq_constructor=fake_source,
            write_to_bq_constructor=fake_sink,
            unifying_id_field_filter_set=unifying_id_field_filter_set,
            metric_types_filter=metric_types_filter,
        )

    def testViolationPipeline(self) -> None:
        """Tests the violations pipeline."""
        dataset = "recidiviz-123.state"
        fixture = self.build_data_dict(fake_person_id=12345,
                                       fake_supervision_violation_id=23456)
        expected_types = {ViolationMetricType.VIOLATION}

        self.run_test_pipeline(dataset,
                               fixture,
                               expected_metric_types=expected_types)

    def testViolationPipelineWithFilterSet(self) -> None:
        """Tests the violation pipeline with a proper filter set."""
        dataset = "recidiviz-123.state"
        fixture = self.build_data_dict(fake_person_id=12345,
                                       fake_supervision_violation_id=23456)
        expected_types = {ViolationMetricType.VIOLATION}
        # The filter set holds the same person id the fixture was built with.
        id_filter = {12345}

        self.run_test_pipeline(
            dataset,
            fixture,
            expected_metric_types=expected_types,
            unifying_id_field_filter_set=id_filter,
        )

    def testViolationPipelineWithNoViolations(self) -> None:
        """Tests the violation pipeline when a person does not have any violations."""
        dataset = "recidiviz-123.state"
        fixture = self.build_data_dict(fake_person_id=12345,
                                       fake_supervision_violation_id=23456)

        # Empty every violation-related table; the person rows stay in place.
        emptied_tables = (
            schema.StateSupervisionViolation,
            schema.StateSupervisionViolationResponse,
            schema.StateSupervisionViolationTypeEntry,
            schema.StateSupervisionViolationResponseDecisionEntry,
        )
        for table in emptied_tables:
            fixture[table.__tablename__] = []

        self.run_test_pipeline(dataset, fixture, expected_metric_types=set())
 def setUp(self) -> None:
     """Create fresh fake BigQuery source and sink factories."""
     self.fake_bq_source_factory = FakeReadFromBigQueryFactory()
     sink_factory = FakeWriteToBigQueryFactory(FakeWriteToBigQuery)
     self.fake_bq_sink_factory = sink_factory
class TestIncarcerationPipeline(unittest.TestCase):
    """Tests the entire incarceration pipeline."""
    def setUp(self) -> None:
        """Creates the fake BigQuery source and sink factories.

        run_test_pipeline reads both attributes to build its fake read and
        write constructors.
        """
        self.fake_bq_source_factory = FakeReadFromBigQueryFactory()
        sink_factory = FakeWriteToBigQueryFactory(FakeWriteToBigQuery)
        self.fake_bq_sink_factory = sink_factory

    @staticmethod
    def _default_data_dict():
        """Returns a data_dict that maps each table named below to an empty list.

        Every key receives its own new list object, so callers can replace or
        mutate one table's rows without affecting another.
        """
        table_names = [
            schema.StatePerson.__tablename__,
            schema.StatePersonRace.__tablename__,
            schema.StatePersonEthnicity.__tablename__,
            schema.StateSentenceGroup.__tablename__,
            schema.StateIncarcerationSentence.__tablename__,
            schema.StateSupervisionSentence.__tablename__,
            schema.StateIncarcerationPeriod.__tablename__,
            schema.
            state_incarceration_sentence_incarceration_period_association_table
            .name,
            schema.
            state_supervision_sentence_incarceration_period_association_table
            .name,
            schema.StatePersonExternalId.__tablename__,
            schema.StatePersonAlias.__tablename__,
            schema.StateAssessment.__tablename__,
            schema.StateProgramAssignment.__tablename__,
            schema.StateFine.__tablename__,
            schema.StateCharge.__tablename__,
            schema.StateSupervisionPeriod.__tablename__,
            schema.StateEarlyDischarge.__tablename__,
            schema.state_charge_incarceration_sentence_association_table.name,
            schema.state_charge_supervision_sentence_association_table.name,
            schema.
            state_incarceration_sentence_supervision_period_association_table
            .name,
            schema.
            state_supervision_sentence_supervision_period_association_table
            .name,
        ]
        return {table_name: [] for table_name in table_names}

    def build_incarceration_pipeline_data_dict(self,
                                               fake_person_id: int,
                                               state_code: str = "US_XX"):
        """Builds a data_dict for a basic run of the pipeline.

        The fixture holds one person (with two race rows and one ethnicity
        row), one sentence group carrying an incarceration sentence and a
        supervision sentence, and three STATE_PRISON incarceration periods
        attached to the incarceration sentence. These rows are layered on top
        of _default_data_dict() together with the county-of-residence,
        judicial-district, population-count and US_MO sentence-status entries.

        Args:
            fake_person_id: person_id stamped on every person-level row.
            state_code: state_code stamped on the rows built here, except the
                US_MO sentence-status row, which is always "US_MO".

        Returns:
            The populated data_dict.
        """
        person = schema.StatePerson(
            state_code=state_code,
            person_id=fake_person_id,
            gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT,
        )
        persons_data = [normalized_database_base_dict(person)]

        # NOTE(review): both race rows share person_race_id=111 — confirm
        # that the duplicate id is intentional.
        races = [
            schema.StatePersonRace(
                person_race_id=111,
                state_code=state_code,
                race=race,
                person_id=fake_person_id,
            ) for race in (Race.BLACK, Race.WHITE)
        ]
        races_data = normalized_database_base_dict_list(races)

        ethnicity_data = normalized_database_base_dict_list([
            schema.StatePersonEthnicity(
                person_ethnicity_id=111,
                state_code=state_code,
                ethnicity=Ethnicity.HISPANIC,
                person_id=fake_person_id,
            )
        ])

        sentence_group = schema.StateSentenceGroup(
            sentence_group_id=98765,
            state_code=state_code,
            status=StateSentenceStatus.PRESENT_WITHOUT_INFO,
            person_id=fake_person_id,
        )

        # Column values that are identical across all three periods.
        shared_period_fields = {
            "incarceration_type":
            StateIncarcerationType.STATE_PRISON,
            "state_code":
            state_code,
            "county_code":
            "124",
            "facility":
            "San Quentin",
            "facility_security_level":
            StateIncarcerationFacilitySecurityLevel.MAXIMUM,
            "admission_reason":
            StateIncarcerationPeriodAdmissionReason.NEW_ADMISSION,
            "projected_release_reason":
            StateIncarcerationPeriodReleaseReason.CONDITIONAL_RELEASE,
            "person_id":
            fake_person_id,
        }
        sentence_served = StateIncarcerationPeriodReleaseReason.SENTENCE_SERVED

        initial_incarceration = schema.StateIncarcerationPeriod(
            incarceration_period_id=1111,
            status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
            admission_date=date(2008, 11, 20),
            release_date=date(2010, 12, 4),
            release_reason=sentence_served,
            **shared_period_fields)

        first_reincarceration = schema.StateIncarcerationPeriod(
            incarceration_period_id=2222,
            status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
            admission_date=date(2011, 4, 5),
            release_date=date(2014, 4, 14),
            release_reason=sentence_served,
            **shared_period_fields)

        # The last period is still open: IN_CUSTODY with no release fields.
        subsequent_reincarceration = schema.StateIncarcerationPeriod(
            incarceration_period_id=3333,
            status=StateIncarcerationPeriodStatus.IN_CUSTODY,
            admission_date=date(2017, 1, 4),
            **shared_period_fields)

        incarceration_sentence = schema.StateIncarcerationSentence(
            incarceration_sentence_id=1111,
            state_code=state_code,
            status=StateSentenceStatus.PRESENT_WITHOUT_INFO,
            sentence_group_id=sentence_group.sentence_group_id,
            incarceration_periods=[
                initial_incarceration,
                first_reincarceration,
                subsequent_reincarceration,
            ],
            person_id=fake_person_id,
        )

        supervision_sentence = schema.StateSupervisionSentence(
            supervision_sentence_id=123,
            state_code=state_code,
            sentence_group_id=sentence_group.sentence_group_id,
            person_id=fake_person_id,
            status=StateSentenceStatus.PRESENT_WITHOUT_INFO,
        )

        sentence_group.incarceration_sentences = [incarceration_sentence]
        sentence_group.supervision_sentences = [supervision_sentence]

        # Rows are normalized only after the relationships above are wired.
        sentence_group_data = [normalized_database_base_dict(sentence_group)]
        incarceration_sentence_data = [
            normalized_database_base_dict(incarceration_sentence)
        ]
        supervision_sentence_data = [
            normalized_database_base_dict(supervision_sentence)
        ]

        ordered_periods = (
            initial_incarceration,
            first_reincarceration,
            subsequent_reincarceration,
        )
        incarceration_periods_data = [
            normalized_database_base_dict(period)
            for period in ordered_periods
        ]

        # One association row per period, all pointing at the one sentence.
        state_incarceration_sentence_incarceration_period_association = [{
            "incarceration_period_id":
            period.incarceration_period_id,
            "incarceration_sentence_id":
            incarceration_sentence.incarceration_sentence_id,
        } for period in ordered_periods]

        fake_person_id_to_county_query_result = [{
            "state_code": state_code,
            "person_id": fake_person_id,
            "county_of_residence": _COUNTY_OF_RESIDENCE,
        }]

        us_mo_sentence_status_data: List[Dict[str, Any]] = [{
            "state_code": "US_MO",
            "person_id": fake_person_id,
            "sentence_external_id": "XXX",
            "sentence_status_external_id": "YYY",
            "status_code": "ZZZ",
            "status_date": "not_a_date",
            "status_description": "XYZ",
        }]

        # NOTE(review): incarceration_period_id 123 does not match any of the
        # periods built above (1111, 2222, 3333) — confirm this is intended.
        incarceration_period_judicial_district_association_data = [{
            "state_code": state_code,
            "person_id": fake_person_id,
            "incarceration_period_id": 123,
            "judicial_district_code": "NW",
        }]

        state_race_ethnicity_population_count_data = [{
            "state_code": state_code,
            "race_or_ethnicity": "BLACK",
            "population_count": 1,
            "representation_priority": 1,
        }]

        period_association_table = (
            schema.
            state_incarceration_sentence_incarceration_period_association_table
        )

        data_dict = self._default_data_dict()
        data_dict.update({
            schema.StatePerson.__tablename__:
            persons_data,
            schema.StatePersonRace.__tablename__:
            races_data,
            schema.StatePersonEthnicity.__tablename__:
            ethnicity_data,
            schema.StateSentenceGroup.__tablename__:
            sentence_group_data,
            schema.StateIncarcerationSentence.__tablename__:
            incarceration_sentence_data,
            schema.StateSupervisionSentence.__tablename__:
            supervision_sentence_data,
            schema.StateIncarcerationPeriod.__tablename__:
            incarceration_periods_data,
            period_association_table.name:
            state_incarceration_sentence_incarceration_period_association,
            "persons_to_recent_county_of_residence":
            fake_person_id_to_county_query_result,
            "incarceration_period_judicial_district_association":
            incarceration_period_judicial_district_association_data,
            "state_race_ethnicity_population_counts":
            state_race_ethnicity_population_count_data,
            "us_mo_sentence_statuses":
            us_mo_sentence_status_data,
        })
        return data_dict

    @freeze_time("2015-01-31")
    def testIncarcerationPipeline(self):
        """Runs the pipeline on the basic data_dict with time frozen.

        Passes ALL_METRIC_TYPES_SET as the expected metric types.
        """
        pipeline_input = self.build_incarceration_pipeline_data_dict(
            fake_person_id=12345)

        self.run_test_pipeline(
            state_code=_STATE_CODE,
            dataset="recidiviz-123.state",
            data_dict=pipeline_input,
            expected_metric_types=ALL_METRIC_TYPES_SET,
        )

    @freeze_time("2015-01-31")
    def testIncarcerationPipelineFilterMetrics(self):
        """Runs the pipeline with a metric_types_filter, with time frozen.

        The filter holds the string value of INCARCERATION_ADMISSION, and the
        same enum member is passed as the only expected metric type.
        """
        admission_metric = IncarcerationMetricType.INCARCERATION_ADMISSION
        pipeline_input = self.build_incarceration_pipeline_data_dict(
            fake_person_id=12345)

        self.run_test_pipeline(
            state_code=_STATE_CODE,
            dataset="recidiviz-123.state",
            data_dict=pipeline_input,
            expected_metric_types={admission_metric},
            metric_types_filter={admission_metric.value},
        )

    def testIncarcerationPipelineUsMo(self):
        """Runs the pipeline with both the data_dict and the run under US_MO.

        Passes ALL_METRIC_TYPES_SET as the expected metric types.
        """
        us_mo = "US_MO"
        pipeline_input = self.build_incarceration_pipeline_data_dict(
            fake_person_id=12345, state_code=us_mo)

        self.run_test_pipeline(
            state_code=us_mo,
            dataset="recidiviz-123.state",
            data_dict=pipeline_input,
            expected_metric_types=ALL_METRIC_TYPES_SET,
        )

    def testIncarcerationPipelineWithFilterSet(self):
        """Runs the pipeline with a unifying_id_field_filter_set.

        The filter set contains the same person id the data_dict is built
        for, and ALL_METRIC_TYPES_SET is passed as the expected metric types.
        """
        person_id = 12345
        pipeline_input = self.build_incarceration_pipeline_data_dict(
            fake_person_id=person_id)

        # NOTE(review): "recidivz" looks like a misspelling of "recidiviz";
        # the same string feeds both the fake source and sink, so the test is
        # self-consistent — confirm before changing it.
        staging_dataset = "recidivz-staging.state"

        self.run_test_pipeline(
            state_code=_STATE_CODE,
            dataset=staging_dataset,
            data_dict=pipeline_input,
            expected_metric_types=ALL_METRIC_TYPES_SET,
            unifying_id_field_filter_set={person_id},
        )

    def run_test_pipeline(
        self,
        state_code: str,
        dataset: str,
        data_dict: Dict[str, List[Dict]],
        expected_metric_types: Set[IncarcerationMetricType],
        unifying_id_field_filter_set: Optional[Set[int]] = None,
        metric_types_filter: Optional[Set[str]] = None,
    ) -> None:
        """Runs a test version of the incarceration pipeline.

        Builds a fake BigQuery source constructor from data_dict and a fake
        sink constructor from expected_metric_types, patches ReadFromBigQuery
        in the package named by INCARCERATION_PIPELINE_PACKAGE_NAME with the
        fake source, and then calls the module-level run_test_pipeline helper.

        Args:
            state_code: Forwarded to the helper as state_code.
            dataset: Dataset handed to both fake factories and to the helper.
            data_dict: Rows per table, handed to the fake source factory.
            expected_metric_types: Handed to the fake sink factory as
                expected_output_metric_types.
            unifying_id_field_filter_set: Optional set forwarded to the helper.
            metric_types_filter: Optional set forwarded to the helper.
        """
        fake_source = self.fake_bq_source_factory.create_fake_bq_source_constructor(
            dataset, data_dict)
        fake_sink = self.fake_bq_sink_factory.create_fake_bq_sink_constructor(
            dataset,
            expected_output_metric_types=expected_metric_types,
        )
        patch_target = f"{INCARCERATION_PIPELINE_PACKAGE_NAME}.ReadFromBigQuery"

        # The bare name below is the module-level helper, not this method.
        with patch(patch_target, fake_source):
            run_test_pipeline(
                pipeline_module=pipeline,
                state_code=state_code,
                dataset=dataset,
                read_from_bq_constructor=fake_source,
                write_to_bq_constructor=fake_sink,
                unifying_id_field_filter_set=unifying_id_field_filter_set,
                metric_types_filter=metric_types_filter,
            )

    def build_incarceration_pipeline_data_dict_no_incarceration(
            self, fake_person_id: int):
        """Builds the data_dict used by the no-incarceration pipeline test.

        The fixture holds two people in US_XX (fake_person_id and a second
        person with id 6789), one sentence group, one incarceration sentence,
        one supervision sentence, and a single incarceration period that is
        created without an incarceration_type. No race or ethnicity rows are
        added. The rows are layered on top of _default_data_dict().

        Args:
            fake_person_id: person_id of the first person; every sentence and
                period row built here belongs to this person.

        Returns:
            The populated data_dict.
        """
        state_code = "US_XX"
        second_person_id = 6789

        first_person = schema.StatePerson(
            state_code=state_code,
            person_id=fake_person_id,
            gender=Gender.MALE,
            birthdate=date(1970, 1, 1),
            residency_status=ResidencyStatus.PERMANENT,
        )
        second_person = schema.StatePerson(
            state_code=state_code,
            person_id=second_person_id,
            gender=Gender.FEMALE,
            birthdate=date(1990, 1, 1),
            residency_status=ResidencyStatus.PERMANENT,
        )
        persons_data = [
            normalized_database_base_dict(one_person)
            for one_person in (first_person, second_person)
        ]

        sentence_group = schema.StateSentenceGroup(
            sentence_group_id=111,
            state_code=state_code,
            status=StateSentenceStatus.PRESENT_WITHOUT_INFO,
            person_id=fake_person_id,
        )

        # Unlike the basic fixture, this period sets no incarceration_type.
        incarceration_period = schema.StateIncarcerationPeriod(
            incarceration_period_id=1111,
            status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
            state_code=state_code,
            county_code="124",
            facility="San Quentin",
            facility_security_level=(
                StateIncarcerationFacilitySecurityLevel.MAXIMUM),
            admission_reason=(
                StateIncarcerationPeriodAdmissionReason.NEW_ADMISSION),
            projected_release_reason=(
                StateIncarcerationPeriodReleaseReason.CONDITIONAL_RELEASE),
            admission_date=date(2008, 11, 20),
            release_date=date(2010, 12, 4),
            release_reason=(
                StateIncarcerationPeriodReleaseReason.SENTENCE_SERVED),
            person_id=fake_person_id,
        )

        incarceration_sentence = schema.StateIncarcerationSentence(
            incarceration_sentence_id=1111,
            state_code=state_code,
            sentence_group_id=sentence_group.sentence_group_id,
            incarceration_periods=[incarceration_period],
            person_id=fake_person_id,
            status=StateSentenceStatus.PRESENT_WITHOUT_INFO,
        )

        supervision_sentence = schema.StateSupervisionSentence(
            supervision_sentence_id=123,
            state_code=state_code,
            person_id=fake_person_id,
            status=StateSentenceStatus.PRESENT_WITHOUT_INFO,
        )

        # Only the incarceration sentence is attached to the sentence group.
        sentence_group.incarceration_sentences = [incarceration_sentence]

        sentence_group_data = [normalized_database_base_dict(sentence_group)]
        incarceration_sentence_data = [
            normalized_database_base_dict(incarceration_sentence)
        ]
        supervision_sentence_data = [
            normalized_database_base_dict(supervision_sentence)
        ]
        incarceration_periods_data = [
            normalized_database_base_dict(incarceration_period)
        ]

        state_incarceration_sentence_incarceration_period_association = [{
            "incarceration_period_id":
            incarceration_period.incarceration_period_id,
            "incarceration_sentence_id":
            incarceration_sentence.incarceration_sentence_id,
        }]

        fake_person_id_to_county_query_result = [{
            "state_code": state_code,
            "person_id": fake_person_id,
            "county_of_residence": _COUNTY_OF_RESIDENCE,
        }]

        us_mo_sentence_status_data: List[Dict[str, Any]] = [{
            "state_code": "US_MO",
            "person_id": fake_person_id,
            "sentence_external_id": "XXX",
            "sentence_status_external_id": "YYY",
            "status_code": "ZZZ",
            "status_date": "not_a_date",
            "status_description": "XYZ",
        }]

        incarceration_period_judicial_district_association_data = [{
            "state_code": state_code,
            "person_id": fake_person_id,
            "incarceration_period_id": 123,
            "judicial_district_code": "NW",
        }]

        state_race_ethnicity_population_count_data = [{
            "state_code": state_code,
            "race_or_ethnicity": "BLACK",
            "population_count": 1,
            "representation_priority": 1,
        }]

        period_association_table = (
            schema.
            state_incarceration_sentence_incarceration_period_association_table
        )

        data_dict = self._default_data_dict()
        data_dict.update({
            schema.StatePerson.__tablename__:
            persons_data,
            schema.StateSentenceGroup.__tablename__:
            sentence_group_data,
            schema.StateIncarcerationSentence.__tablename__:
            incarceration_sentence_data,
            schema.StateSupervisionSentence.__tablename__:
            supervision_sentence_data,
            schema.StateIncarcerationPeriod.__tablename__:
            incarceration_periods_data,
            period_association_table.name:
            state_incarceration_sentence_incarceration_period_association,
            "persons_to_recent_county_of_residence":
            fake_person_id_to_county_query_result,
            "incarceration_period_judicial_district_association":
            incarceration_period_judicial_district_association_data,
            "state_race_ethnicity_population_counts":
            state_race_ethnicity_population_count_data,
            "us_mo_sentence_statuses":
            us_mo_sentence_status_data,
        })

        return data_dict

    def testIncarcerationPipelineNoIncarceration(self):
        """Tests the incarceration pipeline when a person doesn't have any
        incarceration periods.

        Uses the data_dict from
        build_incarceration_pipeline_data_dict_no_incarceration and passes an
        empty set of expected metric types.
        """
        pipeline_input = (
            self.build_incarceration_pipeline_data_dict_no_incarceration(12345))

        self.run_test_pipeline(
            state_code=_STATE_CODE,
            dataset="recidiviz-123.state",
            data_dict=pipeline_input,
            expected_metric_types=set(),
        )