def run(
    apache_beam_pipeline_options: PipelineOptions,
    data_input: str,
    reference_view_input: str,
    static_reference_input: str,
    output: str,
    calculation_month_count: int,
    metric_types: List[str],
    state_code: str,
    calculation_end_month: Optional[str],
    person_filter_ids: Optional[List[int]],
) -> None:
    """Builds and runs the supervision calculation pipeline.

    The pipeline loads StatePersons and their related entities, groups
    everything by person, classifies the result into supervision time buckets,
    attaches person-level metadata, and produces supervision metrics. Unless a
    person filter is active, each metric type is then appended to its own
    table in the `output` dataset.

    Args:
        apache_beam_pipeline_options: Options used to construct the Beam
            pipeline. Its "project" option must not be None.
        data_input: Name of the dataset holding the entity data; it is
            prefixed with the project id.
        reference_view_input: Name of the dataset holding the reference views;
            it is prefixed with the project id.
        static_reference_input: Name of the dataset holding the static
            reference tables; it is prefixed with the project id.
        output: Dataset that the metric tables are written to.
        calculation_month_count: Forwarded unchanged to GetSupervisionMetrics.
        metric_types: Metric types to calculate; de-duplicated into a set
            before being forwarded to GetSupervisionMetrics.
        state_code: State to run the pipeline for. The value "US_MO"
            additionally loads the sentence status reference table.
        calculation_end_month: Forwarded unchanged to GetSupervisionMetrics.
        person_filter_ids: Optional person ids to restrict the run to. When
            non-empty, the function returns before any write step is added.

    Raises:
        ValueError: If the pipeline options carry no project, or if
            `state_code` is None.
    """
    # Instantiating any SQLAlchemy object up front forces the relationship
    # properties to load. BuildRootEntity reads attributes of those properties
    # on the schema classes and would otherwise reach them before loading.
    _ = schema.StatePerson()

    setup_options = apache_beam_pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    # The project id is needed to build the fully qualified dataset names.
    initial_options = apache_beam_pipeline_options.get_all_options()
    project_id = initial_options["project"]

    if project_id is None:
        raise ValueError(
            f"No project set in pipeline options: {initial_options}")
    if state_code is None:
        raise ValueError("No state_code set for pipeline")

    def qualify(dataset_name: str) -> str:
        """Prefixes a dataset name with the project id."""
        return project_id + "." + dataset_name

    input_dataset = qualify(data_input)
    reference_dataset = qualify(reference_view_input)
    static_reference_dataset = qualify(static_reference_input)

    person_id_filter_set = (
        set(person_filter_ids) if person_filter_ids else None)

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:

        def load_root(step_label: str, entity_class: type, with_related: bool):
            """Applies one BuildRootEntity load unified on the StatePerson id field."""
            return p | step_label >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entity_class,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=with_related,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code,
            )

        def load_reference_kv(step_label: str, view_name: str):
            """Imports one reference view as KV tuples keyed on "person_id"."""
            return p | step_label >> ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=view_name,
                table_key="person_id",
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set,
            )

        # ---- Root entities -------------------------------------------------
        persons = load_root("Load Persons", entities.StatePerson, True)
        incarceration_periods = load_root(
            "Load IncarcerationPeriods",
            entities.StateIncarcerationPeriod,
            True,
        )
        supervision_violations = load_root(
            "Load SupervisionViolations",
            entities.StateSupervisionViolation,
            True,
        )
        # TODO(#2769): Don't bring this in as a root entity
        supervision_violation_responses = load_root(
            "Load SupervisionViolationResponses",
            entities.StateSupervisionViolationResponse,
            True,
        )
        supervision_sentences = load_root(
            "Load SupervisionSentences",
            entities.StateSupervisionSentence,
            True,
        )
        incarceration_sentences = load_root(
            "Load IncarcerationSentences",
            entities.StateIncarcerationSentence,
            True,
        )
        supervision_periods = load_root(
            "Load SupervisionPeriods",
            entities.StateSupervisionPeriod,
            True,
        )
        # Assessments and contacts are loaded without their related entities.
        assessments = load_root(
            "Load Assessments", entities.StateAssessment, False)
        supervision_contacts = load_root(
            "Load StateSupervisionContacts",
            entities.StateSupervisionContact,
            False,
        )

        # ---- Reference tables ----------------------------------------------
        supervision_period_to_agent_associations_as_kv = load_reference_kv(
            "Load supervision_period_to_agent_associations_as_kv",
            SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
        )
        # Judicial districts associated with the supervision periods.
        sp_to_judicial_district_kv = load_reference_kv(
            "Load sp_to_judicial_district_kv",
            SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
        )
        state_race_ethnicity_population_counts = (
            p
            | "Load state_race_ethnicity_population_counts" >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id="state_race_ethnicity_population_counts",
                state_code_filter=state_code,
                person_id_filter_set=None,
            ))

        # ---- MO sentence statuses ------------------------------------------
        # Only US_MO reads the sentence status ranking table; every other
        # state feeds an empty collection into the same downstream steps.
        if state_code == "US_MO":
            mo_status_label = "Read MO sentence status table from BigQuery"
            mo_status_source = ReadFromBigQuery(
                query=select_all_by_person_query(
                    reference_dataset,
                    US_MO_SENTENCE_STATUSES_VIEW_NAME,
                    state_code,
                    person_id_filter_set,
                ))
        else:
            mo_status_label = (
                f"Generate empty MO statuses list for non-MO state run: {state_code} "
            )
            mo_status_source = beam.Create([])
        us_mo_sentence_statuses = p | mo_status_label >> mo_status_source

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | "Convert MO sentence status ranking table to KV tuples" >>
            beam.ParDo(ConvertDictToKVTuple(), "person_id"))

        # ---- Sentences -----------------------------------------------------
        sentence_inputs = {
            "incarceration_sentences": incarceration_sentences,
            "supervision_sentences": supervision_sentences,
            "sentence_statuses": us_mo_sentence_status_rankings_as_kv,
        }
        sentences_and_statuses = (
            sentence_inputs
            | "Group sentences to the sentence statuses for that person" >>
            beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | "Convert to state-specific sentences" >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    "incarceration_sentences", "supervision_sentences"))

        # ---- Violations and violation responses ----------------------------
        violation_inputs = {
            "violations": supervision_violations,
            "violation_responses": supervision_violation_responses,
        }
        supervision_violations_and_responses = (
            violation_inputs
            | "Group StateSupervisionViolationResponses to StateSupervisionViolations"
            >> beam.CoGroupByKey())

        # Put the fully hydrated violations onto their violation responses.
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | "Set hydrated StateSupervisionViolations on the StateSupervisionViolationResponses"
            >> beam.ParDo(SetViolationOnViolationsResponse()))

        # ---- Incarceration periods -----------------------------------------
        incarceration_inputs = {
            "incarceration_periods": incarceration_periods,
            "violation_responses": violation_responses_with_hydrated_violations,
        }
        incarceration_periods_and_violation_responses = (
            incarceration_inputs
            | "Group StateIncarcerationPeriods to StateSupervisionViolationResponses"
            >> beam.CoGroupByKey())

        # Put the fully hydrated violation responses onto their incarceration
        # periods.
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | "Set hydrated StateSupervisionViolationResponses on the StateIncarcerationPeriods"
            >> beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))

        # ---- Everything grouped per person ---------------------------------
        per_person_inputs = {
            "person": persons,
            "assessments": assessments,
            "incarceration_periods": incarceration_periods_with_source_violations,
            "supervision_periods": supervision_periods,
            "supervision_sentences": sentences_converted.supervision_sentences,
            "incarceration_sentences": sentences_converted.incarceration_sentences,
            "violation_responses": violation_responses_with_hydrated_violations,
            "supervision_contacts": supervision_contacts,
            "supervision_period_judicial_district_association": sp_to_judicial_district_kv,
            "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv,
        }
        person_entities = (
            per_person_inputs
            | "Group StatePerson to all entities" >> beam.CoGroupByKey())

        # Classify each person's grouped entities into SupervisionTimeBuckets.
        person_time_buckets = (
            person_entities
            | "Get SupervisionTimeBuckets" >> beam.ParDo(
                ClassifySupervisionTimeBuckets()))

        # The population counts are handed to the DoFn as a list side input.
        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        events_and_metadata = {
            "person_events": person_time_buckets,
            "person_metadata": person_metadata,
        }
        grouped_events_and_metadata = (
            events_and_metadata
            | "Group SupervisionTimeBuckets with person-level metadata" >>
            beam.CoGroupByKey())
        person_time_buckets_with_metadata = (
            grouped_events_and_metadata
            | "Organize StatePerson, PersonMetadata and SupervisionTimeBuckets for calculations"
            >> beam.ParDo(ExtractPersonEventsMetadata()))

        # ---- Metric calculation --------------------------------------------
        # Options are read again here so the job details handed to the metric
        # step reflect the options as they are at this point.
        job_options = apache_beam_pipeline_options.get_all_options()

        metric_types_set = set(metric_types)

        # Timestamp added for local jobs.
        job_options["job_timestamp"] = datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S.%f")

        supervision_metrics = (
            person_time_buckets_with_metadata
            | "Get Supervision Metrics" >> GetSupervisionMetrics(
                pipeline_options=job_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count,
            ))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics."
            )
            return

        # ---- Output --------------------------------------------------------
        # Convert the metrics into dicts, split into one tagged output per
        # metric type.
        writable_metrics = (
            supervision_metrics
            | "Convert to dict to be written to BQ" >> beam.ParDo(
                RecidivizMetricWritableDict()).with_outputs(
                    SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                    SupervisionMetricType.SUPERVISION_POPULATION.value,
                    SupervisionMetricType.SUPERVISION_REVOCATION.value,
                    SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                    SupervisionMetricType.SUPERVISION_START.value,
                    SupervisionMetricType.SUPERVISION_SUCCESS.value,
                    SupervisionMetricType.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                    SupervisionMetricType.SUPERVISION_TERMINATION.value,
                    SupervisionMetricType.SUPERVISION_OUT_OF_STATE_POPULATION.value,
                    SupervisionMetricType.SUPERVISION_DOWNGRADE.value,
                ))

        terminations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionTerminationMetric]
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionCaseComplianceMetric]
        populations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionPopulationMetric]
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionRevocationMetric]
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionRevocationAnalysisMetric]
        successes_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionSuccessMetric]
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES[
            SuccessfulSupervisionSentenceDaysServedMetric]
        supervision_starts_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionStartMetric]
        out_of_state_populations_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionOutOfStatePopulationMetric]
        supervision_downgrade_table_id = DATAFLOW_METRICS_TO_TABLES[
            SupervisionDowngradeMetric]

        # One row per write step: (tagged output attribute, step label,
        # destination table). Rows are applied top to bottom.
        write_steps = (
            (
                "SUPERVISION_POPULATION",
                f"Write population metrics to BQ table: {populations_table_id}",
                populations_table_id,
            ),
            (
                "SUPERVISION_OUT_OF_STATE_POPULATION",
                f"Write out of state population metrics to BQ table: "
                f"{out_of_state_populations_table_id}",
                out_of_state_populations_table_id,
            ),
            (
                "SUPERVISION_REVOCATION",
                f"Write revocation metrics to BQ table: {revocations_table_id}",
                revocations_table_id,
            ),
            (
                "SUPERVISION_SUCCESS",
                f"Write success metrics to BQ table: {successes_table_id}",
                successes_table_id,
            ),
            (
                "SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED",
                f"Write supervision successful sentence length metrics to BQ"
                f" table: {successful_sentence_lengths_table_id}",
                successful_sentence_lengths_table_id,
            ),
            (
                "SUPERVISION_TERMINATION",
                f"Write termination metrics to BQ table: {terminations_table_id}",
                terminations_table_id,
            ),
            (
                "SUPERVISION_REVOCATION_ANALYSIS",
                f"Write revocation analyses metrics to BQ table: "
                f"{revocation_analysis_table_id}",
                revocation_analysis_table_id,
            ),
            (
                "SUPERVISION_COMPLIANCE",
                f"Write compliance metrics to BQ table: {compliance_table_id}",
                compliance_table_id,
            ),
            (
                "SUPERVISION_START",
                f"Write start metrics to BQ table: {supervision_starts_table_id}",
                supervision_starts_table_id,
            ),
            (
                "SUPERVISION_DOWNGRADE",
                f"Write downgrade metrics to BQ table: {supervision_downgrade_table_id}",
                supervision_downgrade_table_id,
            ),
        )

        for output_attr, step_label, table_id in write_steps:
            # The tagged output is looked up right before its write step is
            # applied, one metric type at a time.
            tagged_output = getattr(writable_metrics, output_attr)
            _ = tagged_output | step_label >> WriteAppendToBigQuery(
                output_table=table_id,
                output_dataset=output,
            )
def execute_pipeline(
    self,
    pipeline: beam.Pipeline,
    all_pipeline_options: Dict[str, Any],
    state_code: str,
    input_dataset: str,
    reference_dataset: str,
    static_reference_dataset: str,
    metric_types: List[str],
    person_id_filter_set: Optional[Set[int]],
    calculation_month_count: int = -1,
    calculation_end_month: Optional[str] = None,
) -> beam.Pipeline:
    """Adds the program metric calculation steps to `pipeline`.

    Loads StatePersons, program assignments, assessments and supervision
    periods, groups them per person, classifies them into events, attaches
    person-level metadata, and applies the metric calculation step.

    Args:
        pipeline: Beam pipeline that the steps are applied to.
        all_pipeline_options: Pipeline options dict. It is mutated: a
            "job_timestamp" entry is added before it is passed to GetMetrics.
        state_code: State to load and calculate for.
        input_dataset: Dataset the root entities are loaded from.
        reference_dataset: Dataset holding the agent association view.
        static_reference_dataset: Dataset holding the
            "state_race_ethnicity_population_counts" table.
        metric_types: Metric types to include; de-duplicated into a set.
        person_id_filter_set: Optional set of person ids to restrict the
            loads to.
        calculation_month_count: Forwarded unchanged to GetMetrics.
        calculation_end_month: Forwarded unchanged to GetMetrics.

    Returns:
        The result of applying the "Get Program Metrics" step.
        NOTE(review): the annotation says beam.Pipeline, but the value
        returned is the output of that step - confirm the intended type.
    """

    def load_root(step_label: str, entity_class: type, with_related: bool):
        """Applies one BuildRootEntity load unified on the StatePerson id field."""
        return pipeline | step_label >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entity_class,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=with_related,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code,
        )

    # ---- Root entities -----------------------------------------------------
    persons = load_root("Load Persons", entities.StatePerson, True)
    program_assignments = load_root(
        "Load Program Assignments", entities.StateProgramAssignment, True)
    # Assessments and supervision periods are loaded without their related
    # entities.
    assessments = load_root(
        "Load Assessments", entities.StateAssessment, False)
    supervision_periods = load_root(
        "Load SupervisionPeriods", entities.StateSupervisionPeriod, False)

    # ---- Reference tables --------------------------------------------------
    supervision_period_to_agent_associations_as_kv = (
        pipeline
        | "Load supervision_period_to_agent_associations_as_kv" >>
        ImportTableAsKVTuples(
            dataset_id=reference_dataset,
            table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
            table_key="person_id",
            state_code_filter=state_code,
            person_id_filter_set=person_id_filter_set,
        ))

    state_race_ethnicity_population_counts = (
        pipeline
        | "Load state_race_ethnicity_population_counts" >> ImportTable(
            dataset_id=static_reference_dataset,
            table_id="state_race_ethnicity_population_counts",
            state_code_filter=state_code,
            person_id_filter_set=None,
        ))

    # ---- Everything grouped per person -------------------------------------
    per_person_inputs = {
        "person": persons,
        "program_assignments": program_assignments,
        "assessments": assessments,
        "supervision_periods": supervision_periods,
        "supervision_period_to_agent_association": supervision_period_to_agent_associations_as_kv,
    }
    persons_entities = (
        per_person_inputs
        | "Group StatePerson to StateProgramAssignments and" >>
        beam.CoGroupByKey())

    # Classify the grouped entities into events. This step carries no explicit
    # label, so Beam names it itself.
    classify_events = beam.ParDo(
        ClassifyEvents(), identifier=self.pipeline_config.identifier)
    person_program_events = persons_entities | classify_events

    # The population counts are handed to the DoFn as a list side input.
    person_metadata = (
        persons
        | "Build the person_metadata dictionary" >> beam.ParDo(
            BuildPersonMetadata(),
            state_race_ethnicity_population_counts=AsList(
                state_race_ethnicity_population_counts),
        ))

    events_and_metadata = {
        "person_events": person_program_events,
        "person_metadata": person_metadata,
    }
    grouped_events_and_metadata = (
        events_and_metadata
        | "Group ProgramEvents with person-level metadata" >>
        beam.CoGroupByKey())
    person_program_events_with_metadata = (
        grouped_events_and_metadata
        | "Organize StatePerson, PersonMetadata and ProgramEvents for calculations"
        >> beam.ParDo(ExtractPersonEventsMetadata()))

    # ---- Metric calculation ------------------------------------------------
    # Timestamp added for local jobs.
    all_pipeline_options["job_timestamp"] = datetime.datetime.now().strftime(
        "%Y-%m-%d_%H_%M_%S.%f")

    metric_types_set = set(metric_types)

    program_metrics = (
        person_program_events_with_metadata
        | "Get Program Metrics" >> GetMetrics(
            pipeline_options=all_pipeline_options,
            pipeline_config=self.pipeline_config,
            metric_types_to_include=metric_types_set,
            calculation_end_month=calculation_end_month,
            calculation_month_count=calculation_month_count,
        ))

    return program_metrics