def test_incarceration_pipeline_specify_person_id_filters(self):
    # Arrange
    argv = ['--job_name', 'incarceration-args-test',
            '--project', 'recidiviz-staging',
            '--person_filter_ids', '685253', '12345', '99999',
            '--setup_file', './setup.py']

    # Act
    incarceration_pipeline_args, apache_beam_args = \
        incarceration_pipeline.parse_arguments(argv)
    pipeline_options = get_apache_beam_pipeline_options_from_args(
        apache_beam_args)

    # Assert
    expected_incarceration_pipeline_args = Namespace(
        **self.DEFAULT_INCARCERATION_PIPELINE_ARGS.__dict__)
    expected_incarceration_pipeline_args.person_filter_ids = [
        685253, 12345, 99999]

    self.assertEqual(incarceration_pipeline_args,
                     expected_incarceration_pipeline_args)
    self.assertEqual(pipeline_options.get_all_options(drop_default=True),
                     self.DEFAULT_APACHE_BEAM_OPTIONS_DICT)

def test_incarceration_pipeline_args_missing_arg(self):
    # Arrange
    argv = [
        "--job_name",
        "incarceration-args-test",
        "--runner",
        "DirectRunner",
        # project arg omitted here
        "--setup_file",
        "./setup2.py",
        "--bucket",
        "recidiviz-123-my-bucket",
        "--region=us-central1",
        "--data_input",
        "county",
        "--reference_view_input",
        "reference_views_2",
        "--output",
        "dataflow_metrics_2",
        "--calculation_month_count=6",
        "--calculation_end_month=2009-07",
        "--save_as_template",
    ]

    # Act
    (
        _incarceration_pipeline_args,
        apache_beam_args,
    ) = incarceration_pipeline.get_arg_parser().parse_known_args(argv)

    with self.assertRaises(SystemExit) as e:
        _ = get_apache_beam_pipeline_options_from_args(apache_beam_args)
    self.assertEqual(2, e.exception.code)

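# The exit-code assertion above leans on standard argparse behavior: when a
# required argument (here, --project) is missing, ArgumentParser.error() prints
# a usage message and calls sys.exit(2). A minimal, self-contained illustration
# of that contract:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--project", required=True)

try:
    parser.parse_args([])  # --project omitted, as in the test above
except SystemExit as e:
    assert e.code == 2  # argparse signals usage errors with exit status 2
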
def test_minimal_incarceration_pipeline_args_save_to_template(self):
    # Arrange
    argv = [
        "--job_name",
        "incarceration-args-test",
        "--project",
        "recidiviz-staging",
        "--save_as_template",
    ]

    # Act
    (
        incarceration_pipeline_args,
        apache_beam_args,
    ) = incarceration_pipeline.get_arg_parser().parse_known_args(argv)
    pipeline_options = get_apache_beam_pipeline_options_from_args(
        apache_beam_args)

    # Assert
    self.assertEqual(incarceration_pipeline_args,
                     self.DEFAULT_INCARCERATION_PIPELINE_ARGS)

    expected_apache_beam_options_dict = self.DEFAULT_APACHE_BEAM_OPTIONS_DICT.copy()
    expected_apache_beam_options_dict[
        "template_location"
    ] = "gs://recidiviz-staging-dataflow-templates/templates/incarceration-args-test"

    self.assertEqual(
        pipeline_options.get_all_options(drop_default=True),
        expected_apache_beam_options_dict,
    )

def test_minimal_incarceration_pipeline_args(self):
    # Arrange
    argv = [
        "--job_name",
        "incarceration-args-test",
        "--project",
        "recidiviz-staging",
        "--extra_package",
        "dist/recidiviz-calculation-pipelines.tar.gz",
    ]

    # Act
    (
        incarceration_pipeline_args,
        apache_beam_args,
    ) = self.TEST_PIPELINE.get_arg_parser().parse_known_args(argv)
    pipeline_options = get_apache_beam_pipeline_options_from_args(
        apache_beam_args)

    # Assert
    self.assertEqual(incarceration_pipeline_args,
                     self.DEFAULT_INCARCERATION_PIPELINE_ARGS)
    self.assertEqual(
        pipeline_options.get_all_options(drop_default=True),
        self.DEFAULT_APACHE_BEAM_OPTIONS_DICT,
    )

def run_pipeline(pipeline_module, argv):
    """Runs the given pipeline_module with the arguments contained in argv."""
    known_args, remaining_args = pipeline_module.get_arg_parser().parse_known_args(argv)
    apache_beam_pipeline_options = get_apache_beam_pipeline_options_from_args(
        remaining_args)

    pipeline_module.run(apache_beam_pipeline_options, **vars(known_args))

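# A sketch of how run_pipeline might be wired up in an entry point. The import
# path below is an illustrative assumption; any module exposing get_arg_parser()
# and run() fits the interface run_pipeline expects.
import sys

from recidiviz.calculator.pipeline.incarceration import pipeline as incarceration_pipeline

if __name__ == '__main__':
    # Everything after the program name is split by run_pipeline into
    # pipeline-specific args (known_args) and Apache Beam args (remaining_args).
    run_pipeline(incarceration_pipeline, sys.argv[1:])
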
def test_incarceration_pipeline_args_defaults_changed(self):
    # Arrange
    argv = ['--job_name', 'incarceration-args-test',
            '--runner', 'DirectRunner',
            '--project', 'recidiviz-staging',
            '--setup_file', './setup2.py',
            '--bucket', 'recidiviz-123-my-bucket',
            '--region=us-central1',
            '--data_input', 'county',
            '--reference_view_input', 'reference_views_2',
            '--static_reference_input', 'static_reference_2',
            '--output', 'dataflow_metrics_2',
            '--calculation_month_count=6',
            '--calculation_end_month=2009-07',
            '--save_as_template']

    # Act
    incarceration_pipeline_args, apache_beam_args = \
        incarceration_pipeline.get_arg_parser().parse_known_args(argv)
    pipeline_options = get_apache_beam_pipeline_options_from_args(
        apache_beam_args)

    # Assert
    expected_incarceration_pipeline_args = \
        Namespace(calculation_month_count=6,
                  calculation_end_month='2009-07',
                  data_input='county',
                  output='dataflow_metrics_2',
                  metric_types={'ALL'},
                  person_filter_ids=None,
                  reference_view_input='reference_views_2',
                  static_reference_input='static_reference_2',
                  state_code=None)

    self.assertEqual(incarceration_pipeline_args,
                     expected_incarceration_pipeline_args)

    expected_apache_beam_options_dict = {
        'runner': 'DirectRunner',
        'project': 'recidiviz-staging',
        'job_name': 'incarceration-args-test',
        # Locations based on the overridden bucket, not the project!
        'staging_location': 'gs://recidiviz-123-my-bucket/staging/',
        'temp_location': 'gs://recidiviz-123-my-bucket/temp/',
        'template_location': 'gs://recidiviz-123-my-bucket/templates/incarceration-args-test',
        'region': 'us-central1',
        'machine_type': 'n1-standard-4',
        'network': 'default',
        'subnetwork': 'https://www.googleapis.com/compute/v1/projects/recidiviz-staging/regions/us-central1/subnetworks/default',
        'use_public_ips': False,
        'experiments': ['shuffle_mode=service', 'use_beam_bq_sink'],
        'setup_file': './setup2.py',
        'disk_size_gb': 50,
    }

    self.assertEqual(expected_apache_beam_options_dict,
                     pipeline_options.get_all_options(drop_default=True))

def test_incarceration_pipeline_args_defaults_changed(self):
    # Arrange
    argv = ['--job_name', 'incarceration-args-test',
            '--runner', 'DirectRunner',
            '--project', 'recidiviz-staging',
            '--setup_file', './setup2.py',
            '--bucket', 'recidiviz-123-my-bucket',
            '--region=us-central1',
            '--input', 'county',
            '--reference_input', 'dashboard_views_2',
            '--output', 'dataflow_metrics_2',
            '--methodology=EVENT',
            '--calculation_month_limit=6',
            '--include_race=False',
            '--include_age=False',
            '--include_ethnicity=False',
            '--include_gender=False',
            '--save_as_template']

    # Act
    incarceration_pipeline_args, apache_beam_args = \
        incarceration_pipeline.parse_arguments(argv)
    pipeline_options = get_apache_beam_pipeline_options_from_args(
        apache_beam_args)

    # Assert
    expected_incarceration_pipeline_args = \
        Namespace(calculation_month_limit=6,
                  include_age=False,
                  include_ethnicity=False,
                  include_gender=False,
                  include_race=False,
                  input='county',
                  methodology='EVENT',
                  output='dataflow_metrics_2',
                  person_filter_ids=None,
                  reference_input='dashboard_views_2',
                  state_code=None)

    self.assertEqual(incarceration_pipeline_args,
                     expected_incarceration_pipeline_args)

    expected_apache_beam_options_dict = {
        'runner': 'DirectRunner',
        'project': 'recidiviz-staging',
        'job_name': 'incarceration-args-test',
        # Locations based on the overridden bucket, not the project!
        'staging_location': 'gs://recidiviz-123-my-bucket/staging/',
        'temp_location': 'gs://recidiviz-123-my-bucket/temp/',
        'template_location': 'gs://recidiviz-123-my-bucket/templates/incarceration-args-test',
        'machine_type': 'n1-standard-4',
        'network': 'default',
        'subnetwork': 'https://www.googleapis.com/compute/v1/projects/recidiviz-staging/regions/us-central1/subnetworks/default',
        'use_public_ips': False,
        'experiments': ['shuffle_mode=service'],
        'setup_file': './setup2.py'
    }

    self.assertEqual(pipeline_options.get_all_options(drop_default=True),
                     expected_apache_beam_options_dict)

def test_minimal_incarceration_pipeline_args(self):
    # Arrange
    argv = ['--job_name', 'incarceration-args-test',
            '--project', 'recidiviz-staging']

    # Act
    incarceration_pipeline_args, apache_beam_args = \
        incarceration_pipeline.parse_arguments(argv)
    pipeline_options = get_apache_beam_pipeline_options_from_args(
        apache_beam_args)

    # Assert
    self.assertEqual(incarceration_pipeline_args,
                     self.DEFAULT_INCARCERATION_PIPELINE_ARGS)
    self.assertEqual(pipeline_options.get_all_options(drop_default=True),
                     self.DEFAULT_APACHE_BEAM_OPTIONS_DICT)

def test_minimal_incarceration_pipeline_args_save_to_template(self):
    # Arrange
    argv = ['--job_name', 'incarceration-args-test',
            '--project', 'recidiviz-staging',
            '--save_as_template']

    # Act
    incarceration_pipeline_args, apache_beam_args = \
        incarceration_pipeline.parse_arguments(argv)
    pipeline_options = get_apache_beam_pipeline_options_from_args(
        apache_beam_args)

    # Assert
    self.assertEqual(incarceration_pipeline_args,
                     self.DEFAULT_INCARCERATION_PIPELINE_ARGS)

    expected_apache_beam_options_dict = self.DEFAULT_APACHE_BEAM_OPTIONS_DICT.copy()
    expected_apache_beam_options_dict['template_location'] = \
        'gs://recidiviz-staging-dataflow-templates/templates/incarceration-args-test'

    self.assertEqual(pipeline_options.get_all_options(drop_default=True),
                     expected_apache_beam_options_dict)

def test_incarceration_pipeline_specify_person_id_filters(self):
    # Arrange
    argv = [
        "--job_name",
        "incarceration-args-test",
        "--project",
        "recidiviz-staging",
        "--person_filter_ids",
        "685253",
        "12345",
        "99999",
        "--setup_file",
        "./setup.py",
    ]

    # Act
    (
        incarceration_pipeline_args,
        apache_beam_args,
    ) = incarceration_pipeline.get_arg_parser().parse_known_args(argv)
    pipeline_options = get_apache_beam_pipeline_options_from_args(
        apache_beam_args)

    # Assert
    expected_incarceration_pipeline_args = Namespace(
        **self.DEFAULT_INCARCERATION_PIPELINE_ARGS.__dict__)
    expected_incarceration_pipeline_args.person_filter_ids = [
        685253, 12345, 99999]

    self.assertEqual(incarceration_pipeline_args,
                     expected_incarceration_pipeline_args)
    self.assertEqual(
        pipeline_options.get_all_options(drop_default=True),
        self.DEFAULT_APACHE_BEAM_OPTIONS_DICT,
    )

def run(argv):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access attributes
    # of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been
    # instantiated, then the relationship properties are loaded and their
    # attributes can be successfully accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, remaining_args = parse_arguments(argv)

    pipeline_options = get_apache_beam_pipeline_options_from_args(remaining_args)

    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = all_pipeline_options['project'] + '.' + \
        known_args.reference_input

    person_id_filter_set = set(known_args.person_filter_ids) if known_args.person_filter_ids else None
    state_code = known_args.state_code

    with beam.Pipeline(options=pipeline_options) as p:
        # Get StatePersons
        persons = (p
                   | 'Load Persons' >>
                   BuildRootEntity(dataset=query_dataset,
                                   root_entity_class=entities.StatePerson,
                                   unifying_id_field=entities.StatePerson.get_class_id_name(),
                                   build_related_entities=True,
                                   unifying_id_field_filter_set=person_id_filter_set))

        # Get StateIncarcerationPeriods
        incarceration_periods = (p
                                 | 'Load IncarcerationPeriods' >>
                                 BuildRootEntity(dataset=query_dataset,
                                                 root_entity_class=entities.StateIncarcerationPeriod,
                                                 unifying_id_field=entities.StatePerson.get_class_id_name(),
                                                 build_related_entities=True,
                                                 unifying_id_field_filter_set=person_id_filter_set,
                                                 state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = \
            (p
             | 'Load SupervisionViolations' >>
             BuildRootEntity(dataset=query_dataset,
                             root_entity_class=entities.StateSupervisionViolation,
                             unifying_id_field=entities.StatePerson.get_class_id_name(),
                             build_related_entities=True,
                             unifying_id_field_filter_set=person_id_filter_set,
                             state_code=state_code))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = \
            (p
             | 'Load SupervisionViolationResponses' >>
             BuildRootEntity(dataset=query_dataset,
                             root_entity_class=entities.StateSupervisionViolationResponse,
                             unifying_id_field=entities.StatePerson.get_class_id_name(),
                             build_related_entities=True,
                             unifying_id_field_filter_set=person_id_filter_set,
                             state_code=state_code))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {'violations': supervision_violations,
             'violation_responses': supervision_violation_responses}
            | 'Group StateSupervisionViolationResponses to '
              'StateSupervisionViolations' >>
            beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >>
            beam.ParDo(SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
        # by person_id
        incarceration_periods_and_violation_responses = (
            {'incarceration_periods': incarceration_periods,
             'violation_responses': violation_responses_with_hydrated_violations}
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >>
            beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolationResponse entities on
        # the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
              'the StateIncarcerationPeriods' >>
            beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {'person': persons,
             'incarceration_periods': incarceration_periods_with_source_violations}
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey()
        )

        # Bring in the table that associates people and their county of
        # residence
        person_id_to_county_query = \
            f"SELECT * FROM " \
            f"`{reference_dataset}.persons_to_recent_county_of_residence`"

        person_id_to_county_kv = (
            p
            | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(beam.io.BigQuerySource(
                query=person_id_to_county_query,
                use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id')
        )

        # Identify ReleaseEvents from the StatePerson's
        # StateIncarcerationPeriods
        person_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >>
            beam.ParDo(ClassifyReleaseEvents(), AsDict(person_id_to_county_kv))
        )

        # Get dimensions to include and methodologies to use
        inclusions, methodologies = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get recidivism metrics
        recidivism_metrics = (person_events
                              | 'Get Recidivism Metrics' >>
                              GetRecidivismMetrics(
                                  pipeline_options=all_pipeline_options,
                                  inclusions=inclusions))

        filter_metrics_kwargs = {'methodologies': methodologies}

        # Filter out unneeded metrics
        final_recidivism_metrics = (
            recidivism_metrics
            | 'Filter out unwanted metrics' >>
            beam.ParDo(FilterMetrics(), **filter_metrics_kwargs))

        if person_id_filter_set:
            logging.warning("Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (final_recidivism_metrics
                            | 'Convert to dict to be written to BQ' >>
                            beam.ParDo(RecidivismMetricWritableDict()).with_outputs(
                                'rates', 'counts', 'liberties'))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table = known_args.output + '.recidivism_rate_metrics'
        counts_table = known_args.output + '.recidivism_count_metrics'
        liberty_table = known_args.output + '.recidivism_liberty_metrics'

        _ = (writable_metrics.rates
             | f"Write rate metrics to BQ table: {rates_table}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
             ))

        _ = (writable_metrics.counts
             | f"Write count metrics to BQ table: {counts_table}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
             ))

        _ = (writable_metrics.liberties
             | f"Write liberty metrics to BQ table: {liberty_table}" >>
             beam.io.WriteToBigQuery(
                 table=liberty_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
             ))

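# A hedged sketch of invoking this run() locally. The flag values are
# illustrative assumptions; --runner is not consumed by parse_arguments and is
# passed through to Beam via remaining_args.
run([
    '--job_name', 'recidivism-calculation-test',
    '--project', 'recidiviz-staging',
    '--input', 'state',                       # becomes query_dataset
    '--reference_input', 'reference_views',   # becomes reference_dataset
    '--output', 'dataflow_metrics',           # prefix for the three BQ tables
    '--runner', 'DirectRunner',               # Beam option, not a known arg
])
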
def run(argv):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, remaining_args = parse_arguments(argv)

    pipeline_options = get_apache_beam_pipeline_options_from_args(
        remaining_args)

    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = all_pipeline_options['project'] + '.' + \
        known_args.reference_input

    person_id_filter_set = set(
        known_args.person_filter_ids) if known_args.person_filter_ids else None
    state_code = known_args.state_code

    with beam.Pipeline(options=pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set))

        # Get StateProgramAssignments
        program_assignments = (p | 'Load Program Assignments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateProgramAssignment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (p | 'Load SupervisionPeriods' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        supervision_period_to_agent_association_query = \
            f"SELECT * FROM `{reference_dataset}.supervision_period_to_agent_association`"

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(beam.io.BigQuerySource(
                query=supervision_period_to_agent_association_query,
                use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        # Group each StatePerson with their other entities
        persons_entities = (
            {'person': persons,
             'program_assignments': program_assignments,
             'assessments': assessments,
             'supervision_periods': supervision_periods}
            | 'Group StatePerson to StateProgramAssignments and' >>
            beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get dimensions to include and methodologies to use
        inclusions, _ = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # The number of months to limit the monthly calculation output to
        calculation_month_limit = known_args.calculation_month_limit

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get program metrics
        program_metrics = (
            person_program_events
            | 'Get Program Metrics' >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=inclusions,
                calculation_month_limit=calculation_month_limit))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                ProgramMetricWritableDict()).with_outputs('referrals'))

        # Write the metrics to the output tables in BigQuery
        referrals_table = known_args.output + '.program_referral_metrics'

        _ = (writable_metrics.referrals
             | f"Write referral metrics to BQ table: {referrals_table}" >>
             beam.io.WriteToBigQuery(
                 table=referrals_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

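# These pipelines repeatedly use ConvertDictToKVTuple to re-key BigQuery rows
# before CoGroupByKey or AsDict. Its definition isn't included in this excerpt;
# a minimal sketch consistent with how it's called (a DoFn that emits each row
# keyed by the value of a named column) might look like the following. This is
# an assumption about the implementation, not the repo's actual code.
import apache_beam as beam

class ConvertDictToKVTuple(beam.DoFn):
    """Sketch: emits (row[key_field], row) for each dict row."""

    def process(self, element, key_field):
        # element is one BigQuery row as a dict; key_field names the column
        # whose value becomes the key, e.g. 'supervision_period_id'.
        key = element.get(key_field)
        if key is not None:
            yield key, element
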
def run(argv):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, remaining_args = parse_arguments(argv)

    pipeline_options = get_apache_beam_pipeline_options_from_args(remaining_args)

    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = all_pipeline_options['project'] + '.' + known_args.reference_input

    person_id_filter_set = set(known_args.person_filter_ids) if known_args.person_filter_ids else None
    state_code = known_args.state_code

    with beam.Pipeline(options=pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load StatePersons' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set))

        # Get StateSentenceGroups
        sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (p | 'Load StateIncarcerationSentences' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (p | 'Load StateSupervisionSentences' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = f"SELECT * FROM `{reference_dataset}.us_mo_sentence_statuses`"

            us_mo_sentence_statuses = (
                p | "Read MO sentence status table from BigQuery" >>
                beam.io.Read(beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                                    use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p | f"Generate empty MO statuses list for non-MO state run: {state_code} " >>
                beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'sentence_external_id')
        )

        # Group the sentence status tuples by sentence_external_id
        us_mo_sentence_statuses_by_sentence = (
            us_mo_sentence_status_rankings_as_kv
            | 'Group the MO sentence status ranking tuples by sentence_external_id' >>
            beam.GroupByKey()
        )

        supervision_sentences_converted = (
            supervision_sentences
            | 'Convert to state-specific supervision sentences' >>
            beam.ParDo(ConvertSentenceToStateSpecificType(),
                       AsDict(us_mo_sentence_statuses_by_sentence))
        )

        incarceration_sentences_converted = (
            incarceration_sentences
            | 'Convert to state-specific incarceration sentences' >>
            beam.ParDo(ConvertSentenceToStateSpecificType(),
                       AsDict(us_mo_sentence_statuses_by_sentence))
        )

        sentences_and_sentence_groups = (
            {'sentence_groups': sentence_groups,
             'incarceration_sentences': incarceration_sentences_converted,
             'supervision_sentences': supervision_sentences_converted}
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey()
        )

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >>
            beam.ParDo(SetSentencesOnSentenceGroup())
        )

        # Group each StatePerson with their related entities
        person_and_sentence_groups = (
            {'person': persons,
             'sentence_groups': sentence_groups_with_hydrated_sentences}
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey()
        )

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = \
            f"SELECT * FROM `{reference_dataset}.persons_to_recent_county_of_residence`"

        person_id_to_county_kv = (
            p
            | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(beam.io.BigQuerySource(
                query=person_id_to_county_query,
                use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id')
        )

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_events = (
            person_and_sentence_groups
            | 'Classify Incarceration Events' >>
            beam.ParDo(ClassifyIncarcerationEvents(),
                       AsDict(person_id_to_county_kv)))

        # Get dimensions to include and methodologies to use
        inclusions, _ = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # The number of months to limit the monthly calculation output to
        calculation_month_limit = known_args.calculation_month_limit

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_events
            | 'Get Incarceration Metrics' >>
            GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=inclusions,
                calculation_month_limit=calculation_month_limit))

        if person_id_filter_set:
            logging.warning("Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics
            | 'Convert to dict to be written to BQ' >>
            beam.ParDo(IncarcerationMetricWritableDict()).with_outputs(
                'admissions', 'populations', 'releases'))

        # Write the metrics to the output tables in BigQuery
        admissions_table = known_args.output + '.incarceration_admission_metrics'
        population_table = known_args.output + '.incarceration_population_metrics'
        releases_table = known_args.output + '.incarceration_release_metrics'

        _ = (writable_metrics.admissions
             | f"Write admission metrics to BQ table: {admissions_table}" >>
             beam.io.WriteToBigQuery(
                 table=admissions_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.populations
             | f"Write population metrics to BQ table: {population_table}" >>
             beam.io.WriteToBigQuery(
                 table=population_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.releases
             | f"Write release metrics to BQ table: {releases_table}" >>
             beam.io.WriteToBigQuery(
                 table=releases_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

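# The write stages above rely on Beam's tagged outputs: a single ParDo routes
# each metric dict to a named output, and each tag is written to its own
# BigQuery table. A self-contained toy example of the with_outputs mechanism
# (the DoFn here is a stand-in, not IncarcerationMetricWritableDict):
import apache_beam as beam

class RouteByMetricType(beam.DoFn):
    """Toy stand-in that tags each dict with its metric type."""

    def process(self, metric):
        yield beam.pvalue.TaggedOutput(metric['metric_type'], metric)

with beam.Pipeline() as p:
    tagged = (p
              | beam.Create([{'metric_type': 'admissions', 'count': 3},
                             {'metric_type': 'releases', 'count': 1}])
              | beam.ParDo(RouteByMetricType()).with_outputs(
                  'admissions', 'releases'))

    # Each tag is its own PCollection (tagged.admissions, tagged.releases),
    # which the real pipelines each write to a separate table.
    _ = tagged.admissions | 'Print admissions' >> beam.Map(print)

# Note on the dispositions used above: CREATE_NEVER means the destination
# tables must already exist, and WRITE_APPEND appends rows rather than
# truncating on each run.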