def _row_subset(self, item):
    subset = asarray(item[0])  # materialize
    partition_row_subsets = self._copartition(subset, self.partition_row_counts)
    new_partition_row_counts = self._partition_row_counts(partition_row_subsets)
    new_shape = (builtins.sum(new_partition_row_counts), self.shape[1])

    # Beam doesn't have a direct equivalent of Spark's zip function, so we
    # use a side input and join here. See
    # https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/snippets/snippets.py#L1295
    subset_pcollection = self.pipeline | gensym(
        "partition_row_subsets") >> beam.Create(
            enumerate(partition_row_subsets))

    def join_row_with_subset(index_row, subset_dict):
        index, row = index_row
        return index, row[subset_dict[index], :]

    new_pcollection = self.pcollection | gensym("row_subset") >> beam.Map(
        join_row_with_subset, AsDict(subset_pcollection))

    return self._new(
        pcollection=new_pcollection,
        shape=new_shape,
        partition_row_counts=new_partition_row_counts,
    )
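# The snippet above leans on Beam's side-input mechanism to emulate Spark's
# zip. A minimal, self-contained sketch of that same pattern, independent of
# the library-specific helpers (gensym, _copartition) used above:

import apache_beam as beam
from apache_beam.pvalue import AsDict


def zip_via_side_input_demo():
    with beam.Pipeline() as p:
        main = p | 'main' >> beam.Create([(0, 'row-a'), (1, 'row-b')])
        side = p | 'side' >> beam.Create([(0, 'subset-x'), (1, 'subset-y')])

        def join(index_row, side_dict):
            # Pair each main element with the side-input value at its index
            index, row = index_row
            return index, (row, side_dict[index])

        main | beam.Map(join, AsDict(side)) | beam.Map(print)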
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    offer_stat_pipeline_options = pipeline_options.view_as(OfferStatPipelineOptions)

    p = beam.Pipeline(options=pipeline_options)

    users = (p
             | "Read users" >> beam.io.Read(beam.io.BigQuerySource(
                 table=offer_stat_pipeline_options.users_bq_table,
                 flatten_results=False))
             | beam.Map(lambda user_row: (user_row['account_id'], user_row['country'])))

    account_offers = (p
                      | "Read account offers" >> beam.io.Read(beam.io.BigQuerySource(
                          table=offer_stat_pipeline_options.account_offers_bq_table,
                          flatten_results=False))
                      | beam.Map(lambda row: (row['account_id'], row)))

    offers = (p
              | "Read offers" >> beam.io.Read(beam.io.BigQuerySource(
                  table=offer_stat_pipeline_options.offers_bq_table,
                  flatten_results=False))
              | beam.Map(lambda row: (row['offer_id'], row['offer_name'])))

    ({'users': users, 'account_offers': account_offers}
     | beam.CoGroupByKey()
     | beam.ParDo(UserCountryMerger())
     | beam.Map(merge_offer_name, offers=AsDict(offers))
     | beam.Map(lambda enriched_offer: (
         (enriched_offer['offer_name'], enriched_offer['user_country']),
         enriched_offer))
     | beam.combiners.Count.PerKey()
     | 'Map to BQ row' >> beam.Map(convert_to_bq_row)
     | 'Writing offers to BQ' >> beam.io.WriteToBigQuery(
         table=offer_stat_pipeline_options.offer_stat_bq_table,
         create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=BigQueryDisposition.WRITE_TRUNCATE,
         schema='offer_name:STRING,user_country:STRING,count:INTEGER'))

    result = p.run()
    result.wait_until_finish()
def run_test_pipeline(self, person_id: int, sentence: SentenceType,
                      us_mo_sentence_status_rows: List[Dict[str, str]],
                      expected_sentence: SentenceType):
    """Runs a test pipeline to test ConvertSentenceToStateSpecificType and
    checks the output against expected."""
    test_pipeline = TestPipeline()

    us_mo_sentence_statuses = (
        test_pipeline
        | 'Create MO sentence statuses' >> beam.Create(us_mo_sentence_status_rows))

    sentence_status_rankings_as_kv = (
        us_mo_sentence_statuses
        | 'Convert sentence status ranking table to KV tuples' >>
        beam.ParDo(ConvertDictToKVTuple(), 'sentence_external_id'))

    # Group the sentence status tuples by sentence_external_id
    us_mo_sentence_statuses_by_sentence = (
        sentence_status_rankings_as_kv
        | 'Group the sentence status ranking tuples by sentence_external_id' >>
        beam.GroupByKey())

    output = (
        test_pipeline
        | beam.Create([(person_id, sentence)])
        | 'Convert sentence' >> beam.ParDo(
            entity_hydration_utils.ConvertSentenceToStateSpecificType(),
            AsDict(us_mo_sentence_statuses_by_sentence)))

    # Expect no change
    expected_output = [(person_id, expected_sentence)]

    assert_that(output,
                self.convert_sentence_output_is_valid(expected_output))

    test_pipeline.run()
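# ConvertDictToKVTuple is used throughout these examples but never shown.
# A plausible sketch, assuming it simply re-keys each row dict by the named
# column (the real DoFn lives in the project's pipeline utilities):

import apache_beam as beam


class ConvertDictToKVTuple(beam.DoFn):
    """Converts a row dict into a (key, row) tuple for grouping or AsDict."""

    def process(self, element, key_field):
        # key_field is passed positionally at ParDo time, e.g.
        # beam.ParDo(ConvertDictToKVTuple(), 'sentence_external_id')
        yield element[key_field], element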
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    offer_stat_pipeline_options = pipeline_options.view_as(
        OfferStatPipelineOptions)

    p = beam.Pipeline(options=pipeline_options)

    users = (p
             | "Read users" >> beam.io.Read(beam.io.BigQuerySource(
                 table=offer_stat_pipeline_options.users_bq_table,
                 flatten_results=False))
             | beam.Map(lambda user_row: (user_row['account_id'], user_row['country'])))

    account_offers = (p
                      | "Read account offers" >> beam.io.Read(beam.io.BigQuerySource(
                          table=offer_stat_pipeline_options.account_offers_bq_table,
                          flatten_results=False))
                      | beam.Map(lambda row: (row['account_id'], row)))

    offers = (p
              | "Read offers" >> beam.io.Read(beam.io.BigQuerySource(
                  table=offer_stat_pipeline_options.offers_bq_table,
                  flatten_results=False))
              | beam.Map(lambda row: (row['offer_id'], row['offer_name'])))

    ({'users': users, 'account_offers': account_offers}
     | beam.CoGroupByKey()
     | beam.ParDo(UserCountryMerger())
     | beam.Map(merge_offer_name, offers=AsDict(offers))
     | beam.ParDo(beam_util.LoggerDoFn()))

    result = p.run()
    result.wait_until_finish()
def test_pcollectionview_not_recreated(self):
    pipeline = Pipeline('DirectRunner')
    value = pipeline | 'create1' >> Create([1, 2, 3])
    value2 = pipeline | 'create2' >> Create([(1, 1), (2, 2), (3, 3)])
    value3 = pipeline | 'create3' >> Create([(1, 1), (2, 2), (3, 3)])
    self.assertEqual(AsSingleton(value), AsSingleton(value))
    self.assertEqual(AsSingleton('new', value, default_value=1),
                     AsSingleton('new', value, default_value=1))
    self.assertNotEqual(AsSingleton(value),
                        AsSingleton('new', value, default_value=1))
    self.assertEqual(AsIter(value), AsIter(value))
    self.assertEqual(AsList(value), AsList(value))
    self.assertEqual(AsDict(value2), AsDict(value2))
    self.assertNotEqual(AsSingleton(value), AsSingleton(value2))
    self.assertNotEqual(AsIter(value), AsIter(value2))
    self.assertNotEqual(AsList(value), AsList(value2))
    self.assertNotEqual(AsDict(value2), AsDict(value3))
def run():
    # Note: `p` (the pipeline) and `output_table` are assumed to be defined
    # at module scope in the original source.
    main_query = """SELECT
        CAST(a.HSN_ACCT_NUM AS INT64) AS CUSTOMER_ID,
        CAST(a.ROW_CREATED_DATE AS TIMESTAMP) AS SOURCE_CREATE_DT,
        a.PRIMARY_PHONE_NUM AS PRIMARY_PHONE_NUMBER,
        CAST(SUBSTR(a.PRIMARY_PHONE_NUM, 1, 3) AS INT64) AS DERIVED_AREA_CODE,
        CASE
            WHEN LENGTH(a.BILL_SHIP_ADDR_SYNC_FLAG) = 0 THEN 0
            ELSE CAST(a.BILL_SHIP_ADDR_SYNC_FLAG AS INT64)
        END AS BILLSHIP_ADDR_SYNC_IND,
        a.GUEST_CUSTOMER_FLAG AS GUEST_CODE,
        a.GUID AS DIGITAL_CUSTOMER_ID,
        a.MARKET_PLACE_ID AS MARKET_PLACE_ID,
        a.MARKET_PLACE_CUSTID AS MARKET_PLACE_CUST_ID,
        (srg_key.MAX_VALUE_KEY + ROW_NUMBER() OVER()) AS CUSTOMER_KEY,
        '999999999999' AS PRIMARY_ADDRESS_KEY,
        '999999999999' AS BILLING_ADDRESS_KEY,
        'CLIC' AS ETL_SOURCE_SYSTEM,
        '0' AS PURGED_IND,
        '0' AS MERGED_IND,
        '0' AS TEST_CUSTOMER_IND,
        '0' AS BLOCK_CURRENT_IND,
        '0' AS BLOCK_LIFETIME_IND,
        '999999999999' AS IDCENTRIC_INDIVIDUAL_ID,
        '999999999999' AS IDCENTRIC_HOUSEHOLD_ID,
        '999999999999' AS IDCENTRIC_ADDRESS_ID,
        '0' AS VOID_IND,
        CAST(FORMAT_DATETIME('%Y%m%d%H%M%S', CURRENT_DATETIME()) AS INT64) AS UPD_BATCH_NBR,
        CAST(FORMAT_DATETIME('%Y%m%d%H%M%S', CURRENT_DATETIME()) AS INT64) AS INS_BATCH_NBR,
        '999999999999' AS MKTADDR_ADDRESS_KEY,
        '0' AS PRIVACY_IND
    FROM `automatic-asset-253215.STAGE.STG_CLIC_CUSTSORGEXT` a,
         `automatic-asset-253215.STAGE.STG_CLIC_SURROGKEYS` srg_key
    WHERE srg_key.TABLE_NAME = "IM_CUSTOMER_ATTRIBUTE_REF"
    """

    lookup_query = """SELECT DISTINCT CUSTOMER_ID
    FROM `automatic-asset-253215.CORE.IM_CUSTOMER_ATTRIBUTE_REF`"""

    IM_data = (
        p
        | 'Read Cust_Ids From IM' >> beam.io.Read(
            beam.io.BigQuerySource(query=lookup_query, use_standard_sql=True))
        | 'Get Cust_Ids ' >> beam.Map(lambda row: (row['CUSTOMER_ID'], row)))

    cust_org_data = (
        p
        | 'Read from custorgext' >> beam.io.Read(
            beam.io.BigQuerySource(query=main_query, use_standard_sql=True))
        | 'Lookup' >> beam.Map(lookup, AsDict(IM_data))
        | 'Filter' >> beam.ParDo(filter_out_nones)
        | 'Insert: Unmatched Records' >> beam.io.WriteToBigQuery(
            output_table,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER)
        # | 'Updating Surrogate_Key' >> beam.ParDo(updateSrgKey)
    )

    p.run().wait_until_finish()
def get_enriched_events(salesevent: beam.pvalue.PCollection,
                        sideinput_collections: Dict[str, beam.pvalue.PCollection]) \
        -> beam.pvalue.PCollection:
    """Gets enriched events by
       a) Calling a transform that combines the primary event with the
          corresponding side input values
       b) Grouping events by a dummy key to combine all events in a window
          into one shard
       c) Discarding the dummy key

    Args:
        salesevent: Event representing a sales transaction
        sideinput_collections: Set of side input collections
    """
    # yapf: disable
    return (salesevent
            | "Enrich event" >> beam.Map(transforms.enrich_event,
                                         AsDict(sideinput_collections["bonuspoints"]),
                                         AsDict(sideinput_collections["discountpct"]),
                                         AsDict(sideinput_collections["category"]))
            | "Group events by dummy Key" >> beam.GroupByKey()
            | "Discard dummy Key" >> beam.Values()
           )
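# transforms.enrich_event is defined elsewhere in that project. A
# hypothetical sketch of its shape, assuming events are dicts keyed by a
# 'productid' field (the field names here are illustrative, not confirmed
# by the source):

def enrich_event(event, bonuspoints, discountpct, category):
    """Joins one sales event with the three side-input dicts and emits it
    under a dummy key, so the later GroupByKey collapses a window into one
    shard and beam.Values() drops the key again."""
    enriched = dict(event)
    enriched['bonuspoints'] = bonuspoints.get(event['productid'])
    enriched['discountpct'] = discountpct.get(event['productid'])
    enriched['category'] = category.get(event['productid'])
    return None, enriched  # dummy key, discarded downstream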
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.
    # Specifically, we have the input file to load and the output table to
    # write to.
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read. This can be a local file or '
        'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # Useful for quickly debugging on a small set of data.
        default='gs://python-dataflow-example/data_files/head_usa_names.csv')

    # The output defaults to the lake dataset in your BigQuery project.
    # You'll have to create the lake dataset yourself using this command:
    # bq mk lake
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.usa_names_enriched')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataIngestion is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_ingestion = DataIngestion()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line. This includes information like where Dataflow should
    # store temp files and what the project id is.
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    # This function adds in a full state name by looking up the full name in
    # the short_to_long_name_map. The short_to_long_name_map comes from a
    # read from BigQuery in the next few lines.
    def add_full_state_name(row, short_to_long_name_map):
        row['state_full_name'] = short_to_long_name_map[row['state']]
        return row

    # This is a second source of data. The source is from BigQuery.
    # This will come into our pipeline as a side input.
    read_query = """
    SELECT
        state_name,
        state_abbreviation
    FROM
        `python-dataflow-example.example_data.state_abbreviations`"""

    state_abbreviations = (
        p
        | 'Read from BigQuery' >> beam.io.Read(
            beam.io.BigQuerySource(query=read_query, use_standard_sql=True))
        # We must create a python tuple of key to value pairs here in order
        # to use the data as a side input. Dataflow will use the keys to
        # distribute the work to the correct worker.
        | 'Abbreviation to Full Name' >> beam.Map(
            lambda row: (row['state_abbreviation'], row['state_name'])))

    (p
     # Read the file. This is the source of the pipeline. All further
     # processing starts with lines read from the file. We use the input
     # argument from the command line. We also skip the first line, which is
     # a header row.
     | 'Read From Text' >> beam.io.ReadFromText(known_args.input,
                                                skip_header_lines=1)
     # Translates from the raw string data in the CSV to a dictionary.
     # The dictionary is keyed by column names, with the values being the
     # values we want to store in BigQuery.
     | 'String to BigQuery Row' >> beam.Map(
         lambda s: data_ingestion.parse_method(s))
     # Here we pass in a side input, which is data that comes from outside
     # our CSV source. The side input contains a map of states to their full
     # name.
     | 'Join Data' >> beam.Map(add_full_state_name,
                               AsDict(state_abbreviations))
     # This is the final stage of the pipeline, where we define the
     # destination of the data. In this case we are writing to BigQuery.
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             known_args.output,
             # Here we use the JSON schema read in from a JSON file.
             # Specifying the schema allows the API to create the table
             # correctly if it does not yet exist.
             schema=schema,
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             # Deletes all data in the BigQuery table before writing.
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

    p.run().wait_until_finish()
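# beam.io.BigQuerySource and beam.io.BigQuerySink used above are the legacy
# BigQuery IO. A sketch of the equivalent read/write on current Beam
# releases, using the non-deprecated transforms (the parameters mirror the
# example above):

import apache_beam as beam


def run_modern_bigquery_io(p, read_query, schema, output_table):
    rows = p | 'Read states' >> beam.io.ReadFromBigQuery(
        query=read_query, use_standard_sql=True)
    return rows | 'Write names' >> beam.io.WriteToBigQuery(
        output_table,
        schema=schema,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)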
def run(argv=None):
    """Runs the recidivism calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access
    # attributes of relationship properties on the SQLAlchemy
    # room_schema_class before they have been loaded. However, if *any*
    # SQLAlchemy objects have been instantiated, then the relationship
    # properties are loaded and their attributes can be successfully
    # accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, pipeline_args = parse_arguments(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = all_pipeline_options['project'] + '.' + \
        known_args.reference_input

    with beam.Pipeline(argv=pipeline_args) as p:
        # Get StatePersons
        persons = (p
                   | 'Load Persons' >> BuildRootEntity(
                       dataset=query_dataset,
                       data_dict=None,
                       root_schema_class=schema.StatePerson,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field='person_id',
                       build_related_entities=True))

        # Get StateIncarcerationPeriods
        incarceration_periods = (
            p
            | 'Load IncarcerationPeriods' >> BuildRootEntity(
                dataset=query_dataset,
                data_dict=None,
                root_schema_class=schema.StateIncarcerationPeriod,
                root_entity_class=entities.StateIncarcerationPeriod,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateSupervisionViolations
        supervision_violations = (
            p
            | 'Load SupervisionViolations' >> BuildRootEntity(
                dataset=query_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionViolation,
                root_entity_class=entities.StateSupervisionViolation,
                unifying_id_field='person_id',
                build_related_entities=True))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (
            p
            | 'Load SupervisionViolationResponses' >> BuildRootEntity(
                dataset=query_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionViolationResponse,
                root_entity_class=entities.StateSupervisionViolationResponse,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Group StateSupervisionViolationResponses and
        # StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {
                'violations': supervision_violations,
                'violation_responses': supervision_violation_responses
            }
            | 'Group StateSupervisionViolationResponses to '
              'StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on
        # the corresponding StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
              'the StateSupervisionViolationResponses' >> beam.ParDo(
                  SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and
        # StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {
                'incarceration_periods': incarceration_periods,
                'violation_responses':
                    violation_responses_with_hydrated_violations
            }
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities
        # on the corresponding StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
              'the StateIncarcerationPeriods' >> beam.ParDo(
                  SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods
        person_and_incarceration_periods = (
            {
                'person': persons,
                'incarceration_periods':
                    incarceration_periods_with_source_violations
            }
            | 'Group StatePerson to StateIncarcerationPeriods' >>
            beam.CoGroupByKey())

        # Bring in the table that associates people and their county of
        # residence
        person_id_to_county_query = \
            f"SELECT * FROM " \
            f"`{reference_dataset}.persons_to_recent_county_of_residence`"

        person_id_to_county_kv = (
            p
            | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(beam.io.BigQuerySource(
                query=person_id_to_county_query,
                use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Identify ReleaseEvents from the StatePerson's
        # StateIncarcerationPeriods
        person_events = (
            person_and_incarceration_periods
            | "ClassifyReleaseEvents" >> beam.ParDo(
                ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)))

        # Get dimensions to include and methodologies to use
        inclusions, methodologies = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get recidivism metrics
        recidivism_metrics = (
            person_events
            | 'Get Recidivism Metrics' >> GetRecidivismMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=inclusions))

        filter_metrics_kwargs = {'methodologies': methodologies}

        # Filter out unneeded metrics
        final_recidivism_metrics = (
            recidivism_metrics
            | 'Filter out unwanted metrics' >> beam.ParDo(
                FilterMetrics(), **filter_metrics_kwargs))

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            final_recidivism_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivismMetricWritableDict()).with_outputs(
                    'rates', 'counts', 'liberties'))

        # Write the recidivism metrics to the output tables in BigQuery
        rates_table = known_args.output + '.recidivism_rate_metrics'
        counts_table = known_args.output + '.recidivism_count_metrics'
        liberty_table = known_args.output + '.recidivism_liberty_metrics'

        _ = (writable_metrics.rates
             | f"Write rate metrics to BQ table: {rates_table}" >>
             beam.io.WriteToBigQuery(
                 table=rates_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.counts
             | f"Write count metrics to BQ table: {counts_table}" >>
             beam.io.WriteToBigQuery(
                 table=counts_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.liberties
             | f"Write liberty metrics to BQ table: {liberty_table}" >>
             beam.io.WriteToBigQuery(
                 table=liberty_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def testClassifyProgramAssignments_NoAssessments(self):
    """Tests the ClassifyProgramAssignments DoFn."""
    fake_person_id = 12345

    fake_person = entities.StatePerson.new_with_defaults(
        person_id=fake_person_id,
        gender=Gender.MALE,
        birthdate=date(1970, 1, 1),
        residency_status=ResidencyStatus.PERMANENT)

    program_assignment = entities.StateProgramAssignment.new_with_defaults(
        state_code='US_CA',
        program_id='PG3',
        referral_date=date(2009, 10, 3))

    supervision_period = \
        entities.StateSupervisionPeriod.new_with_defaults(
            supervision_period_id=111,
            status=StateSupervisionPeriodStatus.TERMINATED,
            state_code='UT',
            start_date=date(2008, 3, 5),
            termination_date=date(2010, 5, 19),
            termination_reason=
            StateSupervisionPeriodTerminationReason.DISCHARGE,
            supervision_type=StateSupervisionType.PAROLE)

    person_periods = {
        'person': [fake_person],
        'program_assignments': [program_assignment],
        'assessments': [],
        'supervision_periods': [supervision_period]
    }

    program_event = ProgramReferralEvent(
        state_code=program_assignment.state_code,
        program_id=program_assignment.program_id,
        event_date=program_assignment.referral_date,
        supervision_type=supervision_period.supervision_type,
        supervising_officer_external_id='OFFICER0009',
        supervising_district_external_id='10')

    correct_output = [(fake_person, [program_event])]

    test_pipeline = TestPipeline()

    supervision_period_to_agent_map = {
        'agent_id': 1010,
        'agent_external_id': 'OFFICER0009',
        'district_external_id': '10',
        'supervision_period_id': supervision_period.supervision_period_id
    }

    supervision_period_to_agent_associations = (
        test_pipeline
        | 'Create SupervisionPeriod to Agent table' >> beam.Create(
            [supervision_period_to_agent_map]))

    supervision_periods_to_agent_associations_as_kv = (
        supervision_period_to_agent_associations
        | 'Convert SupervisionPeriod to Agent table to KV tuples' >>
        beam.ParDo(pipeline.ConvertDictToKVTuple(), 'supervision_period_id'))

    output = (test_pipeline
              | beam.Create([(fake_person_id, person_periods)])
              | 'Identify Program Events' >> beam.ParDo(
                  pipeline.ClassifyProgramAssignments(),
                  AsDict(supervision_periods_to_agent_associations_as_kv)))

    assert_that(output, equal_to(correct_output))

    test_pipeline.run()
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access
    # attributes of relationship properties on the SQLAlchemy
    # room_schema_class before they have been loaded. However, if *any*
    # SQLAlchemy objects have been instantiated, then the relationship
    # properties are loaded and their attributes can be successfully
    # accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p
                   | 'Load StatePersons' >> BuildRootEntity(
                       dataset=query_dataset,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field=entities.StatePerson.get_class_id_name(),
                       build_related_entities=True,
                       unifying_id_field_filter_set=person_id_filter_set))

        # Get StateSentenceGroups
        sentence_groups = (
            p
            | 'Load StateSentenceGroups' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateSentenceGroup,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (
            p
            | 'Load StateIncarcerationSentences' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateIncarcerationSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (
            p
            | 'Load StateSupervisionSentences' >> BuildRootEntity(
                dataset=query_dataset,
                root_entity_class=entities.StateSupervisionSentence,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status
            # ranking information
            us_mo_sentence_status_query = \
                f"SELECT * FROM `{reference_dataset}.us_mo_sentence_statuses`"

            us_mo_sentence_statuses = (
                p
                | "Read MO sentence status table from BigQuery" >>
                beam.io.Read(beam.io.BigQuerySource(
                    query=us_mo_sentence_status_query,
                    use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p
                | f"Generate empty MO statuses list for non-MO state run: "
                  f"{state_code} " >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        supervision_sentences_and_statuses = (
            {
                'incarceration_sentences': incarceration_sentences,
                'supervision_sentences': supervision_sentences,
                'sentence_statuses': us_mo_sentence_status_rankings_as_kv
            }
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey())

        sentences_converted = (
            supervision_sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        sentences_and_sentence_groups = (
            {
                'sentence_groups': sentence_groups,
                'incarceration_sentences':
                    sentences_converted.incarceration_sentences,
                'supervision_sentences':
                    sentences_converted.supervision_sentences
            }
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
                SetSentencesOnSentenceGroup()))

        # Group each StatePerson with their related entities
        person_and_sentence_groups = (
            {
                'person': persons,
                'sentence_groups': sentence_groups_with_hydrated_sentences
            }
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Bring in the table that associates people and their county of
        # residence
        person_id_to_county_query = \
            f"SELECT * FROM `{reference_dataset}.persons_to_recent_county_of_residence`"

        person_id_to_county_kv = (
            p
            | "Read person_id to county associations from BigQuery" >>
            beam.io.Read(beam.io.BigQuerySource(
                query=person_id_to_county_query,
                use_standard_sql=True))
            | "Convert person_id to county association table to KV" >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

        # Identify IncarcerationEvents from the StatePerson's
        # StateIncarcerationPeriods
        person_events = (
            person_and_sentence_groups
            | 'Classify Incarceration Events' >> beam.ParDo(
                ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_events
            | 'Get Incarceration Metrics' >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing "
                "metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                IncarcerationMetricWritableDict()).with_outputs(
                    'admissions', 'populations', 'releases'))

        # Write the metrics to the output tables in BigQuery
        admissions_table = output + '.incarceration_admission_metrics'
        population_table = output + '.incarceration_population_metrics'
        releases_table = output + '.incarceration_release_metrics'

        _ = (writable_metrics.admissions
             | f"Write admission metrics to BQ table: {admissions_table}" >>
             beam.io.WriteToBigQuery(
                 table=admissions_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.populations
             | f"Write population metrics to BQ table: {population_table}" >>
             beam.io.WriteToBigQuery(
                 table=population_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.releases
             | f"Write release metrics to BQ table: {releases_table}" >>
             beam.io.WriteToBigQuery(
                 table=releases_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def run(argv=None):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access
    # attributes of relationship properties on the SQLAlchemy
    # room_schema_class before they have been loaded. However, if *any*
    # SQLAlchemy objects have been instantiated, then the relationship
    # properties are loaded and their attributes can be successfully
    # accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, pipeline_args = parse_arguments(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = all_pipeline_options['project'] + '.' + \
        known_args.reference_input

    with beam.Pipeline(argv=pipeline_args) as p:
        # Get StatePersons
        persons = (p
                   | 'Load Persons' >> BuildRootEntity(
                       dataset=input_dataset,
                       data_dict=None,
                       root_schema_class=schema.StatePerson,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field='person_id',
                       build_related_entities=True))

        # Get StateProgramAssignments
        program_assignments = (
            p
            | 'Load Program Assignments' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateProgramAssignment,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field='person_id',
                build_related_entities=True))

        # Get StateAssessments
        assessments = (p
                       | 'Load Assessments' >> BuildRootEntity(
                           dataset=input_dataset,
                           data_dict=None,
                           root_schema_class=schema.StateAssessment,
                           root_entity_class=entities.StateAssessment,
                           unifying_id_field='person_id',
                           build_related_entities=False))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p
            | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                data_dict=None,
                root_schema_class=schema.StateSupervisionPeriod,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field='person_id',
                build_related_entities=False))

        supervision_period_to_agent_association_query = \
            f"SELECT * FROM `{reference_dataset}.supervision_period_to_agent_association`"

        supervision_period_to_agent_associations = (
            p
            | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(beam.io.BigQuerySource(
                query=supervision_period_to_agent_association_query,
                use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the
        # value for the supervision_period_id column as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        # Group each StatePerson with their other entities
        persons_entities = (
            {
                'person': persons,
                'program_assignments': program_assignments,
                'assessments': assessments,
                'supervision_periods': supervision_periods
            }
            | 'Group StatePerson to StateProgramAssignments and' >>
            beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's
        # StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get dimensions to include and methodologies to use
        inclusions, _ = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # The number of months to limit the monthly calculation output to
        calculation_month_limit = known_args.calculation_month_limit

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get program metrics
        program_metrics = (
            person_program_events
            | 'Get Program Metrics' >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=inclusions,
                calculation_month_limit=calculation_month_limit))

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                ProgramMetricWritableDict()).with_outputs('referrals'))

        # Write the metrics to the output tables in BigQuery
        referrals_table = known_args.output + '.program_referral_metrics'

        _ = (writable_metrics.referrals
             | f"Write referral metrics to BQ table: {referrals_table}" >>
             beam.io.WriteToBigQuery(
                 table=referrals_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.
    # This defaults the output table in your BigQuery; you'll have to create
    # the example_data dataset yourself using: bq mk temp
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.orders_denormalized_sideinput')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)

    # DataLakeToDataMart is a class we built in this script to hold the
    # logic for transforming the file into a BigQuery table.
    data_lake_to_data_mart = DataLakeToDataMart()

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_lake_to_data_mart.schema_str)

    # This query returns details about the account, normalized into a
    # different table. We will be joining the data in to the main orders
    # dataset in order to create a denormalized table.
    account_details_source = (
        p
        | 'Read Account Details from BigQuery ' >> beam.io.Read(
            beam.io.BigQuerySource(query="""
                SELECT
                  acct_number,
                  acct_company_name,
                  acct_group_name,
                  acct_name,
                  acct_org_name,
                  address,
                  city,
                  state,
                  zip_code,
                  country
                FROM
                  `qwiklabs-resources.python_dataflow_example.account`""",
                                   use_standard_sql=True))
        # This next stage of the pipeline maps the acct_number to a single
        # row of results from BigQuery. Mapping this way helps Dataflow move
        # your data around to different workers. When later stages of the
        # pipeline run, all results from a given account number will run on
        # one worker.
        | 'Account Details' >> beam.Map(lambda row: (row['acct_number'], row)))

    orders_query = data_lake_to_data_mart.get_orders_query()
    (p
     # Read the orders from BigQuery. This is the source of the pipeline.
     # All further processing starts with rows read from the query results
     # here.
     | 'Read Orders from BigQuery ' >> beam.io.Read(
         beam.io.BigQuerySource(query=orders_query, use_standard_sql=True))
     # Here we pass in a side input, which is data that comes from outside
     # our main source. The side input contains a map of account numbers to
     # their account details.
     | 'Join Data with sideInput' >> beam.Map(
         data_lake_to_data_mart.add_account_details,
         AsDict(account_details_source))
     # This is the final stage of the pipeline, where we define the
     # destination of the data. In this case we are writing to BigQuery.
     | 'Write Data to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             known_args.output,
             # Here we use the JSON schema read in from a JSON file.
             # Specifying the schema allows the API to create the table
             # correctly if it does not yet exist.
             schema=schema,
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             # Deletes all data in the BigQuery table before writing.
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

    p.run().wait_until_finish()
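# DataLakeToDataMart.add_account_details is defined elsewhere in that
# script. A hypothetical sketch of the join it performs (the 'acct_number'
# key matches the tuples built above; the update-merge is an assumption):

def add_account_details(row, account_details):
    details = account_details.get(row['acct_number'])
    if details is not None:
        # Merge the normalized account columns into the order row
        row.update(details)
    return row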
def testIncarcerationPipeline(self):
    fake_person_id = 12345

    fake_person = schema.StatePerson(
        person_id=fake_person_id,
        gender=Gender.MALE,
        birthdate=date(1970, 1, 1),
        residency_status=ResidencyStatus.PERMANENT)

    persons_data = [normalized_database_base_dict(fake_person)]

    race_1 = schema.StatePersonRace(person_race_id=111,
                                    state_code='CA',
                                    race=Race.BLACK,
                                    person_id=fake_person_id)

    race_2 = schema.StatePersonRace(person_race_id=111,
                                    state_code='ND',
                                    race=Race.WHITE,
                                    person_id=fake_person_id)

    races_data = normalized_database_base_dict_list([race_1, race_2])

    ethnicity = schema.StatePersonEthnicity(person_ethnicity_id=111,
                                            state_code='CA',
                                            ethnicity=Ethnicity.HISPANIC,
                                            person_id=fake_person_id)

    ethnicity_data = normalized_database_base_dict_list([ethnicity])

    sentence_group = schema.StateSentenceGroup(sentence_group_id=111,
                                               person_id=fake_person_id)

    initial_incarceration = schema.StateIncarcerationPeriod(
        incarceration_period_id=1111,
        status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
        state_code='CA',
        county_code='124',
        facility='San Quentin',
        facility_security_level=StateIncarcerationFacilitySecurityLevel.MAXIMUM,
        admission_reason=StateIncarcerationPeriodAdmissionReason.NEW_ADMISSION,
        projected_release_reason=StateIncarcerationPeriodReleaseReason.CONDITIONAL_RELEASE,
        admission_date=date(2008, 11, 20),
        release_date=date(2010, 12, 4),
        release_reason=StateIncarcerationPeriodReleaseReason.SENTENCE_SERVED,
        person_id=fake_person_id,
    )

    first_reincarceration = schema.StateIncarcerationPeriod(
        incarceration_period_id=2222,
        status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY,
        state_code='CA',
        county_code='124',
        facility='San Quentin',
        facility_security_level=StateIncarcerationFacilitySecurityLevel.MAXIMUM,
        admission_reason=StateIncarcerationPeriodAdmissionReason.NEW_ADMISSION,
        projected_release_reason=StateIncarcerationPeriodReleaseReason.CONDITIONAL_RELEASE,
        admission_date=date(2011, 4, 5),
        release_date=date(2014, 4, 14),
        release_reason=StateIncarcerationPeriodReleaseReason.SENTENCE_SERVED,
        person_id=fake_person_id)

    subsequent_reincarceration = schema.StateIncarcerationPeriod(
        incarceration_period_id=3333,
        status=StateIncarcerationPeriodStatus.IN_CUSTODY,
        state_code='CA',
        county_code='124',
        facility='San Quentin',
        facility_security_level=StateIncarcerationFacilitySecurityLevel.MAXIMUM,
        admission_reason=StateIncarcerationPeriodAdmissionReason.NEW_ADMISSION,
        projected_release_reason=StateIncarcerationPeriodReleaseReason.CONDITIONAL_RELEASE,
        admission_date=date(2017, 1, 4),
        person_id=fake_person_id)

    incarceration_sentence = schema.StateIncarcerationSentence(
        incarceration_sentence_id=1111,
        sentence_group_id=sentence_group.sentence_group_id,
        incarceration_periods=[
            initial_incarceration,
            first_reincarceration,
            subsequent_reincarceration
        ],
        person_id=fake_person_id)

    supervision_sentence = schema.StateSupervisionSentence(
        supervision_sentence_id=123,
        person_id=fake_person_id)

    sentence_group.incarceration_sentences = [incarceration_sentence]

    sentence_group_data = [normalized_database_base_dict(sentence_group)]

    incarceration_sentence_data = [
        normalized_database_base_dict(incarceration_sentence)
    ]

    supervision_sentence_data = [
        normalized_database_base_dict(supervision_sentence)
    ]

    incarceration_periods_data = [
        normalized_database_base_dict(initial_incarceration),
        normalized_database_base_dict(first_reincarceration),
        normalized_database_base_dict(subsequent_reincarceration)
    ]

    state_incarceration_sentence_incarceration_period_association = [
        {
            'incarceration_period_id':
                initial_incarceration.incarceration_period_id,
            'incarceration_sentence_id':
                incarceration_sentence.incarceration_sentence_id,
        },
        {
            'incarceration_period_id':
                first_reincarceration.incarceration_period_id,
            'incarceration_sentence_id':
                incarceration_sentence.incarceration_sentence_id,
        },
        {
            'incarceration_period_id':
                subsequent_reincarceration.incarceration_period_id,
            'incarceration_sentence_id':
                incarceration_sentence.incarceration_sentence_id,
        },
    ]

    data_dict = {
        schema.StatePerson.__tablename__: persons_data,
        schema.StatePersonRace.__tablename__: races_data,
        schema.StatePersonEthnicity.__tablename__: ethnicity_data,
        schema.StateSentenceGroup.__tablename__: sentence_group_data,
        schema.StateIncarcerationSentence.__tablename__:
            incarceration_sentence_data,
        schema.StateSupervisionSentence.__tablename__:
            supervision_sentence_data,
        schema.StateIncarcerationPeriod.__tablename__:
            incarceration_periods_data,
        schema.state_incarceration_sentence_incarceration_period_association_table.name:
            state_incarceration_sentence_incarceration_period_association,
        schema.state_supervision_sentence_incarceration_period_association_table.name:
            [{}]
    }

    test_pipeline = TestPipeline()

    # Get StatePersons
    persons = (test_pipeline
               | 'Load Persons' >> extractor_utils.BuildRootEntity(
                   dataset=None,
                   data_dict=data_dict,
                   root_schema_class=schema.StatePerson,
                   root_entity_class=entities.StatePerson,
                   unifying_id_field='person_id',
                   build_related_entities=True))

    # Get StateSentenceGroups
    sentence_groups = (
        test_pipeline
        | 'Load StateSentencegroups' >> extractor_utils.BuildRootEntity(
            dataset=None,
            data_dict=data_dict,
            root_schema_class=schema.StateSentenceGroup,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field='person_id',
            build_related_entities=True))

    # Get StateIncarcerationSentences
    incarceration_sentences = (
        test_pipeline
        | 'Load StateIncarcerationSentences' >>
        extractor_utils.BuildRootEntity(
            dataset=None,
            data_dict=data_dict,
            root_schema_class=schema.StateIncarcerationSentence,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field='person_id',
            build_related_entities=True))

    # Get StateSupervisionSentences
    supervision_sentences = (
        test_pipeline
        | 'Load StateSupervisionSentences' >>
        extractor_utils.BuildRootEntity(
            dataset=None,
            data_dict=data_dict,
            root_schema_class=schema.StateSupervisionSentence,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field='person_id',
            build_related_entities=True))

    sentences_and_sentence_groups = (
        {
            'sentence_groups': sentence_groups,
            'incarceration_sentences': incarceration_sentences,
            'supervision_sentences': supervision_sentences
        }
        | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

    sentence_groups_with_hydrated_sentences = (
        sentences_and_sentence_groups
        | 'Set hydrated sentences on sentence groups' >> beam.ParDo(
            SetSentencesOnSentenceGroup()))

    # Group each StatePerson with their related entities
    person_and_sentence_groups = (
        {
            'person': persons,
            'sentence_groups': sentence_groups_with_hydrated_sentences
        }
        | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

    # Identify IncarcerationEvents from the StatePerson's
    # StateIncarcerationPeriods
    fake_person_id_to_county_query_result = [{
        'person_id': fake_person_id,
        'county_of_residence': _COUNTY_OF_RESIDENCE
    }]

    person_id_to_county_kv = (
        test_pipeline
        | "Read person id to county associations from BigQuery" >>
        beam.Create(fake_person_id_to_county_query_result)
        | "Convert to KV" >> beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

    person_events = (person_and_sentence_groups
                     | 'Classify Incarceration Events' >> beam.ParDo(
                         pipeline.ClassifyIncarcerationEvents(),
                         AsDict(person_id_to_county_kv)))

    # Get pipeline job details for accessing job_id
    all_pipeline_options = PipelineOptions().get_all_options()

    # Add timestamp for local jobs
    job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
    all_pipeline_options['job_timestamp'] = job_timestamp

    # Get IncarcerationMetrics
    incarceration_metrics = (
        person_events
        | 'Get Incarceration Metrics' >> pipeline.GetIncarcerationMetrics(
            pipeline_options=all_pipeline_options,
            inclusions=ALL_INCLUSIONS_DICT,
            calculation_month_limit=-1))

    assert_that(incarceration_metrics,
                AssertMatchers.validate_metric_type())

    test_pipeline.run()
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_view_input: str, static_reference_input: str, output: str,
        calculation_month_count: int, metric_types: List[str],
        state_code: Optional[str], calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access
    # attributes of relationship properties on the SQLAlchemy
    # room_schema_class before they have been loaded. However, if *any*
    # SQLAlchemy objects have been instantiated, then the relationship
    # properties are loaded and their attributes can be successfully
    # accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()
    project_id = all_pipeline_options['project']

    input_dataset = project_id + '.' + data_input
    reference_dataset = project_id + '.' + reference_view_input
    static_reference_dataset = project_id + '.' + static_reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p
                   | 'Load Persons' >> BuildRootEntity(
                       dataset=input_dataset,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field=entities.StatePerson.get_class_id_name(),
                       build_related_entities=True,
                       unifying_id_field_filter_set=person_id_filter_set,
                       state_code=state_code))

        # Get StateProgramAssignments
        program_assignments = (
            p
            | 'Load Program Assignments' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p
                       | 'Load Assessments' >> BuildRootEntity(
                           dataset=input_dataset,
                           root_entity_class=entities.StateAssessment,
                           unifying_id_field=entities.StatePerson.get_class_id_name(),
                           build_related_entities=False,
                           unifying_id_field_filter_set=person_id_filter_set,
                           state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p
            | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        supervision_period_to_agent_associations_as_kv = (
            p
            | 'Load supervision_period_to_agent_associations_as_kv' >>
            ImportTableAsKVTuples(
                dataset_id=reference_dataset,
                table_id=SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME,
                table_key='supervision_period_id',
                state_code_filter=state_code,
                person_id_filter_set=person_id_filter_set))

        state_race_ethnicity_population_counts = (
            p
            | 'Load state_race_ethnicity_population_counts' >> ImportTable(
                dataset_id=static_reference_dataset,
                table_id='state_race_ethnicity_population_counts',
                state_code_filter=state_code,
                person_id_filter_set=None))

        # Group each StatePerson with their other entities
        persons_entities = (
            {
                'person': persons,
                'program_assignments': program_assignments,
                'assessments': assessments,
                'supervision_periods': supervision_periods
            }
            | 'Group StatePerson to StateProgramAssignments and' >>
            beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's
        # StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        person_metadata = (
            persons
            | "Build the person_metadata dictionary" >> beam.ParDo(
                BuildPersonMetadata(),
                AsList(state_race_ethnicity_population_counts)))

        person_program_events_with_metadata = (
            {
                'person_events': person_program_events,
                'person_metadata': person_metadata
            }
            | 'Group ProgramEvents with person-level metadata' >>
            beam.CoGroupByKey()
            | 'Organize StatePerson, PersonMetadata and ProgramEvents for '
              'calculations' >> beam.ParDo(ExtractPersonEventsMetadata()))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get program metrics
        program_metrics = (
            person_program_events_with_metadata
            | 'Get Program Metrics' >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing "
                "metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivizMetricWritableDict()).with_outputs(
                    ProgramMetricType.PROGRAM_PARTICIPATION.value,
                    ProgramMetricType.PROGRAM_REFERRAL.value))

        # Write the metrics to the output tables in BigQuery
        referrals_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ProgramReferralMetric)
        participation_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            ProgramParticipationMetric)

        _ = (writable_metrics.PROGRAM_REFERRAL
             | f"Write referral metrics to BQ table: {referrals_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=referrals_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.PROGRAM_PARTICIPATION
             | f"Write participation metrics to BQ table: "
               f"{participation_table_id}" >>
             beam.io.WriteToBigQuery(
                 table=participation_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str,
        reference_input: str, output: str, calculation_month_count: int,
        metric_types: List[str], state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the program calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is
    # necessary because the BuildRootEntity function tries to access
    # attributes of relationship properties on the SQLAlchemy
    # room_schema_class before they have been loaded. However, if *any*
    # SQLAlchemy objects have been instantiated, then the relationship
    # properties are loaded and their attributes can be successfully
    # accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p
                   | 'Load Persons' >> BuildRootEntity(
                       dataset=input_dataset,
                       root_entity_class=entities.StatePerson,
                       unifying_id_field=entities.StatePerson.get_class_id_name(),
                       build_related_entities=True,
                       unifying_id_field_filter_set=person_id_filter_set))

        # Get StateProgramAssignments
        program_assignments = (
            p
            | 'Load Program Assignments' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateProgramAssignment,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=True,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        # Get StateAssessments
        assessments = (p
                       | 'Load Assessments' >> BuildRootEntity(
                           dataset=input_dataset,
                           root_entity_class=entities.StateAssessment,
                           unifying_id_field=entities.StatePerson.get_class_id_name(),
                           build_related_entities=False,
                           unifying_id_field_filter_set=person_id_filter_set,
                           state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (
            p
            | 'Load SupervisionPeriods' >> BuildRootEntity(
                dataset=input_dataset,
                root_entity_class=entities.StateSupervisionPeriod,
                unifying_id_field=entities.StatePerson.get_class_id_name(),
                build_related_entities=False,
                unifying_id_field_filter_set=person_id_filter_set,
                state_code=state_code))

        supervision_period_to_agent_association_query = \
            f"SELECT * FROM `{reference_dataset}.supervision_period_to_agent_association`"

        supervision_period_to_agent_associations = (
            p
            | "Read Supervision Period to Agent table from BigQuery" >>
            beam.io.Read(beam.io.BigQuerySource(
                query=supervision_period_to_agent_association_query,
                use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the
        # value for the supervision_period_id column as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id'))

        # Group each StatePerson with their other entities
        persons_entities = (
            {
                'person': persons,
                'program_assignments': program_assignments,
                'assessments': assessments,
                'supervision_periods': supervision_periods
            }
            | 'Group StatePerson to StateProgramAssignments and' >>
            beam.CoGroupByKey())

        # Identify ProgramEvents from the StatePerson's
        # StateProgramAssignments
        person_program_events = (
            persons_entities
            | beam.ParDo(
                ClassifyProgramAssignments(),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime(
            '%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Get program metrics
        program_metrics = (
            person_program_events
            | 'Get Program Metrics' >> GetProgramMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning(
                "Non-empty person filter set - returning before writing "
                "metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            program_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                ProgramMetricWritableDict()).with_outputs('referrals'))

        # Write the metrics to the output tables in BigQuery
        referrals_table = output + '.program_referral_metrics'

        _ = (writable_metrics.referrals
             | f"Write referral metrics to BQ table: {referrals_table}" >>
             beam.io.WriteToBigQuery(
                 table=referrals_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str, reference_view_input: str, static_reference_input: str, output: str, metric_types: List[str], state_code: Optional[str], person_filter_ids: Optional[List[int]]): """Runs the recidivism calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is # necessary because the BuildRootEntity function tries to access attributes # of relationship properties on the SQLAlchemy room_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been # instantiated, then the relationship properties are loaded and their # attributes can be successfully accessed. _ = schema.StatePerson() apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_pipeline_options = apache_beam_pipeline_options.get_all_options() project_id = all_pipeline_options['project'] input_dataset = project_id + '.' + data_input reference_dataset = project_id + '.' + reference_view_input static_reference_dataset = project_id + '.' + static_reference_input person_id_filter_set = set(person_filter_ids) if person_filter_ids else None with beam.Pipeline(options=apache_beam_pipeline_options) as p: # Get StatePersons persons = (p | 'Load Persons' >> BuildRootEntity(dataset=input_dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code)) # Get StateIncarcerationPeriods incarceration_periods = (p | 'Load IncarcerationPeriods' >> BuildRootEntity(dataset=input_dataset, root_entity_class=entities.StateIncarcerationPeriod, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code )) # Get StateSupervisionViolations supervision_violations = \ (p | 'Load SupervisionViolations' >> BuildRootEntity(dataset=input_dataset, root_entity_class=entities.StateSupervisionViolation, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code )) # TODO(#2769): Don't bring this in as a root entity # Get StateSupervisionViolationResponses supervision_violation_responses = \ (p | 'Load SupervisionViolationResponses' >> BuildRootEntity(dataset=input_dataset, root_entity_class=entities.StateSupervisionViolationResponse, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code )) # Group StateSupervisionViolationResponses and # StateSupervisionViolations by person_id supervision_violations_and_responses = ( {'violations': supervision_violations, 'violation_responses': supervision_violation_responses } | 'Group StateSupervisionViolationResponses to ' 'StateSupervisionViolations' >> beam.CoGroupByKey() ) # Set the fully hydrated StateSupervisionViolation entities on # the corresponding StateSupervisionViolationResponses violation_responses_with_hydrated_violations = ( supervision_violations_and_responses | 'Set hydrated StateSupervisionViolations on ' 'the StateSupervisionViolationResponses' >> beam.ParDo(SetViolationOnViolationsResponse())) # Group StateIncarcerationPeriods and StateSupervisionViolationResponses # by person_id incarceration_periods_and_violation_responses = ( {'incarceration_periods': incarceration_periods, 
'violation_responses': violation_responses_with_hydrated_violations} | 'Group StateIncarcerationPeriods to ' 'StateSupervisionViolationResponses' >> beam.CoGroupByKey() ) # Set the fully hydrated StateSupervisionViolationResponse entities on # the corresponding StateIncarcerationPeriods incarceration_periods_with_source_violations = ( incarceration_periods_and_violation_responses | 'Set hydrated StateSupervisionViolationResponses on ' 'the StateIncarcerationPeriods' >> beam.ParDo(SetViolationResponseOnIncarcerationPeriod())) # Group each StatePerson with their StateIncarcerationPeriods person_and_incarceration_periods = ( {'person': persons, 'incarceration_periods': incarceration_periods_with_source_violations} | 'Group StatePerson to StateIncarcerationPeriods' >> beam.CoGroupByKey() ) # Bring in the table that associates people and their county of residence person_id_to_county_kv = (p | 'Load person_id_to_county_kv' >> ImportTableAsKVTuples( dataset_id=reference_dataset, table_id=PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME, table_key='person_id', state_code_filter=state_code, person_id_filter_set=person_id_filter_set )) state_race_ethnicity_population_counts = ( p | 'Load state_race_ethnicity_population_counts' >> ImportTable( dataset_id=static_reference_dataset, table_id='state_race_ethnicity_population_counts', state_code_filter=state_code, person_id_filter_set=None )) # Identify ReleaseEvents events from the StatePerson's StateIncarcerationPeriods person_release_events = ( person_and_incarceration_periods | "ClassifyReleaseEvents" >> beam.ParDo(ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)) ) person_metadata = (persons | "Build the person_metadata dictionary" >> beam.ParDo(BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts))) person_release_events_with_metadata = ( { 'person_events': person_release_events, 'person_metadata': person_metadata } | 'Group ReleaseEvents with person-level metadata' >> beam.CoGroupByKey() | 'Organize StatePerson, PersonMetadata and ReleaseEvents for calculations' >> beam.ParDo(ExtractPersonReleaseEventsMetadata()) ) # Get pipeline job details for accessing job_id all_pipeline_options = apache_beam_pipeline_options.get_all_options() # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f') all_pipeline_options['job_timestamp'] = job_timestamp # Get the type of metric to calculate metric_types_set = set(metric_types) # Get recidivism metrics recidivism_metrics = (person_release_events_with_metadata | 'Get Recidivism Metrics' >> GetRecidivismMetrics( pipeline_options=all_pipeline_options, metric_types=metric_types_set)) if person_id_filter_set: logging.warning("Non-empty person filter set - returning before writing metrics.") return # Convert the metrics into a format that's writable to BQ writable_metrics = (recidivism_metrics | 'Convert to dict to be written to BQ' >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs( ReincarcerationRecidivismMetricType.REINCARCERATION_RATE.value, ReincarcerationRecidivismMetricType.REINCARCERATION_COUNT.value )) # Write the recidivism metrics to the output tables in BigQuery rates_table_id = DATAFLOW_METRICS_TO_TABLES.get(ReincarcerationRecidivismRateMetric) counts_table_id = DATAFLOW_METRICS_TO_TABLES.get(ReincarcerationRecidivismCountMetric) _ = (writable_metrics.REINCARCERATION_RATE | f"Write rate metrics to BQ table: {rates_table_id}" >> beam.io.WriteToBigQuery( table=rates_table_id, dataset=output, 
create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, method=beam.io.WriteToBigQuery.Method.FILE_LOADS )) _ = (writable_metrics.REINCARCERATION_COUNT | f"Write count metrics to BQ table: {counts_table_id}" >> beam.io.WriteToBigQuery( table=counts_table_id, dataset=output, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, method=beam.io.WriteToBigQuery.Method.FILE_LOADS ))
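The hydration steps in this pipeline all follow the same CoGroupByKey shape: several PCollections keyed by person_id are joined, and the child entities are then set on their parents. A self-contained sketch of the pattern, using stand-in dicts instead of the real schema entities:

import apache_beam as beam

with beam.Pipeline() as p:
    violations = p | 'violations' >> beam.Create([(1, {'violation_id': 'v1'})])
    responses = p | 'responses' >> beam.Create([(1, {'response_id': 'r1'})])

    def set_violation_on_response(element):
        person_id, grouped = element
        # grouped maps each dict key to an iterable of the values that shared
        # this person_id across the two input PCollections
        for response in grouped['violation_responses']:
            response['violations'] = list(grouped['violations'])
            yield person_id, response

    ({'violations': violations, 'violation_responses': responses}
     | 'Group by person_id' >> beam.CoGroupByKey()
     | 'Hydrate responses' >> beam.FlatMap(set_violation_on_response)
     | 'Print' >> beam.Map(print))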
def process_datastore_tweets(project, dataset, pipeline_options): """Creates a pipeline that reads tweets from Cloud Datastore from the last N days. The pipeline finds the top most-used words, the top most-tweeted URLs, ranks word co-occurrences by an 'interestingness' metric (similar to tf-idf). """ ts = str(datetime.datetime.utcnow()) p = beam.Pipeline(options=pipeline_options) # Create a query to read entities from datastore. query = make_query('Tweet') # Read entities from Cloud Datastore into a PCollection. lines = (p | 'read from datastore' >> ReadFromDatastore(project, query, None)) global_count = AsSingleton( lines | 'global count' >> beam.combiners.Count.Globally()) # Count the occurrences of each word. percents = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(lambda kv: (kv[0], sum(kv[1]))) | 'in tweets percent' >> beam.Map( lambda kv, gc: (kv[0], float(kv[1]) / gc), global_count)) top_percents = ( percents | 'top 500' >> combiners.Top.Of(500, key=lambda x: x[1])) # Count the occurrences of each expanded url in the tweets url_counts = ( lines | 'geturls' >> (beam.ParDo(URLExtractingDoFn()).with_output_types(str)) | 'urls_pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'urls_group' >> beam.GroupByKey() | 'urls_count' >> beam.Map(lambda kv: (kv[0], sum(kv[1]))) | 'urls top 300' >> combiners.Top.Of(300, key=lambda x: x[1])) # Define some inline helper functions. def join_cinfo(cooccur, percents): """Calculate a co-occurrence ranking.""" import math word1 = cooccur[0][0] word2 = cooccur[0][1] try: word1_percent = percents[word1] weight1 = 1 / word1_percent word2_percent = percents[word2] weight2 = 1 / word2_percent return (cooccur[0], cooccur[1], cooccur[1] * math.log(min(weight1, weight2))) except (KeyError, ZeroDivisionError): # Return a zero-weight tuple (not a bare 0) so the downstream Top.Of on x[2] sees a consistent element shape. return (cooccur[0], cooccur[1], 0.0) def generate_cooccur_schema(): """BigQuery schema for the word co-occurrence table.""" json_str = json.dumps({ 'fields': [{ 'name': 'w1', 'type': 'STRING', 'mode': 'NULLABLE' }, { 'name': 'w2', 'type': 'STRING', 'mode': 'NULLABLE' }, { 'name': 'count', 'type': 'INTEGER', 'mode': 'NULLABLE' }, { 'name': 'log_weight', 'type': 'FLOAT', 'mode': 'NULLABLE' }, { 'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE' }] }) return parse_table_schema_from_json(json_str) def generate_url_schema(): """BigQuery schema for the urls count table.""" json_str = json.dumps({ 'fields': [{ 'name': 'url', 'type': 'STRING', 'mode': 'NULLABLE' }, { 'name': 'count', 'type': 'INTEGER', 'mode': 'NULLABLE' }, { 'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE' }] }) return parse_table_schema_from_json(json_str) def generate_wc_schema(): """BigQuery schema for the word count table.""" json_str = json.dumps({ 'fields': [{ 'name': 'word', 'type': 'STRING', 'mode': 'NULLABLE' }, { 'name': 'percent', 'type': 'FLOAT', 'mode': 'NULLABLE' }, { 'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE' }] }) return parse_table_schema_from_json(json_str) # Now build the rest of the pipeline. # Calculate the word co-occurrence scores. cooccur_rankings = ( lines | 'getcooccur' >> (beam.ParDo(CoOccurExtractingDoFn())) | 'co_pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'co_group' >> beam.GroupByKey() | 'co_count' >> beam.Map(lambda kv: (kv[0], sum(kv[1]))) | 'weights' >> beam.Map(join_cinfo, AsDict(percents)) | 'co top 300' >> combiners.Top.Of(300, key=lambda x: x[2])) # Format the counts into a PCollection of strings. 
wc_records = top_percents | 'format' >> beam.FlatMap( lambda x: [{ 'word': xx[0], 'percent': xx[1], 'ts': ts } for xx in x]) url_records = url_counts | 'urls_format' >> beam.FlatMap( lambda x: [{ 'url': xx[0], 'count': xx[1], 'ts': ts } for xx in x]) co_records = cooccur_rankings | 'co_format' >> beam.FlatMap( lambda x: [{ 'w1': xx[0][0], 'w2': xx[0][1], 'count': xx[1], 'log_weight': xx[2], 'ts': ts } for xx in x]) # Write the results to three BigQuery tables. wc_records | 'wc_write_bq' >> beam.io.Write( beam.io.BigQuerySink( '%s:%s.word_counts' % (project, dataset), schema=generate_wc_schema(), create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) url_records | 'urls_write_bq' >> beam.io.Write( beam.io.BigQuerySink( '%s:%s.urls' % (project, dataset), schema=generate_url_schema(), create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) co_records | 'co_write_bq' >> beam.io.Write( beam.io.BigQuerySink( '%s:%s.word_cooccur' % (project, dataset), schema=generate_cooccur_schema(), create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) # Actually run the pipeline. return p.run()
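The tweet pipeline also shows the AsSingleton variant of side inputs: a one-element PCollection (the global tweet count) is passed into a Map so each per-word count can be expressed as a fraction of the total. A compact, runnable sketch of the same shape:

import apache_beam as beam
from apache_beam.pvalue import AsSingleton

with beam.Pipeline() as p:
    words = p | beam.Create(['a', 'b', 'a', 'c'])
    # One-element PCollection holding the total count, wrapped for side-input use
    total = AsSingleton(words | 'global count' >> beam.combiners.Count.Globally())
    (words
     | 'pair_with_one' >> beam.Map(lambda w: (w, 1))
     | 'sum per word' >> beam.CombinePerKey(sum)
     | 'percent' >> beam.Map(lambda kv, gc: (kv[0], kv[1] / gc), total)
     | beam.Map(print))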
def testClassifyProgramAssignments(self): """Tests the ClassifyProgramAssignments DoFn.""" fake_person_id = 12345 fake_person = entities.StatePerson.new_with_defaults( person_id=fake_person_id, gender=Gender.MALE, birthdate=date(1970, 1, 1), residency_status=ResidencyStatus.PERMANENT ) program_assignment = entities.StateProgramAssignment.new_with_defaults( state_code='US_XX', program_id='PG3', program_location_id='XYZ', referral_date=date(2009, 10, 3), participation_status=StateProgramAssignmentParticipationStatus.IN_PROGRESS, start_date=date(2009, 10, 19) ) assessment = entities.StateAssessment.new_with_defaults( state_code='US_XX', assessment_type=StateAssessmentType.ORAS, assessment_score=33, assessment_date=date(2009, 7, 10) ) supervision_period = \ entities.StateSupervisionPeriod.new_with_defaults( supervision_period_id=111, status=StateSupervisionPeriodStatus.TERMINATED, state_code='US_XX', start_date=date(2008, 3, 5), supervision_type=StateSupervisionType.PAROLE ) person_periods = {'person': [fake_person], 'program_assignments': [program_assignment], 'assessments': [assessment], 'supervision_periods': [supervision_period] } program_events = [ProgramReferralEvent( state_code=program_assignment.state_code, program_id=program_assignment.program_id, event_date=program_assignment.referral_date, participation_status=program_assignment.participation_status, assessment_score=33, assessment_type=StateAssessmentType.ORAS, supervision_type=supervision_period.supervision_type, supervising_officer_external_id='OFFICER0009', supervising_district_external_id='10' ), ProgramParticipationEvent( state_code=program_assignment.state_code, program_id=program_assignment.program_id, program_location_id=program_assignment.program_location_id, event_date=date.today(), is_first_day_in_program=True, supervision_type=supervision_period.supervision_type )] correct_output = [(fake_person.person_id, (fake_person, program_events))] test_pipeline = TestPipeline() supervision_period_to_agent_map = { 'agent_id': 1010, 'agent_external_id': 'OFFICER0009', 'district_external_id': '10', 'supervision_period_id': supervision_period.supervision_period_id } supervision_period_to_agent_associations = ( test_pipeline | 'Create SupervisionPeriod to Agent table' >> beam.Create([supervision_period_to_agent_map]) ) supervision_periods_to_agent_associations_as_kv = ( supervision_period_to_agent_associations | 'Convert SupervisionPeriod to Agent table to KV tuples' >> beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id') ) output = (test_pipeline | beam.Create([(fake_person_id, person_periods)]) | 'Identify Program Events' >> beam.ParDo( pipeline.ClassifyProgramAssignments(), AsDict(supervision_periods_to_agent_associations_as_kv)) ) assert_that(output, equal_to(correct_output)) test_pipeline.run()
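ConvertDictToKVTuple itself is never shown in these snippets. Judging from every call site (beam.ParDo(ConvertDictToKVTuple(), '<column>') applied to BigQuery row dicts), a plausible reconstruction is the DoFn below; treat it as a hypothetical sketch inferred from usage, not the project's actual implementation.

import apache_beam as beam

class ConvertDictToKVTuple(beam.DoFn):
    """Hypothetical: keys each row dict on the named column."""

    def process(self, element, key_field):
        # key_field arrives as the extra positional argument given to beam.ParDo
        if element.get(key_field) is not None:
            yield element[key_field], element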
def run(argv=None): """Runs the supervision calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties # are loaded and their attributes can be successfully accessed. _ = schema.StatePerson() # Parse command-line arguments known_args, pipeline_args = parse_arguments(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_pipeline_options = pipeline_options.get_all_options() input_dataset = all_pipeline_options['project'] + '.' + known_args.input reference_dataset = all_pipeline_options['project'] + '.' + \ known_args.reference_input with beam.Pipeline(argv=pipeline_args) as p: # Get StatePersons persons = (p | 'Load Persons' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StatePerson, root_entity_class=entities.StatePerson, unifying_id_field='person_id', build_related_entities=True)) # Get StateIncarcerationPeriods incarceration_periods = ( p | 'Load IncarcerationPeriods' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateIncarcerationPeriod, root_entity_class=entities.StateIncarcerationPeriod, unifying_id_field='person_id', build_related_entities=True)) # Get StateSupervisionViolations supervision_violations = ( p | 'Load SupervisionViolations' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateSupervisionViolation, root_entity_class=entities.StateSupervisionViolation, unifying_id_field='person_id', build_related_entities=True)) # TODO(2769): Don't bring this in as a root entity # Get StateSupervisionViolationResponses supervision_violation_responses = ( p | 'Load SupervisionViolationResponses' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateSupervisionViolationResponse, root_entity_class=entities.StateSupervisionViolationResponse, unifying_id_field='person_id', build_related_entities=True)) # Get StateSupervisionSentences supervision_sentences = ( p | 'Load SupervisionSentences' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateSupervisionSentence, root_entity_class=entities.StateSupervisionSentence, unifying_id_field='person_id', build_related_entities=True)) # Get StateIncarcerationSentences incarceration_sentences = ( p | 'Load IncarcerationSentences' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateIncarcerationSentence, root_entity_class=entities.StateIncarcerationSentence, unifying_id_field='person_id', build_related_entities=True)) # Get StateSupervisionPeriods supervision_periods = ( p | 'Load SupervisionPeriods' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateSupervisionPeriod, root_entity_class=entities.StateSupervisionPeriod, unifying_id_field='person_id', build_related_entities=True)) # Get StateAssessments assessments = (p | 'Load Assessments' >> BuildRootEntity( dataset=input_dataset, data_dict=None, root_schema_class=schema.StateAssessment, root_entity_class=entities.StateAssessment, unifying_id_field='person_id', build_related_entities=False)) # Bring in the table that associates StateSupervisionViolationResponses 
to information about StateAgents ssvr_to_agent_association_query = f"SELECT * FROM `{reference_dataset}.ssvr_to_agent_association`" ssvr_to_agent_associations = ( p | "Read SSVR to Agent table from BigQuery" >> beam.io.Read( beam.io.BigQuerySource(query=ssvr_to_agent_association_query, use_standard_sql=True))) # Convert the association table rows into key-value tuples with the value for the # supervision_violation_response_id column as the key ssvr_agent_associations_as_kv = ( ssvr_to_agent_associations | 'Convert SSVR to Agent table to KV tuples' >> beam.ParDo( ConvertDictToKVTuple(), 'supervision_violation_response_id')) supervision_period_to_agent_association_query = f"SELECT * FROM `{reference_dataset}." \ f"supervision_period_to_agent_association`" supervision_period_to_agent_associations = ( p | "Read Supervision Period to Agent table from BigQuery" >> beam.io.Read( beam.io.BigQuerySource( query=supervision_period_to_agent_association_query, use_standard_sql=True))) # Convert the association table rows into key-value tuples with the value for the supervision_period_id column # as the key supervision_period_to_agent_associations_as_kv = ( supervision_period_to_agent_associations | 'Convert Supervision Period to Agent table to KV tuples' >> beam.ParDo(ConvertDictToKVTuple(), 'supervision_period_id')) # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id supervision_violations_and_responses = ( { 'violations': supervision_violations, 'violation_responses': supervision_violation_responses } | 'Group StateSupervisionViolationResponses to ' 'StateSupervisionViolations' >> beam.CoGroupByKey()) # Set the fully hydrated StateSupervisionViolation entities on the corresponding # StateSupervisionViolationResponses violation_responses_with_hydrated_violations = ( supervision_violations_and_responses | 'Set hydrated StateSupervisionViolations on ' 'the StateSupervisionViolationResponses' >> beam.ParDo( SetViolationOnViolationsResponse())) # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id incarceration_periods_and_violation_responses = ( { 'incarceration_periods': incarceration_periods, 'violation_responses': violation_responses_with_hydrated_violations } | 'Group StateIncarcerationPeriods to ' 'StateSupervisionViolationResponses' >> beam.CoGroupByKey()) # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding # StateIncarcerationPeriods incarceration_periods_with_source_violations = ( incarceration_periods_and_violation_responses | 'Set hydrated StateSupervisionViolationResponses on ' 'the StateIncarcerationPeriods' >> beam.ParDo( SetViolationResponseOnIncarcerationPeriod())) # Group each StatePerson with their StateIncarcerationPeriods and StateSupervisionSentences person_periods_and_sentences = ( { 'person': persons, 'assessments': assessments, 'incarceration_periods': incarceration_periods_with_source_violations, 'supervision_periods': supervision_periods, 'supervision_sentences': supervision_sentences, 'incarceration_sentences': incarceration_sentences, 'violation_responses': violation_responses_with_hydrated_violations } | 'Group StatePerson to all entities' >> beam.CoGroupByKey()) # The state_code to run calculations on state_code = known_args.state_code identifier_options = {'state_code': state_code} # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods person_time_buckets = ( person_periods_and_sentences | 'Get 
SupervisionTimeBuckets' >> beam.ParDo( ClassifySupervisionTimeBuckets(), AsDict(ssvr_agent_associations_as_kv), AsDict(supervision_period_to_agent_associations_as_kv), ** identifier_options)) # Get dimensions to include and methodologies to use inclusions, _ = dimensions_and_methodologies(known_args) # Get pipeline job details for accessing job_id all_pipeline_options = pipeline_options.get_all_options() # Get the type of metric to calculate metric_type = known_args.metric_type # The number of months to limit the monthly calculation output to calculation_month_limit = known_args.calculation_month_limit # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime( '%Y-%m-%d_%H_%M_%S.%f') all_pipeline_options['job_timestamp'] = job_timestamp # Get supervision metrics supervision_metrics = ( person_time_buckets | 'Get Supervision Metrics' >> GetSupervisionMetrics( pipeline_options=all_pipeline_options, inclusions=inclusions, metric_type=metric_type, calculation_month_limit=calculation_month_limit)) # Convert the metrics into a format that's writable to BQ writable_metrics = ( supervision_metrics | 'Convert to dict to be written to BQ' >> beam.ParDo(SupervisionMetricWritableDict()).with_outputs( 'populations', 'revocations', 'successes', 'assessment_changes', 'revocation_analyses', 'revocation_violation_type_analyses')) # Write the metrics to the output tables in BigQuery populations_table = known_args.output + '.supervision_population_metrics' revocations_table = known_args.output + '.supervision_revocation_metrics' successes_table = known_args.output + '.supervision_success_metrics' assessment_changes_table = known_args.output + '.terminated_supervision_assessment_score_change_metrics' revocation_analysis_table = known_args.output + '.supervision_revocation_analysis_metrics' revocation_violation_type_analysis_table = known_args.output + \ '.supervision_revocation_violation_type_analysis_metrics' _ = (writable_metrics.populations | f"Write population metrics to BQ table: {populations_table}" >> beam.io.WriteToBigQuery( table=populations_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = (writable_metrics.revocations | f"Write revocation metrics to BQ table: {revocations_table}" >> beam.io.WriteToBigQuery( table=revocations_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = (writable_metrics.successes | f"Write success metrics to BQ table: {successes_table}" >> beam.io.WriteToBigQuery( table=successes_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = ( writable_metrics.assessment_changes | f"Write assessment change metrics to BQ table: {assessment_changes_table}" >> beam.io.WriteToBigQuery( table=assessment_changes_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = ( writable_metrics.revocation_analyses | f"Write revocation analyses metrics to BQ table: {revocation_analysis_table}" >> beam.io.WriteToBigQuery( table=revocation_analysis_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) _ = (writable_metrics.revocation_violation_type_analyses | f"Write revocation violation type analyses metrics to BQ table: " f"{revocation_violation_type_analysis_table}" >> 
beam.io.WriteToBigQuery( table=revocation_violation_type_analysis_table, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
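The ClassifySupervisionTimeBuckets step above illustrates that a single ParDo can take several deferred side inputs at once, plus ordinary keyword arguments (the ** identifier_options expansion). A minimal runnable sketch of that call shape; Classify and all data here are illustrative:

import apache_beam as beam
from apache_beam.pvalue import AsDict

class Classify(beam.DoFn):
    # Mirrors the call shape only: two dict side inputs plus a keyword argument
    def process(self, element, ssvr_agents, period_agents, state_code=None):
        key, payload = element
        yield key, (payload, ssvr_agents.get(key), period_agents.get(key), state_code)

with beam.Pipeline() as p:
    buckets = p | 'buckets' >> beam.Create([(1, 'time-bucket')])
    ssvr = p | 'ssvr' >> beam.Create([(1, 'agent-a')])
    periods = p | 'periods' >> beam.Create([(1, 'agent-b')])
    (buckets
     | beam.ParDo(Classify(), AsDict(ssvr), AsDict(periods), state_code='US_XX')
     | beam.Map(print))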
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str, reference_input: str, output: str, metric_types: List[str], state_code: Optional[str], person_filter_ids: Optional[List[int]]): """Runs the recidivism calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is # necessary because the BuildRootEntity function tries to access attributes # of relationship properties on the SQLAlchemy room_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been # instantiated, then the relationship properties are loaded and their # attributes can be successfully accessed. _ = schema.StatePerson() apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_pipeline_options = apache_beam_pipeline_options.get_all_options() query_dataset = all_pipeline_options['project'] + '.' + data_input reference_dataset = all_pipeline_options['project'] + '.' + reference_input person_id_filter_set = set( person_filter_ids) if person_filter_ids else None with beam.Pipeline(options=apache_beam_pipeline_options) as p: # Get StatePersons persons = ( p | 'Load Persons' >> BuildRootEntity( dataset=query_dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set)) # Get StateIncarcerationPeriods incarceration_periods = ( p | 'Load IncarcerationPeriods' >> BuildRootEntity( dataset=query_dataset, root_entity_class=entities.StateIncarcerationPeriod, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code)) # Get StateSupervisionViolations supervision_violations = \ (p | 'Load SupervisionViolations' >> BuildRootEntity(dataset=query_dataset, root_entity_class=entities.StateSupervisionViolation, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code )) # TODO(2769): Don't bring this in as a root entity # Get StateSupervisionViolationResponses supervision_violation_responses = \ (p | 'Load SupervisionViolationResponses' >> BuildRootEntity(dataset=query_dataset, root_entity_class=entities.StateSupervisionViolationResponse, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code )) # Group StateSupervisionViolationResponses and # StateSupervisionViolations by person_id supervision_violations_and_responses = ( { 'violations': supervision_violations, 'violation_responses': supervision_violation_responses } | 'Group StateSupervisionViolationResponses to ' 'StateSupervisionViolations' >> beam.CoGroupByKey()) # Set the fully hydrated StateSupervisionViolation entities on # the corresponding StateSupervisionViolationResponses violation_responses_with_hydrated_violations = ( supervision_violations_and_responses | 'Set hydrated StateSupervisionViolations on ' 'the StateSupervisionViolationResponses' >> beam.ParDo( SetViolationOnViolationsResponse())) # Group StateIncarcerationPeriods and StateSupervisionViolationResponses # by person_id incarceration_periods_and_violation_responses = ( { 'incarceration_periods': incarceration_periods, 'violation_responses': violation_responses_with_hydrated_violations } | 'Group StateIncarcerationPeriods to ' 
'StateSupervisionViolationResponses' >> beam.CoGroupByKey()) # Set the fully hydrated StateSupervisionViolationResponse entities on # the corresponding StateIncarcerationPeriods incarceration_periods_with_source_violations = ( incarceration_periods_and_violation_responses | 'Set hydrated StateSupervisionViolationResponses on ' 'the StateIncarcerationPeriods' >> beam.ParDo( SetViolationResponseOnIncarcerationPeriod())) # Group each StatePerson with their StateIncarcerationPeriods person_and_incarceration_periods = ( { 'person': persons, 'incarceration_periods': incarceration_periods_with_source_violations } | 'Group StatePerson to StateIncarcerationPeriods' >> beam.CoGroupByKey()) # Bring in the table that associates people and their county of residence person_id_to_county_query = select_all_by_person_query( reference_dataset, PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME, # TODO(3602): Once we put state_code on StatePerson objects, we can update the # persons_to_recent_county_of_residence query to have a state_code field, allowing us to also filter the # output by state_code. state_code_filter=None, person_id_filter_set=person_id_filter_set) person_id_to_county_kv = ( p | "Read person_id to county associations from BigQuery" >> beam.io.Read( beam.io.BigQuerySource(query=person_id_to_county_query, use_standard_sql=True)) | "Convert person_id to county association table to KV" >> beam.ParDo(ConvertDictToKVTuple(), 'person_id')) # Identify ReleaseEvents events from the StatePerson's # StateIncarcerationPeriods person_events = ( person_and_incarceration_periods | "ClassifyReleaseEvents" >> beam.ParDo( ClassifyReleaseEvents(), AsDict(person_id_to_county_kv))) # Get pipeline job details for accessing job_id all_pipeline_options = apache_beam_pipeline_options.get_all_options() # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime( '%Y-%m-%d_%H_%M_%S.%f') all_pipeline_options['job_timestamp'] = job_timestamp # Get the type of metric to calculate metric_types_set = set(metric_types) # Get recidivism metrics recidivism_metrics = (person_events | 'Get Recidivism Metrics' >> GetRecidivismMetrics( pipeline_options=all_pipeline_options, metric_types=metric_types_set)) if person_id_filter_set: logging.warning( "Non-empty person filter set - returning before writing metrics." ) return # Convert the metrics into a format that's writable to BQ writable_metrics = ( recidivism_metrics | 'Convert to dict to be written to BQ' >> beam.ParDo( RecidivismMetricWritableDict()).with_outputs( 'rates', 'counts')) # Write the recidivism metrics to the output tables in BigQuery rates_table_id = DATAFLOW_METRICS_TO_TABLES.get( ReincarcerationRecidivismRateMetric) counts_table_id = DATAFLOW_METRICS_TO_TABLES.get( ReincarcerationRecidivismCountMetric) _ = (writable_metrics.rates | f"Write rate metrics to BQ table: {rates_table_id}" >> beam.io.WriteToBigQuery( table=rates_table_id, dataset=output, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, method=beam.io.WriteToBigQuery.Method.FILE_LOADS)) _ = (writable_metrics.counts | f"Write count metrics to BQ table: {counts_table_id}" >> beam.io.WriteToBigQuery( table=counts_table_id, dataset=output, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
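These pipelines read BigQuery with beam.io.Read(beam.io.BigQuerySource(...)), an API that newer Beam releases deprecate in favor of beam.io.ReadFromBigQuery. If porting, the read-then-key steps above would look roughly like this sketch (the query text is a placeholder, and a real run needs GCP credentials and a temp location):

import apache_beam as beam

with beam.Pipeline() as p:
    person_id_to_county_kv = (
        p
        | 'Read from BigQuery' >> beam.io.ReadFromBigQuery(
            query='SELECT person_id, county_of_residence FROM `project.dataset.view`',
            use_standard_sql=True)
        | 'Convert to KV' >> beam.Map(lambda row: (row['person_id'], row)))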
def run(apache_beam_pipeline_options: PipelineOptions, data_input: str, reference_input: str, output: str, calculation_month_count: int, metric_types: List[str], state_code: Optional[str], calculation_end_month: Optional[str], person_filter_ids: Optional[List[int]]): """Runs the incarceration calculation pipeline.""" # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties # are loaded and their attributes can be successfully accessed. _ = schema.StatePerson() apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True # Get pipeline job details all_apache_beam_pipeline_options = apache_beam_pipeline_options.get_all_options( ) query_dataset = all_apache_beam_pipeline_options[ 'project'] + '.' + data_input reference_dataset = all_apache_beam_pipeline_options[ 'project'] + '.' + reference_input person_id_filter_set = set( person_filter_ids) if person_filter_ids else None with beam.Pipeline(options=apache_beam_pipeline_options) as p: # Get StatePersons persons = (p | 'Load StatePersons' >> BuildRootEntity( dataset=query_dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set)) # Get StateSentenceGroups sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity( dataset=query_dataset, root_entity_class=entities.StateSentenceGroup, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code)) # Get StateIncarcerationSentences incarceration_sentences = ( p | 'Load StateIncarcerationSentences' >> BuildRootEntity( dataset=query_dataset, root_entity_class=entities.StateIncarcerationSentence, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code)) # Get StateSupervisionSentences supervision_sentences = ( p | 'Load StateSupervisionSentences' >> BuildRootEntity( dataset=query_dataset, root_entity_class=entities.StateSupervisionSentence, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=person_id_filter_set, state_code=state_code)) if state_code is None or state_code == 'US_MO': # Bring in the reference table that includes sentence status ranking information us_mo_sentence_status_query = select_all_by_person_query( reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME, state_code, person_id_filter_set) us_mo_sentence_statuses = ( p | "Read MO sentence status table from BigQuery" >> beam.io.Read( beam.io.BigQuerySource(query=us_mo_sentence_status_query, use_standard_sql=True))) else: us_mo_sentence_statuses = ( p | f"Generate empty MO statuses list for non-MO state run: {state_code} " >> beam.Create([])) us_mo_sentence_status_rankings_as_kv = ( us_mo_sentence_statuses | 'Convert MO sentence status ranking table to KV tuples' >> beam.ParDo(ConvertDictToKVTuple(), 'person_id')) supervision_sentences_and_statuses = ( { 'incarceration_sentences': incarceration_sentences, 'supervision_sentences': supervision_sentences, 'sentence_statuses': us_mo_sentence_status_rankings_as_kv } | 'Group sentences to the 
sentence statuses for that person' >> beam.CoGroupByKey()) sentences_converted = ( supervision_sentences_and_statuses | 'Convert to state-specific sentences' >> beam.ParDo( ConvertSentencesToStateSpecificType()).with_outputs( 'incarceration_sentences', 'supervision_sentences')) sentences_and_sentence_groups = ( { 'sentence_groups': sentence_groups, 'incarceration_sentences': sentences_converted.incarceration_sentences, 'supervision_sentences': sentences_converted.supervision_sentences } | 'Group sentences to sentence groups' >> beam.CoGroupByKey()) # Set hydrated sentences on the corresponding sentence groups sentence_groups_with_hydrated_sentences = ( sentences_and_sentence_groups | 'Set hydrated sentences on sentence groups' >> beam.ParDo( SetSentencesOnSentenceGroup())) # Bring in the table that associates people and their county of residence person_id_to_county_query = select_all_by_person_query( reference_dataset, PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME, # TODO(3602): Once we put state_code on StatePerson objects, we can update the # persons_to_recent_county_of_residence query to have a state_code field, allowing us to also filter the # output by state_code. state_code_filter=None, person_id_filter_set=person_id_filter_set) person_id_to_county_kv = ( p | "Read person_id to county associations from BigQuery" >> beam.io.Read( beam.io.BigQuerySource(query=person_id_to_county_query, use_standard_sql=True)) | "Convert person_id to county association table to KV" >> beam.ParDo(ConvertDictToKVTuple(), 'person_id')) # Bring in the judicial districts associated with incarceration_periods ip_to_judicial_district_query = select_all_by_person_query( reference_dataset, INCARCERATION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME, state_code, person_id_filter_set) ip_to_judicial_district_kv = ( p | "Read incarceration_period to judicial_district associations from BigQuery" >> beam.io.Read( beam.io.BigQuerySource(query=ip_to_judicial_district_query, use_standard_sql=True)) | "Convert incarceration_period to judicial_district association table to KV" >> beam.ParDo(ConvertDictToKVTuple(), 'person_id')) # Group each StatePerson with their related entities person_entities = ( { 'person': persons, 'sentence_groups': sentence_groups_with_hydrated_sentences, 'incarceration_period_judicial_district_association': ip_to_judicial_district_kv } | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey()) # Identify IncarcerationEvents events from the StatePerson's StateIncarcerationPeriods person_events = ( person_entities | 'Classify Incarceration Events' >> beam.ParDo( ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv))) # Get pipeline job details for accessing job_id all_pipeline_options = apache_beam_pipeline_options.get_all_options() # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime( '%Y-%m-%d_%H_%M_%S.%f') all_apache_beam_pipeline_options['job_timestamp'] = job_timestamp # Get the type of metric to calculate metric_types_set = set(metric_types) # Get IncarcerationMetrics incarceration_metrics = ( person_events | 'Get Incarceration Metrics' >> GetIncarcerationMetrics( pipeline_options=all_pipeline_options, metric_types=metric_types_set, calculation_end_month=calculation_end_month, calculation_month_count=calculation_month_count)) if person_id_filter_set: logging.warning( "Non-empty person filter set - returning before writing metrics." 
) return # Convert the metrics into a format that's writable to BQ writable_metrics = ( incarceration_metrics | 'Convert to dict to be written to BQ' >> beam.ParDo(RecidivizMetricWritableDict()).with_outputs( IncarcerationMetricType.INCARCERATION_ADMISSION.value, IncarcerationMetricType.INCARCERATION_POPULATION.value, IncarcerationMetricType.INCARCERATION_RELEASE.value)) # Write the metrics to the output tables in BigQuery admissions_table_id = DATAFLOW_METRICS_TO_TABLES.get( IncarcerationAdmissionMetric) population_table_id = DATAFLOW_METRICS_TO_TABLES.get( IncarcerationPopulationMetric) releases_table_id = DATAFLOW_METRICS_TO_TABLES.get( IncarcerationReleaseMetric) _ = (writable_metrics.INCARCERATION_ADMISSION | f"Write admission metrics to BQ table: {admissions_table_id}" >> beam.io.WriteToBigQuery( table=admissions_table_id, dataset=output, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, method=beam.io.WriteToBigQuery.Method.FILE_LOADS)) _ = (writable_metrics.INCARCERATION_POPULATION | f"Write population metrics to BQ table: {population_table_id}" >> beam.io.WriteToBigQuery( table=population_table_id, dataset=output, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, method=beam.io.WriteToBigQuery.Method.FILE_LOADS)) _ = (writable_metrics.INCARCERATION_RELEASE | f"Write release metrics to BQ table: {releases_table_id}" >> beam.io.WriteToBigQuery( table=releases_table_id, dataset=output, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
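The write stage relies on tagged outputs: the writable-dict DoFn emits each metric under a tag derived from its metric type, .with_outputs(...) declares the tags, and each tag becomes an attribute routed to its own BigQuery table. A runnable miniature of that routing, with a stand-in DoFn and plain string tags instead of the metric-type enums:

import apache_beam as beam
from apache_beam import pvalue

class MetricWritableDict(beam.DoFn):
    # Stand-in for the *MetricWritableDict DoFns: tag each element by its type
    def process(self, metric):
        yield pvalue.TaggedOutput(metric['type'], metric)

with beam.Pipeline() as p:
    metrics = p | beam.Create([{'type': 'INCARCERATION_ADMISSION', 'value': 1},
                               {'type': 'INCARCERATION_RELEASE', 'value': 2}])
    tagged = metrics | beam.ParDo(MetricWritableDict()).with_outputs(
        'INCARCERATION_ADMISSION', 'INCARCERATION_RELEASE')
    # Each tag is now addressable; the real pipelines point these at WriteToBigQuery
    tagged.INCARCERATION_ADMISSION | 'admissions' >> beam.Map(print)
    tagged.INCARCERATION_RELEASE | 'releases' >> beam.Map(print)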
def run_test_pipeline( self, dataset: str, fake_supervision_period_id: int, unifying_id_field_filter_set: Optional[Set[int]] = None, metric_types_filter: Optional[Set[str]] = None): """Runs a test version of the program pipeline.""" test_pipeline = TestPipeline() # Get StatePersons persons = ( test_pipeline | 'Load Persons' >> # type: ignore extractor_utils.BuildRootEntity( dataset=dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True)) # Get StateProgramAssignments program_assignments = ( test_pipeline | 'Load Program Assignments' >> # type: ignore extractor_utils.BuildRootEntity( dataset=dataset, root_entity_class=entities.StateProgramAssignment, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=unifying_id_field_filter_set)) # Get StateAssessments assessments = ( test_pipeline | 'Load Assessments' >> # type: ignore extractor_utils.BuildRootEntity( dataset=dataset, root_entity_class=entities.StateAssessment, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=False, unifying_id_field_filter_set=unifying_id_field_filter_set)) # Get StateSupervisionPeriods supervision_periods = ( test_pipeline | 'Load SupervisionPeriods' >> # type: ignore extractor_utils.BuildRootEntity( dataset=dataset, root_entity_class=entities.StateSupervisionPeriod, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=False, unifying_id_field_filter_set=unifying_id_field_filter_set)) supervision_period_to_agent_map = { 'agent_id': 1010, 'agent_external_id': 'OFFICER0009', 'district_external_id': '10', 'supervision_period_id': fake_supervision_period_id } supervision_period_to_agent_associations = ( test_pipeline | 'Create SupervisionPeriod to Agent table' >> beam.Create( [supervision_period_to_agent_map])) supervision_period_to_agent_associations_as_kv = ( supervision_period_to_agent_associations | 'Convert SupervisionPeriod to Agent table to KV tuples' >> beam.ParDo(pipeline.ConvertDictToKVTuple(), 'supervision_period_id')) # Group each StatePerson with their other entities persons_entities = ({ 'person': persons, 'program_assignments': program_assignments, 'assessments': assessments, 'supervision_periods': supervision_periods } | 'Group StatePerson to StateProgramAssignments and' >> beam.CoGroupByKey()) # Identify ProgramEvents from the StatePerson's # StateProgramAssignments person_program_events = ( persons_entities | beam.ParDo( pipeline.ClassifyProgramAssignments(), AsDict(supervision_period_to_agent_associations_as_kv))) # Get pipeline job details for accessing job_id all_pipeline_options = PipelineOptions().get_all_options() # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime( '%Y-%m-%d_%H_%M_%S.%f') all_pipeline_options['job_timestamp'] = job_timestamp metric_types = metric_types_filter if metric_types_filter else {'ALL'} # Get program metrics program_metrics = ( person_program_events | 'Get Program Metrics' >> # type: ignore pipeline.GetProgramMetrics(pipeline_options=all_pipeline_options, metric_types=metric_types, calculation_end_month=None, calculation_month_count=-1)) assert_that(program_metrics, AssertMatchers.validate_pipeline_test()) test_pipeline.run()
def run_test_pipeline( fake_person_id: int, state_code: str, dataset: str, expected_metric_types: Set[IncarcerationMetricType], allow_empty: bool = False, unifying_id_field_filter_set: Optional[Set[int]] = None, metric_types_filter: Optional[Set[str]] = None): """Runs a test version of the incarceration pipeline.""" test_pipeline = TestPipeline() # Get StatePersons persons = ( test_pipeline | 'Load Persons' >> # type: ignore extractor_utils.BuildRootEntity( dataset=dataset, root_entity_class=entities.StatePerson, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True)) # Get StateSentenceGroups sentence_groups = ( test_pipeline | 'Load StateSentenceGroups' >> # type: ignore extractor_utils.BuildRootEntity( dataset=dataset, root_entity_class=entities.StateSentenceGroup, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=unifying_id_field_filter_set)) # Get StateIncarcerationSentences incarceration_sentences = ( test_pipeline | 'Load StateIncarcerationSentences' >> # type: ignore extractor_utils.BuildRootEntity( dataset=dataset, root_entity_class=entities.StateIncarcerationSentence, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=unifying_id_field_filter_set)) # Get StateSupervisionSentences supervision_sentences = ( test_pipeline | 'Load StateSupervisionSentences' >> # type: ignore extractor_utils.BuildRootEntity( dataset=dataset, root_entity_class=entities.StateSupervisionSentence, unifying_id_field=entities.StatePerson.get_class_id_name(), build_related_entities=True, unifying_id_field_filter_set=unifying_id_field_filter_set)) us_mo_sentence_status_rows: List[Dict[str, Any]] = [{ 'person_id': fake_person_id, 'sentence_external_id': 'XXX', 'sentence_status_external_id': 'YYY', 'status_code': 'ZZZ', 'status_date': 'not_a_date', 'status_description': 'XYZ' }] us_mo_sentence_statuses = (test_pipeline | 'Create MO sentence statuses' >> beam.Create(us_mo_sentence_status_rows)) us_mo_sentence_status_rankings_as_kv = ( us_mo_sentence_statuses | 'Convert sentence status ranking table to KV tuples' >> beam.ParDo(ConvertDictToKVTuple(), 'person_id')) sentences_and_statuses = ( { 'incarceration_sentences': incarceration_sentences, 'supervision_sentences': supervision_sentences, 'sentence_statuses': us_mo_sentence_status_rankings_as_kv } | 'Group sentences to the sentence statuses for that person' >> beam.CoGroupByKey()) sentences_converted = ( sentences_and_statuses | 'Convert to state-specific sentences' >> beam.ParDo( ConvertSentencesToStateSpecificType()).with_outputs( 'incarceration_sentences', 'supervision_sentences')) sentences_and_sentence_groups = ( { 'sentence_groups': sentence_groups, 'incarceration_sentences': sentences_converted.incarceration_sentences, 'supervision_sentences': sentences_converted.supervision_sentences } | 'Group sentences to sentence groups' >> beam.CoGroupByKey()) sentence_groups_with_hydrated_sentences = ( sentences_and_sentence_groups | 'Set hydrated sentences on sentence groups' >> beam.ParDo( SetSentencesOnSentenceGroup())) # Identify IncarcerationEvents events from the StatePerson's # StateIncarcerationPeriods fake_person_id_to_county_query_result = [{ 'person_id': fake_person_id, 'county_of_residence': _COUNTY_OF_RESIDENCE }] person_id_to_county_kv = ( test_pipeline | "Read person id to county associations from BigQuery" >> beam.Create(fake_person_id_to_county_query_result) | "Convert person_id to 
counties to KV" >> beam.ParDo( ConvertDictToKVTuple(), 'person_id')) incarceration_period_judicial_district_association_row = \ {'person_id': fake_person_id, 'incarceration_period_id': 123, 'judicial_district_code': 'NW'} ip_to_judicial_district_kv = ( test_pipeline | "Read incarceration_period to judicial_district associations from BigQuery" >> beam.Create( [incarceration_period_judicial_district_association_row]) | "Convert ips to judicial districts to KV" >> beam.ParDo( ConvertDictToKVTuple(), 'person_id')) state_race_ethnicity_population_count = { 'state_code': state_code, 'race_or_ethnicity': 'BLACK', 'population_count': 1, 'representation_priority': 1 } state_race_ethnicity_population_counts = ( test_pipeline | 'Create state_race_ethnicity_population_count table' >> beam.Create([state_race_ethnicity_population_count])) # Group each StatePerson with their related entities person_entities = ( { 'person': persons, 'sentence_groups': sentence_groups_with_hydrated_sentences, 'incarceration_period_judicial_district_association': ip_to_judicial_district_kv } | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey()) # Identify IncarcerationEvents events from the StatePerson's StateIncarcerationPeriods person_incarceration_events = ( person_entities | 'Classify Incarceration Events' >> beam.ParDo( pipeline.ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv))) person_metadata = ( persons | "Build the person_metadata dictionary" >> beam.ParDo( BuildPersonMetadata(), AsList(state_race_ethnicity_population_counts))) person_incarceration_events_with_metadata = ( { 'person_events': person_incarceration_events, 'person_metadata': person_metadata } | 'Group IncarcerationEvents with person-level metadata' >> beam.CoGroupByKey() | 'Organize StatePerson, PersonMetadata and IncarcerationEvents for calculations' >> beam.ParDo(ExtractPersonEventsMetadata())) # Get pipeline job details for accessing job_id all_pipeline_options = PipelineOptions().get_all_options() # Add timestamp for local jobs job_timestamp = datetime.datetime.now().strftime( '%Y-%m-%d_%H_%M_%S.%f') all_pipeline_options['job_timestamp'] = job_timestamp metric_types = metric_types_filter if metric_types_filter else {'ALL'} # Get IncarcerationMetrics incarceration_metrics = ( person_incarceration_events_with_metadata | 'Get Incarceration Metrics' >> # type: ignore pipeline.GetIncarcerationMetrics( pipeline_options=all_pipeline_options, metric_types=metric_types, calculation_end_month=None, calculation_month_count=-1)) assert_that( incarceration_metrics, AssertMatchers.validate_metric_type(allow_empty=allow_empty), 'Assert that all metrics are of the expected type.') assert_that( incarceration_metrics, AssertMatchers.validate_pipeline_test(expected_metric_types), 'Assert the type of metrics produced are expected') test_pipeline.run()
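AssertMatchers is project-specific and not reproduced in these snippets, but custom assert_that matchers generally follow one shape: a factory returns a callable that receives the materialized PCollection contents and raises BeamAssertException on failure. A sketch under that assumption:

from apache_beam.testing.util import BeamAssertException

def validate_metric_type(allow_empty=False):
    """Hypothetical matcher factory in the style of AssertMatchers."""

    def _check(output):
        if not output and not allow_empty:
            raise BeamAssertException('Expected at least one metric')
        for metric in output:
            if not hasattr(metric, 'metric_type'):
                raise BeamAssertException(f'Unexpected element: {metric}')

    return _check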
def testClassifyProgramAssignments_NoSupervision(self): """Tests the ClassifyProgramAssignments DoFn.""" fake_person_id = 12345 fake_person = entities.StatePerson.new_with_defaults( person_id=fake_person_id, gender=Gender.MALE, birthdate=date(1970, 1, 1), residency_status=ResidencyStatus.PERMANENT) program_assignment = entities.StateProgramAssignment.new_with_defaults( state_code='US_CA', program_id='PG3', referral_date=date(2009, 10, 3)) assessment = entities.StateAssessment.new_with_defaults( state_code='US_CA', assessment_type=StateAssessmentType.ORAS, assessment_score=33, assessment_date=date(2009, 7, 10)) person_periods = { 'person': [fake_person], 'program_assignments': [program_assignment], 'assessments': [assessment], 'supervision_periods': [] } program_event = ProgramReferralEvent( state_code=program_assignment.state_code, program_id=program_assignment.program_id, event_date=program_assignment.referral_date, assessment_score=33, assessment_type=StateAssessmentType.ORAS, ) correct_output = [(fake_person, [program_event])] test_pipeline = TestPipeline() supervision_period_to_agent_map = {'fake': 'map'} supervision_period_to_agent_associations = ( test_pipeline | 'Create SupervisionPeriod to Agent table' >> beam.Create( [supervision_period_to_agent_map])) supervision_periods_to_agent_associations_as_kv = ( supervision_period_to_agent_associations | 'Convert SupervisionPeriod to Agent table to KV tuples' >> beam.ParDo(pipeline.ConvertDictToKVTuple(), 'supervision_period_id')) output = (test_pipeline | beam.Create([(fake_person_id, person_periods)]) | 'Identify Program Events' >> beam.ParDo( pipeline.ClassifyProgramAssignments(), AsDict(supervision_periods_to_agent_associations_as_kv))) assert_that(output, equal_to(correct_output)) test_pipeline.run()
def testClassifyIncarcerationEvents(self): """Tests the ClassifyIncarcerationEvents DoFn.""" fake_person_id = 12345 fake_person = StatePerson.new_with_defaults( state_code='US_XX', person_id=fake_person_id, gender=Gender.MALE, birthdate=date(1970, 1, 1), residency_status=ResidencyStatus.PERMANENT) incarceration_period = StateIncarcerationPeriod.new_with_defaults( incarceration_period_id=1111, incarceration_type=StateIncarcerationType.STATE_PRISON, status=StateIncarcerationPeriodStatus.NOT_IN_CUSTODY, state_code='TX', facility='PRISON XX', admission_date=date(2010, 11, 20), admission_reason=StateIncarcerationPeriodAdmissionReason. PROBATION_REVOCATION, release_date=date(2010, 11, 21), release_reason=StateIncarcerationPeriodReleaseReason. SENTENCE_SERVED, specialized_purpose_for_incarceration= StateSpecializedPurposeForIncarceration.PAROLE_BOARD_HOLD) incarceration_sentence = StateIncarcerationSentence.new_with_defaults( incarceration_sentence_id=123, incarceration_periods=[incarceration_period], start_date=date(2009, 2, 9), charges=[ StateCharge.new_with_defaults(ncic_code='5699', statute='30A123', offense_date=date(2009, 1, 9)) ]) sentence_group = StateSentenceGroup.new_with_defaults( sentence_group_id=123, incarceration_sentences=[incarceration_sentence]) incarceration_sentence.sentence_group = sentence_group incarceration_period.incarceration_sentences = [incarceration_sentence] fake_person_id_to_county_query_result = [{ 'person_id': fake_person_id, 'county_of_residence': _COUNTY_OF_RESIDENCE }] fake_incarceration_period_judicial_district_association_result = \ {'person_id': fake_person_id, 'incarceration_period_id': 123, 'judicial_district_code': 'NW'} incarceration_events = [ IncarcerationStayEvent( admission_reason=incarceration_period.admission_reason, admission_reason_raw_text=incarceration_period. admission_reason_raw_text, supervision_type_at_admission= StateSupervisionPeriodSupervisionType.PROBATION, state_code=incarceration_period.state_code, event_date=incarceration_period.admission_date, facility=incarceration_period.facility, county_of_residence=_COUNTY_OF_RESIDENCE, most_serious_offense_ncic_code='5699', most_serious_offense_statute='30A123', specialized_purpose_for_incarceration= StateSpecializedPurposeForIncarceration.PAROLE_BOARD_HOLD), IncarcerationAdmissionEvent( state_code=incarceration_period.state_code, event_date=incarceration_period.admission_date, facility=incarceration_period.facility, county_of_residence=_COUNTY_OF_RESIDENCE, admission_reason=incarceration_period.admission_reason, admission_reason_raw_text=incarceration_period. 
admission_reason_raw_text, supervision_type_at_admission= StateSupervisionPeriodSupervisionType.PROBATION, specialized_purpose_for_incarceration= StateSpecializedPurposeForIncarceration.PAROLE_BOARD_HOLD), IncarcerationReleaseEvent( state_code=incarceration_period.state_code, event_date=incarceration_period.release_date, facility=incarceration_period.facility, county_of_residence=_COUNTY_OF_RESIDENCE, release_reason=incarceration_period.release_reason, admission_reason=incarceration_period.admission_reason, total_days_incarcerated=( incarceration_period.release_date - incarceration_period.admission_date).days, purpose_for_incarceration= StateSpecializedPurposeForIncarceration.PAROLE_BOARD_HOLD) ] correct_output = [(fake_person_id, (fake_person, incarceration_events)) ] test_pipeline = TestPipeline() person_id_to_county_kv = ( test_pipeline | "Read person id to county associations from BigQuery" >> beam.Create(fake_person_id_to_county_query_result) | "Convert to KV" >> beam.ParDo(ConvertDictToKVTuple(), 'person_id')) person_entities = { 'person': [fake_person], 'sentence_groups': [sentence_group], 'incarceration_period_judicial_district_association': [fake_incarceration_period_judicial_district_association_result] } output = (test_pipeline | beam.Create([(fake_person_id, person_entities)]) | 'Identify Incarceration Events' >> beam.ParDo( pipeline.ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv))) assert_that(output, equal_to(correct_output)) test_pipeline.run()
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_input: str,
        output: str,
        calculation_month_count: int,
        metric_types: List[str],
        state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (p | 'Load IncarcerationPeriods' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionViolations
        supervision_violations = (p | 'Load SupervisionViolations' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolationResponse,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionSentences
        supervision_sentences = (p | 'Load SupervisionSentences' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateIncarcerationSentences
        incarceration_sentences = (p | 'Load IncarcerationSentences' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionPeriods
        supervision_periods = (p | 'Load SupervisionPeriods' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Get StateSupervisionContacts
        supervision_contacts = (p | 'Load StateSupervisionContacts' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionContact,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

        # Bring in the table that associates StateSupervisionViolationResponses to information about StateAgents
        ssvr_to_agent_association_query = select_all_by_person_query(
            reference_dataset, SSVR_TO_AGENT_ASSOCIATION_VIEW_NAME, state_code, person_id_filter_set)

        ssvr_to_agent_associations = (
            p | "Read SSVR to Agent table from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=ssvr_to_agent_association_query, use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the
        # supervision_violation_response_id column as the key
        ssvr_agent_associations_as_kv = (
            ssvr_to_agent_associations
            | 'Convert SSVR to Agent table to KV tuples' >> beam.ParDo(
                ConvertDictToKVTuple(), 'supervision_violation_response_id'))

        supervision_period_to_agent_association_query = select_all_by_person_query(
            reference_dataset, SUPERVISION_PERIOD_TO_AGENT_ASSOCIATION_VIEW_NAME, state_code, person_id_filter_set)

        supervision_period_to_agent_associations = (
            p | "Read Supervision Period to Agent table from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=supervision_period_to_agent_association_query, use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the
        # supervision_period_id column as the key
        supervision_period_to_agent_associations_as_kv = (
            supervision_period_to_agent_associations
            | 'Convert Supervision Period to Agent table to KV tuples' >> beam.ParDo(
                ConvertDictToKVTuple(), 'supervision_period_id'))

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = select_all_by_person_query(
                reference_dataset, US_MO_SENTENCE_STATUSES_VIEW_NAME, state_code, person_id_filter_set)

            us_mo_sentence_statuses = (
                p | "Read MO sentence status table from BigQuery" >> beam.io.Read(
                    beam.io.BigQuerySource(query=us_mo_sentence_status_query, use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (
                p | f"Generate empty MO statuses list for non-MO state run: {state_code}" >> beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses
            | 'Convert MO sentence status ranking table to KV tuples' >> beam.ParDo(
                ConvertDictToKVTuple(), 'person_id'))

        sentences_and_statuses = (
            {'incarceration_sentences': incarceration_sentences,
             'supervision_sentences': supervision_sentences,
             'sentence_statuses': us_mo_sentence_status_rankings_as_kv}
            | 'Group sentences to the sentence statuses for that person' >> beam.CoGroupByKey())

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >> beam.ParDo(
                ConvertSentencesToStateSpecificType()).with_outputs(
                    'incarceration_sentences', 'supervision_sentences'))

        # Bring in the judicial districts associated with supervision_periods
        sp_to_judicial_district_query = select_all_by_person_query(
            reference_dataset, SUPERVISION_PERIOD_JUDICIAL_DISTRICT_ASSOCIATION_VIEW_NAME,
            state_code, person_id_filter_set)

        sp_to_judicial_district_kv = (
            p
            | "Read supervision_period to judicial_district associations from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=sp_to_judicial_district_query, use_standard_sql=True))
            | "Convert supervision_period to judicial_district association table to KV" >> beam.ParDo(
                ConvertDictToKVTuple(), 'person_id'))

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {'violations': supervision_violations,
             'violation_responses': supervision_violation_responses}
            | 'Group StateSupervisionViolationResponses to StateSupervisionViolations' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on the StateSupervisionViolationResponses' >> beam.ParDo(
                SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {'incarceration_periods': incarceration_periods,
             'violation_responses': violation_responses_with_hydrated_violations}
            | 'Group StateIncarcerationPeriods to StateSupervisionViolationResponses' >> beam.CoGroupByKey())

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on the StateIncarcerationPeriods' >> beam.ParDo(
                SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their related entities
        person_entities = (
            {'person': persons,
             'assessments': assessments,
             'incarceration_periods': incarceration_periods_with_source_violations,
             'supervision_periods': supervision_periods,
             'supervision_sentences': sentences_converted.supervision_sentences,
             'incarceration_sentences': sentences_converted.incarceration_sentences,
             'violation_responses': violation_responses_with_hydrated_violations,
             'supervision_contacts': supervision_contacts,
             'supervision_period_judicial_district_association': sp_to_judicial_district_kv}
            | 'Group StatePerson to all entities' >> beam.CoGroupByKey())

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and
        # StateIncarcerationPeriods
        person_time_buckets = (
            person_entities
            | 'Get SupervisionTimeBuckets' >> beam.ParDo(
                ClassifySupervisionTimeBuckets(),
                AsDict(ssvr_agent_associations_as_kv),
                AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (
            person_time_buckets
            | 'Get Supervision Metrics' >> GetSupervisionMetrics(
                pipeline_options=all_pipeline_options,
                metric_types=metric_types_set,
                calculation_end_month=calculation_end_month,
                calculation_month_count=calculation_month_count))

        if person_id_filter_set:
            logging.warning("Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            supervision_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                RecidivizMetricWritableDict()).with_outputs(
                    SupervisionMetricType.SUPERVISION_COMPLIANCE.value,
                    SupervisionMetricType.SUPERVISION_POPULATION.value,
                    SupervisionMetricType.SUPERVISION_REVOCATION.value,
                    SupervisionMetricType.SUPERVISION_REVOCATION_ANALYSIS.value,
                    SupervisionMetricType.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS.value,
                    SupervisionMetricType.SUPERVISION_SUCCESS.value,
                    SupervisionMetricType.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED.value,
                    SupervisionMetricType.SUPERVISION_TERMINATION.value))

        # Write the metrics to the output tables in BigQuery
        terminations_table_id = DATAFLOW_METRICS_TO_TABLES.get(SupervisionTerminationMetric)
        compliance_table_id = DATAFLOW_METRICS_TO_TABLES.get(SupervisionCaseComplianceMetric)
        populations_table_id = DATAFLOW_METRICS_TO_TABLES.get(SupervisionPopulationMetric)
        revocations_table_id = DATAFLOW_METRICS_TO_TABLES.get(SupervisionRevocationMetric)
        revocation_analysis_table_id = DATAFLOW_METRICS_TO_TABLES.get(SupervisionRevocationAnalysisMetric)
        revocation_violation_type_analysis_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SupervisionRevocationViolationTypeAnalysisMetric)
        successes_table_id = DATAFLOW_METRICS_TO_TABLES.get(SupervisionSuccessMetric)
        successful_sentence_lengths_table_id = DATAFLOW_METRICS_TO_TABLES.get(
            SuccessfulSupervisionSentenceDaysServedMetric)

        _ = (writable_metrics.SUPERVISION_POPULATION
             | f"Write population metrics to BQ table: {populations_table_id}" >> beam.io.WriteToBigQuery(
                 table=populations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION
             | f"Write revocation metrics to BQ table: {revocations_table_id}" >> beam.io.WriteToBigQuery(
                 table=revocations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESS
             | f"Write success metrics to BQ table: {successes_table_id}" >> beam.io.WriteToBigQuery(
                 table=successes_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_SUCCESSFUL_SENTENCE_DAYS_SERVED
             | f"Write supervision successful sentence length metrics to BQ"
               f" table: {successful_sentence_lengths_table_id}" >> beam.io.WriteToBigQuery(
                 table=successful_sentence_lengths_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_TERMINATION
             | f"Write termination metrics to BQ table: {terminations_table_id}" >> beam.io.WriteToBigQuery(
                 table=terminations_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION_ANALYSIS
             | f"Write revocation analyses metrics to BQ table: {revocation_analysis_table_id}" >> beam.io.WriteToBigQuery(
                 table=revocation_analysis_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_REVOCATION_VIOLATION_TYPE_ANALYSIS
             | f"Write revocation violation type analyses metrics to BQ table: "
               f"{revocation_violation_type_analysis_table_id}" >> beam.io.WriteToBigQuery(
                 table=revocation_violation_type_analysis_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

        _ = (writable_metrics.SUPERVISION_COMPLIANCE
             | f"Write compliance metrics to BQ table: {compliance_table_id}" >> beam.io.WriteToBigQuery(
                 table=compliance_table_id,
                 dataset=output,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
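# The supervision pipeline above fans metrics out through .with_outputs() and writes
# each tagged output to its own BigQuery table. A self-contained toy version of that
# tagging pattern (the dict rows and tag names here are invented for illustration):
import apache_beam as beam
from apache_beam import pvalue

class SplitByMetricTypeSketch(beam.DoFn):
    def process(self, metric):
        # Route each element to the output tagged with its (assumed) 'metric_type' field.
        yield pvalue.TaggedOutput(metric['metric_type'], metric)

with beam.Pipeline() as toy:
    split = (toy
             | beam.Create([{'metric_type': 'population', 'value': 1},
                            {'metric_type': 'revocation', 'value': 2}])
             | beam.ParDo(SplitByMetricTypeSketch()).with_outputs('population', 'revocation'))
    # Each tag is an ordinary PCollection, so each one can get its own sink.
    _ = split.population | 'print populations' >> beam.Map(print)
    _ = split.revocation | 'print revocations' >> beam.Map(print)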
def run():
    address_scd = """SELECT * FROM `automatic-asset-253215.CORE.IM_CUSTOMER_ADDRESS_SCD`"""

    addr_old_data_update = """SELECT CAST(a.HSN_ACCT_NUM AS INT64) AS CUSTOMER_ID,
        a.ADDRESS_NAME AS ADDR_NAME,
        'CLIC' AS ETL_SOURCE_SYSTEM,
        a.FILE_SET_DATE AS ETL_END_EFFECTIVE_DT,
        '0' AS ETL_CURRENT_IND,
        CAST(FORMAT_DATETIME('%Y%m%d%H%M%S', CURRENT_DATETIME()) AS INT64) AS UPD_BATCH_NBR
    FROM `automatic-asset-253215.STAGE.STG_CLIC_CUSTADDRORG` a"""

    primary_pipeline_1 = 'addrs_SCD'
    addrs_SCD = p | 'Target Table' >> beam.io.Read(
        beam.io.BigQuerySource(query=address_scd, use_standard_sql=True))

    common_key = {'CUSTOMER_ID', 'ADDR_NAME', 'ETL_SOURCE_SYSTEM'}

    join_pipeline_1 = 'addrorg_old_update_tb'
    addrorg_old_update_tb = p | 'addrorg Table' >> beam.io.Read(
        beam.io.BigQuerySource(query=addr_old_data_update, use_standard_sql=True))

    pipelines_dictionary_2 = {
        primary_pipeline_1: addrs_SCD,
        join_pipeline_1: addrorg_old_update_tb
    }

    updated_old_data = (pipelines_dictionary_2
                        | 'Updating addrs Fields' >> LeftJoin(
                            primary_pipeline_1, addrs_SCD,
                            join_pipeline_1, addrorg_old_update_tb, common_key))

    address_scd_query = """SELECT
        (srg_key.MAX_VALUE_KEY + ROW_NUMBER() OVER()) AS CUSTOMER_ADDRESS_KEY,
        '' AS CUSTOMER_KEY,
        CAST(a.HSN_ACCT_NUM AS INT64) AS CUSTOMER_ID,
        a.ADDRESS_NAME AS ADDR_NAME,
        'CLIC' AS ETL_SOURCE_SYSTEM,
        CAST(a.ROW_CREATED_DATE AS TIMESTAMP) AS SOURCE_CREATE_DT,
        a.ADDRESS_LINE_1 AS ADDR_LINE1_TXT,
        a.ADDRESS_LINE_2 AS ADDR_LINE2_TXT,
        a.CITY AS CITY_NAME,
        a.STATE AS STATE_CODE,
        a.COUNTRY AS COUNTRY_CODE,
        SUBSTR(a.ZIP_CODE,1,5) AS POSTAL_ZIP,
        SUBSTR(a.ZIP_CODE,6,9) AS POSTAL_ZIP4,
        CASE WHEN a.DISABLE_CLEANSING_FLAG = 'N' THEN 1 ELSE 0 END AS ADDR_CLEANSING_IND,
        CASE WHEN a.FRAUD_BAD_ACCT_FLAG = 'Y' THEN 1 ELSE 0 END AS ADDR_FRAUD_IND,
        CASE WHEN a.AGENT_VERIFIED_ADDRESS = 'Y' THEN 1 ELSE 0 END AS ADDR_QAS_VERIFIED_IND,
        a.ADDRESS_TYPE_CODE AS ADDR_TYPE_CODE,
        a.SHIP_TO_FIRST_NAME AS SHIPTO_FIRST_NAME,
        a.SHIP_TO_LAST_NAME AS SHIPTO_LAST_NAME,
        TIMESTAMP_ADD(a.FILE_SET_DATE, INTERVAL 1 DAY) AS ETL_BEGIN_EFFECTIVE_DT,
        CAST('2099-12-31 00:00:00' AS TIMESTAMP) AS ETL_END_EFFECTIVE_DT,
        '1' AS ETL_CURRENT_IND,
        '2' AS ETL_VERSION_NBR, --should be a sequential number
        '0' AS VOID_IND,
        CAST(FORMAT_DATETIME('%Y%m%d%H%M%S', CURRENT_DATETIME()) AS INT64) AS UPD_BATCH_NBR,
        CAST(FORMAT_DATETIME('%Y%m%d%H%M%S', CURRENT_DATETIME()) AS INT64) AS INS_BATCH_NBR,
        '0' AS Privacy_Ind
    FROM `automatic-asset-253215.STAGE.STG_CLIC_CUSTADDRORG` a,
        `automatic-asset-253215.STAGE.STG_CLIC_SURROGKEYS` srg_key
    WHERE srg_key.TABLE_NAME = "IM_CUSTOMER_ADDRESS_SCD" """

    Attribute_ref_query = """SELECT CUSTOMER_KEY, CUSTOMER_ID
    FROM `automatic-asset-253215.CORE.IM_CUSTOMER_ATTRIBUTE_REF` b"""

    lookup_data = (addrs_SCD
                   | 'Get Cust_Ids' >> beam.Map(
                       lambda row: (str(row['CUSTOMER_ID']) + row['ADDR_NAME'] + row['ETL_SOURCE_SYSTEM'], row)))

    primary_pipeline_2 = 'new_ins_data'
    new_ins_data = (p
                    | 'Read from custorgext' >> beam.io.Read(
                        beam.io.BigQuerySource(query=address_scd_query, use_standard_sql=True))
                    | 'Lookup' >> beam.Map(lookup, AsDict(lookup_data))
                    | 'Filter' >> beam.ParDo(filter_out_nones))

    join_pipeline_2 = 'attribute_ref_table'
    attribute_ref_table = (p
                           | 'Read From Attribute Ref Table' >> beam.io.Read(
                               beam.io.BigQuerySource(query=Attribute_ref_query, use_standard_sql=True)))

    common_key = 'CUSTOMER_ID'
    pipelines_dictionary = {
        primary_pipeline_2: new_ins_data,
        join_pipeline_2: attribute_ref_table
    }

    new_ins_data_2 = (pipelines_dictionary
                      | 'Left join' >> LeftJoin2(
                          primary_pipeline_2, new_ins_data,
                          join_pipeline_2, attribute_ref_table, common_key)
                      | 'Filter Nulls' >> beam.Filter(filter_null))

    ((updated_old_data, new_ins_data_2)
     | 'Merge PCollections' >> beam.Flatten()
     | 'Write to IM_CUSTOMER_ADDRESS_SCD' >> beam.io.WriteToBigQuery(
         output_table,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
         create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER))

    p.run().wait_until_finish()
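# LeftJoin and LeftJoin2 above are custom PTransforms defined elsewhere in this
# codebase. For reference, a left join over keyed PCollections can be sketched with
# CoGroupByKey like this (a plausible shape for such a helper, not its actual code):
import apache_beam as beam

def left_join_rows(element):
    _, grouped = element
    # An empty right side still yields the left row, which is what makes this a *left* join.
    right_rows = list(grouped['right']) or [{}]
    for left_row in grouped['left']:
        for right_row in right_rows:
            yield {**right_row, **left_row}  # left-side columns win on name clashes

with beam.Pipeline() as toy:
    left = toy | 'left' >> beam.Create([('1|HOME|CLIC', {'CUSTOMER_ID': '1', 'ADDR_NAME': 'HOME'})])
    right = toy | 'right' >> beam.Create([('1|HOME|CLIC', {'CUSTOMER_KEY': 42})])
    _ = ({'left': left, 'right': right}
         | beam.CoGroupByKey()
         | beam.FlatMap(left_join_rows)
         | beam.Map(print))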
def run(argv=None):
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-basics',
                        dest='input_basics',
                        required=True,
                        help='Input movie base file to process.')
    parser.add_argument('--input-ratings',
                        dest='input_ratings',
                        required=True,
                        help='Input rating file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Set DirectRunner capacity
    from apache_beam.options.pipeline_options import DirectOptions
    # pipeline_options.view_as(DirectOptions).direct_num_workers = 4

    # Columns
    columns_title_basic = ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
                           'startYear', 'endYear', 'runtimeMinutes', 'genres']
    columns_ratings = ['tconst', 'averageRating', 'numVotes']

    # Create the pipeline. The with-block runs the pipeline and waits for completion
    # on exit, so no explicit p.run()/wait_until_finish() is needed.
    with beam.Pipeline(options=pipeline_options) as p:
        basic_data = (
            p
            | 'Read data' >> beam.io.ReadFromText(known_args.input_basics, skip_header_lines=1)
            | 'Parse CSV' >> beam.ParDo(ParseCsv(columns_title_basic))
            | 'Clean data' >> beam.ParDo(CleanData(
                bool_cols=['isAdult'],
                int_cols=['startYear', 'endYear', 'runtimeMinutes']))
            | 'Filter data' >> beam.ParDo(FilterBasicData()))

        rating_data = (
            p
            | 'Read data (Details)' >> beam.io.ReadFromText(known_args.input_ratings, skip_header_lines=1)
            | 'Parse CSV (Details)' >> beam.ParDo(ParseCsv(columns_ratings))
            | 'Clean data (Details)' >> beam.ParDo(CleanData(
                int_cols=['numVotes'], float_cols=['averageRating']))
            | 'Filter data (Details)' >> beam.ParDo(FilterRatingData()))

        # Key ratings by tconst so they can be passed as a dict side input
        rating_data_d = (rating_data | beam.Map(lambda d: (d['tconst'], d)))

        # Join the PCollections
        joined_dicts = (
            basic_data
            | 'Join' >> beam.ParDo(JoinRatings(), AsDict(rating_data_d)))

        # Write to disk
        joined_dicts | 'write' >> beam.io.WriteToText(known_args.output)
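# JoinRatings() above is defined elsewhere in this example. A minimal sketch of a
# DoFn that does the dict-side-input lookup it implies -- enriching each basics row
# with its rating row by 'tconst' -- could look like this (illustrative only):
import apache_beam as beam

class JoinRatingsSketch(beam.DoFn):
    def process(self, basic_row, ratings_by_tconst):
        # ratings_by_tconst is the materialized AsDict side input: {tconst: rating_row}.
        rating = ratings_by_tconst.get(basic_row['tconst'])
        if rating is not None:
            # Merge the two row dicts; rating columns are added to the basics row.
            yield {**basic_row, **rating}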
def run(argv=None):
    """Runs the incarceration calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy root_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    # Parse command-line arguments
    known_args, pipeline_args = parse_arguments(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = pipeline_options.get_all_options()

    query_dataset = all_pipeline_options['project'] + '.' + known_args.input
    reference_dataset = all_pipeline_options['project'] + '.' + known_args.reference_input

    with beam.Pipeline(argv=pipeline_args) as p:
        # Get StatePersons
        persons = (p | 'Load StatePersons' >> BuildRootEntity(
            dataset=query_dataset,
            data_dict=None,
            root_schema_class=schema.StatePerson,
            root_entity_class=entities.StatePerson,
            unifying_id_field='person_id',
            build_related_entities=True))

        # Get StateSentenceGroups
        sentence_groups = (p | 'Load StateSentenceGroups' >> BuildRootEntity(
            dataset=query_dataset,
            data_dict=None,
            root_schema_class=schema.StateSentenceGroup,
            root_entity_class=entities.StateSentenceGroup,
            unifying_id_field='person_id',
            build_related_entities=True))

        # Get StateIncarcerationSentences
        incarceration_sentences = (p | 'Load StateIncarcerationSentences' >> BuildRootEntity(
            dataset=query_dataset,
            data_dict=None,
            root_schema_class=schema.StateIncarcerationSentence,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field='person_id',
            build_related_entities=True))

        # Get StateSupervisionSentences
        supervision_sentences = (p | 'Load StateSupervisionSentences' >> BuildRootEntity(
            dataset=query_dataset,
            data_dict=None,
            root_schema_class=schema.StateSupervisionSentence,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field='person_id',
            build_related_entities=True))

        sentences_and_sentence_groups = (
            {'sentence_groups': sentence_groups,
             'incarceration_sentences': incarceration_sentences,
             'supervision_sentences': supervision_sentences}
            | 'Group sentences to sentence groups' >> beam.CoGroupByKey())

        # Set hydrated sentences on the corresponding sentence groups
        sentence_groups_with_hydrated_sentences = (
            sentences_and_sentence_groups
            | 'Set hydrated sentences on sentence groups' >> beam.ParDo(SetSentencesOnSentenceGroup()))

        # Group each StatePerson with their related entities
        person_and_sentence_groups = (
            {'person': persons,
             'sentence_groups': sentence_groups_with_hydrated_sentences}
            | 'Group StatePerson to SentenceGroups' >> beam.CoGroupByKey())

        # Bring in the table that associates people and their county of residence
        person_id_to_county_query = \
            f"SELECT * FROM `{reference_dataset}.persons_to_recent_county_of_residence`"

        person_id_to_county_kv = (
            p
            | "Read person_id to county associations from BigQuery" >> beam.io.Read(
                beam.io.BigQuerySource(query=person_id_to_county_query, use_standard_sql=True))
            | "Convert person_id to county association table to KV" >> beam.ParDo(
                ConvertDictToKVTuple(), 'person_id'))

        # Identify IncarcerationEvents from the StatePerson's StateIncarcerationPeriods
        person_events = (
            person_and_sentence_groups
            | 'Classify Incarceration Events' >> beam.ParDo(
                ClassifyIncarcerationEvents(), AsDict(person_id_to_county_kv)))

        # Get dimensions to include and methodologies to use
        inclusions, _ = dimensions_and_methodologies(known_args)

        # Get pipeline job details for accessing job_id
        all_pipeline_options = pipeline_options.get_all_options()

        # The number of months to limit the monthly calculation output to
        calculation_month_limit = known_args.calculation_month_limit

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get IncarcerationMetrics
        incarceration_metrics = (
            person_events
            | 'Get Incarceration Metrics' >> GetIncarcerationMetrics(
                pipeline_options=all_pipeline_options,
                inclusions=inclusions,
                calculation_month_limit=calculation_month_limit))

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (
            incarceration_metrics
            | 'Convert to dict to be written to BQ' >> beam.ParDo(
                IncarcerationMetricWritableDict()).with_outputs('admissions', 'populations', 'releases'))

        # Write the metrics to the output tables in BigQuery
        admissions_table = known_args.output + '.incarceration_admission_metrics'
        population_table = known_args.output + '.incarceration_population_metrics'
        releases_table = known_args.output + '.incarceration_release_metrics'

        _ = (writable_metrics.admissions
             | f"Write admission metrics to BQ table: {admissions_table}" >> beam.io.WriteToBigQuery(
                 table=admissions_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.populations
             | f"Write population metrics to BQ table: {population_table}" >> beam.io.WriteToBigQuery(
                 table=population_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

        _ = (writable_metrics.releases
             | f"Write release metrics to BQ table: {releases_table}" >> beam.io.WriteToBigQuery(
                 table=releases_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
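# SetSentencesOnSentenceGroup above is project code. The general shape it follows --
# attach grouped child entities to their parents after a CoGroupByKey -- can be
# sketched like this (the field and tag names are assumptions for illustration):
import apache_beam as beam

class SetChildrenOnParentSketch(beam.DoFn):
    def process(self, element):
        person_id, grouped = element
        children = list(grouped['children'])
        for parent in grouped['parents']:
            # The real DoFn matches children to their parent by id before attaching;
            # this sketch simply hangs every grouped child off every parent.
            parent['children'] = children
            yield person_id, parent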