def test_upload_spark_model_inputs_with_missing_yaml_inputs(self) -> None:
    """A YAML file that omits required inputs should be rejected."""
    with self.assertRaises(ValueError) as ctx:
        upload_spark_model_inputs(
            "recidiviz-staging",
            "test",
            self.outflows_data,
            self.transitions_data,
            self.total_population_data,
            get_inputs_path("super_simulation_missing_inputs.yaml"),
        )
    error_message = str(ctx.exception)
    self.assertTrue(error_message.startswith("Missing yaml inputs"))
def test_upload_spark_model_inputs_with_valid_inputs(
    self, mock_store: Any
) -> None:
    """A fully valid upload stores all three input tables."""
    upload_spark_model_inputs(
        "recidiviz-staging",
        "test",
        self.outflows_data,
        self.transitions_data,
        self.total_population_data,
        get_inputs_path("super_simulation_data_ingest.yaml"),
    )
    # One store call per table: outflows, transitions, total population.
    self.assertEqual(mock_store.call_count, 3)
def test_upload_spark_model_inputs_with_null_values(self) -> None:
    """Null values in an input table should raise a descriptive error."""
    with self.assertRaises(ValueError) as ctx:
        upload_spark_model_inputs(
            "recidiviz-staging",
            "test",
            self.outflows_data,
            self.transitions_data_with_null_values,
            self.total_population_data,
            get_inputs_path("super_simulation_data_ingest.yaml"),
        )
    error_message = str(ctx.exception)
    self.assertEqual(
        error_message, "Table 'transitions_data' must not contain null values"
    )
def test_upload_spark_model_inputs_with_missing_column(self) -> None:
    """A table missing a required column should raise and name the column."""
    with self.assertRaises(ValueError) as ctx:
        upload_spark_model_inputs(
            "recidiviz-staging",
            "test",
            self.outflows_data,
            self.transitions_data,
            self.total_population_data_missing_column,
            get_inputs_path("super_simulation_data_ingest.yaml"),
        )
    error_message = str(ctx.exception)
    self.assertEqual(
        error_message,
        "Table 'total_population_data' missing required columns {'time_step'}",
    )
def test_upload_spark_model_inputs_with_extra_column(self) -> None:
    """A table with an unexpected extra column should raise and name it."""
    with self.assertRaises(ValueError) as ctx:
        upload_spark_model_inputs(
            "recidiviz-staging",
            "test",
            self.outflows_data,
            self.transitions_data,
            self.total_population_data_extra_column,
            get_inputs_path("super_simulation_data_ingest.yaml"),
        )
    error_message = str(ctx.exception)
    self.assertEqual(
        error_message,
        "Table 'total_population_data' contains unexpected columns {'random_extra_column'}",
    )
def test_upload_spark_model_inputs_with_column_wrong_type(self) -> None:
    """A column with the wrong dtype should raise, naming both dtypes."""
    with self.assertRaises(ValueError) as ctx:
        upload_spark_model_inputs(
            "recidiviz-staging",
            "test",
            self.outflows_data,
            self.transitions_data,
            self.total_population_data_wrong_type,
            get_inputs_path("super_simulation_data_ingest.yaml"),
        )
    error_message = str(ctx.exception)
    self.assertEqual(
        error_message,
        "Table 'total_population_data' has wrong type for column 'total_population'. Type 'int64' should be 'float64'",
    )
def test_upload_spark_model_inputs_with_invalid_project_id(self) -> None:
    """Only the supported gcloud BigQuery projects may be uploaded to."""
    with self.assertRaises(ValueError) as ctx:
        upload_spark_model_inputs(
            "bad_project_id",
            "test",
            self.outflows_data,
            self.transitions_data,
            self.total_population_data,
            get_inputs_path("super_simulation_data_ingest.yaml"),
        )
    error_message = str(ctx.exception)
    self.assertEqual(
        error_message,
        "bad_project_id is not a supported gcloud BigQuery project",
    )
def test_upload_spark_model_inputs_with_wrong_disaggregation_axis_in_yaml(
    self,
) -> None:
    """Input columns must cover every disaggregation axis listed in the YAML.

    The expected text below intentionally reproduces the message raised by
    `upload_spark_model_inputs`, including its spelling of "disagregation".
    """
    with self.assertRaises(ValueError) as ctx:
        upload_spark_model_inputs(
            "recidiviz-staging",
            "test",
            self.outflows_data_wrong_disaggregation_axis,
            self.transitions_data_wrong_disaggregation_axis,
            self.total_population_data_wrong_disaggregation_axis,
            get_inputs_path("super_simulation_data_ingest.yaml"),
        )
    error_message = str(ctx.exception)
    self.assertEqual(
        error_message,
        "All disagregation axes must be included in the input dataframe columns\n"
        "Expected: ['crime_type'], Actual: Index(['compartment', 'outflow_to', 'time_step', 'age', 'total_population'], dtype='object')",
    )
def test_upload_spark_model_inputs_with_missing_disaggregation_axis(
    self,
) -> None:
    """A table without any recognized disaggregation axis should raise.

    Fix: removed leftover commented-out `import pdb` / `pdb.set_trace()`
    debugging code that had been left in the test body.
    """
    with self.assertRaises(ValueError) as e:
        upload_spark_model_inputs(
            "recidiviz-staging",
            "test",
            self.outflows_data_no_disaggregation_axis,
            self.transitions_data,
            self.total_population_data,
            get_inputs_path("super_simulation_data_ingest.yaml"),
        )
    # Expected text intentionally matches the message raised by
    # upload_spark_model_inputs, including its spelling of "dissaggregation".
    self.assertEqual(
        str(e.exception),
        "Tables ['outflows_data'] must have dissaggregation axis of 'crime', 'crime_type', 'age', or 'race'",
    )
# YEARLY -> MONTHLY: spread each yearly outflows row across 12 month steps,
# dividing the yearly count evenly across the months.
monthly_outflow_chunks = [pd.DataFrame()]
for year in outflows_data.time_step.unique():
    outflows_in_year = outflows_data[outflows_data.time_step == year]
    for month in range(12):
        chunk = outflows_in_year.copy()
        chunk.time_step = 12 * chunk.time_step - month
        chunk.total_population /= 12
        monthly_outflow_chunks.append(chunk)
outflows_data = pd.concat(monthly_outflow_chunks)

# Compartment durations were recorded in years; convert to months.
transitions_data.compartment_duration *= 12

# YEARLY -> MONTHLY: repeat each yearly population snapshot for all 12 months.
monthly_pop_chunks = [pd.DataFrame()]
for year in total_population_data.time_step.unique():
    pops_in_year = total_population_data[total_population_data.time_step == year]
    for month in range(12):
        chunk = pops_in_year.copy()
        chunk.time_step = 12 * chunk.time_step - month
        monthly_pop_chunks.append(chunk)
total_population_data = pd.concat(monthly_pop_chunks)

# STORE DATA
upload_spark_model_inputs(
    "recidiviz-staging",
    "AZ_HB_2376",
    outflows_data,
    transitions_data,
    total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/AZ/ax_state_prison_HB_2376_model_inputs.yaml",
)
# Combine both transition directions into one table.
transitions_data = pd.concat(
    [prison_to_parole_transitions, parole_to_prison_transitions]
)

# TOTAL POPULATION TABLE (prison)
total_population_data = pd.read_csv(
    "recidiviz/calculator/modeling/population_projection/state/SC/total_population.csv"
)
# ignore parole

# YEARLY -> MONTHLY: repeat each yearly population snapshot for all 12 months.
monthly_pop_chunks = [pd.DataFrame()]
for year in total_population_data.time_step.unique():
    pops_in_year = total_population_data[total_population_data.time_step == year]
    for month in range(12):
        monthly_pops = pops_in_year.copy()
        monthly_pops.time_step = 12 * monthly_pops.time_step + month
        monthly_pop_chunks.append(monthly_pops)
total_population_data = pd.concat(monthly_pop_chunks)

# STORE DATA
upload_spark_model_inputs(
    "recidiviz-staging",
    "SC_prison",
    outflows_data,
    transitions_data,
    total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/SC/SC_prison_model_inputs.yaml",
)
# NOTE(review): this chunk begins mid-statement — the opening of the first
# pd.concat call (appending the AZ financial-incentives transitions CSV to
# transitions_data) lies outside the visible region; code left unchanged.
        "recidiviz/calculator/modeling/population_projection/state/AZ/financial_incentives/transitions_AZ_data.csv"
    ),
])

# OUTFLOWS TABLE: append the AZ financial-incentives outflows CSV.
outflows_data = pd.concat([
    outflows_data,
    pd.read_csv(
        # NOTE(review): filename contains a space ("outflows_data AZ.csv") —
        # presumably intentional; verify against the checked-in data files.
        "recidiviz/calculator/modeling/population_projection/state/AZ/financial_incentives/outflows_data AZ.csv"
    ),
])

# TOTAL POPULATION TABLE: append the matching total-population CSV.
total_population_data = pd.concat([
    total_population_data,
    pd.read_csv(
        "recidiviz/calculator/modeling/population_projection/state/AZ/financial_incentives/total_population_data AZ.csv"
    ),
])

# STORE DATA: upload all three tables under this simulation tag.
simulation_tag = "AZ_financialincentives"
upload_spark_model_inputs(
    "recidiviz-staging",
    simulation_tag,
    outflows_data,
    transitions_data,
    total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/AZ/financial_incentives/AZ_supervision_model_inputs.yaml",
)
# Expand yearly prison population counts (one non-violent row and one violent
# row per year) into 12 monthly rows each, covering 2013-2020.
for year in range(2013, 2021):
    year_offset = year - reference_year
    month_steps = list(range(year_offset * 12, (year_offset + 1) * 12))
    nonviolent_pop = yearly_total_population_data.iloc[year_offset * 2, 3]
    violent_pop = yearly_total_population_data.iloc[year_offset * 2 + 1, 3]
    monthly_rows = pd.DataFrame({
        "time_step": month_steps * 2,
        "compartment": ["prison"] * 24,
        "crime_type": ["non-violent"] * 12 + ["violent"] * 12,
        "total_population": [nonviolent_pop] * 12 + [violent_pop] * 12,
    })
    total_population_data = pd.concat(
        [total_population_data, monthly_rows], sort=False
    )

# STORE DATA
upload_spark_model_inputs(
    "recidiviz-staging",
    "LA_HB_364",
    outflows_data,
    transitions_data,
    total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/LA/LA_prison_habitual_model_inputs.yaml",
)
# Build the total-population table: count people currently in custody, by crime.
custody_df = pop_valid.copy()
custody_df = custody_df[["custodyStatus", "crime"]]
in_custody = custody_df[custody_df.custodyStatus == "IN CUSTODY"]
total_pop = in_custody.groupby("crime").count().reset_index()
total_pop["compartment"] = "prison"
# After the groupby-count, custodyStatus holds the per-crime head count.
total_pop["total_population"] = total_pop.custodyStatus
# population as of Feb 2021 == 254 months since 2000
total_pop["time_step"] = 254
total_pop["crime_type"] = total_pop.crime
pop_out = total_pop[["compartment", "total_population", "time_step", "crime_type"]]

if SAVE_TO_CSV:
    # Timestamped dump for manual inspection (hard-coded local path).
    pop_out.to_csv(
        "/Users/jpouls/recidiviz/nyrecidiviz/mm_preprocessing/total_population/total_population"
        + str(int(time.time()))
        + ".csv"
    )

############ SPARK MODEL UPLOAD
# NOTE(review): the yaml path below has no ".yaml" extension — confirm it is
# the intended inputs file.
upload_spark_model_inputs(
    "recidiviz-staging",
    "NY_mms",
    outflows,
    transitions,
    pop_out,
    "recidiviz/calculator/modeling/population_projection/state/NY/mms/ny_state_prison_model_inputs",
)
# NOTE(review): this chunk begins mid-expression — the start of the
# column-selection / groupby statement that builds jail_prison_admissions is
# outside the visible region; code left unchanged.
    "offense_code",
    "time_step",
    "sentence_type",
    "compartment",
]].groupby(
    [
        "offense_group", "offense_code", "compartment", "sentence_type",
        "time_step"
    ],
    as_index=False,
).count())
# Map raw admission columns onto the spark-model schema and drop the unused
# offense_group column.
jail_prison_admissions = jail_prison_admissions.rename(
    {
        "off1_vcc": "total_population",
        "offense_code": "crime",
        "sentence_type": "outflow_to",
    },
    axis=1,
).drop("offense_group", axis=1)
outflows_data = jail_prison_admissions

# STORE DATA — an empty DataFrame is passed for total_population_data.
upload_spark_model_inputs(
    "recidiviz-staging",
    "VA_parole",
    outflows_data,
    transitions_data,
    pd.DataFrame(),
    "recidiviz/calculator/modeling/population_projection/state/VA/VA_parole_model_inputs.yaml",
)
# OUTFLOWS TABLE: append the TX outflows sheet, then map the placeholder
# disaggregation axis onto crime_type.
tx_outflows = pd.read_csv(
    "recidiviz/calculator/modeling/population_projection/state/TX/TX_data/Outflows Data-Table 1.csv"
)
outflows_data = pd.concat([outflows_data, tx_outflows])
outflows_data = outflows_data.rename(columns={"placeholder_axis": "crime_type"})

# TOTAL POPULATION TABLE: same treatment for the population sheet.
tx_population = pd.read_csv(
    "recidiviz/calculator/modeling/population_projection/state/TX/TX_data/Total Population Data-Table 1.csv"
)
total_population_data = pd.concat([total_population_data, tx_population])
total_population_data = total_population_data.rename(
    columns={"placeholder_axis": "crime_type"}
)

# STORE DATA
upload_spark_model_inputs(
    "recidiviz-staging",
    "TX_PO_incentives",
    outflows_data,
    transitions_data,
    total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/TX/TX_prison_revocations_model_inputs.yaml",
)
# OUTFLOWS TABLE: append the AZ felony-reclassification outflows CSV.
az_outflows = pd.read_csv(
    "recidiviz/calculator/modeling/population_projection/state/AZ/felony_reclassification/outflows_data_felonyAZ.csv"
)
outflows_data = pd.concat([outflows_data, az_outflows])

# TOTAL POPULATION TABLE: append the matching total-population CSV.
az_population = pd.read_csv(
    "recidiviz/calculator/modeling/population_projection/state/AZ/felony_reclassification/total_population_data_felonyAZ.csv"
)
total_population_data = pd.concat([total_population_data, az_population])

# STORE DATA
simulation_tag = "AZ_reclassification"
upload_spark_model_inputs(
    "recidiviz-staging",
    simulation_tag,
    outflows_data,
    transitions_data,
    total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/AZ/felony_reclassification/AZ_reclassification_model_inputs.yaml",
)
# Template for a new state model: three empty tables with the required schema.
# TODO(#99999): add one column to transitions_data & outflows_data per
# disaggregation axis. If none exist, add a place-holder axis.
transitions_data = pd.DataFrame(
    columns=["compartment", "outflow_to", "total_population", "compartment_duration"]
)
outflows_data = pd.DataFrame(
    columns=["compartment", "outflow_to", "total_population", "time_step"]
)
total_population_data = pd.DataFrame(
    columns=["compartment", "total_population", "time_step"]
)

# TRANSITIONS TABLE
# TODO(#99999): populate transitions_data from raw data

# OUTFLOWS TABLE
# TODO(#99999): populate outflows_data from raw data

# TOTAL POPULATION TABLE
# TODO(#99999): populate total_population_data from raw data

# STORE DATA
# TODO(#99999): fill in `simulation_tag`
simulation_tag = "TKTK"
upload_spark_model_inputs(
    "recidiviz-staging",
    simulation_tag,
    outflows_data,
    transitions_data,
    total_population_data,
    "path_to_your_yaml",
)
# NOTE(review): `year` and `reference_year` are not defined in this chunk —
# this block almost certainly sits inside a `for year in ...:` loop whose
# header lies outside the visible region; code left unchanged.
# Builds 12 monthly pretrial->prison outflow rows per year, split by race,
# with yearly counts divided evenly across months.
temp_monthly_outflows_data = pd.DataFrame({
    "time_step": [
        i
        for i in range((year - reference_year) * 12,
                       (year - reference_year + 1) * 12)
    ] * 2,
    "compartment": ["pretrial"] * 24,
    "outflow_to": ["prison"] * 24,
    "race": ["white"] * 12 + ["non-white"] * 12,
    "total_population": [
        yearly_outflows_data.iloc[(year - reference_year) * 2, 4] / 12
        for month in range(12)
    ] + [
        yearly_outflows_data.iloc[(year - reference_year) * 2 + 1, 4] / 12
        for month in range(12)
    ],
})
outflows_data = pd.concat([outflows_data, temp_monthly_outflows_data])

# TOTAL POPULATION TABLE
# none

# STORE DATA — an empty DataFrame is passed for total_population_data.
upload_spark_model_inputs(
    "recidiviz-staging",
    "IL_prison_three_strikes",
    outflows_data,
    transitions_data,
    pd.DataFrame(),
    "recidiviz/calculator/modeling/population_projection/state/IL/IL_prison_three_strikes_model_inputs.yaml",
)
# YEARLY -> MONTHLY: spread each yearly outflows row across 12 month steps,
# dividing the yearly count evenly. `final_outflows` is seeded before this
# block and deliberately kept as the accumulator name.
for year in outflows_data.time_step.unique():
    outflows_in_year = outflows_data[outflows_data.time_step == year]
    for month in range(12):
        monthly_slice = outflows_in_year.copy()
        monthly_slice.time_step = 12 * monthly_slice.time_step - month
        monthly_slice.total_population /= 12
        final_outflows = pd.concat([final_outflows, monthly_slice])
outflows_data = final_outflows

# TOTAL POPULATION TABLE
# TODO(#99999): populate total_population_data from raw data
final_pops = pd.DataFrame()
for year in total_population_data.time_step.unique():
    pops_in_year = total_population_data[total_population_data.time_step == year]
    for month in range(12):
        monthly_pops = pops_in_year.copy()
        monthly_pops.time_step = 12 * monthly_pops.time_step - month
        final_pops = pd.concat([final_pops, monthly_pops])
total_population_data = final_pops

# STORE DATA
# TODO(#99999): fill in `state` and `primary_compartment`
upload_spark_model_inputs(
    "recidiviz-staging",
    "MS_SB_2123",
    outflows_data,
    transitions_data,
    total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/MS/SB_2123_Parole_Eligibility/MS_prison_model_inputs.yaml",
)
# NOTE(review): this chunk begins with closing parentheses of a statement that
# starts outside the visible region; code left unchanged.
    )
)
# switch tech revs to leapfrog to full_release
transitions_data.loc[
    transitions_data.compartment == "prison_tech_rev", "outflow_to"
] = "full_release"
# scale down outflows to avoid double counting
outflows_data.total_population *= (
    1
    - transitions_data.loc[
        (transitions_data.compartment == "release")
        & (transitions_data.outflow_to == "prison_new_crime"),
        "total_population",
    ].iloc[0]
)
# Empty total-population table with the expected schema (age-disaggregated).
column_names = ["compartment", "total_population", "time_step", "age"]
total_population_data = pd.DataFrame(columns=column_names)
# Ensure durations are floats before upload (upload validates dtypes).
transitions_data = transitions_data.astype({"compartment_duration": "float64"})
upload_spark_model_inputs(
    "recidiviz-staging",
    "wv_prison_p1",
    outflows_data,
    transitions_data,
    total_population_data,
    "./recidiviz/calculator/modeling/population_projection/state/WV/HB_2257/WV_hb2257_model_inputs.yaml",
)
# NOTE(review): this chunk begins mid-expression — inside the
# "total_population" list of a monthly-outflows DataFrame literal whose opening
# (and the `yearly_outflows_data` rows 0-4 of the same pattern) lies outside
# the visible region; code left unchanged. Each yearly count is divided by 12.
    for month in range(12)
] + [
    yearly_outflows_data.iloc[(year - reference_year) * 10 + 5, 4] / 12
    for month in range(12)
] + [
    yearly_outflows_data.iloc[(year - reference_year) * 10 + 6, 4] / 12
    for month in range(12)
] + [
    yearly_outflows_data.iloc[(year - reference_year) * 10 + 7, 4] / 12
    for month in range(12)
] + [
    yearly_outflows_data.iloc[(year - reference_year) * 10 + 8, 4] / 12
    for month in range(12)
] + [
    yearly_outflows_data.iloc[(year - reference_year) * 10 + 9, 4] / 12
    for month in range(12)
],
})
outflows_data = pd.concat([outflows_data, temp_monthly_outflows_data])

# STORE DATA
simulation_tag = "OK_prison"
upload_spark_model_inputs(
    "recidiviz-staging",
    simulation_tag,
    outflows_data,
    transitions_data,
    total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/OK/OK_habitual_sentencing/OK_prison_model_inputs.yaml",
)
# NOTE(review): this chunk begins with the closing bracket of a statement that
# starts outside the visible region; code left unchanged.
])
# Route technical revocations to release.
transitions_data.loc[transitions_data.compartment == "prison_technical",
                     "outflow_to"] = "release"
# Data cleaning: normalize the mistyped "probation " (trailing space) value.
transitions_data.loc[transitions_data.compartment == "probation ",
                     "compartment"] = "probation"

# OUTFLOWS TABLE
outflows_data = pd.concat([
    outflows_data,
    pd.read_csv(
        "recidiviz/calculator/modeling/population_projection/state/OK/OK_data/Outflows Data-Table 1.csv"
    ),
])
# Map the placeholder disaggregation axis onto crime_type in both tables.
outflows_data = outflows_data.rename({"placeholder_axis": "crime_type"}, axis=1)
transitions_data = transitions_data.rename({"placeholder_axis": "crime_type"},
                                           axis=1)

# STORE DATA
# NB IF YOU RUN THIS FILE: There were two yaml files in the folder - please
# make sure the one passed in below is the correct one
upload_spark_model_inputs(
    "recidiviz-staging",
    "OK_probation",
    outflows_data,
    transitions_data,
    pd.DataFrame(),
    "recidiviz/calculator/modeling/population_projection/state/OK/OK_earned_credits/OK_probation_model_inputs_average_cost.yaml",
)
# NOTE(review): this chunk begins mid-statement — the left-hand `.loc[` of the
# first assignment (seeding time_step 0 outflows from the population table)
# lies outside the visible region; code left unchanged.
    "total_population",
] = total_population_data.loc[
    total_population_data.age == "50_and_under", "total_population"].iloc[0]
outflows_data.loc[(outflows_data.time_step == 0) &
                  (outflows_data.age == "51_and_up"),
                  "total_population",
] = total_population_data.loc[
    total_population_data.age == "51_and_up", "total_population"].iloc[0]
# Pad time steps 1-10 for both age groups with zeros, then forward-fill.
# NOTE(review): DataFrame.append was removed in pandas 2.0 — this only runs on
# pandas < 2; consider pd.concat([outflows_data, ...]).ffill() when upgrading.
outflows_data = outflows_data.append(
    pd.DataFrame({
        "total_population": [0] * 20,
        "age": ["50_and_under"] * 10 + ["51_and_up"] * 10,
        "time_step": list(range(1, 11)) * 2,
    })).ffill()

# STORE DATA — placeholder population table: zero people at time_step -1.
fake_total_population_data = pd.DataFrame({
    "compartment": ["prison"] * 2,
    "time_step": [-1] * 2,
    "total_population": [0] * 2,
    "age": ["50_and_under", "51_and_up"],
})
upload_spark_model_inputs(
    "recidiviz-staging",
    "MS_habitual_offenders_A",
    outflows_data,
    transitions_data,
    fake_total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/MS/habitual_sentencing/MS_prison_habitual_A_model_inputs.yaml",
)
# NOTE(review): this chunk begins mid-expression — inside the
# "total_population" list of a monthly population DataFrame literal whose
# opening lies outside the visible region; code left unchanged.
    yearly_total_population_data.iloc[(year - reference_year) * 3 + 1, 3]
    for month in range(12)
] + [
    yearly_total_population_data.iloc[(year - reference_year) * 3 + 2, 3]
    for month in range(12)
],
}
)
monthly_total_population_data = pd.concat(
    [monthly_total_population_data, temp_monthly_total_population_data]
)

# STORE DATA — rename the tis_percentage axis to crime_type in all tables.
monthly_outflows_data = monthly_outflows_data.rename(
    {"tis_percentage": "crime_type"}, axis=1
)
transitions_data = transitions_data.rename({"tis_percentage": "crime_type"}, axis=1)
monthly_total_population_data = monthly_total_population_data.rename(
    {"tis_percentage": "crime_type"}, axis=1
)
upload_spark_model_inputs(
    "recidiviz-staging",
    "IL_prison_RAE",
    monthly_outflows_data,
    transitions_data,
    monthly_total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/IL/IL_prison_RAE_model_inputs.yaml",
)
# NOTE(review): this chunk begins with closing parens/brackets of a statement
# that starts outside the visible region; code left unchanged.
    ),
])
# Keep historical rows (< 0) and all non-probation rows, then substitute the
# prebuilt probation population.
total_population_data = total_population_data[
    (total_population_data.time_step < 0)
    | (total_population_data.compartment != "probation")]
total_population_data = pd.concat([total_population_data, PROBATION_POP_DATA])
# drop duplicate probation data
total_population_data = total_population_data[
    (total_population_data.compartment != "probation")
    | (total_population_data.crime_type != "newcrime")]
# move disaggregation axis to compartments: prison rows become
# "prison_<crime_type>" and the crime_type axis is collapsed to "NA".
prison_populations = total_population_data.loc[
    total_population_data.compartment == "prison"]
total_population_data.loc[total_population_data.compartment == "prison",
                          "compartment"] = (prison_populations.compartment +
                                            "_" +
                                            prison_populations.crime_type)
total_population_data.crime_type = "NA"

# STORE DATA
simulation_tag = "VA_HB2038"
upload_spark_model_inputs(
    "recidiviz-staging",
    simulation_tag,
    outflows_data,
    transitions_data,
    total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/VA/HB_2038/VAHB2038_model_inputs.yaml",
)
# NOTE(review): `year` comes from an enclosing `for year in ...:` loop whose
# header (and the `final_outflows` seed) lies outside the visible region; the
# reconstructed indentation below is a best guess — verify before running.
year_outflows = outflows_data[outflows_data.time_step == year]
for month in range(12):
    # Spread each yearly outflow row across 12 month steps, evenly.
    month_outflows = year_outflows.copy()
    month_outflows.time_step = 12 * month_outflows.time_step - month
    month_outflows.total_population /= 12
    final_outflows = pd.concat([final_outflows, month_outflows])
outflows_data = final_outflows

# TOTAL POPULATION TABLE — repeat each yearly snapshot for all 12 months.
final_pops = pd.DataFrame()
for year in total_population_data.time_step.unique():
    year_pops = total_population_data[total_population_data.time_step == year]
    for month in range(12):
        month_pops = year_pops.copy()
        month_pops.time_step = 12 * month_pops.time_step - month
        final_pops = pd.concat([final_pops, month_pops])
total_population_data = final_pops

# STORE DATA
# TODO(#99999): fill in `state` and `primary_compartment`
simulation_tag = "OH_SB3_prong2"
upload_spark_model_inputs(
    "recidiviz-staging",
    simulation_tag,
    outflows_data,
    transitions_data,
    total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/OH/Prong2/OH_prison_p2_model_inputs.yaml",
)
# YEARLY -> MONTHLY: spread each yearly outflows row across 12 month steps,
# dividing the yearly count evenly.
monthly_chunks = [pd.DataFrame()]
for year in outflows_data.time_step.unique():
    outflows_in_year = outflows_data[outflows_data.time_step == year]
    for month in range(12):
        chunk = outflows_in_year.copy()
        chunk.time_step = 12 * chunk.time_step + month
        chunk.total_population /= 12
        monthly_chunks.append(chunk)
outflows_data = pd.concat(monthly_chunks)

# Hand-built transitions table: three prison sentence lengths (days converted
# to months), a 36-month release window with total return mass 0.23 spread
# evenly, then transitions into release_full.
transitions_data = pd.DataFrame({
    "compartment": ["prison"] * 3 + ["release"] * 36 + ["release", "release_full"],
    "outflow_to": ["release"] * 3 + ["prison"] * 36
    + ["release_full", "release_full"],
    "compartment_duration": [888.2 / 365 * 12, 962.9 / 365 * 12, 1089.9 / 365 * 12]
    + list(range(1, 37))
    + [36, 36],
    "total_population": [1.0] * 3 + [0.23 / 36] * 36 + [0.77, 1],
    "crime_type": ["NA"] * 41,
})

# Empty total-population table for this model.
upload_spark_model_inputs(
    "recidiviz-staging",
    "OK_resentencing",
    outflows_data,
    transitions_data,
    pd.DataFrame(),
    "recidiviz/calculator/modeling/population_projection/state/OK/OK_resentencing/OK_prison_model_inputs.yaml",
)
# NOTE(review): this chunk begins mid-literal — the DataFrame construction it
# closes starts outside the visible region; code left unchanged.
        released,
        1,
        1,
    ],
}
)

# TOTAL POPULATION TABLE (parole to new offense revocation)
total_population_data = pd.read_csv(
    "recidiviz/calculator/modeling/population_projection/state/CA/parole_total_population.csv"
)

# STORE DATA
# NOTE(review): commented-out local-CSV export kept for reference; candidate
# for deletion.
# state = 'CA'
# primary_compartment = 'parole'
# pd.concat([transitions_data, outflows_data, total_population_data], sort=False).to_csv(
#     f'recidiviz/calculator/modeling/population_projection/state/{state}/preprocessed_data_{state}_{primary_compartment}.csv')
# Debug prints of the three tables before upload.
print("OUTFLOWS = ", outflows_data)
print("TRANSITIONS = ", transitions_data)
print("TOTAL POP = ", total_population_data)
upload_spark_model_inputs(
    "recidiviz-staging",
    "CA_parole",
    outflows_data,
    transitions_data,
    total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/CA/PO_incentives/CA_parole_model_inputs.yaml",
)
# Second parole stint: new-crime revocations route to the "_two" revocation
# compartment.
two_parole_transitions.loc[
    two_parole_transitions.outflow_to == "prison_parole_rev_new", "outflow_to"
] = "prison_parole_rev_new_two"
two_parole_transitions.compartment = "parole_two"

# Third parole stint, analogously, routed to the "_three" compartment.
three_parole_transitions.loc[
    three_parole_transitions.outflow_to == "prison_parole_rev_new", "outflow_to"
] = "prison_parole_rev_new_three"
three_parole_transitions.compartment = "parole_three"

# Both revocation compartments release back into parole_three.
two_rev_transitions.outflow_to = "parole_three"
two_rev_transitions.compartment = "prison_parole_rev_new_two"
three_rev_transitions.outflow_to = "parole_three"
three_rev_transitions.compartment = "prison_parole_rev_new_three"

transitions_data = pd.concat([
    transitions_data,
    two_rev_transitions,
    two_parole_transitions,
    three_rev_transitions,
    three_parole_transitions,
])

# NOTE(review): the yaml path below points at the NY mms inputs (and has no
# ".yaml" extension) even though the tag is "va_prison_p1" — this looks like a
# copy-paste slip; confirm the intended VA yaml path before running.
upload_spark_model_inputs(
    "recidiviz-staging",
    "va_prison_p1",
    outflows_data,
    transitions_data,
    total_population_data,
    "recidiviz/calculator/modeling/population_projection/state/NY/mms/ny_state_prison_model_inputs",
)