def parse(location: str, filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the FL report into its county-level and facility-level tables.

    Both tables get a fips column via `fips.add_column_to_df` (looked up
    against Florida) and are stamped with the report date parsed from
    `filename` plus monthly aggregation window / report frequency.

    Returns a mapping from aggregate table class to its DataFrame.
    """
    _setup()

    county_df = _parse_county_table(location, filename)
    county_df = fips.add_column_to_df(
        county_df, county_df.county_name, us.states.FL)

    # TODO(#689): Also set the facility_fips
    facility_df = _parse_facility_table(location, filename)
    # Facilities are matched using a county-like name derived from the
    # facility name (see _pretend_facility_is_county).
    pseudo_counties = facility_df.facility_name.apply(
        _pretend_facility_is_county)
    facility_df = fips.add_column_to_df(
        facility_df, pseudo_counties, us.states.FL)

    tables = {
        FlCountyAggregate: county_df,
        FlFacilityAggregate: facility_df,
    }

    report_date = _parse_date(filename)
    for frame in tables.values():
        frame['report_date'] = report_date
        frame['aggregation_window'] = enum_strings.monthly_granularity
        frame['report_frequency'] = enum_strings.monthly_granularity

    return tables
def testCountyNotInState_RaisesFipsMergingError(self):
    # Arrange: a county name paired with a state it does not belong to.
    frame = pd.DataFrame({"county": ["Jo Daviess County"]})
    incorrect_state = us.states.FL

    # Act/Assert: the lookup must fail rather than produce a bogus fips.
    with self.assertRaises(FipsMergingError):
        fips.add_column_to_df(frame, frame.county, incorrect_state)
def testStateWithNoFipsMappings_RaisesFipsMergingError(self):
    # Arrange: a stand-in state object exposing only a `fips` attribute.
    class FakeState:
        # Non-existing state fips maps to no county fips
        fips = "123456789"

    frame = pd.DataFrame({"county": ["Jo Daviess County"]})

    # Act/Assert: the class itself (not an instance) is passed as the state.
    with self.assertRaises(FipsMergingError):
        fips.add_column_to_df(frame, frame.county, FakeState)
def testStateWithNoFipsMappings_RaisesFipsMergingError(self) -> None:
    # Arrange: build a fake state whose fips code does not exist, so it
    # maps to no county fips.
    fake_state = us.states.State(
        abbr="FS",
        fips="123456789",
        name="Fake State",
    )
    frame = pd.DataFrame({"county": ["Jo Daviess County"]})

    # Act/Assert
    with self.assertRaises(FipsMergingError):
        fips.add_column_to_df(frame, frame.county, fake_state)
def _parse_tab_2(filename: str):
    """Parses the second tab (sheet index 1) of the PA aggregate report.

    The sheet is read wide (one row per county, one column per report date)
    and reshaped to long form with columns `report_date`, `county_name` and
    `pre_sentenced_population`, then a fips column is added for PA.
    """
    sheet = pd.read_excel(filename, sheet_name=1, header=1)

    # Drop Totals footer (the last four rows)
    body = sheet[:-4]

    # Name the first column, index by it, label the column axis, then
    # collapse every date column into its own row.
    first_column = body.columns[0]
    long_form = (
        body.rename(columns={first_column: 'county_name'})
        .set_index('county_name')
        .rename_axis('report_date', axis='columns')
        .unstack()
        .rename('pre_sentenced_population')
        .reset_index()
    )

    long_form['county_name'] = long_form['county_name'].str.rstrip(' ')
    long_form['report_date'] = long_form['report_date'].dt.date
    long_form['pre_sentenced_population'] = _to_numeric(
        long_form['pre_sentenced_population'])

    long_form = fips.add_column_to_df(
        long_form, long_form['county_name'], us.states.PA)

    long_form['aggregation_window'] = enum_strings.daily_granularity
    long_form['report_frequency'] = enum_strings.quarterly_granularity

    return long_form
def _parse_tab_2(filename: str):
    """Parses the second tab (sheet index 1) of the PA aggregate report.

    The sheet is read wide (one row per county, one column per report date)
    and reshaped to long form with columns `report_date`, `county_name` and
    `pre_sentenced_population`, then a fips column is added for PA.
    """
    sheet = pd.read_excel(filename, sheet_name=1, header=1, engine="openpyxl")

    # Drop Totals footer (the last four rows)
    body = sheet[:-4]

    # Name the first column, index by it, label the column axis, then
    # collapse every date column into its own row.
    first_column = body.columns[0]
    long_form = (
        body.rename(columns={first_column: "county_name"})
        .set_index("county_name")
        .rename_axis("report_date", axis="columns")
        .unstack()
        .rename("pre_sentenced_population")
        .reset_index()
    )

    long_form["county_name"] = long_form["county_name"].str.rstrip(" ")
    long_form["report_date"] = long_form["report_date"].dt.date
    long_form["pre_sentenced_population"] = _to_numeric(
        long_form["pre_sentenced_population"])

    long_form = fips.add_column_to_df(
        long_form, long_form["county_name"], us.states.PA)

    long_form["aggregation_window"] = enum_strings.daily_granularity
    long_form["report_frequency"] = enum_strings.quarterly_granularity

    return long_form
def parse(filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the WV report and return it keyed by WvFacilityAggregate.

    Adds a fips column based on the `county` column and marks both the
    aggregation window and the report frequency as daily.
    """
    table = _parse_table(filename)
    table = fips.add_column_to_df(table, table["county"], us.states.WV)  # type: ignore

    for column in ("aggregation_window", "report_frequency"):
        table[column] = enum_strings.daily_granularity

    return {WvFacilityAggregate: table}
def parse(_, filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the CA report and return it keyed by CaFacilityAggregate.

    The first positional argument is accepted but unused. Fips codes are
    looked up from a county-like name derived from each jurisdiction name
    (see _pretend_jurisdiction_is_county).
    """
    facility_df = _parse_table(filename)

    pseudo_counties = facility_df.jurisdiction_name.map(
        _pretend_jurisdiction_is_county)
    facility_df = fips.add_column_to_df(
        facility_df, pseudo_counties, us.states.CA)

    facility_df['aggregation_window'] = enum_strings.monthly_granularity
    facility_df['report_frequency'] = enum_strings.monthly_granularity

    return {CaFacilityAggregate: facility_df}
def parse(location: str, filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the TX report and return it keyed by TxCountyAggregate.

    The report date is parsed from `filename` and is both passed to the
    table parser and written to the `report_date` column.
    """
    report_date = _parse_date(filename)
    county_df = _parse_table(location, filename, report_date)

    # Fips codes are looked up from a county-like name derived from each
    # facility name (see _pretend_facility_is_county).
    pseudo_counties = county_df.facility_name.map(_pretend_facility_is_county)
    county_df = fips.add_column_to_df(county_df, pseudo_counties, us.states.TX)

    county_df['report_date'] = report_date
    county_df['aggregation_window'] = enum_strings.daily_granularity
    county_df['report_frequency'] = enum_strings.monthly_granularity

    return {TxCountyAggregate: county_df}
def parse(location: str, filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the GA report and return it keyed by GaCountyAggregate.

    Adds a fips column, the report date parsed from `filename`, a daily
    aggregation window and a monthly report frequency.
    """
    county_df = _parse_table(location, filename)

    # Match each sanitized county_name to a county fips
    sanitized_names = county_df.county_name.map(_sanitize_county_name)
    county_df = fips.add_column_to_df(county_df, sanitized_names, us.states.GA)

    county_df['report_date'] = _parse_date(filename)
    county_df['aggregation_window'] = enum_strings.daily_granularity
    county_df['report_frequency'] = enum_strings.monthly_granularity

    return {GaCountyAggregate: county_df}
def parse(filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the KY report and return it keyed by KyFacilityAggregate.

    Adds a fips column, the report date parsed from `filename`, a daily
    aggregation window and a weekly report frequency.
    """
    facility_df = _parse_table(filename)

    # Match each facility to a county fips via a county-like name derived
    # from facility_name (see _pretend_facility_is_county).
    pseudo_counties = facility_df.facility_name.map(_pretend_facility_is_county)
    facility_df = fips.add_column_to_df(
        facility_df, pseudo_counties, us.states.KY)

    facility_df["report_date"] = parse_date(filename)
    facility_df["aggregation_window"] = enum_strings.daily_granularity
    facility_df["report_frequency"] = enum_strings.weekly_granularity

    return {KyFacilityAggregate: facility_df}
def parse(location: str, filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the NY report and return it keyed by NyFacilityAggregate.

    Runs `_setup()` first, then adds a fips column and marks both the
    aggregation window and the report frequency as monthly.
    """
    _setup()

    facility_df = _parse_table(location, filename)

    # Match each facility to a county fips via a county-like name derived
    # from facility_name (see _pretend_facility_is_county).
    pseudo_counties = facility_df.facility_name.map(_pretend_facility_is_county)
    facility_df = fips.add_column_to_df(
        facility_df, pseudo_counties, us.states.NY)

    facility_df['aggregation_window'] = enum_strings.monthly_granularity
    facility_df['report_frequency'] = enum_strings.monthly_granularity

    return {NyFacilityAggregate: facility_df}
def parse(filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the MA report and return it keyed by MaFacilityAggregate.

    Adds a fips column based on the `county` column, the report date parsed
    from `filename`, a daily aggregation window and a weekly report
    frequency.
    """
    facility_df = _parse_table(filename)
    report_date = _parse_date(filename)

    facility_df = fips.add_column_to_df(facility_df, facility_df.county, us.states.MA)  # type: ignore

    facility_df["report_date"] = report_date
    facility_df["aggregation_window"] = enum_strings.daily_granularity
    facility_df["report_frequency"] = enum_strings.weekly_granularity

    return {MaFacilityAggregate: facility_df}
def _parse_tab_1(filename: str) -> pd.DataFrame:
    """Parses the first tab in the PA aggregate report.

    Reads sheet 0, trims the sub-header row and totals footer, keeps and
    renames the columns of interest, cleans cell text, converts values with
    `_to_numeric`, and adds report date, fips and yearly granularity columns.
    """
    # Source-header regex -> output column name (insertion order preserved).
    header_patterns = {
        r"County Name": "facility_name",
        r"Bed Capacity": "bed_capacity",
        r".*Community Corrections Beds.*":
            "work_release_community_corrections_beds",
        r".*In-House Daily Pop.*": "in_house_adp",
        r".*Housed Elsewhere Daily Pop.*": "housed_elsewhere_adp",
        r".*In-House Work Release.*": "work_release_adp",
        r"Admissions": "admissions",
        r"Discharge": "discharge",
    }

    # Parse everything directly to allow us to correctly map "N/A" and "N/R"
    sheet = pd.read_excel(
        filename,
        sheet_name=0,
        header=1,
        keep_default_na=False,
        engine="openpyxl",
    )

    # Drop "F/T" and "P/T" line
    body = sheet[1:]
    # Drop Totals footer
    body = body[:-9]

    def _strip_trailing_spaces(name):
        return name.rstrip(" ")

    body.columns = body.columns.map(_strip_trailing_spaces)

    selected = aggregate_ingest_utils.rename_columns_and_select(
        body, header_patterns, use_regex=True)

    # Some cells have extra '*'
    cleaned = selected.applymap(lambda cell: str(cell).rstrip(" *"))
    result = cleaned.apply(_to_numeric)

    result["report_date"] = _report_date_tab_1(filename)

    result = fips.add_column_to_df(
        result, result["facility_name"], us.states.PA)

    result["aggregation_window"] = enum_strings.yearly_granularity
    result["report_frequency"] = enum_strings.yearly_granularity

    return result.reset_index(drop=True)
def _parse_tab_1(filename: str) -> pd.DataFrame:
    """Parses the first tab in the PA aggregate report.

    Reads sheet 0, trims the sub-header row and totals footer, keeps and
    renames the columns of interest, cleans cell text, converts values with
    `_to_numeric`, and adds report date, fips and yearly granularity columns.
    """
    # Source-header regex -> output column name (insertion order preserved).
    header_patterns = {
        r'County Name': 'facility_name',
        r'Bed Capacity': 'bed_capacity',
        r'.*Community Corrections Beds.*':
            'work_release_community_corrections_beds',
        r'.*In-House Daily Pop.*': 'in_house_adp',
        r'.*Housed Elsewhere Daily Pop.*': 'housed_elsewhere_adp',
        r'.*In-House Work Release.*': 'work_release_adp',
        r'Admissions': 'admissions',
        r'Discharge': 'discharge',
    }

    # Parse everything directly to allow us to correctly map "N/A" and "N/R"
    sheet = pd.read_excel(
        filename, sheet_name=0, header=1, keep_default_na=False)

    # Drop "F/T" and "P/T" line
    body = sheet[1:]
    # Drop Totals footer
    body = body[:-9]

    def _strip_trailing_spaces(name):
        return name.rstrip(' ')

    body.columns = body.columns.map(_strip_trailing_spaces)

    selected = aggregate_ingest_utils.rename_columns_and_select(
        body, header_patterns, use_regex=True)

    # Some cells have extra '*'
    cleaned = selected.applymap(lambda cell: str(cell).rstrip(' *'))
    result = cleaned.apply(_to_numeric)

    result['report_date'] = _report_date_tab_1(filename)

    result = fips.add_column_to_df(
        result, result['facility_name'], us.states.PA)

    result['aggregation_window'] = enum_strings.yearly_granularity
    result['report_frequency'] = enum_strings.yearly_granularity

    return result.reset_index(drop=True)
def parse(location: str, filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse a TN jail population report.

    There are two types of reports, total jail population and female jail
    population. The reports are very similar, but need to be handled
    slightly differently; a report is treated as the female variant when
    'female' appears in `filename`.

    Returns the table keyed by TnFacilityFemaleAggregate for female
    reports, otherwise by TnFacilityAggregate.
    """
    is_female = 'female' in filename
    report_date = _parse_date(filename)

    facility_df = _parse_table(location, filename, is_female, report_date)

    # Fips codes are looked up from a county-like name derived from each
    # facility name (see _pretend_facility_is_county).
    pseudo_counties = facility_df.facility_name.apply(
        _pretend_facility_is_county)
    facility_df = fips.add_column_to_df(
        facility_df, pseudo_counties, us.states.TN)

    facility_df['report_date'] = report_date
    facility_df['aggregation_window'] = enum_strings.daily_granularity
    facility_df['report_frequency'] = enum_strings.monthly_granularity

    if is_female:
        return {TnFacilityFemaleAggregate: facility_df}
    return {TnFacilityAggregate: facility_df}
def testValidCountyNames_SanitizesFieldsAndFuzzyJoinsFips(self):
    # Arrange
    county_names = [
        'DuPage',  # No County suffix
        'Efingham County',  # Spelled incorrect: Efingham -> Effingham
        'Jo Daviess County',  # Exact match
    ]
    subject = pd.DataFrame({'county': county_names})

    # Act
    result = fips.add_column_to_df(subject, subject.county, us.states.IL)

    # Assert: original names are kept and a matching fips column is added.
    expected_fips = [
        _DUPAGE_COUNTY_FIPS,
        _EFFINGHAM_COUNTY_FIPS,
        _JO_DAVIESS_COUNTY_FIPS,
    ]
    expected_result = pd.DataFrame({
        'county': list(county_names),
        'fips': expected_fips,
    })
    assert_frame_equal(result, expected_result)
def testValidCountyNames_SanitizesFieldsAndFuzzyJoinsFips(self) -> None:
    # Arrange
    county_names = [
        "DuPage",  # No County suffix
        "Efingham County",  # Spelled incorrect: Efingham -> Effingham
        "Jo Daviess County",  # Exact match
    ]
    subject = pd.DataFrame({"county": county_names})

    # Act
    result = fips.add_column_to_df(subject, subject.county, us.states.IL)

    # Assert: original names are kept and a matching fips column is added.
    expected_fips = [
        _DUPAGE_COUNTY_FIPS,
        _EFFINGHAM_COUNTY_FIPS,
        _JO_DAVIESS_COUNTY_FIPS,
    ]
    expected_result = pd.DataFrame({
        "county": list(county_names),
        "fips": expected_fips,
    })
    assert_frame_equal(result, expected_result)
def _parse_table(filename: str) -> pd.DataFrame:
    """
    Parse the Colorado jail data csv by turning the data into wide format
    consistent with other aggregate data integration efforts.

    The csv is read with cp1252 encoding, a fips column is added from the
    ``County`` column, and a ``date_collected`` is imputed from the
    ``Qtr``/``QtrYear`` columns. Per-measure columns are then spread out so
    that each (date_collected, County) pair has suffixed columns for every
    value of ``Measure``, plus an ``na_message`` column concatenating the
    "Not Available" notes for that pair.

    :param filename: path of the Colorado csv to read
    :return: parsed colorado df
    """
    data = pd.read_csv(filename, encoding="cp1252")
    county_names = data.County
    data = fips.add_column_to_df(data, county_names, us.states.CO)  # type: ignore

    def label_date(row):
        """Impute date_collected based on collection quarter"""
        md = {1: "1/1", 2: "4/1", 3: "7/1", 4: "10/1"}[row["Qtr"]]
        return f"{md}/{row['QtrYear']}"

    data["date_collected"] = data.apply(label_date, axis=1)
    data["date_collected"] = pd.to_datetime(data["date_collected"])
    # NOTE: the positional slices below (`:9`, `11:21`) depend on the exact
    # column layout, including this extra `datecounty` column.
    data = data.assign(
        datecounty=data["date_collected"].dt.strftime("%Y-%m-%d")
        + data["County"])

    def move_column_inplace(df, col, pos):
        """Move newly created columns to the front of the dataset"""
        col = df.pop(col)
        df.insert(pos, col.name, col)

    move_column_inplace(data, "date_collected", 0)
    move_column_inplace(data, "fips", 0)

    # We use the unique county jail and date collected as a unique identifier
    # for each column in the final dataset.
    data = data.set_index(["date_collected", "County"])

    # create head, a df that keeps all columns independent from the "measures"
    # used to categorize the data, such as the first 11 columns, i.e. beds,
    # deaths) for each unique county jail at a specific date. we will
    # concatenate all of the columns that are different for different measures
    # to the end of the head df's
    # (two of those leading columns are now the index, hence `:9`.)
    head = data.iloc[:, :9].drop_duplicates()

    # Create a tail df for each type of measure. Concatenate each of the
    # tail df's for each measure to the head df:
    for measure in data.Measure.unique():
        num_inmates_df = data[data.Measure == measure]
        tail = num_inmates_df.iloc[:, 11:21].add_suffix(
            "_" + measure).drop_duplicates()
        head = pd.concat([head, tail], axis=1, join="outer")

    # add a na_message column, which accumulates all na messages for any measure
    # in the CO dataset per county.
    # `np.nan` (not the `np.NaN` alias, which was removed in NumPy 2.0) marks
    # rows that have no "Not Available" note.
    data = data.assign(na_message=np.where(
        data["Not Available"].isnull(),
        np.nan,
        "**" + data.Measure + "**: " + data["Not Available"] + ", ",
    ))
    na_mappings = (
        data.fillna("")
        .groupby(["date_collected", "County"])
        .agg({"na_message": "".join})
    )

    final = pd.concat([head, na_mappings], axis=1, join="inner")

    # turn off multi-indexing
    final = final.reset_index()

    # clean column names; the patterns are plain literals, so say so
    # explicitly rather than rely on the pandas-version-dependent default.
    final.columns = final.columns.str.replace(" - ", "_", regex=False)
    final.columns = final.columns.str.replace(" ", "_", regex=False)
    final.columns = final.columns.str.replace("-", "_", regex=False)
    final.columns = final.columns.str.lower()

    return final
def add_fips_to_state_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` with a fips column added for its state.

    The state is resolved from `df.name` via `StateCode`; presumably `df` is
    one group of a groupby keyed on state code (verify against the caller).
    The input frame is copied before being handed to
    `fips.add_column_to_df`.
    """
    state_code = StateCode(df.name)
    state_df = df.copy()
    county_names = state_df[TEMP_COUNTY_NAME_COL]
    return fips.add_column_to_df(
        state_df, county_names, state_code.get_state())