def testRenameWithRegex(self) -> None:
    """rename_columns_and_select with use_regex=True renames columns by pattern."""
    # Arrange
    input_df = pd.DataFrame(
        {
            "County": ["Anderson", "Andrews", "Angelina"],
            "PRETRIAL": [90, 20, 105],
            "CON. Felons": [2, 11, 26],
        }
    )
    pattern_to_name = {
        r".*Cou.*": "facility_name",
        r"PRETRIAL": "pretrial_adp",
        r"CON\. Felons": "convicted_adp",
    }

    # Act
    renamed = aggregate_ingest_utils.rename_columns_and_select(
        input_df, pattern_to_name, use_regex=True
    )

    # Assert
    assert_frame_equal(
        renamed,
        pd.DataFrame(
            {
                "facility_name": ["Anderson", "Andrews", "Angelina"],
                "pretrial_adp": [90, 20, 105],
                "convicted_adp": [2, 11, 26],
            }
        ),
    )
def _format_df(df: pd.DataFrame) -> pd.DataFrame:
    """Format the DataFrame to match the schema.

    Transposes the raw facility table, maps the report's row labels onto
    schema column names, parses the report dates and the integer counts, and
    tags every row with the facility name taken from the source table.
    """
    result = _transpose_df(df)
    result = aggregate_ingest_utils.rename_columns_and_select(result, {
        'report_date': 'report_date',
        'Census': 'census',
        'In House': 'in_house',
        'Boarded In': 'boarded_in',
        'Boarded Out': 'boarded_out',
        '- Sentenced': 'sentenced',
        '- Civil': 'civil',
        '- Federal': 'federal',
        '- Technical Parole Violators': 'technical_parole_violators',
        '- State Readies': 'state_readies',
        '- Other Unsentenced **': 'other_unsentenced'
    })
    result['report_date'] = result['report_date'].apply(_parse_report_date)
    # Every column other than report_date holds a count; locale.atoi parses
    # locale-formatted integer strings (presumably thousands-separated —
    # confirm against a sample report).
    for column_name in set(result.columns) - {'report_date'}:
        result[column_name] = result[column_name].apply(locale.atoi)
    # All rows of the source table belong to a single facility.
    result['facility_name'] = df['FACILITY'].iloc[0]
    return result
def _parse_county_table(filename: str) -> pd.DataFrame:
    """Parses the FL County - Table 1 in the PDF.

    Reads the table spanning pages 3-4, normalizes headers, renames columns
    to schema names, parses integer counts and reported dates.
    """
    [result] = tabula.read_pdf(
        filename,
        pages=[3, 4],
        multiple_tables=False,
        pandas_options={"skipfooter": 1, "engine": "python"},
    )
    # PDF extraction embeds carriage returns inside the header labels.
    result.columns = [c.replace("\r", " ") for c in result.columns]
    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Florida County": "county_name",
            "County Population": "county_population",
            "Average Daily Population (ADP)": "average_daily_population",
            "*Date Reported": "date_reported",
        },
    )
    # Drop rows from header on second table (page 4)
    result = result[~result["county_name"].isin(("Florida", "County"))]
    for column_name in {"county_population", "average_daily_population"}:
        result[column_name] = result[column_name].apply(locale.atoi)
    # Sometimes extra notes are indicated in the date reported field.
    # Bug fix: pass regex=True explicitly — since pandas 1.4 Series.str.replace
    # defaults to literal matching, so the r"^\*\*$" pattern would otherwise
    # never match and the '**' markers would survive into pd.to_datetime.
    result["date_reported"] = result["date_reported"].str.replace(
        r"^\*\*$", "", regex=True
    )
    result["date_reported"] = pd.to_datetime(result["date_reported"])
    return result
def _parse_county_table(location: str, filename: str) -> pd.DataFrame:
    """Parses the FL County - Table 1 in the PDF.

    Reads the table split across pages 3 and 4, collapses the two-row header,
    renames columns to schema names, and parses counts and dates.
    """
    part1 = read_pdf(location, filename, pages=[3], pandas_options={
        'header': [0, 1],
    })
    part2 = read_pdf(
        location, filename, pages=[4],
        pandas_options={
            'header': [0, 1],
            'skipfooter': 1,  # The last row is the total
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })
    # Bug fix: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat with ignore_index=True is the supported equivalent.
    result = pd.concat([part1, part2], ignore_index=True)
    result.columns = aggregate_ingest_utils.collapse_header(result.columns)
    result = aggregate_ingest_utils.rename_columns_and_select(
        result, {
            'Florida County': 'county_name',
            'County Population': 'county_population',
            'Average Daily Population (ADP)': 'average_daily_population',
            '*Date Reported': 'date_reported'
        })
    for column_name in {'county_population', 'average_daily_population'}:
        result[column_name] = result[column_name].apply(locale.atoi)
    result['date_reported'] = pd.to_datetime(result['date_reported'])
    return result
def _format_df(df: pd.DataFrame) -> pd.DataFrame:
    """Format the DataFrame to match the schema.

    Transposes the raw facility table, maps report labels onto schema column
    names, parses dates and counts, and tags each row with the facility name.
    """
    result = _transpose_df(df)
    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "report_date": "report_date",
            "Census": "census",
            "In House": "in_house",
            "Boarded In": "boarded_in",
            "Boarded Out": "boarded_out",
            "- Sentenced": "sentenced",
            "- Civil": "civil",
            "- Federal": "federal",
            "- Technical Parole Violators": "technical_parole_violators",
            "- State Readies": "state_readies",
            "- Other Unsentenced **": "other_unsentenced",
        },
    )
    result["report_date"] = result["report_date"].apply(_parse_report_date)
    # Cell values are heterogeneous: already-numeric entries are kept as int,
    # entries containing '(' are mapped to 0 (presumably parenthesized
    # annotations rather than counts — confirm with a sample report), and the
    # rest are locale-formatted integer strings parsed with locale.atoi.
    for column_name in set(result.columns) - {"report_date"}:
        result[column_name] = result[column_name].apply(
            lambda d: int(d) if isinstance(d, (int, float))
            else 0 if "(" in d
            else locale.atoi(d))
    # All rows of the source table belong to a single facility.
    result["facility_name"] = df["FACILITY"].iloc[0]
    return result
def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the CA aggregate report."""
    # Although the file is downloaded with the '.xls' extension, the contents
    # of the file are in the shape of an HTML file.
    table = pd.read_html(filename, header=0)[0].fillna(0)
    table['report_date'] = table[['Year', 'Month']].apply(
        _last_date_of_month, axis='columns')
    renames = {
        'Jurisdiction': 'jurisdiction_name',
        'Facility': 'facility_name',
        'Total facility ADP': 'average_daily_population',
        'Unsentenced males': 'unsentenced_male_adp',
        'Unsentenced females': 'unsentenced_female_adp',
        'Sentenced males': 'sentenced_male_adp',
        'Sentenced females': 'sentenced_female_adp',
        'report_date': 'report_date'
    }
    table = aggregate_ingest_utils.rename_columns_and_select(table, renames)
    # Everything except these three columns holds integer population counts.
    non_numeric = {'jurisdiction_name', 'facility_name', 'report_date'}
    return aggregate_ingest_utils.cast_columns_to_int(
        table, ignore_columns=non_numeric)
def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the CA aggregate report."""
    # The '.xls' download is actually an HTML document, so parse it as HTML.
    df = pd.read_html(filename, header=0)[0]
    df = df.fillna(0)
    df["report_date"] = df[["Year", "Month"]].apply(
        _last_date_of_month, axis="columns"
    )
    column_map = {
        "Jurisdiction": "jurisdiction_name",
        "Facility": "facility_name",
        "Total facility ADP": "average_daily_population",
        "Unsentenced males": "unsentenced_male_adp",
        "Unsentenced females": "unsentenced_female_adp",
        "Sentenced males": "sentenced_male_adp",
        "Sentenced females": "sentenced_female_adp",
        "report_date": "report_date",
    }
    df = aggregate_ingest_utils.rename_columns_and_select(df, column_map)
    # Cast every population column to int; these three stay as-is.
    ignore = {"jurisdiction_name", "facility_name", "report_date"}
    df = aggregate_ingest_utils.cast_columns_to_int(df, ignore_columns=ignore)
    return df
def testRenameWithRegex(self):
    """Regex-keyed renames select and rename the matching columns."""
    # Arrange
    original = pd.DataFrame({
        'County': ['Anderson', 'Andrews', 'Angelina'],
        'PRETRIAL': [90, 20, 105],
        'CON. Felons': [2, 11, 26],
    })
    patterns = {
        r'.*Cou.*': 'facility_name',
        r'PRETRIAL': 'pretrial_adp',
        r'CON\. Felons': 'convicted_adp'
    }

    # Act
    actual = aggregate_ingest_utils.rename_columns_and_select(
        original, patterns, use_regex=True)

    # Assert
    expected = pd.DataFrame({
        'facility_name': ['Anderson', 'Andrews', 'Angelina'],
        'pretrial_adp': [90, 20, 105],
        'convicted_adp': [2, 11, 26],
    })
    assert_frame_equal(actual, expected)
def _parse_table(location: str, filename: str) -> pd.DataFrame:
    """Parses the last table in the GA PDF.

    Reads the county tables on pages 8-11 with lattice parsing, renames the
    columns we keep to schema names, drops empty rows, and casts counts to int.
    """
    # Set column names since the pdf makes them hard to parse directly
    column_names = [
        'Index',
        'Jurisdiction',
        'Total Number of Inmates In Jail',
        'Jail Capacity',
        'Inmates as % of Capacity',
        'Number of Inmates Sentenced to State [Number]',
        'Number of Inmates Sentenced to State [% of Total]',
        'Number of Inmates Awaiting Trial in Jail [Number]',
        'Number of Inmates Awaiting Trial in Jail [% of Total]',
        'Number of Inmates Serving County Sentence [Number]',
        'Number of Inmates Serving County Sentence [% of Total]',
        'Number of Other Inmates [Number]',
        'Number of Other Inmates [% of Total]'
    ]
    # Tables at the end of the doc contain all data we want to parse
    pages = [8, 9, 10, 11]
    # Use lattice parsing since default parsing fails to parse columns on
    # the right half of the page
    use_lattice = True
    result = read_pdf(
        location, filename, pages=pages, lattice=use_lattice,
        pandas_options={
            'names': column_names,
            'skiprows': _header_on_each_page(),
            'skipfooter': 1,  # The last row is the grand totals
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })
    result = aggregate_ingest_utils.rename_columns_and_select(
        result, {
            'Jurisdiction': 'county_name',
            'Total Number of Inmates In Jail':
                'total_number_of_inmates_in_jail',
            'Jail Capacity': 'jail_capacity',
            'Number of Inmates Sentenced to State [Number]':
                'number_of_inmates_sentenced_to_state',
            'Number of Inmates Awaiting Trial in Jail [Number]':
                'number_of_inmates_awaiting_trial',
            'Number of Inmates Serving County Sentence [Number]':
                'number_of_inmates_serving_county_sentence',
            'Number of Other Inmates [Number]': 'number_of_other_inmates'
        })
    # Tabula may parse extra empty rows
    result = result.dropna()
    # Bug fix: the return value of cast_columns_to_int was previously
    # discarded; other parsers in this codebase reassign it (it returns the
    # converted DataFrame), so without the assignment the cast never applied.
    result = aggregate_ingest_utils.cast_columns_to_int(
        result, ignore_columns={'county_name'})
    return result
def _parse_facility_table(_: str, filename: str) -> pd.DataFrame:
    """Parse the FL County Pretrial Inmate Report - Table 2 in the PDF.

    Reads the table split across pages 5 and 6, renames the kept columns to
    schema names, and converts the count columns to int.
    """
    # Set column names directly since the pdf format makes them hard to parse
    column_names = [
        "Detention Facility Name",
        "Average Daily Population",
        "Number Felony Pretrial",
        "Number Misdemeanor Pretrial",
        "Total Percent Pretrial",
    ]
    part1 = tabula.read_pdf(
        filename,
        pages=[5],
        pandas_options={
            "skiprows": [0, 1, 2],
            "names": column_names,
        },
    )
    part2 = tabula.read_pdf(
        filename,
        pages=[6],
        pandas_options={
            "skiprows": [0, 1, 2],
            "usecols": [0, 2, 3, 4, 5],  # Column 1 contains no data
            "names": column_names,
            "skipfooter": 2,  # The last 2 rows are the totals
            "engine": "python",  # Only python engine supports 'skipfooter'
        },
    )
    # Bug fix: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat with ignore_index=True is the supported equivalent.
    result = pd.concat([part1, part2], ignore_index=True)
    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Detention Facility Name": "facility_name",
            "Average Daily Population": "average_daily_population",
            "Number Felony Pretrial": "number_felony_pretrial",
            "Number Misdemeanor Pretrial": "number_misdemeanor_pretrial",
        },
    )
    result["average_daily_population"] = (
        result["average_daily_population"].apply(_use_stale_adp).apply(_to_int)
    )
    for column_name in {"number_felony_pretrial", "number_misdemeanor_pretrial"}:
        result[column_name] = result[column_name].apply(_to_int)
    return result
def _parse_tab_1(filename: str) -> pd.DataFrame:
    """Parses the first tab in the PA aggregate report."""
    # Keys are regex patterns matched against the sheet's header cells.
    column_names = {
        r"County Name": "facility_name",
        r"Bed Capacity": "bed_capacity",
        r".*Community Corrections Beds.*": "work_release_community_corrections_beds",
        r".*In-House Daily Pop.*": "in_house_adp",
        r".*Housed Elsewhere Daily Pop.*": "housed_elsewhere_adp",
        r".*In-House Work Release.*": "work_release_adp",
        r"Admissions": "admissions",
        r"Discharge": "discharge",
    }
    # Parse everything directly to allow us to correctly map "N/A" and "N/R"
    keep_default_na = False
    df = pd.read_excel(
        filename,
        sheet_name=0,
        header=1,
        keep_default_na=keep_default_na,
        engine="openpyxl",
    )
    # Drop "F/T" and "P/T" line
    df = df[1:]
    # Drop Totals footer
    df = df[:-9]
    # Header cells may carry trailing spaces; strip before regex matching.
    df.columns = df.columns.map(lambda name: name.rstrip(" "))
    df = aggregate_ingest_utils.rename_columns_and_select(df, column_names, use_regex=True)
    # Some cells have extra '*'
    df = df.applymap(lambda e: str(e).rstrip(" *"))
    df = df.apply(_to_numeric)
    df["report_date"] = _report_date_tab_1(filename)
    # Attach county FIPS codes keyed by the facility (county) name.
    df = fips.add_column_to_df(df, df["facility_name"], us.states.PA)
    df["aggregation_window"] = enum_strings.yearly_granularity
    df["report_frequency"] = enum_strings.yearly_granularity
    return df.reset_index(drop=True)
def _parse_facility_table(location: str, filename: str) -> pd.DataFrame:
    """Parse the FL County Pretrial Inmate Report - Table 2 in the PDF.

    Reads the table split across pages 5 and 6, renames the kept columns to
    schema names, and converts the count columns to int.
    """
    # Set column names directly since the pdf format makes them hard to parse
    column_names = [
        'Detention Facility Name', 'Average Daily Population',
        'Number Felony Pretrial', 'Number Misdemeanor Pretrial',
        'Total Percent Pretrial']
    part1 = read_pdf(
        location, filename, pages=[5],
        pandas_options={
            'skiprows': [0, 1, 2],
            'names': column_names,
        })
    part2 = read_pdf(
        location, filename, pages=[6],
        pandas_options={
            'skiprows': [0, 1, 2],
            'usecols': [0, 2, 3, 4, 5],  # Column 1 contains no data
            'names': column_names,
            'skipfooter': 2,  # The last 2 rows are the totals
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })
    # Bug fix: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat with ignore_index=True is the supported equivalent.
    result = pd.concat([part1, part2], ignore_index=True)
    result = aggregate_ingest_utils.rename_columns_and_select(result, {
        'Detention Facility Name': 'facility_name',
        'Average Daily Population': 'average_daily_population',
        'Number Felony Pretrial': 'number_felony_pretrial',
        'Number Misdemeanor Pretrial': 'number_misdemeanor_pretrial'
    })
    result['average_daily_population'] = result[
        'average_daily_population'].apply(_use_stale_adp).apply(_to_int)
    for column_name in {'number_felony_pretrial', 'number_misdemeanor_pretrial'}:
        result[column_name] = result[column_name].apply(_to_int)
    return result
def _parse_tab_1(filename: str) -> pd.DataFrame:
    """Parses the first tab in the PA aggregate report."""
    # Keys are regex patterns matched against the sheet's header cells.
    column_names = {
        r'County Name': 'facility_name',
        r'Bed Capacity': 'bed_capacity',
        r'.*Community Corrections Beds.*':
            'work_release_community_corrections_beds',
        r'.*In-House Daily Pop.*': 'in_house_adp',
        r'.*Housed Elsewhere Daily Pop.*': 'housed_elsewhere_adp',
        r'.*In-House Work Release.*': 'work_release_adp',
        r'Admissions': 'admissions',
        r'Discharge': 'discharge'
    }
    # Parse everything directly to allow us to correctly map "N/A" and "N/R"
    keep_default_na = False
    df = pd.read_excel(filename, sheet_name=0, header=1,
                       keep_default_na=keep_default_na)
    # Drop "F/T" and "P/T" line
    df = df[1:]
    # Drop Totals footer
    df = df[:-9]
    # Header cells may carry trailing spaces; strip before regex matching.
    df.columns = df.columns.map(lambda name: name.rstrip(' '))
    df = aggregate_ingest_utils.rename_columns_and_select(df, column_names,
                                                          use_regex=True)
    # Some cells have extra '*'
    df = df.applymap(lambda e: str(e).rstrip(' *'))
    df = df.apply(_to_numeric)
    df['report_date'] = _report_date_tab_1(filename)
    # Attach county FIPS codes keyed by the facility (county) name.
    df = fips.add_column_to_df(df, df['facility_name'], us.states.PA)
    df['aggregation_window'] = enum_strings.yearly_granularity
    df['report_frequency'] = enum_strings.yearly_granularity
    return df.reset_index(drop=True)
def _parse_facility_table(filename: str) -> pd.DataFrame: """Parse the FL County Pretrial Inmate Report - Table 2 in the PDF.""" # Set column names directly since the pdf format makes them hard to parse column_names = [ "Detention Facility Name", "Average Daily Population", "Number Felony Pretrial", "Number Misdemeanor Pretrial", "Total Percent Pretrial", ] [result] = tabula.read_pdf( filename, pages=[5, 6], multiple_tables=False, pandas_options={ "usecols": range(1, 6), "names": column_names, "skiprows": [0], "skipfooter": 2, "engine": "python", }, ) result = aggregate_ingest_utils.rename_columns_and_select( result, { "Detention Facility Name": "facility_name", "Average Daily Population": "average_daily_population", "Number Felony Pretrial": "number_felony_pretrial", "Number Misdemeanor Pretrial": "number_misdemeanor_pretrial", }, ) result = result.replace("Detention\rFacility\rName", None).dropna(how="all") result["average_daily_population"] = ( result["average_daily_population"].apply(_use_stale_adp).apply(_to_int) ) for column_name in {"number_felony_pretrial", "number_misdemeanor_pretrial"}: result[column_name] = result[column_name].apply(_to_int) return result
def _parse_county_table(_: str, filename: str) -> pd.DataFrame:
    """Parses the FL County - Table 1 in the PDF.

    Reads the table split across pages 3 and 4, collapses the two-row header,
    renames columns to schema names, and parses counts and dates.
    """
    part1 = tabula.read_pdf(
        filename,
        pages=[3],
        pandas_options={
            "header": [0, 1],
        },
    )
    part2 = tabula.read_pdf(
        filename,
        pages=[4],
        pandas_options={
            "header": [0, 1],
            "skipfooter": 1,  # The last row is the total
            "engine": "python",  # Only python engine supports 'skipfooter'
        },
    )
    # Bug fix: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat with ignore_index=True is the supported equivalent.
    result = pd.concat([part1, part2], ignore_index=True)
    result.columns = aggregate_ingest_utils.collapse_header(result.columns)
    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Florida County": "county_name",
            "County Population": "county_population",
            "Average Daily Population (ADP)": "average_daily_population",
            "*Date Reported": "date_reported",
        },
    )
    for column_name in {"county_population", "average_daily_population"}:
        result[column_name] = result[column_name].apply(locale.atoi)
    # Sometimes extra notes are indicated in the date reported field.
    # Bug fix: pass regex=True explicitly — since pandas 1.4 Series.str.replace
    # defaults to literal matching, so the r"^\*\*$" pattern would otherwise
    # never match and the '**' markers would survive into pd.to_datetime.
    result["date_reported"] = result["date_reported"].str.replace(
        r"^\*\*$", "", regex=True
    )
    result["date_reported"] = pd.to_datetime(result["date_reported"])
    return result
def _parse_table(_, filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    Reads every page of the report, repairs known layout problems in specific
    report dates, splits the rows into per-county blocks, collapses each block
    into one Male and one Female row, and joins the pieces into a single
    DataFrame with one row per facility.
    """
    whole_df = tabula.read_pdf(
        filename,
        pages='all',
        lattice=True
    )
    # Hand-tuned fixups for specific reports whose rows tabula parses shifted
    # one column to the right. Row indices are hard-coded from manual
    # inspection of those PDFs.
    if filename.endswith('04-16-20.pdf'):
        whole_df[323:331] = whole_df[323:331].shift(-1, axis='columns')
    elif filename.endswith('07-09-20.pdf'):
        whole_df.loc[432] = whole_df.iloc[432].shift(-1)
        whole_df.loc[434:436] = whole_df.loc[434:436].shift(-1, axis='columns')
        whole_df.loc[438] = whole_df.iloc[438].shift(-1)
        whole_df.loc[440] = whole_df.iloc[440].shift(-1)
        whole_df.loc[442:445] = whole_df.loc[442:445].shift(-1, axis='columns')
        whole_df.loc[447:462] = whole_df.loc[447:462].shift(-1, axis='columns')
        whole_df.loc[464:] = whole_df.loc[464:].shift(-1, axis='columns')
        # Hard-coded 'County' values for these rows (presumably restoring
        # data clobbered by the shifts above — confirm against the source PDF).
        whole_df.loc[451, 'County'] = 86
        whole_df.loc[456, 'County'] = 264
        whole_df.loc[461, 'County'] = 52
        whole_df.loc[464, 'County'] = 161
        whole_df.loc[469, 'County'] = 70
        whole_df.loc[472, 'County'] = 204
        whole_df.loc[477, 'County'] = 182
        whole_df.loc[482, 'County'] = 137
        whole_df.loc[487, 'County'] = 45
        whole_df.loc[492, 'County'] = 410
        whole_df.loc[497, 'County'] = 152
        whole_df.loc[500, 'County'] = 95
        whole_df.loc[505, 'County'] = 85
        whole_df.loc[508, 'County'] = 194
        whole_df.loc[513, 'County'] = 72
        whole_df.loc[516, 'County'] = 134
        whole_df.loc[521, 'County'] = 50
        whole_df.loc[524, 'County'] = 63
        whole_df.loc[529, 'County'] = 32
    # Remove totals separate from parsing since it's a variable length
    totals_start_index = np.where(whole_df['Date'].str.contains('Totals'))[0][0]
    whole_df = whole_df[:totals_start_index]
    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = whole_df['County'].astype(str).str.contains('Secure')
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis='columns')
    # Drop repeated header rows from subsequent pages.
    whole_df = whole_df[whole_df['County'].astype(str) != 'County']
    # NOTE(review): reset_index is not assigned, so this statement has no
    # effect — confirm whether it should be whole_df = whole_df.reset_index(...).
    whole_df.reset_index(drop=True)
    whole_df = _shift_headers(whole_df)
    # Column labels contain embedded line breaks from the PDF extraction.
    whole_df.columns = whole_df.columns.str.replace('\n', ' ')
    whole_df.columns = whole_df.columns.str.replace('\r', ' ')
    # Column names can change over time : (
    column_name_map = {
        'CC Eligible Inmates': 'Community Custody Inmates',
    }
    whole_df.columns = [column_name_map[c] if c in column_name_map else c
                        for c in whole_df.columns]
    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(whole_df['Total Jail Beds'].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)
    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # This is a typo in several reports
        if '12/' in df['Federal Inmates'].values:
            df['Federal Inmates'] = df['Federal Inmates'].replace({'12/': '12'})
        # Cast everything to int before summing below
        df = df.fillna(0)
        df = aggregate_ingest_utils.cast_columns_to_int(
            df, ignore_columns={'County', 'Facility Security', 'Inmate Cusody'})
        df['Gender'] = None
        df = _collapse_by_gender_rows(df, 'Male')
        df = _collapse_by_gender_rows(df, 'Female')
        # The first row contains header data for both Male and Female
        df['County'] = df['County'][0]
        df['total_jail_beds'] = df['Total Jail Beds'][0]
        df['reported_population'] = \
            df['Reported Population (Total and Male/Female)'][0]
        df = df[1:]
        dfs_grouped_by_gender.append(df)
    df_by_gender = pd.concat(dfs_grouped_by_gender)
    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender['Gender'] == 'Male']
    female_df = df_by_gender[df_by_gender['Gender'] == 'Female']
    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(female_df, {
        'County': 'facility_name',
        'total_jail_beds': 'total_jail_beds',
        'reported_population': 'reported_population',
    })
    male_df = aggregate_ingest_utils.rename_columns_and_select(male_df, {
        'County': 'facility_name',
        # Since we've grouped by Male, this Reported Population is only Male
        'Reported Population (Total and Male/Female)': 'male_population',
        'Class D Inmates': 'class_d_male_population',
        'Community Custody Inmates': 'community_custody_male_population',
        'Alternative Sentence': 'alternative_sentence_male_population',
        'Controlled Intake': 'controlled_intake_male_population',
        'Parole Violators': 'parole_violators_male_population',
        'Federal Inmates': 'federal_male_population',
    })
    female_df = aggregate_ingest_utils.rename_columns_and_select(female_df, {
        'County': 'facility_name',
        # Since we've grouped by Female, this Reported Population is only Female
        'Reported Population (Total and Male/Female)': 'female_population',
        'Class D Inmates': 'class_d_female_population',
        'Community Custody Inmates': 'community_custody_female_population',
        'Alternative Sentence': 'alternative_sentence_female_population',
        'Controlled Intake': 'controlled_intake_female_population',
        'Parole Violators': 'parole_violators_female_population',
        'Federal Inmates': 'federal_female_population',
    })
    # Join the male and female rows back onto the shared per-facility data.
    result = shared_df.join(male_df.set_index('facility_name'),
                            on='facility_name')
    result = result.join(female_df.set_index('facility_name'),
                         on='facility_name')
    # Hand-patched Lincoln bed count for the 04-16-20 report.
    if filename.endswith('04-16-20.pdf'):
        result.loc[result['facility_name'] == 'Lincoln', 'total_jail_beds'] = 72
    return result.reset_index(drop=True)
def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    Reads every page of the report as a single table, repairs known layout
    problems in specific report dates, splits the rows into per-county
    blocks, collapses each block into one Male and one Female row, and joins
    the pieces into a single DataFrame with one row per facility.
    """
    whole_df = one(
        tabula.read_pdf(filename,
                        pages="all",
                        multiple_tables=False,
                        lattice=True))
    # Hand-tuned fixups for specific reports whose rows tabula parses shifted
    # one column to the right. Row indices are hard-coded from manual
    # inspection of those PDFs.
    if filename.endswith("04-16-20.pdf"):
        whole_df[323:331] = whole_df[323:331].shift(-1, axis="columns")
    elif filename.endswith("07-09-20.pdf"):
        whole_df.loc[432] = whole_df.iloc[432].shift(-1)
        whole_df.loc[434:436] = whole_df.loc[434:436].shift(-1, axis="columns")
        whole_df.loc[438] = whole_df.iloc[438].shift(-1)
        whole_df.loc[440] = whole_df.iloc[440].shift(-1)
        whole_df.loc[442:445] = whole_df.loc[442:445].shift(-1, axis="columns")
        whole_df.loc[447:462] = whole_df.loc[447:462].shift(-1, axis="columns")
        whole_df.loc[464:] = whole_df.loc[464:].shift(-1, axis="columns")
        # Hard-coded 'County' values for these rows (presumably restoring
        # data clobbered by the shifts above — confirm against the source PDF).
        whole_df.loc[451, "County"] = 86
        whole_df.loc[456, "County"] = 264
        whole_df.loc[461, "County"] = 52
        whole_df.loc[464, "County"] = 161
        whole_df.loc[469, "County"] = 70
        whole_df.loc[472, "County"] = 204
        whole_df.loc[477, "County"] = 182
        whole_df.loc[482, "County"] = 137
        whole_df.loc[487, "County"] = 45
        whole_df.loc[492, "County"] = 410
        whole_df.loc[497, "County"] = 152
        whole_df.loc[500, "County"] = 95
        whole_df.loc[505, "County"] = 85
        whole_df.loc[508, "County"] = 194
        whole_df.loc[513, "County"] = 72
        whole_df.loc[516, "County"] = 134
        whole_df.loc[521, "County"] = 50
        whole_df.loc[524, "County"] = 63
        whole_df.loc[529, "County"] = 32
    # Remove totals separate from parsing since it's a variable length
    totals_start_index = np.where(
        whole_df["Date"].str.contains("Totals"))[0][0]
    whole_df = whole_df[:totals_start_index]
    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = whole_df["County"].astype(str).str.contains("Secure")
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis="columns")
    # Drop repeated header rows from subsequent pages.
    whole_df = whole_df[whole_df["County"].astype(str) != "County"]
    # NOTE(review): reset_index is not assigned, so this statement has no
    # effect — confirm whether it should be whole_df = whole_df.reset_index(...).
    whole_df.reset_index(drop=True)
    whole_df = _shift_headers(whole_df)
    # Column labels contain embedded line breaks from the PDF extraction.
    whole_df.columns = whole_df.columns.str.replace("\n", " ")
    whole_df.columns = whole_df.columns.str.replace("\r", " ")
    # Column names can change over time : (
    column_name_map = {
        "CC Eligible Inmates": "Community Custody Inmates",
    }
    whole_df.columns = [
        column_name_map[c] if c in column_name_map else c
        for c in whole_df.columns
    ]
    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(
        whole_df["Total Jail Beds"].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)
    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # This is a typo in several reports
        if "12/" in df["Federal Inmates"].values:
            df["Federal Inmates"] = df["Federal Inmates"].replace(
                {"12/": "12"})
        # OCR/extraction garbage values observed in some reports; map to 0.
        if "yo" in df["Federal Inmates"].values:
            df["Federal Inmates"] = df["Federal Inmates"].replace({"yo": "0"})
        if "pe" in df["Federal Inmates"].values:
            df["Federal Inmates"] = df["Federal Inmates"].replace({"pe": "0"})
        if "(" in df["Reported Population (Total and Male/Female)"].values:
            df["Reported Population (Total and Male/Female)"] = df[
                "Reported Population (Total and Male/Female)"].replace(
                    {"(": "0"})
        # Cast everything to int before summing below
        df = df.fillna(0)
        df = aggregate_ingest_utils.cast_columns_to_int(
            df, ignore_columns={"County", "Facility Security", "Inmate Cusody"})
        df["Gender"] = None
        df = _collapse_by_gender_rows(df, "Male")
        df = _collapse_by_gender_rows(df, "Female")
        # The first row contains header data for both Male and Female
        df["County"] = df["County"][0]
        df["total_jail_beds"] = df["Total Jail Beds"][0]
        df["reported_population"] = df[
            "Reported Population (Total and Male/Female)"][0]
        df = df[1:]
        dfs_grouped_by_gender.append(df)
    df_by_gender = pd.concat(dfs_grouped_by_gender)
    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender["Gender"] == "Male"]
    female_df = df_by_gender[df_by_gender["Gender"] == "Female"]
    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df,
        {
            "County": "facility_name",
            "total_jail_beds": "total_jail_beds",
            "reported_population": "reported_population",
        },
    )
    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_df,
        {
            "County": "facility_name",
            # Since we've grouped by Male, this Reported Population is only Male
            "Reported Population (Total and Male/Female)": "male_population",
            "Class D Inmates": "class_d_male_population",
            "Community Custody Inmates": "community_custody_male_population",
            "Alternative Sentence": "alternative_sentence_male_population",
            "Controlled Intake": "controlled_intake_male_population",
            "Parole Violators": "parole_violators_male_population",
            "Federal Inmates": "federal_male_population",
        },
    )
    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df,
        {
            "County": "facility_name",
            # Since we've grouped by Female, this Reported Population is only Female
            "Reported Population (Total and Male/Female)": "female_population",
            "Class D Inmates": "class_d_female_population",
            "Community Custody Inmates": "community_custody_female_population",
            "Alternative Sentence": "alternative_sentence_female_population",
            "Controlled Intake": "controlled_intake_female_population",
            "Parole Violators": "parole_violators_female_population",
            "Federal Inmates": "federal_female_population",
        },
    )
    # Join the male and female rows back onto the shared per-facility data.
    result = shared_df.join(male_df.set_index("facility_name"), on="facility_name")
    result = result.join(female_df.set_index("facility_name"), on="facility_name")
    # Hand-patched Lincoln bed count for the 04-16-20 report.
    if filename.endswith("04-16-20.pdf"):
        result.loc[result["facility_name"] == "Lincoln", "total_jail_beds"] = 72
    return result.reset_index(drop=True)
def _parse_table(filename: str, report_date: datetime.date) -> pd.DataFrame:
    """Parses the last table in the GA PDF.

    Chooses a parsing strategy based on the report: a manual multi-table
    concat for the jun_19 report, a fixed row-skip pattern for reports dated
    2020-11-05 or later, and the page-header skip list otherwise.
    """
    # Set column names since the pdf makes them hard to parse directly
    column_names = [
        "Index",
        "Jurisdiction",
        "Total Number of Inmates In Jail",
        "Jail Capacity",
        "Inmates as % of Capacity",
        "Number of Inmates Sentenced to State [Number]",
        "Number of Inmates Sentenced to State [% of Total]",
        "Number of Inmates Awaiting Trial in Jail [Number]",
        "Number of Inmates Awaiting Trial in Jail [% of Total]",
        "Number of Inmates Serving County Sentence [Number]",
        "Number of Inmates Serving County Sentence [% of Total]",
        "Number of Other Inmates [Number]",
        "Number of Other Inmates [% of Total]",
    ]
    # Tables at the end of the doc contain all data we want to parse
    pages = [8, 9, 10, 11]
    # Use lattice parsing since default parsing fails to parse columns on
    # the right half of the page
    use_lattice = True
    if filename.endswith("jun_19.pdf"):
        # Tabula can't handle the multiple tables because it thinks the one on
        # the last page has extra columns. This concats them manually.
        *dfs, df4 = tabula.read_pdf(
            filename, pages=pages, lattice=use_lattice, multiple_tables=True)
        # Trim the last table's footer row and extra columns, then renumber
        # its columns so it lines up with the other pages.
        df4 = df4.iloc[:-1, 1:14]
        df4.columns = range(13)
        # One cell on this page is parsed with stray quote characters.
        df4.iloc[33, 1] = df4.iloc[33, 1].strip(" '")
        dfs.append(df4)
        # Drop each page's header row before concatenating.
        result = pd.concat(df.iloc[1:] for df in dfs)
        result.columns = column_names
    elif report_date >= datetime.date(2020, 11, 5):
        # Skip every 48th row for new-style reports
        result = one(
            tabula.read_pdf(
                filename,
                pages=pages,
                lattice=use_lattice,
                multiple_tables=False,
                pandas_options={
                    "names": column_names,
                    "skiprows": [x * 48 for x in range(4)],
                    "skipfooter": 1,  # The last row is the grand totals
                    "engine": "python",  # Only python engine supports 'skipfooter'
                },
            ))
    else:
        result = one(
            tabula.read_pdf(
                filename,
                pages=pages,
                lattice=use_lattice,
                multiple_tables=False,
                pandas_options={
                    "names": column_names,
                    "skiprows": _header_on_each_page(),
                    "skipfooter": 1,  # The last row is the grand totals
                    "engine": "python",  # Only python engine supports 'skipfooter'
                },
            ))
    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Jurisdiction": "county_name",
            "Total Number of Inmates In Jail": "total_number_of_inmates_in_jail",
            "Jail Capacity": "jail_capacity",
            "Number of Inmates Sentenced to State [Number]":
                "number_of_inmates_sentenced_to_state",
            "Number of Inmates Awaiting Trial in Jail [Number]":
                "number_of_inmates_awaiting_trial",
            "Number of Inmates Serving County Sentence [Number]":
                "number_of_inmates_serving_county_sentence",
            "Number of Other Inmates [Number]": "number_of_other_inmates",
        },
    )
    # Tabula may parse extra empty rows
    result = result.dropna()
    # NOTE(review): the return value of cast_columns_to_int is discarded here,
    # while other parsers in this file reassign it — confirm whether the cast
    # is intended to take effect on the returned frame.
    aggregate_ingest_utils.cast_columns_to_int(result, ignore_columns={"county_name"})
    return result
def _parse_table(location, filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    Reads every page of the report, splits the rows into per-county blocks,
    collapses each block into one Male and one Female row, and joins the
    pieces into a single DataFrame with one row per facility.
    """
    whole_df = read_pdf(location, filename, pages='all', lattice=True)
    # Remove totals separate from parsing since it's a variable length
    totals_start_index = np.where(
        whole_df['Date'].str.contains('Totals'))[0][0]
    whole_df = whole_df[:totals_start_index]
    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = whole_df['County'].astype(str).str.contains('Secure')
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis='columns')
    # Drop repeated header rows from subsequent pages.
    whole_df = whole_df[whole_df['County'].astype(str) != 'County']
    # NOTE(review): reset_index is not assigned, so this statement has no
    # effect — confirm whether it should be whole_df = whole_df.reset_index(...).
    whole_df.reset_index(drop=True)
    whole_df = _shift_headers(whole_df)
    # Column labels contain embedded line breaks from the PDF extraction.
    whole_df.columns = whole_df.columns.str.replace('\n', ' ')
    whole_df.columns = whole_df.columns.str.replace('\r', ' ')
    # Column names can change over time : (
    column_name_map = {
        'CC Eligible Inmates': 'Community Custody Inmates',
    }
    whole_df.columns = [
        column_name_map[c] if c in column_name_map else c
        for c in whole_df.columns
    ]
    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(
        whole_df['Total Jail Beds'].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)
    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # Cast everything to int before summing below
        df = df.fillna(0)
        df = aggregate_ingest_utils.cast_columns_to_int(
            df, ignore_columns={'County', 'Facility Security', 'Inmate Cusody'})
        df['Gender'] = None
        df = _collapse_by_gender_rows(df, 'Male')
        df = _collapse_by_gender_rows(df, 'Female')
        # The first row contains header data for both Male and Female
        df['County'] = df['County'][0]
        df['total_jail_beds'] = df['Total Jail Beds'][0]
        df['reported_population'] = \
            df['Reported Population (Total and Male/Female)'][0]
        df = df[1:]
        dfs_grouped_by_gender.append(df)
    df_by_gender = pd.concat(dfs_grouped_by_gender)
    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender['Gender'] == 'Male']
    female_df = df_by_gender[df_by_gender['Gender'] == 'Female']
    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df, {
            'County': 'facility_name',
            'total_jail_beds': 'total_jail_beds',
            'reported_population': 'reported_population',
        })
    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_df, {
            'County': 'facility_name',
            # Since we've grouped by Male, this Reported Population is only Male
            'Reported Population (Total and Male/Female)': 'male_population',
            'Class D Inmates': 'class_d_male_population',
            'Community Custody Inmates': 'community_custody_male_population',
            'Alternative Sentence': 'alternative_sentence_male_population',
            'Controlled Intake': 'controlled_intake_male_population',
            'Parole Violators': 'parole_violators_male_population',
            'Federal Inmates': 'federal_male_population',
        })
    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df, {
            'County': 'facility_name',
            # Since we've grouped by Female, this Reported Population is only Female
            'Reported Population (Total and Male/Female)': 'female_population',
            'Class D Inmates': 'class_d_female_population',
            'Community Custody Inmates': 'community_custody_female_population',
            'Alternative Sentence': 'alternative_sentence_female_population',
            'Controlled Intake': 'controlled_intake_female_population',
            'Parole Violators': 'parole_violators_female_population',
            'Federal Inmates': 'federal_female_population',
        })
    # Join the male and female rows back onto the shared per-facility data.
    result = shared_df.join(male_df.set_index('facility_name'),
                            on='facility_name')
    result = result.join(female_df.set_index('facility_name'),
                         on='facility_name')
    return result.reset_index(drop=True)
def _parse_table(location: str, filename: str,
                 report_date: datetime.date) -> pd.DataFrame:
    """Parses the TX County Table in the PDF.

    Each of the 9 pages is parsed independently (last table per page),
    cleaned of empty rows/columns, has mis-concatenated columns split, and
    is then renamed to the schema and concatenated into one DataFrame.
    """
    num_pages = 9
    # Expected column layout varies by report date.
    columns_to_schema = _get_column_names(report_date)
    pages = []
    for page_num in range(1, num_pages + 1):
        # Each page has 1 or more tables on it with the last table being the
        # one with the data on it. The headers are poorly read by tabula and
        # some years have different responses to this call so we generally
        # just get all of the tables and consider only the one with numbers on
        # it. That lets us clean it up by dropping nonsense columns and rows,
        # and then assigning our own columns names to them.
        df = read_pdf(
            location,
            filename,
            multiple_tables=True,
            pages=page_num,
        )
        df = df[-1]
        # Keep only columns with at least 5 non-null values.
        df = df.dropna(axis='columns', thresh=5)
        # We want to remove all of the rows and columns that have no data.
        numeric_elements = df.apply(pd.to_numeric, errors='coerce').notnull()
        rows_containing_data = numeric_elements.any(axis='columns')
        df = df.loc[rows_containing_data]
        # Next we finally break up some of the columns that were incorrectly
        # concatenated.
        for column in df.columns[1:]:
            # By this point we should only have numeric data in the rows,
            # if this happens it means some columns were concatenated and they
            # must be split. If the columns are concatenated, we need only
            # check one of the rows for a space because they are all
            # concatenated.
            if ' ' in df[column].iloc[0]:
                index_to_insert = df.columns.get_loc(column)
                df_temp = pd.DataFrame(
                    df.pop(column).str.split(n=1, expand=True))
                df.insert(index_to_insert, str(column) + '_a', df_temp[0])
                df.insert(index_to_insert + 1, str(column) + '_b', df_temp[1])
        pages.append(df)
    # Drop last rows since it's the 'Totals' section
    pages[-1] = pages[-1].drop(pages[-1].tail(1).index)
    # Build result for all the pages. We rename the columns before calling
    # concat because the column names might all be different. Renaming them
    # allows concat to pass happily.
    columns_to_drop = ['percent_capacity', 'total_local']
    for i, page in enumerate(pages):
        page.columns = columns_to_schema.keys()
        page = aggregate_ingest_utils.rename_columns_and_select(
            page, columns_to_schema)
        # We don't care about % of capacity and total_local so we drop these
        # columns.
        page = page.drop(columns_to_drop, axis='columns')
        pages[i] = page
    result = pd.concat(pages, ignore_index=True)
    # Everything except the facility name is an integer count.
    for column_name in set(result.columns) - {'facility_name'}:
        result[column_name] = result[column_name].astype(int)
    return result