def _parse_table(location: str, filename: str) -> pd.DataFrame:
    """Parses the last table in the GA PDF.

    Args:
        location: Forwarded unchanged as the first argument to `read_pdf`.
        filename: Forwarded unchanged as the second argument to `read_pdf`.

    Returns:
        A DataFrame holding `county_name` and the count columns selected
        below, with rows containing any missing value dropped and every
        column except `county_name` passed through
        `aggregate_ingest_utils.cast_columns_to_int`.
    """
    # Set column names since the pdf makes them hard to parse directly
    column_names = [
        'Index',
        'Jurisdiction',
        'Total Number of Inmates In Jail',
        'Jail Capacity',
        'Inmates as % of Capacity',
        'Number of Inmates Sentenced to State [Number]',
        'Number of Inmates Sentenced to State [% of Total]',
        'Number of Inmates Awaiting Trial in Jail [Number]',
        'Number of Inmates Awaiting Trial in Jail [% of Total]',
        'Number of Inmates Serving County Sentence [Number]',
        'Number of Inmates Serving County Sentence [% of Total]',
        'Number of Other Inmates [Number]',
        'Number of Other Inmates [% of Total]'
    ]

    # Tables at the end of the doc contain all data we want to parse
    pages = [8, 9, 10, 11]

    # Use lattice parsing since default parsing fails to parse columns on
    # the right half of the page
    use_lattice = True

    result = read_pdf(
        location,
        filename,
        pages=pages,
        lattice=use_lattice,
        pandas_options={
            'names': column_names,
            'skiprows': _header_on_each_page(),
            'skipfooter': 1,  # The last row is the grand totals
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })

    # Only the name and the absolute-count columns are kept; the index and
    # percentage columns are not part of the mapping below.
    result = aggregate_ingest_utils.rename_columns_and_select(
        result, {
            'Jurisdiction': 'county_name',
            'Total Number of Inmates In Jail':
                'total_number_of_inmates_in_jail',
            'Jail Capacity': 'jail_capacity',
            'Number of Inmates Sentenced to State [Number]':
                'number_of_inmates_sentenced_to_state',
            'Number of Inmates Awaiting Trial in Jail [Number]':
                'number_of_inmates_awaiting_trial',
            'Number of Inmates Serving County Sentence [Number]':
                'number_of_inmates_serving_county_sentence',
            'Number of Other Inmates [Number]': 'number_of_other_inmates'
        })

    # Tabula may parse extra empty rows
    result = result.dropna()

    # Rebind to the frame returned by the cast, as the other parsers do, so
    # the conversion is not lost if the helper returns a new DataFrame.
    # NOTE(review): if the helper only mutates its argument in place and
    # returns it, this is equivalent to the previous behavior.
    result = aggregate_ingest_utils.cast_columns_to_int(
        result, ignore_columns={'county_name'})

    return result
def _parse_table(_: str, filename: str, is_female: bool,
                 report_date: datetime.date) -> pd.DataFrame:
    """Reads the report tables from the PDF and merges them into one frame.

    Args:
        _: Unused.
        filename: Path handed to `tabula.read_pdf`.
        is_female: Forwarded to `_format_table`; also gates the manual
            table merge for the April-June 2020 reports.
        report_date: Its year selects which pages are read and is forwarded
            to `_format_table`.

    Returns:
        The formatted tables concatenated with a fresh index, minus the last
        row, passed through `aggregate_ingest_utils.cast_columns_to_int`
        (ignoring `facility_name`).
    """
    year = report_date.year

    # Most but not all PDFs have data on pages 2-4.
    if 2000 <= year <= 2005:
        pages = [1, 2]
    elif year in (2006, 2009):
        pages = [3, 4, 5]
    else:
        pages = [2, 3, 4]

    raw_tables = tabula.read_pdf(filename, pages=pages, multiple_tables=True)

    if is_female and year == 2020 and report_date.month in (4, 5, 6):
        # For these reports the parsed pieces 1+2 and 3+4 are stitched back
        # together before formatting, leaving three tables.
        raw_tables = [
            raw_tables[0],
            pd.concat((raw_tables[1], raw_tables[2])),
            pd.concat((raw_tables[3], raw_tables[4])),
        ]

    combined = pd.concat(
        [_format_table(piece, is_female, year) for piece in raw_tables],
        ignore_index=True)

    # Discard 'TOTAL' row.
    combined = combined.iloc[:-1]

    return aggregate_ingest_utils.cast_columns_to_int(
        combined, ignore_columns={'facility_name'})
def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the CA aggregate report.

    Takes the first table that `pd.read_html` finds in `filename`, replaces
    missing values with 0, derives a `report_date` column from the `Year` and
    `Month` columns via `_last_date_of_month`, then renames/selects the
    columns below and passes the non-string ones through
    `aggregate_ingest_utils.cast_columns_to_int`.
    """
    # Although the file is downloaded with the '.xls' extension, the contents of
    # the file are in the shape of an HTML file.
    html_tables = pd.read_html(filename, header=0)
    report = html_tables[0].fillna(0)

    year_and_month = report[['Year', 'Month']]
    report['report_date'] = year_and_month.apply(_last_date_of_month,
                                                 axis='columns')

    column_renames = {
        'Jurisdiction': 'jurisdiction_name',
        'Facility': 'facility_name',
        'Total facility ADP': 'average_daily_population',
        'Unsentenced males': 'unsentenced_male_adp',
        'Unsentenced females': 'unsentenced_female_adp',
        'Sentenced males': 'sentenced_male_adp',
        'Sentenced females': 'sentenced_female_adp',
        'report_date': 'report_date'
    }
    report = aggregate_ingest_utils.rename_columns_and_select(
        report, column_renames)

    # These columns are excluded from the integer cast.
    string_columns = {'jurisdiction_name', 'facility_name', 'report_date'}
    return aggregate_ingest_utils.cast_columns_to_int(
        report, ignore_columns=string_columns)
def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the CA aggregate report.

    The first table returned by `pd.read_html` is used. Missing values are
    filled with 0, a `report_date` column is computed row-wise from `Year`
    and `Month` using `_last_date_of_month`, and the result is reduced to the
    renamed columns listed below before the integer cast.
    """
    # Although the file is downloaded with the '.xls' extension, the contents of
    # the file are in the shape of an HTML file.
    first_table = pd.read_html(filename, header=0)[0]
    parsed = first_table.fillna(0)

    parsed["report_date"] = parsed[["Year", "Month"]].apply(
        _last_date_of_month, axis="columns"
    )

    renames = {
        "Jurisdiction": "jurisdiction_name",
        "Facility": "facility_name",
        "Total facility ADP": "average_daily_population",
        "Unsentenced males": "unsentenced_male_adp",
        "Unsentenced females": "unsentenced_female_adp",
        "Sentenced males": "sentenced_male_adp",
        "Sentenced females": "sentenced_female_adp",
        "report_date": "report_date",
    }
    parsed = aggregate_ingest_utils.rename_columns_and_select(parsed, renames)

    # Columns left out of the integer cast.
    non_numeric = {"jurisdiction_name", "facility_name", "report_date"}
    parsed = aggregate_ingest_utils.cast_columns_to_int(
        parsed, ignore_columns=non_numeric
    )
    return parsed
def _parse_table(location: str, filename: str, is_female: bool,
                 year: int) -> pd.DataFrame:
    """Reads the report tables from the PDF and merges them into one frame.

    Args:
        location: Forwarded unchanged as the first argument to `read_pdf`.
        filename: Forwarded unchanged as the second argument to `read_pdf`.
        is_female: Forwarded to `_format_table`.
        year: Selects which pages are read and is forwarded to
            `_format_table`.

    Returns:
        The formatted tables concatenated with a fresh index, minus the last
        row, passed through `aggregate_ingest_utils.cast_columns_to_int`
        (ignoring `facility_name`).
    """
    # Most but not all PDFs have data on pages 2-4.
    if 2000 <= year <= 2005:
        pages = [1, 2]
    elif year in (2006, 2009):
        pages = [3, 4, 5]
    else:
        pages = [2, 3, 4]

    raw_tables = read_pdf(location, filename, pages=pages,
                          multiple_tables=True)

    combined = pd.concat(
        [_format_table(piece, is_female, year) for piece in raw_tables],
        ignore_index=True)

    # Discard 'TOTAL' row.
    combined = combined.iloc[:-1]

    return aggregate_ingest_utils.cast_columns_to_int(
        combined, ignore_columns={'facility_name'})
def _parse_table(_, filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    Reads every page with lattice parsing, applies hand-written row fixes for
    two specific report files, trims the totals section, splits the rows into
    per-county blocks, collapses each block by gender, and finally joins the
    shared, male and female columns into one row per `facility_name`.
    """
    table = tabula.read_pdf(
        filename,
        pages='all',
        lattice=True
    )

    # Per-report repairs, keyed on the file name suffix. The shifts are kept
    # in their original order.
    if filename.endswith('04-16-20.pdf'):
        table[323:331] = table[323:331].shift(-1, axis='columns')
    elif filename.endswith('07-09-20.pdf'):
        table.loc[432] = table.iloc[432].shift(-1)
        table.loc[434:436] = table.loc[434:436].shift(-1, axis='columns')
        table.loc[438] = table.iloc[438].shift(-1)
        table.loc[440] = table.iloc[440].shift(-1)
        table.loc[442:445] = table.loc[442:445].shift(-1, axis='columns')
        table.loc[447:462] = table.loc[447:462].shift(-1, axis='columns')
        table.loc[464:] = table.loc[464:].shift(-1, axis='columns')

        # Hand-entered values written into the 'County' column, by row label.
        county_overrides = {
            451: 86,
            456: 264,
            461: 52,
            464: 161,
            469: 70,
            472: 204,
            477: 182,
            482: 137,
            487: 45,
            492: 410,
            497: 152,
            500: 95,
            505: 85,
            508: 194,
            513: 72,
            516: 134,
            521: 50,
            524: 63,
            529: 32,
        }
        for row_label, county_value in county_overrides.items():
            table.loc[row_label, 'County'] = county_value

    # Remove totals separate from parsing since it's a variable length
    totals_mask = table['Date'].str.contains('Totals')
    totals_start_index = np.where(totals_mask)[0][0]
    table = table[:totals_start_index]

    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = table['County'].astype(str).str.contains('Secure')
    table[shifted_rows] = table[shifted_rows].shift(-1, axis='columns')

    # Drop rows whose 'County' cell is the literal text 'County'.
    table = table[table['County'].astype(str) != 'County']
    # NOTE: the result of this call is discarded, so `table` keeps its
    # existing index; the statement is retained to preserve behavior.
    table.reset_index(drop=True)

    table = _shift_headers(table)
    table.columns = (
        table.columns.str.replace('\n', ' ').str.replace('\r', ' '))

    # Column names can change over time : (
    column_name_map = {
        'CC Eligible Inmates': 'Community Custody Inmates',
    }
    table.columns = [column_name_map.get(name, name)
                     for name in table.columns]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(table['Total Jail Beds'].notnull())[0]
    county_frames = _split_df(table, start_of_county_indices)

    per_gender_frames = []
    for county_df in county_frames:
        # This is a typo in several reports
        if '12/' in county_df['Federal Inmates'].values:
            county_df['Federal Inmates'] = \
                county_df['Federal Inmates'].replace({'12/': '12'})

        # Cast everything to int before summing below
        county_df = county_df.fillna(0)
        county_df = aggregate_ingest_utils.cast_columns_to_int(
            county_df,
            ignore_columns={'County', 'Facility Security', 'Inmate Cusody'})

        county_df['Gender'] = None
        for gender in ('Male', 'Female'):
            county_df = _collapse_by_gender_rows(county_df, gender)

        # The first row contains header data for both Male and Female
        county_df['County'] = county_df['County'][0]
        county_df['total_jail_beds'] = county_df['Total Jail Beds'][0]
        county_df['reported_population'] = \
            county_df['Reported Population (Total and Male/Female)'][0]

        per_gender_frames.append(county_df[1:])

    gendered = pd.concat(per_gender_frames)

    # Split into male and female rows to independently set column headers
    male_rows = gendered[gendered['Gender'] == 'Male']
    female_rows = gendered[gendered['Gender'] == 'Female']

    # Since both male and female rows contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_rows, {
            'County': 'facility_name',
            'total_jail_beds': 'total_jail_beds',
            'reported_population': 'reported_population',
        })

    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_rows, {
            'County': 'facility_name',
            # Since we've grouped by Male, this Reported Population is only
            # Male
            'Reported Population (Total and Male/Female)': 'male_population',
            'Class D Inmates': 'class_d_male_population',
            'Community Custody Inmates': 'community_custody_male_population',
            'Alternative Sentence': 'alternative_sentence_male_population',
            'Controlled Intake': 'controlled_intake_male_population',
            'Parole Violators': 'parole_violators_male_population',
            'Federal Inmates': 'federal_male_population',
        })

    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_rows, {
            'County': 'facility_name',
            # Since we've grouped by Female, this Reported Population is only
            # Female
            'Reported Population (Total and Male/Female)':
                'female_population',
            'Class D Inmates': 'class_d_female_population',
            'Community Custody Inmates':
                'community_custody_female_population',
            'Alternative Sentence': 'alternative_sentence_female_population',
            'Controlled Intake': 'controlled_intake_female_population',
            'Parole Violators': 'parole_violators_female_population',
            'Federal Inmates': 'federal_female_population',
        })

    result = shared_df.join(male_df.set_index('facility_name'),
                            on='facility_name')
    result = result.join(female_df.set_index('facility_name'),
                         on='facility_name')

    # Report-specific override of one facility's bed count.
    if filename.endswith('04-16-20.pdf'):
        result.loc[result['facility_name'] == 'Lincoln',
                   'total_jail_beds'] = 72

    return result.reset_index(drop=True)
def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    Reads every page as a single lattice-parsed table, applies hand-written
    row fixes for two specific report files, trims the totals section, splits
    the rows into per-county blocks, cleans known bad cell values, collapses
    each block by gender, and finally joins the shared, male and female
    columns into one row per `facility_name`.
    """
    parsed_tables = tabula.read_pdf(filename,
                                    pages="all",
                                    multiple_tables=False,
                                    lattice=True)
    table = one(parsed_tables)

    # Per-report repairs, keyed on the file name suffix. The shifts are kept
    # in their original order.
    if filename.endswith("04-16-20.pdf"):
        table[323:331] = table[323:331].shift(-1, axis="columns")
    elif filename.endswith("07-09-20.pdf"):
        table.loc[432] = table.iloc[432].shift(-1)
        table.loc[434:436] = table.loc[434:436].shift(-1, axis="columns")
        table.loc[438] = table.iloc[438].shift(-1)
        table.loc[440] = table.iloc[440].shift(-1)
        table.loc[442:445] = table.loc[442:445].shift(-1, axis="columns")
        table.loc[447:462] = table.loc[447:462].shift(-1, axis="columns")
        table.loc[464:] = table.loc[464:].shift(-1, axis="columns")

        # Hand-entered values written into the "County" column, by row label.
        county_overrides = {
            451: 86,
            456: 264,
            461: 52,
            464: 161,
            469: 70,
            472: 204,
            477: 182,
            482: 137,
            487: 45,
            492: 410,
            497: 152,
            500: 95,
            505: 85,
            508: 194,
            513: 72,
            516: 134,
            521: 50,
            524: 63,
            529: 32,
        }
        for row_label, county_value in county_overrides.items():
            table.loc[row_label, "County"] = county_value

    # Remove totals separate from parsing since it's a variable length
    totals_mask = table["Date"].str.contains("Totals")
    totals_start_index = np.where(totals_mask)[0][0]
    table = table[:totals_start_index]

    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = table["County"].astype(str).str.contains("Secure")
    table[shifted_rows] = table[shifted_rows].shift(-1, axis="columns")

    # Drop rows whose "County" cell is the literal text "County".
    table = table[table["County"].astype(str) != "County"]
    # NOTE: the result of this call is discarded, so `table` keeps its
    # existing index; the statement is retained to preserve behavior.
    table.reset_index(drop=True)

    table = _shift_headers(table)
    table.columns = (
        table.columns.str.replace("\n", " ").str.replace("\r", " "))

    # Column names can change over time : (
    column_name_map = {
        "CC Eligible Inmates": "Community Custody Inmates",
    }
    table.columns = [column_name_map.get(name, name)
                     for name in table.columns]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(table["Total Jail Beds"].notnull())[0]
    county_frames = _split_df(table, start_of_county_indices)

    # Known bad cell values and their replacements, applied in this order.
    federal_typos = (("12/", "12"), ("yo", "0"), ("pe", "0"))
    reported_col = "Reported Population (Total and Male/Female)"

    per_gender_frames = []
    for county_df in county_frames:
        # This is a typo in several reports
        for bad_value, replacement in federal_typos:
            if bad_value in county_df["Federal Inmates"].values:
                county_df["Federal Inmates"] = county_df[
                    "Federal Inmates"].replace({bad_value: replacement})
        if "(" in county_df[reported_col].values:
            county_df[reported_col] = county_df[reported_col].replace(
                {"(": "0"})

        # Cast everything to int before summing below
        county_df = county_df.fillna(0)
        county_df = aggregate_ingest_utils.cast_columns_to_int(
            county_df,
            ignore_columns={"County", "Facility Security", "Inmate Cusody"})

        county_df["Gender"] = None
        for gender in ("Male", "Female"):
            county_df = _collapse_by_gender_rows(county_df, gender)

        # The first row contains header data for both Male and Female
        county_df["County"] = county_df["County"][0]
        county_df["total_jail_beds"] = county_df["Total Jail Beds"][0]
        county_df["reported_population"] = county_df[reported_col][0]

        per_gender_frames.append(county_df[1:])

    gendered = pd.concat(per_gender_frames)

    # Split into male and female rows to independently set column headers
    male_rows = gendered[gendered["Gender"] == "Male"]
    female_rows = gendered[gendered["Gender"] == "Female"]

    # Since both male and female rows contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_rows,
        {
            "County": "facility_name",
            "total_jail_beds": "total_jail_beds",
            "reported_population": "reported_population",
        },
    )

    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_rows,
        {
            "County": "facility_name",
            # Since we've grouped by Male, this Reported Population is only
            # Male
            "Reported Population (Total and Male/Female)": "male_population",
            "Class D Inmates": "class_d_male_population",
            "Community Custody Inmates": "community_custody_male_population",
            "Alternative Sentence": "alternative_sentence_male_population",
            "Controlled Intake": "controlled_intake_male_population",
            "Parole Violators": "parole_violators_male_population",
            "Federal Inmates": "federal_male_population",
        },
    )

    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_rows,
        {
            "County": "facility_name",
            # Since we've grouped by Female, this Reported Population is only
            # Female
            "Reported Population (Total and Male/Female)":
                "female_population",
            "Class D Inmates": "class_d_female_population",
            "Community Custody Inmates":
                "community_custody_female_population",
            "Alternative Sentence": "alternative_sentence_female_population",
            "Controlled Intake": "controlled_intake_female_population",
            "Parole Violators": "parole_violators_female_population",
            "Federal Inmates": "federal_female_population",
        },
    )

    result = shared_df.join(male_df.set_index("facility_name"),
                            on="facility_name")
    result = result.join(female_df.set_index("facility_name"),
                         on="facility_name")

    # Report-specific override of one facility's bed count.
    if filename.endswith("04-16-20.pdf"):
        result.loc[result["facility_name"] == "Lincoln",
                   "total_jail_beds"] = 72

    return result.reset_index(drop=True)
def _parse_table(location, filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    Reads every page with lattice parsing via `read_pdf`, trims the totals
    section, splits the rows into per-county blocks, collapses each block by
    gender, and finally joins the shared, male and female columns into one
    row per `facility_name`.
    """
    table = read_pdf(location, filename, pages='all', lattice=True)

    # Remove totals separate from parsing since it's a variable length
    totals_mask = table['Date'].str.contains('Totals')
    totals_start_index = np.where(totals_mask)[0][0]
    table = table[:totals_start_index]

    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = table['County'].astype(str).str.contains('Secure')
    table[shifted_rows] = table[shifted_rows].shift(-1, axis='columns')

    # Drop rows whose 'County' cell is the literal text 'County'.
    table = table[table['County'].astype(str) != 'County']
    # NOTE: the result of this call is discarded, so `table` keeps its
    # existing index; the statement is retained to preserve behavior.
    table.reset_index(drop=True)

    table = _shift_headers(table)
    table.columns = (
        table.columns.str.replace('\n', ' ').str.replace('\r', ' '))

    # Column names can change over time : (
    column_name_map = {
        'CC Eligible Inmates': 'Community Custody Inmates',
    }
    table.columns = [column_name_map.get(name, name)
                     for name in table.columns]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(table['Total Jail Beds'].notnull())[0]
    county_frames = _split_df(table, start_of_county_indices)

    per_gender_frames = []
    for county_df in county_frames:
        # Cast everything to int before summing below
        county_df = county_df.fillna(0)
        county_df = aggregate_ingest_utils.cast_columns_to_int(
            county_df,
            ignore_columns={'County', 'Facility Security', 'Inmate Cusody'})

        county_df['Gender'] = None
        for gender in ('Male', 'Female'):
            county_df = _collapse_by_gender_rows(county_df, gender)

        # The first row contains header data for both Male and Female
        county_df['County'] = county_df['County'][0]
        county_df['total_jail_beds'] = county_df['Total Jail Beds'][0]
        county_df['reported_population'] = \
            county_df['Reported Population (Total and Male/Female)'][0]

        per_gender_frames.append(county_df[1:])

    gendered = pd.concat(per_gender_frames)

    # Split into male and female rows to independently set column headers
    male_rows = gendered[gendered['Gender'] == 'Male']
    female_rows = gendered[gendered['Gender'] == 'Female']

    # Since both male and female rows contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_rows, {
            'County': 'facility_name',
            'total_jail_beds': 'total_jail_beds',
            'reported_population': 'reported_population',
        })

    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_rows, {
            'County': 'facility_name',
            # Since we've grouped by Male, this Reported Population is only
            # Male
            'Reported Population (Total and Male/Female)': 'male_population',
            'Class D Inmates': 'class_d_male_population',
            'Community Custody Inmates': 'community_custody_male_population',
            'Alternative Sentence': 'alternative_sentence_male_population',
            'Controlled Intake': 'controlled_intake_male_population',
            'Parole Violators': 'parole_violators_male_population',
            'Federal Inmates': 'federal_male_population',
        })

    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_rows, {
            'County': 'facility_name',
            # Since we've grouped by Female, this Reported Population is only
            # Female
            'Reported Population (Total and Male/Female)':
                'female_population',
            'Class D Inmates': 'class_d_female_population',
            'Community Custody Inmates':
                'community_custody_female_population',
            'Alternative Sentence': 'alternative_sentence_female_population',
            'Controlled Intake': 'controlled_intake_female_population',
            'Parole Violators': 'parole_violators_female_population',
            'Federal Inmates': 'federal_female_population',
        })

    result = shared_df.join(male_df.set_index('facility_name'),
                            on='facility_name')
    result = result.join(female_df.set_index('facility_name'),
                         on='facility_name')

    return result.reset_index(drop=True)
def _parse_table(filename: str, report_date: datetime.date) -> pd.DataFrame:
    """Parses the last table in the GA PDF.

    Args:
        filename: Path handed to `tabula.read_pdf`. A name ending in
            "jun_19.pdf" takes a dedicated manual-concatenation path.
        report_date: Reports dated on or after 2020-11-05 use a fixed
            every-48-rows header skip; earlier ones use
            `_header_on_each_page()`.

    Returns:
        A DataFrame holding `county_name` and the count columns selected
        below, with rows containing any missing value dropped and every
        column except `county_name` passed through
        `aggregate_ingest_utils.cast_columns_to_int`.
    """
    # Set column names since the pdf makes them hard to parse directly
    column_names = [
        "Index",
        "Jurisdiction",
        "Total Number of Inmates In Jail",
        "Jail Capacity",
        "Inmates as % of Capacity",
        "Number of Inmates Sentenced to State [Number]",
        "Number of Inmates Sentenced to State [% of Total]",
        "Number of Inmates Awaiting Trial in Jail [Number]",
        "Number of Inmates Awaiting Trial in Jail [% of Total]",
        "Number of Inmates Serving County Sentence [Number]",
        "Number of Inmates Serving County Sentence [% of Total]",
        "Number of Other Inmates [Number]",
        "Number of Other Inmates [% of Total]",
    ]

    # Tables at the end of the doc contain all data we want to parse
    pages = [8, 9, 10, 11]

    # Use lattice parsing since default parsing fails to parse columns on
    # the right half of the page
    use_lattice = True

    if filename.endswith("jun_19.pdf"):
        # Tabula can't handle the multiple tables because it thinks the one on
        # the last page has extra columns. This concats them manually.
        *dfs, df4 = tabula.read_pdf(filename,
                                    pages=pages,
                                    lattice=use_lattice,
                                    multiple_tables=True)
        # Drop the last row and keep only columns 1..13 of the final table,
        # then renumber them so they line up with the other tables.
        df4 = df4.iloc[:-1, 1:14]
        df4.columns = range(13)
        # One cell carries stray spaces/apostrophes; strip them.
        df4.iloc[33, 1] = df4.iloc[33, 1].strip(" '")
        dfs.append(df4)
        # The first row of every per-page table is skipped.
        result = pd.concat(df.iloc[1:] for df in dfs)
        result.columns = column_names
    else:
        # The two remaining report styles differ only in which rows are
        # skipped, so share a single read_pdf call.
        if report_date >= datetime.date(2020, 11, 5):
            # Skip every 48th row for new-style reports
            skiprows = [x * 48 for x in range(4)]
        else:
            skiprows = _header_on_each_page()

        result = one(
            tabula.read_pdf(
                filename,
                pages=pages,
                lattice=use_lattice,
                multiple_tables=False,
                pandas_options={
                    "names": column_names,
                    "skiprows": skiprows,
                    "skipfooter": 1,  # The last row is the grand totals
                    "engine": "python",  # Only python engine supports 'skipfooter'
                },
            ))

    # Only the name and the absolute-count columns are kept; the index and
    # percentage columns are not part of the mapping below.
    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Jurisdiction": "county_name",
            "Total Number of Inmates In Jail":
                "total_number_of_inmates_in_jail",
            "Jail Capacity": "jail_capacity",
            "Number of Inmates Sentenced to State [Number]":
                "number_of_inmates_sentenced_to_state",
            "Number of Inmates Awaiting Trial in Jail [Number]":
                "number_of_inmates_awaiting_trial",
            "Number of Inmates Serving County Sentence [Number]":
                "number_of_inmates_serving_county_sentence",
            "Number of Other Inmates [Number]": "number_of_other_inmates",
        },
    )

    # Tabula may parse extra empty rows
    result = result.dropna()

    # Rebind to the frame returned by the cast, as the other parsers do, so
    # the conversion is not lost if the helper returns a new DataFrame.
    # NOTE(review): if the helper only mutates its argument in place and
    # returns it, this is equivalent to the previous behavior.
    result = aggregate_ingest_utils.cast_columns_to_int(
        result, ignore_columns={"county_name"})

    return result