def _parse_county_table(location: str, filename: str) -> pd.DataFrame:
    """Parses the FL County - Table 1 in the PDF.

    The table spans pages 3 and 4; the two halves are read separately and
    stitched together.

    Args:
        location: directory/bucket the PDF lives in (passed to read_pdf).
        filename: name of the PDF file.

    Returns:
        A DataFrame with one row per county and columns: county_name,
        county_population, average_daily_population, date_reported.
    """
    part1 = read_pdf(
        location,
        filename,
        pages=[3],
        pandas_options={
            'header': [0, 1],
        })
    part2 = read_pdf(
        location,
        filename,
        pages=[4],
        pandas_options={
            'header': [0, 1],
            'skipfooter': 1,  # The last row is the total
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })

    # FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat is the supported equivalent.
    result = pd.concat([part1, part2], ignore_index=True)

    result.columns = aggregate_ingest_utils.collapse_header(result.columns)
    result = aggregate_ingest_utils.rename_columns_and_select(
        result, {
            'Florida County': 'county_name',
            'County Population': 'county_population',
            'Average Daily Population (ADP)': 'average_daily_population',
            '*Date Reported': 'date_reported'
        })

    # Population columns come back as locale-formatted strings
    # (e.g. '1,234'); locale.atoi strips the thousands separators.
    for column_name in {'county_population', 'average_daily_population'}:
        result[column_name] = result[column_name].apply(locale.atoi)

    result['date_reported'] = pd.to_datetime(result['date_reported'])

    return result
def _parse_table(location: str, filename: str, is_female: bool,
                 report_date: datetime.date) -> pd.DataFrame:
    """Reads, reassembles, and formats one gender's tables from the PDF."""
    year = report_date.year

    # Most but not all PDFs have data on pages 2-4.
    if 2000 <= year <= 2005:
        pages = [1, 2]
    elif year in (2006, 2009):
        pages = [3, 4, 5]
    else:
        pages = [2, 3, 4]

    raw_tables = read_pdf(location,
                          filename,
                          pages=pages,
                          multiple_tables=True)

    # These two report editions split tables across page boundaries;
    # stitch the pieces back together before formatting.
    if year == 2020 and report_date.month in (4, 5):
        raw_tables = [
            raw_tables[0],
            pd.concat((raw_tables[1], raw_tables[2])),
            pd.concat((raw_tables[3], raw_tables[4])),
        ]

    combined = pd.concat(
        [_format_table(raw, is_female, year) for raw in raw_tables],
        ignore_index=True)

    # Discard 'TOTAL' row.
    combined = combined.iloc[:-1]

    return aggregate_ingest_utils.cast_columns_to_int(
        combined, ignore_columns={'facility_name'})
def _parse_table(location: str, filename: str) -> pd.DataFrame:
    """Parse the Head Count Endings and Contracted Facilities Tables.

    Locates the two tables of interest among all tables tabula extracts,
    formats each, concatenates them, and annotates each facility acronym
    with its fips code and full name.

    Returns:
        A DataFrame with one row per facility. design_bed_capacity and
        operation_bed_capacity may be NaN (kept as float); all other
        numeric columns are cast to int.
    """
    all_dfs = read_pdf(
        location,
        filename,
        multiple_tables=True,
        lattice=True,
        pandas_options={
            'header': [0, 1],
        })

    head_count_ending_df = _df_matching_substring(
        all_dfs, {'total', 'head count ending'})
    head_count_ending_df = _format_head_count_ending(head_count_ending_df)

    facilities_df = _df_matching_substring(all_dfs, {'contracted facilities'})
    facilities_df = _format_contracted_facilities(facilities_df)

    # FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat is the supported equivalent.
    result = pd.concat([head_count_ending_df, facilities_df],
                       ignore_index=True)

    result['fips'] = result.facility_name.map(_facility_acronym_to_fips)
    result['facility_name'] = \
        result.facility_name.map(_facility_acronym_to_name)

    # Rows that may be NaN need to be cast as a float, otherwise use int
    string_columns = {'facility_name', 'fips'}
    nullable_columns = {'design_bed_capacity', 'operation_bed_capacity'}
    int_columns = set(result.columns) - string_columns - nullable_columns

    for column_name in int_columns:
        result[column_name] = result[column_name].astype(int)
    for column_name in nullable_columns:
        result[column_name] = result[column_name].astype(float)

    return result
def _parse_table(location: str, filename: str) -> pd.DataFrame:
    """Parses the last table in the GA PDF.

    Returns:
        A DataFrame with one row per jurisdiction; every column except
        county_name is cast to int.
    """
    # Set column names since the pdf makes them hard to parse directly
    column_names = [
        'Index',
        'Jurisdiction',
        'Total Number of Inmates In Jail',
        'Jail Capacity',
        'Inmates as % of Capacity',
        'Number of Inmates Sentenced to State [Number]',
        'Number of Inmates Sentenced to State [% of Total]',
        'Number of Inmates Awaiting Trial in Jail [Number]',
        'Number of Inmates Awaiting Trial in Jail [% of Total]',
        'Number of Inmates Serving County Sentence [Number]',
        'Number of Inmates Serving County Sentence [% of Total]',
        'Number of Other Inmates [Number]',
        'Number of Other Inmates [% of Total]'
    ]

    # Tables at the end of the doc contain all data we want to parse
    pages = [8, 9, 10, 11]

    # Use lattice parsing since default parsing fails to parse columns on
    # the right half of the page
    use_lattice = True

    result = read_pdf(
        location,
        filename,
        pages=pages,
        lattice=use_lattice,
        pandas_options={
            'names': column_names,
            'skiprows': _header_on_each_page(),
            'skipfooter': 1,  # The last row is the grand totals
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })

    result = aggregate_ingest_utils.rename_columns_and_select(
        result, {
            'Jurisdiction': 'county_name',
            'Total Number of Inmates In Jail':
                'total_number_of_inmates_in_jail',
            'Jail Capacity': 'jail_capacity',
            'Number of Inmates Sentenced to State [Number]':
                'number_of_inmates_sentenced_to_state',
            'Number of Inmates Awaiting Trial in Jail [Number]':
                'number_of_inmates_awaiting_trial',
            'Number of Inmates Serving County Sentence [Number]':
                'number_of_inmates_serving_county_sentence',
            'Number of Other Inmates [Number]':
                'number_of_other_inmates'
        })

    # Tabula may parse extra empty rows
    result = result.dropna()

    # FIX: cast_columns_to_int returns the converted frame (it is not
    # in-place); the original code discarded the return value, leaving
    # the numeric columns uncast.
    result = aggregate_ingest_utils.cast_columns_to_int(
        result, ignore_columns={'county_name'})

    return result
def _parse_facility_table(location: str, filename: str) -> pd.DataFrame:
    """Parse the FL County Pretrial Inmate Report - Table 2 in the PDF.

    The table spans pages 5 and 6; the two halves are read separately and
    stitched together.

    Returns:
        A DataFrame with one row per detention facility and columns:
        facility_name, average_daily_population, number_felony_pretrial,
        number_misdemeanor_pretrial.
    """
    # Set column names directly since the pdf format makes them hard to parse
    column_names = [
        'Detention Facility Name',
        'Average Daily Population',
        'Number Felony Pretrial',
        'Number Misdemeanor Pretrial',
        'Total Percent Pretrial']

    part1 = read_pdf(
        location,
        filename,
        pages=[5],
        pandas_options={
            'skiprows': [0, 1, 2],
            'names': column_names,
        })
    part2 = read_pdf(
        location,
        filename,
        pages=[6],
        pandas_options={
            'skiprows': [0, 1, 2],
            'usecols': [0, 2, 3, 4, 5],  # Column 1 contains no data
            'names': column_names,
            'skipfooter': 2,  # The last 2 rows are the totals
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })

    # FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat is the supported equivalent.
    result = pd.concat([part1, part2], ignore_index=True)

    result = aggregate_ingest_utils.rename_columns_and_select(result, {
        'Detention Facility Name': 'facility_name',
        'Average Daily Population': 'average_daily_population',
        'Number Felony Pretrial': 'number_felony_pretrial',
        'Number Misdemeanor Pretrial': 'number_misdemeanor_pretrial'
    })

    result['average_daily_population'] = result[
        'average_daily_population'].apply(_use_stale_adp).apply(_to_int)
    for column_name in {'number_felony_pretrial',
                        'number_misdemeanor_pretrial'}:
        result[column_name] = result[column_name].apply(_to_int)

    return result
def _parse_table(location: str, filename: str, is_female: bool,
                 year: int) -> pd.DataFrame:
    """Reads and formats one gender's tables from the PDF for a year."""
    # Most but not all PDFs have data on pages 2-4.
    if 2000 <= year <= 2005:
        pages = [1, 2]
    elif year in (2006, 2009):
        pages = [3, 4, 5]
    else:
        pages = [2, 3, 4]

    raw_tables = read_pdf(location,
                          filename,
                          pages=pages,
                          multiple_tables=True)

    combined = pd.concat(
        [_format_table(raw, is_female, year) for raw in raw_tables],
        ignore_index=True)

    # Discard 'TOTAL' row.
    combined = combined.iloc[:-1]

    return aggregate_ingest_utils.cast_columns_to_int(
        combined, ignore_columns={'facility_name'})
def _parse_table(location: str, filename: str) -> pd.DataFrame:
    """Parses all tables in the GA PDF."""
    raw_tables = read_pdf(
        location,
        filename,
        pages='all',
        multiple_tables=True,
        lattice=True,
        pandas_options={
            'header': 0
        })

    # Trim unnecessary tables
    relevant_tables = raw_tables[3:-1]

    # Each page-level table may hold several logical tables; split each
    # page, then format every piece.
    formatted = []
    for page_df in relevant_tables:
        for piece in _split_page(page_df):
            formatted.append(_format_df(piece))

    return pd.concat(formatted, ignore_index=True)
def _parse_table(location, filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    Reads every page with tabula (lattice mode), applies hand-written
    repairs for two specific report dates whose cells tabula mis-aligns,
    strips the trailing 'Totals' section, splits the rows into per-county
    blocks, collapses each block into one Male and one Female row, and
    joins those back into a single row per facility.

    Returns:
        DataFrame with one row per facility: facility_name,
        total_jail_beds, reported_population, plus per-gender population
        breakdown columns.
    """
    whole_df = read_pdf(
        location,
        filename,
        pages='all',
        lattice=True
    )

    # Hand-patched repairs for report editions where tabula shifted cells
    # one column too far right; row numbers were determined by inspecting
    # tabula's output for those PDFs.
    if filename.endswith('04-16-20.pdf'):
        whole_df[323:331] = whole_df[323:331].shift(-1, axis='columns')
    elif filename.endswith('07-09-20.pdf'):
        whole_df.loc[432] = whole_df.iloc[432].shift(-1)
        whole_df.loc[434:436] = whole_df.loc[434:436].shift(-1, axis='columns')
        whole_df.loc[438] = whole_df.iloc[438].shift(-1)
        whole_df.loc[440] = whole_df.iloc[440].shift(-1)
        whole_df.loc[442:445] = whole_df.loc[442:445].shift(-1, axis='columns')
        whole_df.loc[447:462] = whole_df.loc[447:462].shift(-1, axis='columns')
        whole_df.loc[464:] = whole_df.loc[464:].shift(-1, axis='columns')
        # Restore 'County' cell values clobbered by the shifts above.
        # (Presumably transcribed from the source PDF — confirm against
        # the report if these rows ever look wrong.)
        whole_df.loc[451, 'County'] = 86
        whole_df.loc[456, 'County'] = 264
        whole_df.loc[461, 'County'] = 52
        whole_df.loc[464, 'County'] = 161
        whole_df.loc[469, 'County'] = 70
        whole_df.loc[472, 'County'] = 204
        whole_df.loc[477, 'County'] = 182
        whole_df.loc[482, 'County'] = 137
        whole_df.loc[487, 'County'] = 45
        whole_df.loc[492, 'County'] = 410
        whole_df.loc[497, 'County'] = 152
        whole_df.loc[500, 'County'] = 95
        whole_df.loc[505, 'County'] = 85
        whole_df.loc[508, 'County'] = 194
        whole_df.loc[513, 'County'] = 72
        whole_df.loc[516, 'County'] = 134
        whole_df.loc[521, 'County'] = 50
        whole_df.loc[524, 'County'] = 63
        whole_df.loc[529, 'County'] = 32

    # Remove totals separate from parsing since it's a variable length
    totals_start_index = np.where(whole_df['Date'].str.contains('Totals'))[0][0]
    whole_df = whole_df[:totals_start_index]

    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = whole_df['County'].astype(str).str.contains('Secure')
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis='columns')

    # Drop repeated header rows picked up from each page.
    whole_df = whole_df[whole_df['County'].astype(str) != 'County']
    # NOTE(review): reset_index is not in-place and the return value is
    # discarded — this line is a no-op. Confirm whether the original
    # index labels are relied on below before "fixing" it.
    whole_df.reset_index(drop=True)
    whole_df = _shift_headers(whole_df)

    # Normalize line breaks that tabula embeds in the header cells.
    whole_df.columns = whole_df.columns.str.replace('\n', ' ')
    whole_df.columns = whole_df.columns.str.replace('\r', ' ')

    # Column names can change over time : (
    column_name_map = {
        'CC Eligible Inmates': 'Community Custody Inmates',
    }
    whole_df.columns = [column_name_map[c] if c in column_name_map
                        else c for c in whole_df.columns]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(whole_df['Total Jail Beds'].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)

    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # This is a typo in several reports
        if '12/' in df['Federal Inmates'].values:
            df['Federal Inmates'] = df['Federal Inmates'].replace({'12/': '12'})

        # Cast everything to int before summing below
        df = df.fillna(0)
        # 'Inmate Cusody' reproduces a misspelling in the report's own
        # column header — do not "correct" it here.
        df = aggregate_ingest_utils.cast_columns_to_int(
            df, ignore_columns={'County', 'Facility Security',
                                'Inmate Cusody'})

        df['Gender'] = None
        df = _collapse_by_gender_rows(df, 'Male')
        df = _collapse_by_gender_rows(df, 'Female')

        # The first row contains header data for both Male and Female
        df['County'] = df['County'][0]
        df['total_jail_beds'] = df['Total Jail Beds'][0]
        df['reported_population'] = \
            df['Reported Population (Total and Male/Female)'][0]
        df = df[1:]

        dfs_grouped_by_gender.append(df)

    df_by_gender = pd.concat(dfs_grouped_by_gender)

    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender['Gender'] == 'Male']
    female_df = df_by_gender[df_by_gender['Gender'] == 'Female']

    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(female_df, {
        'County': 'facility_name',
        'total_jail_beds': 'total_jail_beds',
        'reported_population': 'reported_population',
    })
    male_df = aggregate_ingest_utils.rename_columns_and_select(male_df, {
        'County': 'facility_name',
        # Since we've grouped by Male, this Reported Population is only Male
        'Reported Population (Total and Male/Female)': 'male_population',
        'Class D Inmates': 'class_d_male_population',
        'Community Custody Inmates': 'community_custody_male_population',
        'Alternative Sentence': 'alternative_sentence_male_population',
        'Controlled Intake': 'controlled_intake_male_population',
        'Parole Violators': 'parole_violators_male_population',
        'Federal Inmates': 'federal_male_population',
    })
    female_df = aggregate_ingest_utils.rename_columns_and_select(female_df, {
        'County': 'facility_name',
        # Since we've grouped by Female, this Reported Population is only Female
        'Reported Population (Total and Male/Female)': 'female_population',
        'Class D Inmates': 'class_d_female_population',
        'Community Custody Inmates': 'community_custody_female_population',
        'Alternative Sentence': 'alternative_sentence_female_population',
        'Controlled Intake': 'controlled_intake_female_population',
        'Parole Violators': 'parole_violators_female_population',
        'Federal Inmates': 'federal_female_population',
    })

    # Join the male and female aggregates back onto the shared columns,
    # one row per facility.
    result = shared_df.join(male_df.set_index('facility_name'),
                            on='facility_name')
    result = result.join(female_df.set_index('facility_name'),
                         on='facility_name')

    # One-off correction for a known bad value in the 04-16-20 report.
    if filename.endswith('04-16-20.pdf'):
        result.loc[result['facility_name'] == 'Lincoln',
                   'total_jail_beds'] = 72

    return result.reset_index(drop=True)
def _parse_table(location: str, filename: str,
                 report_date: datetime.date) -> pd.DataFrame:
    """Parses the TX County Table in the PDF.

    Reads each of the 9 pages independently, keeps only the last table on
    each page (the one with the data), scrubs empty rows/columns, splits
    columns that tabula concatenated, then renames everything to the
    schema for the given report date and concatenates the pages.

    Returns:
        DataFrame with one row per county; every column except
        facility_name is cast to int.
    """
    num_pages = 9
    # Column-name mapping varies by report date.
    columns_to_schema = _get_column_names(report_date)
    pages = []
    for page_num in range(1, num_pages + 1):
        # Each page has 1 or more tables on it with the last table being the
        # one with the data on it. The headers are poorly read by tabula and
        # some years have different responses to this call so we generally
        # just get all of the tables and consider only the one with numbers on
        # it. That lets us clean it up by dropping nonsense columns and rows,
        # and then assigning our own columns names to them.
        df = read_pdf(
            location,
            filename,
            multiple_tables=True,
            pages=page_num,
        )
        df = df[-1]
        # Drop columns with fewer than 5 non-NA cells (parse artifacts).
        df = df.dropna(axis='columns', thresh=5)

        # We want to remove all of the rows and columns that have no data.
        numeric_elements = df.apply(pd.to_numeric, errors='coerce').notnull()
        rows_containing_data = numeric_elements.any(axis='columns')
        df = df.loc[rows_containing_data]

        # Next we finally break up some of the columns that were incorrectly
        # concatenated.
        for column in df.columns[1:]:
            # By this point we should only have numeric data in the rows,
            # if this happens it means some columns were concatenated and they
            # must be split. If the columns are concatenated, we need only
            # check one of the rows for a space because they are all
            # concatenated.
            if ' ' in df[column].iloc[0]:
                index_to_insert = df.columns.get_loc(column)
                # Split on the first whitespace into two new columns,
                # inserted at the original column's position.
                df_temp = pd.DataFrame(
                    df.pop(column).str.split(n=1, expand=True))
                df.insert(index_to_insert, str(column) + '_a', df_temp[0])
                df.insert(index_to_insert + 1, str(column) + '_b', df_temp[1])
        pages.append(df)

    # Drop last rows since it's the 'Totals' section
    pages[-1] = pages[-1].drop(pages[-1].tail(1).index)

    # Build result for all the pages. We rename the columns before calling
    # concat because the column names might all be different. Renaming them
    # allows concat to pass happily.
    columns_to_drop = ['percent_capacity', 'total_local']
    for i, page in enumerate(pages):
        page.columns = columns_to_schema.keys()
        page = aggregate_ingest_utils.rename_columns_and_select(
            page, columns_to_schema)
        # We don't care about % of capacity and total_local so we drop these
        # columns.
        page = page.drop(columns_to_drop, axis='columns')
        pages[i] = page

    result = pd.concat(pages, ignore_index=True)

    for column_name in set(result.columns) - {'facility_name'}:
        result[column_name] = result[column_name].astype(int)

    return result
def _parse_table(location, filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    Reads the whole report with tabula (lattice mode), strips the trailing
    'Totals' section, splits the rows into per-county blocks, collapses
    each block into one Male and one Female row, and joins those back into
    a single row per facility.

    Returns:
        DataFrame with one row per facility: facility_name,
        total_jail_beds, reported_population, plus per-gender population
        breakdown columns.
    """
    whole_df = read_pdf(location, filename, pages='all', lattice=True)

    # Remove totals separate from parsing since it's a variable length
    totals_start_index = np.where(
        whole_df['Date'].str.contains('Totals'))[0][0]
    whole_df = whole_df[:totals_start_index]

    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = whole_df['County'].astype(str).str.contains('Secure')
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis='columns')

    # Drop repeated header rows picked up from each page.
    whole_df = whole_df[whole_df['County'].astype(str) != 'County']
    # NOTE(review): reset_index is not in-place and the return value is
    # discarded — this line is a no-op. Confirm whether the original
    # index labels are relied on below before "fixing" it.
    whole_df.reset_index(drop=True)
    whole_df = _shift_headers(whole_df)

    # Normalize line breaks that tabula embeds in the header cells.
    whole_df.columns = whole_df.columns.str.replace('\n', ' ')
    whole_df.columns = whole_df.columns.str.replace('\r', ' ')

    # Column names can change over time : (
    column_name_map = {
        'CC Eligible Inmates': 'Community Custody Inmates',
    }
    whole_df.columns = [
        column_name_map[c] if c in column_name_map else c
        for c in whole_df.columns
    ]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(
        whole_df['Total Jail Beds'].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)

    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # Cast everything to int before summing below
        df = df.fillna(0)
        # 'Inmate Cusody' reproduces a misspelling in the report's own
        # column header — do not "correct" it here.
        df = aggregate_ingest_utils.cast_columns_to_int(
            df, ignore_columns={'County', 'Facility Security',
                                'Inmate Cusody'})

        df['Gender'] = None
        df = _collapse_by_gender_rows(df, 'Male')
        df = _collapse_by_gender_rows(df, 'Female')

        # The first row contains header data for both Male and Female
        df['County'] = df['County'][0]
        df['total_jail_beds'] = df['Total Jail Beds'][0]
        df['reported_population'] = \
            df['Reported Population (Total and Male/Female)'][0]
        df = df[1:]

        dfs_grouped_by_gender.append(df)

    df_by_gender = pd.concat(dfs_grouped_by_gender)

    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender['Gender'] == 'Male']
    female_df = df_by_gender[df_by_gender['Gender'] == 'Female']

    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df, {
            'County': 'facility_name',
            'total_jail_beds': 'total_jail_beds',
            'reported_population': 'reported_population',
        })
    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_df, {
            'County': 'facility_name',
            # Since we've grouped by Male, this Reported Population is only Male
            'Reported Population (Total and Male/Female)': 'male_population',
            'Class D Inmates': 'class_d_male_population',
            'Community Custody Inmates': 'community_custody_male_population',
            'Alternative Sentence': 'alternative_sentence_male_population',
            'Controlled Intake': 'controlled_intake_male_population',
            'Parole Violators': 'parole_violators_male_population',
            'Federal Inmates': 'federal_male_population',
        })
    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df, {
            'County': 'facility_name',
            # Since we've grouped by Female, this Reported Population is only Female
            'Reported Population (Total and Male/Female)': 'female_population',
            'Class D Inmates': 'class_d_female_population',
            'Community Custody Inmates': 'community_custody_female_population',
            'Alternative Sentence': 'alternative_sentence_female_population',
            'Controlled Intake': 'controlled_intake_female_population',
            'Parole Violators': 'parole_violators_female_population',
            'Federal Inmates': 'federal_female_population',
        })

    # Join the male and female aggregates back onto the shared columns,
    # one row per facility.
    result = shared_df.join(male_df.set_index('facility_name'),
                            on='facility_name')
    result = result.join(female_df.set_index('facility_name'),
                         on='facility_name')

    return result.reset_index(drop=True)