Python cast_columns_to_intの例、recidiviz.ingest.aggregate.aggregate_ingest_utils.cast_columns_to_int Pythonの例

コード例 #1

0

ファイルを表示

ファイル: ga_aggregate_ingest.py プロジェクト: xgenie-007/pulse-data

def _parse_table(location: str, filename: str) -> pd.DataFrame:
    """Parses the last table in the GA PDF.

    Reads pages 8-11 with lattice parsing, keeps the jurisdiction name and
    the raw-count columns under snake_case names, drops rows containing
    missing values and casts every column except 'county_name' to int.

    Args:
        location: Forwarded to read_pdf together with |filename|.
        filename: The PDF to parse.

    Returns:
        A DataFrame holding 'county_name' plus the selected count columns.
    """

    # Set column names since the pdf makes them hard to parse directly
    column_names = [
        'Index', 'Jurisdiction', 'Total Number of Inmates In Jail',
        'Jail Capacity', 'Inmates as % of Capacity',
        'Number of Inmates Sentenced to State [Number]',
        'Number of Inmates Sentenced to State [% of Total]',
        'Number of Inmates Awaiting Trial in Jail [Number]',
        'Number of Inmates Awaiting Trial in Jail [% of Total]',
        'Number of Inmates Serving County Sentence [Number]',
        'Number of Inmates Serving County Sentence [% of Total]',
        'Number of Other Inmates [Number]',
        'Number of Other Inmates [% of Total]'
    ]

    # Tables at the end of the doc contain all data we want to parse
    pages = [8, 9, 10, 11]

    # Use lattice parsing since default parsing fails to parse columns on
    # the right half of the page
    use_lattice = True

    result = read_pdf(
        location,
        filename,
        pages=pages,
        lattice=use_lattice,
        pandas_options={
            'names': column_names,
            'skiprows': _header_on_each_page(),
            'skipfooter': 1,  # The last row is the grand totals
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })

    # Only the name and the raw counts are kept; the '% of ...' columns and
    # the index column are absent from this mapping.
    result = aggregate_ingest_utils.rename_columns_and_select(
        result, {
            'Jurisdiction': 'county_name',
            'Total Number of Inmates In Jail':
            'total_number_of_inmates_in_jail',
            'Jail Capacity': 'jail_capacity',
            'Number of Inmates Sentenced to State [Number]':
            'number_of_inmates_sentenced_to_state',
            'Number of Inmates Awaiting Trial in Jail [Number]':
            'number_of_inmates_awaiting_trial',
            'Number of Inmates Serving County Sentence [Number]':
            'number_of_inmates_serving_county_sentence',
            'Number of Other Inmates [Number]': 'number_of_other_inmates'
        })

    # Tabula may parse extra empty rows
    result = result.dropna()

    # Use the frame returned by the cast rather than relying on in-place
    # mutation of the dropna() result.
    return aggregate_ingest_utils.cast_columns_to_int(
        result, ignore_columns={'county_name'})

コード例 #2

0

ファイルを表示

ファイル: tn_aggregate_ingest.py プロジェクト: Leo-Ryu/pulse-data

def _parse_table(_: str, filename: str, is_female: bool,
                 report_date: datetime.date) -> pd.DataFrame:
    """Reads the report tables in |filename| into one integer-typed frame.

    Args:
        _: Unused.
        filename: The PDF handed to tabula.
        is_female: Forwarded to _format_table; also selects the special
            handling for the April-June 2020 reports.
        report_date: Its year picks the pages to read, and year/month pick
            the special table layout.

    Returns:
        The formatted tables concatenated, minus the final row, with every
        column except 'facility_name' cast to int.
    """
    year = report_date.year

    # Most but not all PDFs have data on pages 2-4.
    if 2000 <= year <= 2005:
        pages = [1, 2]
    elif year in (2006, 2009):
        pages = [3, 4, 5]
    else:
        pages = [2, 3, 4]

    raw_tables = tabula.read_pdf(filename, pages=pages, multiple_tables=True)

    # For these reports tables 1+2 and 3+4 are stitched back together
    # before formatting.
    needs_stitching = (is_female and year == 2020
                       and report_date.month in (4, 5, 6))
    if needs_stitching:
        raw_tables = [
            raw_tables[0],
            pd.concat((raw_tables[1], raw_tables[2])),
            pd.concat((raw_tables[3], raw_tables[4])),
        ]

    combined = pd.concat(
        [_format_table(raw, is_female, year) for raw in raw_tables],
        ignore_index=True)

    # Discard 'TOTAL' row.
    without_total = combined.iloc[:-1]

    return aggregate_ingest_utils.cast_columns_to_int(
        without_total, ignore_columns={'facility_name'})

コード例 #3

0

ファイルを表示

def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the CA aggregate report.

    Loads the first HTML table found in |filename|, replaces missing values
    with 0, derives 'report_date' from the 'Year' and 'Month' columns,
    renames/selects the columns of interest and casts the non-string
    columns to int.
    """
    # Although the file is downloaded with the '.xls' extension, the contents
    # of the file are in the shape of an HTML file.
    html_tables = pd.read_html(filename, header=0)
    report = html_tables[0].fillna(0)

    year_and_month = report[['Year', 'Month']]
    report['report_date'] = year_and_month.apply(_last_date_of_month,
                                                 axis='columns')

    renamed_columns = {
        'Jurisdiction': 'jurisdiction_name',
        'Facility': 'facility_name',
        'Total facility ADP': 'average_daily_population',
        'Unsentenced males': 'unsentenced_male_adp',
        'Unsentenced females': 'unsentenced_female_adp',
        'Sentenced males': 'sentenced_male_adp',
        'Sentenced females': 'sentenced_female_adp',
        'report_date': 'report_date'
    }
    report = aggregate_ingest_utils.rename_columns_and_select(
        report, renamed_columns)

    # These columns are left untouched by the int cast.
    non_numeric = {'jurisdiction_name', 'facility_name', 'report_date'}
    return aggregate_ingest_utils.cast_columns_to_int(
        report, ignore_columns=non_numeric)

コード例 #4

0

ファイルを表示

ファイル: ca_aggregate_ingest.py プロジェクト: Recidiviz/pulse-data

def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the CA aggregate report.

    The first HTML table in |filename| is read, missing values become 0, a
    "report_date" column is computed from "Year" and "Month", the columns of
    interest are renamed/selected and everything except the string columns
    is cast to int.
    """
    # Although the file is downloaded with the '.xls' extension, the contents
    # of the file are in the shape of an HTML file.
    first_table = pd.read_html(filename, header=0)[0]
    data = first_table.fillna(0)

    data["report_date"] = data[["Year", "Month"]].apply(
        _last_date_of_month, axis="columns"
    )

    column_renames = {
        "Jurisdiction": "jurisdiction_name",
        "Facility": "facility_name",
        "Total facility ADP": "average_daily_population",
        "Unsentenced males": "unsentenced_male_adp",
        "Unsentenced females": "unsentenced_female_adp",
        "Sentenced males": "sentenced_male_adp",
        "Sentenced females": "sentenced_female_adp",
        "report_date": "report_date",
    }
    data = aggregate_ingest_utils.rename_columns_and_select(data, column_renames)

    # Columns excluded from the int cast.
    keep_as_is = {"jurisdiction_name", "facility_name", "report_date"}
    return aggregate_ingest_utils.cast_columns_to_int(
        data, ignore_columns=keep_as_is
    )

コード例 #5

0

ファイルを表示

def _parse_table(location: str, filename: str, is_female: bool,
                 year: int) -> pd.DataFrame:
    """Reads the report tables in |filename| into one integer-typed frame.

    Args:
        location: Forwarded to read_pdf together with |filename|.
        filename: The PDF to parse.
        is_female: Forwarded to _format_table.
        year: Picks the pages to read and is forwarded to _format_table.

    Returns:
        The formatted tables concatenated, minus the final row, with every
        column except 'facility_name' cast to int.
    """
    # Most but not all PDFs have data on pages 2-4.
    if 2000 <= year <= 2005:
        pages = [1, 2]
    elif year in (2006, 2009):
        pages = [3, 4, 5]
    else:
        pages = [2, 3, 4]

    raw_tables = read_pdf(location, filename, pages=pages,
                          multiple_tables=True)

    combined = pd.concat(
        [_format_table(raw, is_female, year) for raw in raw_tables],
        ignore_index=True)

    # Discard 'TOTAL' row.
    without_total = combined.iloc[:-1]

    return aggregate_ingest_utils.cast_columns_to_int(
        without_total, ignore_columns={'facility_name'})

コード例 #6

0

ファイルを表示

ファイル: ky_aggregate_ingest.py プロジェクト: Leo-Ryu/pulse-data

def _parse_table(_, filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    The raw tabula output is patched for two known reports, cut off at the
    totals section, realigned, split into per-county blocks and collapsed by
    gender via _collapse_by_gender_rows. The male and female rows are then
    renamed and joined back together on the facility name.

    Args:
        _: Unused.
        filename: The PDF to parse. Files ending in '04-16-20.pdf' and
            '07-09-20.pdf' receive hard-coded corrections.

    Returns:
        A DataFrame with 'facility_name', 'total_jail_beds',
        'reported_population' and the per-gender population columns, with a
        fresh 0..n-1 index.
    """
    whole_df = tabula.read_pdf(
        filename,
        pages='all',
        lattice=True
    )

    if filename.endswith('04-16-20.pdf'):
        # Move these rows one column to the left (report-specific patch).
        whole_df[323:331] = whole_df[323:331].shift(-1, axis='columns')
    elif filename.endswith('07-09-20.pdf'):
        # Report-specific patch: move the listed rows one column to the left.
        whole_df.loc[432] = whole_df.iloc[432].shift(-1)
        whole_df.loc[434:436] = whole_df.loc[434:436].shift(-1, axis='columns')
        whole_df.loc[438] = whole_df.iloc[438].shift(-1)
        whole_df.loc[440] = whole_df.iloc[440].shift(-1)
        whole_df.loc[442:445] = whole_df.loc[442:445].shift(-1, axis='columns')
        whole_df.loc[447:462] = whole_df.loc[447:462].shift(-1, axis='columns')
        whole_df.loc[464:] = whole_df.loc[464:].shift(-1, axis='columns')

        # Hard-coded 'County' cell values for this report, keyed by row label.
        county_overrides = {
            451: 86,
            456: 264,
            461: 52,
            464: 161,
            469: 70,
            472: 204,
            477: 182,
            482: 137,
            487: 45,
            492: 410,
            497: 152,
            500: 95,
            505: 85,
            508: 194,
            513: 72,
            516: 134,
            521: 50,
            524: 63,
            529: 32,
        }
        for row_label, value in county_overrides.items():
            whole_df.loc[row_label, 'County'] = value

    # Remove totals separate from parsing since it's a variable length
    totals_start_index = np.where(whole_df['Date'].str.contains('Totals'))[0][0]
    whole_df = whole_df[:totals_start_index]

    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = whole_df['County'].astype(str).str.contains('Secure')
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis='columns')

    # Drop rows that merely repeat the header text
    whole_df = whole_df[whole_df['County'].astype(str) != 'County']

    # reset_index returns a new frame; the result has to be assigned for the
    # renumbering to take effect (previously it was silently discarded).
    whole_df = whole_df.reset_index(drop=True)

    whole_df = _shift_headers(whole_df)
    whole_df.columns = whole_df.columns.str.replace('\n', ' ')
    whole_df.columns = whole_df.columns.str.replace('\r', ' ')

    # Column names can change over time : (
    column_name_map = {
        'CC Eligible Inmates': 'Community Custody Inmates',
    }
    whole_df.columns = [column_name_map.get(c, c) for c in whole_df.columns]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(whole_df['Total Jail Beds'].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)

    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # This is a typo in several reports
        if '12/' in df['Federal Inmates'].values:
            df['Federal Inmates'] = df['Federal Inmates'].replace({'12/': '12'})

        # Cast everything to int before summing below
        df = df.fillna(0)
        # NOTE(review): 'Inmate Cusody' is kept exactly as spelled in the
        # original; confirm it matches the parsed header.
        df = aggregate_ingest_utils.cast_columns_to_int(
            df, ignore_columns={'County', 'Facility Security', 'Inmate Cusody'})

        df['Gender'] = None
        df = _collapse_by_gender_rows(df, 'Male')
        df = _collapse_by_gender_rows(df, 'Female')

        # The first row contains header data for both Male and Female
        df['County'] = df['County'][0]
        df['total_jail_beds'] = df['Total Jail Beds'][0]
        df['reported_population'] = \
            df['Reported Population (Total and Male/Female)'][0]
        df = df[1:]

        dfs_grouped_by_gender.append(df)

    df_by_gender = pd.concat(dfs_grouped_by_gender)

    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender['Gender'] == 'Male']
    female_df = df_by_gender[df_by_gender['Gender'] == 'Female']

    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(female_df, {
        'County': 'facility_name',
        'total_jail_beds': 'total_jail_beds',
        'reported_population': 'reported_population',
    })

    male_df = aggregate_ingest_utils.rename_columns_and_select(male_df, {
        'County': 'facility_name',
        # Since we've grouped by Male, this Reported Population is only Male
        'Reported Population (Total and Male/Female)': 'male_population',
        'Class D Inmates': 'class_d_male_population',
        'Community Custody Inmates': 'community_custody_male_population',
        'Alternative Sentence': 'alternative_sentence_male_population',
        'Controlled Intake': 'controlled_intake_male_population',
        'Parole Violators': 'parole_violators_male_population',
        'Federal Inmates': 'federal_male_population',
    })

    female_df = aggregate_ingest_utils.rename_columns_and_select(female_df, {
        'County': 'facility_name',
        # Since we've grouped by Female, this Reported Population is only Female
        'Reported Population (Total and Male/Female)': 'female_population',
        'Class D Inmates': 'class_d_female_population',
        'Community Custody Inmates': 'community_custody_female_population',
        'Alternative Sentence': 'alternative_sentence_female_population',
        'Controlled Intake': 'controlled_intake_female_population',
        'Parole Violators': 'parole_violators_female_population',
        'Federal Inmates': 'federal_female_population',
    })

    result = shared_df.join(male_df.set_index('facility_name'),
                            on='facility_name')
    result = result.join(female_df.set_index('facility_name'),
                         on='facility_name')

    if filename.endswith('04-16-20.pdf'):
        # Hard-coded bed count for 'Lincoln' in this report
        result.loc[result['facility_name'] == 'Lincoln', 'total_jail_beds'] = 72

    return result.reset_index(drop=True)

コード例 #7

0

ファイルを表示

ファイル: ky_aggregate_ingest.py プロジェクト: Recidiviz/pulse-data

def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    The single table tabula returns is patched for two known reports, cut
    off at the totals section, realigned, split into per-county blocks and
    collapsed by gender via _collapse_by_gender_rows. The male and female
    rows are then renamed and joined back together on the facility name.

    Args:
        filename: The PDF to parse. Files ending in "04-16-20.pdf" and
            "07-09-20.pdf" receive hard-coded corrections.

    Returns:
        A DataFrame with "facility_name", "total_jail_beds",
        "reported_population" and the per-gender population columns, with a
        fresh 0..n-1 index.
    """
    whole_df = one(
        tabula.read_pdf(filename,
                        pages="all",
                        multiple_tables=False,
                        lattice=True))

    if filename.endswith("04-16-20.pdf"):
        # Move these rows one column to the left (report-specific patch).
        whole_df[323:331] = whole_df[323:331].shift(-1, axis="columns")
    elif filename.endswith("07-09-20.pdf"):
        # Report-specific patch: move the listed rows one column to the left.
        whole_df.loc[432] = whole_df.iloc[432].shift(-1)
        whole_df.loc[434:436] = whole_df.loc[434:436].shift(-1, axis="columns")
        whole_df.loc[438] = whole_df.iloc[438].shift(-1)
        whole_df.loc[440] = whole_df.iloc[440].shift(-1)
        whole_df.loc[442:445] = whole_df.loc[442:445].shift(-1, axis="columns")
        whole_df.loc[447:462] = whole_df.loc[447:462].shift(-1, axis="columns")
        whole_df.loc[464:] = whole_df.loc[464:].shift(-1, axis="columns")

        # Hard-coded "County" cell values for this report, keyed by row label.
        county_overrides = {
            451: 86,
            456: 264,
            461: 52,
            464: 161,
            469: 70,
            472: 204,
            477: 182,
            482: 137,
            487: 45,
            492: 410,
            497: 152,
            500: 95,
            505: 85,
            508: 194,
            513: 72,
            516: 134,
            521: 50,
            524: 63,
            529: 32,
        }
        for row_label, value in county_overrides.items():
            whole_df.loc[row_label, "County"] = value

    # Remove totals separate from parsing since it's a variable length
    totals_start_index = np.where(
        whole_df["Date"].str.contains("Totals"))[0][0]
    whole_df = whole_df[:totals_start_index]

    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = whole_df["County"].astype(str).str.contains("Secure")
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis="columns")

    # Drop rows that merely repeat the header text
    whole_df = whole_df[whole_df["County"].astype(str) != "County"]

    # reset_index returns a new frame; the result has to be assigned for the
    # renumbering to take effect (previously it was silently discarded).
    whole_df = whole_df.reset_index(drop=True)

    whole_df = _shift_headers(whole_df)
    whole_df.columns = whole_df.columns.str.replace("\n", " ")
    whole_df.columns = whole_df.columns.str.replace("\r", " ")

    # Column names can change over time : (
    column_name_map = {
        "CC Eligible Inmates": "Community Custody Inmates",
    }
    whole_df.columns = [column_name_map.get(c, c) for c in whole_df.columns]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(
        whole_df["Total Jail Beds"].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)

    # Known typos in several reports: (column, bad cell value, replacement)
    typo_fixes = (
        ("Federal Inmates", "12/", "12"),
        ("Federal Inmates", "yo", "0"),
        ("Federal Inmates", "pe", "0"),
        ("Reported Population (Total and Male/Female)", "(", "0"),
    )

    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        for column, bad_value, replacement in typo_fixes:
            if bad_value in df[column].values:
                df[column] = df[column].replace({bad_value: replacement})

        # Cast everything to int before summing below
        df = df.fillna(0)
        # NOTE(review): "Inmate Cusody" is kept exactly as spelled in the
        # original; confirm it matches the parsed header.
        df = aggregate_ingest_utils.cast_columns_to_int(
            df,
            ignore_columns={"County", "Facility Security", "Inmate Cusody"})

        df["Gender"] = None
        df = _collapse_by_gender_rows(df, "Male")
        df = _collapse_by_gender_rows(df, "Female")

        # The first row contains header data for both Male and Female
        df["County"] = df["County"][0]
        df["total_jail_beds"] = df["Total Jail Beds"][0]
        df["reported_population"] = df[
            "Reported Population (Total and Male/Female)"][0]
        df = df[1:]

        dfs_grouped_by_gender.append(df)

    df_by_gender = pd.concat(dfs_grouped_by_gender)

    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender["Gender"] == "Male"]
    female_df = df_by_gender[df_by_gender["Gender"] == "Female"]

    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df,
        {
            "County": "facility_name",
            "total_jail_beds": "total_jail_beds",
            "reported_population": "reported_population",
        },
    )

    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_df,
        {
            "County": "facility_name",
            # Since we've grouped by Male, this Reported Population is only Male
            "Reported Population (Total and Male/Female)": "male_population",
            "Class D Inmates": "class_d_male_population",
            "Community Custody Inmates": "community_custody_male_population",
            "Alternative Sentence": "alternative_sentence_male_population",
            "Controlled Intake": "controlled_intake_male_population",
            "Parole Violators": "parole_violators_male_population",
            "Federal Inmates": "federal_male_population",
        },
    )

    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df,
        {
            "County": "facility_name",
            # Since we've grouped by Female, this Reported Population is only Female
            "Reported Population (Total and Male/Female)": "female_population",
            "Class D Inmates": "class_d_female_population",
            "Community Custody Inmates": "community_custody_female_population",
            "Alternative Sentence": "alternative_sentence_female_population",
            "Controlled Intake": "controlled_intake_female_population",
            "Parole Violators": "parole_violators_female_population",
            "Federal Inmates": "federal_female_population",
        },
    )

    result = shared_df.join(male_df.set_index("facility_name"),
                            on="facility_name")
    result = result.join(female_df.set_index("facility_name"),
                         on="facility_name")

    if filename.endswith("04-16-20.pdf"):
        # Hard-coded bed count for "Lincoln" in this report
        result.loc[result["facility_name"] == "Lincoln",
                   "total_jail_beds"] = 72

    return result.reset_index(drop=True)

コード例 #8

0

ファイルを表示

def _parse_table(location, filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    The parsed table is cut off at the totals section, realigned, split into
    per-county blocks and collapsed by gender via _collapse_by_gender_rows.
    The male and female rows are then renamed and joined back together on
    the facility name.

    Args:
        location: Forwarded to read_pdf together with |filename|.
        filename: The PDF to parse.

    Returns:
        A DataFrame with 'facility_name', 'total_jail_beds',
        'reported_population' and the per-gender population columns, with a
        fresh 0..n-1 index.
    """
    whole_df = read_pdf(location, filename, pages='all', lattice=True)

    # Remove totals separate from parsing since it's a variable length
    totals_start_index = np.where(
        whole_df['Date'].str.contains('Totals'))[0][0]
    whole_df = whole_df[:totals_start_index]

    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = whole_df['County'].astype(str).str.contains('Secure')
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis='columns')

    # Drop rows that merely repeat the header text
    whole_df = whole_df[whole_df['County'].astype(str) != 'County']

    # reset_index returns a new frame; the result has to be assigned for the
    # renumbering to take effect (previously it was silently discarded).
    whole_df = whole_df.reset_index(drop=True)

    whole_df = _shift_headers(whole_df)
    whole_df.columns = whole_df.columns.str.replace('\n', ' ')
    whole_df.columns = whole_df.columns.str.replace('\r', ' ')

    # Column names can change over time : (
    column_name_map = {
        'CC Eligible Inmates': 'Community Custody Inmates',
    }
    whole_df.columns = [column_name_map.get(c, c) for c in whole_df.columns]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(
        whole_df['Total Jail Beds'].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)

    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # Cast everything to int before summing below
        df = df.fillna(0)
        # NOTE(review): 'Inmate Cusody' is kept exactly as spelled in the
        # original; confirm it matches the parsed header.
        df = aggregate_ingest_utils.cast_columns_to_int(
            df,
            ignore_columns={'County', 'Facility Security', 'Inmate Cusody'})

        df['Gender'] = None
        df = _collapse_by_gender_rows(df, 'Male')
        df = _collapse_by_gender_rows(df, 'Female')

        # The first row contains header data for both Male and Female
        df['County'] = df['County'][0]
        df['total_jail_beds'] = df['Total Jail Beds'][0]
        df['reported_population'] = \
            df['Reported Population (Total and Male/Female)'][0]
        df = df[1:]

        dfs_grouped_by_gender.append(df)

    df_by_gender = pd.concat(dfs_grouped_by_gender)

    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender['Gender'] == 'Male']
    female_df = df_by_gender[df_by_gender['Gender'] == 'Female']

    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df, {
            'County': 'facility_name',
            'total_jail_beds': 'total_jail_beds',
            'reported_population': 'reported_population',
        })

    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_df,
        {
            'County': 'facility_name',
            # Since we've grouped by Male, this Reported Population is only Male
            'Reported Population (Total and Male/Female)': 'male_population',
            'Class D Inmates': 'class_d_male_population',
            'Community Custody Inmates': 'community_custody_male_population',
            'Alternative Sentence': 'alternative_sentence_male_population',
            'Controlled Intake': 'controlled_intake_male_population',
            'Parole Violators': 'parole_violators_male_population',
            'Federal Inmates': 'federal_male_population',
        })

    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df,
        {
            'County': 'facility_name',
            # Since we've grouped by Female, this Reported Population is only Female
            'Reported Population (Total and Male/Female)': 'female_population',
            'Class D Inmates': 'class_d_female_population',
            'Community Custody Inmates': 'community_custody_female_population',
            'Alternative Sentence': 'alternative_sentence_female_population',
            'Controlled Intake': 'controlled_intake_female_population',
            'Parole Violators': 'parole_violators_female_population',
            'Federal Inmates': 'federal_female_population',
        })

    result = shared_df.join(male_df.set_index('facility_name'),
                            on='facility_name')
    result = result.join(female_df.set_index('facility_name'),
                         on='facility_name')

    return result.reset_index(drop=True)

コード例 #9

0

ファイルを表示

def _parse_table(filename: str, report_date: datetime.date) -> pd.DataFrame:
    """Parses the last table in the GA PDF.

    Reads pages 8-11 with lattice parsing, keeps the jurisdiction name and
    the raw-count columns under snake_case names, drops rows containing
    missing values and casts every column except "county_name" to int.

    Args:
        filename: The PDF to parse. A file ending in "jun_19.pdf" is read as
            multiple tables and stitched together manually.
        report_date: Reports dated 2020-11-05 or later use a fixed list of
            rows to skip instead of _header_on_each_page().

    Returns:
        A DataFrame holding "county_name" plus the selected count columns.
    """

    # Set column names since the pdf makes them hard to parse directly
    column_names = [
        "Index",
        "Jurisdiction",
        "Total Number of Inmates In Jail",
        "Jail Capacity",
        "Inmates as % of Capacity",
        "Number of Inmates Sentenced to State [Number]",
        "Number of Inmates Sentenced to State [% of Total]",
        "Number of Inmates Awaiting Trial in Jail [Number]",
        "Number of Inmates Awaiting Trial in Jail [% of Total]",
        "Number of Inmates Serving County Sentence [Number]",
        "Number of Inmates Serving County Sentence [% of Total]",
        "Number of Other Inmates [Number]",
        "Number of Other Inmates [% of Total]",
    ]

    # Tables at the end of the doc contain all data we want to parse
    pages = [8, 9, 10, 11]

    # Use lattice parsing since default parsing fails to parse columns on
    # the right half of the page
    use_lattice = True

    if filename.endswith("jun_19.pdf"):
        # Tabula can't handle the multiple tables because it thinks the one on
        # the last page has extra columns. This concats them manually.
        *dfs, df4 = tabula.read_pdf(filename,
                                    pages=pages,
                                    lattice=use_lattice,
                                    multiple_tables=True)
        # Drop the last row and keep columns 1-13 of the last table, then
        # renumber its columns 0-12.
        df4 = df4.iloc[:-1, 1:14]
        df4.columns = range(13)
        # Strip stray spaces/apostrophes from one cell
        df4.iloc[33, 1] = df4.iloc[33, 1].strip(" '")
        dfs.append(df4)
        # The first row of every table is dropped before concatenating
        result = pd.concat(df.iloc[1:] for df in dfs)
        result.columns = column_names
    else:
        # Both remaining layouts are read identically except for which rows
        # are skipped.
        if report_date >= datetime.date(2020, 11, 5):
            # Skip every 48th row for new-style reports
            skiprows = [x * 48 for x in range(4)]
        else:
            skiprows = _header_on_each_page()

        result = one(
            tabula.read_pdf(
                filename,
                pages=pages,
                lattice=use_lattice,
                multiple_tables=False,
                pandas_options={
                    "names": column_names,
                    "skiprows": skiprows,
                    "skipfooter": 1,  # The last row is the grand totals
                    "engine":
                    "python",  # Only python engine supports 'skipfooter'
                },
            ))

    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Jurisdiction": "county_name",
            "Total Number of Inmates In Jail":
            "total_number_of_inmates_in_jail",
            "Jail Capacity": "jail_capacity",
            "Number of Inmates Sentenced to State [Number]":
            "number_of_inmates_sentenced_to_state",
            "Number of Inmates Awaiting Trial in Jail [Number]":
            "number_of_inmates_awaiting_trial",
            "Number of Inmates Serving County Sentence [Number]":
            "number_of_inmates_serving_county_sentence",
            "Number of Other Inmates [Number]": "number_of_other_inmates",
        },
    )

    # Tabula may parse extra empty rows
    result = result.dropna()

    # Use the frame returned by the cast rather than relying on in-place
    # mutation of the dropna() result.
    return aggregate_ingest_utils.cast_columns_to_int(
        result, ignore_columns={"county_name"})