def _parse_county_table(location: str, filename: str) -> pd.DataFrame:
    """Parses the FL County - Table 1 in the PDF.

    The table spans pages 3 and 4; the two halves are read separately and
    stitched together.

    Args:
        location: directory/bucket the PDF lives in (passed to read_pdf).
        filename: name of the PDF file.

    Returns:
        A DataFrame with one row per county and columns: county_name,
        county_population, average_daily_population, date_reported.
    """
    part1 = read_pdf(
        location,
        filename,
        pages=[3],
        pandas_options={
            'header': [0, 1],
        })
    part2 = read_pdf(
        location,
        filename,
        pages=[4],
        pandas_options={
            'header': [0, 1],
            'skipfooter': 1,  # The last row is the total
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })

    # FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat is the supported equivalent.
    result = pd.concat([part1, part2], ignore_index=True)

    result.columns = aggregate_ingest_utils.collapse_header(result.columns)
    result = aggregate_ingest_utils.rename_columns_and_select(
        result, {
            'Florida County': 'county_name',
            'County Population': 'county_population',
            'Average Daily Population (ADP)': 'average_daily_population',
            '*Date Reported': 'date_reported'
        })

    # Population columns come back as locale-formatted strings
    # (e.g. '1,234'); locale.atoi strips the thousands separators.
    for column_name in {'county_population', 'average_daily_population'}:
        result[column_name] = result[column_name].apply(locale.atoi)

    result['date_reported'] = pd.to_datetime(result['date_reported'])

    return result
def _parse_table(location: str, filename: str, is_female: bool,
                 report_date: datetime.date) -> pd.DataFrame:
    """Reads, reassembles, and formats one gender's tables from the PDF."""
    year = report_date.year

    # Most but not all PDFs have data on pages 2-4.
    if 2000 <= year <= 2005:
        pages = [1, 2]
    elif year in (2006, 2009):
        pages = [3, 4, 5]
    else:
        pages = [2, 3, 4]

    raw_tables = read_pdf(location,
                          filename,
                          pages=pages,
                          multiple_tables=True)

    # These two report editions split tables across page boundaries;
    # stitch the pieces back together before formatting.
    if year == 2020 and report_date.month in (4, 5):
        raw_tables = [
            raw_tables[0],
            pd.concat((raw_tables[1], raw_tables[2])),
            pd.concat((raw_tables[3], raw_tables[4])),
        ]

    combined = pd.concat(
        [_format_table(raw, is_female, year) for raw in raw_tables],
        ignore_index=True)

    # Discard 'TOTAL' row.
    combined = combined.iloc[:-1]

    return aggregate_ingest_utils.cast_columns_to_int(
        combined, ignore_columns={'facility_name'})
def _parse_table(location: str, filename: str) -> pd.DataFrame:
    """Parse the Head Count Endings and Contracted Facilities Tables.

    Locates the two tables of interest among all tables tabula extracts,
    formats each, concatenates them, and annotates each facility acronym
    with its fips code and full name.

    Returns:
        A DataFrame with one row per facility. design_bed_capacity and
        operation_bed_capacity may be NaN (kept as float); all other
        numeric columns are cast to int.
    """
    all_dfs = read_pdf(
        location,
        filename,
        multiple_tables=True,
        lattice=True,
        pandas_options={
            'header': [0, 1],
        })

    head_count_ending_df = _df_matching_substring(
        all_dfs, {'total', 'head count ending'})
    head_count_ending_df = _format_head_count_ending(head_count_ending_df)

    facilities_df = _df_matching_substring(all_dfs, {'contracted facilities'})
    facilities_df = _format_contracted_facilities(facilities_df)

    # FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat is the supported equivalent.
    result = pd.concat([head_count_ending_df, facilities_df],
                       ignore_index=True)

    result['fips'] = result.facility_name.map(_facility_acronym_to_fips)
    result['facility_name'] = \
        result.facility_name.map(_facility_acronym_to_name)

    # Rows that may be NaN need to be cast as a float, otherwise use int
    string_columns = {'facility_name', 'fips'}
    nullable_columns = {'design_bed_capacity', 'operation_bed_capacity'}
    int_columns = set(result.columns) - string_columns - nullable_columns

    for column_name in int_columns:
        result[column_name] = result[column_name].astype(int)
    for column_name in nullable_columns:
        result[column_name] = result[column_name].astype(float)

    return result
def _parse_table(location: str, filename: str) -> pd.DataFrame:
    """Parses the last table in the GA PDF.

    Returns:
        A DataFrame with one row per jurisdiction; every column except
        county_name is cast to int.
    """
    # Set column names since the pdf makes them hard to parse directly
    column_names = [
        'Index',
        'Jurisdiction',
        'Total Number of Inmates In Jail',
        'Jail Capacity',
        'Inmates as % of Capacity',
        'Number of Inmates Sentenced to State [Number]',
        'Number of Inmates Sentenced to State [% of Total]',
        'Number of Inmates Awaiting Trial in Jail [Number]',
        'Number of Inmates Awaiting Trial in Jail [% of Total]',
        'Number of Inmates Serving County Sentence [Number]',
        'Number of Inmates Serving County Sentence [% of Total]',
        'Number of Other Inmates [Number]',
        'Number of Other Inmates [% of Total]'
    ]

    # Tables at the end of the doc contain all data we want to parse
    pages = [8, 9, 10, 11]

    # Use lattice parsing since default parsing fails to parse columns on
    # the right half of the page
    use_lattice = True

    result = read_pdf(
        location,
        filename,
        pages=pages,
        lattice=use_lattice,
        pandas_options={
            'names': column_names,
            'skiprows': _header_on_each_page(),
            'skipfooter': 1,  # The last row is the grand totals
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })

    result = aggregate_ingest_utils.rename_columns_and_select(
        result, {
            'Jurisdiction': 'county_name',
            'Total Number of Inmates In Jail':
                'total_number_of_inmates_in_jail',
            'Jail Capacity': 'jail_capacity',
            'Number of Inmates Sentenced to State [Number]':
                'number_of_inmates_sentenced_to_state',
            'Number of Inmates Awaiting Trial in Jail [Number]':
                'number_of_inmates_awaiting_trial',
            'Number of Inmates Serving County Sentence [Number]':
                'number_of_inmates_serving_county_sentence',
            'Number of Other Inmates [Number]':
                'number_of_other_inmates'
        })

    # Tabula may parse extra empty rows
    result = result.dropna()

    # FIX: cast_columns_to_int returns the converted frame (it is not
    # in-place); the original code discarded the return value, leaving
    # the numeric columns uncast.
    result = aggregate_ingest_utils.cast_columns_to_int(
        result, ignore_columns={'county_name'})

    return result
def _parse_facility_table(location: str, filename: str) -> pd.DataFrame:
    """Parse the FL County Pretrial Inmate Report - Table 2 in the PDF.

    The table spans pages 5 and 6; the two halves are read separately and
    stitched together.

    Returns:
        A DataFrame with one row per detention facility and columns:
        facility_name, average_daily_population, number_felony_pretrial,
        number_misdemeanor_pretrial.
    """
    # Set column names directly since the pdf format makes them hard to parse
    column_names = [
        'Detention Facility Name',
        'Average Daily Population',
        'Number Felony Pretrial',
        'Number Misdemeanor Pretrial',
        'Total Percent Pretrial']

    part1 = read_pdf(
        location,
        filename,
        pages=[5],
        pandas_options={
            'skiprows': [0, 1, 2],
            'names': column_names,
        })
    part2 = read_pdf(
        location,
        filename,
        pages=[6],
        pandas_options={
            'skiprows': [0, 1, 2],
            'usecols': [0, 2, 3, 4, 5],  # Column 1 contains no data
            'names': column_names,
            'skipfooter': 2,  # The last 2 rows are the totals
            'engine': 'python'  # Only python engine supports 'skipfooter'
        })

    # FIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0; pd.concat is the supported equivalent.
    result = pd.concat([part1, part2], ignore_index=True)

    result = aggregate_ingest_utils.rename_columns_and_select(result, {
        'Detention Facility Name': 'facility_name',
        'Average Daily Population': 'average_daily_population',
        'Number Felony Pretrial': 'number_felony_pretrial',
        'Number Misdemeanor Pretrial': 'number_misdemeanor_pretrial'
    })

    result['average_daily_population'] = result[
        'average_daily_population'].apply(_use_stale_adp).apply(_to_int)
    for column_name in {'number_felony_pretrial',
                        'number_misdemeanor_pretrial'}:
        result[column_name] = result[column_name].apply(_to_int)

    return result
def _parse_table(location: str, filename: str, is_female: bool,
                 year: int) -> pd.DataFrame:
    """Reads and formats one gender's tables from the PDF for a year."""
    # Most but not all PDFs have data on pages 2-4.
    if 2000 <= year <= 2005:
        pages = [1, 2]
    elif year in (2006, 2009):
        pages = [3, 4, 5]
    else:
        pages = [2, 3, 4]

    raw_tables = read_pdf(location,
                          filename,
                          pages=pages,
                          multiple_tables=True)

    combined = pd.concat(
        [_format_table(raw, is_female, year) for raw in raw_tables],
        ignore_index=True)

    # Discard 'TOTAL' row.
    combined = combined.iloc[:-1]

    return aggregate_ingest_utils.cast_columns_to_int(
        combined, ignore_columns={'facility_name'})
def _parse_table(location: str, filename: str) -> pd.DataFrame:
    """Parses all tables in the GA PDF."""
    raw_tables = read_pdf(
        location,
        filename,
        pages='all',
        multiple_tables=True,
        lattice=True,
        pandas_options={
            'header': 0
        })

    # Trim unnecessary tables
    relevant_tables = raw_tables[3:-1]

    # Each page-level table may hold several logical tables; split each
    # page, then format every piece.
    formatted = []
    for page_df in relevant_tables:
        for piece in _split_page(page_df):
            formatted.append(_format_df(piece))

    return pd.concat(formatted, ignore_index=True)
def _parse_table(location, filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    Reads every page with tabula (lattice mode), applies hand-written
    repairs for two specific report dates whose cells tabula mis-aligns,
    strips the trailing 'Totals' section, splits the rows into per-county
    blocks, collapses each block into one Male and one Female row, and
    joins those back into a single row per facility.

    Returns:
        DataFrame with one row per facility: facility_name,
        total_jail_beds, reported_population, plus per-gender population
        breakdown columns.
    """
    whole_df = read_pdf(
        location,
        filename,
        pages='all',
        lattice=True
    )

    # Hand-patched repairs for report editions where tabula shifted cells
    # one column too far right; row numbers were determined by inspecting
    # tabula's output for those PDFs.
    if filename.endswith('04-16-20.pdf'):
        whole_df[323:331] = whole_df[323:331].shift(-1, axis='columns')
    elif filename.endswith('07-09-20.pdf'):
        whole_df.loc[432] = whole_df.iloc[432].shift(-1)
        whole_df.loc[434:436] = whole_df.loc[434:436].shift(-1, axis='columns')
        whole_df.loc[438] = whole_df.iloc[438].shift(-1)
        whole_df.loc[440] = whole_df.iloc[440].shift(-1)
        whole_df.loc[442:445] = whole_df.loc[442:445].shift(-1, axis='columns')
        whole_df.loc[447:462] = whole_df.loc[447:462].shift(-1, axis='columns')
        whole_df.loc[464:] = whole_df.loc[464:].shift(-1, axis='columns')
        # Restore 'County' cell values clobbered by the shifts above.
        # (Presumably transcribed from the source PDF — confirm against
        # the report if these rows ever look wrong.)
        whole_df.loc[451, 'County'] = 86
        whole_df.loc[456, 'County'] = 264
        whole_df.loc[461, 'County'] = 52
        whole_df.loc[464, 'County'] = 161
        whole_df.loc[469, 'County'] = 70
        whole_df.loc[472, 'County'] = 204
        whole_df.loc[477, 'County'] = 182
        whole_df.loc[482, 'County'] = 137
        whole_df.loc[487, 'County'] = 45
        whole_df.loc[492, 'County'] = 410
        whole_df.loc[497, 'County'] = 152
        whole_df.loc[500, 'County'] = 95
        whole_df.loc[505, 'County'] = 85
        whole_df.loc[508, 'County'] = 194
        whole_df.loc[513, 'County'] = 72
        whole_df.loc[516, 'County'] = 134
        whole_df.loc[521, 'County'] = 50
        whole_df.loc[524, 'County'] = 63
        whole_df.loc[529, 'County'] = 32

    # Remove totals separate from parsing since it's a variable length
    totals_start_index = np.where(whole_df['Date'].str.contains('Totals'))[0][0]
    whole_df = whole_df[:totals_start_index]

    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = whole_df['County'].astype(str).str.contains('Secure')
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis='columns')

    # Drop repeated header rows picked up from each page.
    whole_df = whole_df[whole_df['County'].astype(str) != 'County']
    # NOTE(review): reset_index is not in-place and the return value is
    # discarded — this line is a no-op. Confirm whether the original
    # index labels are relied on below before "fixing" it.
    whole_df.reset_index(drop=True)
    whole_df = _shift_headers(whole_df)

    # Normalize line breaks that tabula embeds in the header cells.
    whole_df.columns = whole_df.columns.str.replace('\n', ' ')
    whole_df.columns = whole_df.columns.str.replace('\r', ' ')

    # Column names can change over time : (
    column_name_map = {
        'CC Eligible Inmates': 'Community Custody Inmates',
    }
    whole_df.columns = [column_name_map[c] if c in column_name_map
                        else c for c in whole_df.columns]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(whole_df['Total Jail Beds'].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)

    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # This is a typo in several reports
        if '12/' in df['Federal Inmates'].values:
            df['Federal Inmates'] = df['Federal Inmates'].replace({'12/': '12'})

        # Cast everything to int before summing below
        df = df.fillna(0)
        # 'Inmate Cusody' reproduces a misspelling in the report's own
        # column header — do not "correct" it here.
        df = aggregate_ingest_utils.cast_columns_to_int(
            df, ignore_columns={'County', 'Facility Security',
                                'Inmate Cusody'})

        df['Gender'] = None
        df = _collapse_by_gender_rows(df, 'Male')
        df = _collapse_by_gender_rows(df, 'Female')

        # The first row contains header data for both Male and Female
        df['County'] = df['County'][0]
        df['total_jail_beds'] = df['Total Jail Beds'][0]
        df['reported_population'] = \
            df['Reported Population (Total and Male/Female)'][0]
        df = df[1:]

        dfs_grouped_by_gender.append(df)

    df_by_gender = pd.concat(dfs_grouped_by_gender)

    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender['Gender'] == 'Male']
    female_df = df_by_gender[df_by_gender['Gender'] == 'Female']

    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(female_df, {
        'County': 'facility_name',
        'total_jail_beds': 'total_jail_beds',
        'reported_population': 'reported_population',
    })
    male_df = aggregate_ingest_utils.rename_columns_and_select(male_df, {
        'County': 'facility_name',
        # Since we've grouped by Male, this Reported Population is only Male
        'Reported Population (Total and Male/Female)': 'male_population',
        'Class D Inmates': 'class_d_male_population',
        'Community Custody Inmates': 'community_custody_male_population',
        'Alternative Sentence': 'alternative_sentence_male_population',
        'Controlled Intake': 'controlled_intake_male_population',
        'Parole Violators': 'parole_violators_male_population',
        'Federal Inmates': 'federal_male_population',
    })
    female_df = aggregate_ingest_utils.rename_columns_and_select(female_df, {
        'County': 'facility_name',
        # Since we've grouped by Female, this Reported Population is only Female
        'Reported Population (Total and Male/Female)': 'female_population',
        'Class D Inmates': 'class_d_female_population',
        'Community Custody Inmates': 'community_custody_female_population',
        'Alternative Sentence': 'alternative_sentence_female_population',
        'Controlled Intake': 'controlled_intake_female_population',
        'Parole Violators': 'parole_violators_female_population',
        'Federal Inmates': 'federal_female_population',
    })

    # Join the male and female aggregates back onto the shared columns,
    # one row per facility.
    result = shared_df.join(male_df.set_index('facility_name'),
                            on='facility_name')
    result = result.join(female_df.set_index('facility_name'),
                         on='facility_name')

    # One-off correction for a known bad value in the 04-16-20 report.
    if filename.endswith('04-16-20.pdf'):
        result.loc[result['facility_name'] == 'Lincoln',
                   'total_jail_beds'] = 72

    return result.reset_index(drop=True)
def _parse_table(location: str, filename: str,
                 report_date: datetime.date) -> pd.DataFrame:
    """Parses the TX County Table in the PDF.

    Reads each of the 9 pages independently, keeps only the last table on
    each page (the one with the data), scrubs empty rows/columns, splits
    columns that tabula concatenated, then renames everything to the
    schema for the given report date and concatenates the pages.

    Returns:
        DataFrame with one row per county; every column except
        facility_name is cast to int.
    """
    num_pages = 9
    # Column-name mapping varies by report date.
    columns_to_schema = _get_column_names(report_date)
    pages = []
    for page_num in range(1, num_pages + 1):
        # Each page has 1 or more tables on it with the last table being the
        # one with the data on it. The headers are poorly read by tabula and
        # some years have different responses to this call so we generally
        # just get all of the tables and consider only the one with numbers on
        # it. That lets us clean it up by dropping nonsense columns and rows,
        # and then assigning our own columns names to them.
        df = read_pdf(
            location,
            filename,
            multiple_tables=True,
            pages=page_num,
        )
        df = df[-1]
        # Drop columns with fewer than 5 non-NA cells (parse artifacts).
        df = df.dropna(axis='columns', thresh=5)

        # We want to remove all of the rows and columns that have no data.
        numeric_elements = df.apply(pd.to_numeric, errors='coerce').notnull()
        rows_containing_data = numeric_elements.any(axis='columns')
        df = df.loc[rows_containing_data]

        # Next we finally break up some of the columns that were incorrectly
        # concatenated.
        for column in df.columns[1:]:
            # By this point we should only have numeric data in the rows,
            # if this happens it means some columns were concatenated and they
            # must be split. If the columns are concatenated, we need only
            # check one of the rows for a space because they are all
            # concatenated.
            if ' ' in df[column].iloc[0]:
                index_to_insert = df.columns.get_loc(column)
                # Split on the first whitespace into two new columns,
                # inserted at the original column's position.
                df_temp = pd.DataFrame(
                    df.pop(column).str.split(n=1, expand=True))
                df.insert(index_to_insert, str(column) + '_a', df_temp[0])
                df.insert(index_to_insert + 1, str(column) + '_b', df_temp[1])
        pages.append(df)

    # Drop last rows since it's the 'Totals' section
    pages[-1] = pages[-1].drop(pages[-1].tail(1).index)

    # Build result for all the pages. We rename the columns before calling
    # concat because the column names might all be different. Renaming them
    # allows concat to pass happily.
    columns_to_drop = ['percent_capacity', 'total_local']
    for i, page in enumerate(pages):
        page.columns = columns_to_schema.keys()
        page = aggregate_ingest_utils.rename_columns_and_select(
            page, columns_to_schema)
        # We don't care about % of capacity and total_local so we drop these
        # columns.
        page = page.drop(columns_to_drop, axis='columns')
        pages[i] = page

    result = pd.concat(pages, ignore_index=True)

    for column_name in set(result.columns) - {'facility_name'}:
        result[column_name] = result[column_name].astype(int)

    return result
def _parse_table(location, filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    Reads the whole report with tabula (lattice mode), strips the trailing
    'Totals' section, splits the rows into per-county blocks, collapses
    each block into one Male and one Female row, and joins those back into
    a single row per facility.

    Returns:
        DataFrame with one row per facility: facility_name,
        total_jail_beds, reported_population, plus per-gender population
        breakdown columns.
    """
    whole_df = read_pdf(location, filename, pages='all', lattice=True)

    # Remove totals separate from parsing since it's a variable length
    totals_start_index = np.where(
        whole_df['Date'].str.contains('Totals'))[0][0]
    whole_df = whole_df[:totals_start_index]

    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = whole_df['County'].astype(str).str.contains('Secure')
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis='columns')

    # Drop repeated header rows picked up from each page.
    whole_df = whole_df[whole_df['County'].astype(str) != 'County']
    # NOTE(review): reset_index is not in-place and the return value is
    # discarded — this line is a no-op. Confirm whether the original
    # index labels are relied on below before "fixing" it.
    whole_df.reset_index(drop=True)
    whole_df = _shift_headers(whole_df)

    # Normalize line breaks that tabula embeds in the header cells.
    whole_df.columns = whole_df.columns.str.replace('\n', ' ')
    whole_df.columns = whole_df.columns.str.replace('\r', ' ')

    # Column names can change over time : (
    column_name_map = {
        'CC Eligible Inmates': 'Community Custody Inmates',
    }
    whole_df.columns = [
        column_name_map[c] if c in column_name_map else c
        for c in whole_df.columns
    ]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(
        whole_df['Total Jail Beds'].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)

    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # Cast everything to int before summing below
        df = df.fillna(0)
        # 'Inmate Cusody' reproduces a misspelling in the report's own
        # column header — do not "correct" it here.
        df = aggregate_ingest_utils.cast_columns_to_int(
            df, ignore_columns={'County', 'Facility Security',
                                'Inmate Cusody'})

        df['Gender'] = None
        df = _collapse_by_gender_rows(df, 'Male')
        df = _collapse_by_gender_rows(df, 'Female')

        # The first row contains header data for both Male and Female
        df['County'] = df['County'][0]
        df['total_jail_beds'] = df['Total Jail Beds'][0]
        df['reported_population'] = \
            df['Reported Population (Total and Male/Female)'][0]
        df = df[1:]

        dfs_grouped_by_gender.append(df)

    df_by_gender = pd.concat(dfs_grouped_by_gender)

    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender['Gender'] == 'Male']
    female_df = df_by_gender[df_by_gender['Gender'] == 'Female']

    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df, {
            'County': 'facility_name',
            'total_jail_beds': 'total_jail_beds',
            'reported_population': 'reported_population',
        })
    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_df, {
            'County': 'facility_name',
            # Since we've grouped by Male, this Reported Population is only Male
            'Reported Population (Total and Male/Female)': 'male_population',
            'Class D Inmates': 'class_d_male_population',
            'Community Custody Inmates': 'community_custody_male_population',
            'Alternative Sentence': 'alternative_sentence_male_population',
            'Controlled Intake': 'controlled_intake_male_population',
            'Parole Violators': 'parole_violators_male_population',
            'Federal Inmates': 'federal_male_population',
        })
    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df, {
            'County': 'facility_name',
            # Since we've grouped by Female, this Reported Population is only Female
            'Reported Population (Total and Male/Female)': 'female_population',
            'Class D Inmates': 'class_d_female_population',
            'Community Custody Inmates': 'community_custody_female_population',
            'Alternative Sentence': 'alternative_sentence_female_population',
            'Controlled Intake': 'controlled_intake_female_population',
            'Parole Violators': 'parole_violators_female_population',
            'Federal Inmates': 'federal_female_population',
        })

    # Join the male and female aggregates back onto the shared columns,
    # one row per facility.
    result = shared_df.join(male_df.set_index('facility_name'),
                            on='facility_name')
    result = result.join(female_df.set_index('facility_name'),
                         on='facility_name')

    return result.reset_index(drop=True)