Ejemplo n.º 1
0
def check_db_fixed_input_config() -> State:
    """Check the fixed inputs stored in the database against the fixed input configuration.

    For every ``FixedInputTable`` row the following is verified:

    * a configuration entry with the same name exists and lists the row's value
      among its allowed ``values``;
    * the configuration has at least one fixed input for the tag of the row's product;
    * the product carries no fixed input that is not configured for its tag;
    * the product carries every fixed input that is flagged as required for its tag.

    Returns:
        ``{"check_db_fixed_input_config": True}`` when no problem was found.

    Raises:
        ProcessFailure: when at least one check failed. The details are a list
            holding the offending fixed input rows and/or product names (a
            product name may appear more than once, once per row that triggers it).
    """
    fixed_input_configuration = fi_configuration()
    product_tags = products.get_tags()
    fixed_inputs = FixedInputTable.query.options(joinedload(FixedInputTable.product)).all()

    # NOTE(review): `data` is populated but never read or returned in this function.
    data: Dict = {"fixed_inputs": [], "by_tag": {tag: [] for tag in product_tags}}
    errors: List = []

    for fi in fixed_inputs:
        # Configuration entry whose name matches this row, or {} when there is none.
        fi_data: Dict = first_true(
            fixed_input_configuration["fixed_inputs"], {}, lambda i: i["name"] == fi.name  # noqa: B023
        )
        # Short-circuit: an unknown fixed input has no "values" to check against.
        # Looking them up anyway would raise KeyError and hide the collected errors.
        if not fi_data or fi.value not in fi_data["values"]:
            errors.append(fi)

        # Each entry configured for a tag is expected to hold exactly one item,
        # `{name: flag}`; `one` extracts that single key.
        tag_config = fixed_input_configuration["by_tag"][fi.product.tag]
        tag_data = {one(entry) for entry in tag_config}
        tag_data_required = {one(entry) for entry in tag_config if entry[one(entry)]}

        if not tag_data:
            errors.append(fi)

        product_fi_names = {product_fi.name for product_fi in fi.product.fixed_inputs}
        # Fixed inputs on the product that the configuration does not know for this tag.
        if product_fi_names - tag_data:
            errors.append(fi.product.name)
        # Fixed inputs flagged for this tag that the product is missing.
        if tag_data_required - product_fi_names:
            errors.append(fi.product.name)

    if errors:
        raise ProcessFailure("Errors in fixed input config", errors)

    return {"check_db_fixed_input_config": True}
Ejemplo n.º 2
0
    def __getitem__(self, index):
        """Return the ``(image, label, name)`` triple for the sample at ``index``.

        The label is looked up through ``self.opt_choices`` and ``self.ev_indices``.
        The image is the single result of requesting the ``0.0`` exposure for the
        sample's name from ``self.generator``; its last axis is reordered
        ``[2, 1, 0]`` (BGR to RGB, going by the variable names) before it is
        wrapped in a PIL image and passed through ``self.transform``.

        Raises:
            IndexError: if ``index`` is not below ``len(self)``.
        """
        if index >= len(self):
            raise IndexError()

        choice = self.opt_choices[index]
        target = self.ev_indices[choice]
        name = self.names[index]

        # Exactly one exposure value is requested, so exactly one frame is expected back.
        exposures = self.generator.get_exposures(name, [0.0])
        bgr_frame = one(exposures)
        rgb_image = Image.fromarray(bgr_frame[:, :, [2, 1, 0]])

        return self.transform(rgb_image), target, name
Ejemplo n.º 3
0
def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    The whole document is read as a single lattice table, report-specific
    mis-parses are patched by row position, the trailing totals section and
    repeated header rows are dropped, and the remaining rows are split into one
    block per county. Within each block the gender rows are collapsed and the
    block header values (county, total beds, reported population) are copied
    onto every row.

    Args:
        filename: Path of the PDF to parse. The file name suffix selects the
            hard-coded corrections for the 04-16-20 and 07-09-20 reports.

    Returns:
        A DataFrame with a fresh 0..n-1 index, built by joining the shared
        columns (``facility_name``, ``total_jail_beds``, ``reported_population``)
        with the ``*_male_population`` and ``*_female_population`` columns on
        ``facility_name``.
    """
    whole_df = one(
        tabula.read_pdf(filename,
                        pages="all",
                        multiple_tables=False,
                        lattice=True))

    # Report-specific fixes for rows that tabula parsed one column too far to
    # the right, plus County cells that were lost by that shift.
    # NOTE(review): these mix positional (`iloc`, plain slices) and label-based
    # (`loc`) access, which only agree while `whole_df` still has its default
    # 0..n-1 index.
    if filename.endswith("04-16-20.pdf"):
        whole_df[323:331] = whole_df[323:331].shift(-1, axis="columns")
    elif filename.endswith("07-09-20.pdf"):
        whole_df.loc[432] = whole_df.iloc[432].shift(-1)
        whole_df.loc[434:436] = whole_df.loc[434:436].shift(-1, axis="columns")
        whole_df.loc[438] = whole_df.iloc[438].shift(-1)
        whole_df.loc[440] = whole_df.iloc[440].shift(-1)
        whole_df.loc[442:445] = whole_df.loc[442:445].shift(-1, axis="columns")
        whole_df.loc[447:462] = whole_df.loc[447:462].shift(-1, axis="columns")
        whole_df.loc[464:] = whole_df.loc[464:].shift(-1, axis="columns")
        whole_df.loc[451, "County"] = 86
        whole_df.loc[456, "County"] = 264
        whole_df.loc[461, "County"] = 52
        whole_df.loc[464, "County"] = 161
        whole_df.loc[469, "County"] = 70
        whole_df.loc[472, "County"] = 204
        whole_df.loc[477, "County"] = 182
        whole_df.loc[482, "County"] = 137
        whole_df.loc[487, "County"] = 45
        whole_df.loc[492, "County"] = 410
        whole_df.loc[497, "County"] = 152
        whole_df.loc[500, "County"] = 95
        whole_df.loc[505, "County"] = 85
        whole_df.loc[508, "County"] = 194
        whole_df.loc[513, "County"] = 72
        whole_df.loc[516, "County"] = 134
        whole_df.loc[521, "County"] = 50
        whole_df.loc[524, "County"] = 63
        whole_df.loc[529, "County"] = 32

    # Remove totals separate from parsing since it's a variable length
    totals_start_index = np.where(
        whole_df["Date"].str.contains("Totals"))[0][0]
    whole_df = whole_df[:totals_start_index]

    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = whole_df["County"].astype(str).str.contains("Secure")
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis="columns")

    # Drop the header rows that are repeated inside the table body
    whole_df = whole_df[whole_df["County"].astype(str) != "County"]

    # `reset_index` returns a new frame, so the result has to be assigned for
    # the renumbering after the row filtering above to take effect.
    whole_df = whole_df.reset_index(drop=True)

    whole_df = _shift_headers(whole_df)
    whole_df.columns = whole_df.columns.str.replace("\n", " ")
    whole_df.columns = whole_df.columns.str.replace("\r", " ")

    # Column names can change over time : (
    column_name_map = {
        "CC Eligible Inmates": "Community Custody Inmates",
    }
    whole_df.columns = [
        column_name_map[c] if c in column_name_map else c
        for c in whole_df.columns
    ]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(
        whole_df["Total Jail Beds"].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)

    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # This is a typo in several reports
        if "12/" in df["Federal Inmates"].values:
            df["Federal Inmates"] = df["Federal Inmates"].replace(
                {"12/": "12"})
        if "yo" in df["Federal Inmates"].values:
            df["Federal Inmates"] = df["Federal Inmates"].replace({"yo": "0"})
        if "pe" in df["Federal Inmates"].values:
            df["Federal Inmates"] = df["Federal Inmates"].replace({"pe": "0"})
        if "(" in df["Reported Population (Total and Male/Female)"].values:
            df["Reported Population (Total and Male/Female)"] = df[
                "Reported Population (Total and Male/Female)"].replace(
                    {"(": "0"})

        # Cast everything to int before summing below
        df = df.fillna(0)
        # NOTE(review): "Inmate Cusody" looks like a misspelling of "Custody";
        # confirm it matches the parsed column name before changing it.
        df = aggregate_ingest_utils.cast_columns_to_int(
            df,
            ignore_columns={"County", "Facility Security", "Inmate Cusody"})

        df["Gender"] = None
        df = _collapse_by_gender_rows(df, "Male")
        df = _collapse_by_gender_rows(df, "Female")

        # The first row contains header data for both Male and Female
        df["County"] = df["County"][0]
        df["total_jail_beds"] = df["Total Jail Beds"][0]
        df["reported_population"] = df[
            "Reported Population (Total and Male/Female)"][0]
        df = df[1:]

        dfs_grouped_by_gender.append(df)

    df_by_gender = pd.concat(dfs_grouped_by_gender)

    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender["Gender"] == "Male"]
    female_df = df_by_gender[df_by_gender["Gender"] == "Female"]

    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df,
        {
            "County": "facility_name",
            "total_jail_beds": "total_jail_beds",
            "reported_population": "reported_population",
        },
    )

    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_df,
        {
            "County": "facility_name",
            # Since we've grouped by Male, this Reported Population is only Male
            "Reported Population (Total and Male/Female)": "male_population",
            "Class D Inmates": "class_d_male_population",
            "Community Custody Inmates": "community_custody_male_population",
            "Alternative Sentence": "alternative_sentence_male_population",
            "Controlled Intake": "controlled_intake_male_population",
            "Parole Violators": "parole_violators_male_population",
            "Federal Inmates": "federal_male_population",
        },
    )

    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df,
        {
            "County": "facility_name",
            # Since we've grouped by Female, this Reported Population is only Female
            "Reported Population (Total and Male/Female)": "female_population",
            "Class D Inmates": "class_d_female_population",
            "Community Custody Inmates": "community_custody_female_population",
            "Alternative Sentence": "alternative_sentence_female_population",
            "Controlled Intake": "controlled_intake_female_population",
            "Parole Violators": "parole_violators_female_population",
            "Federal Inmates": "federal_female_population",
        },
    )

    # Stitch the shared, male and female columns back together per facility
    result = shared_df.join(male_df.set_index("facility_name"),
                            on="facility_name")
    result = result.join(female_df.set_index("facility_name"),
                         on="facility_name")

    # Hard-coded correction of one value in the 04-16-20 report
    if filename.endswith("04-16-20.pdf"):
        result.loc[result["facility_name"] == "Lincoln",
                   "total_jail_beds"] = 72

    return result.reset_index(drop=True)
Ejemplo n.º 4
0
def _parse_table(filename: str, report_date: datetime.date) -> pd.DataFrame:
    """Parses the last table in the GA PDF.

    Args:
        filename: Path of the PDF to parse. The ``jun_19.pdf`` report needs its
            per-page tables concatenated by hand.
        report_date: Date of the report; reports dated 2020-11-05 or later use
            a different set of rows to skip.

    Returns:
        A DataFrame with the columns ``county_name``,
        ``total_number_of_inmates_in_jail``, ``jail_capacity``,
        ``number_of_inmates_sentenced_to_state``,
        ``number_of_inmates_awaiting_trial``,
        ``number_of_inmates_serving_county_sentence`` and
        ``number_of_other_inmates``. Rows containing missing values are
        dropped and every column except ``county_name`` is cast to int.
    """

    # Set column names since the pdf makes them hard to parse directly
    column_names = [
        "Index",
        "Jurisdiction",
        "Total Number of Inmates In Jail",
        "Jail Capacity",
        "Inmates as % of Capacity",
        "Number of Inmates Sentenced to State [Number]",
        "Number of Inmates Sentenced to State [% of Total]",
        "Number of Inmates Awaiting Trial in Jail [Number]",
        "Number of Inmates Awaiting Trial in Jail [% of Total]",
        "Number of Inmates Serving County Sentence [Number]",
        "Number of Inmates Serving County Sentence [% of Total]",
        "Number of Other Inmates [Number]",
        "Number of Other Inmates [% of Total]",
    ]

    # Tables at the end of the doc contain all data we want to parse
    pages = [8, 9, 10, 11]

    # Use lattice parsing since default parsing fails to parse columns on
    # the right half of the page
    use_lattice = True

    if filename.endswith("jun_19.pdf"):
        # Tabula can't handle the multiple tables because it thinks the one on
        # the last page has extra columns. This concats them manually.
        *dfs, df4 = tabula.read_pdf(filename,
                                    pages=pages,
                                    lattice=use_lattice,
                                    multiple_tables=True)
        # Drop the last row and keep only the 13 expected columns
        df4 = df4.iloc[:-1, 1:14]
        df4.columns = range(13)
        # Strip stray spaces/quotes from one mis-parsed cell
        df4.iloc[33, 1] = df4.iloc[33, 1].strip(" '")
        dfs.append(df4)
        # The first row of every per-page table is skipped
        result = pd.concat(df.iloc[1:] for df in dfs)
        result.columns = column_names
    else:
        if report_date >= datetime.date(2020, 11, 5):
            # Skip every 48th row for new-style reports
            skiprows = [x * 48 for x in range(4)]
        else:
            skiprows = _header_on_each_page()

        result = one(
            tabula.read_pdf(
                filename,
                pages=pages,
                lattice=use_lattice,
                multiple_tables=False,
                pandas_options={
                    "names": column_names,
                    "skiprows": skiprows,
                    "skipfooter": 1,  # The last row is the grand totals
                    "engine":
                    "python",  # Only python engine supports 'skipfooter'
                },
            ))

    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Jurisdiction": "county_name",
            "Total Number of Inmates In Jail":
            "total_number_of_inmates_in_jail",
            "Jail Capacity": "jail_capacity",
            "Number of Inmates Sentenced to State [Number]":
            "number_of_inmates_sentenced_to_state",
            "Number of Inmates Awaiting Trial in Jail [Number]":
            "number_of_inmates_awaiting_trial",
            "Number of Inmates Serving County Sentence [Number]":
            "number_of_inmates_serving_county_sentence",
            "Number of Other Inmates [Number]": "number_of_other_inmates",
        },
    )

    # Tabula may parse extra empty rows
    result = result.dropna()

    # Keep the returned frame rather than relying on the helper mutating its
    # argument in place; discarding it risks returning uncast columns.
    result = aggregate_ingest_utils.cast_columns_to_int(
        result, ignore_columns={"county_name"})

    return result