def check_db_fixed_input_config() -> State:
    """Validate fixed inputs stored in the DB against the fixed input configuration.

    For every fixed input row this checks that:
      * the fixed input name exists in the configuration,
      * its value is one of the configured allowed values,
      * its product tag has configured fixed inputs,
      * the product's fixed inputs match the configured (and required)
        fixed inputs for its tag.

    Returns:
        ``{"check_db_fixed_input_config": True}`` when everything is consistent.

    Raises:
        ProcessFailure: with the list of offending fixed inputs / product names.
    """
    fixed_input_configuration = fi_configuration()
    product_tags = products.get_tags()
    fixed_inputs = FixedInputTable.query.options(joinedload(FixedInputTable.product)).all()

    # NOTE(review): ``data`` is initialized but never read afterwards — kept
    # as-is for backward compatibility; confirm whether it can be removed.
    data: Dict = {"fixed_inputs": [], "by_tag": {}}
    errors: List = []
    for tag in product_tags:
        data["by_tag"][tag] = []

    for fi in fixed_inputs:
        fi_data: Dict = first_true(
            fixed_input_configuration["fixed_inputs"], {}, lambda i: i["name"] == fi.name  # noqa: B023
        )
        if not fi_data:
            errors.append(fi)
            # BUG FIX: previously execution fell through here and
            # ``fi_data["values"]`` raised KeyError on the empty dict instead
            # of reporting this fixed input as a config error.
            continue

        if fi.value not in fi_data["values"]:
            errors.append(fi)

        # Each config entry is a single-key dict; ``one(...)`` extracts that key.
        tag_config = fixed_input_configuration["by_tag"][fi.product.tag]
        tag_data = {one(cfg) for cfg in tag_config}
        tag_data_required = {one(cfg) for cfg in tag_config if cfg[one(cfg)]}

        if not tag_data:
            errors.append(fi)

        product_fi_names = {product_fi.name for product_fi in fi.product.fixed_inputs}
        if product_fi_names - set(tag_data):
            errors.append(fi.product.name)
        if set(tag_data_required) - product_fi_names:
            errors.append(fi.product.name)

    if errors:
        raise ProcessFailure("Errors in fixed input config", errors)

    return {"check_db_fixed_input_config": True}
def __getitem__(self, index):
    """Return ``(transformed mid-exposure image, label index, image name)``.

    Raises:
        IndexError: if *index* is past the end of the dataset.
    """
    if index >= len(self):
        raise IndexError()
    name = self.names[index]
    label = self.ev_indices[self.opt_choices[index]]
    # Exactly one exposure is generated at EV 0.0; frames arrive in BGR
    # channel order, so reorder to RGB before handing off to PIL.
    bgr_frame = one(self.generator.get_exposures(name, [0.0]))
    pil_image = Image.fromarray(bgr_frame[:, :, [2, 1, 0]])
    return self.transform(pil_image), label, name
def _parse_table(filename: str) -> pd.DataFrame:
    """Parses the table in the KY PDF.

    Args:
        filename: path to the downloaded KY jail report PDF.

    Returns:
        A DataFrame with one row per facility, joining the shared, male and
        female population columns on ``facility_name``.
    """
    whole_df = one(
        tabula.read_pdf(filename, pages="all", multiple_tables=False,
                        lattice=True))

    # Hand-patch rows that tabula mis-aligned in specific known reports.
    if filename.endswith("04-16-20.pdf"):
        whole_df[323:331] = whole_df[323:331].shift(-1, axis="columns")
    elif filename.endswith("07-09-20.pdf"):
        whole_df.loc[432] = whole_df.iloc[432].shift(-1)
        whole_df.loc[434:436] = whole_df.loc[434:436].shift(-1, axis="columns")
        whole_df.loc[438] = whole_df.iloc[438].shift(-1)
        whole_df.loc[440] = whole_df.iloc[440].shift(-1)
        whole_df.loc[442:445] = whole_df.loc[442:445].shift(-1, axis="columns")
        whole_df.loc[447:462] = whole_df.loc[447:462].shift(-1, axis="columns")
        whole_df.loc[464:] = whole_df.loc[464:].shift(-1, axis="columns")
        # Values lost by the shifts above, restored by hand.
        whole_df.loc[451, "County"] = 86
        whole_df.loc[456, "County"] = 264
        whole_df.loc[461, "County"] = 52
        whole_df.loc[464, "County"] = 161
        whole_df.loc[469, "County"] = 70
        whole_df.loc[472, "County"] = 204
        whole_df.loc[477, "County"] = 182
        whole_df.loc[482, "County"] = 137
        whole_df.loc[487, "County"] = 45
        whole_df.loc[492, "County"] = 410
        whole_df.loc[497, "County"] = 152
        whole_df.loc[500, "County"] = 95
        whole_df.loc[505, "County"] = 85
        whole_df.loc[508, "County"] = 194
        whole_df.loc[513, "County"] = 72
        whole_df.loc[516, "County"] = 134
        whole_df.loc[521, "County"] = 50
        whole_df.loc[524, "County"] = 63
        whole_df.loc[529, "County"] = 32

    # Remove totals separate from parsing since it's a variable length
    totals_start_index = np.where(
        whole_df["Date"].str.contains("Totals"))[0][0]
    whole_df = whole_df[:totals_start_index]

    # Some rows are parsed including the date, which shift them 1 too far right
    shifted_rows = whole_df["County"].astype(str).str.contains("Secure")
    whole_df[shifted_rows] = whole_df[shifted_rows].shift(-1, axis="columns")

    whole_df = whole_df[whole_df["County"].astype(str) != "County"]
    # BUG FIX: DataFrame.reset_index is not in-place by default; the original
    # call discarded its return value, making it a no-op.  Assign the result
    # so the per-county slices below start from a clean positional index.
    whole_df = whole_df.reset_index(drop=True)

    whole_df = _shift_headers(whole_df)
    whole_df.columns = whole_df.columns.str.replace("\n", " ")
    whole_df.columns = whole_df.columns.str.replace("\r", " ")

    # Column names can change over time : (
    column_name_map = {
        "CC Eligible Inmates": "Community Custody Inmates",
    }
    whole_df.columns = [
        column_name_map[c] if c in column_name_map else c
        for c in whole_df.columns
    ]

    # Each block of county data starts with a filled in 'Total Jail Beds'
    start_of_county_indices = np.where(
        whole_df["Total Jail Beds"].notnull())[0]
    dfs_split_by_county = _split_df(whole_df, start_of_county_indices)

    dfs_grouped_by_gender = []
    for df in dfs_split_by_county:
        # This is a typo in several reports
        if "12/" in df["Federal Inmates"].values:
            df["Federal Inmates"] = df["Federal Inmates"].replace(
                {"12/": "12"})
        if "yo" in df["Federal Inmates"].values:
            df["Federal Inmates"] = df["Federal Inmates"].replace({"yo": "0"})
        if "pe" in df["Federal Inmates"].values:
            df["Federal Inmates"] = df["Federal Inmates"].replace({"pe": "0"})
        if "(" in df["Reported Population (Total and Male/Female)"].values:
            df["Reported Population (Total and Male/Female)"] = df[
                "Reported Population (Total and Male/Female)"].replace(
                    {"(": "0"})

        # Cast everything to int before summing below
        df = df.fillna(0)
        df = aggregate_ingest_utils.cast_columns_to_int(
            df, ignore_columns={"County", "Facility Security",
                                "Inmate Cusody"})

        df["Gender"] = None
        df = _collapse_by_gender_rows(df, "Male")
        df = _collapse_by_gender_rows(df, "Female")

        # The first row contains header data for both Male and Female
        df["County"] = df["County"][0]
        df["total_jail_beds"] = df["Total Jail Beds"][0]
        df["reported_population"] = df[
            "Reported Population (Total and Male/Female)"][0]
        df = df[1:]

        dfs_grouped_by_gender.append(df)
    df_by_gender = pd.concat(dfs_grouped_by_gender)

    # Split into male_df and female_df to independently set column headers
    male_df = df_by_gender[df_by_gender["Gender"] == "Male"]
    female_df = df_by_gender[df_by_gender["Gender"] == "Female"]

    # Since both male_df and female_df contain shared data, pick arbitrarily
    shared_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df,
        {
            "County": "facility_name",
            "total_jail_beds": "total_jail_beds",
            "reported_population": "reported_population",
        },
    )
    male_df = aggregate_ingest_utils.rename_columns_and_select(
        male_df,
        {
            "County": "facility_name",
            # Since we've grouped by Male, this Reported Population is only Male
            "Reported Population (Total and Male/Female)": "male_population",
            "Class D Inmates": "class_d_male_population",
            "Community Custody Inmates": "community_custody_male_population",
            "Alternative Sentence": "alternative_sentence_male_population",
            "Controlled Intake": "controlled_intake_male_population",
            "Parole Violators": "parole_violators_male_population",
            "Federal Inmates": "federal_male_population",
        },
    )
    female_df = aggregate_ingest_utils.rename_columns_and_select(
        female_df,
        {
            "County": "facility_name",
            # Since we've grouped by Female, this Reported Population is only Female
            "Reported Population (Total and Male/Female)": "female_population",
            "Class D Inmates": "class_d_female_population",
            "Community Custody Inmates": "community_custody_female_population",
            "Alternative Sentence": "alternative_sentence_female_population",
            "Controlled Intake": "controlled_intake_female_population",
            "Parole Violators": "parole_violators_female_population",
            "Federal Inmates": "federal_female_population",
        },
    )

    result = shared_df.join(male_df.set_index("facility_name"),
                            on="facility_name")
    result = result.join(female_df.set_index("facility_name"),
                         on="facility_name")

    if filename.endswith("04-16-20.pdf"):
        result.loc[result["facility_name"] == "Lincoln",
                   "total_jail_beds"] = 72

    return result.reset_index(drop=True)
def _parse_table(filename: str, report_date: datetime.date) -> pd.DataFrame:
    """Parses the last table in the GA PDF.

    Args:
        filename: path to the downloaded GA jail report PDF.
        report_date: date of the report; selects the parsing strategy, since
            the report layout changed over time.

    Returns:
        A DataFrame with one row per county and the selected jail
        population columns.
    """
    # Set column names since the pdf makes them hard to parse directly
    column_names = [
        "Index",
        "Jurisdiction",
        "Total Number of Inmates In Jail",
        "Jail Capacity",
        "Inmates as % of Capacity",
        "Number of Inmates Sentenced to State [Number]",
        "Number of Inmates Sentenced to State [% of Total]",
        "Number of Inmates Awaiting Trial in Jail [Number]",
        "Number of Inmates Awaiting Trial in Jail [% of Total]",
        "Number of Inmates Serving County Sentence [Number]",
        "Number of Inmates Serving County Sentence [% of Total]",
        "Number of Other Inmates [Number]",
        "Number of Other Inmates [% of Total]",
    ]

    # Tables at the end of the doc contain all data we want to parse
    pages = [8, 9, 10, 11]

    # Use lattice parsing since default parsing fails to parse columns on
    # the right half of the page
    use_lattice = True

    if filename.endswith("jun_19.pdf"):
        # Tabula can't handle the multiple tables because it thinks the one on
        # the last page has extra columns. This concats them manually.
        *dfs, df4 = tabula.read_pdf(filename,
                                    pages=pages,
                                    lattice=use_lattice,
                                    multiple_tables=True)
        df4 = df4.iloc[:-1, 1:14]
        df4.columns = range(13)
        df4.iloc[33, 1] = df4.iloc[33, 1].strip(" '")
        dfs.append(df4)
        # Drop each table's header row before concatenating.
        result = pd.concat(df.iloc[1:] for df in dfs)
        result.columns = column_names
    elif report_date >= datetime.date(2020, 11, 5):
        # Skip every 48th row for new-style reports
        result = one(
            tabula.read_pdf(
                filename,
                pages=pages,
                lattice=use_lattice,
                multiple_tables=False,
                pandas_options={
                    "names": column_names,
                    "skiprows": [x * 48 for x in range(4)],
                    "skipfooter": 1,  # The last row is the grand totals
                    "engine": "python",  # Only python engine supports 'skipfooter'
                },
            ))
    else:
        result = one(
            tabula.read_pdf(
                filename,
                pages=pages,
                lattice=use_lattice,
                multiple_tables=False,
                pandas_options={
                    "names": column_names,
                    "skiprows": _header_on_each_page(),
                    "skipfooter": 1,  # The last row is the grand totals
                    "engine": "python",  # Only python engine supports 'skipfooter'
                },
            ))

    result = aggregate_ingest_utils.rename_columns_and_select(
        result,
        {
            "Jurisdiction": "county_name",
            "Total Number of Inmates In Jail":
                "total_number_of_inmates_in_jail",
            "Jail Capacity": "jail_capacity",
            "Number of Inmates Sentenced to State [Number]":
                "number_of_inmates_sentenced_to_state",
            "Number of Inmates Awaiting Trial in Jail [Number]":
                "number_of_inmates_awaiting_trial",
            "Number of Inmates Serving County Sentence [Number]":
                "number_of_inmates_serving_county_sentence",
            "Number of Other Inmates [Number]": "number_of_other_inmates",
        },
    )

    # Tabula may parse extra empty rows
    result = result.dropna()

    # BUG FIX: the converted frame was discarded, so the int cast never took
    # effect in the returned result.  Assign it, matching how
    # cast_columns_to_int's return value is used elsewhere in this module.
    result = aggregate_ingest_utils.cast_columns_to_int(
        result, ignore_columns={"county_name"})
    return result