def parse(location: str, filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the Florida report into its county and facility tables.

    Each table is passed through ``fips.add_column_to_df`` for Florida and
    then stamped with the report date parsed from ``filename`` together with
    monthly aggregation-window and report-frequency values.

    :param location: forwarded unchanged to the table parsers
    :param filename: forwarded to the table parsers and to ``_parse_date``
    :return: mapping from each aggregate class to its parsed DataFrame
    """
    _setup()

    county_df = _parse_county_table(location, filename)
    county_df = fips.add_column_to_df(county_df, county_df.county_name,
                                      us.states.FL)

    # TODO(#689): Also set the facility_fips
    facility_df = _parse_facility_table(location, filename)
    # The facility table is joined using a county name derived from each
    # facility name.
    facility_counties = facility_df.facility_name.apply(
        _pretend_facility_is_county)
    facility_df = fips.add_column_to_df(facility_df, facility_counties,
                                        us.states.FL)

    tables_by_class = {
        FlCountyAggregate: county_df,
        FlFacilityAggregate: facility_df,
    }

    # Metadata columns shared by both tables
    report_date = _parse_date(filename)
    for frame in tables_by_class.values():
        frame['report_date'] = report_date
        frame['aggregation_window'] = enum_strings.monthly_granularity
        frame['report_frequency'] = enum_strings.monthly_granularity

    return tables_by_class
Example #2
0
    def testCountyNotInState_RaisesFipsMergingError(self):
        """Joining a county against a state that lacks it must raise."""
        # Arrange
        county_frame = pd.DataFrame({"county": ["Jo Daviess County"]})
        incorrect_state = us.states.FL

        # Act/Assert
        with self.assertRaises(FipsMergingError):
            fips.add_column_to_df(county_frame, county_frame.county,
                                  incorrect_state)
Example #3
0
    def testStateWithNoFipsMappings_RaisesFipsMergingError(self):
        """A state whose fips code maps to no counties must raise."""
        # Arrange
        class _UnmappedState:
            # Non-existing state fips maps to no county fips
            fips = "123456789"

        county_frame = pd.DataFrame({"county": ["Jo Daviess County"]})

        # Act/Assert
        with self.assertRaises(FipsMergingError):
            fips.add_column_to_df(county_frame, county_frame.county,
                                  _UnmappedState)
Example #4
0
    def testStateWithNoFipsMappings_RaisesFipsMergingError(self) -> None:
        """A state whose fips code maps to no counties must raise."""
        # Arrange: build a state whose fips code does not exist, so that it
        # maps to no county fips
        unmapped_state = us.states.State(
            abbr="FS",
            fips="123456789",
            name="Fake State",
        )
        county_frame = pd.DataFrame({"county": ["Jo Daviess County"]})

        # Act/Assert
        with self.assertRaises(FipsMergingError):
            fips.add_column_to_df(county_frame, county_frame.county,
                                  unmapped_state)
Example #5
0
def _parse_tab_2(filename: str):
    """Parse the sheet at position 1 of the PA workbook into long format.

    The sheet is read with its first column holding county names and the
    remaining column labels treated as report dates; the result has one row
    per (report_date, county_name) pair carrying the pre-sentenced
    population, plus fips and granularity columns.
    """
    sheet = pd.read_excel(filename, sheet_name=1, header=1)

    # The trailing four rows are the Totals footer
    sheet = sheet[:-4]

    # Name the first column and the column axis, then index by county
    first_column = sheet.columns[0]
    wide = sheet.rename({first_column: 'county_name'}, axis='columns')
    wide = wide.set_index('county_name').rename_axis('report_date',
                                                     axis='columns')

    # Turn every date column into its own set of rows
    long_form = wide.unstack().rename('pre_sentenced_population')
    long_form = long_form.reset_index()

    long_form['county_name'] = long_form['county_name'].str.rstrip(' ')
    long_form['report_date'] = long_form['report_date'].dt.date
    long_form['pre_sentenced_population'] = _to_numeric(
        long_form['pre_sentenced_population'])

    long_form = fips.add_column_to_df(long_form, long_form['county_name'],
                                      us.states.PA)
    long_form['aggregation_window'] = enum_strings.daily_granularity
    long_form['report_frequency'] = enum_strings.quarterly_granularity

    return long_form
def _parse_tab_2(filename: str):
    """Parse the sheet at position 1 of the PA workbook into long format.

    The workbook is read with the openpyxl engine. The first column holds
    county names and the remaining column labels are treated as report
    dates; the result has one row per (report_date, county_name) pair
    carrying the pre-sentenced population, plus fips and granularity columns.
    """
    sheet = pd.read_excel(filename, sheet_name=1, header=1, engine="openpyxl")

    # The trailing four rows are the Totals footer
    sheet = sheet[:-4]

    # Name the first column and the column axis, then index by county
    county_header = sheet.columns[0]
    wide = (sheet.rename({county_header: "county_name"}, axis="columns")
            .set_index("county_name")
            .rename_axis("report_date", axis="columns"))

    # Turn every date column into its own set of rows
    stacked = wide.unstack()
    stacked = stacked.rename("pre_sentenced_population")
    rows = stacked.reset_index()

    rows["county_name"] = rows["county_name"].str.rstrip(" ")
    rows["report_date"] = rows["report_date"].dt.date
    rows["pre_sentenced_population"] = _to_numeric(
        rows["pre_sentenced_population"])

    rows = fips.add_column_to_df(rows, rows["county_name"], us.states.PA)
    rows["aggregation_window"] = enum_strings.daily_granularity
    rows["report_frequency"] = enum_strings.quarterly_granularity

    return rows
Example #7
0
def parse(filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the West Virginia report into a single facility table.

    The parsed table is joined through ``fips.add_column_to_df`` on its
    ``county`` column and marked with daily aggregation-window and
    report-frequency values.

    :return: single-entry mapping of ``WvFacilityAggregate`` to the table
    """
    table = _parse_table(filename)

    county_names = table["county"]
    table = fips.add_column_to_df(table, county_names,
                                  us.states.WV)  # type: ignore

    table["aggregation_window"] = enum_strings.daily_granularity
    table["report_frequency"] = enum_strings.daily_granularity

    return {WvFacilityAggregate: table}
Example #8
0
def parse(_, filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the California report into a single facility table.

    The first positional argument is accepted but ignored. County names for
    the fips join are derived from each row's ``jurisdiction_name``.

    :return: single-entry mapping of ``CaFacilityAggregate`` to the table
    """
    facility_df = _parse_table(filename)

    # Derive a county name per jurisdiction for the fips join
    jurisdiction_counties = facility_df.jurisdiction_name.map(
        _pretend_jurisdiction_is_county)
    facility_df = fips.add_column_to_df(facility_df, jurisdiction_counties,
                                        us.states.CA)

    facility_df['aggregation_window'] = enum_strings.monthly_granularity
    facility_df['report_frequency'] = enum_strings.monthly_granularity

    return {CaFacilityAggregate: facility_df}
def parse(location: str, filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the Texas report into a single county table.

    The report date is parsed from ``filename`` first because the table
    parser takes it as an argument; it is also written to the
    ``report_date`` column.

    :return: single-entry mapping of ``TxCountyAggregate`` to the table
    """
    date_of_report = _parse_date(filename)
    county_df = _parse_table(location, filename, date_of_report)

    # Derive a county name per facility for the fips join
    facility_counties = county_df.facility_name.map(
        _pretend_facility_is_county)
    county_df = fips.add_column_to_df(county_df, facility_counties,
                                      us.states.TX)

    county_df['report_date'] = date_of_report
    county_df['aggregation_window'] = enum_strings.daily_granularity
    county_df['report_frequency'] = enum_strings.monthly_granularity

    return {TxCountyAggregate: county_df}
Example #10
0
def parse(location: str, filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the Georgia report into a single county table.

    County names are sanitized before being handed to
    ``fips.add_column_to_df``; the report date is parsed from ``filename``.

    :return: single-entry mapping of ``GaCountyAggregate`` to the table
    """
    county_df = _parse_table(location, filename)

    # Match each sanitized county_name to a county fips
    sanitized_names = county_df.county_name.map(_sanitize_county_name)
    county_df = fips.add_column_to_df(county_df, sanitized_names,
                                      us.states.GA)

    county_df['report_date'] = _parse_date(filename)
    county_df['aggregation_window'] = enum_strings.daily_granularity
    county_df['report_frequency'] = enum_strings.monthly_granularity

    return {GaCountyAggregate: county_df}
def parse(filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the Kentucky report into a single facility table.

    County names for the fips join are derived from each row's
    ``facility_name``; the report date is parsed from ``filename``.

    :return: single-entry mapping of ``KyFacilityAggregate`` to the table
    """
    facility_df = _parse_table(filename)

    # Match each facility_name to a county fips via a derived county name
    facility_counties = facility_df.facility_name.map(
        _pretend_facility_is_county)
    facility_df = fips.add_column_to_df(facility_df, facility_counties,
                                        us.states.KY)

    facility_df["report_date"] = parse_date(filename)
    facility_df["aggregation_window"] = enum_strings.daily_granularity
    facility_df["report_frequency"] = enum_strings.weekly_granularity

    return {KyFacilityAggregate: facility_df}
Example #12
0
def parse(location: str, filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the New York report into a single facility table.

    County names for the fips join are derived from each row's
    ``facility_name``; both granularity columns are set to monthly.

    :return: single-entry mapping of ``NyFacilityAggregate`` to the table
    """
    _setup()

    facility_df = _parse_table(location, filename)

    # Match each facility_name to a county fips via a derived county name
    facility_counties = facility_df.facility_name.map(
        _pretend_facility_is_county)
    facility_df = fips.add_column_to_df(facility_df, facility_counties,
                                        us.states.NY)

    facility_df['aggregation_window'] = enum_strings.monthly_granularity
    facility_df['report_frequency'] = enum_strings.monthly_granularity

    return {NyFacilityAggregate: facility_df}
def parse(filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse the Massachusetts report into a single facility table.

    The table's ``county`` column is used directly for the fips join; the
    report date is parsed from ``filename``.

    :return: single-entry mapping of ``MaFacilityAggregate`` to the table
    """
    df = _parse_table(filename)
    date_of_report = _parse_date(filename)

    df = fips.add_column_to_df(df, df.county, us.states.MA)  # type: ignore

    df["report_date"] = date_of_report
    df["aggregation_window"] = enum_strings.daily_granularity
    df["report_frequency"] = enum_strings.weekly_granularity

    return {MaFacilityAggregate: df}
def _parse_tab_1(filename: str) -> pd.DataFrame:
    """Parses the first tab in the PA aggregate report.

    Reads the sheet with the openpyxl engine, trims the sub-header and
    footer rows, selects and renames the columns of interest, cleans the
    cell text, and adds report date, fips and yearly granularity columns.
    """
    # Header pattern (regex) -> output column name
    header_patterns = {
        r"County Name": "facility_name",
        r"Bed Capacity": "bed_capacity",
        r".*Community Corrections Beds.*":
        "work_release_community_corrections_beds",
        r".*In-House Daily Pop.*": "in_house_adp",
        r".*Housed Elsewhere Daily Pop.*": "housed_elsewhere_adp",
        r".*In-House Work Release.*": "work_release_adp",
        r"Admissions": "admissions",
        r"Discharge": "discharge",
    }

    def _strip_markers(cell):
        # Some cells have extra '*'
        return str(cell).rstrip(" *")

    # Default NA detection is turned off so everything is parsed directly,
    # which allows "N/A" and "N/R" to be mapped correctly afterwards
    sheet = pd.read_excel(
        filename,
        sheet_name=0,
        header=1,
        keep_default_na=False,
        engine="openpyxl",
    )

    # Drop "F/T" and "P/T" line
    sheet = sheet[1:]

    # Drop Totals footer
    sheet = sheet[:-9]

    sheet.columns = [name.rstrip(" ") for name in sheet.columns]
    selected = aggregate_ingest_utils.rename_columns_and_select(
        sheet, header_patterns, use_regex=True)

    selected = selected.applymap(_strip_markers)
    selected = selected.apply(_to_numeric)

    selected["report_date"] = _report_date_tab_1(filename)
    selected = fips.add_column_to_df(selected, selected["facility_name"],
                                     us.states.PA)
    selected["aggregation_window"] = enum_strings.yearly_granularity
    selected["report_frequency"] = enum_strings.yearly_granularity

    return selected.reset_index(drop=True)
Example #15
0
def _parse_tab_1(filename: str) -> pd.DataFrame:
    """Parses the first tab in the PA aggregate report.

    Reads the sheet, trims the sub-header and footer rows, selects and
    renames the columns of interest, cleans the cell text, and adds report
    date, fips and yearly granularity columns.
    """
    # Header pattern (regex) -> output column name
    header_patterns = {
        r'County Name': 'facility_name',
        r'Bed Capacity': 'bed_capacity',
        r'.*Community Corrections Beds.*':
        'work_release_community_corrections_beds',
        r'.*In-House Daily Pop.*': 'in_house_adp',
        r'.*Housed Elsewhere Daily Pop.*': 'housed_elsewhere_adp',
        r'.*In-House Work Release.*': 'work_release_adp',
        r'Admissions': 'admissions',
        r'Discharge': 'discharge',
    }

    def _strip_markers(cell):
        # Some cells have extra '*'
        return str(cell).rstrip(' *')

    # Default NA detection is turned off so everything is parsed directly,
    # which allows "N/A" and "N/R" to be mapped correctly afterwards
    sheet = pd.read_excel(filename,
                          sheet_name=0,
                          header=1,
                          keep_default_na=False)

    # Drop "F/T" and "P/T" line
    sheet = sheet[1:]

    # Drop Totals footer
    sheet = sheet[:-9]

    sheet.columns = [name.rstrip(' ') for name in sheet.columns]
    selected = aggregate_ingest_utils.rename_columns_and_select(
        sheet, header_patterns, use_regex=True)

    selected = selected.applymap(_strip_markers)
    selected = selected.apply(_to_numeric)

    selected['report_date'] = _report_date_tab_1(filename)
    selected = fips.add_column_to_df(selected, selected['facility_name'],
                                     us.states.PA)
    selected['aggregation_window'] = enum_strings.yearly_granularity
    selected['report_frequency'] = enum_strings.yearly_granularity

    return selected.reset_index(drop=True)
Example #16
0
def parse(location: str, filename: str) -> Dict[DeclarativeMeta, pd.DataFrame]:
    """Parse a Tennessee jail population report into a single table.

    There are two types of reports, total jail population and female jail
    population. They are very similar but need slightly different handling,
    so the report type (detected by 'female' appearing in ``filename``) is
    passed to the table parser and decides which aggregate class keys the
    result.

    :return: single-entry mapping of the matching aggregate class to the
        parsed table
    """
    is_female = 'female' in filename
    date_of_report = _parse_date(filename)

    facility_df = _parse_table(location, filename, is_female, date_of_report)

    # Derive a county name per facility for the fips join
    facility_counties = facility_df.facility_name.apply(
        _pretend_facility_is_county)
    facility_df = fips.add_column_to_df(facility_df, facility_counties,
                                        us.states.TN)

    facility_df['report_date'] = date_of_report
    facility_df['aggregation_window'] = enum_strings.daily_granularity
    facility_df['report_frequency'] = enum_strings.monthly_granularity

    if is_female:
        aggregate_cls = TnFacilityFemaleAggregate
    else:
        aggregate_cls = TnFacilityAggregate
    return {aggregate_cls: facility_df}
Example #17
0
    def testValidCountyNames_SanitizesFieldsAndFuzzyJoinsFips(self):
        """Inexact but valid IL county names still receive their fips."""
        # Arrange
        county_names = [
            'DuPage',  # No County suffix
            'Efingham County',  # Spelled incorrect: Efingham -> Effingham
            'Jo Daviess County',  # Exact match
        ]
        expected_fips = [
            _DUPAGE_COUNTY_FIPS,
            _EFFINGHAM_COUNTY_FIPS,
            _JO_DAVIESS_COUNTY_FIPS,
        ]
        subject = pd.DataFrame({'county': county_names})

        # Act
        actual = fips.add_column_to_df(subject, subject.county, us.states.IL)

        # Assert
        expected = pd.DataFrame({
            'county': ['DuPage', 'Efingham County', 'Jo Daviess County'],
            'fips': expected_fips,
        })
        assert_frame_equal(actual, expected)
Example #18
0
    def testValidCountyNames_SanitizesFieldsAndFuzzyJoinsFips(self) -> None:
        """Inexact but valid IL county names still receive their fips."""
        # Arrange
        county_names = [
            "DuPage",  # No County suffix
            "Efingham County",  # Spelled incorrect: Efingham -> Effingham
            "Jo Daviess County",  # Exact match
        ]
        expected_fips = [
            _DUPAGE_COUNTY_FIPS,
            _EFFINGHAM_COUNTY_FIPS,
            _JO_DAVIESS_COUNTY_FIPS,
        ]
        subject = pd.DataFrame({"county": county_names})

        # Act
        actual = fips.add_column_to_df(subject, subject.county, us.states.IL)

        # Assert
        expected = pd.DataFrame({
            "county": ["DuPage", "Efingham County", "Jo Daviess County"],
            "fips": expected_fips,
        })
        assert_frame_equal(actual, expected)
Example #19
0
def _parse_table(filename: str) -> pd.DataFrame:
    """
    Parse the Colorado jail data csv by turning the data into wide format
    consistent with other aggregate data integration efforts.

    The result is indexed by (date_collected, County) while it is being
    built: the measure-independent columns are kept once, the
    measure-specific columns are appended with a ``_<measure>`` suffix, and
    an ``na_message`` column accumulates the "Not Available" notes of every
    measure. The index is reset and the column names are normalised before
    returning.

    :param filename: path of the csv file, read with cp1252 encoding
    :return: parsed colorado df
    """

    data = pd.read_csv(filename, encoding="cp1252")
    county_names = data.County
    data = fips.add_column_to_df(data, county_names,
                                 us.states.CO)  # type: ignore

    # Quarter number -> month/day on which that quarter starts
    quarter_start = {1: "1/1", 2: "4/1", 3: "7/1", 4: "10/1"}

    def label_date(row):
        """Impute date_collected based on collection quarter"""
        md = quarter_start[row["Qtr"]]
        return f"{md}/{row['QtrYear']}"

    data["date_collected"] = data.apply(label_date, axis=1)
    data["date_collected"] = pd.to_datetime(data["date_collected"])
    data = data.assign(
        datecounty=data["date_collected"].dt.strftime("%Y-%m-%d") +
        data["County"])

    def move_column_inplace(df, col, pos):
        """Move newly created columns to the front of the dataset"""
        col = df.pop(col)
        df.insert(pos, col.name, col)

    move_column_inplace(data, "date_collected", 0)
    move_column_inplace(data, "fips", 0)

    # We use the unique county jail and date collected as a unique identifier
    # for each row in the final dataset.
    data = data.set_index(["date_collected", "County"])

    # Create head, a df that keeps the columns that are independent of the
    # "measures" used to categorize the data (e.g. beds, deaths) for each
    # unique county jail at a specific date: the first 9 columns left after
    # the two index columns were removed. The columns that differ between
    # measures are concatenated to the end of head below.
    head = data.iloc[:, :9].drop_duplicates()

    # Create a tail df for each type of measure. Concatenate each of the
    # tail df's for each measure to the head df:
    for measure in data.Measure.unique():
        num_inmates_df = data[data.Measure == measure]
        tail = num_inmates_df.iloc[:, 11:21].add_suffix(
            "_" + measure).drop_duplicates()
        head = pd.concat([head, tail], axis=1, join="outer")

    # add a na_message column, which accumulates all na messages for any measure
    # in the CO dataset per county.
    # np.nan is used rather than np.NaN: the NaN alias was removed in
    # NumPy 2.0, while np.nan is available in every NumPy version.
    data = data.assign(na_message=np.where(
        data["Not Available"].isnull(),
        np.nan,
        "**" + data.Measure + "**: " + data["Not Available"] + ", ",
    ))
    na_mappings = (data.fillna("").groupby(["date_collected", "County"
                                            ]).agg({"na_message": "".join}))
    final = pd.concat([head, na_mappings], axis=1, join="inner")

    # turn off multi-indexing
    final = final.reset_index()

    # clean column names
    final.columns = final.columns.str.replace(" - ", "_")
    final.columns = final.columns.str.replace(" ", "_")
    final.columns = final.columns.str.replace("-", "_")
    final.columns = final.columns.str.lower()

    return final
Example #20
0
def add_fips_to_state_df(df: pd.DataFrame) -> pd.DataFrame:
    """Join county fips onto a copy of one state's DataFrame.

    The state is resolved from ``df.name`` through ``StateCode``, and the
    county names are taken from the ``TEMP_COUNTY_NAME_COL`` column. The
    input frame itself is not modified because the join works on a copy.

    NOTE(review): ``df.name`` is not a standard DataFrame attribute;
    presumably this is called via ``groupby(...).apply`` so that pandas sets
    it to the group key. Confirm against the caller.
    """
    code = StateCode(df.name)
    working_copy = df.copy()
    county_names = working_copy[TEMP_COUNTY_NAME_COL]
    return fips.add_column_to_df(working_copy, county_names, code.get_state())