Beispiel #1
0
def geo_map(df: pd.DataFrame, geo_res: str, sensor: str):
    """
    Map and aggregate a county-level DataFrame to the geographic resolution geo_res.

    Rows whose FIPS code ends in "000" are treated as "unassigned" records.
    They are split off first and only added back for the state/hhs/nation
    resolutions, and for the county resolution when the sensor is not a
    proportion signal.

    Parameters
    ----------
    df: pd.DataFrame
        Columns: fips, timestamp, new_counts, cumulative_counts, ...
        The input is copied and not modified.  `add_county_pop` is applied to
        the copy (presumably attaching the `population` column used below --
        confirm against that helper).
    geo_res: str
        Geographic resolution to which to aggregate.  Must be a member of
        VALID_GEO_RES.  "county" only renames `fips` to `geo_id`; "state" is
        mapped through the "state_id" geocode; every other value is passed to
        GeoMapper.replace_geocode unchanged.
    sensor: str
        Sensor type.  Valid options:
        ("new_counts", "cumulative_counts",
        "incidence", "cumulative_prop")

    Returns
    -------
    pd.DataFrame
        Columns: geo_id, timestamp, ... plus `incidence` and
        `cumulative_prop`, each computed as counts / population * INCIDENCE_BASE.

    Raises
    ------
    ValueError
        If geo_res is not in VALID_GEO_RES.
    """
    # Validate before doing any work on the data.
    if geo_res not in VALID_GEO_RES:
        raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")

    df = df.copy()
    gmpr = GeoMapper()
    df = add_county_pop(df, gmpr)

    # Separate the "XX000" records from the regular county rows.
    is_unassigned = df["fips"].str.endswith("000")
    unassigned_counties = df[is_unassigned].copy()
    df = df[~is_unassigned].copy()

    if geo_res == "county":
        # It is not clear how to calculate the proportion for unallocated
        # cases/deaths, so we exclude them for the prop sensors.
        if sensor not in ("incidence", "cumulative_prop"):
            if not unassigned_counties.empty:
                # pd.concat replaces DataFrame.append (removed in pandas 2.0).
                df = pd.concat([df, unassigned_counties])
        df = df.rename(columns={"fips": "geo_id"})
    elif geo_res in ("state", "hhs", "nation"):
        geo = "state_id" if geo_res == "state" else geo_res
        if not unassigned_counties.empty:
            df = pd.concat([df, unassigned_counties])
        df = gmpr.replace_geocode(df,
                                  "fips",
                                  geo,
                                  new_col="geo_id",
                                  date_col="timestamp")
    else:
        df = gmpr.replace_geocode(df,
                                  "fips",
                                  geo_res,
                                  new_col="geo_id",
                                  date_col="timestamp")

    df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
    df["cumulative_prop"] = (
        df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
    )
    return df
Beispiel #2
0
def geo_map(df: pd.DataFrame, geo_res: str):
    """
    Map and aggregate a county-level DataFrame to the geographic resolution geo_res.

    Parameters
    ----------
    df: pd.DataFrame
        Columns: fips, timestamp, new_counts, cumulative_counts, population ...
        The input is copied and not modified.
    geo_res: str
        Geographic resolution to which to aggregate.  Must be a member of
        VALID_GEO_RES.  "county" only renames `fips` to `geo_id`; "state" is
        mapped through the "state_id" geocode; every other value is passed to
        GeoMapper.replace_geocode unchanged.

    Returns
    -------
    pd.DataFrame
        Columns: geo_id, timestamp, ... plus `incidence` and
        `cumulative_prop`, each computed as counts / population * INCIDENCE_BASE.

    Raises
    ------
    ValueError
        If geo_res is not in VALID_GEO_RES.
    """
    df = df.copy()
    if geo_res not in VALID_GEO_RES:
        raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")

    gmpr = GeoMapper()
    if geo_res == "county":
        df.rename(columns={'fips': 'geo_id'}, inplace=True)
    elif geo_res == "state":
        df = df.set_index("fips")
        # Zero out the state FIPS population to avoid double counting.
        state_fips_codes = {str(x).zfill(2) + "000" for x in range(1, 73)}
        # Select with a boolean mask: indexing .loc with a set is rejected by
        # pandas >= 2.0, and the mask also copes with codes that are absent.
        is_state_fips = df.index.isin(state_fips_codes)
        df.loc[is_state_fips, "population"] = 0
        df = df.reset_index()
        df = gmpr.replace_geocode(df,
                                  "fips",
                                  "state_id",
                                  new_col="geo_id",
                                  date_col="timestamp")
    else:
        df = gmpr.replace_geocode(df,
                                  "fips",
                                  geo_res,
                                  new_col="geo_id",
                                  date_col="timestamp")

    df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
    df["cumulative_prop"] = (
        df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
    )
    return df
    def geo_reindex(self, data):
        """
        Reindex dataframe based on desired output geography.

        The frame is first mapped to `self.geo`, then reindexed onto the full
        product of (observed geo ids) x `self.fit_dates`, filling every missing
        (geo, date) pair and any remaining NaN with 0.

        Args:
            data: dataframe, the output of load_data::load_data().  It is not
                modified by this method.

        Returns:
            reindexed dataframe indexed by (self.geo, "date"), or False if
            self.geo is not a supported geography (an error is logged).

        Raises:
            AssertionError: if there are more (geo, date) pairs than
                GeoConstants.MAX_GEO[self.geo] * len(self.fit_dates).

        """
        # Reject unsupported geographies before building the mapper.
        if self.geo not in ("county", "state", "msa", "hrr", "hhs", "nation"):
            logging.error(
                "%s is invalid, pick one of 'county', 'state', 'msa', 'hrr', 'hhs', 'nation'",
                self.geo)
            return False

        gmpr = GeoMapper()
        if self.geo == "county":
            data_frame = gmpr.fips_to_megacounty(
                data,
                Config.MIN_DEN,
                Config.MAX_BACKWARDS_PAD_LENGTH,
                thr_col="den",
                mega_col=self.geo)
        elif self.geo == "state":
            data_frame = gmpr.replace_geocode(data,
                                              from_code="fips",
                                              new_col=self.geo,
                                              new_code="state_id")
        elif self.geo == "hrr":
            data_frame = data  # data is already adjusted in aggregation step above
        else:  # "msa", "hhs", "nation"
            data_frame = gmpr.replace_geocode(data,
                                              from_code="fips",
                                              new_code=self.geo)

        unique_geo_ids = pd.unique(data_frame[self.geo])
        # Non-inplace on purpose: in the "hrr" branch data_frame *is* the
        # caller's frame, and an inplace set_index would mutate it.
        data_frame = data_frame.set_index([self.geo, 'date'])

        # for each location, fill in all missing dates with 0 values
        multiindex = pd.MultiIndex.from_product(
            (unique_geo_ids, self.fit_dates), names=[self.geo, "date"])
        assert (
            len(multiindex) <=
            (GeoConstants.MAX_GEO[self.geo] * len(self.fit_dates))
        ), "more loc-date pairs than maximum number of geographies x number of dates"
        # fill dataframe with missing dates using 0
        data_frame = data_frame.reindex(multiindex, fill_value=0)
        data_frame.fillna(0, inplace=True)
        return data_frame
Beispiel #4
0
def aggregate(df, metric, geo_res):
    """
    Aggregate zip-level signals to the requested geographic resolution.

    Parameters
    ----------
    df: pd.DataFrame
        Zip Code-level data with prepared metrics (output of
        construct_metrics()).  Must contain the "<metric>_num" column.
        The input is copied and not modified.
    metric: str
        Name of metric to be exported.
    geo_res: str
        Key into GEO_KEY_DICT selecting the target geocode, e.g. one of
        ('county', 'hrr', 'msa', 'state', 'hhs', 'nation').

    Returns
    -------
    pd.DataFrame:
        DataFrame keyed by `geo_id`, carrying the "<metric>_num" count, the
        population, and the derived "<metric>_prop" column
        (count / population * INCIDENCE_BASE).
    """
    frame = df.copy()
    count_col = metric + "_num"
    prop_col = metric + "_prop"

    mapper = GeoMapper()
    target_geo = GEO_KEY_DICT[geo_res]

    # Attach population at the zip level, then roll both columns up together.
    frame = mapper.add_population_column(frame, "zip")
    frame = mapper.replace_geocode(
        frame,
        "zip",
        target_geo,
        date_col="timestamp",
        data_cols=[count_col, "population"],
    )

    frame[prop_col] = frame[count_col] / frame["population"] * INCIDENCE_BASE
    return frame.rename(columns={target_geo: "geo_id"})
    def test_msa_hrr(self, jhu_confirmed_test_data):
        """Check geo_map against a direct GeoMapper aggregation for msa and hrr."""
        count_cols = ["new_counts", "population", "cumulative_counts"]
        for geo in ("msa", "hrr"):
            source = jhu_confirmed_test_data
            mapped = geo_map(source, geo, "cumulative_prop")

            # Build the reference frame by calling GeoMapper directly.
            mapper = GeoMapper()
            expected = mapper.add_population_column(source, "fips")
            expected = mapper.replace_geocode(expected,
                                              "fips",
                                              geo,
                                              date_col="timestamp")

            mapped = mapped.set_index(["geo_id", "timestamp"]).sort_index()
            expected = expected.set_index([geo, "timestamp"]).sort_index()

            # Check that the non-proportional columns are identical
            assert mapped.eq(expected)[count_cols].all().all()

            # Check that the proportional signals are identical
            exp_incidence = (
                expected["new_counts"] / expected["population"] * INCIDENCE_BASE
            )
            exp_cumulative_prop = (
                expected["cumulative_counts"] / expected["population"]
                * INCIDENCE_BASE
            )
            assert mapped["incidence"].eq(exp_incidence).all()
            assert mapped["cumulative_prop"].eq(exp_cumulative_prop).all()

            # Make sure the prop signals don't have inf values
            for prop_col in ("incidence", "cumulative_prop"):
                assert not mapped[prop_col].eq(np.inf).any()
Beispiel #6
0
    def geo_reindex(self, data):
        """Reindex based on geography, include all date, geo pairs.

        The frame is first mapped to `self.geo`, then reindexed onto the full
        product of (observed geo ids) x `self.fit_dates`, filling every missing
        (geo, date) pair and any remaining NaN with 0.

        Args:
            data: dataframe, the output of loadcombineddata
        Returns:
            dataframe indexed by (self.geo, Config.DATE_COL), or False if
            self.geo is not a supported geography (an error is logged).
        Raises:
            AssertionError: if there are more (geo, date) pairs than the
                number of geo values known to the GeoMapper times
                len(self.fit_dates).
        """
        # get right geography
        geo = self.geo
        # Reject unsupported geographies before building the mapper.
        if geo not in {"county", "state", "msa", "hrr", "nation", "hhs"}:
            logging.error(
                "%s is invalid, pick one of 'county', "
                "'state', 'msa', 'hrr', 'hhs','nation'", geo)
            return False

        gmpr = GeoMapper()
        if geo == "county":
            data_frame = gmpr.fips_to_megacounty(data,
                                                 Config.MIN_DEN,
                                                 Config.MAX_BACKFILL_WINDOW,
                                                 thr_col="den",
                                                 mega_col=geo)
        elif geo == "state":
            data_frame = gmpr.replace_geocode(data,
                                              "fips",
                                              "state_id",
                                              new_col="state")
        else:
            data_frame = gmpr.replace_geocode(data, "fips", geo)

        unique_geo_ids = pd.unique(data_frame[geo])
        # Non-inplace so a frame shared with the caller is never mutated.
        data_frame = data_frame.set_index([geo, Config.DATE_COL])
        # for each location, fill in all missing dates with 0 values
        multiindex = pd.MultiIndex.from_product(
            (unique_geo_ids, self.fit_dates), names=[geo, Config.DATE_COL])
        assert (
            len(multiindex) <=
            (len(gmpr.get_geo_values(gmpr.as_mapper_name(geo))) *
             len(self.fit_dates))
        ), "more loc-date pairs than maximum number of geographies x number of dates"
        # fill dataframe with missing dates using 0
        data_frame = data_frame.reindex(multiindex, fill_value=0)
        data_frame.fillna(0, inplace=True)
        return data_frame
def pull_jhu_data(base_url: str, metric: str, gmpr: GeoMapper) -> pd.DataFrame:
    """Pull the latest Johns Hopkins CSSE data, and conform it into a dataset.

    The output dataset has:

    - Each row corresponds to (County, Date), denoted (FIPS, timestamp)
    - Each row additionally has a column `new_counts` corresponding to the
      new counts (either `confirmed` cases or `deaths`), and a column
      `cumulative_counts`, corresponding to the aggregate metric from
      January 22nd (as of April 27th) until the latest date.

    Note that the raw dataset gives the `cumulative_counts` metric, from which
    we compute `new_counts` by taking first differences.  Hence, `new_counts`
    may be negative.  This is wholly dependent on the quality of the raw
    dataset.

    We filter the data such that we only keep rows with valid FIPS or "FIPS"
    codes defined under the exceptions of the README.

    Parameters
    ----------
    base_url: str
        Base URL for pulling the JHU CSSE data.
    metric: str
        One of 'confirmed' or 'deaths'.
    gmpr: GeoMapper
        An instance of the geomapping utility.  If None, a fresh GeoMapper is
        created.

    Returns
    -------
    pd.DataFrame
        Dataframe as described above, with columns ordered as
        fips, timestamp, population, new_counts, cumulative_counts.
    """
    df = download_data(base_url, metric)

    # Use the caller's mapper instead of always discarding it; fall back to a
    # new one only when none was supplied.
    if gmpr is None:
        gmpr = GeoMapper()
    df = gmpr.replace_geocode(
        df, "jhu_uid", "fips", from_col="UID", date_col="timestamp"
    )

    # Merge in population, set population as NAN for fake fips
    df = gmpr.add_population_column(df, "fips")
    df = create_diffs_column(df)

    # Final sanity checks
    sanity_check_data(df)

    # Reorder columns
    df = df[["fips", "timestamp", "population", "new_counts", "cumulative_counts"]]
    return df
    def test_state(self, jhu_confirmed_test_data):
        """Check the state-level output of geo_map against GeoMapper and manual sums."""
        df = jhu_confirmed_test_data
        new_df = geo_map(df, "state")

        gmpr = GeoMapper()
        test_df = gmpr.replace_geocode(df,
                                       "fips",
                                       "state_id",
                                       date_col="timestamp",
                                       new_col="state")

        # Test the same states and timestamps are present
        assert new_df["geo_id"].eq(test_df["state"]).all()
        assert new_df["timestamp"].eq(test_df["timestamp"]).all()

        new_df = new_df.set_index(["geo_id", "timestamp"])
        test_df = test_df.set_index(["state", "timestamp"])

        # Get the Alabama state population total in a different way.
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        summed_population = df.set_index("fips").filter(
            regex=r"01\d{2}[1-9]",
            axis=0).groupby("fips").first()["population"].sum()
        mega_fips_record = df.set_index(["fips", "timestamp"
                                         ]).loc[("01000", "2020-09-15"),
                                                "population"].sum()
        # Compare with the county megaFIPS record
        assert summed_population == mega_fips_record
        # Compare with the population in the transformed df
        assert new_df.loc["al"]["population"].eq(summed_population).all()
        # Make sure diffs and cumulative are equal
        assert new_df["new_counts"].eq(test_df["new_counts"]).all()
        assert new_df["cumulative_counts"].eq(
            test_df["cumulative_counts"]).all()
        # Manually calculate the proportional signals in Alabama and verify equality
        expected_incidence = test_df.loc["al"][
            "new_counts"] / summed_population * INCIDENCE_BASE
        expected_cumulative_prop = test_df.loc["al"][
            "cumulative_counts"] / summed_population * INCIDENCE_BASE
        assert new_df.loc["al", "incidence"].eq(expected_incidence).all()
        assert new_df.loc["al", "cumulative_prop"].eq(
            expected_cumulative_prop).all()
        # Make sure the prop signals don't have inf values
        assert not new_df["incidence"].eq(np.inf).any()
        assert not new_df["cumulative_prop"].eq(np.inf).any()