Exemple #1
0
def add_county_pop(df: pd.DataFrame, gmpr: GeoMapper):
    """
    Attach populations to county-level rows, covering the territory megafips.

    JHU reports Guam, the Northern Mariana Islands, American Samoa, and the
    Virgin Islands under one megafips code each instead of real county codes,
    so a plain county lookup would leave them without a population.  Those four
    codes are mapped to their state id and looked up at that level; all other
    rows are looked up by county code.

    Parameters
    ----------
    df
        DataFrame with county level information and county column named "fips"
    gmpr
        GeoMapper

    Returns
    -------
        New DataFrame with the population added.  Non-territory rows come
        first, then the territory rows, under a fresh integer index.
    """
    territory_megafips = ["78000", "69000", "66000", "60000"]
    territory_mask = df["fips"].isin(territory_megafips)

    # Territories: resolve the population through the state id, then discard
    # that helper column again.
    territory_rows = gmpr.add_geocode(df[territory_mask], "fips", "state_id")
    territory_rows = gmpr.add_population_column(territory_rows,
                                                "state_id",
                                                dropna=False)
    territory_rows = territory_rows.drop(columns="state_id")

    # Everything else: resolve the population directly from the county code.
    county_rows = gmpr.add_population_column(df[~territory_mask],
                                             "fips",
                                             dropna=False)

    return pd.concat([county_rows, territory_rows], ignore_index=True)
Exemple #2
0
def aggregate(df, metric, geo_res):
    """
    Roll zip-level signals up to the requested geographic resolution.

    Parameters
    ----------
    df: pd.DataFrame
        Zip Code-level data with prepared metrics (output of
        construct_metrics()).
    metric: str
        Name of metric to be exported.
    geo_res: str
        Geographic resolution to aggregate to; must be a key of GEO_KEY_DICT.

    Returns
    -------
    pd.DataFrame:
        Aggregated frame whose geography column is named "geo_id" and which
        carries a "<metric>_prop" column computed as
        "<metric>_num" / "population" * INCIDENCE_BASE.
    """
    count_col = metric + "_num"
    prop_col = metric + "_prop"
    geo_key = GEO_KEY_DICT[geo_res]

    mapper = GeoMapper()
    # Work on a copy so the caller's frame is left untouched.
    out = mapper.add_population_column(df.copy(), "zip")
    out = mapper.replace_geocode(out,
                                 "zip",
                                 geo_key,
                                 date_col="timestamp",
                                 data_cols=[count_col, "population"])

    out[prop_col] = out[count_col] / out["population"] * INCIDENCE_BASE
    return out.rename(columns={geo_key: "geo_id"})
    def test_msa_hrr(self, jhu_confirmed_test_data):
        """Check geo_map's msa and hrr output against a direct GeoMapper aggregation."""
        for geo in ("msa", "hrr"):
            actual = geo_map(jhu_confirmed_test_data, geo, "cumulative_prop")

            # Build the reference aggregation straight from the GeoMapper.
            mapper = GeoMapper()
            expected = mapper.add_population_column(jhu_confirmed_test_data, "fips")
            expected = mapper.replace_geocode(expected,
                                              "fips",
                                              geo,
                                              date_col="timestamp")

            actual = actual.set_index(["geo_id", "timestamp"]).sort_index()
            expected = expected.set_index([geo, "timestamp"]).sort_index()

            # Check that the non-proportional columns are identical
            count_cols = ["new_counts", "population", "cumulative_counts"]
            assert actual.eq(expected)[count_cols].all().all()

            # Check that the proportional signals are identical
            expected_incidence = (
                expected["new_counts"] / expected["population"] * INCIDENCE_BASE)
            expected_cumulative_prop = (
                expected["cumulative_counts"] / expected["population"] * INCIDENCE_BASE)
            assert actual["incidence"].eq(expected_incidence).all()
            assert actual["cumulative_prop"].eq(expected_cumulative_prop).all()

            # Make sure the prop signals don't have inf values
            for prop_col in ("incidence", "cumulative_prop"):
                assert not actual[prop_col].eq(np.inf).any()
Exemple #4
0
def update_sensor(
        state_files: List[str],
        mmwr_info: pd.DataFrame,
        output_path: str,
        start_date: datetime,
        end_date: datetime) -> pd.DataFrame:
    """
    Build the COVID-NET hospitalization sensor and export it as CSV files.

    Args:
        state_files: List of JSON files representing COVID-NET hospitalization data for each state
        mmwr_info: Mappings from MMWR week to actual dates, as a pd.DataFrame
        output_path: Path to write the csvs to
        start_date: First sensor date (datetime.datetime), inclusive
        end_date: Last sensor date (datetime.datetime), exclusive

    Returns:
        The processed pd.DataFrame, indexed by ("date", "geo_id"), that was written out
    """
    assert start_date < end_date, "start_date >= end_date"

    # Load every state file and attach calendar dates via the MMWR year/week
    raw_hosp = CovidNet.read_all_hosp_data(state_files)
    dated_hosp = raw_hosp.merge(mmwr_info, how="left",
                                left_on=["mmwr-year", "mmwr-week"],
                                right_on=["year", "weeknumber"])

    # Keep only the exported columns, under their standardized names
    rename_map = APIConfig.HOSP_RENAME_COLS
    hosp_df = dated_hosp.loc[:, rename_map.keys()].rename(columns=rename_map)

    # Keep the half-open window [start_date, end_date)
    in_window = (hosp_df["date"] >= start_date) & (hosp_df["date"] < end_date)
    hosp_df = hosp_df[in_window]

    # Translate state names into two-letter ids; the ids are written back under
    # the original column name and the temporary "state_id" column is dropped
    state_col = APIConfig.STATE_COL
    hosp_df = GeoMapper().add_geocode(hosp_df,
                                      from_col=state_col,
                                      from_code="state_name",
                                      new_code="state_id",
                                      dropna=False)
    hosp_df[state_col] = hosp_df["state_id"]
    hosp_df = hosp_df.drop(columns="state_id")

    assert not hosp_df.duplicated(["date", "geo_id"]).any(), "Non-unique (date, geo_id) pairs"
    hosp_df = hosp_df.set_index(["date", "geo_id"])

    # Fill in remaining expected columns
    hosp_df["se"] = np.nan
    hosp_df["sample_size"] = np.nan

    # Write one set of CSVs per (possibly "wip_"-prefixed) signal name
    wip_setting = read_params()["wip_signal"]
    for signal in add_prefix(SIGNALS, wip_signal=wip_setting, prefix="wip_"):
        write_to_csv(hosp_df, signal, output_path)
    return hosp_df
Exemple #5
0
def geo_map(df: pd.DataFrame, geo_res: str, sensor: str):
    """
    Map and aggregate a DataFrame at the county resolution to the geographic resolution geo_res.

    Rows whose fips ends in "000" (the unassigned/megafips records) are split
    off first.  They are added back for "state", "hhs" and "nation", and for
    "county" only when a non-proportional sensor is requested; for any other
    resolution they are left out.

    Parameters
    ----------
    df: pd.DataFrame
        Columns: fips, timestamp, new_counts, cumulative_counts, ...
        The input is copied, not modified.
    geo_res: str
        Geographic resolution to which to aggregate.  Must be a member of
        VALID_GEO_RES.  "county" keeps the county rows; "state", "hhs" and
        "nation" aggregate with the unassigned rows included; any other value
        is passed to GeoMapper.replace_geocode as the target code.
    sensor: str
        sensor type. Valid options:
        ("new_counts", "cumulative_counts",
        "incidence", "cumulative_prop")

    Returns
    -------
    pd.DataFrame
        Columns: geo_id, timestamp, ..., plus "incidence" and "cumulative_prop".

    Raises
    ------
    ValueError
        If geo_res is not in VALID_GEO_RES.
    """
    df = df.copy()
    if geo_res not in VALID_GEO_RES:
        raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")
    gmpr = GeoMapper()
    df = add_county_pop(df, gmpr)

    is_unassigned = df["fips"].str.endswith("000")
    unassigned_counties = df[is_unassigned].copy()
    df = df[~is_unassigned].copy()

    if geo_res == "county":
        if sensor not in ("incidence", "cumulative_prop"):  # prop signals
            # It is not clear how to calculate the proportion for unallocated
            # cases/deaths, so we exclude them for those sensors.
            if not unassigned_counties.empty:
                # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
                df = pd.concat([df, unassigned_counties])
        df.rename(columns={"fips": "geo_id"}, inplace=True)
    elif geo_res in ("state", "hhs", "nation"):
        geo = "state_id" if geo_res == "state" else geo_res
        if not unassigned_counties.empty:
            df = pd.concat([df, unassigned_counties])
        df = gmpr.replace_geocode(df,
                                  "fips",
                                  geo,
                                  new_col="geo_id",
                                  date_col="timestamp")
    else:
        df = gmpr.replace_geocode(df,
                                  "fips",
                                  geo_res,
                                  new_col="geo_id",
                                  date_col="timestamp")
    df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
    df["cumulative_prop"] = df["cumulative_counts"] / df[
        "population"] * INCIDENCE_BASE
    return df
Exemple #6
0
def geo_map(df: pd.DataFrame, geo_res: str):
    """
    Map and aggregate a DataFrame at the county resolution to the geographic resolution geo_res.

    Parameters
    ----------
    df: pd.DataFrame
        Columns: fips, timestamp, new_counts, cumulative_counts, population ...
        The input is copied, not modified.
    geo_res: str
        Geographic resolution to which to aggregate.  Must be a member of
        VALID_GEO_RES.  "county" only renames "fips" to "geo_id"; "state"
        aggregates to "state_id"; any other value is passed to
        GeoMapper.replace_geocode as the target code.

    Returns
    -------
    pd.DataFrame
        Columns: geo_id, timestamp, ..., plus "incidence" and "cumulative_prop".

    Raises
    ------
    ValueError
        If geo_res is not in VALID_GEO_RES.
    """
    df = df.copy()
    if geo_res not in VALID_GEO_RES:
        raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")

    gmpr = GeoMapper()
    if geo_res == "county":
        df.rename(columns={'fips': 'geo_id'}, inplace=True)
    elif geo_res == "state":
        df = df.set_index("fips")
        # Zero out the state FIPS population to avoid double counting.
        state_fips_codes = {str(x).zfill(2) + "000" for x in range(1, 73)}
        # Boolean mask instead of a set indexer: pandas 2.0 no longer accepts
        # a set inside .loc.
        is_state_fips = df.index.isin(state_fips_codes)
        df.loc[is_state_fips, "population"] = 0
        df = df.reset_index()
        df = gmpr.replace_geocode(df,
                                  "fips",
                                  "state_id",
                                  new_col="geo_id",
                                  date_col="timestamp")
    else:
        df = gmpr.replace_geocode(df,
                                  "fips",
                                  geo_res,
                                  new_col="geo_id",
                                  date_col="timestamp")
    df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
    df["cumulative_prop"] = df["cumulative_counts"] / df[
        "population"] * INCIDENCE_BASE
    return df
    def geo_reindex(self, data):
        """
        Reindex dataframe based on desired output geography.

        The frame is first converted to the geography in self.geo and then
        expanded so that every (geo id, date in self.fit_dates) pair has a row;
        pairs that were missing are filled with 0.

        Args:
            data: dataframe, the output of load_data::load_data()

        Returns:
            reindexed dataframe indexed by (self.geo, "date"), or False when
            self.geo is not a supported geography

        """
        mapper = GeoMapper()
        if self.geo == "county":
            data_frame = mapper.fips_to_megacounty(
                data,
                Config.MIN_DEN,
                Config.MAX_BACKWARDS_PAD_LENGTH,
                thr_col="den",
                mega_col=self.geo)
        elif self.geo == "state":
            data_frame = mapper.replace_geocode(data,
                                                from_code="fips",
                                                new_col=self.geo,
                                                new_code="state_id")
        elif self.geo in ["msa", "hhs", "nation"]:
            data_frame = mapper.replace_geocode(data,
                                                from_code="fips",
                                                new_code=self.geo)
        elif self.geo == "hrr":
            # Original note: data is already adjusted in an earlier aggregation step
            data_frame = data
        else:
            logging.error(
                "%s is invalid, pick one of 'county', 'state', 'msa', 'hrr', 'hhs', 'nation'",
                self.geo)
            return False

        unique_geo_ids = pd.unique(data_frame[self.geo])
        # Not in place: for "hrr" data_frame is the caller's own object, and an
        # in-place set_index would silently change the caller's frame.
        data_frame = data_frame.set_index([self.geo, 'date'])

        # for each location, fill in all missing dates with 0 values
        multiindex = pd.MultiIndex.from_product(
            (unique_geo_ids, self.fit_dates), names=[self.geo, "date"])
        assert (
            len(multiindex) <=
            (GeoConstants.MAX_GEO[self.geo] * len(self.fit_dates))
        ), "more loc-date pairs than maximum number of geographies x number of dates"
        # fill dataframe with missing dates using 0
        data_frame = data_frame.reindex(multiindex, fill_value=0)
        return data_frame.fillna(0)
def pull_jhu_data(base_url: str, metric: str, gmpr: GeoMapper) -> pd.DataFrame:
    """Pull the latest Johns Hopkins CSSE data, and conform it into a dataset.

    The output dataset has:

    - Each row corresponds to (County, Date), denoted (FIPS, timestamp)
    - Each row additionally has a column `new_counts` corresponding to the
      new counts (either `confirmed` cases or `deaths`), and a column
      `cumulative_counts`, corresponding to the aggregate metric from January
      22nd (as of April 27th) until the latest date.
    - Each row also carries the `population` merged in for its FIPS code.

    Note that the raw dataset gives the `cumulative_counts` metric, from which
    we compute `new_counts` by taking first differences.  Hence, `new_counts`
    may be negative.  This is wholly dependent on the quality of the raw
    dataset.

    We filter the data such that we only keep rows with valid FIPS or "FIPS"
    codes defined under the exceptions of the README.

    Parameters
    ----------
    base_url: str
        Base URL for pulling the JHU CSSE data.
    metric: str
        One of 'confirmed' or 'deaths'.
    gmpr: GeoMapper
        An instance of the geomapping utility.  It is used for all geocode
        and population lookups; no second mapper is created.

    Returns
    -------
    pd.DataFrame
        Dataframe as described above, with columns in the order
        fips, timestamp, population, new_counts, cumulative_counts.
    """
    df = download_data(base_url, metric)

    # Use the mapper the caller handed in rather than building a new one.
    df = gmpr.replace_geocode(
        df, "jhu_uid", "fips", from_col="UID", date_col="timestamp"
    )

    # Merge in population.  Original note: population is NaN for fake fips.
    # NOTE(review): that depends on add_population_column's default handling
    # of unmatched codes - confirm.
    df = gmpr.add_population_column(df, "fips")
    df = create_diffs_column(df)

    # Final sanity checks
    sanity_check_data(df)

    # Reorder columns
    return df[["fips", "timestamp", "population", "new_counts", "cumulative_counts"]]
Exemple #9
0
    def test_good_file(self):
        """pull_jhu_data on the small deaths fixture yields the expected columns, in order."""
        gmpr = GeoMapper()
        df = pull_jhu_data(join("test_data", "small_{metric}.csv"), "deaths",
                           gmpr)

        # Compare as plain lists: an element-wise numpy comparison followed by
        # .all() does not give a clean assertion failure when the number of
        # columns differs from the expectation.
        # NOTE(review): confirm this list against the columns pull_jhu_data
        # currently returns.
        assert list(df.columns) == [
            "fips", "timestamp", "new_counts", "cumulative_counts"
        ]
    def test_state(self, jhu_confirmed_test_data):
        """Compare geo_map's state-level output with a direct GeoMapper aggregation."""
        df = jhu_confirmed_test_data
        new_df = geo_map(df, "state")

        gmpr = GeoMapper()
        test_df = gmpr.replace_geocode(df,
                                       "fips",
                                       "state_id",
                                       date_col="timestamp",
                                       new_col="state")

        # Test the same states and timestamps are present
        assert new_df["geo_id"].eq(test_df["state"]).all()
        assert new_df["timestamp"].eq(test_df["timestamp"]).all()

        new_df = new_df.set_index(["geo_id", "timestamp"])
        test_df = test_df.set_index(["state", "timestamp"])

        # Get the Alabama state population total in a different way.
        # The pattern is a raw string: "\d" inside a normal string literal is
        # an invalid escape sequence.
        summed_population = df.set_index("fips").filter(
            regex=r"01\d{2}[1-9]",
            axis=0).groupby("fips").first()["population"].sum()
        mega_fips_record = df.set_index(["fips", "timestamp"
                                         ]).loc[("01000", "2020-09-15"),
                                                "population"].sum()
        # Compare with the county megaFIPS record
        assert summed_population == mega_fips_record
        # Compare with the population in the transformed df
        assert new_df.loc["al"]["population"].eq(summed_population).all()
        # Make sure diffs and cumulative are equal
        assert new_df["new_counts"].eq(test_df["new_counts"]).all()
        assert new_df["cumulative_counts"].eq(
            test_df["cumulative_counts"]).all()
        # Manually calculate the proportional signals in Alabama and verify equality
        expected_incidence = test_df.loc["al"][
            "new_counts"] / summed_population * INCIDENCE_BASE
        expected_cumulative_prop = test_df.loc["al"][
            "cumulative_counts"] / summed_population * INCIDENCE_BASE
        assert new_df.loc["al", "incidence"].eq(expected_incidence).all()
        assert new_df.loc["al", "cumulative_prop"].eq(
            expected_cumulative_prop).all()
        # Make sure the prop signals don't have inf values
        assert not new_df["incidence"].eq(np.inf).any()
        assert not new_df["cumulative_prop"].eq(np.inf).any()
 def test_add_county_pop(self):
     """add_county_pop fills county and territory populations and leaves other megafips as NaN."""
     fips_codes = ["01001", "06000", "06097", "72000", "72153", "78000"]
     actual = add_county_pop(pd.DataFrame({"fips": fips_codes}), GeoMapper())
     expected = pd.DataFrame({
         "fips": fips_codes,
         "population": [55869, np.nan, 494336, np.nan, 42043, 106405],
     })
     pd.testing.assert_frame_equal(actual, expected)
    def geo_reindex(self, data):
        """Reindex based on geography, include all date, geo pairs.

        The frame is converted to the geography in self.geo and then expanded
        so that every (geo id, date in self.fit_dates) pair has a row; pairs
        that were missing are filled with 0.

        Args:
            data: dataframe, the output of loadcombineddata
        Returns:
            dataframe indexed by (self.geo, Config.DATE_COL), or False when
            self.geo is not a supported geography
        """
        # get right geography
        geo = self.geo
        # Validate before building a GeoMapper so an invalid geography is
        # rejected without any mapper work.
        if geo not in {"county", "state", "msa", "hrr", "nation", "hhs"}:
            logging.error("%s is invalid, pick one of 'county', "
                          "'state', 'msa', 'hrr', 'hhs','nation'", geo)
            return False

        gmpr = GeoMapper()
        if geo == "county":
            data_frame = gmpr.fips_to_megacounty(data,
                                                 Config.MIN_DEN,
                                                 Config.MAX_BACKFILL_WINDOW,
                                                 thr_col="den",
                                                 mega_col=geo)
        elif geo == "state":
            data_frame = gmpr.replace_geocode(data, "fips", "state_id", new_col="state")
        else:
            data_frame = gmpr.replace_geocode(data, "fips", geo)

        unique_geo_ids = pd.unique(data_frame[geo])
        data_frame = data_frame.set_index([geo, Config.DATE_COL])
        # for each location, fill in all missing dates with 0 values
        multiindex = pd.MultiIndex.from_product((unique_geo_ids, self.fit_dates),
                                                names=[geo, Config.DATE_COL])
        assert (len(multiindex) <= (Constants.MAX_GEO[geo] * len(self.fit_dates))
                ), "more loc-date pairs than maximum number of geographies x number of dates"
        # fill dataframe with missing dates using 0
        data_frame = data_frame.reindex(multiindex, fill_value=0)
        return data_frame.fillna(0)
Exemple #13
0
def run_module(params: Dict[str, Any]):
    """Run the JHU indicator module.

    Pulls the JHU data once per metric, then for every combination of metric,
    geographic resolution, sensor and smoother aggregates, smooths and exports
    the signal to CSV.  Exports are optionally diffed and archived, and a
    summary of the run is logged at the end.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "base_url": str, URL from which to read upstream data
        - "export_start_date": str, date from which to export data in YYYY-MM-DD format
    - "archive" (optional): if provided, output will be archived with S3
        - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
        - "bucket_name": str, name of S3 bucket to read/write
        - "cache_dir": str, directory of locally cached data
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None

    indicator_params = params["indicator"]
    export_start_date = indicator_params["export_start_date"]
    common_params = params["common"]
    export_dir = common_params["export_dir"]
    base_url = indicator_params["base_url"]
    logger = get_structured_logger(
        __name__,
        filename=common_params.get("log_filename"),
        log_exceptions=common_params.get("log_exceptions", True))

    # Archiving is optional; without an "archive" section nothing is diffed.
    arch_diff = None
    if "archive" in params:
        archive_params = params["archive"]
        arch_diff = S3ArchiveDiffer(
            archive_params["cache_dir"],
            export_dir,
            archive_params["bucket_name"],
            "jhu",
            archive_params["aws_credentials"],
        )
        arch_diff.update_cache()

    gmpr = GeoMapper()
    dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
    signal_combos = product(METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTHERS)
    for metric, geo_res, sensor, smoother in signal_combos:
        print(metric, geo_res, sensor, smoother)
        logger.info(event="generating signal and exporting to CSV",
                    metric=metric,
                    geo_res=geo_res,
                    sensor=sensor,
                    smoother=smoother)

        # Aggregate to appropriate geographic resolution
        df = geo_map(dfs[metric], geo_res, sensor)
        df = df.set_index(["timestamp", "geo_id"])
        # Smooth each geo_id's series on its own
        df["val"] = df[sensor].groupby(level="geo_id").transform(
            SMOOTHERS_MAP[smoother][0])
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df[df["val"].notnull()].reset_index()

        sensor_name = SMOOTHERS_MAP[smoother][1] + SENSOR_NAME_MAP[sensor][0]
        exported_csv_dates = create_export_csv(
            df,
            export_dir=export_dir,
            start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )
        if exported_csv_dates.empty:
            continue

        csv_export_count += exported_csv_dates.size
        # Track the least recent "latest export date" seen across all signals
        latest_export_date = max(exported_csv_dates)
        if not oldest_final_export_date:
            oldest_final_export_date = latest_export_date
        else:
            oldest_final_export_date = min(oldest_final_export_date,
                                           latest_export_date)

    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [
            name for name, diff in common_diffs.items() if diff is not None
        ]
        to_archive.extend(new_files)
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            name: diff
            for name, diff in common_diffs.items() if name not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.now() - oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime(
            "%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
"""Contains geographic mapping tools."""
from delphi_utils import GeoMapper

# Name of the date column, passed to GeoMapper.replace_geocode as date_col.
DATE_COL = "timestamp"
# Columns passed to GeoMapper.replace_geocode as data_cols when zip codes are
# re-mapped to another geography.
DATA_COLS = ['totalTest', 'numUniqueDevices', 'positiveTest', "population"]
GMPR = GeoMapper()  # Use geo utils
# Maps a geographic resolution name to the GeoMapper code used for it.
GEO_KEY_DICT = {
    "county": "fips",
    "msa": "msa",
    "hrr": "hrr",
    "state": "state_id"
}


def geo_map(geo_res, df):
    """Re-map zip-level data to the requested geographic resolution.

    A population column is attached per zip code, then the frame is converted
    from "zip" to the code that GEO_KEY_DICT assigns to `geo_res`.  The input
    frame is copied, not modified.

    Returns
    -------
    For geo_res == "state": the re-mapped DataFrame on its own.
    For any other resolution: a tuple (DataFrame, geo_key), where the frame has
    additionally been passed through add_parent_state.

    NOTE(review): the two return shapes differ - confirm callers handle both.
    """
    geo_key = GEO_KEY_DICT[geo_res]
    # Add population for each zipcode
    with_population = GMPR.add_population_column(df.copy(), "zip")
    # zip -> geo_res
    mapped = GMPR.replace_geocode(with_population,
                                  "zip",
                                  geo_key,
                                  date_col=DATE_COL,
                                  data_cols=DATA_COLS)
    if geo_res != "state":
        # Add parent state
        return add_parent_state(mapped, geo_res, geo_key), geo_key
    return mapped
Exemple #15
0
class TestLoadData:
    """Checks on the frames produced by load_chng_data and load_combined_data."""

    # The fixtures are loaded once, when the class body runs, and are shared by
    # every test below.
    denom_data = load_chng_data(DENOM_FILEPATH, DROP_DATE, "fips",
                                Config.DENOM_COLS, Config.DENOM_DTYPES,
                                Config.DENOM_COL)
    covid_data = load_chng_data(COVID_FILEPATH, DROP_DATE, "fips",
                                Config.COVID_COLS, Config.COVID_DTYPES,
                                Config.COVID_COL)
    combined_data = load_combined_data(DENOM_FILEPATH, COVID_FILEPATH,
                                       DROP_DATE, "fips")
    gmpr = GeoMapper()

    def test_base_unit(self):
        """Loaders reject an unknown base geography or a mismatched column."""
        # Unknown base unit
        with pytest.raises(AssertionError):
            load_chng_data(DENOM_FILEPATH, DROP_DATE, "foo", Config.DENOM_COLS,
                           Config.DENOM_DTYPES, Config.DENOM_COL)

        # Denominator file requested with the COVID column
        with pytest.raises(AssertionError):
            load_chng_data(DENOM_FILEPATH, DROP_DATE, "fips",
                           Config.DENOM_COLS, Config.DENOM_DTYPES,
                           Config.COVID_COL)

        # Unknown base unit for the combined loader
        with pytest.raises(AssertionError):
            load_combined_data(DENOM_FILEPATH, COVID_FILEPATH, DROP_DATE,
                               "foo")

    def test_denom_columns(self):
        """Denominator data is indexed by fips/date and has exactly one column."""
        index_names = self.denom_data.index.names
        assert "fips" in index_names
        assert "date" in index_names

        assert set(self.denom_data.columns) == {"Denominator"}

    def test_claims_columns(self):
        """COVID data is indexed by fips/date and has exactly one column."""
        index_names = self.covid_data.index.names
        assert "fips" in index_names
        assert "date" in index_names

        assert set(self.covid_data.columns) == {"COVID"}

    def test_combined_columns(self):
        """Combined data is indexed by fips/date and has exactly num and den."""
        index_names = self.combined_data.index.names
        assert "fips" in index_names
        assert "date" in index_names

        assert set(self.combined_data.columns) == {"num", "den"}

    def test_edge_values(self):
        """Each frame's dates relate to FIRST_DATA_DATE and DROP_DATE as expected."""
        for frame in (self.denom_data, self.covid_data, self.combined_data):
            dates = frame.index.get_level_values('date')
            assert dates.max() >= Config.FIRST_DATA_DATE
            assert dates.min() < DROP_DATE

    def test_fips_values(self):
        """No frame holds more distinct fips codes than the GeoMapper knows."""
        known_fips_count = len(self.gmpr.get_geo_values("fips"))
        for frame in (self.denom_data, self.covid_data, self.combined_data):
            distinct_fips = frame.index.get_level_values('fips').unique()
            assert len(distinct_fips) <= known_fips_count

    def test_combined_fips_values(self):
        """Combined data has no missing values and preserves both column totals."""
        assert self.combined_data.isna().sum().sum() == 0

        covid_total = self.covid_data["COVID"].sum()
        denom_total = self.denom_data["Denominator"].sum()

        assert self.combined_data["num"].sum() == covid_total
        assert self.combined_data["den"].sum() == denom_total
Exemple #16
0
"""Functions for mapping between geo regions."""
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from delphi_utils import GeoMapper
from .constants import METRICS, COMBINED_METRIC

# Module-level GeoMapper instance, used by generate_transition_matrix for the
# crosswalk and population lookups.
gmpr = GeoMapper()


def generate_transition_matrix(geo_res):
    """
    Generate transition matrix from county to msa/hrr.

    The visible steps load the fips -> geo_res crosswalk and attach county
    populations to use as weights; for "hrr" the population is additionally
    multiplied by the crosswalk's "weight" column.

    NOTE(review): the body visible here stops after the weighting step and has
    no return statement, so as written the function returns None.  It looks
    truncated - restore the remaining steps before relying on it.

    Parameters
    ----------
    geo_res: str
        "msa" or "hrr"

    Returns
    -------
    pd.DataFrame
        Intended result per the original documentation: the county to
        msa/hrr transition matrix.  NOTE(review): not produced by the code
        visible here - confirm against the complete implementation.
    """
    map_df = gmpr._load_crosswalk("fips", geo_res)  # pylint: disable=protected-access
    # Add population as weights
    map_df = gmpr.add_population_column(map_df, "fips")
    if geo_res == "hrr":
        # The hrr crosswalk carries a "weight" column; scale population by it
        map_df["population"] = map_df["population"] * map_df["weight"]
def geo_map(df: pd.DataFrame, geo_res: str, sensor: str):
    """
    Map a DataFrame with county level data and aggregate it to the geographic resolution geo_res.

    Parameters
    ----------
    df: pd.DataFrame
        Columns: fips, timestamp, new_counts, cumulative_counts, population ...
        The input is not modified.
    geo_res: str
        Geographic resolution to which to aggregate.  Must be a member of
        VALID_GEO_RES; "county" and "state" are handled specially, any other
        value is passed to GeoMapper.add_geocode as the target code.
    sensor: str
        sensor type. Valid options:
        ("new_counts", "cumulative_counts",
        "incidence", "cumulative_prop")

    Returns
    -------
    pd.DataFrame
        One row per (geo_id, timestamp) with the summed data columns plus
        "incidence" and "cumulative_prop".

    Raises
    ------
    ValueError
        If geo_res is not in VALID_GEO_RES.
    """
    if geo_res not in VALID_GEO_RES:
        raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")

    # State-level records unassigned to specific counties are coded as fake
    # counties with fips XX000.
    unassigned_counties = df[df["fips"].str.endswith("000")].copy()

    df = df[df["fips"].astype(int) % 1000 != 0].copy()
    # Disburse unallocated cases/deaths in NYC to NYC counties
    df = disburse(df, NYC_FIPS[0][0], NYC_FIPS[0][1])
    df = df[df["fips"] != NYC_FIPS[0][0]]
    geo_mapper = GeoMapper()
    if geo_res == "county":
        if sensor not in PROP_SENSORS:
            # It is not clear how to calculate the proportion for unallocated
            # cases/deaths, so we exclude them for those sensors.
            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            df = pd.concat([df, unassigned_counties])
        df["geo_id"] = df["fips"]
    elif geo_res == "state":
        # Add unallocated cases/deaths, then map each fips to its state id
        df = pd.concat([df, unassigned_counties])
        df = geo_mapper.add_geocode(df, "fips", "state_id", new_col="geo_id")

        # Zero out the state FIPS population to avoid double counting.
        # A boolean mask is used because pandas 2.0 no longer accepts a set
        # inside .loc; the positional array avoids any index alignment.
        state_fips_codes = {str(x).zfill(2) + "000" for x in range(1, 73)}
        is_state_fips = df["fips"].isin(state_fips_codes).to_numpy()
        df.loc[is_state_fips, "population"] = 0
    else:
        # Map "missing" secondary FIPS to those that are in our canonical set
        for fips, fips_list in SECONDARY_FIPS:
            df = disburse(df, fips, fips_list)
        for usafacts_fips, our_fips in REPLACE_FIPS:
            df.loc[df["fips"] == usafacts_fips, "fips"] = our_fips
        merged = geo_mapper.add_geocode(df, "fips", geo_res, new_col="geo_id")
        if geo_res != "hrr":
            merged["weight"] = 1  # Only HRR requires weight
        for weighted_col in ("cumulative_counts", "new_counts", "population"):
            merged[weighted_col] = merged[weighted_col] * merged["weight"]
        df = merged.drop(["weight"], axis=1)
    df = df.drop("fips", axis=1)
    df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()
    df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
    df["cumulative_prop"] =\
         df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
    return df
def run_module():
    """Run the usafacts indicator.

    Reads the run parameters, pulls the USAFacts data once per metric and
    then, for every combination of metric, geographic resolution, sensor and
    smoother, aggregates the data with ``geo_map``, smooths the sensor column
    and exports the result with ``create_export_csv``.  Afterwards the
    exports are diffed against the archive cache; new and changed files are
    archived, and files that failed to archive are reported on stdout.
    """
    params = read_params()
    export_start_date = params["export_start_date"]
    if export_start_date == "latest":
        # "latest" resolves to midnight at the start of yesterday.
        export_start_date = datetime.combine(date.today(), time(
            0, 0)) - timedelta(days=1)
    else:
        export_start_date = datetime.strptime(export_start_date, "%Y-%m-%d")
    export_dir = params["export_dir"]
    base_url = params["base_url"]
    cache_dir = params["cache_dir"]

    arch_diff = S3ArchiveDiffer(cache_dir, export_dir, params["bucket_name"],
                                "usafacts", params["aws_credentials"])
    arch_diff.update_cache()

    geo_mapper = GeoMapper()

    # Pull each metric once; every (geo_res, sensor, smoother) combination
    # below reuses the same pulled frame.
    dfs = {
        metric: pull_usafacts_data(base_url, metric, geo_mapper)
        for metric in METRICS
    }
    for metric, geo_res, sensor, smoother in product(METRICS, GEO_RESOLUTIONS,
                                                     SENSORS, SMOOTHERS):
        print(geo_res, metric, sensor, smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res, sensor)
        # Smooth each region's series on its own.  The aggregated frame holds
        # all geo_ids stacked one after another, so smoothing the whole
        # column at once would let the window of one region run into the
        # values of the neighbouring region.
        smoother_obj = SMOOTHERS_MAP[smoother][0]
        df["val"] = df.groupby("geo_id")[sensor].transform(
            lambda signal, smooth=smoother_obj.smooth: smooth(signal.values))
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df.loc[df["val"].notna(), :]
        # Exported signal name = smoother prefix + base sensor name.
        sensor_name = SMOOTHERS_MAP[smoother][1] + SENSOR_NAME_MAP[sensor][0]
        create_export_csv(
            df,
            export_dir=export_dir,
            start_date=SMOOTHERS_MAP[smoother][3](export_start_date),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )

    # Diff exports, and make incremental versions
    _, common_diffs, new_files = arch_diff.diff_exports()

    # Archive changed and new files only
    to_archive = [f for f, diff in common_diffs.items() if diff is not None]
    to_archive += new_files
    _, fails = arch_diff.archive_exports(to_archive)

    # Filter existing exports to exclude those that failed to archive
    succ_common_diffs = {
        f: diff
        for f, diff in common_diffs.items() if f not in fails
    }
    arch_diff.filter_exports(succ_common_diffs)

    # Report failures: someone should probably look at them
    for exported_file in fails:
        print(f"Failed to archive '{exported_file}'")
from os.path import join

import pandas as pd
import numpy as np
from delphi_utils import GeoMapper
from delphi_usafacts.pull import pull_usafacts_data

# Path template for a well-formed fixture CSV.  The tests pass it to
# pull_usafacts_data together with a metric name (presumably the puller
# fills in the `{metric}` placeholder -- confirm in delphi_usafacts.pull).
base_url_good = "test_data/small_{metric}_pull.csv"

# Path templates for malformed fixture CSVs, keyed by the kind of defect the
# file name refers to (missing days, missing columns, extra columns).
base_url_bad = {
    "missing_days": "test_data/bad_{metric}_missing_days.csv",
    "missing_cols": "test_data/bad_{metric}_missing_cols.csv",
    "extra_cols": "test_data/bad_{metric}_extra_cols.csv"
}

# Module-level GeoMapper instance handed to pull_usafacts_data by the tests.
geo_mapper = GeoMapper()


class TestPullUSAFacts:
    def test_good_file(self):
        metric = "deaths"
        df = pull_usafacts_data(base_url_good, metric, geo_mapper)
        expected_df = pd.DataFrame(
            {
                "fips": ["00001", "00001", "00001", "36009", "36009", "36009"],
                "timestamp": [
                    pd.Timestamp("2020-02-29"),
                    pd.Timestamp("2020-03-01"),
                    pd.Timestamp("2020-03-02"),
                    pd.Timestamp("2020-02-29"),
                    pd.Timestamp("2020-03-01"),
Exemple #20
0
def run_module():
    """Run the JHU indicator module.

    Reads the run parameters, pulls the JHU data once per metric and then,
    for every combination of metric, geographic resolution, sensor and
    smoother, aggregates the data with ``geo_map``, smooths the sensor column
    per ``geo_id`` and exports the result with ``create_export_csv``.  When a
    bucket name is configured, the exports are afterwards diffed against the
    archive cache; new and changed files are archived, and files that failed
    to archive are reported on stdout.

    Raises
    ------
    ValueError
        If ``params["export_start_date"]`` is not in ``%Y-%m-%d`` format.
    """
    params = read_params()
    # Parse the start date once, up front: it is identical for every export,
    # and a malformed value now fails before any data is pulled.
    export_start_date = datetime.strptime(params["export_start_date"],
                                          "%Y-%m-%d")
    export_dir = params["export_dir"]
    base_url = params["base_url"]
    cache_dir = params["cache_dir"]
    logger = get_structured_logger(__name__,
                                   filename=params.get("log_filename"))

    # Archiving is optional: it is skipped when no bucket name is configured.
    if params["bucket_name"]:
        arch_diff = S3ArchiveDiffer(
            cache_dir,
            export_dir,
            params["bucket_name"],
            "jhu",
            params["aws_credentials"],
        )
        arch_diff.update_cache()
    else:
        arch_diff = None

    gmpr = GeoMapper()
    # Pull each metric once; every (geo_res, sensor, smoother) combination
    # below reuses the same pulled frame.
    dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
    for metric, geo_res, sensor, smoother in product(METRICS, GEO_RESOLUTIONS,
                                                     SENSORS, SMOOTHERS):
        print(metric, geo_res, sensor, smoother)
        logger.info(event="generating signal and exporting to CSV",
                    metric=metric,
                    geo_res=geo_res,
                    sensor=sensor,
                    smoother=smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res)
        df = df.set_index(["timestamp", "geo_id"])
        # Index level 1 is geo_id, so each region is smoothed on its own.
        df["val"] = df[sensor].groupby(level=1).transform(
            SMOOTHERS_MAP[smoother][0])
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df[df["val"].notna()]
        df = df.reset_index()
        # Exported signal name = smoother prefix + base sensor name.
        sensor_name = SMOOTHERS_MAP[smoother][1] + SENSOR_NAME_MAP[sensor][0]
        create_export_csv(
            df,
            export_dir=export_dir,
            start_date=export_start_date,
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )

    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [
            f for f, diff in common_diffs.items() if diff is not None
        ]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff
            for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")
Exemple #21
0
 def test_missing_days(self):
     """Expect ``ValueError`` when pulling the "missing days" fixture."""
     mapper = GeoMapper()
     bad_url = join("test_data", "bad_{metric}_missing_days.csv")
     with pytest.raises(ValueError):
         pull_jhu_data(bad_url, "confirmed", mapper)