Esempio n. 1
0
def add_county_pop(df: pd.DataFrame, gmpr: GeoMapper):
    """
    Attach a population column to county-level data, including territory megafips.

    Rows whose "fips" is one of the megafips codes 78000, 69000, 66000 or 60000
    (Virgin Islands, Northern Mariana Islands, Guam, American Samoa) do not
    correspond to real counties, so a plain county lookup would leave them
    without a population.  For those rows the fips is first mapped to a
    state_id and the population is looked up by state_id; every other row gets
    its population looked up directly by "fips".

    Parameters
    ----------
    df
        DataFrame with county level information and a county column named "fips"
    gmpr
        GeoMapper used for the geocode and population lookups

    Returns
    -------
        DataFrame with the population column added.  Non-territory rows come
        first, territory rows last, and the index is renumbered from 0.
    """
    megafips_mask = df.fips.isin(["78000", "69000", "66000", "60000"])

    # Territory rows: go through state_id to pick up the population, then
    # discard the helper state_id column again.
    mega_rows = gmpr.add_geocode(df[megafips_mask], "fips", "state_id")
    mega_rows = gmpr.add_population_column(mega_rows, "state_id", dropna=False)
    mega_rows = mega_rows.drop(columns="state_id")

    # All remaining rows: population is keyed directly on the county fips.
    county_rows = gmpr.add_population_column(
        df[~megafips_mask], "fips", dropna=False
    )

    return pd.concat([county_rows, mega_rows], ignore_index=True)
Esempio n. 2
0
def update_sensor(
        state_files: List[str],
        mmwr_info: pd.DataFrame,
        output_path: str,
        start_date: datetime,
        end_date: datetime) -> pd.DataFrame:
    """
    Build the hospitalization sensor table and export one CSV set per signal.

    Args:
        state_files: List of JSON files representing COVID-NET hospitalization data for each state
        mmwr_info: Mappings from MMWR week to actual dates, as a pd.DataFrame
            (joined on its "year" and "weeknumber" columns)
        output_path: Path to write the csvs to
        start_date: First sensor date, inclusive (datetime.datetime)
        end_date: End of the sensor window, exclusive (datetime.datetime)

    Returns:
        The processed pd.DataFrame, indexed by (date, geo_id), with "se" and
        "sample_size" columns filled with NaN

    Raises:
        AssertionError: if start_date is not strictly before end_date, or if
            the processed data contains duplicate (date, geo_id) pairs
    """
    assert start_date < end_date, "start_date >= end_date"

    # Load every state's hospitalization records and attach the MMWR week info
    raw_hosp = CovidNet.read_all_hosp_data(state_files)
    with_weeks = raw_hosp.merge(
        mmwr_info,
        how="left",
        left_on=["mmwr-year", "mmwr-week"],
        right_on=["year", "weeknumber"],
    )

    # Keep only the columns we export, under their standardized names
    selected = with_weeks.loc[:, APIConfig.HOSP_RENAME_COLS.keys()]
    selected = selected.rename(columns=APIConfig.HOSP_RENAME_COLS)

    # Half-open date window: [start_date, end_date)
    on_or_after_start = selected["date"] >= start_date
    before_end = selected["date"] < end_date
    windowed = selected[on_or_after_start & before_end]

    # Translate full state names into two-letter abbreviations
    geo_mapper = GeoMapper()
    hosp_df = geo_mapper.add_geocode(
        windowed,
        from_col=APIConfig.STATE_COL,
        from_code="state_name",
        new_code="state_id",
        dropna=False,
    )
    # Keep the original column name: overwrite it with the abbreviation and
    # discard the temporary state_id column
    hosp_df[APIConfig.STATE_COL] = hosp_df["state_id"]
    hosp_df = hosp_df.drop(columns="state_id")
    assert not hosp_df.duplicated(["date", "geo_id"]).any(), "Non-unique (date, geo_id) pairs"
    hosp_df = hosp_df.set_index(["date", "geo_id"])

    # These columns are expected downstream but have no values for this source
    hosp_df["se"] = np.nan
    hosp_df["sample_size"] = np.nan

    # Export once per (possibly "wip_"-prefixed) signal name
    wip_setting = read_params()["wip_signal"]
    for signal_name in add_prefix(SIGNALS, wip_signal=wip_setting, prefix="wip_"):
        write_to_csv(hosp_df, signal_name, output_path)
    return hosp_df
Esempio n. 3
0
def geo_map(df: pd.DataFrame, geo_res: str, sensor: str):
    """
    Map a DataFrame with county level data and aggregate it to the geographic resolution geo_res.

    Parameters
    ----------
    df: pd.DataFrame
        Columns: fips, timestamp, new_counts, cumulative_counts, population ...
        "fips" must hold strings that can be parsed as integers.
    geo_res: str
        Geographic resolution to which to aggregate.  Valid options:
        ("county", "state", "msa", "hrr").
    sensor: str
        sensor type. Valid options:
        ("new_counts", "cumulative_counts",
        "incidence", "cumulative_prop")

    Returns
    -------
    pd.DataFrame
        Columns: geo_id, timestamp, ... plus the derived "incidence" and
        "cumulative_prop" columns.  All remaining columns are summed within
        each (geo_id, timestamp) group.

    Raises
    ------
    ValueError
        If geo_res is not one of VALID_GEO_RES.
    """
    if geo_res not in VALID_GEO_RES:
        raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")

    # State-level records unassigned to specific counties are coded as fake
    # counties with fips XX000.
    unassigned_counties = df[df["fips"].str.endswith("000")].copy()

    df = df[df["fips"].astype(int) % 1000 != 0].copy()
    # Disburse unallocated cases/deaths in NYC to NYC counties
    df = disburse(df, NYC_FIPS[0][0], NYC_FIPS[0][1])
    df = df[df["fips"] != NYC_FIPS[0][0]]
    geo_mapper = GeoMapper()
    if geo_res == "county":
        if sensor not in PROP_SENSORS:
            # Count sensors keep the unallocated XX000 rows.  For proportion
            # sensors it is not clear how to calculate a proportion for
            # unallocated cases/deaths, so those rows are left out.
            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            df = pd.concat([df, unassigned_counties])
        df["geo_id"] = df["fips"]
    elif geo_res == "state":
        # Add unallocated cases/deaths back in, then map each fips to its
        # state's postal code.
        df = pd.concat([df, unassigned_counties])
        df = geo_mapper.add_geocode(df, "fips", "state_id", new_col="geo_id")

        # Zero out the state FIPS population to avoid double counting.
        # A boolean mask is used because pandas 2.0 no longer accepts a set
        # as a .loc indexer.
        state_fips_codes = {str(x).zfill(2) + "000" for x in range(1, 73)}
        df.loc[df["fips"].isin(state_fips_codes), "population"] = 0
    else:
        # Map "missing" secondary FIPS to those that are in our canonical set
        for fips, fips_list in SECONDARY_FIPS:
            df = disburse(df, fips, fips_list)
        for usafacts_fips, our_fips in REPLACE_FIPS:
            df.loc[df["fips"] == usafacts_fips, "fips"] = our_fips
        merged = geo_mapper.add_geocode(df, "fips", geo_res, new_col="geo_id")
        if geo_res != "hrr":
            merged["weight"] = 1  # Only HRR requires weight
        # Scale every additive quantity by the crosswalk weight before summing
        for column in ("cumulative_counts", "new_counts", "population"):
            merged[column] = merged[column] * merged["weight"]
        df = merged.drop(["weight"], axis=1)
    df = df.drop("fips", axis=1)
    df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()
    # Rates are derived after aggregation so they use the summed population
    df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
    df["cumulative_prop"] =\
         df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
    return df