def add_county_pop(df: pd.DataFrame, gmpr: GeoMapper):
    """
    Add county populations to the data with special US territory handling.

    Guam, Northern Mariana Islands, American Samoa, and the Virgin Islands are
    reported as megafips rather than actual counties in JHU, so the ordinary
    county-level population join would leave them without a population.  This
    function joins populations for the regular counties and then fills in the
    whole-territory population for those four megafips regions.

    Parameters
    ----------
    df
        DataFrame with county level information and county column named "fips"
    gmpr
        GeoMapper

    Returns
    -------
    Dataframe with added population column
    """
    # Megafips codes for VI, MP, GU, and AS respectively.
    territory_megafips = ["78000", "69000", "66000", "60000"]
    in_territory = df.fips.isin(territory_megafips)

    # Territory rows: map megafips -> state id, attach the state-level
    # population, then drop the intermediate state_id column.
    territory_rows = gmpr.add_geocode(df[in_territory], "fips", "state_id")
    territory_rows = gmpr.add_population_column(
        territory_rows, "state_id", dropna=False
    ).drop(columns="state_id")

    # All other rows get the straightforward county-level population join.
    county_rows = gmpr.add_population_column(
        df[~in_territory], "fips", dropna=False
    )

    return pd.concat([county_rows, territory_rows], ignore_index=True)
def update_sensor(
        state_files: List[str],
        mmwr_info: pd.DataFrame,
        output_path: str,
        start_date: datetime,
        end_date: datetime) -> pd.DataFrame:
    """
    Generate sensor values, and write to csv format.

    Args:
        state_files: List of JSON files representing COVID-NET hospitalization
            data for each state
        mmwr_info: Mappings from MMWR week to actual dates, as a pd.DataFrame
        output_path: Path to write the csvs to
        start_date: First sensor date (datetime.datetime)
        end_date: Last sensor date (datetime.datetime)

    Returns:
        The overall pd.DataFrame after all processing

    Raises:
        ValueError: If start_date is not strictly before end_date, or if the
            processed data contains duplicate (date, geo_id) pairs.
    """
    # Explicit raises instead of `assert` so validation still runs under
    # `python -O` (asserts are stripped by the optimizer).
    if start_date >= end_date:
        raise ValueError("start_date must be before end_date")

    # Combine and format hospitalizations dataframe
    hosp_df = CovidNet.read_all_hosp_data(state_files)
    hosp_df = hosp_df.merge(mmwr_info, how="left",
                            left_on=["mmwr-year", "mmwr-week"],
                            right_on=["year", "weeknumber"])

    # Select relevant columns and standardize naming
    hosp_df = hosp_df.loc[:, APIConfig.HOSP_RENAME_COLS.keys()]\
        .rename(columns=APIConfig.HOSP_RENAME_COLS)

    # Restrict to start and end date (start inclusive, end exclusive)
    hosp_df = hosp_df[
        (hosp_df["date"] >= start_date) & (hosp_df["date"] < end_date)
    ]

    # Set state id to two-letter abbreviation
    gmpr = GeoMapper()
    hosp_df = gmpr.add_geocode(hosp_df,
                               from_col=APIConfig.STATE_COL,
                               from_code="state_name",
                               new_code="state_id",
                               dropna=False)
    # To use the original column name, reassign original column and drop new one
    hosp_df[APIConfig.STATE_COL] = hosp_df["state_id"]
    hosp_df.drop("state_id", axis=1, inplace=True)

    # Duplicate (date, geo_id) pairs would silently corrupt the index below.
    if hosp_df.duplicated(["date", "geo_id"]).any():
        raise ValueError("Non-unique (date, geo_id) pairs")
    hosp_df.set_index(["date", "geo_id"], inplace=True)

    # Fill in remaining expected columns (this source has no se / sample size)
    hosp_df["se"] = np.nan
    hosp_df["sample_size"] = np.nan

    # Write one csv per signal, adding the "wip_" prefix where configured
    signals = add_prefix(SIGNALS,
                         wip_signal=read_params()["wip_signal"],
                         prefix="wip_")
    for signal in signals:
        write_to_csv(hosp_df, signal, output_path)

    return hosp_df
def geo_map(df: pd.DataFrame, geo_res: str, sensor: str):
    """
    Map a DataFrame with county level data and aggregate it to the geographic
    resolution geo_res.

    Parameters
    ----------
    df: pd.DataFrame
        Columns: fips, timestamp, new_counts, cumulative_counts, population ...
    geo_res: str
        Geographic resolution to which to aggregate.  Valid options:
        ("county", "state", "msa", "hrr").
    sensor: str
        sensor type. Valid options:
        ("new_counts", "cumulative_counts", "incidence", "cumulative_prop")

    Returns
    -------
    pd.DataFrame
        Columns: geo_id, timestamp, ...

    Raises
    ------
    ValueError
        If geo_res is not one of VALID_GEO_RES.
    """
    if geo_res not in VALID_GEO_RES:
        raise ValueError(f"geo_res must be one of {VALID_GEO_RES}")

    # State-level records unassigned to specific counties are coded as fake
    # counties with fips XX000.
    unassigned_counties = df[df["fips"].str.endswith("000")].copy()
    df = df[df["fips"].astype(int) % 1000 != 0].copy()

    # Disburse unallocated cases/deaths in NYC to NYC counties
    df = disburse(df, NYC_FIPS[0][0], NYC_FIPS[0][1])
    df = df[df["fips"] != NYC_FIPS[0][0]]

    geo_mapper = GeoMapper()
    if geo_res == "county":
        if sensor not in PROP_SENSORS:
            # It is not clear how to calculate the proportion for unallocated
            # cases/deaths, so we exclude them for those sensors.
            # pd.concat replaces DataFrame.append, which was deprecated in
            # pandas 1.4 and removed in pandas 2.0.
            df = pd.concat([df, unassigned_counties])
        df["geo_id"] = df["fips"]
    elif geo_res == "state":
        # Grab first two digits of fips
        # Map state fips to us postal code
        # Add unallocated cases/deaths
        df = pd.concat([df, unassigned_counties])
        df = geo_mapper.add_geocode(df, "fips", "state_id", new_col="geo_id")

        # Zero out the state FIPS population to avoid double counting.
        df = df.set_index("fips")
        state_fips_codes = {str(x).zfill(2) + "000" for x in range(1, 73)}
        subset_state_fips_codes = set(df.index.values) & state_fips_codes
        # .loc needs a list-like indexer; a raw set raises in newer pandas.
        df.loc[list(subset_state_fips_codes), "population"] = 0
        df = df.reset_index()
    else:
        # Map "missing" secondary FIPS to those that are in our canonical set
        for fips, fips_list in SECONDARY_FIPS:
            df = disburse(df, fips, fips_list)
        for usafacts_fips, our_fips in REPLACE_FIPS:
            df.loc[df["fips"] == usafacts_fips, "fips"] = our_fips
        merged = geo_mapper.add_geocode(df, "fips", geo_res, new_col="geo_id")
        if geo_res != "hrr":
            merged["weight"] = 1  # Only HRR requires weight
        # Scale counts and population by the (possibly fractional) mapping weight
        merged["cumulative_counts"] =\
            merged["cumulative_counts"] * merged["weight"]
        merged["new_counts"] = merged["new_counts"] * merged["weight"]
        merged["population"] = merged["population"] * merged["weight"]
        df = merged.drop(["weight"], axis=1)

    # Aggregate to the target resolution and derive the proportion signals
    df = df.drop("fips", axis=1)
    df = df.groupby(["geo_id", "timestamp"]).sum().reset_index()
    df["incidence"] = df["new_counts"] / df["population"] * INCIDENCE_BASE
    df["cumulative_prop"] =\
        df["cumulative_counts"] / df["population"] * INCIDENCE_BASE
    return df