Example no. 1
def convert_geo(df: pd.DataFrame, geo: str, gmpr: GeoMapper) -> pd.DataFrame:
    """
    Map a DataFrame to desired regions.

    The HHS facility level data contains columns for zip, state, and fips. For state and fips, we
    use them as given. For all other geos, we map from zip (the smallest of the regions) to the
    desired geo.

    Parameters
    ----------
    df: pd.DataFrame
        Input DataFrame containing zip, state, and fips columns.
    geo:
        Desired new geographic resolution.
    gmpr:
        GeoMapper object.

    Returns
    -------
    DataFrame containing new geography column `geo_id` in the `geo` resolution.
    """
    if geo == "county":
        output_df = df.copy()
        output_df["geo_id"] = output_df["fips_code"]
    elif geo == "state":
        output_df = df.copy()
        output_df["geo_id"] = output_df["state"]
    elif geo == "hrr":  # use zip for HRR since zips nest within HRR while FIPS split across HRRs.
        output_df = gmpr.add_geocode(df, "zip", geo)
        output_df["geo_id"] = output_df[geo]
    else:
        output_df = gmpr.add_geocode(df, "fips", geo, from_col="fips_code")
        output_df["geo_id"] = output_df[geo]
    return output_df
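A minimal usage sketch for the function above, assuming delphi_utils provides GeoMapper as imported elsewhere in these examples; the toy rows and the resulting geo_id values are illustrative only, since the actual mapping depends on GeoMapper's crosswalk data.

import pandas as pd
from delphi_utils.geomap import GeoMapper

# Toy frame with the three columns convert_geo expects (values are made up).
toy = pd.DataFrame({
    "state": ["ma"],
    "fips_code": ["25013"],
    "zip": ["01001"],
})
gmpr = GeoMapper()
for geo in ["county", "state", "hrr", "msa"]:
    print(geo, convert_geo(toy, geo, gmpr)["geo_id"].tolist())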
Example no. 2
def aggregate(df, signal_names, geo_resolution='county'):
    """Aggregate signals to appropriate resolution and produce standard errors.

    Parameters
    ----------
    df: pd.DataFrame
        County block group-level data with prepared signals (output of
        construct_signals()).
    signal_names: List[str]
        Names of signals to be exported.
    geo_resolution: str
        One of GEO_RESOLUTIONS: 'county', 'state', 'msa', 'hrr', 'hhs', or 'nation'.

    Returns
    -------
    pd.DataFrame:
        DataFrame with one row per geo_id, with columns for the individual
        signals, standard errors, and sample sizes.
    """
    # Prepare geo resolution
    gmpr = GeoMapper()
    if geo_resolution == 'county':
        geo_transformed_df = df.copy()
        geo_transformed_df['geo_id'] = df['county_fips']
    elif geo_resolution == 'state':
        geo_transformed_df = gmpr.add_geocode(df,
                                              from_col='county_fips',
                                              from_code='fips',
                                              new_code='state_id',
                                              new_col='geo_id',
                                              dropna=False)
    elif geo_resolution in ['msa', 'nation', 'hrr', 'hhs']:
        geo_transformed_df = gmpr.add_geocode(df,
                                              from_col='county_fips',
                                              from_code='fips',
                                              new_code=geo_resolution,
                                              new_col='geo_id',
                                              dropna=False)

    else:
        raise ValueError(
            f'`geo_resolution` must be one of {GEO_RESOLUTIONS}.')

    # Aggregation and signal creation
    grouped_df = geo_transformed_df.groupby(['geo_id'])[signal_names]
    df_mean = grouped_df.mean()
    df_sd = grouped_df.std()
    df_n = grouped_df.count()
    agg_df = df_mean.join(df_sd, lsuffix='_mean', rsuffix='_sd')
    agg_df = agg_df.join(df_n.rename({
        signal: signal + '_n' for signal in signal_names
    }, axis=1))
    for signal in signal_names:
        agg_df[f'{signal}_se'] = (agg_df[f'{signal}_sd']
                                  / np.sqrt(agg_df[f'{signal}_n']))
    return agg_df.reset_index()
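A self-contained sketch of the aggregation arithmetic above: the per-geo mean, standard deviation, and count combine into a standard error of the mean, se = sd / sqrt(n). The column names here are invented for illustration.

import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "geo_id": ["a", "a", "a", "b", "b"],
    "sig": [1.0, 2.0, 3.0, 10.0, 14.0],
})
grouped = toy.groupby("geo_id")["sig"]
out = pd.DataFrame({
    "sig_mean": grouped.mean(),
    "sig_sd": grouped.std(),
    "sig_n": grouped.count(),
})
out["sig_se"] = out["sig_sd"] / np.sqrt(out["sig_n"])
print(out.reset_index())  # e.g. geo "a": mean 2.0, sd 1.0, n 3, se 1/sqrt(3)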
Example no. 3
def run_module(params):
    """
    Runs the indicator

    Arguments
    --------
    params:  Dict[str, Any]
        Nested dictionary of parameters.
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    mapper = GeoMapper()
    run_stats = []
    ## build the base version of the signal at the most detailed geo level you can get.
    ## compute stuff here or farm out to another function or file
    all_data = pd.DataFrame(
        columns=["timestamp", "val", "zip", "sample_size", "se"])
    ## aggregate & smooth
    ## TODO: add num/prop variations if needed
    for sensor, smoother, geo in product(SIGNALS, SMOOTHERS, GEOS):
        df = mapper.replace_geocode(all_data,
                                    "zip",
                                    geo,
                                    new_col="geo_id",
                                    date_col="timestamp")
        ## TODO: recompute sample_size, se here if not NA
        df["val"] = df[["geo_id", "val"]].groupby("geo_id")["val"].transform(
            smoother[0].smooth)
        sensor_name = sensor + smoother[1]  ## TODO: +num/prop variation if used
        # don't export first 6 days for smoothed signals since they'll be nan.
        start_date = min(df.timestamp) + timedelta(6) if smoother[1] else min(df.timestamp)
        dates = create_export_csv(df,
                                  params["common"]["export_dir"],
                                  geo,
                                  sensor_name,
                                  start_date=start_date)
        if len(dates) > 0:
            run_stats.append((max(dates), len(dates)))
    ## log this indicator run
    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    min_max_date = run_stats and min(s[0] for s in run_stats)
    csv_export_count = sum(s[-1] for s in run_stats)
    max_lag_in_days = min_max_date and (datetime.now() - min_max_date).days
    formatted_min_max_date = min_max_date and min_max_date.strftime("%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_min_max_date)
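A minimal, pandas-only sketch of the groupby/transform smoothing pattern used above. The project's SMOOTHERS objects are not shown in this excerpt, so a trailing 7-day rolling mean stands in for smoother[0].smooth; with min_periods=7 the first 6 days come out NaN, which is why the export above skips them.

import pandas as pd

toy = pd.DataFrame({
    "geo_id": ["x"] * 10,
    "timestamp": pd.date_range("2020-02-01", periods=10),
    "val": range(10),
})
# Stand-in for smoother[0].smooth: a trailing 7-day mean within each geo.
toy["val"] = toy.groupby("geo_id")["val"].transform(
    lambda s: s.rolling(7, min_periods=7).mean())
print(toy)  # the first 6 rows of "val" are NaN, matching the comment above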
Example no. 4
def run_module(params) -> None:
    """
    Run entire hhs_facilities indicator.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    raw_df = pull_data()
    gmpr = GeoMapper()
    filled_fips_df = fill_missing_fips(raw_df, gmpr)
    for geo, (sig_name, sig_cols, sig_func,
              sig_offset) in product(GEO_RESOLUTIONS, SIGNALS):
        mapped_df = convert_geo(filled_fips_df, geo, gmpr)
        output_df = generate_signal(mapped_df, sig_cols, sig_func, sig_offset)
        create_export_csv(output_df, params["common"]["export_dir"], geo,
                          sig_name)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
Example no. 5
 def test_hrr_msa(self):
     """Tests that values are correctly aggregated at the HRR and MSA level."""
     df = pd.DataFrame({
         "fips": ["13009", "13017", "13021", "09015"],
         "timestamp":
         ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"],
         "new_counts": [10, 15, 2, 13],
         "cumulative_counts": [100, 20, 45, 60],
     })
     hrr_df = geo_map(df, "hrr", SENSOR)
     msa_df = geo_map(df, "msa", SENSOR)
     assert msa_df.shape == (2, 7)
     gmpr = GeoMapper()
     df = gmpr.add_population_column(df, "fips")
     assert np.isclose(hrr_df.new_counts.sum(), df.new_counts.sum())
     assert np.isclose(hrr_df.population.sum(), df.population.sum())
     assert hrr_df.shape == (5, 7)
Example no. 6
    def test_good_file(self):
        df = pull_nchs_mortality_data(TOKEN, "test_data.csv")

        # Test columns
        assert (df.columns.values == [
            'covid_19_deaths', 'total_deaths', 'percent_of_expected_deaths',
            'pneumonia_deaths', 'pneumonia_and_covid_19_deaths',
            'influenza_deaths', 'pneumonia_influenza_or_covid_19_deaths',
            "timestamp", "geo_id", "population"
        ]).all()

        # Test aggregation for NYC and NY
        raw_df = pd.read_csv("./test_data/test_data.csv",
                             parse_dates=["start_week"])
        raw_df = standardize_columns(raw_df)
        for metric in METRICS:
            ny_list = raw_df.loc[(raw_df["state"] == "New York")
                                 & (raw_df[metric].isnull()),
                                 "timestamp"].values
            nyc_list = raw_df.loc[(raw_df["state"] == "New York City")
                                  & (raw_df[metric].isnull()),
                                  "timestamp"].values
            final_list = df.loc[(df["geo_id"] == "ny")
                                & (df[metric].isnull()), "timestamp"].values
            assert set(final_list) == set(ny_list).intersection(set(nyc_list))

        # Test missing value
        gmpr = GeoMapper()
        state_ids = pd.DataFrame(list(gmpr.get_geo_values("state_id")))
        state_names = gmpr.replace_geocode(state_ids,
                                           "state_id",
                                           "state_name",
                                           from_col=0,
                                           date_col=None)
        for state, geo_id in zip(state_names, state_ids):
            if state in set(["New York", "New York City"]):
                continue
            for metric in METRICS:
                test_list = raw_df.loc[(raw_df["state"] == state)
                                       & (raw_df[metric].isnull()),
                                       "timestamp"].values
                final_list = df.loc[(df["geo_id"] == geo_id)
                                    & (df[metric].isnull()),
                                    "timestamp"].values
                assert set(final_list) == set(test_list)
Example no. 7
    def test_county(self):
        """Tests that values are correctly aggregated at the county level."""
        df = pd.DataFrame({
            "fips": ["53003", "48027", "50103"],
            "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15"],
            "new_counts": [10, 15, 2],
            "cumulative_counts": [100, 20, 45],
        })
        new_df = geo_map(df, "county", SENSOR)
        gmpr = GeoMapper()
        df = gmpr.add_population_column(df, "fips")
        exp_incidence = df["new_counts"] / df["population"] * 100000
        exp_cprop = df["cumulative_counts"] / df["population"] * 100000

        assert set(new_df["geo_id"].values) == set(df["fips"].values)
        assert set(new_df["timestamp"].values) == set(df["timestamp"].values)
        assert set(new_df["incidence"].values) == set(exp_incidence.values)
        assert set(new_df["cumulative_prop"].values) == set(exp_cprop.values)
Example no. 8
 def __init__(self):
     """Create the underlying GeoMapper."""
     self.gmpr = GeoMapper()
     self.geo_func = {
         "county":
         partial(self.county_to_megacounty,
                 threshold_visits=Config.MIN_RECENT_VISITS,
                 threshold_len=Config.RECENT_LENGTH),
         "state":
         self.county_to_state,
         "msa":
         self.county_to_msa,
         "hrr":
         self.county_to_hrr,
         "hhs":
         self.county_to_hhs,
         "nation":
         self.county_to_nation
     }
Example no. 9
    def test_fill_missing_fips(self):
        gmpr = GeoMapper()
        test_input = pd.DataFrame({
            "hospital_pk": ["test", "test2", "test3"],
            "fips_code": ["fakefips", np.nan, np.nan],
            "zip": ["01001", "01001", "00601"],
            "val1": [1.0, 5.0, 10.0],
            "val2": [2.0, 25.0, 210.0]
        })
        expected = pd.DataFrame({
            "hospital_pk": ["test", "test2", "test3", "test3"],
            "fips_code": ["fakefips", "25013", "72001", "72141"],
            "zip": ["01001", "01001", "00601", "00601"],
            "val1":
            [1.0, 5.0, 0.994345718901454 * 10, 0.005654281098546042 * 10],
            "val2": [
                2.0, 25.0, 0.994345718901454 * 210.0,
                0.005654281098546042 * 210.0
            ]
        })
        pd.testing.assert_frame_equal(fill_missing_fips(test_input, gmpr),
                                      expected)

        # test all nans stay as nan
        test_input = pd.DataFrame({
            "hospital_pk": ["test", "test2", "test3"],
            "fips_code": ["fakefips", np.nan, np.nan],
            "zip": ["01001", "01001", "00601"],
            "val1": [1.0, 5.0, np.nan],
            "val2": [2.0, 25.0, 210.0]
        })
        expected = pd.DataFrame({
            "hospital_pk": ["test", "test2", "test3", "test3"],
            "fips_code": ["fakefips", "25013", "72001", "72141"],
            "zip": ["01001", "01001", "00601", "00601"],
            "val1": [1.0, 5.0, np.nan, np.nan],
            "val2": [
                2.0, 25.0, 0.994345718901454 * 210.0,
                0.005654281098546042 * 210.0
            ]
        })
        pd.testing.assert_frame_equal(fill_missing_fips(test_input, gmpr),
                                      expected)

        # test that rows with a populated fips, or with both fips and zip nan, pass through unchanged
        test_input_no_missing = pd.DataFrame({
            "hospital_pk": ["test", "test2", "test3", "test4"],
            "fips_code": ["fakefips", "testfips", "pseudofips", np.nan],
            "zip": ["01001", "01001", "00601", np.nan],
            "val": [1.0, 5.0, 10.0, 0.0]
        })
        pd.testing.assert_frame_equal(
            fill_missing_fips(test_input_no_missing, gmpr),
            test_input_no_missing)
Example no. 10
def run_module() -> None:
    """Run entire hhs_facilities indicator."""
    params = read_params()
    raw_df = pull_data()
    gmpr = GeoMapper()
    filled_fips_df = fill_missing_fips(raw_df, gmpr)
    for geo, (sig_name, sig_cols, sig_func,
              sig_offset) in product(GEO_RESOLUTIONS, SIGNALS):
        mapped_df = convert_geo(filled_fips_df, geo, gmpr)
        output_df = generate_signal(mapped_df, sig_cols, sig_func, sig_offset)
        create_export_csv(output_df, params["export_dir"], geo, sig_name)
Example no. 11
def run_module():
    """Generate ground truth HHS hospitalization data."""
    params = read_params()
    mapper = GeoMapper()
    request_all_states = ",".join(mapper.get_geo_values("state_id"))

    today = date.today()
    past_reference_day = date(year=2020, month=1,
                              day=1)  # first available date in DB
    date_range = generate_date_ranges(past_reference_day, today)
    dfs = []
    for r in date_range:
        response = Epidata.covid_hosp(request_all_states, r)
        if response['result'] != 1:
            raise Exception(f"Bad result from Epidata: {response['message']}")
        dfs.append(pd.DataFrame(response['epidata']))
    all_columns = pd.concat(dfs)

    for sig in SIGNALS:
        create_export_csv(make_signal(all_columns, sig), params["export_dir"],
                          "state", sig)
Example no. 12
def pull_data() -> pd.DataFrame:
    """
    Pull HHS data from Epidata API for all states and dates and convert to a DataFrame.

    Returns
    -------
    DataFrame of HHS data.
    """
    today = int(date.today().strftime("%Y%m%d"))
    past_reference_day = int(date(
        2020, 1, 1).strftime("%Y%m%d"))  # first available date in DB
    all_states = GeoMapper().get_geo_values("state_id")
    responses = pull_data_iteratively(all_states,
                                      Epidata.range(past_reference_day, today))
    all_columns = pd.DataFrame(responses).replace(NAN_VALUES, np.nan)
    all_columns["timestamp"] = pd.to_datetime(all_columns["collection_week"],
                                              format="%Y%m%d")
    return all_columns
Example no. 13
def fill_missing_fips(df: pd.DataFrame, gmpr: GeoMapper) -> pd.DataFrame:
    """
    Fill in missing FIPS code if zip is present.

    Maps rows that have the FIPS missing but zip present. The rest of the rows,
    including those where both FIPS and zip are nan, are kept as is and appended back at the end.
    Rows with a zip which fail to map to a FIPS are also kept so that column totals remain equal.
    This means that column sums before and after imputation should be identical, and any dropping
    of values is handled by downstream geomapping.

    TODO #636 Generalize this function to geomapper.

    Parameters
    ----------
    df: pd.DataFrame
        Input DataFrame containing zip and fips columns.
    gmpr:
        GeoMapper object.

    Returns
    -------
    DataFrame with missing FIPS imputed with zip.
    """
    mask = pd.isna(df["fips_code"]) & ~pd.isna(df["zip"])
    no_fips = df[mask]
    fips_present = df[~mask]
    no_data_cols = [
        c for c in df.columns
        if df[c].dtypes not in (dtype("int64"), dtype("float64"))
    ]
    data_cols = list(set(df.columns) - set(no_data_cols))
    added_fips = gmpr.add_geocode(no_fips, "zip", "fips", dropna=False)
    added_fips["fips_code"] = added_fips["fips"]
    # set weight of unmapped zips to 1 so they don't zero out all the values when multiplied
    added_fips.weight.fillna(1, inplace=True)
    added_fips[data_cols] = added_fips[data_cols].multiply(
        added_fips["weight"], axis=0)
    fips_filled = added_fips.groupby(no_data_cols,
                                     dropna=False,
                                     as_index=False).sum(min_count=1)
    fips_filled.drop(columns="weight", inplace=True)
    return pd.concat([fips_present, fips_filled]).reset_index(drop=True)
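A self-contained sketch of the weight-and-regroup step above, using made-up weights: a row whose zip maps to two FIPS codes is split by weight, and the pieces sum back to the original value, which is why column totals are preserved.

import pandas as pd

# One facility whose zip maps to two counties with (hypothetical) weights 0.7 / 0.3.
toy = pd.DataFrame({
    "hospital_pk": ["h1", "h1"],
    "fips_code": ["72001", "72141"],
    "weight": [0.7, 0.3],
    "val": [10.0, 10.0],
})
toy["val"] = toy["val"] * toy["weight"]
split = toy.groupby(["hospital_pk", "fips_code"], as_index=False)["val"].sum()
print(split)               # val is 7.0 and 3.0
print(split["val"].sum())  # 10.0, the original total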
Example no. 14
def run_module(params):
    """
    Generate ground truth HHS hospitalization data.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_filename" (optional): str, name of file to write logs
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    mapper = GeoMapper()
    request_all_states = ",".join(mapper.get_geo_values("state_id"))
    today = date.today()
    past_reference_day = date(year=2020, month=1,
                              day=1)  # first available date in DB
    date_range = generate_date_ranges(past_reference_day, today)
    dfs = []
    for r in date_range:
        response = Epidata.covid_hosp(request_all_states, r)
        # The last date range might only have recent days that don't have any data, so don't error.
        if response["result"] != 1 and r != date_range[-1]:
            raise Exception(f"Bad result from Epidata: {response['message']}")
        if response["result"] == -2 and r == date_range[
                -1]:  # -2 code means no results
            continue
        dfs.append(pd.DataFrame(response['epidata']))
    all_columns = pd.concat(dfs)

    geo_mapper = GeoMapper()

    for sig in SIGNALS:
        state = geo_mapper.add_geocode(make_signal(all_columns, sig),
                                       "state_id",
                                       "state_code",
                                       from_col="state")
        for geo in GEOS:
            create_export_csv(make_geo(state, geo, geo_mapper),
                              params["common"]["export_dir"], geo, sig)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
Example no. 15
 def test_convert_geo(self):
     gmpr = GeoMapper()
     test_input = pd.DataFrame({
         "state": ["test"],
         "fips_code": ["01001"],
         "zip": ["01001"],
     })
     test_state_output = convert_geo(test_input, "state", gmpr)
     pd.testing.assert_series_equal(test_state_output.geo_id,
                                    pd.Series(["test"]),
                                    check_names=False)
     test_county_output = convert_geo(test_input, "county", gmpr)
     pd.testing.assert_series_equal(test_county_output.geo_id,
                                    pd.Series(["01001"]),
                                    check_names=False)
     test_msa_output = convert_geo(test_input, "msa", gmpr)
     pd.testing.assert_series_equal(test_msa_output.geo_id,
                                    pd.Series(["33860"]),
                                    check_names=False)
     test_hrr_output = convert_geo(test_input, "hrr", gmpr)
     pd.testing.assert_series_equal(test_hrr_output.geo_id,
                                    pd.Series(["230"]),
                                    check_names=False)
Example no. 16
def test_make_geo():
    """Check that geographies transform correctly."""
    test_timestamp = datetime(year=2020, month=1, day=1)
    geo_mapper = GeoMapper()
    
    data = pd.DataFrame({
        'state': ['PA','WV','OH'],
        'state_code': [42, 54, 39],
        'timestamp': [test_timestamp]*3,
        'val': [1, 2, 4],
    })

    template = {
        'se': np.nan,
        'sample_size': np.nan,
    }
    expecteds = {
        "state": pd.DataFrame(
            dict(template,
                 geo_id=data.state,
                 timestamp=data.timestamp,
                 val=data.val)),
        "hhs": pd.DataFrame(
            dict(template,
                 geo_id=['3', '5'],
                 timestamp=[test_timestamp]*2,
                 val=[3, 4])),
        "nation": pd.DataFrame(
            dict(template,
                 geo_id=['us'],
                 timestamp=[test_timestamp],
                 val=[7]))
    }
    for geo, expected in expecteds.items():
        result = make_geo(data, geo, geo_mapper)
        for series in ["geo_id", "timestamp", "val", "se", "sample_size"]:
            pd.testing.assert_series_equal(expected[series], result[series], obj=f"{geo}:{series}")
def geomapper():
    return GeoMapper()
Example no. 18
class GeoMaps:
    """Class to map counties to other geographic resolutions."""
    def __init__(self):
        """Create the underlying GeoMapper."""
        self.gmpr = GeoMapper()
        self.geo_func = {
            "county":
            partial(self.county_to_megacounty,
                    threshold_visits=Config.MIN_RECENT_VISITS,
                    threshold_len=Config.RECENT_LENGTH),
            "state":
            self.county_to_state,
            "msa":
            self.county_to_msa,
            "hrr":
            self.county_to_hrr,
            "hhs":
            self.county_to_hhs,
            "nation":
            self.county_to_nation
        }

    @staticmethod
    def convert_fips(x):
        """Ensure fips is a string of length 5."""
        return str(x).zfill(5)

    def county_to_msa(self, data):
        """Aggregate county data to the msa resolution.

        Args:
            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)

        Returns: tuple of dataframe at the daily-msa resolution, and the geo_id column name
        """
        data = self.gmpr.add_geocode(data,
                                     "fips",
                                     "msa",
                                     from_col="PatCountyFIPS",
                                     new_col="cbsa_id")
        data.drop(columns="PatCountyFIPS", inplace=True)
        data = data.groupby(["ServiceDate", "cbsa_id"]).sum().reset_index()

        return data.groupby("cbsa_id"), "cbsa_id"

    def county_to_state(self, data):
        """Aggregate county data to the state resolution.

        Args:
            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)

        Returns: tuple of dataframe at the daily-state resolution, and geo_id column name
        """
        data = self.gmpr.add_geocode(data,
                                     "fips",
                                     "state_id",
                                     from_col="PatCountyFIPS")
        data.drop(columns="PatCountyFIPS", inplace=True)
        data = data.groupby(["ServiceDate", "state_id"]).sum().reset_index()

        return data.groupby("state_id"), "state_id"

    def county_to_hhs(self, data):
        """Aggregate county data to the HHS region resolution.

        Args:
            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)

        Returns: tuple of dataframe at the daily-HHS resolution, and geo_id column name
        """
        data = self.gmpr.add_geocode(data,
                                     "fips",
                                     "hhs",
                                     from_col="PatCountyFIPS")
        data.drop(columns="PatCountyFIPS", inplace=True)
        data = data.groupby(["ServiceDate", "hhs"]).sum().reset_index()

        return data.groupby("hhs"), "hhs"

    def county_to_nation(self, data):
        """Aggregate county data to the nation resolution.

        Args:
            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)

        Returns: tuple of dataframe at the daily-nation resolution, and geo_id column name
        """
        data = self.gmpr.add_geocode(data,
                                     "fips",
                                     "nation",
                                     from_col="PatCountyFIPS")
        data.drop(columns="PatCountyFIPS", inplace=True)
        data = data.groupby(["ServiceDate", "nation"]).sum().reset_index()

        return data.groupby("nation"), "nation"

    def county_to_hrr(self, data):
        """Aggregate county data to the HRR resolution.

        Note that counties are not strictly contained within HRRs. When a county
        spans boundaries, we report it with the same rate in each containing HRR,
        but with a sample size weighted by how much it overlaps that HRR.

        Args:
            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)

        Returns:
            tuple of (data frame at daily-HRR resolution, geo_id column name)

        """
        data = self.gmpr.add_geocode(data,
                                     "fips",
                                     "hrr",
                                     from_col="PatCountyFIPS")
        data.drop(columns="PatCountyFIPS", inplace=True)

        ## do a weighted sum by the wpop column to get each HRR's contribution
        tmp = data.groupby(["ServiceDate", "hrr"])
        wtsum = lambda g: g["weight"].values @ g[Config.COUNT_COLS]
        data = tmp.apply(wtsum).reset_index()

        return data.groupby("hrr"), "hrr"

    def county_to_megacounty(self, data, threshold_visits, threshold_len):
        """Convert to megacounty and groupby FIPS using GeoMapper package.

        Args:
            data: dataframe aggregated to the daily-county resolution (all 7 cols expected)
            threshold_visits: count threshold to determine when to convert to megacounty.
            threshold_len: number of days to use when thresholding.

        Returns: tuple of dataframe at the daily-state resolution, and geo_id column name
        """
        all_data = self.gmpr.fips_to_megacounty(data,
                                                threshold_visits,
                                                threshold_len,
                                                fips_col="PatCountyFIPS",
                                                thr_col="Denominator",
                                                date_col="ServiceDate")
        all_data.rename({"megafips": "PatCountyFIPS"}, axis=1, inplace=True)
        megacounties = all_data[all_data.PatCountyFIPS.str.endswith("000")]
        data = pd.concat([data, megacounties])

        return data.groupby("PatCountyFIPS"), "PatCountyFIPS"
Example no. 19
def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
    """Pull the latest NCHS Mortality data, and conforms it into a dataset.

    The output dataset has:

    - Each row corresponds to (State, Week), denoted (geo_id, timestamp)
    - Each row additionally has columns 'covid_deaths', 'total_deaths',
       'percent_of_expected_deaths', 'pneumonia_deaths',
       'pneumonia_and_covid_deaths', 'influenza_deaths',
       'pneumonia_influenza_or_covid_19_deaths' correspond to the aggregate
       metric from Feb. 1st until the latest date.

    # New York City would be included in New York State

    Parameters
    ----------
    token: str
        Socrata app token used to pull the NCHS mortality data
    test_file: Optional[str]
        When not null, name of file from which to read test data

    Returns
    -------
    pd.DataFrame
        Dataframe as described above.
    """
    # Constants
    keep_columns = METRICS.copy()
    type_dict = {key: float for key in keep_columns}
    type_dict["timestamp"] = 'datetime64[ns]'

    if test_file:
        df = pd.read_csv("./test_data/%s"%test_file)
    else:
        # Pull data from Socrata API
        client = Socrata("data.cdc.gov", token)
        results = client.get("r8kw-7aab", limit=10**10)
        df = pd.DataFrame.from_records(results)
        # drop "By Total" rows
        df = df[df["group"].transform(str.lower) == "by week"]

    df = standardize_columns(df)

    if "end_date" in df.columns:
        # Check that week_ending_date always equals end_date
        try:
            assert all(df["week_ending_date"] == df["end_date"])
        except AssertionError as exc:
            raise ValueError(
                "week_ending_date is not always the same as end_date, check the raw file"
            ) from exc
    else:
        # Check that start_week (timestamp) always equals end_week
        try:
            assert all(df["timestamp"] == df["end_week"])
        except AssertionError as exc:
            raise ValueError(
                "end_week is not always the same as start_week, check the raw file"
            ) from exc

    try:
        df = df.astype(type_dict)
    except KeyError as exc:
        raise ValueError(f"""
Expected column(s) missing. The dataset schema may
have changed. Please investigate and amend the code.

Columns needed:
{NEWLINE.join(type_dict.keys())}

Columns available:
{NEWLINE.join(df.columns)}
""") from exc

    # Drop rows for locations outside US
    df = df[df["state"] != "United States"]
    df = df.loc[:, keep_columns + ["timestamp", "state"]].set_index("timestamp")

    # NCHS considers NYC an individual state; however, we want it included
    # in NY. If values are NaN for both NYC and NY, the aggregation should
    # also be NaN.
    df_ny = df.loc[df["state"] == "New York", :].drop("state", axis=1)
    df_nyc = df.loc[df["state"] == "New York City", :].drop("state", axis=1)
    # Get mask to ignore cells where both frames have NaN values
    mask = (df_ny[keep_columns].isnull().values \
            & df_nyc[keep_columns].isnull().values)
    df_ny = pd.concat([df_ny, df_nyc]).groupby("timestamp").sum().where(~mask, np.nan)
    df_ny["state"] = "New York"
    # Drop NYC and NY in the full dataset
    df = df.loc[~df["state"].isin(["New York", "New York City"]), :]
    df = pd.concat([df, df_ny]).reset_index().sort_values(["state", "timestamp"])
    # Add population info
    keep_columns.extend(["timestamp", "geo_id", "population"])
    gmpr = GeoMapper()
    df = gmpr.add_population_column(df, "state_name", geocode_col="state")
    df = gmpr.add_geocode(df, "state_name", "state_id", from_col="state", new_col="geo_id")
    return df[keep_columns]
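A small sketch of the masked aggregation used above for New York: NY and NYC rows are summed per week, but weeks where both are NaN stay NaN instead of collapsing to 0 (toy frame with a single metric).

import numpy as np
import pandas as pd

weeks = pd.to_datetime(["2020-02-01", "2020-02-08"])
ny = pd.DataFrame({"deaths": [5.0, np.nan]}, index=weeks)
nyc = pd.DataFrame({"deaths": [2.0, np.nan]}, index=weeks)
# True where *both* frames are NaN for that week/metric.
mask = ny.isnull().values & nyc.isnull().values
combined = pd.concat([ny, nyc]).groupby(level=0).sum().where(~mask, np.nan)
print(combined)  # 2020-02-01 -> 7.0, 2020-02-08 -> NaN (not 0.0)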
Example no. 20
is only used for visualization.  It sources Puerto Rico from jhu-csse and
everything else from usa-facts.
"""
from datetime import date, timedelta, datetime
from itertools import product
import re
import time

import covidcast
import pandas as pd

from delphi_utils import add_prefix, get_structured_logger
from delphi_utils.geomap import GeoMapper
from .constants import METRICS, SMOOTH_TYPES, SENSORS, GEO_RESOLUTIONS

GMPR = GeoMapper()

COLUMN_MAPPING = {
    "time_value": "timestamp",
    "geo_value": "geo_id",
    "value": "val",
    "stderr": "se",
    "sample_size": "sample_size"
}

EMPTY_FRAME = pd.DataFrame({}, columns=COLUMN_MAPPING.values())

covidcast.covidcast._ASYNC_CALL = True  # pylint: disable=protected-access


def check_none_data_frame(data_frame, label, date_range):