Beispiel #1
0
def run_module(params):
    """
    Runs the indicator

    Arguments
    --------
    params:  Dict[str, Any]
        Nested dictionary of parameters.
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    mapper = GeoMapper()
    run_stats = []
    ## build the base version of the signal at the most detailed geo level you can get.
    ## compute stuff here or farm out to another function or file
    all_data = pd.DataFrame(
        columns=["timestamp", "val", "zip", "sample_size", "se"])
    ## aggregate & smooth
    ## TODO: add num/prop variations if needed
    for sensor, smoother, geo in product(SIGNALS, SMOOTHERS, GEOS):
        df = mapper.replace_geocode(all_data,
                                    "zip",
                                    geo,
                                    new_col="geo_id",
                                    date_col="timestamp")
        ## TODO: recompute sample_size, se here if not NA
        df["val"] = df[["geo_id", "val"]].groupby("geo_id")["val"].transform(
            smoother[0].smooth)
        sensor_name = sensor + smoother[
            1]  ## TODO: +num/prop variation if used
        # don't export first 6 days for smoothed signals since they'll be nan.
        start_date = min(df.timestamp) + timedelta(6) if smoother[1] else min(
            df.timestamp)
        dates = create_export_csv(df,
                                  params["common"]["export_dir"],
                                  geo,
                                  sensor_name,
                                  start_date=start_date)
        if len(dates) > 0:
            run_stats.append((max(dates), len(dates)))
    ## log this indicator run
    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    min_max_date = run_stats and min(s[0] for s in run_stats)
    csv_export_count = sum(s[-1] for s in run_stats)
    max_lag_in_days = min_max_date and (datetime.now() - min_max_date).days
    formatted_min_max_date = min_max_date and min_max_date.strftime("%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_min_max_date)
Beispiel #2
0
    def test_good_file(self):
        df = pull_nchs_mortality_data(TOKEN, "test_data.csv")

        # Test columns
        assert (df.columns.values == [
            'covid_19_deaths', 'total_deaths', 'percent_of_expected_deaths',
            'pneumonia_deaths', 'pneumonia_and_covid_19_deaths',
            'influenza_deaths', 'pneumonia_influenza_or_covid_19_deaths',
            "timestamp", "geo_id", "population"
        ]).all()

        # Test aggregation for NYC and NY
        raw_df = pd.read_csv("./test_data/test_data.csv",
                             parse_dates=["start_week"])
        raw_df = standardize_columns(raw_df)
        for metric in METRICS:
            ny_list = raw_df.loc[(raw_df["state"] == "New York")
                                 & (raw_df[metric].isnull()),
                                 "timestamp"].values
            nyc_list = raw_df.loc[(raw_df["state"] == "New York City")
                                  & (raw_df[metric].isnull()),
                                  "timestamp"].values
            final_list = df.loc[(df["geo_id"] == "ny")
                                & (df[metric].isnull()), "timestamp"].values
            assert set(final_list) == set(ny_list).intersection(set(nyc_list))

        # Test missing value
        gmpr = GeoMapper()
        state_ids = pd.DataFrame(list(gmpr.get_geo_values("state_id")))
        state_names = gmpr.replace_geocode(state_ids,
                                           "state_id",
                                           "state_name",
                                           from_col=0,
                                           date_col=None)
        for state, geo_id in zip(state_names, state_ids):
            if state in set(["New York", "New York City"]):
                continue
            for metric in METRICS:
                test_list = raw_df.loc[(raw_df["state"] == state)
                                       & (raw_df[metric].isnull()),
                                       "timestamp"].values
                final_list = df.loc[(df["geo_id"] == geo_id)
                                    & (df[metric].isnull()),
                                    "timestamp"].values
                assert set(final_list) == set(test_list)