Example 1
def run_module():
    """Produce a combined cases and deaths signal using data from JHU and USA Facts."""
    variants = [
        tuple((metric, geo_res) + sensor_signal(metric, sensor, smoother))
        for (metric, geo_res, sensor, smoother
             ) in product(METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTH_TYPES)
    ]
    params = configure(variants)
    for metric, geo_res, sensor_name, signal in variants:
        df = combine_usafacts_and_jhu(
            signal, geo_res, extend_raw_date_range(params, sensor_name))
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        start_date = pd.to_datetime(params['export_start_date'])
        export_dir = params["export_dir"]
        dates = pd.Series(df[df["timestamp"] >= start_date]
                          ["timestamp"].unique()).sort_values()

        signal_name = add_prefix([signal],
                                 wip_signal=params["wip_signal"],
                                 prefix="wip_")
        for date_ in dates:
            export_fn = f'{date_.strftime("%Y%m%d")}_{geo_res}_{signal_name[0]}.csv'
            df[df["timestamp"] == date_][[
                "geo_id",
                "val",
                "se",
                "sample_size",
            ]].to_csv(f"{export_dir}/{export_fn}", index=False, na_rep="NA")
Example 2
def update_sensor(
        state_files: List[str],
        mmwr_info: pd.DataFrame,
        output_path: str,
        start_date: datetime,
        end_date: datetime) -> pd.DataFrame:
    """
    Generate sensor values, and write to csv format.

    Args:
        state_files: List of JSON files representing COVID-NET hospitalization data for each state
        mmwr_info: Mappings from MMWR week to actual dates, as a pd.DataFrame
        output_path: Path to write the csvs to
        start_date: First sensor date (datetime.datetime)
        end_date: Last sensor date (datetime.datetime)

    Returns:
        The overall pd.DataFrame after all processing
    """
    assert start_date < end_date, "start_date >= end_date"

    # Combine and format hospitalizations dataframe
    hosp_df = CovidNet.read_all_hosp_data(state_files)
    hosp_df = hosp_df.merge(mmwr_info, how="left",
                            left_on=["mmwr-year", "mmwr-week"],
                            right_on=["year", "weeknumber"])

    # Select relevant columns and standardize naming
    hosp_df = hosp_df.loc[:, APIConfig.HOSP_RENAME_COLS.keys()]\
        .rename(columns=APIConfig.HOSP_RENAME_COLS)

    # Restrict to start and end date
    hosp_df = hosp_df[
        (hosp_df["date"] >= start_date) & (
            hosp_df["date"] < end_date)
    ]

    # Set state id to two-letter abbreviation
    gmpr = GeoMapper()
    hosp_df = gmpr.add_geocode(hosp_df,
                               from_col=APIConfig.STATE_COL,
                               from_code="state_name",
                               new_code="state_id",
                               dropna=False)
    # To use the original column name, reassign original column and drop new one
    hosp_df[APIConfig.STATE_COL] = hosp_df["state_id"]
    hosp_df.drop("state_id", axis=1, inplace=True)
    assert not hosp_df.duplicated(["date", "geo_id"]).any(), "Non-unique (date, geo_id) pairs"
    hosp_df.set_index(["date", "geo_id"], inplace=True)

    # Fill in remaining expected columns
    hosp_df["se"] = np.nan
    hosp_df["sample_size"] = np.nan

    # Write results
    signals = add_prefix(SIGNALS, wip_signal=read_params()["wip_signal"], prefix="wip_")
    for signal in signals:
        write_to_csv(hosp_df, signal, output_path)
    return hosp_df
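
A standalone toy sketch (pandas only, synthetic data; not part of the original module) of two steps update_sensor relies on: the half-open [start_date, end_date) window and the uniqueness check on (date, geo_id) before that pair becomes the index.

from datetime import datetime

import numpy as np
import pandas as pd

# Toy hospitalization frame standing in for the merged COVID-NET data.
hosp_df = pd.DataFrame({
    "date": pd.to_datetime(["2020-06-29", "2020-07-06", "2020-07-13"]),
    "geo_id": ["ca", "ca", "ca"],
    "val": [1.2, 3.4, 5.6],
})
start_date, end_date = datetime(2020, 7, 1), datetime(2020, 7, 13)

# Half-open window: start_date is inclusive, end_date is exclusive.
hosp_df = hosp_df[(hosp_df["date"] >= start_date) & (hosp_df["date"] < end_date)]

# Each (date, geo_id) pair must be unique before it becomes the index.
assert not hosp_df.duplicated(["date", "geo_id"]).any(), "Non-unique (date, geo_id) pairs"
hosp_df = hosp_df.set_index(["date", "geo_id"])

# Placeholder columns expected downstream, as in update_sensor above.
hosp_df["se"] = np.nan
hosp_df["sample_size"] = np.nan
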
Example 3
def run_module(params):
    """
    Produce a combined cases and deaths signal using data from JHU and USA Facts.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output.
            - "log_exceptions" (optional): bool, whether to log exceptions to file.
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "export_start_date": list of ints, [year, month, day] format, first day to begin
                data exports from.
            - "date_range": str, YYYYMMDD-YYYYMMDD format, range of dates to generate data for.
            - "source": str, name of combo indicator in metadata.
            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix.
    """
    start_time = time.time()
    variants = [
        tuple((metric, geo_res) + sensor_signal(metric, sensor, smoother))
        for (metric, geo_res, sensor, smoother
             ) in product(METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTH_TYPES)
    ]
    params = configure(variants, params)
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    for metric, geo_res, sensor_name, signal in variants:
        df = combine_usafacts_and_jhu(
            signal, geo_res, extend_raw_date_range(params, sensor_name),
            params['indicator']['issue_range'])
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        start_date = pd.to_datetime(params['indicator']['export_start_date'])
        export_dir = params["common"]["export_dir"]
        dates = pd.Series(df[df["timestamp"] >= start_date]
                          ["timestamp"].unique()).sort_values()

        signal_name = add_prefix([signal],
                                 wip_signal=params['indicator']["wip_signal"],
                                 prefix="wip_")
        for date_ in dates:
            export_fn = f'{date_.strftime("%Y%m%d")}_{geo_res}_{signal_name[0]}.csv'
            df[df["timestamp"] == date_][[
                "geo_id",
                "val",
                "se",
                "sample_size",
            ]].to_csv(f"{export_dir}/{export_fn}", index=False, na_rep="NA")

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
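
For reference, a hypothetical params dictionary with the structure documented in the docstring above; every value is a placeholder, and the extra "issue_range" key only reflects that the function body reads it even though the docstring does not list it.

# Placeholder configuration only; not a real indicator setup.
params = {
    "common": {
        "export_dir": "./receiving",
        "log_exceptions": True,              # optional
        "log_filename": None,                # optional
    },
    "indicator": {
        "export_start_date": [2020, 4, 1],   # [year, month, day]
        "date_range": "20200401-20200901",   # YYYYMMDD-YYYYMMDD
        "source": "combined-cases-deaths",   # placeholder source name
        "wip_signal": False,                 # or a list of signal names
        "issue_range": None,                 # read by the function body; format not documented above
    },
}
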
Example 4
    def test_output_files(self, run_as_module):
        """Tests that the output files contain the correct results of the run."""
        params = read_params()
        # Test output exists
        csv_files = listdir("receiving")

        dates_for_covid_ag = [
            "20200702", "20200703", "20200704", "20200705", "20200706",
            "20200707", "20200708", "20200709"
        ]

        dates_for_flu_ag = [
            "20200623", "20200624", "20200625", "20200626", "20200627",
            "20200628", "20200629", "20200630", "20200701", "20200702",
            "20200703"
        ]

        geos = GEO_RESOLUTIONS.copy()
        sensors = add_prefix(list(SENSORS.keys()),
                             wip_signal=params["wip_signal"],
                             prefix="wip_")

        expected_files = []
        for geo in geos:
            for sensor in sensors:
                if "covid_ag" in sensor:
                    for date in dates_for_covid_ag:
                        expected_files += [
                            date + "_" + geo + "_" + sensor + ".csv"
                        ]
                else:
                    for date in dates_for_flu_ag:
                        expected_files += [
                            date + "_" + geo + "_" + sensor + ".csv"
                        ]

        assert set(expected_files).issubset(set(csv_files))

        # Test output format
        df = pd.read_csv(
            join("./receiving",
                 "20200709_state_covid_ag_raw_pct_positive.csv"))
        assert (df.columns.values == ["geo_id", "val", "se",
                                      "sample_size"]).all()

        # test_intermediate_file
        flag = None
        for fname in listdir("./cache"):
            if ".csv" in fname:
                flag = 1
        assert flag is not None
Example 5
    def __init__(self,
                 startdate,
                 enddate,
                 dropdate,
                 geo,
                 parallel,
                 weekday,
                 numtype,
                 se):
        """Init Sensor Updator.

        Args:
            startdate: first sensor date (YYYY-mm-dd)
            enddate: last sensor date (YYYY-mm-dd)
            dropdate: data drop date (YYYY-mm-dd)
            geo: geographic resolution, one of ["county", "state", "msa", "hrr", "hhs", "nation"]
            parallel: boolean to run the sensor update in parallel
            weekday: boolean to adjust for weekday effects
            numtype: type of count data used, one of ["covid", "cli"]
            se: boolean to write out standard errors, if true, use an obfuscated name
        """
        self.startdate, self.enddate, self.dropdate = [
            pd.to_datetime(t) for t in (startdate, enddate, dropdate)]
        # handle dates
        assert (self.startdate > (Config.FIRST_DATA_DATE + Config.BURN_IN_PERIOD)
                ), f"not enough data to produce estimates starting {self.startdate}"
        assert self.startdate < self.enddate, "start date >= end date"
        assert self.enddate <= self.dropdate, "end date > drop date"
        self.geo, self.parallel, self.weekday, self.numtype, self.se = geo.lower(), parallel, \
                                                                       weekday, numtype, se

        # output file naming
        if self.numtype == "covid":
            signals = [SMOOTHED_ADJ if self.weekday else SMOOTHED]
        elif self.numtype == "cli":
            signals = [SMOOTHED_ADJ_CLI if self.weekday else SMOOTHED_CLI]
        signal_names = add_prefix(
            signals,
            wip_signal=read_params()["wip_signal"])
        self.updated_signal_names = signal_names

        # initialize members set in shift_dates().
        self.burnindate = None
        self.fit_dates = None
        self.burn_in_dates = None
        self.sensor_dates = None
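
A small standalone sketch (placeholder dates, pandas only; the enclosing class name is not shown in the snippet above) of the date handling this constructor performs before anything else.

import pandas as pd

# Placeholder dates for illustration only.
startdate, enddate, dropdate = "2020-06-01", "2020-07-01", "2020-07-05"

# Parse all three dates the same way the constructor does.
start, end, drop = (pd.to_datetime(t) for t in (startdate, enddate, dropdate))

# Same ordering constraints as the assertions above; the burn-in check is
# omitted because Config.FIRST_DATA_DATE and Config.BURN_IN_PERIOD are not
# defined in this snippet.
assert start < end, "start date >= end date"
assert end <= drop, "end date > drop date"
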
Example 6
    def test_output_files(self, run_as_module):
        """Tests that the proper files are output."""

        # Test output exists
        csv_files = listdir("receiving")

        dates = [
            "20200702", "20200703", "20200704", "20200705", "20200706",
            "20200707", "20200708", "20200709"
        ]
        geos = GEO_RESOLUTIONS.copy()
        sensors = add_prefix(SENSORS,
                             wip_signal=read_params()["wip_signal"],
                             prefix="wip_")

        expected_files = []
        for date in dates:
            for geo in geos:
                for sensor in sensors:
                    expected_files += [
                        date + "_" + geo + "_" + sensor + ".csv"
                    ]

        assert set(expected_files).issubset(set(csv_files))

        # Test output format
        df = pd.read_csv(
            join("./receiving",
                 "20200709_state_covid_ag_raw_pct_positive.csv"))
        assert (df.columns.values == ["geo_id", "val", "se",
                                      "sample_size"]).all()

        # test_intermediate_file
        flag = None
        for fname in listdir("./cache"):
            if ".csv" in fname:
                flag = 1
        assert flag is not None
Example 7
    def test_output_files(self, clean_receiving_dir):
        """Tests that the proper files are output."""

        run_module(self.PARAMS)
        csv_files = listdir("receiving")

        dates = ["20200718", "20200719", "20200720"]
        geos = GEO_RESOLUTIONS.copy()
        sensors = add_prefix(SENSORS,
                             wip_signal=self.PARAMS["indicator"]["wip_signal"],
                             prefix="wip_")

        expected_files = []
        for date in dates:
            for geo in geos:
                for sensor in sensors:
                    expected_files += [
                        date + "_" + geo + "_" + sensor + ".csv"
                    ]

        assert set(expected_files).issubset(set(csv_files))
        assert '20200721_state_covid_ag_raw_pct_positive.csv' not in csv_files
        assert '20200722_state_covid_ag_raw_pct_positive.csv' not in csv_files

        # Test output format
        df = pd.read_csv(
            join("./receiving",
                 "20200718_state_covid_ag_smoothed_pct_positive.csv"))
        assert (df.columns.values == ["geo_id", "val", "se",
                                      "sample_size"]).all()

        # test_intermediate_file
        flag = None
        for fname in listdir("./cache"):
            if ".csv" in fname:
                flag = 1
        assert flag is not None
Example 8
def run_module(params):
    """
    Main function run when calling the module.

    Inputs parameters from the file 'params.json' and produces output data in
    the directory defined by the `export_dir` (should be "receiving" except for
    testing purposes).

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_exceptions" (optional): bool, whether to log exceptions to file
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "static_file_dir": str, path to DMA mapping files
            - "data_dir": str, location of cached CSVs
            - "start_date": str, YYYY-MM-DD format, first day to generate data for
            - "end_date": str, YYYY-MM-DD format or empty string, last day to generate data for.
            - "ght_key": str, GHT API key
            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix
            - "test_data_dir": str, path to test data
        - "archive" (optional): if provided, output will be archived with S3
            - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
            - "bucket_name: str, name of S3 bucket to read/write
            - "cache_dir": str, directory of locally cached data
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None

    # read parameters
    ght_key = params["indicator"]["ght_key"]
    start_date = params["indicator"]["start_date"]
    end_date = params["indicator"]["end_date"]
    static_dir = params["indicator"]["static_file_dir"]
    export_dir = params["common"]["export_dir"]
    data_dir = params["indicator"]["data_dir"]
    wip_signal = params["indicator"]["wip_signal"]

    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    if "archive" in params:
        arch_diff = S3ArchiveDiffer(params["archive"]["cache_dir"], export_dir,
                                    params["archive"]["bucket_name"], "ght",
                                    params["archive"]["aws_credentials"])
        arch_diff.update_cache()

    # if missing start_date, set to today (GMT) minus 5 days
    if start_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        start_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # if missing end_date, set to today (GMT) minus 5 days
    if end_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        end_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # Turn on basic logging messages (level INFO)
    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
    logging.info("Creating data from %s through %s.", start_date, end_date)

    # Dictionary mapping geo resolution to the data corresponding to that resolution.
    df_by_geo_res = {}

    if not params["indicator"]["test_data_dir"]:
        # setup class to handle API calls
        ght = GoogleHealthTrends(ght_key=ght_key)

        # read data frame version of the data
        df_by_geo_res[STATE] = get_counts_states(ght,
                                                 PULL_START_DATE,
                                                 end_date,
                                                 static_dir=static_dir,
                                                 data_dir=data_dir)
        df_by_geo_res[DMA] = get_counts_dma(ght,
                                            PULL_START_DATE,
                                            end_date,
                                            static_dir=static_dir,
                                            data_dir=data_dir)
    else:
        df_by_geo_res[STATE] = pd.read_csv(
            params["indicator"]["test_data_dir"].format(geo_res="state"))
        df_by_geo_res[DMA] = pd.read_csv(
            params["indicator"]["test_data_dir"].format(geo_res="dma"))

    df_by_geo_res[HRR], df_by_geo_res[MSA] = derived_counts_from_dma(
        df_by_geo_res[DMA], static_dir=static_dir)

    signal_names = add_prefix(SIGNALS, wip_signal, prefix="wip_")

    for signal in signal_names:
        is_smoothed = signal.endswith(SMOOTHED)
        for geo_res, df in df_by_geo_res.items():
            exported_csv_dates = create_export_csv(format_for_export(
                df, is_smoothed),
                                                   geo_res=geo_res,
                                                   sensor=signal,
                                                   start_date=start_date,
                                                   export_dir=export_dir)

            if not exported_csv_dates.empty:
                csv_export_count += exported_csv_dates.size
                if not oldest_final_export_date:
                    oldest_final_export_date = max(exported_csv_dates)
                oldest_final_export_date = min(oldest_final_export_date,
                                               max(exported_csv_dates))

    if "archive" in params:
        archive(arch_diff)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.datetime.now() -
                           oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime(
            "%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
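
A hypothetical params dictionary matching the structure documented above; paths, the API key, and the bucket name are placeholders, and the optional "archive" block is shown commented out.

# Placeholder configuration only; not a working GHT setup.
params = {
    "common": {
        "export_dir": "./receiving",
    },
    "indicator": {
        "static_file_dir": "./static",
        "data_dir": "./cache",
        "start_date": "2020-04-01",
        "end_date": "",                  # "" falls back to a recent date, as in the code above
        "ght_key": "YOUR-GHT-API-KEY",   # placeholder
        "wip_signal": False,             # or a list of signal names
        "test_data_dir": "",             # falsy: pull from the live API instead of test CSVs
    },
    # Optional S3 archiving block:
    # "archive": {
    #     "aws_credentials": {"aws_access_key_id": "...", "aws_secret_access_key": "..."},
    #     "bucket_name": "example-bucket",
    #     "cache_dir": "./archive_cache",
    # },
}
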
Example 9
def run_module(params: Dict[str, Any]):
    """Run the quidel_covidtest indicator.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - indicator":
        - "static_file_dir": str, directory name with population information
        - "input_cache_dir": str, directory in which to cache input data
        - "export_start_date": str, YYYY-MM-DD format of earliest date to create output
        - "export_end_date": str, YYYY-MM-DD format of latest date to create output or "" to create
                             through the present
        - "pull_start_date": str, YYYY-MM-DD format of earliest date to pull input
        - "pull_end_date": str, YYYY-MM-DD format of latest date to create output or "" to create
                           through the present
        - "aws_credentials": Dict[str, str], authentication parameters for AWS S3; see S3
                             documentation
        - "bucket_name": str, name of AWS bucket in which to find data
        - "wip_signal": List[str], list of signal names that are works in progress
        - "test_mode": bool, whether we are running in test mode
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    atexit.register(log_exit, start_time, logger)
    cache_dir = params["indicator"]["input_cache_dir"]
    export_dir = params["common"]["export_dir"]
    export_start_date = params["indicator"]["export_start_date"]
    export_end_date = params["indicator"]["export_end_date"]

    # Pull data and update export date
    df, _end_date = pull_quidel_covidtest(params["indicator"])
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_date = check_export_end_date(export_end_date, _end_date,
                                            END_FROM_TODAY_MINUS)
    export_start_date = check_export_start_date(export_start_date,
                                                export_end_date,
                                                EXPORT_DAY_RANGE)

    first_date, last_date = df["timestamp"].min(), df["timestamp"].max()

    # State Level
    data = df.copy()
    state_groups = geo_map("state", data).groupby("state_id")

    # Add prefix, if required
    sensors = add_prefix(SENSORS,
                         wip_signal=params["indicator"]["wip_signal"],
                         prefix="wip_")
    smoothers = SMOOTHERS.copy()

    for sensor in sensors:
        # For State Level
        print("state", sensor)
        if sensor.endswith(SMOOTHED_POSITIVE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_POSITIVE)
        elif sensor.endswith(RAW_POSITIVE):
            smoothers[sensor] = smoothers.pop(RAW_POSITIVE)
        elif sensor.endswith(SMOOTHED_TEST_PER_DEVICE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_TEST_PER_DEVICE)
        else:
            smoothers[sensor] = smoothers.pop(RAW_TEST_PER_DEVICE)
        state_df = generate_sensor_for_states(state_groups,
                                              smooth=smoothers[sensor][1],
                                              device=smoothers[sensor][0],
                                              first_date=first_date,
                                              last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_date,
                          end_date=export_end_date)

    # County/HRR/MSA level
    for geo_res in GEO_RESOLUTIONS:
        geo_data, res_key = geo_map(geo_res, data)
        for sensor in sensors:
            print(geo_res, sensor)
            res_df = generate_sensor_for_other_geores(
                state_groups,
                geo_data,
                res_key,
                smooth=smoothers[sensor][1],
                device=smoothers[sensor][0],
                first_date=first_date,
                last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_date,
                              end_date=export_end_date,
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(df, _end_date, cache_dir)
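
Likewise, a hypothetical params dictionary for the quidel_covidtest structure documented above; all values are placeholders.

# Placeholder configuration only; not a working quidel_covidtest setup.
params = {
    "common": {
        "export_dir": "./receiving",
    },
    "indicator": {
        "static_file_dir": "./static",
        "input_cache_dir": "./cache",
        "export_start_date": "2020-06-30",
        "export_end_date": "",           # "" exports through the present
        "pull_start_date": "2020-06-22",
        "pull_end_date": "",
        "aws_credentials": {"aws_access_key_id": "...", "aws_secret_access_key": "..."},
        "bucket_name": "example-bucket",
        "wip_signal": [],                # signal names still considered work in progress
        "test_mode": True,
    },
}
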
Example 10
def run_module():
    """Run the quidel_covidtest indicator."""
    params = read_params()
    cache_dir = params["cache_dir"]
    export_dir = params["export_dir"]
    export_start_date = params["export_start_date"]
    export_end_date = params["export_end_date"]

    # Pull data and update export date
    df, _end_date = pull_quidel_covidtest(params)
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_date = check_export_end_date(export_end_date, _end_date,
                                            END_FROM_TODAY_MINUS)
    export_start_date = check_export_start_date(export_start_date,
                                                export_end_date,
                                                EXPORT_DAY_RANGE)

    first_date, last_date = df["timestamp"].min(), df["timestamp"].max()

    # State Level
    data = df.copy()
    state_groups = geo_map("state", data).groupby("state_id")

    # Add prefix, if required
    sensors = add_prefix(SENSORS,
                         wip_signal=read_params()["wip_signal"],
                         prefix="wip_")
    smoothers = SMOOTHERS.copy()

    for sensor in sensors:
        # For State Level
        print("state", sensor)
        if sensor.endswith(SMOOTHED_POSITIVE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_POSITIVE)
        elif sensor.endswith(RAW_POSITIVE):
            smoothers[sensor] = smoothers.pop(RAW_POSITIVE)
        elif sensor.endswith(SMOOTHED_TEST_PER_DEVICE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_TEST_PER_DEVICE)
        else:
            smoothers[sensor] = smoothers.pop(RAW_TEST_PER_DEVICE)
        state_df = generate_sensor_for_states(state_groups,
                                              smooth=smoothers[sensor][1],
                                              device=smoothers[sensor][0],
                                              first_date=first_date,
                                              last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_date,
                          end_date=export_end_date)

    # County/HRR/MSA level
    for geo_res in GEO_RESOLUTIONS:
        geo_data, res_key = geo_map(geo_res, data)
        for sensor in sensors:
            print(geo_res, sensor)
            res_df = generate_sensor_for_other_geores(
                state_groups,
                geo_data,
                res_key,
                smooth=smoothers[sensor][1],
                device=smoothers[sensor][0],
                first_date=first_date,
                last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_date,
                              end_date=export_end_date,
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(df, _end_date, cache_dir)
Example 11
def run_module():
    """Main function run when calling the module.

    Inputs parameters from the file 'params.json' and produces output data in
    the directory defined by the `export_dir` (should be "receiving" except for
    testing purposes).
    """

    # read parameters
    params = read_params()
    ght_key = params["ght_key"]
    start_date = params["start_date"]
    end_date = params["end_date"]
    static_dir = params["static_file_dir"]
    export_dir = params["export_dir"]
    data_dir = params["data_dir"]
    wip_signal = params["wip_signal"]
    cache_dir = params["cache_dir"]

    arch_diff = S3ArchiveDiffer(cache_dir, export_dir, params["bucket_name"],
                                "ght", params["aws_credentials"])
    arch_diff.update_cache()
    print(arch_diff)
    # if missing start_date, set to today (GMT) minus 5 days
    if start_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        start_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # if missing end_date, set to today (GMT) minus 5 days
    if end_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        end_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # Turn on basic logging messages (level INFO)
    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
    logging.info("Creating data from %s through %s.", start_date, end_date)

    # Dictionary mapping geo resolution to the data corresponding to that resolution.
    df_by_geo_res = {}

    if not params["test"]:
        # setup class to handle API calls
        ght = GoogleHealthTrends(ght_key=ght_key)

        # read data frame version of the data
        df_by_geo_res[STATE] = get_counts_states(ght,
                                                 PULL_START_DATE,
                                                 end_date,
                                                 static_dir=static_dir,
                                                 data_dir=data_dir)
        df_by_geo_res[DMA] = get_counts_dma(ght,
                                            PULL_START_DATE,
                                            end_date,
                                            static_dir=static_dir,
                                            data_dir=data_dir)
    else:
        df_by_geo_res[STATE] = pd.read_csv(
            params["test_data_dir"].format(geo_res="state"))
        df_by_geo_res[DMA] = pd.read_csv(
            params["test_data_dir"].format(geo_res="dma"))

    df_by_geo_res[HRR], df_by_geo_res[MSA] = derived_counts_from_dma(
        df_by_geo_res[DMA], static_dir=static_dir)

    signal_names = add_prefix(SIGNALS, wip_signal, prefix="wip_")

    for signal in signal_names:
        is_smoothed = signal.endswith(SMOOTHED)
        for geo_res, df in df_by_geo_res.items():
            create_export_csv(format_for_export(df, is_smoothed),
                              geo_res=geo_res,
                              sensor=signal,
                              start_date=start_date,
                              export_dir=export_dir)

    if not params["test"]:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [
            f for f, diff in common_diffs.items() if diff is not None
        ]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff
            for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")
Example 12
def run_module(params: Dict[str, Any]):
    """Run Quidel flu test module.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - indicator":
        - "static_file_dir": str, directory name with population information
        - "input_cache_dir": str, directory in which to cache input data
        - "export_start_date": str, YYYY-MM-DD format of earliest date to create output
        - "export_end_date": str, YYYY-MM-DD format of latest date to create output or "" to create
                             through the present
        - "pull_start_date": str, YYYY-MM-DD format of earliest date to pull input
        - "pull_end_date": str, YYYY-MM-DD format of latest date to create output or "" to create
                           through the present
        - "aws_credentials": Dict[str, str], authentication parameters for AWS S3; see S3
                             documentation
        - "bucket_name": str, name of AWS bucket in which to find data
        - "wip_signal": List[str], list of signal names that are works in progress
        - "test_mode": bool, whether we are running in test mode
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    cache_dir = params["indicator"]["input_cache_dir"]
    export_dir = params["common"]["export_dir"]
    static_file_dir = params["indicator"]["static_file_dir"]
    export_start_dates = params["indicator"]["export_start_date"]
    export_end_dates = params["indicator"]["export_end_date"]
    map_df = pd.read_csv(join(static_file_dir, "fips_prop_pop.csv"),
                         dtype={"fips": int})

    # Pull data and update export date
    dfs, _end_date = pull_quidel_data(params["indicator"])
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_dates = check_export_end_date(export_end_dates, _end_date,
                                             END_FROM_TODAY_MINUS)
    export_start_dates = check_export_start_date(export_start_dates,
                                                 export_end_dates,
                                                 EXPORT_DAY_RANGE)

    # Add prefix, if required
    sensors = add_prefix(list(SENSORS.keys()),
                         wip_signal=params["indicator"]["wip_signal"],
                         prefix="wip_")

    for sensor in sensors:
        # Check either covid_ag or flu_ag
        test_type = "covid_ag" if "covid_ag" in sensor else "flu_ag"
        print("state", sensor)
        data = dfs[test_type].copy()
        state_groups = geo_map("state", data, map_df).groupby("state_id")
        first_date, last_date = data["timestamp"].min(), data["timestamp"].max()

        # For State Level
        state_df = generate_sensor_for_states(state_groups,
                                              smooth=SENSORS[sensor][1],
                                              device=SENSORS[sensor][0],
                                              first_date=first_date,
                                              last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_dates[test_type],
                          end_date=export_end_dates[test_type])

        # County/HRR/MSA level
        for geo_res in GEO_RESOLUTIONS:
            print(geo_res, sensor)
            data = dfs[test_type].copy()
            data, res_key = geo_map(geo_res, data, map_df)
            res_df = generate_sensor_for_other_geores(
                state_groups,
                data,
                res_key,
                smooth=SENSORS[sensor][1],
                device=SENSORS[sensor][0],
                first_date=first_date,
                last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_dates[test_type],
                              end_date=export_end_dates[test_type],
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(dfs, _end_date, cache_dir)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
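
Unlike the quidel_covidtest example, this module keys its export windows by test type; the indexing of export_start_dates and export_end_dates above implies a dict-of-dates shape. A small standalone sketch with placeholder dates and illustrative sensor names:

# Placeholder windows keyed by test type, inferred from the indexing above.
export_start_dates = {"covid_ag": "2020-07-02", "flu_ag": "2020-06-23"}
export_end_dates = {"covid_ag": "2020-07-09", "flu_ag": "2020-07-03"}

for sensor in ["covid_ag_raw_pct_positive", "flu_ag_raw_pct_positive"]:
    # Same rule as the loop above: anything that is not covid_ag is treated as flu_ag.
    test_type = "covid_ag" if "covid_ag" in sensor else "flu_ag"
    print(sensor, export_start_dates[test_type], export_end_dates[test_type])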