Example 1
def run_module(params) -> None:
    """
    Run entire hhs_facilities indicator.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    raw_df = pull_data()
    gmpr = GeoMapper()
    filled_fips_df = fill_missing_fips(raw_df, gmpr)
    for geo, (sig_name, sig_cols, sig_func,
              sig_offset) in product(GEO_RESOLUTIONS, SIGNALS):
        mapped_df = convert_geo(filled_fips_df, geo, gmpr)
        output_df = generate_signal(mapped_df, sig_cols, sig_func, sig_offset)
        create_export_csv(output_df, params["common"]["export_dir"], geo,
                          sig_name)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
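
For reference, a minimal parameters dictionary matching the structure documented in the docstring might look like the sketch below; the directory value is a placeholder, not taken from the source.

params = {
    "common": {
        "export_dir": "./receiving",   # hypothetical output directory
        # optional keys "log_filename" / "log_exceptions" are read with .get() above
    }
}
run_module(params)   # a real run would then pull data and write CSVs into export_dir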
Example 2
def run_module(params):
    """
    Produce a combined cases and deaths signal using data from JHU and USA Facts.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output.
            - "log_exceptions" (optional): bool, whether to log exceptions to file.
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "export_start_date": list of ints, [year, month, day] format, first day to begin
                data exports from.
            - "date_range": str, YYYYMMDD-YYYYMMDD format, range of dates to generate data for.
            - "source": str, name of combo indicator in metadata.
            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix.
    """
    start_time = time.time()
    variants = [
        tuple((metric, geo_res) + sensor_signal(metric, sensor, smoother))
        for (metric, geo_res, sensor, smoother
             ) in product(METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTH_TYPES)
    ]
    params = configure(variants, params)
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    for metric, geo_res, sensor_name, signal in variants:
        df = combine_usafacts_and_jhu(
            signal, geo_res, extend_raw_date_range(params, sensor_name),
            params['indicator']['issue_range'])
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        start_date = pd.to_datetime(params['indicator']['export_start_date'])
        export_dir = params["common"]["export_dir"]
        dates = pd.Series(df[df["timestamp"] >= start_date]
                          ["timestamp"].unique()).sort_values()

        signal_name = add_prefix([signal],
                                 wip_signal=params['indicator']["wip_signal"],
                                 prefix="wip_")
        for date_ in dates:
            export_fn = f'{date_.strftime("%Y%m%d")}_{geo_res}_{signal_name[0]}.csv'
            df[df["timestamp"] == date_][[
                "geo_id",
                "val",
                "se",
                "sample_size",
            ]].to_csv(f"{export_dir}/{export_fn}", index=False, na_rep="NA")

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
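
For reference, the export loop above names each CSV as YYYYMMDD_<geo>_<signal>.csv; a quick sketch with invented values:

from datetime import datetime

date_ = datetime(2021, 3, 7)
geo_res = "county"
signal_name = ["confirmed_incidence_num"]    # hypothetical signal name
export_fn = f'{date_.strftime("%Y%m%d")}_{geo_res}_{signal_name[0]}.csv'
print(export_fn)                             # 20210307_county_confirmed_incidence_num.csv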
Example 3
def run_module(params):
    """
    Run the indicator.

    Parameters
    ----------
    params: Dict[str, Any]
        Nested dictionary of parameters.
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    mapper = GeoMapper()
    run_stats = []
    ## build the base version of the signal at the most detailed geo level you can get.
    ## compute stuff here or farm out to another function or file
    all_data = pd.DataFrame(
        columns=["timestamp", "val", "zip", "sample_size", "se"])
    ## aggregate & smooth
    ## TODO: add num/prop variations if needed
    for sensor, smoother, geo in product(SIGNALS, SMOOTHERS, GEOS):
        df = mapper.replace_geocode(all_data,
                                    "zip",
                                    geo,
                                    new_col="geo_id",
                                    date_col="timestamp")
        ## TODO: recompute sample_size, se here if not NA
        df["val"] = df[["geo_id", "val"]].groupby("geo_id")["val"].transform(
            smoother[0].smooth)
        sensor_name = sensor + smoother[
            1]  ## TODO: +num/prop variation if used
        # don't export first 6 days for smoothed signals since they'll be nan.
        start_date = min(df.timestamp) + timedelta(6) if smoother[1] else min(
            df.timestamp)
        dates = create_export_csv(df,
                                  params["common"]["export_dir"],
                                  geo,
                                  sensor_name,
                                  start_date=start_date)
        if len(dates) > 0:
            run_stats.append((max(dates), len(dates)))
    ## log this indicator run
    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    min_max_date = run_stats and min(s[0] for s in run_stats)
    csv_export_count = sum(s[-1] for s in run_stats)
    max_lag_in_days = min_max_date and (datetime.now() - min_max_date).days
    formatted_min_max_date = min_max_date and min_max_date.strftime("%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_min_max_date)
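
The run-stats summary above leans on Python's short-circuiting `and`: when run_stats is empty, min_max_date is the empty list (falsy) and min() is never called, which in turn leaves max_lag_in_days and formatted_min_max_date falsy as well. A minimal illustration with an invented date:

from datetime import datetime

run_stats = []                                          # nothing was exported
min_max_date = run_stats and min(s[0] for s in run_stats)
print(min_max_date)                                     # [] -- falsy, min() never evaluated

run_stats = [(datetime(2021, 3, 1), 5)]                 # one signal exported 5 CSVs
min_max_date = run_stats and min(s[0] for s in run_stats)
print(min_max_date)                                     # 2021-03-01 00:00:00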
Example 4
def run_module(params):
    """
    Generate ground truth HHS hospitalization data.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_filename" (optional): str, name of file to write logs
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    mapper = GeoMapper()
    request_all_states = ",".join(mapper.get_geo_values("state_id"))
    today = date.today()
    past_reference_day = date(year=2020, month=1,
                              day=1)  # first available date in DB
    date_range = generate_date_ranges(past_reference_day, today)
    dfs = []
    for r in date_range:
        response = Epidata.covid_hosp(request_all_states, r)
        # The last date range might only have recent days that don't have any data, so don't error.
        if response["result"] != 1 and r != date_range[-1]:
            raise Exception(f"Bad result from Epidata: {response['message']}")
        if response["result"] == -2 and r == date_range[
                -1]:  # -2 code means no results
            continue
        dfs.append(pd.DataFrame(response['epidata']))
    all_columns = pd.concat(dfs)

    geo_mapper = GeoMapper()

    for sig in SIGNALS:
        state = geo_mapper.add_geocode(make_signal(all_columns, sig),
                                       "state_id",
                                       "state_code",
                                       from_col="state")
        for geo in GEOS:
            create_export_csv(make_geo(state, geo, geo_mapper),
                              params["common"]["export_dir"], geo, sig)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
Example 5
def run_module(params: Dict[str, Any]):
    """Run module for processing NCHS mortality data.

    The `params` argument is expected to have the following structure:
    - "common":
        - "daily_export_dir": str, directory to write daily output
        - "weekly_export_dir": str, directory to write weekly output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "export_start_date": str, date from which to export data in YYYY-MM-DD format
        - "static_file_dir": str, directory containing population csv files
        - "test_file" (optional): str, name of file from which to read test data
        - "token": str, authentication for upstream data pull
    - "archive" (optional): if provided, output will be archived with S3
        - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
        - "bucket_name: str, name of S3 bucket to read/write
        - "daily_cache_dir": str, directory of locally cached daily data
        - "weekly_cache_dir": str, directory of locally cached weekly data
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    export_start_date = params["indicator"]["export_start_date"]
    if export_start_date == "latest":  # Find the previous Saturday
        export_start_date = date.today() - timedelta(
            days=date.today().weekday() + 2)
        export_start_date = export_start_date.strftime('%Y-%m-%d')
    daily_export_dir = params["common"]["daily_export_dir"]
    token = params["indicator"]["token"]
    test_file = params["indicator"].get("test_file", None)

    if "archive" in params:
        daily_arch_diff = S3ArchiveDiffer(params["archive"]["daily_cache_dir"],
                                          daily_export_dir,
                                          params["archive"]["bucket_name"],
                                          "nchs_mortality",
                                          params["archive"]["aws_credentials"])
        daily_arch_diff.update_cache()

    df_pull = pull_nchs_mortality_data(token, test_file)
    for metric in METRICS:
        if metric == 'percent_of_expected_deaths':
            print(metric)
            df = df_pull.copy()
            df["val"] = df[metric]
            df["se"] = np.nan
            df["sample_size"] = np.nan
            df = df[~df["val"].isnull()]
            sensor_name = "_".join([SENSOR_NAME_MAP[metric]])
            export_csv(
                df,
                geo_name=GEO_RES,
                export_dir=daily_export_dir,
                start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
                sensor=sensor_name,
            )
        else:
            for sensor in SENSORS:
                print(metric, sensor)
                df = df_pull.copy()
                if sensor == "num":
                    df["val"] = df[metric]
                else:
                    df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
                df["se"] = np.nan
                df["sample_size"] = np.nan
                df = df[~df["val"].isnull()]
                sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor])
                export_csv(
                    df,
                    geo_name=GEO_RES,
                    export_dir=daily_export_dir,
                    start_date=datetime.strptime(export_start_date,
                                                 "%Y-%m-%d"),
                    sensor=sensor_name,
                )

    # Weekly run of archive utility on Monday
    # - Does not upload to S3, that is handled by daily run of archive utility
    # - Exports issues into receiving for the API
    # Daily run of archiving utility
    # - Uploads changed files to S3
    # - Does not export any issues into receiving
    if "archive" in params:
        arch_diffs(params, daily_arch_diff)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
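
A quick check of the "previous Saturday" arithmetic above, using invented dates: weekday() runs from 0 (Monday) to 6 (Sunday), so subtracting weekday() + 2 days always lands on a Saturday at least two days in the past.

from datetime import date, timedelta

for today in (date(2021, 3, 8), date(2021, 3, 13), date(2021, 3, 14)):   # Mon, Sat, Sun
    prev_saturday = today - timedelta(days=today.weekday() + 2)
    print(today.strftime("%a"), "->", prev_saturday.isoformat())
# Mon -> 2021-03-06
# Sat -> 2021-03-06
# Sun -> 2021-03-06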
Example 6
def get_logger():
    params = read_params()
    return get_structured_logger(__name__,
                                 filename=params.get("log_filename"),
                                 log_exceptions=params.get(
                                     "log_exceptions", True))
Example 7
def run_module(params):
    """Create the Safegraph indicator.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "aws_access_key_id": str, ID of access key for AWS S3
        - "aws_secret_access_key": str, access key for AWS S3
        - "aws_default_region": str, name of AWS S3 region
        - "aws_endpoint": str, name of AWS S3 endpoint
        - "n_core": int, number of cores to use for multithreaded processing
        - "raw_data_dir": str, directory from which to read downloaded data from AWS,
        - "static_file_dir": str, directory containing brand and population csv files
        - "sync": bool, whether to sync S3 data before running indicator
        - "wip_signal": list of str or bool, list of work-in-progress signals to be passed to
                        `delphi_utils.add_prefix()`
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    # Place to write output files.
    export_dir = params["common"]["export_dir"]
    # Location of input files.
    raw_data_dir = params["indicator"]["raw_data_dir"]

    # Number of cores to use in multiprocessing.
    n_core = params["indicator"]["n_core"]

    # AWS credentials
    aws_access_key_id = params["indicator"]["aws_access_key_id"]
    aws_secret_access_key = params["indicator"]["aws_secret_access_key"]
    aws_default_region = params["indicator"]["aws_default_region"]
    aws_endpoint = params["indicator"]["aws_endpoint"]
    # Whether to sync `raw_data_dir` with an AWS backend.
    # Must be a bool in the JSON file (rather than the string "True" or "False")
    sync = params["indicator"]["sync"]

    # List of work-in-progress signal names.
    wip_signal = params["indicator"]["wip_signal"]

    # Convert `process()` to a single-argument function for use in `pool.map`.
    single_arg_process = functools.partial(
        process,
        signal_names=SIGNALS,
        wip_signal=wip_signal,
        geo_resolutions=GEO_RESOLUTIONS,
        export_dir=export_dir,
    )

    # Update raw data
    # Why call subprocess rather than using a native Python client, e.g. boto3?
    # Because boto3 does not have a simple rsync-like call that can perform
    # the following behavior elegantly.
    if sync:
        subprocess.run(
            f'aws s3 sync s3://sg-c19-response/social-distancing/v2/ '
            f'{raw_data_dir}/social-distancing/ --endpoint {aws_endpoint}',
            env={
                'AWS_ACCESS_KEY_ID': aws_access_key_id,
                'AWS_SECRET_ACCESS_KEY': aws_secret_access_key,
                'AWS_DEFAULT_REGION': aws_default_region,
            },
            shell=True,
            check=True,
        )

    files = get_daily_source_files(
        f'{raw_data_dir}/social-distancing/**/*.csv.gz')

    files_with_previous_weeks = []
    for day in files:
        previous_week = [files[day]]
        for i in range(1, 7):
            if day - timedelta(i) in files:
                previous_week.append(files[day - timedelta(i)])
        files_with_previous_weeks.append(previous_week)

    with mp.Pool(n_core) as pool:
        pool.map(single_arg_process, files_with_previous_weeks)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
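
To make the previous-week grouping above concrete, here is the same loop run on an invented files mapping (dates and paths are illustrative only); the entry for a given day carries that day's file plus whichever of the previous six days are present:

from datetime import date, timedelta

files = {                                     # hypothetical day -> path mapping; Jan 3 is missing
    date(2021, 1, 1): "social-distancing/2021/01/01.csv.gz",
    date(2021, 1, 2): "social-distancing/2021/01/02.csv.gz",
    date(2021, 1, 4): "social-distancing/2021/01/04.csv.gz",
}
files_with_previous_weeks = []
for day in files:
    previous_week = [files[day]]
    for i in range(1, 7):
        if day - timedelta(i) in files:
            previous_week.append(files[day - timedelta(i)])
    files_with_previous_weeks.append(previous_week)
# The entry for Jan 4 holds the Jan 4, Jan 2, and Jan 1 files, skipping the absent Jan 3.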
Example 8
def run_module(params):
    """
    Main function run when calling the module.

    Reads parameters from the file 'params.json' and produces output data in
    the directory defined by `export_dir` (should be "receiving" except for
    testing purposes).

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_exceptions" (optional): bool, whether to log exceptions to file
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "static_file_dir": str, path to DMA mapping files
            - "data_dir": str, location of cached CSVs
            - "start_date": str, YYYY-MM-DD format, first day to generate data for
            - "end_date": str, YYYY-MM-DD format or empty string, last day to generate data for.
            - "ght_key": str, GHT API key
            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix
            - "test_data_dir": str, path to test data
        - "archive" (optional): if provided, output will be archived with S3
            - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
            - "bucket_name: str, name of S3 bucket to read/write
            - "cache_dir": str, directory of locally cached data
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None

    # read parameters
    ght_key = params["indicator"]["ght_key"]
    start_date = params["indicator"]["start_date"]
    end_date = params["indicator"]["end_date"]
    static_dir = params["indicator"]["static_file_dir"]
    export_dir = params["common"]["export_dir"]
    data_dir = params["indicator"]["data_dir"]
    wip_signal = params["indicator"]["wip_signal"]

    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    if "archive" in params:
        arch_diff = S3ArchiveDiffer(params["archive"]["cache_dir"], export_dir,
                                    params["archive"]["bucket_name"], "ght",
                                    params["archive"]["aws_credentials"])
        arch_diff.update_cache()

    # if missing start_date, set to today (GMT) minus 4 days
    if start_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        start_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # if missing end_date, set to today (GMT) minus 4 days
    if end_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        end_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # Turn on basic logging messages (level INFO)
    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
    logging.info("Creating data from %s through %s.", start_date, end_date)

    # Dictionary mapping geo resolution to the data corresponding to that resolution.
    df_by_geo_res = {}

    if not params["indicator"]["test_data_dir"]:
        # setup class to handle API calls
        ght = GoogleHealthTrends(ght_key=ght_key)

        # read data frame version of the data
        df_by_geo_res[STATE] = get_counts_states(ght,
                                                 PULL_START_DATE,
                                                 end_date,
                                                 static_dir=static_dir,
                                                 data_dir=data_dir)
        df_by_geo_res[DMA] = get_counts_dma(ght,
                                            PULL_START_DATE,
                                            end_date,
                                            static_dir=static_dir,
                                            data_dir=data_dir)
    else:
        df_by_geo_res[STATE] = pd.read_csv(
            params["indicator"]["test_data_dir"].format(geo_res="state"))
        df_by_geo_res[DMA] = pd.read_csv(
            params["indicator"]["test_data_dir"].format(geo_res="dma"))

    df_by_geo_res[HRR], df_by_geo_res[MSA] = derived_counts_from_dma(
        df_by_geo_res[DMA], static_dir=static_dir)

    signal_names = add_prefix(SIGNALS, wip_signal, prefix="wip_")

    for signal in signal_names:
        is_smoothed = signal.endswith(SMOOTHED)
        for geo_res, df in df_by_geo_res.items():
            exported_csv_dates = create_export_csv(format_for_export(
                df, is_smoothed),
                                                   geo_res=geo_res,
                                                   sensor=signal,
                                                   start_date=start_date,
                                                   export_dir=export_dir)

            if not exported_csv_dates.empty:
                csv_export_count += exported_csv_dates.size
                if not oldest_final_export_date:
                    oldest_final_export_date = max(exported_csv_dates)
                oldest_final_export_date = min(oldest_final_export_date,
                                               max(exported_csv_dates))

    if "archive" in params:
        archive(arch_diff)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.datetime.now() -
                           oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime(
            "%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
Example 9
def run_module(params: Dict[str, Dict[str, Any]]):
    """
    Run the delphi_changehc module.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output.
            - "log_exceptions" (optional): bool, whether to log exceptions to file.
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "input_cache_dir": str, directory to download source files.
            - "input_files": dict of str: str or null, optional filenames to download. If null,
                defaults are set in retrieve_files().
            - "start_date": str, YYYY-MM-DD format, first day to generate data for.
            - "end_date": str or null, YYYY-MM-DD format, last day to generate data for.
               If set to null, end date is derived from drop date and n_waiting_days.
            - "drop_date": str or null, YYYY-MM-DD format, date data is dropped. If set to
               null, current day minus 40 hours is used.
            - "n_backfill_days": int, number of past days to generate estimates for.
            - "n_waiting_days": int, number of most recent days to skip estimates for.
            - "se": bool, whether to write out standard errors.
            - "parallel": bool, whether to update sensor in parallel.
            - "geos": list of str, geographies to generate sensor for.
            - "weekday": list of bool, whether to adjust for weekday effects.
            - "types": list of str, sensor types to generate.
            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix.
            - "ftp_conn": dict, connection information for source FTP.
    """
    start_time = time.time()

    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    make_asserts(params)

    if params["indicator"]["drop_date"] is None:
        # files are dropped about 4pm the day after the issue date
        dropdate_dt = (datetime.now() - timedelta(days=1, hours=16))
        dropdate_dt = dropdate_dt.replace(hour=0,
                                          minute=0,
                                          second=0,
                                          microsecond=0)
    else:
        dropdate_dt = datetime.strptime(params["indicator"]["drop_date"],
                                        "%Y-%m-%d")
    filedate = dropdate_dt.strftime("%Y%m%d")

    file_dict = retrieve_files(params, filedate, logger)

    dropdate = str(dropdate_dt.date())

    # range of estimates to produce
    n_backfill_days = params["indicator"][
        "n_backfill_days"]  # produce estimates for n_backfill_days
    n_waiting_days = params["indicator"][
        "n_waiting_days"]  # most recent n_waiting_days won't be est
    enddate_dt = dropdate_dt - timedelta(days=n_waiting_days)
    startdate_dt = enddate_dt - timedelta(days=n_backfill_days)
    enddate = str(enddate_dt.date())
    startdate = str(startdate_dt.date())

    # now allow manual overrides
    if params["indicator"]["end_date"] is not None:
        enddate = params["indicator"]["end_date"]
    if params["indicator"]["start_date"] is not None:
        startdate = params["indicator"]["start_date"]

    logger.info("generating signal and exporting to CSV",
                first_sensor_date=startdate,
                last_sensor_date=enddate,
                drop_date=dropdate,
                n_backfill_days=n_backfill_days,
                n_waiting_days=n_waiting_days,
                geos=params["indicator"]["geos"],
                export_dir=params["common"]["export_dir"],
                parallel=params["indicator"]["parallel"],
                weekday=params["indicator"]["weekday"],
                types=params["indicator"]["types"],
                se=params["indicator"]["se"])

    ## start generating
    for geo in params["indicator"]["geos"]:
        for numtype in params["indicator"]["types"]:
            for weekday in params["indicator"]["weekday"]:
                if weekday:
                    logger.info("starting weekday adj",
                                geo=geo,
                                numtype=numtype)
                else:
                    logger.info("starting no adj", geo=geo, numtype=numtype)
                su_inst = CHCSensorUpdator(startdate, enddate, dropdate, geo,
                                           params["indicator"]["parallel"],
                                           weekday, numtype,
                                           params["indicator"]["se"],
                                           params["indicator"]["wip_signal"])
                if numtype == "covid":
                    data = load_combined_data(file_dict["denom"],
                                              file_dict["covid"], dropdate_dt,
                                              "fips")
                elif numtype == "cli":
                    data = load_cli_data(file_dict["denom"], file_dict["flu"],
                                         file_dict["mixed"],
                                         file_dict["flu_like"],
                                         file_dict["covid_like"], dropdate_dt,
                                         "fips")
                su_inst.update_sensor(data, params["common"]["export_dir"])
            logger.info("finished processing", geo=geo)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
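
As a sanity check on the drop-date fallback above (run times invented): subtracting 1 day 16 hours (40 hours) and truncating to midnight maps a run after 4 p.m. to the previous day and an earlier run to two days back.

from datetime import datetime, timedelta

for run_time in (datetime(2021, 3, 10, 17, 0), datetime(2021, 3, 10, 9, 0)):
    dropdate_dt = (run_time - timedelta(days=1, hours=16)).replace(
        hour=0, minute=0, second=0, microsecond=0)
    print(run_time, "->", dropdate_dt.date())
# 2021-03-10 17:00:00 -> 2021-03-09
# 2021-03-10 09:00:00 -> 2021-03-08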
Example 10
def get_logger():
    params = read_params()
    return get_structured_logger(__name__, filename=params.get("log_filename"))
Example 11
def run_module(params):
    """
    Generate updated claims-based hospitalization indicator values.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output.
            - "log_exceptions" (optional): bool, whether to log exceptions to file.
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "input_file": str, optional filenames to download. If null,
                defaults are set in retrieve_files().
            - "start_date": str, YYYY-MM-DD format, first day to generate data for.
            - "end_date": str or null, YYYY-MM-DD format, last day to generate data for.
               If set to null, end date is derived from drop date and n_waiting_days.
            - "drop_date": str or null, YYYY-MM-DD format, date data is dropped. If set to
               null, current day minus 40 hours is used.
            - "n_backfill_days": int, number of past days to generate estimates for.
            - "n_waiting_days": int, number of most recent days to skip estimates for.
            - "write_se": bool, whether to write out standard errors.
            - "obfuscated_prefix": str, prefix for signal name if write_se is True.
            - "parallel": bool, whether to update sensor in parallel.
            - "geos": list of str, geographies to generate sensor for.
            - "weekday": list of bool, which weekday adjustments to perform. For each value in the
                list, signals will be generated with weekday adjustments (True) or without
                adjustments (False).
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    # handle range of estimates to produce
    # filename expected to have format: EDI_AGG_INPATIENT_DDMMYYYY_HHMM{timezone}.csv.gz
    if params["indicator"]["drop_date"] is None:
        dropdate_dt = datetime.strptime(
            Path(params["indicator"]["input_file"]).name.split("_")[3],
            "%d%m%Y")
    else:
        dropdate_dt = datetime.strptime(params["indicator"]["drop_date"],
                                        "%Y-%m-%d")

    # produce estimates for n_backfill_days
    # most recent n_waiting_days won't be est
    enddate_dt = dropdate_dt - timedelta(
        days=params["indicator"]["n_waiting_days"])
    startdate_dt = enddate_dt - timedelta(
        days=params["indicator"]["n_backfill_days"])
    enddate = str(enddate_dt.date())
    startdate = str(startdate_dt.date())
    dropdate = str(dropdate_dt.date())

    # now allow manual overrides
    if params["indicator"]["end_date"] is not None:
        enddate = params["indicator"]["end_date"]
    if params["indicator"]["start_date"] is not None:
        startdate = params["indicator"]['start_date']

    # print out information
    logger.info("Loaded params",
                startdate=startdate,
                enddate=enddate,
                dropdate=dropdate,
                n_backfill_days=params["indicator"]["n_backfill_days"],
                n_waiting_days=params["indicator"]["n_waiting_days"],
                geos=params["indicator"]["geos"],
                outpath=params["common"]["export_dir"],
                parallel=params["indicator"]["parallel"],
                weekday=params["indicator"]["weekday"],
                write_se=params["indicator"]["write_se"])

    # generate indicator csvs
    for geo in params["indicator"]["geos"]:
        for weekday in params["indicator"]["weekday"]:
            if weekday:
                logger.info("starting weekday adj", geo=geo)
            else:
                logger.info("starting no weekday adj", geo=geo)

            signal_name = Config.signal_weekday_name if weekday else Config.signal_name
            if params["indicator"]["write_se"]:
                assert params["indicator"]["obfuscated_prefix"] is not None, \
                    "supply obfuscated prefix in params.json"
                signal_name = params["indicator"][
                    "obfuscated_prefix"] + "_" + signal_name

            logger.info("Updating signal name", signal_name=signal_name)
            updater = ClaimsHospIndicatorUpdater(
                startdate, enddate, dropdate, geo,
                params["indicator"]["parallel"], weekday,
                params["indicator"]["write_se"], signal_name)
            updater.update_indicator(params["indicator"]["input_file"],
                                     params["common"]["export_dir"])
        logger.info("finished updating", geo=geo)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
Example 12
def run_module(params):
    """
    Run Google Symptoms module.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "export_start_date": str, YYYY-MM-DD format, date from which to export data
        - "num_export_days": int, number of days before end date (today) to export
        - "path_to_bigquery_credentials": str, path to BigQuery API key and service account
            JSON file
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None

    export_start_date = datetime.strptime(
        params["indicator"]["export_start_date"], "%Y-%m-%d")
    export_dir = params["common"]["export_dir"]
    num_export_days = params["indicator"].get("num_export_days", "all")

    logger = get_structured_logger(
        __name__, filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    # Pull GS data
    dfs = pull_gs_data(params["indicator"]["bigquery_credentials"],
                       export_start_date,
                       num_export_days)
    gmpr = geomap.GeoMapper()

    for geo_res in GEO_RESOLUTIONS:
        if geo_res == "state":
            df_pull = dfs["state"]
        elif geo_res in ["hhs", "nation"]:
            df_pull = gmpr.replace_geocode(dfs["county"], "fips", geo_res, from_col="geo_id",
                                           date_col="timestamp")
            df_pull.rename(columns={geo_res: "geo_id"}, inplace=True)
        else:
            df_pull = geo_map(dfs["county"], geo_res)

        if len(df_pull) == 0:
            continue
        for metric, smoother in product(
                METRICS+[COMBINED_METRIC], SMOOTHERS):
            print(geo_res, metric, smoother)
            df = df_pull.set_index(["timestamp", "geo_id"])
            df["val"] = df[metric].groupby(level=1
                                           ).transform(SMOOTHERS_MAP[smoother][0])
            df["se"] = np.nan
            df["sample_size"] = np.nan
            # Drop early entries where data insufficient for smoothing
            df = df.loc[~df["val"].isnull(), :]
            df = df.reset_index()
            sensor_name = "_".join([smoother, "search"])

            if len(df) == 0:
                continue
            exported_csv_dates = create_export_csv(
                df,
                export_dir=export_dir,
                start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
                metric=metric.lower(),
                geo_res=geo_res,
                sensor=sensor_name)

            if not exported_csv_dates.empty:
                csv_export_count += exported_csv_dates.size
                if not oldest_final_export_date:
                    oldest_final_export_date = max(exported_csv_dates)
                oldest_final_export_date = min(
                    oldest_final_export_date, max(exported_csv_dates))

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.now() - oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime(
            "%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
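
The oldest_final_export_date bookkeeping above keeps the minimum over the per-signal maxima, i.e. the last export date of the stalest signal, which then drives max_lag_in_days. A compact sketch with invented dates:

from datetime import datetime

final_dates = [datetime(2021, 3, 9), datetime(2021, 3, 7), datetime(2021, 3, 8)]
oldest_final_export_date = None
for final_date in final_dates:               # stands in for max(exported_csv_dates) per signal
    if not oldest_final_export_date:
        oldest_final_export_date = final_date
    oldest_final_export_date = min(oldest_final_export_date, final_date)
print(oldest_final_export_date.date())       # 2021-03-07 -- the stalest signal's last export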
Example 13
def run_module(params: Dict[str, Any]):
    """Run the quidel_covidtest indicator.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - indicator":
        - "static_file_dir": str, directory name with population information
        - "input_cache_dir": str, directory in which to cache input data
        - "export_start_date": str, YYYY-MM-DD format of earliest date to create output
        - "export_end_date": str, YYYY-MM-DD format of latest date to create output or "" to create
                             through the present
        - "pull_start_date": str, YYYY-MM-DD format of earliest date to pull input
        - "pull_end_date": str, YYYY-MM-DD format of latest date to create output or "" to create
                           through the present
        - "aws_credentials": Dict[str, str], authentication parameters for AWS S3; see S3
                             documentation
        - "bucket_name": str, name of AWS bucket in which to find data
        - "wip_signal": List[str], list of signal names that are works in progress
        - "test_mode": bool, whether we are running in test mode
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    atexit.register(log_exit, start_time, logger)
    cache_dir = params["indicator"]["input_cache_dir"]
    export_dir = params["common"]["export_dir"]
    export_start_date = params["indicator"]["export_start_date"]
    export_end_date = params["indicator"]["export_end_date"]

    # Pull data and update export date
    df, _end_date = pull_quidel_covidtest(params["indicator"])
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_date = check_export_end_date(export_end_date, _end_date,
                                            END_FROM_TODAY_MINUS)
    export_start_date = check_export_start_date(export_start_date,
                                                export_end_date,
                                                EXPORT_DAY_RANGE)

    first_date, last_date = df["timestamp"].min(), df["timestamp"].max()

    # State Level
    data = df.copy()
    state_groups = geo_map("state", data).groupby("state_id")

    # Add prefix, if required
    sensors = add_prefix(SENSORS,
                         wip_signal=params["indicator"]["wip_signal"],
                         prefix="wip_")
    smoothers = SMOOTHERS.copy()

    for sensor in sensors:
        # For State Level
        print("state", sensor)
        if sensor.endswith(SMOOTHED_POSITIVE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_POSITIVE)
        elif sensor.endswith(RAW_POSITIVE):
            smoothers[sensor] = smoothers.pop(RAW_POSITIVE)
        elif sensor.endswith(SMOOTHED_TEST_PER_DEVICE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_TEST_PER_DEVICE)
        else:
            smoothers[sensor] = smoothers.pop(RAW_TEST_PER_DEVICE)
        state_df = generate_sensor_for_states(state_groups,
                                              smooth=smoothers[sensor][1],
                                              device=smoothers[sensor][0],
                                              first_date=first_date,
                                              last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_date,
                          end_date=export_end_date)

    # County/HRR/MSA level
    for geo_res in GEO_RESOLUTIONS:
        geo_data, res_key = geo_map(geo_res, data)
        for sensor in sensors:
            print(geo_res, sensor)
            res_df = generate_sensor_for_other_geores(
                state_groups,
                geo_data,
                res_key,
                smooth=smoothers[sensor][1],
                device=smoothers[sensor][0],
                first_date=first_date,
                last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_date,
                              end_date=export_end_date,
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(df, _end_date, cache_dir)
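
The smoothers re-keying above handles the case where add_prefix returned prefixed sensor names while SMOOTHERS is still keyed by the base names; popping the base key and reinserting it under the (possibly prefixed) sensor keeps later lookups by sensor working. A toy illustration with invented names:

smoothers = {"smoothed_pct_positive": ("device_flag", "smooth_fn")}   # hypothetical base entry
sensor = "wip_smoothed_pct_positive"                                  # hypothetical prefixed name
if sensor.endswith("smoothed_pct_positive"):
    smoothers[sensor] = smoothers.pop("smoothed_pct_positive")
print(smoothers)   # {'wip_smoothed_pct_positive': ('device_flag', 'smooth_fn')}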
Example 14
def run_module(params):
    """Run module for Safegraph patterns data.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "aws_access_key_id": str, ID of access key for AWS S3
        - "aws_secret_access_key": str, access key for AWS S3
        - "aws_default_region": str, name of AWS S3 region
        - "aws_endpoint": str, name of AWS S3 endpoint
        - "n_core": int, number of cores to use for multithreaded processing
        - "raw_data_dir": directory from which to read downloaded data from AWS,
        - "static_file_dir": str, directory containing brand and population csv files
        - "sync": bool, whether to sync S3 data before running indicator
    """
    start_time = time.time()
    export_dir = params["common"]["export_dir"]
    raw_data_dir = params["indicator"]["raw_data_dir"]
    n_core = params["indicator"]["n_core"]
    aws_endpoint = params["indicator"]["aws_endpoint"]
    static_file_dir = params["indicator"]["static_file_dir"]
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    env_vars = {
        'AWS_ACCESS_KEY_ID': params["indicator"]["aws_access_key_id"],
        'AWS_SECRET_ACCESS_KEY': params["indicator"]["aws_secret_access_key"],
        'AWS_DEFAULT_REGION': params["indicator"]["aws_default_region"],
    }

    for ver in VERSIONS:
        # Update raw data
        # Why call subprocess rather than using a native Python client, e.g. boto3?
        # Because boto3 does not have a simple rsync-like call that can perform
        # the following behavior elegantly.
        if params["indicator"]["sync"]:
            subprocess.run(
                f'aws s3 sync s3://sg-c19-response/{ver[1]}/ '
                f'{raw_data_dir}/{ver[1]}/ --endpoint {aws_endpoint}',
                env=env_vars,
                shell=True,
                check=True)

        brand_df = pd.read_csv(
            join(static_file_dir, f"brand_info/brand_info_{ver[0]}.csv"))

        files = glob.glob(f'{raw_data_dir}/{ver[1]}/{ver[2]}', recursive=True)

        process_file = partial(
            process,
            brand_df=brand_df,
            metrics=METRICS,
            sensors=SENSORS,
            geo_resolutions=GEO_RESOLUTIONS,
            export_dir=export_dir,
        )

        with mp.Pool(n_core) as pool:
            pool.map(process_file, files)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
Example 15
def run_module():
    """Run the JHU indicator module."""
    params = read_params()
    export_start_date = params["export_start_date"]
    export_dir = params["export_dir"]
    base_url = params["base_url"]
    cache_dir = params["cache_dir"]
    logger = get_structured_logger(__name__,
                                   filename=params.get("log_filename"))

    if len(params["bucket_name"]) > 0:
        arch_diff = S3ArchiveDiffer(
            cache_dir,
            export_dir,
            params["bucket_name"],
            "jhu",
            params["aws_credentials"],
        )
        arch_diff.update_cache()
    else:
        arch_diff = None

    gmpr = GeoMapper()
    dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
    for metric, geo_res, sensor, smoother in product(METRICS, GEO_RESOLUTIONS,
                                                     SENSORS, SMOOTHERS):
        print(metric, geo_res, sensor, smoother)
        logger.info(event="generating signal and exporting to CSV",
                    metric=metric,
                    geo_res=geo_res,
                    sensor=sensor,
                    smoother=smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res)
        df.set_index(["timestamp", "geo_id"], inplace=True)
        df["val"] = df[sensor].groupby(level=1).transform(
            SMOOTHERS_MAP[smoother][0])
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df[~df["val"].isnull()]
        df = df.reset_index()
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        create_export_csv(
            df,
            export_dir=export_dir,
            start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )

    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [
            f for f, diff in common_diffs.items() if diff is not None
        ]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff
            for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")
Example 16
def run_module(params: Dict[str, Any]):
    """Run the JHU indicator module.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "base_url": str, URL from which to read upstream data
        - "export_start_date": str, date from which to export data in YYYY-MM-DD format
    - "archive" (optional): if provided, output will be archived with S3
        - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
        - "bucket_name: str, name of S3 bucket to read/write
        - "cache_dir": str, directory of locally cached data
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None
    export_start_date = params["indicator"]["export_start_date"]
    export_dir = params["common"]["export_dir"]
    base_url = params["indicator"]["base_url"]
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    if "archive" in params:
        arch_diff = S3ArchiveDiffer(
            params["archive"]["cache_dir"],
            export_dir,
            params["archive"]["bucket_name"],
            "jhu",
            params["archive"]["aws_credentials"],
        )
        arch_diff.update_cache()
    else:
        arch_diff = None

    gmpr = GeoMapper()
    dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
    for metric, geo_res, sensor, smoother in product(METRICS, GEO_RESOLUTIONS,
                                                     SENSORS, SMOOTHERS):
        print(metric, geo_res, sensor, smoother)
        logger.info(event="generating signal and exporting to CSV",
                    metric=metric,
                    geo_res=geo_res,
                    sensor=sensor,
                    smoother=smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res, sensor)
        df.set_index(["timestamp", "geo_id"], inplace=True)
        df["val"] = df[sensor].groupby(level=1).transform(
            SMOOTHERS_MAP[smoother][0])
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df[~df["val"].isnull()]
        df = df.reset_index()
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        exported_csv_dates = create_export_csv(
            df,
            export_dir=export_dir,
            start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )
        if not exported_csv_dates.empty:
            csv_export_count += exported_csv_dates.size
            if not oldest_final_export_date:
                oldest_final_export_date = max(exported_csv_dates)
            oldest_final_export_date = min(oldest_final_export_date,
                                           max(exported_csv_dates))

    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [
            f for f, diff in common_diffs.items() if diff is not None
        ]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff
            for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.now() - oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime(
            "%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
Example 17
def run_module(params: Dict[str, Any]):
    """Run Quidel flu test module.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - indicator":
        - "static_file_dir": str, directory name with population information
        - "input_cache_dir": str, directory in which to cache input data
        - "export_start_date": str, YYYY-MM-DD format of earliest date to create output
        - "export_end_date": str, YYYY-MM-DD format of latest date to create output or "" to create
                             through the present
        - "pull_start_date": str, YYYY-MM-DD format of earliest date to pull input
        - "pull_end_date": str, YYYY-MM-DD format of latest date to create output or "" to create
                           through the present
        - "aws_credentials": Dict[str, str], authentication parameters for AWS S3; see S3
                             documentation
        - "bucket_name": str, name of AWS bucket in which to find data
        - "wip_signal": List[str], list of signal names that are works in progress
        - "test_mode": bool, whether we are running in test mode
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    cache_dir = params["indicator"]["input_cache_dir"]
    export_dir = params["common"]["export_dir"]
    static_file_dir = params["indicator"]["static_file_dir"]
    export_start_dates = params["indicator"]["export_start_date"]
    export_end_dates = params["indicator"]["export_end_date"]
    map_df = pd.read_csv(join(static_file_dir, "fips_prop_pop.csv"),
                         dtype={"fips": int})

    # Pull data and update export date
    dfs, _end_date = pull_quidel_data(params["indicator"])
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_dates = check_export_end_date(export_end_dates, _end_date,
                                             END_FROM_TODAY_MINUS)
    export_start_dates = check_export_start_date(export_start_dates,
                                                 export_end_dates,
                                                 EXPORT_DAY_RANGE)

    # Add prefix, if required
    sensors = add_prefix(list(SENSORS.keys()),
                         wip_signal=params["indicator"]["wip_signal"],
                         prefix="wip_")

    for sensor in sensors:
        # Check either covid_ag or flu_ag
        test_type = "covid_ag" if "covid_ag" in sensor else "flu_ag"
        print("state", sensor)
        data = dfs[test_type].copy()
        state_groups = geo_map("state", data, map_df).groupby("state_id")
        first_date, last_date = data["timestamp"].min(), data["timestamp"].max(
        )

        # For State Level
        state_df = generate_sensor_for_states(state_groups,
                                              smooth=SENSORS[sensor][1],
                                              device=SENSORS[sensor][0],
                                              first_date=first_date,
                                              last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_dates[test_type],
                          end_date=export_end_dates[test_type])

        # County/HRR/MSA level
        for geo_res in GEO_RESOLUTIONS:
            print(geo_res, sensor)
            data = dfs[test_type].copy()
            data, res_key = geo_map(geo_res, data, map_df)
            res_df = generate_sensor_for_other_geores(
                state_groups,
                data,
                res_key,
                smooth=SENSORS[sensor][1],
                device=SENSORS[sensor][0],
                first_date=first_date,
                last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_dates[test_type],
                              end_date=export_end_dates[test_type],
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(dfs, _end_date, cache_dir)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)