Code example #1
    def test_update_sensor(self):
        """Tests that the sensors are properly updated."""
        for geo in ["state", "hrr"]:
            td = TemporaryDirectory()
            su_inst = CHCSensorUpdator(
                "02-01-2020",
                "06-01-2020",
                "06-12-2020",
                geo,
                self.parallel,
                self.weekday,
                self.numtype,
                self.se
            )

            with mock_s3():
                # Create the fake bucket we will be using
                params = read_params()
                aws_credentials = params["aws_credentials"]
                s3_client = Session(**aws_credentials).client("s3")
                s3_client.create_bucket(Bucket=params["bucket_name"])
                su_inst.update_sensor(
                    self.small_test_data,
                    td.name)

            assert len(os.listdir(td.name)) == len(su_inst.sensor_dates),\
                f"failed {geo} update sensor test"
            td.cleanup()
Code example #2
def run_as_module(date):
    # Clean directories
    for fname in listdir("receiving"):
        if ".csv" in fname:
            remove(join("receiving", fname))

    for fname in listdir("cache"):
        if ".csv" in fname:
            remove(join("cache", fname))

    for fname in listdir("daily_cache"):
        if ".csv" in fname:
            remove(join("daily_cache", fname))

    # Simulate the cache already being partially populated
    copy("test_data/weekly_202025_state_wip_deaths_covid_incidence_prop.csv",
         "daily_cache")

    for fname in listdir("daily_receiving"):
        if ".csv" in fname:
            remove(join("daily_receiving", fname))

    with mock_s3():
        with freeze_time(date):
            # Create the fake bucket we will be using
            params = read_params()
            aws_credentials = params["aws_credentials"]
            s3_client = Session(**aws_credentials).client("s3")
            s3_client.create_bucket(Bucket=params["bucket_name"])

            run_module()
Code example #3
    def test_match_old_smoothed_output(self,
                                       run_as_module,
                                       wip_signal=read_params()["wip_signal"]):
        """Tests that smooth output files don't change over time."""
        if wip_signal:
            files = [
                "20200419_hrr_wip_smoothed_search.csv",
                "20200419_msa_wip_smoothed_search.csv",
                "20200419_state_wip_smoothed_search.csv",
                "20200419_dma_wip_smoothed_search.csv",
            ]
        else:
            files = [
                "20200419_hrr_smoothed_search.csv",
                "20200419_msa_smoothed_search.csv",
                "20200419_state_smoothed_search.csv",
                "20200419_dma_smoothed_search.csv",
            ]

        for fname in files:
            test_df = pd.read_csv(join("receiving_test", fname))
            new_df = pd.read_csv(join("receiving", fname))

            assert_frame_equal(test_df, new_df)
Code example #4
def run_module():
    """Run the validator as a module."""
    parent_params = read_params()
    params = parent_params['validation']

    validator = Validator(params)
    validator.validate(parent_params["export_dir"]).print_and_exit()
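A note on the params shape this validator example implies: it reads a top-level export_dir plus a nested validation block. A minimal sketch of that structure (the illustrative values and the empty validation block are assumptions, not the actual schema):

# Shape implied by run_module() above; illustrative values are assumptions.
parent_params = {
    "export_dir": "./receiving",   # directory handed to validator.validate()
    "validation": {
        # validator-specific settings consumed by Validator(params);
        # the concrete keys are not shown in the example above
    },
}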
Code example #5
    def test_class(self,
                   run_as_module,
                   wip_signal=read_params()["wip_signal"]):
        """Tests output file existence."""
        if wip_signal:
            assert exists(join("receiving", "20200419_hrr_wip_raw_search.csv"))
            assert exists(join("receiving", "20200419_msa_wip_raw_search.csv"))
            assert exists(
                join("receiving", "20200419_state_wip_raw_search.csv"))
            assert exists(join("receiving", "20200419_dma_wip_raw_search.csv"))

            assert exists(join("receiving", "20200315_hrr_wip_raw_search.csv"))
            assert exists(join("receiving", "20200315_msa_wip_raw_search.csv"))
            assert exists(
                join("receiving", "20200315_state_wip_raw_search.csv"))
            assert exists(join("receiving", "20200315_dma_wip_raw_search.csv"))
        else:
            assert exists(join("receiving", "20200419_hrr_raw_search.csv"))
            assert exists(join("receiving", "20200419_msa_raw_search.csv"))
            assert exists(join("receiving", "20200419_state_raw_search.csv"))
            assert exists(join("receiving", "20200419_dma_raw_search.csv"))

            assert exists(join("receiving", "20200315_hrr_raw_search.csv"))
            assert exists(join("receiving", "20200315_msa_raw_search.csv"))
            assert exists(join("receiving", "20200315_state_raw_search.csv"))
            assert exists(join("receiving", "20200315_dma_raw_search.csv"))
Code example #6
    def test_pull_quidel_data(self):

        params = read_params()

        dfs, _ = pull_quidel_data(params)

        # For covid_ag
        df = dfs["covid_ag"]
        first_date = df["timestamp"].min().date()
        last_date = df["timestamp"].max().date()

        assert [first_date.month, first_date.day] == [7, 2]
        assert [last_date.month, last_date.day] == [7, 23]
        assert (df.columns == [
            'timestamp', 'zip', 'totalTest', 'numUniqueDevices', 'positiveTest'
        ]).all()

        # For flu_ag
        df = dfs["flu_ag"]
        first_date = df["timestamp"].min().date()
        last_date = df["timestamp"].max().date()

        assert [first_date.month, first_date.day] == [6, 22]
        assert [last_date.month, last_date.day] == [8, 17]
        assert (df.columns == [
            'timestamp', 'zip', 'totalTest', 'numUniqueDevices', 'positiveTest'
        ]).all()
Code example #7
def update_sensor(
        state_files: List[str],
        mmwr_info: pd.DataFrame,
        output_path: str,
        start_date: datetime,
        end_date: datetime) -> pd.DataFrame:
    """
    Generate sensor values and write them to CSV format.

    Args:
        state_files: List of JSON files representing COVID-NET hospitalization data for each state
        mmwr_info: Mappings from MMWR week to actual dates, as a pd.DataFrame
        output_path: Path to write the csvs to
        start_date: First sensor date (datetime.datetime)
        end_date: Last sensor date (datetime.datetime)

    Returns:
        The overall pd.DataFrame after all processing
    """
    assert start_date < end_date, "start_date >= end_date"

    # Combine and format hospitalizations dataframe
    hosp_df = CovidNet.read_all_hosp_data(state_files)
    hosp_df = hosp_df.merge(mmwr_info, how="left",
                            left_on=["mmwr-year", "mmwr-week"],
                            right_on=["year", "weeknumber"])

    # Select relevant columns and standardize naming
    hosp_df = hosp_df.loc[:, APIConfig.HOSP_RENAME_COLS.keys()]\
        .rename(columns=APIConfig.HOSP_RENAME_COLS)

    # Restrict to start and end date
    hosp_df = hosp_df[
        (hosp_df["date"] >= start_date) & (
            hosp_df["date"] < end_date)
    ]

    # Set state id to two-letter abbreviation
    gmpr = GeoMapper()
    hosp_df = gmpr.add_geocode(hosp_df,
                               from_col=APIConfig.STATE_COL,
                               from_code="state_name",
                               new_code="state_id",
                               dropna=False)
    # To use the original column name, reassign original column and drop new one
    hosp_df[APIConfig.STATE_COL] = hosp_df["state_id"]
    hosp_df.drop("state_id", axis=1, inplace=True)
    assert not hosp_df.duplicated(["date", "geo_id"]).any(), "Non-unique (date, geo_id) pairs"
    hosp_df.set_index(["date", "geo_id"], inplace=True)

    # Fill in remaining expected columns
    hosp_df["se"] = np.nan
    hosp_df["sample_size"] = np.nan

    # Write results
    signals = add_prefix(SIGNALS, wip_signal=read_params()["wip_signal"], prefix="wip_")
    for signal in signals:
        write_to_csv(hosp_df, signal, output_path)
    return hosp_df
Code example #8
def run_module():
    """
    Calls the method for handling the wip signals
    Returns
    -------
    prints the updated signal names
    """
    params = read_params()
    wip_signal = params["wip_signal"]
    signal_names = add_prefix(SIGNALS, wip_signal, prefix="wip_")
    print(signal_names)
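Judging from the output file names in code examples #3 and #5 (wip_raw_search vs. raw_search), add_prefix marks work-in-progress signals by prepending "wip_" when wip_signal is enabled. The sketch below is only an approximation inferred from these examples, not the actual delphi_utils.add_prefix implementation:

# Approximation only -- inferred from the surrounding examples,
# not the real delphi_utils code.
from typing import List, Union

def add_prefix_sketch(signal_names: List[str],
                      wip_signal: Union[bool, List[str]],
                      prefix: str = "wip_") -> List[str]:
    """Prefix work-in-progress signal names with `prefix`."""
    if wip_signal is True:
        return [prefix + name for name in signal_names]
    if isinstance(wip_signal, list):
        return [prefix + name if name in wip_signal else name
                for name in signal_names]
    return list(signal_names)

# add_prefix_sketch(["smoothed_search"], wip_signal=True) == ["wip_smoothed_search"]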
Code example #9
def run_module() -> None:
    """Run entire hhs_facilities indicator."""
    params = read_params()
    raw_df = pull_data()
    gmpr = GeoMapper()
    filled_fips_df = fill_missing_fips(raw_df, gmpr)
    for geo, (sig_name, sig_cols, sig_func,
              sig_offset) in product(GEO_RESOLUTIONS, SIGNALS):
        mapped_df = convert_geo(filled_fips_df, geo, gmpr)
        output_df = generate_signal(mapped_df, sig_cols, sig_func, sig_offset)
        create_export_csv(output_df, params["export_dir"], geo, sig_name)
Code example #10
def run_as_module():
    # Clean receiving directory
    for fname in listdir("receiving"):
        if fname != ".gitignore":
            remove(join("receiving", fname))
    with mock_s3():
        # Create the fake bucket we will be using
        params = read_params()
        aws_credentials = params["aws_credentials"]
        s3_client = Session(**aws_credentials).client("s3")
        s3_client.create_bucket(Bucket=params["bucket_name"])
        run_module()
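Several of the tests shown here (code examples #3, #5, #12, #20) declare run_as_module as a test argument, which suggests helpers like the one above are exposed as pytest fixtures. A hedged sketch of that wiring; the fixture scope and the conftest.py placement are assumptions:

# conftest.py -- sketch only; scope and placement are assumptions.
import pytest

@pytest.fixture(scope="session")
def run_as_module():
    # Clean "receiving", mock S3, and call run_module() once,
    # exactly as in code example #10 above, before the tests inspect the output.
    ...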
Code example #11
File: run.py Project: sgsmob/covidcast-indicators
def run_module():
    params = read_params()
    qparams = params['qualtrics']
    qparams['qualtrics_dir'] = params['input_dir']

    if not os.path.exists(qparams['qualtrics_dir']):
        os.makedirs(qparams['qualtrics_dir'])

    if not qparams['token']:
        print("\nDRY-RUN MODE\n")
    fetch, post = make_fetchers(qparams)

    get(fetch, post, qparams)
Code example #12
    def test_output_files(self, run_as_module):
        """Tests that the output files contain the correct results of the run."""
        params = read_params()
        # Test output exists
        csv_files = listdir("receiving")

        dates_for_covid_ag = [
            "20200702", "20200703", "20200704", "20200705", "20200706",
            "20200707", "20200708", "20200709"
        ]

        dates_for_flu_ag = [
            "20200623", "20200624", "20200625", "20200626", "20200627",
            "20200628", "20200629", "20200630", "20200701", "20200702",
            "20200703"
        ]

        geos = GEO_RESOLUTIONS.copy()
        sensors = add_prefix(list(SENSORS.keys()),
                             wip_signal=params["wip_signal"],
                             prefix="wip_")

        expected_files = []
        for geo in geos:
            for sensor in sensors:
                if "covid_ag" in sensor:
                    for date in dates_for_covid_ag:
                        expected_files += [
                            date + "_" + geo + "_" + sensor + ".csv"
                        ]
                else:
                    for date in dates_for_flu_ag:
                        expected_files += [
                            date + "_" + geo + "_" + sensor + ".csv"
                        ]

        assert set(expected_files).issubset(set(csv_files))

        # Test output format
        df = pd.read_csv(
            join("./receiving",
                 "20200709_state_covid_ag_raw_pct_positive.csv"))
        assert (df.columns.values == ["geo_id", "val", "se",
                                      "sample_size"]).all()

        # test_intermediate_file
        flag = None
        for fname in listdir("./cache"):
            if ".csv" in fname:
                flag = 1
        assert flag is not None
Code example #13
    def test_pull_quidel_covidtest(self):

        params = read_params()

        df, _ = pull_quidel_covidtest(params)

        first_date = df["timestamp"].min().date()
        last_date = df["timestamp"].max().date()

        assert [first_date.month, first_date.day] == [7, 2]
        assert [last_date.month, last_date.day] == [7, 23]
        assert (df.columns == [
            'timestamp', 'zip', 'totalTest', 'numUniqueDevices', 'positiveTest'
        ]).all()
Code example #14
    def __init__(self,
                 startdate,
                 enddate,
                 dropdate,
                 geo,
                 parallel,
                 weekday,
                 numtype,
                 se):
        """Init Sensor Updator.

        Args:
            startdate: first sensor date (YYYY-mm-dd)
            enddate: last sensor date (YYYY-mm-dd)
            dropdate: data drop date (YYYY-mm-dd)
            geo: geographic resolution, one of ["county", "state", "msa", "hrr", "hhs", "nation"]
            parallel: boolean to run the sensor update in parallel
            weekday: boolean to adjust for weekday effects
            numtype: type of count data used, one of ["covid", "cli"]
            se: boolean to write out standard errors, if true, use an obfuscated name
        """
        self.startdate, self.enddate, self.dropdate = [
            pd.to_datetime(t) for t in (startdate, enddate, dropdate)]
        # handle dates
        assert (self.startdate > (Config.FIRST_DATA_DATE + Config.BURN_IN_PERIOD)
                ), f"not enough data to produce estimates starting {self.startdate}"
        assert self.startdate < self.enddate, "start date >= end date"
        assert self.enddate <= self.dropdate, "end date > drop date"
        self.geo, self.parallel, self.weekday, self.numtype, self.se = geo.lower(), parallel, \
                                                                       weekday, numtype, se

        # output file naming
        if self.numtype == "covid":
            signals = [SMOOTHED_ADJ if self.weekday else SMOOTHED]
        elif self.numtype == "cli":
            signals = [SMOOTHED_ADJ_CLI if self.weekday else SMOOTHED_CLI]
        signal_names = add_prefix(
            signals,
            wip_signal=read_params()["wip_signal"])
        self.updated_signal_names = signal_names

        # initialize members set in shift_dates().
        self.burnindate = None
        self.fit_dates = None
        self.burn_in_dates = None
        self.sensor_dates = None
Code example #15
def configure(variants):
    """Validate params file and set date range."""
    params = read_params()
    params['export_start_date'] = date(*params['export_start_date'])
    yesterday = date.today() - timedelta(days=1)
    if params['date_range'] == 'new':
        # only create combined file for the newest update
        # (usually for yesterday, but check just in case)
        params['date_range'] = [
            min(
                yesterday,
                next_missing_day(params["source"],
                                 set(signal[-1] for signal in variants))),
            yesterday
        ]
    elif params['date_range'] == 'all':
        # create combined files for all of the historical reports
        params['date_range'] = [params['export_start_date'], yesterday]
    else:
        match_res = re.findall(re.compile(r'^\d{8}-\d{8}$'),
                               params['date_range'])
        if len(match_res) == 0:
            raise ValueError(
                "Invalid date_range parameter. Please choose from (new, all, yyyymmdd-yyyymmdd)."
            )
        try:
            date1 = datetime.strptime(params['date_range'][:8],
                                      '%Y%m%d').date()
        except ValueError as error:
            raise ValueError(
                "Invalid date_range parameter. Please check the first date."
            ) from error
        try:
            date2 = datetime.strptime(params['date_range'][-8:],
                                      '%Y%m%d').date()
        except ValueError as error:
            raise ValueError(
                "Invalid date_range parameter. Please check the second date."
            ) from error

        # Clamp to the valid export start date
        if date1 < params['export_start_date']:
            date1 = params['export_start_date']
        params['date_range'] = [date1, date2]
    return params
Code example #16
def run_module():
    """Run Google Symptoms module."""
    params = read_params()
    export_start_date = datetime.strptime(params["export_start_date"],
                                          "%Y-%m-%d")
    export_dir = params["export_dir"]
    base_url = params["base_url"]

    # Pull GS data
    dfs = pull_gs_data(base_url)
    gmpr = geomap.GeoMapper()
    for geo_res in GEO_RESOLUTIONS:
        if geo_res == "state":
            df_pull = dfs["state"]
        elif geo_res in ["hhs", "nation"]:
            df_pull = gmpr.replace_geocode(dfs["county"],
                                           "fips",
                                           geo_res,
                                           from_col="geo_id",
                                           date_col="timestamp")
            df_pull.rename(columns={geo_res: "geo_id"}, inplace=True)
        else:
            df_pull = geo_map(dfs["county"], geo_res)
        for metric, smoother in product(METRICS + [COMBINED_METRIC],
                                        SMOOTHERS):
            print(geo_res, metric, smoother)
            df = df_pull.set_index(["timestamp", "geo_id"])
            df["val"] = df[metric].groupby(level=1).transform(
                SMOOTHERS_MAP[smoother][0])
            df["se"] = np.nan
            df["sample_size"] = np.nan
            # Drop early entries where data insufficient for smoothing
            df = df.loc[~df["val"].isnull(), :]
            df = df.reset_index()
            sensor_name = "_".join([smoother, "search"])
            create_export_csv(
                df,
                export_dir=export_dir,
                start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
                metric=metric.lower(),
                geo_res=geo_res,
                sensor=sensor_name)
Code example #17
def run_module():
    """Parse parameters and generates csv files for the COVID-NET sensor."""
    params = read_params()

    logging.basicConfig(level=logging.DEBUG)

    start_date = datetime.strptime(params["start_date"], "%Y-%m-%d")

    # If no end_date is specified, assume it is the current date
    if params["end_date"] == "":
        end_date = datetime.now()
    else:
        end_date = datetime.strptime(params["end_date"], "%Y-%m-%d")

    logging.info("start date:\t%s", start_date.date())
    logging.info("end date:\t%s", end_date.date())

    logging.info("outpath:\t%s", params["export_dir"])
    logging.info("parallel:\t%s", params["parallel"])

    # Only geo is state, and no weekday adjustment for now
    # COVID-NET data is by weeks anyway, not daily
    logging.info("starting state, no adj")

    # Download latest COVID-NET files into the cache directory first
    mappings_file = join(params["cache_dir"], "init.json")
    CovidNet.download_mappings(outfile=mappings_file)
    _, mmwr_info, _ = CovidNet.read_mappings(mappings_file)
    state_files = CovidNet.download_all_hosp_data(mappings_file,
                                                  params["cache_dir"],
                                                  parallel=params["parallel"])

    update_sensor(state_files, mmwr_info, params["export_dir"], start_date,
                  end_date)

    # Cleanup cache dir
    remove(mappings_file)
    for state_file in state_files:
        remove(state_file)

    logging.info("finished all")
Code example #18
File: run.py Project: sgsmob/covidcast-indicators
def run_module():
    start_time = time.time()
    params = read_params()
    meta = covidcast.metadata()
    slack_notifier = None
    if "channel" in params and "slack_token" in params:
        slack_notifier = SlackNotifier(params["channel"],
                                       params["slack_token"])

    complaints = []
    for data_source in params["sources"].keys():
        complaints.extend(
            check_source(data_source, meta, params["sources"],
                         params.get("grace", 0), LOGGER))

    if len(complaints) > 0:
        report_complaints(complaints, slack_notifier)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    LOGGER.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
Code example #19
def run_module():
    """Generate ground truth HHS hospitalization data."""
    params = read_params()
    mapper = GeoMapper()
    request_all_states = ",".join(mapper.get_geo_values("state_id"))

    today = date.today()
    past_reference_day = date(year=2020, month=1,
                              day=1)  # first available date in DB
    date_range = generate_date_ranges(past_reference_day, today)
    dfs = []
    for r in date_range:
        response = Epidata.covid_hosp(request_all_states, r)
        if response['result'] != 1:
            raise Exception(f"Bad result from Epidata: {response['message']}")
        dfs.append(pd.DataFrame(response['epidata']))
    all_columns = pd.concat(dfs)

    for sig in SIGNALS:
        create_export_csv(make_signal(all_columns, sig), params["export_dir"],
                          "state", sig)
Code example #20
    def test_output_files(self, run_as_module):
        """Tests that the proper files are output."""

        # Test output exists
        csv_files = listdir("receiving")

        dates = [
            "20200702", "20200703", "20200704", "20200705", "20200706",
            "20200707", "20200708", "20200709"
        ]
        geos = GEO_RESOLUTIONS.copy()
        sensors = add_prefix(SENSORS,
                             wip_signal=read_params()["wip_signal"],
                             prefix="wip_")

        expected_files = []
        for date in dates:
            for geo in geos:
                for sensor in sensors:
                    expected_files += [
                        date + "_" + geo + "_" + sensor + ".csv"
                    ]

        assert set(expected_files).issubset(set(csv_files))

        # Test output format
        df = pd.read_csv(
            join("./receiving",
                 "20200709_state_covid_ag_raw_pct_positive.csv"))
        assert (df.columns.values == ["geo_id", "val", "se",
                                      "sample_size"]).all()

        # test_intermediate_file
        flag = None
        for fname in listdir("./cache"):
            if ".csv" in fname:
                flag = 1
        assert flag is not None
Code example #21
def run_module():
    params = read_params()
    meta = covidcast.metadata()

    complaints = []
    for data_source in params["sources"].keys():
        complaints.extend(
            check_source(data_source, meta, params["sources"],
                         params.get("grace", 0), LOGGER))

    if len(complaints) > 0:
        for complaint in complaints:
            LOGGER.critical(
                event="signal out of SLA",
                message=complaint.message,
                data_source=complaint.data_source,
                signal=complaint.signal,
                geo_types=complaint.geo_types,
                last_updated=complaint.last_updated.strftime("%Y-%m-%d"))

        report_complaints(complaints, params)

        sys.exit(1)
Code example #22
def run_module():
    """Run the delphi_changehc module."""
    params = read_params()

    logging.basicConfig(level=logging.DEBUG)

    make_asserts(params)

    if params["drop_date"] is None:
        # files are dropped about 4pm the day after the issue date
        dropdate_dt = (datetime.now() - timedelta(days=1, hours=16))
        dropdate_dt = dropdate_dt.replace(hour=0,
                                          minute=0,
                                          second=0,
                                          microsecond=0)
    else:
        dropdate_dt = datetime.strptime(params["drop_date"], "%Y-%m-%d")
    filedate = dropdate_dt.strftime("%Y%m%d")

    file_dict = retrieve_files(params, filedate)

    dropdate = str(dropdate_dt.date())

    # range of estimates to produce
    n_backfill_days = params["n_backfill_days"]  # produce estimates for n_backfill_days
    n_waiting_days = params["n_waiting_days"]  # most recent n_waiting_days won't be estimated
    enddate_dt = dropdate_dt - timedelta(days=n_waiting_days)
    startdate_dt = enddate_dt - timedelta(days=n_backfill_days)
    enddate = str(enddate_dt.date())
    startdate = str(startdate_dt.date())

    # now allow manual overrides
    if params["end_date"] is not None:
        enddate = params["end_date"]
    if params["start_date"] is not None:
        startdate = params["start_date"]

    logging.info("first sensor date:\t%s", startdate)
    logging.info("last sensor date:\t%s", enddate)
    logging.info("drop date:\t\t%s", dropdate)
    logging.info("n_backfill_days:\t%s", n_backfill_days)
    logging.info("n_waiting_days:\t%s", n_waiting_days)

    ## print out other vars
    logging.info("geos:\t\t\t%s", params["geos"])
    logging.info("outpath:\t\t%s", params["export_dir"])
    logging.info("parallel:\t\t%s", params["parallel"])
    logging.info("weekday:\t\t%s", params["weekday"])
    logging.info("types:\t\t%s", params["types"])
    logging.info("se:\t\t\t%s", params["se"])

    ## start generating
    for geo in params["geos"]:
        for numtype in params["types"]:
            for weekday in params["weekday"]:
                if weekday:
                    logging.info("starting %s, %s, weekday adj", geo, numtype)
                else:
                    logging.info("starting %s, %s, no adj", geo, numtype)
                su_inst = CHCSensorUpdator(startdate, enddate, dropdate, geo,
                                           params["parallel"], weekday,
                                           numtype, params["se"])
                if numtype == "covid":
                    data = load_combined_data(file_dict["denom"],
                                              file_dict["covid"], dropdate_dt,
                                              "fips")
                elif numtype == "cli":
                    data = load_cli_data(file_dict["denom"], file_dict["flu"],
                                         file_dict["mixed"],
                                         file_dict["flu_like"],
                                         file_dict["covid_like"], dropdate_dt,
                                         "fips")
                su_inst.update_sensor(data, params["export_dir"])
            logging.info("finished %s", geo)

    logging.info("finished all")
Code example #23
File: run.py Project: sgsmob/covidcast-indicators
def get_logger():
    params = read_params()
    return get_structured_logger(__name__,
                                 filename=params.get("log_filename"),
                                 log_exceptions=params.get(
                                     "log_exceptions", True))
Code example #24
    def test_return_params(self):
        params = read_params()
        assert params["test"] == "yes"
Code example #25
    def test_copy_template(self):
        os.remove("params.json")
        params = read_params()
        assert params["test"] == "yes"
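Taken together, code examples #24 and #25 pin down the minimal contract these tests place on read_params(): it returns the contents of params.json, and (per test_copy_template) it still succeeds after params.json is deleted, implying the file is restored from a template. An illustrative round trip; only the asserted key is grounded in the tests above:

# Illustrative only -- the single key the two tests above assert on.
import json
from delphi_utils import read_params

with open("params.json", "w") as f:
    json.dump({"test": "yes"}, f)

assert read_params()["test"] == "yes"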
Code example #26
# -*- coding: utf-8 -*-
"""Call the function run_module when executed.

This file indicates that calling the module (`python -m MODULE_NAME`) will
call the function `run_module` found within the run.py file. There should be
no need to change this template.
"""

from delphi_utils import read_params
from .run import run_module  # pragma: no cover

run_module(read_params())  # pragma: no cover
Code example #27
def run_module():
    """Run the usafacts indicator."""
    params = read_params()
    export_start_date = params["export_start_date"]
    if export_start_date == "latest":
        export_start_date = datetime.combine(date.today(), time(
            0, 0)) - timedelta(days=1)
    else:
        export_start_date = datetime.strptime(export_start_date, "%Y-%m-%d")
    export_dir = params["export_dir"]
    base_url = params["base_url"]
    cache_dir = params["cache_dir"]

    arch_diff = S3ArchiveDiffer(cache_dir, export_dir, params["bucket_name"],
                                "usafacts", params["aws_credentials"])
    arch_diff.update_cache()

    geo_mapper = GeoMapper()

    dfs = {
        metric: pull_usafacts_data(base_url, metric, geo_mapper)
        for metric in METRICS
    }
    for metric, geo_res, sensor, smoother in product(METRICS, GEO_RESOLUTIONS,
                                                     SENSORS, SMOOTHERS):
        print(geo_res, metric, sensor, smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res, sensor)
        df["val"] = SMOOTHERS_MAP[smoother][0].smooth(df[sensor].values)
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df.loc[~df["val"].isnull(), :]
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        create_export_csv(
            df,
            export_dir=export_dir,
            start_date=SMOOTHERS_MAP[smoother][3](export_start_date),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )

    # Diff exports, and make incremental versions
    _, common_diffs, new_files = arch_diff.diff_exports()

    # Archive changed and new files only
    to_archive = [f for f, diff in common_diffs.items() if diff is not None]
    to_archive += new_files
    _, fails = arch_diff.archive_exports(to_archive)

    # Filter existing exports to exclude those that failed to archive
    succ_common_diffs = {
        f: diff
        for f, diff in common_diffs.items() if f not in fails
    }
    arch_diff.filter_exports(succ_common_diffs)

    # Report failures: someone should probably look at them
    for exported_file in fails:
        print(f"Failed to archive '{exported_file}'")
Code example #28
def get_logger():
    params = read_params()
    return get_structured_logger(__name__, filename=params.get("log_filename"))
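The structured logger returned here accepts an event string plus keyword fields, as code examples #18 and #21 show. A minimal usage sketch (the field value is a placeholder):

# Usage sketch mirroring the calls in code examples #18 and #21.
logger = get_logger()
logger.info("Completed indicator run", elapsed_time_in_seconds=12.34)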
Code example #29
from os.path import join, exists
from tempfile import TemporaryDirectory

# third party
from delphi_utils import read_params
import numpy as np
import pandas as pd
import pytest

# first party
from delphi_claims_hosp.config import Config, GeoConstants
from delphi_claims_hosp.update_indicator import ClaimsHospIndicatorUpdater

CONFIG = Config()
CONSTANTS = GeoConstants()
PARAMS = read_params()
DATA_FILEPATH = PARAMS["input_file"]
DROP_DATE = pd.to_datetime(PARAMS["drop_date"])
OUTPATH = "test_data/"


class TestClaimsHospIndicatorUpdater:
    geo = "hrr"
    parallel = False
    weekday = False
    write_se = False
    prefix = "foo"
    small_test_data = pd.DataFrame({
        "num": [0, 100, 200, 300, 400, 500, 600, 100, 200, 300, 400, 500, 600],
        "hrr": [1.0] * 7 + [2.0] * 6,
        "den": [1000] * 7 + [2000] * 6,
Code example #30
def run_module():
    """Run the JHU indicator module."""
    params = read_params()
    export_start_date = params["export_start_date"]
    export_dir = params["export_dir"]
    base_url = params["base_url"]
    cache_dir = params["cache_dir"]
    logger = get_structured_logger(__name__,
                                   filename=params.get("log_filename"))

    if len(params["bucket_name"]) > 0:
        arch_diff = S3ArchiveDiffer(
            cache_dir,
            export_dir,
            params["bucket_name"],
            "jhu",
            params["aws_credentials"],
        )
        arch_diff.update_cache()
    else:
        arch_diff = None

    gmpr = GeoMapper()
    dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
    for metric, geo_res, sensor, smoother in product(METRICS, GEO_RESOLUTIONS,
                                                     SENSORS, SMOOTHERS):
        print(metric, geo_res, sensor, smoother)
        logger.info(event="generating signal and exporting to CSV",
                    metric=metric,
                    geo_res=geo_res,
                    sensor=sensor,
                    smoother=smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res)
        df.set_index(["timestamp", "geo_id"], inplace=True)
        df["val"] = df[sensor].groupby(level=1).transform(
            SMOOTHERS_MAP[smoother][0])
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df[~df["val"].isnull()]
        df = df.reset_index()
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        create_export_csv(
            df,
            export_dir=export_dir,
            start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )

    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [
            f for f, diff in common_diffs.items() if diff is not None
        ]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff
            for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")