Code Example #1
    def test_export_without_null_removal(self):
        """Test that `remove_null_samples = False` does not remove entries with null samples."""
        _clean_directory(self.TEST_DIR)

        df_with_nulls = self.DF.copy().append(
            {
                "geo_id": "66666",
                "timestamp": datetime(2020, 6, 6),
                "val": 10,
                "se": 0.2,
                "sample_size": pd.NA
            },
            ignore_index=True)

        create_export_csv(df=df_with_nulls,
                          export_dir=self.TEST_DIR,
                          geo_res="state",
                          sensor="test",
                          remove_null_samples=False)

        assert _non_ignored_files_set(self.TEST_DIR) == set([
            "20200215_state_test.csv", "20200301_state_test.csv",
            "20200315_state_test.csv", "20200606_state_test.csv"
        ])
        assert pd.read_csv(join(self.TEST_DIR,
                                "20200606_state_test.csv")).size > 0
Code Example #2
    def test_export_with_limiting_start_date(self):
        """Test that the `start_date` prevents earlier dates from being exported."""

        # Clean receiving directory
        _clean_directory(self.TEST_DIR)

        create_export_csv(
            df=self.DF,
            start_date=datetime.strptime("2020-02-20", "%Y-%m-%d"),
            export_dir=self.TEST_DIR,
            geo_res="county",
            sensor="test",
        )

        assert _non_ignored_files_set(self.TEST_DIR) == set([
            "20200301_county_test.csv",
            "20200315_county_test.csv",
        ])
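
The `end_date` argument works symmetrically. A hypothetical companion test, assuming the same fixtures, that cuts off later dates might look like this (2020-03-07 is a placeholder cutoff between the second and third dates in DF):

    def test_export_with_limiting_end_date_sketch(self):
        """Sketch: the `end_date` argument prevents later dates from being exported."""
        _clean_directory(self.TEST_DIR)

        create_export_csv(
            df=self.DF,
            end_date=datetime.strptime("2020-03-07", "%Y-%m-%d"),
            export_dir=self.TEST_DIR,
            geo_res="county",
            sensor="test",
        )

        # Only dates on or before the cutoff should be written.
        assert _non_ignored_files_set(self.TEST_DIR) == set([
            "20200215_county_test.csv",
            "20200301_county_test.csv",
        ])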
Code Example #3
    def test_export_with_no_dates(self):
        """Test that omitting the `start_date` and `end_date` exports all dates."""

        # Clean receiving directory
        _clean_directory(self.TEST_DIR)

        create_export_csv(
            df=self.DF,
            export_dir=self.TEST_DIR,
            geo_res="state",
            sensor="test",
        )

        assert _non_ignored_files_set(self.TEST_DIR) == set([
            "20200215_state_test.csv",
            "20200301_state_test.csv",
            "20200315_state_test.csv",
        ])
Code Example #4
    def test_export_without_metric(self):
        """Test that exporting CSVs without the `metrics` argument yields the correct files."""

        # Clean receiving directory
        _clean_directory(self.TEST_DIR)

        create_export_csv(
            df=self.DF,
            start_date=datetime.strptime("2020-02-15", "%Y-%m-%d"),
            export_dir=self.TEST_DIR,
            geo_res="county",
            sensor="test",
        )

        assert _non_ignored_files_set(self.TEST_DIR) == set([
            "20200215_county_test.csv",
            "20200301_county_test.csv",
            "20200315_county_test.csv",
        ])
Code Example #5
def run_module():
    """Run Google Symptoms module."""
    params = read_params()
    export_start_date = datetime.strptime(params["export_start_date"],
                                          "%Y-%m-%d")
    export_dir = params["export_dir"]
    base_url = params["base_url"]

    # Pull GS data
    dfs = pull_gs_data(base_url)
    gmpr = geomap.GeoMapper()
    for geo_res in GEO_RESOLUTIONS:
        if geo_res == "state":
            df_pull = dfs["state"]
        elif geo_res in ["hhs", "nation"]:
            df_pull = gmpr.replace_geocode(dfs["county"],
                                           "fips",
                                           geo_res,
                                           from_col="geo_id",
                                           date_col="timestamp")
            df_pull.rename(columns={geo_res: "geo_id"}, inplace=True)
        else:
            df_pull = geo_map(dfs["county"], geo_res)
        for metric, smoother in product(METRICS + [COMBINED_METRIC],
                                        SMOOTHERS):
            print(geo_res, metric, smoother)
            df = df_pull.set_index(["timestamp", "geo_id"])
            df["val"] = df[metric].groupby(level=1).transform(
                SMOOTHERS_MAP[smoother][0])
            df["se"] = np.nan
            df["sample_size"] = np.nan
            # Drop early entries where data insufficient for smoothing
            df = df.loc[~df["val"].isnull(), :]
            df = df.reset_index()
            sensor_name = "_".join([smoother, "search"])
            create_export_csv(
                df,
                export_dir=export_dir,
                start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
                metric=metric.lower(),
                geo_res=geo_res,
                sensor=sensor_name)
Code Example #6
    def test_export_rounding(self):
        """Test that exporting CSVs with the `metrics` argument yields the correct files."""

        # Clean receiving directory
        _clean_directory(self.TEST_DIR)

        create_export_csv(
            df=self.DF,
            start_date=datetime.strptime("2020-02-15", "%Y-%m-%d"),
            export_dir=self.TEST_DIR,
            metric="deaths",
            geo_res="county",
            sensor="test",
        )
        pd.testing.assert_frame_equal(
            pd.read_csv(join(self.TEST_DIR,
                             "20200215_county_deaths_test.csv")),
            pd.DataFrame({
                "geo_id": [51093, 51175],
                "val": [round(3.12345678910, 7), 2.1],
                "se": [0.15, 0.22],
                "sample_size": [100, 100]
            }))
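
The file names asserted throughout these tests follow one pattern: YYYYMMDD_{geo_res}_{metric}_{sensor}.csv, with the metric segment omitted when no `metric` is passed. A small illustrative helper (not part of delphi_utils) that reproduces the expected names:

from datetime import datetime

def expected_export_filename(day, geo_res, sensor, metric=None):
    """Rebuild the export file name convention seen in the assertions above (illustrative)."""
    parts = [day.strftime("%Y%m%d"), geo_res]
    if metric is not None:
        parts.append(metric)
    parts.append(sensor)
    return "_".join(parts) + ".csv"

# expected_export_filename(datetime(2020, 2, 15), "county", "test", metric="deaths")
#   -> "20200215_county_deaths_test.csv"
# expected_export_filename(datetime(2020, 2, 15), "state", "test")
#   -> "20200215_state_test.csv"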
Code Example #7
def run_module(params):
    """
    Run the CAN testing metrics indicator.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
        - "indicator":
            - "parquet_url": str, URL of source file in parquet format
        - "archive" (optional): if provided, output will be archived with S3
            - "cache_dir": str, directory of locally cached data
            - "bucket_name: str, name of S3 bucket to read/write
            - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
    """
    # Configuration
    export_dir = params["common"]["export_dir"]
    parquet_url = params["indicator"]["parquet_url"]

    # Archive Differ configuration
    if "archive" in params:
        cache_dir = params["archive"]["cache_dir"]
        arch_diff = S3ArchiveDiffer(cache_dir, export_dir,
                                    params["archive"]["bucket_name"], "CAN",
                                    params["archive"]["aws_credentials"])
        arch_diff.update_cache()
    else:
        arch_diff = None

    # Load CAN county-level testing data
    print("Pulling CAN data")
    df_pq = load_data(parquet_url)
    df_county_testing = extract_testing_metrics(df_pq)

    # Perform geo aggregations and export to receiving
    for geo_res in GEO_RESOLUTIONS:
        print(f"Processing {geo_res}")
        df = geo_map(df_county_testing, geo_res)

        # Export 'pcr_specimen_positivity_rate'
        exported_csv_dates = create_export_csv(df,
                                               export_dir=export_dir,
                                               geo_res=geo_res,
                                               sensor=SIGNALS[0])

        # Export 'pcr_specimen_total_tests'
        df["val"] = df["sample_size"]
        df["sample_size"] = np.nan
        df["se"] = np.nan
        exported_csv_dates = create_export_csv(df,
                                               export_dir=export_dir,
                                               geo_res=geo_res,
                                               sensor=SIGNALS[1])

        earliest, latest = min(exported_csv_dates), max(exported_csv_dates)
        print(f"Exported dates: {earliest} to {latest}")

    # Perform archive differencing
    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [
            f for f, diff in common_diffs.items() if diff is not None
        ]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff
            for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")
Code Example #8
File: run.py  Project: sgsmob/covidcast-indicators
def run_module(params: Dict[str, Any]):
    """Run the JHU indicator module.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "base_url": str, URL from which to read upstream data
        - "export_start_date": str, date from which to export data in YYYY-MM-DD format
    - "archive" (optional): if provided, output will be archived with S3
        - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
        - "bucket_name: str, name of S3 bucket to read/write
        - "cache_dir": str, directory of locally cached data
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None
    export_start_date = params["indicator"]["export_start_date"]
    export_dir = params["common"]["export_dir"]
    base_url = params["indicator"]["base_url"]
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    if "archive" in params:
        arch_diff = S3ArchiveDiffer(
            params["archive"]["cache_dir"],
            export_dir,
            params["archive"]["bucket_name"],
            "jhu",
            params["archive"]["aws_credentials"],
        )
        arch_diff.update_cache()
    else:
        arch_diff = None

    gmpr = GeoMapper()
    dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
    for metric, geo_res, sensor, smoother in product(METRICS, GEO_RESOLUTIONS,
                                                     SENSORS, SMOOTHERS):
        print(metric, geo_res, sensor, smoother)
        logger.info(event="generating signal and exporting to CSV",
                    metric=metric,
                    geo_res=geo_res,
                    sensor=sensor,
                    smoother=smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res, sensor)
        df.set_index(["timestamp", "geo_id"], inplace=True)
        df["val"] = df[sensor].groupby(level=1).transform(
            SMOOTHERS_MAP[smoother][0])
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df[~df["val"].isnull()]
        df = df.reset_index()
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        exported_csv_dates = create_export_csv(
            df,
            export_dir=export_dir,
            start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )
        if not exported_csv_dates.empty:
            csv_export_count += exported_csv_dates.size
            if not oldest_final_export_date:
                oldest_final_export_date = max(exported_csv_dates)
            oldest_final_export_date = min(oldest_final_export_date,
                                           max(exported_csv_dates))

    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [
            f for f, diff in common_diffs.items() if diff is not None
        ]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff
            for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.now() - oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime(
            "%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
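
The export-date bookkeeping in this loop is easy to misread: oldest_final_export_date ends up as the minimum, across all metric/geo/sensor/smoother combinations, of each combination's latest exported date, and max_lag_in_days is measured against that. A standalone sketch of the same logic with dummy dates (illustrative data only):

from datetime import datetime
import pandas as pd

oldest_final_export_date = None
dummy_runs = [
    pd.Series(pd.to_datetime(["2021-01-01", "2021-01-05"])),  # signal A exports through Jan 5
    pd.Series(pd.to_datetime(["2021-01-01", "2021-01-03"])),  # signal B exports through Jan 3
]
for exported_csv_dates in dummy_runs:
    if not exported_csv_dates.empty:
        if not oldest_final_export_date:
            oldest_final_export_date = max(exported_csv_dates)
        oldest_final_export_date = min(oldest_final_export_date,
                                       max(exported_csv_dates))

# The laggiest signal (B) determines the reported lag.
assert oldest_final_export_date == pd.Timestamp("2021-01-03")
max_lag_in_days = (datetime.now() - oldest_final_export_date).days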
Code Example #9
File: run.py  Project: sgsmob/covidcast-indicators
def run_module(params):
    """
    Main function run when calling the module.

    Reads parameters from the file 'params.json' and produces output data in
    the directory defined by `export_dir` (should be "receiving" except for
    testing purposes).

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
        - "common":
            - "export_dir": str, directory to write output
            - "log_exceptions" (optional): bool, whether to log exceptions to file
            - "log_filename" (optional): str, name of file to write logs
        - "indicator":
            - "static_file_dir": str, path to DMA mapping files
            - "data_dir": str, location of cached CSVs
            - "start_date": str, YYYY-MM-DD format, first day to generate data for
            - "end_date": str, YYYY-MM-DD format or empty string, last day to generate data for.
            - "ght_key": str, GHT API key
            - "wip_signal": list of str or bool, to be passed to delphi_utils.add_prefix
            - "test_data_dir": str, path to test data
        - "archive" (optional): if provided, output will be archived with S3
            - "aws_credentials": Dict[str, str], AWS login credentials (see S3 documentation)
            - "bucket_name: str, name of S3 bucket to read/write
            - "cache_dir": str, directory of locally cached data
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None

    # read parameters
    ght_key = params["indicator"]["ght_key"]
    start_date = params["indicator"]["start_date"]
    end_date = params["indicator"]["end_date"]
    static_dir = params["indicator"]["static_file_dir"]
    export_dir = params["common"]["export_dir"]
    data_dir = params["indicator"]["data_dir"]
    wip_signal = params["indicator"]["wip_signal"]

    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    if "archive" in params:
        arch_diff = S3ArchiveDiffer(params["archive"]["cache_dir"], export_dir,
                                    params["archive"]["bucket_name"], "ght",
                                    params["archive"]["aws_credentials"])
        arch_diff.update_cache()

    # if missing start_date, set to today (GMT) minus 5 days
    if start_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        start_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # if missing end_date, set to today (GMT) minus 5 days
    if end_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        end_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # Turn on basic logging messages (level INFO)
    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
    logging.info("Creating data from %s through %s.", start_date, end_date)

    # Dictionary mapping geo resolution to the data corresponding to that resolution.
    df_by_geo_res = {}

    if not params["indicator"]["test_data_dir"]:
        # setup class to handle API calls
        ght = GoogleHealthTrends(ght_key=ght_key)

        # read data frame version of the data
        df_by_geo_res[STATE] = get_counts_states(ght,
                                                 PULL_START_DATE,
                                                 end_date,
                                                 static_dir=static_dir,
                                                 data_dir=data_dir)
        df_by_geo_res[DMA] = get_counts_dma(ght,
                                            PULL_START_DATE,
                                            end_date,
                                            static_dir=static_dir,
                                            data_dir=data_dir)
    else:
        df_by_geo_res[STATE] = pd.read_csv(
            params["indicator"]["test_data_dir"].format(geo_res="state"))
        df_by_geo_res[DMA] = pd.read_csv(
            params["indicator"]["test_data_dir"].format(geo_res="dma"))

    df_by_geo_res[HRR], df_by_geo_res[MSA] = derived_counts_from_dma(
        df_by_geo_res[DMA], static_dir=static_dir)

    signal_names = add_prefix(SIGNALS, wip_signal, prefix="wip_")

    for signal in signal_names:
        is_smoothed = signal.endswith(SMOOTHED)
        for geo_res, df in df_by_geo_res.items():
            exported_csv_dates = create_export_csv(format_for_export(
                df, is_smoothed),
                                                   geo_res=geo_res,
                                                   sensor=signal,
                                                   start_date=start_date,
                                                   export_dir=export_dir)

            if not exported_csv_dates.empty:
                csv_export_count += exported_csv_dates.size
                if not oldest_final_export_date:
                    oldest_final_export_date = max(exported_csv_dates)
                oldest_final_export_date = min(oldest_final_export_date,
                                               max(exported_csv_dates))

    if "archive" in params:
        archive(arch_diff)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.datetime.now() -
                           oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime(
            "%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
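
A hypothetical `params` dictionary matching the structure documented in this module's docstring (all values are placeholders; the optional "archive" section is omitted, so no S3 archiving is performed):

params = {
    "common": {"export_dir": "./receiving"},
    "indicator": {
        "static_file_dir": "./static",
        "data_dir": "./cache_csvs",
        "start_date": "2020-05-01",
        "end_date": "",                 # empty string: fall back to the default recent date
        "ght_key": "YOUR_GHT_API_KEY",  # placeholder API key
        "wip_signal": True,
        "test_data_dir": "",            # empty: pull live data instead of test CSVs
    },
}

run_module(params)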
Code Example #10
def process(fname, sensors, metrics, geo_resolutions,
            export_dir, brand_df):
    """
    Process an input census block group-level CSV and export it.

    Assumes that the input file has _only_ one date of data.

    Parameters
    ----------
    fname: str
        Input filename.
    sensors: List[str]
        List of sensor names.
    metrics: List[Tuple[str, int, bool]]
        List of (metric_name, naics_code, wip) tuples.
    geo_resolutions: List[str]
        List of geo resolutions to export the data.
    export_dir: str
        Directory in which to write the exported CSVs.
    brand_df: pd.DataFrame
        Mapping from naics_code to safegraph_brand_id.

    Returns
    -------
    None
    """
    metric_names, naics_codes, wips = (list(x) for x in zip(*metrics))
    used_cols = [
            "safegraph_brand_ids",
            "visits_by_day",
            "date_range_start",
            "date_range_end",
            "postal_code",
            ]

    if ".csv.gz" in fname:
        df = pd.read_csv(fname,
                         usecols=used_cols,
                         parse_dates=["date_range_start", "date_range_end"])
        dfs = construct_signals(df, metric_names, naics_codes, brand_df)
        print("Finished pulling data from " + fname)
    else:
        files = glob.glob(f'{fname}/**/*.csv.gz', recursive=True)
        dfs_dict = {"bars_visit": [], "restaurants_visit": []}
        for fn in files:
            df = pd.read_csv(fn,
                             usecols=used_cols,
                             parse_dates=["date_range_start", "date_range_end"])
            dfs = construct_signals(df, metric_names, naics_codes, brand_df)
            dfs_dict["bars_visit"].append(dfs["bars_visit"])
            dfs_dict["restaurants_visit"].append(dfs["restaurants_visit"])
        dfs = {}
        dfs["bars_visit"] = pd.concat(dfs_dict["bars_visit"]
            ).groupby(["timestamp", "zip"]).sum().reset_index()
        dfs["restaurants_visit"] = pd.concat(dfs_dict["restaurants_visit"]
            ).groupby(["timestamp", "zip"]).sum().reset_index()
        print("Finished pulling data from " + fname)
    for geo_res, sensor in product(geo_resolutions, sensors):
        for metric, wip in zip(metric_names, wips):
            df_export = aggregate(dfs[metric], metric, geo_res)
            df_export["val"] = df_export["_".join([metric, sensor])]
            df_export["se"] = np.nan
            df_export["sample_size"] = np.nan

            if wip:
                metric = "wip_" + metric
            create_export_csv(
                df_export,
                export_dir=export_dir,
                start_date=df_export["timestamp"].min(),
                metric=metric,
                geo_res=geo_res,
                sensor=sensor,
            )
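
A hypothetical invocation of `process`, with metric names taken from the code above and NAICS codes, sensor names, and paths that are illustrative assumptions only:

import pandas as pd

metrics = [
    ("bars_visit", 722410, True),         # (metric_name, naics_code, wip); NAICS codes assumed
    ("restaurants_visit", 722511, True),
]
sensors = ["num", "prop"]                 # assumed sensor suffixes
geo_resolutions = ["county", "state"]
brand_df = pd.read_csv("static/brand_info.csv")   # placeholder mapping of naics_code to safegraph_brand_id

process("patterns/2021/01/04/part1.csv.gz",       # placeholder weekly patterns file
        sensors, metrics, geo_resolutions,
        export_dir="./receiving",
        brand_df=brand_df)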
Code Example #11
def run_module():
    """Main function run when calling the module.

    Reads parameters from the file 'params.json' and produces output data in
    the directory defined by `export_dir` (should be "receiving" except for
    testing purposes).
    """

    # read parameters
    params = read_params()
    ght_key = params["ght_key"]
    start_date = params["start_date"]
    end_date = params["end_date"]
    static_dir = params["static_file_dir"]
    export_dir = params["export_dir"]
    data_dir = params["data_dir"]
    wip_signal = params["wip_signal"]
    cache_dir = params["cache_dir"]

    arch_diff = S3ArchiveDiffer(cache_dir, export_dir, params["bucket_name"],
                                "ght", params["aws_credentials"])
    arch_diff.update_cache()
    print(arch_diff)
    # if missing start_date, set to today (GMT) minus 5 days
    if start_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        start_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # if missing end_date, set to today (GMT) minus 5 days
    if end_date == "":
        now = datetime.datetime.now(datetime.timezone.utc)
        end_date = (now - datetime.timedelta(days=4)).strftime("%Y-%m-%d")

    # Turn on basic logging messages (level INFO)
    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
    logging.info("Creating data from %s through %s.", start_date, end_date)

    # Dictionary mapping geo resolution to the data corresponding to that resolution.
    df_by_geo_res = {}

    if not params["test"]:
        # setup class to handle API calls
        ght = GoogleHealthTrends(ght_key=ght_key)

        # read data frame version of the data
        df_by_geo_res[STATE] = get_counts_states(ght,
                                                 PULL_START_DATE,
                                                 end_date,
                                                 static_dir=static_dir,
                                                 data_dir=data_dir)
        df_by_geo_res[DMA] = get_counts_dma(ght,
                                            PULL_START_DATE,
                                            end_date,
                                            static_dir=static_dir,
                                            data_dir=data_dir)
    else:
        df_by_geo_res[STATE] = pd.read_csv(
            params["test_data_dir"].format(geo_res="state"))
        df_by_geo_res[DMA] = pd.read_csv(
            params["test_data_dir"].format(geo_res="dma"))

    df_by_geo_res[HRR], df_by_geo_res[MSA] = derived_counts_from_dma(
        df_by_geo_res[DMA], static_dir=static_dir)

    signal_names = add_prefix(SIGNALS, wip_signal, prefix="wip_")

    for signal in signal_names:
        is_smoothed = signal.endswith(SMOOTHED)
        for geo_res, df in df_by_geo_res.items():
            create_export_csv(format_for_export(df, is_smoothed),
                              geo_res=geo_res,
                              sensor=signal,
                              start_date=start_date,
                              export_dir=export_dir)

    if not params["test"]:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [
            f for f, diff in common_diffs.items() if diff is not None
        ]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff
            for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")
Code Example #12
def run_module():
    """Run the JHU indicator module."""
    params = read_params()
    export_start_date = params["export_start_date"]
    export_dir = params["export_dir"]
    base_url = params["base_url"]
    cache_dir = params["cache_dir"]
    logger = get_structured_logger(__name__,
                                   filename=params.get("log_filename"))

    if len(params["bucket_name"]) > 0:
        arch_diff = S3ArchiveDiffer(
            cache_dir,
            export_dir,
            params["bucket_name"],
            "jhu",
            params["aws_credentials"],
        )
        arch_diff.update_cache()
    else:
        arch_diff = None

    gmpr = GeoMapper()
    dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
    for metric, geo_res, sensor, smoother in product(METRICS, GEO_RESOLUTIONS,
                                                     SENSORS, SMOOTHERS):
        print(metric, geo_res, sensor, smoother)
        logger.info(event="generating signal and exporting to CSV",
                    metric=metric,
                    geo_res=geo_res,
                    sensor=sensor,
                    smoother=smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res)
        df.set_index(["timestamp", "geo_id"], inplace=True)
        df["val"] = df[sensor].groupby(level=1).transform(
            SMOOTHERS_MAP[smoother][0])
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df[~df["val"].isnull()]
        df = df.reset_index()
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        create_export_csv(
            df,
            export_dir=export_dir,
            start_date=datetime.strptime(export_start_date, "%Y-%m-%d"),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )

    if arch_diff is not None:
        # Diff exports, and make incremental versions
        _, common_diffs, new_files = arch_diff.diff_exports()

        # Archive changed and new files only
        to_archive = [
            f for f, diff in common_diffs.items() if diff is not None
        ]
        to_archive += new_files
        _, fails = arch_diff.archive_exports(to_archive)

        # Filter existing exports to exclude those that failed to archive
        succ_common_diffs = {
            f: diff
            for f, diff in common_diffs.items() if f not in fails
        }
        arch_diff.filter_exports(succ_common_diffs)

        # Report failures: someone should probably look at them
        for exported_file in fails:
            print(f"Failed to archive '{exported_file}'")
Code Example #13
def run_module():
    """Run the quidel_covidtest indicator."""
    params = read_params()
    cache_dir = params["cache_dir"]
    export_dir = params["export_dir"]
    export_start_date = params["export_start_date"]
    export_end_date = params["export_end_date"]

    # Pull data and update export date
    df, _end_date = pull_quidel_covidtest(params)
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_date = check_export_end_date(export_end_date, _end_date,
                                            END_FROM_TODAY_MINUS)
    export_start_date = check_export_start_date(export_start_date,
                                                export_end_date,
                                                EXPORT_DAY_RANGE)

    first_date, last_date = df["timestamp"].min(), df["timestamp"].max()

    # State Level
    data = df.copy()
    state_groups = geo_map("state", data).groupby("state_id")

    # Add prefix, if required
    sensors = add_prefix(SENSORS,
                         wip_signal=read_params()["wip_signal"],
                         prefix="wip_")
    smoothers = SMOOTHERS.copy()

    for sensor in sensors:
        # For State Level
        print("state", sensor)
        if sensor.endswith(SMOOTHED_POSITIVE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_POSITIVE)
        elif sensor.endswith(RAW_POSITIVE):
            smoothers[sensor] = smoothers.pop(RAW_POSITIVE)
        elif sensor.endswith(SMOOTHED_TEST_PER_DEVICE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_TEST_PER_DEVICE)
        else:
            smoothers[sensor] = smoothers.pop(RAW_TEST_PER_DEVICE)
        state_df = generate_sensor_for_states(state_groups,
                                              smooth=smoothers[sensor][1],
                                              device=smoothers[sensor][0],
                                              first_date=first_date,
                                              last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_date,
                          end_date=export_end_date)

    # County/HRR/MSA level
    for geo_res in GEO_RESOLUTIONS:
        geo_data, res_key = geo_map(geo_res, data)
        for sensor in sensors:
            print(geo_res, sensor)
            res_df = generate_sensor_for_other_geores(
                state_groups,
                geo_data,
                res_key,
                smooth=smoothers[sensor][1],
                device=smoothers[sensor][0],
                first_date=first_date,
                last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_date,
                              end_date=export_end_date,
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(df, _end_date, cache_dir)
Code Example #14
File: run.py  Project: sgsmob/covidcast-indicators
def run_module(params: Dict[str, Any]):
    """Run the quidel_covidtest indicator.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - indicator":
        - "static_file_dir": str, directory name with population information
        - "input_cache_dir": str, directory in which to cache input data
        - "export_start_date": str, YYYY-MM-DD format of earliest date to create output
        - "export_end_date": str, YYYY-MM-DD format of latest date to create output or "" to create
                             through the present
        - "pull_start_date": str, YYYY-MM-DD format of earliest date to pull input
        - "pull_end_date": str, YYYY-MM-DD format of latest date to create output or "" to create
                           through the present
        - "aws_credentials": Dict[str, str], authentication parameters for AWS S3; see S3
                             documentation
        - "bucket_name": str, name of AWS bucket in which to find data
        - "wip_signal": List[str], list of signal names that are works in progress
        - "test_mode": bool, whether we are running in test mode
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    atexit.register(log_exit, start_time, logger)
    cache_dir = params["indicator"]["input_cache_dir"]
    export_dir = params["common"]["export_dir"]
    export_start_date = params["indicator"]["export_start_date"]
    export_end_date = params["indicator"]["export_end_date"]

    # Pull data and update export date
    df, _end_date = pull_quidel_covidtest(params["indicator"])
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_date = check_export_end_date(export_end_date, _end_date,
                                            END_FROM_TODAY_MINUS)
    export_start_date = check_export_start_date(export_start_date,
                                                export_end_date,
                                                EXPORT_DAY_RANGE)

    first_date, last_date = df["timestamp"].min(), df["timestamp"].max()

    # State Level
    data = df.copy()
    state_groups = geo_map("state", data).groupby("state_id")

    # Add prefix, if required
    sensors = add_prefix(SENSORS,
                         wip_signal=params["indicator"]["wip_signal"],
                         prefix="wip_")
    smoothers = SMOOTHERS.copy()

    for sensor in sensors:
        # For State Level
        print("state", sensor)
        if sensor.endswith(SMOOTHED_POSITIVE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_POSITIVE)
        elif sensor.endswith(RAW_POSITIVE):
            smoothers[sensor] = smoothers.pop(RAW_POSITIVE)
        elif sensor.endswith(SMOOTHED_TEST_PER_DEVICE):
            smoothers[sensor] = smoothers.pop(SMOOTHED_TEST_PER_DEVICE)
        else:
            smoothers[sensor] = smoothers.pop(RAW_TEST_PER_DEVICE)
        state_df = generate_sensor_for_states(state_groups,
                                              smooth=smoothers[sensor][1],
                                              device=smoothers[sensor][0],
                                              first_date=first_date,
                                              last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_date,
                          end_date=export_end_date)

    # County/HRR/MSA level
    for geo_res in GEO_RESOLUTIONS:
        geo_data, res_key = geo_map(geo_res, data)
        for sensor in sensors:
            print(geo_res, sensor)
            res_df = generate_sensor_for_other_geores(
                state_groups,
                geo_data,
                res_key,
                smooth=smoothers[sensor][1],
                device=smoothers[sensor][0],
                first_date=first_date,
                last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_date,
                              end_date=export_end_date,
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(df, _end_date, cache_dir)
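
A hypothetical `params` dictionary matching the layout documented above (every value is a placeholder):

params = {
    "common": {
        "export_dir": "./receiving",
        "log_exceptions": True,
    },
    "indicator": {
        "static_file_dir": "./static",
        "input_cache_dir": "./cache",
        "export_start_date": "2020-05-26",
        "export_end_date": "",            # empty string: export through the present
        "pull_start_date": "2020-05-26",
        "pull_end_date": "",              # empty string: pull through the present
        "aws_credentials": {
            "aws_access_key_id": "...",   # see S3 documentation
            "aws_secret_access_key": "...",
        },
        "bucket_name": "example-quidel-bucket",   # placeholder
        "wip_signal": [],
        "test_mode": False,
    },
}

run_module(params)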
Code Example #15
File: run.py  Project: sgsmob/covidcast-indicators
def run_module(params: Dict[str, Any]):
    """Run Quidel flu test module.

    The `params` argument is expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - indicator":
        - "static_file_dir": str, directory name with population information
        - "input_cache_dir": str, directory in which to cache input data
        - "export_start_date": str, YYYY-MM-DD format of earliest date to create output
        - "export_end_date": str, YYYY-MM-DD format of latest date to create output or "" to create
                             through the present
        - "pull_start_date": str, YYYY-MM-DD format of earliest date to pull input
        - "pull_end_date": str, YYYY-MM-DD format of latest date to create output or "" to create
                           through the present
        - "aws_credentials": Dict[str, str], authentication parameters for AWS S3; see S3
                             documentation
        - "bucket_name": str, name of AWS bucket in which to find data
        - "wip_signal": List[str], list of signal names that are works in progress
        - "test_mode": bool, whether we are running in test mode
    """
    start_time = time.time()
    logger = get_structured_logger(
        __name__,
        filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))
    cache_dir = params["indicator"]["input_cache_dir"]
    export_dir = params["common"]["export_dir"]
    static_file_dir = params["indicator"]["static_file_dir"]
    export_start_dates = params["indicator"]["export_start_date"]
    export_end_dates = params["indicator"]["export_end_date"]
    map_df = pd.read_csv(join(static_file_dir, "fips_prop_pop.csv"),
                         dtype={"fips": int})

    # Pull data and update export date
    dfs, _end_date = pull_quidel_data(params["indicator"])
    if _end_date is None:
        print("The data is up-to-date. Currently, no new data to be ingested.")
        return
    export_end_dates = check_export_end_date(export_end_dates, _end_date,
                                             END_FROM_TODAY_MINUS)
    export_start_dates = check_export_start_date(export_start_dates,
                                                 export_end_dates,
                                                 EXPORT_DAY_RANGE)

    # Add prefix, if required
    sensors = add_prefix(list(SENSORS.keys()),
                         wip_signal=params["indicator"]["wip_signal"],
                         prefix="wip_")

    for sensor in sensors:
        # Check either covid_ag or flu_ag
        test_type = "covid_ag" if "covid_ag" in sensor else "flu_ag"
        print("state", sensor)
        data = dfs[test_type].copy()
        state_groups = geo_map("state", data, map_df).groupby("state_id")
        first_date = data["timestamp"].min()
        last_date = data["timestamp"].max()

        # For State Level
        state_df = generate_sensor_for_states(state_groups,
                                              smooth=SENSORS[sensor][1],
                                              device=SENSORS[sensor][0],
                                              first_date=first_date,
                                              last_date=last_date)
        create_export_csv(state_df,
                          geo_res="state",
                          sensor=sensor,
                          export_dir=export_dir,
                          start_date=export_start_dates[test_type],
                          end_date=export_end_dates[test_type])

        # County/HRR/MSA level
        for geo_res in GEO_RESOLUTIONS:
            print(geo_res, sensor)
            data = dfs[test_type].copy()
            data, res_key = geo_map(geo_res, data, map_df)
            res_df = generate_sensor_for_other_geores(
                state_groups,
                data,
                res_key,
                smooth=SENSORS[sensor][1],
                device=SENSORS[sensor][0],
                first_date=first_date,
                last_date=last_date)
            create_export_csv(res_df,
                              geo_res=geo_res,
                              sensor=sensor,
                              export_dir=export_dir,
                              start_date=export_start_dates[test_type],
                              end_date=export_end_dates[test_type],
                              remove_null_samples=True)

    # Export the cache file if the pipeline runs successfully.
    # Otherwise, don't update the cache file
    update_cache_file(dfs, _end_date, cache_dir)

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds)
Code Example #16
def run_module():
    """Run the usafacts indicator."""
    params = read_params()
    export_start_date = params["export_start_date"]
    if export_start_date == "latest":
        export_start_date = datetime.combine(date.today(), time(
            0, 0)) - timedelta(days=1)
    else:
        export_start_date = datetime.strptime(export_start_date, "%Y-%m-%d")
    export_dir = params["export_dir"]
    base_url = params["base_url"]
    cache_dir = params["cache_dir"]

    arch_diff = S3ArchiveDiffer(cache_dir, export_dir, params["bucket_name"],
                                "usafacts", params["aws_credentials"])
    arch_diff.update_cache()

    geo_mapper = GeoMapper()

    dfs = {
        metric: pull_usafacts_data(base_url, metric, geo_mapper)
        for metric in METRICS
    }
    for metric, geo_res, sensor, smoother in product(METRICS, GEO_RESOLUTIONS,
                                                     SENSORS, SMOOTHERS):
        print(geo_res, metric, sensor, smoother)
        df = dfs[metric]
        # Aggregate to appropriate geographic resolution
        df = geo_map(df, geo_res, sensor)
        df["val"] = SMOOTHERS_MAP[smoother][0].smooth(df[sensor].values)
        df["se"] = np.nan
        df["sample_size"] = np.nan
        # Drop early entries where data insufficient for smoothing
        df = df.loc[~df["val"].isnull(), :]
        sensor_name = SENSOR_NAME_MAP[sensor][0]
        # if (SENSOR_NAME_MAP[sensor][1] or SMOOTHERS_MAP[smoother][2]):
        #     metric = f"wip_{metric}"
        #     sensor_name = WIP_SENSOR_NAME_MAP[sensor][0]
        sensor_name = SMOOTHERS_MAP[smoother][1] + sensor_name
        create_export_csv(
            df,
            export_dir=export_dir,
            start_date=SMOOTHERS_MAP[smoother][3](export_start_date),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )

    # Diff exports, and make incremental versions
    _, common_diffs, new_files = arch_diff.diff_exports()

    # Archive changed and new files only
    to_archive = [f for f, diff in common_diffs.items() if diff is not None]
    to_archive += new_files
    _, fails = arch_diff.archive_exports(to_archive)

    # Filter existing exports to exclude those that failed to archive
    succ_common_diffs = {
        f: diff
        for f, diff in common_diffs.items() if f not in fails
    }
    arch_diff.filter_exports(succ_common_diffs)

    # Report failures: someone should probably look at them
    for exported_file in fails:
        print(f"Failed to archive '{exported_file}'")
Code Example #17
def run_module(params):
    """
    Run Google Symptoms module.

    Parameters
    ----------
    params
        Dictionary containing indicator configuration. Expected to have the following structure:
    - "common":
        - "export_dir": str, directory to write output
        - "log_exceptions" (optional): bool, whether to log exceptions to file
        - "log_filename" (optional): str, name of file to write logs
    - "indicator":
        - "export_start_date": str, YYYY-MM-DD format, date from which to export data
        - "num_export_days": int, number of days before end date (today) to export
        - "path_to_bigquery_credentials": str, path to BigQuery API key and service account
            JSON file
    """
    start_time = time.time()
    csv_export_count = 0
    oldest_final_export_date = None

    export_start_date = datetime.strptime(
        params["indicator"]["export_start_date"], "%Y-%m-%d")
    export_dir = params["common"]["export_dir"]
    num_export_days = params["indicator"].get("num_export_days", "all")

    logger = get_structured_logger(
        __name__, filename=params["common"].get("log_filename"),
        log_exceptions=params["common"].get("log_exceptions", True))

    # Pull GS data
    dfs = pull_gs_data(params["indicator"]["bigquery_credentials"],
                       export_start_date,
                       num_export_days)
    gmpr = geomap.GeoMapper()

    for geo_res in GEO_RESOLUTIONS:
        if geo_res == "state":
            df_pull = dfs["state"]
        elif geo_res in ["hhs", "nation"]:
            df_pull = gmpr.replace_geocode(dfs["county"], "fips", geo_res, from_col="geo_id",
                                           date_col="timestamp")
            df_pull.rename(columns={geo_res: "geo_id"}, inplace=True)
        else:
            df_pull = geo_map(dfs["county"], geo_res)

        if len(df_pull) == 0:
            continue
        for metric, smoother in product(
                METRICS+[COMBINED_METRIC], SMOOTHERS):
            print(geo_res, metric, smoother)
            df = df_pull.set_index(["timestamp", "geo_id"])
            df["val"] = df[metric].groupby(level=1
                                           ).transform(SMOOTHERS_MAP[smoother][0])
            df["se"] = np.nan
            df["sample_size"] = np.nan
            # Drop early entries where data insufficient for smoothing
            df = df.loc[~df["val"].isnull(), :]
            df = df.reset_index()
            sensor_name = "_".join([smoother, "search"])

            if len(df) == 0:
                continue
            exported_csv_dates = create_export_csv(
                df,
                export_dir=export_dir,
                start_date=SMOOTHERS_MAP[smoother][1](export_start_date),
                metric=metric.lower(),
                geo_res=geo_res,
                sensor=sensor_name)

            if not exported_csv_dates.empty:
                csv_export_count += exported_csv_dates.size
                if not oldest_final_export_date:
                    oldest_final_export_date = max(exported_csv_dates)
                oldest_final_export_date = min(
                    oldest_final_export_date, max(exported_csv_dates))

    elapsed_time_in_seconds = round(time.time() - start_time, 2)
    max_lag_in_days = None
    formatted_oldest_final_export_date = None
    if oldest_final_export_date:
        max_lag_in_days = (datetime.now() - oldest_final_export_date).days
        formatted_oldest_final_export_date = oldest_final_export_date.strftime(
            "%Y-%m-%d")
    logger.info("Completed indicator run",
                elapsed_time_in_seconds=elapsed_time_in_seconds,
                csv_export_count=csv_export_count,
                max_lag_in_days=max_lag_in_days,
                oldest_final_export_date=formatted_oldest_final_export_date)
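
A hypothetical configuration matching the structure documented in this module's docstring (values are placeholders):

params = {
    "common": {
        "export_dir": "./receiving",
        "log_filename": "google_symptoms.log",
    },
    "indicator": {
        "export_start_date": "2020-02-20",
        "num_export_days": 14,
        "bigquery_credentials": "bigquery_service_account.json",  # placeholder path
    },
}

run_module(params)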