Code example #1
def drop_observations(dataset: timeseries.MultiRegionDataset,
                      filter_: Filter) -> timeseries.MultiRegionDataset:
    """Drops observations according to `filter_` from every region in `dataset`."""
    assert filter_.drop_observations

    # Split the wide-dates DataFrame into the fields selected by the filter and all
    # other fields; each branch below concat-s the pieces it keeps into the result
    # MultiRegionDataset.
    ts_selected_fields, ts_not_selected_fields = _partition_by_fields(
        dataset.timeseries_bucketed_wide_dates, filter_.fields_included)

    if filter_.start_date:
        ts_filtered, ts_no_real_values_to_drop = _filter_by_date(
            ts_selected_fields, drop_start_date=filter_.start_date)
        return dataset.replace_timeseries_wide_dates([
            ts_not_selected_fields, ts_filtered, ts_no_real_values_to_drop
        ]).add_tag_to_subset(filter_.tag, ts_filtered.index)
    else:
        # When start_date is None all of ts_selected_fields is dropped; only the
        # not-selected fields are kept.
        selected_index = ts_selected_fields.index
        return (dataset.replace_timeseries_wide_dates([
            ts_not_selected_fields
        ]).remove_tags_from_subset(selected_index).add_tag_to_subset(
            filter_.tag, selected_index))
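The helper _partition_by_fields is not shown here; a minimal pandas sketch of the split it presumably performs, assuming a wide-dates frame indexed by (location_id, variable). The function name, index-level names, and sample data below are illustrative, not the project's API.

import pandas as pd

def partition_by_fields(ts_wide: pd.DataFrame, fields):
    # Rows whose "variable" index level is in `fields` go to the first frame,
    # everything else to the second.
    mask = ts_wide.index.get_level_values("variable").isin(fields)
    return ts_wide.loc[mask], ts_wide.loc[~mask]

idx = pd.MultiIndex.from_tuples(
    [("iso1:us#fips:55005", "cases"), ("iso1:us#fips:55005", "deaths")],
    names=["location_id", "variable"],
)
ts_wide = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], index=idx,
                       columns=pd.to_datetime(["2020-05-01", "2020-05-02"]))
selected, not_selected = partition_by_fields(ts_wide, ["cases"])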
Code example #2
def replace_dc_county_with_state_data(
    dataset_in: timeseries.MultiRegionDataset,
) -> timeseries.MultiRegionDataset:
    """Replace DC County data with data from State.

    Args:
        dataset_in: Input dataset.

    Returns: Dataset with DC county data replaced to match DC state.
    """
    dc_state_region = pipeline.Region.from_fips(DC_STATE_FIPS)
    dc_county_region = pipeline.Region.from_fips(DC_COUNTY_FIPS)

    dc_map = {dc_state_region: dc_county_region}

    # aggregate_regions only copies number columns. Extract them and re-add to the aggregated
    # dataset.
    static_excluding_numbers = dataset_in.get_regions_subset(
        [dc_county_region]
    ).static.select_dtypes(exclude="number")
    dc_county_dataset = timeseries.aggregate_regions(dataset_in, dc_map).add_static_values(
        static_excluding_numbers.reset_index()
    )
    dataset_without_dc_county = dataset_in.remove_regions([dc_county_region])

    return dataset_without_dc_county.append_regions(dc_county_dataset)
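The select_dtypes(exclude="number") step is plain pandas: because aggregate_regions only copies numeric columns, the non-numeric static columns are pulled out first and re-added afterwards. A minimal sketch with illustrative data, not the project's actual static schema:

import pandas as pd

static = pd.DataFrame(
    {"location_id": ["iso1:us#fips:11001"],
     "county": ["District of Columbia"],
     "population": [705_749]},
).set_index("location_id")

# Keeps only the non-numeric columns (here `county`); `population` is excluded.
non_numeric = static.select_dtypes(exclude="number")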
Code example #3
def persist_dataset(
    dataset: timeseries.MultiRegionDataset,
    data_directory: pathlib.Path,
    data_public_path: pathlib.Path = dataset_utils.LOCAL_PUBLIC_DATA_PATH,
) -> DatasetPointer:
    """Saves dataset and associated pointer in same data directory.

    Args:
        dataset: Dataset to persist.
        data_directory: Data directory
        data_public_path: Path to covid data public folder.

    Returns: DatasetPointer describing persisted dataset.
    """
    model_git_info = GitSummary.from_repo_path(dataset_utils.REPO_ROOT)
    data_git_info = GitSummary.from_repo_path(data_public_path)

    dataset_type = dataset.dataset_type

    dataset_path = data_directory / f"{dataset_type.value}.csv"
    dataset_pointer = DatasetPointer(
        dataset_type=dataset_type,
        path=dataset_path,
        data_git_info=data_git_info,
        model_git_info=model_git_info,
        updated_at=datetime.datetime.utcnow(),
    )
    dataset.write_to_dataset_pointer(dataset_pointer)
    dataset_pointer.save(data_directory)
    return dataset_pointer
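A hedged usage sketch: persist a dataset into a directory, then reload it through the returned pointer. It assumes `dataset` is a MultiRegionDataset obtained elsewhere (for instance via load_us_timeseries_dataset, example #7) and that the target directory exists.

import pathlib

# `dataset` is assumed to be a previously loaded MultiRegionDataset.
pointer = persist_dataset(dataset, pathlib.Path("data/"))
reloaded = MultiRegionDataset.read_from_pointer(pointer)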
Code example #4
    def from_region_and_model_output(
        region: pipeline.Region,
        combined_data_with_test_positivity: MultiRegionDataset,
        rt_data: MultiRegionDataset,
        icu_data: MultiRegionDataset,
    ) -> "RegionalInput":
        one_region_data = combined_data_with_test_positivity.get_one_region(
            region)

        # Not all regions have Rt or ICU data due to various filters in pyseir code.
        try:
            rt_data = rt_data.get_one_region(region)
        except timeseries.RegionLatestNotFound:
            rt_data = None

        try:
            icu_data = icu_data.get_one_region(region)
        except timeseries.RegionLatestNotFound:
            icu_data = None

        return RegionalInput(
            region=region,
            _combined_data_with_test_positivity=one_region_data,
            rt_data=rt_data,
            icu_data=icu_data,
        )
Code example #5
File: run.py Project: epius/covid-data-model
    def read(output_dir: pathlib.Path) -> "PyseirOutputDatasets":
        icu_data_path = output_dir / SummaryArtifact.ICU_METRIC_COMBINED.value
        icu_data = MultiRegionDataset.from_csv(icu_data_path)

        rt_data_path = output_dir / SummaryArtifact.RT_METRIC_COMBINED.value
        rt_data = MultiRegionDataset.from_csv(rt_data_path)

        return PyseirOutputDatasets(icu=icu_data, infection_rate=rt_data)
Code example #6
File: run.py Project: ConsultingMD/covid-data-model
    def from_pipeline_output(pipelines: List[OneRegionPipeline]) -> "PyseirOutputDatasets":
        infection_rate_metric_df = pd.concat((p.infer_df for p in pipelines), ignore_index=True)
        infection_rate_ds = MultiRegionDataset.from_geodata_timeseries_df(infection_rate_metric_df)

        icu_df = pd.concat((p.icu_data.data for p in pipelines if p.icu_data), ignore_index=True)
        icu_ds = MultiRegionDataset.from_geodata_timeseries_df(icu_df)

        return PyseirOutputDatasets(icu=icu_ds, infection_rate=infection_rate_ds)
Code example #7
def load_us_timeseries_dataset(
    pointer_directory: pathlib.Path = dataset_utils.DATA_DIRECTORY,
) -> MultiRegionDataset:
    filename = dataset_pointer.form_filename(DatasetType.MULTI_REGION)
    pointer_path = pointer_directory / filename
    pointer = DatasetPointer.parse_raw(pointer_path.read_text())
    return MultiRegionDataset.read_from_pointer(pointer)
Code example #8
    def calculate(self, dataset: MultiRegionDataset, diff_days: int,
                  most_recent_date: pd.Timestamp) -> MethodOutput:

        positivity_time_series = {}
        # To replicate the behavior of the old code, a region is considered to have recent
        # positivity when the input timeseries (POSITIVE_TESTS and NEGATIVE_TESTS) are recent. The
        # other subclasses of `Method` filter based on the most recent real value in the output
        # timeseries.
        recent_regions = []
        for region, regional_data in dataset.iter_one_regions():
            data = regional_data.date_indexed
            positive_negative_recent = self._has_recent_data(
                data[CommonFields.POSITIVE_TESTS]) and self._has_recent_data(
                    data[CommonFields.NEGATIVE_TESTS])
            series = calculate_test_positivity(regional_data)
            if not series.empty:
                positivity_time_series[region.location_id] = series
                if positive_negative_recent:
                    recent_regions.append(region)

        # Convert dict[location_id, Series] to rows with key as index and value as the row data.
        # See https://stackoverflow.com/a/21005134
        wide_date_df = pd.concat(positivity_time_series, axis=1).T.rename_axis(
            index=CommonFields.LOCATION_ID, columns=CommonFields.DATE)

        # Make a dataset with TEST_POSITIVITY for every region where the calculation finished.
        all_output = _make_output_dataset(
            dataset,
            self.columns,
            wide_date_df,
            CommonFields.TEST_POSITIVITY,
        )
        # Make a dataset with the subset of regions having recent input timeseries.
        ds_recent = all_output.get_regions_subset(recent_regions)
        return MethodOutput(all_output=all_output, recent=ds_recent)
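The dict-to-DataFrame conversion above is the pattern from the linked Stack Overflow answer; a self-contained sketch of just that step, with illustrative location ids:

import pandas as pd

dates = pd.to_datetime(["2020-05-01", "2020-05-02"])
positivity_time_series = {
    "iso1:us#fips:55005": pd.Series([0.10, 0.20], index=dates),
    "iso1:us#fips:55006": pd.Series([0.30, 0.40], index=dates),
}
# concat along axis=1 makes the dict keys column labels; the transpose turns
# them into the row index, giving one row per location and one column per date.
wide_date_df = pd.concat(positivity_time_series, axis=1).T.rename_axis(
    index="location_id", columns="date")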
Code example #9
def test_aggregate():
    df_in = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,m1,date,foo\n"
        "55005,ZZ,county,North County,1,2020-05-01,11\n"
        "55005,ZZ,county,North County,2,2020-05-02,22\n"
        "55005,ZZ,county,North County,3,2020-05-03,33\n"
        "55005,ZZ,county,North County,0,2020-05-04,0\n"
        "55006,ZZ,county,South County,0,2020-05-01,0\n"
        "55006,ZZ,county,South County,0,2020-05-02,0\n"
        "55006,ZZ,county,South County,3,2020-05-03,44\n"
        "55006,ZZ,county,South County,4,2020-05-04,55\n"
        "55,ZZ,state,Grand State,41,2020-05-01,66\n"
        "55,ZZ,state,Grand State,43,2020-05-03,77\n"
    ).reset_index()
    ts_in = MultiRegionDataset.from_fips_timeseries_df(df_in)
    agg = statistical_areas.CountyToCBSAAggregator(
        county_map={"55005": "10001", "55006": "10001"},
        cbsa_title_map={"10001": "Stat Area 1"},
        aggregations=[],
    )
    ts_out = agg.aggregate(ts_in)

    assert ts_out.groupby_region().ngroups == 1

    ts_cbsa = ts_out.get_one_region(Region.from_cbsa_code("10001"))
    assert ts_cbsa.date_indexed["m1"].to_dict() == {
        pd.to_datetime("2020-05-01"): 1,
        pd.to_datetime("2020-05-02"): 2,
        pd.to_datetime("2020-05-03"): 6,
        pd.to_datetime("2020-05-04"): 4,
    }
Code example #10
    def run(
        dataset: timeseries.MultiRegionDataset, fields: List[FieldName]
    ) -> Tuple["TailFilter", timeseries.MultiRegionDataset]:
        """Returns a dataset with recent data that looks bad removed from cumulative fields."""
        timeseries_wide_dates = dataset.timeseries_wide_dates()

        fields_mask = timeseries_wide_dates.index.get_level_values(
            PdFields.VARIABLE).isin(fields)
        to_filter = timeseries_wide_dates.loc[pd.IndexSlice[:, fields_mask], :]
        not_filtered = timeseries_wide_dates.loc[
            pd.IndexSlice[:, ~fields_mask], :]

        tail_filter = TailFilter()
        filtered = to_filter.apply(tail_filter._filter_one_series, axis=1)

        merged = pd.concat([not_filtered, filtered])
        timeseries_wide_variables = merged.stack().unstack(
            PdFields.VARIABLE).sort_index()

        # TODO(tom): Find a generic way to return the counts in tail_filter and stop returning the
        #  object itself.
        return (
            tail_filter,
            dataclasses.replace(
                dataset, timeseries=timeseries_wide_variables).append_tag_df(
                    tail_filter._annotations.as_dataframe()),
        )
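The final stack().unstack(...) reshape is a standard pandas pivot between the two layouts this code moves between: a wide-dates frame (rows keyed by location and variable, columns are dates) and a wide-variables frame (rows keyed by location and date, columns are variables). A minimal sketch with illustrative names:

import pandas as pd

idx = pd.MultiIndex.from_product(
    [["iso1:us#fips:55005"], ["cases", "deaths"]], names=["location_id", "variable"]
)
wide_dates = pd.DataFrame(
    [[1.0, 2.0], [0.0, 1.0]], index=idx,
    columns=pd.Index(pd.to_datetime(["2020-05-01", "2020-05-02"]), name="date"),
)
# stack() moves dates into the row index; unstack("variable") moves variables
# out to the columns, yielding the wide-variables layout.
wide_variables = wide_dates.stack().unstack("variable").sort_index()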
Code example #11
def aggregate_to_new_york_city(
    ds_in: timeseries.MultiRegionDataset,
) -> timeseries.MultiRegionDataset:
    nyc_region = pipeline.Region.from_fips(NEW_YORK_CITY_FIPS)
    # Map from borough / county to the region used for aggregated NYC
    nyc_map = {borough_region: nyc_region for borough_region in ALL_NYC_REGIONS}

    # aggregate_regions only copies number columns. Extract them and re-add to the aggregated
    # dataset.
    static_excluding_numbers = ds_in.get_regions_subset([nyc_region]).static.select_dtypes(
        exclude="number"
    )
    nyc_dataset = timeseries.aggregate_regions(
        ds_in, nyc_map, reporting_ratio_required_to_aggregate=None
    ).add_static_values(static_excluding_numbers.reset_index())

    return ds_in.append_regions(nyc_dataset)
Code example #12
def load_us_timeseries_dataset(
    pointer_directory: pathlib.Path = dataset_utils.DATA_DIRECTORY,
) -> MultiRegionDataset:
    """Returns all combined data. `load_test_dataset` is more suitable for tests."""
    filename = dataset_pointer.form_filename(DatasetType.MULTI_REGION)
    pointer_path = pointer_directory / filename
    pointer = DatasetPointer.parse_raw(pointer_path.read_text())
    return MultiRegionDataset.read_from_pointer(pointer)
Code example #13
    def run(
        dataset_in: MultiRegionDataset,
        methods: Sequence[Method] = TEST_POSITIVITY_METHODS,
        *,
        diff_days: int = 7,
    ) -> "AllMethods":
        """Runs `methods` on `dataset_in` and returns the results or raises a TestPositivityException."""
        relevant_columns = AllMethods._list_columns(
            AllMethods._methods_with_columns_available(
                methods, dataset_in.timeseries.columns))
        if not relevant_columns:
            raise NoMethodsWithRelevantColumns()

        input_wide = dataset_in.timeseries_wide_dates()
        if input_wide.empty:
            raise NoRealTimeseriesValuesException()
        dates = input_wide.columns.get_level_values(CommonFields.DATE)
        most_recent_date = dates.max()

        methods_with_data = AllMethods._methods_with_columns_available(
            methods,
            input_wide.index.get_level_values(PdFields.VARIABLE).unique())
        if not methods_with_data:
            raise NoColumnsWithDataException()

        method_map = {method.name: method for method in methods_with_data}
        calculated_dataset_map = {
            method_name: method.calculate(dataset_in, diff_days,
                                          most_recent_date)
            for method_name, method in method_map.items()
        }
        calculated_dataset_recent_map = {
            name: method_output.recent
            for name, method_output in calculated_dataset_map.items()
        }
        calculated_dataset_all_map = {
            name: method_output.all_output
            for name, method_output in calculated_dataset_map.items()
        }
        # HACK: If SmoothedTests is in calculated_dataset_map (that is, the MethodOutput returned
        # by `calculate`) then add it again at the end of the map with the Method's all_output.
        # Remember that dict entries remain in the order inserted. This makes SmoothedTests the
        # final fallback for a location if no other Method has a timeseries for it.
        old_method_output: Optional[MethodOutput] = calculated_dataset_map.get(
            timeseries.DatasetName("SmoothedTests"))
        if old_method_output:
            calculated_dataset_recent_map[timeseries.DatasetName(
                "SmoothedTestsAll")] = old_method_output.all_output

        # Make a dataset object with one metric, containing for each region the timeseries
        # from the highest priority dataset that has recent data.
        test_positivity = timeseries.combined_datasets(
            {
                CommonFields.TEST_POSITIVITY:
                list(calculated_dataset_recent_map.values())
            }, {})
        return AllMethods(all_methods_datasets=calculated_dataset_all_map,
                          test_positivity=test_positivity)
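timeseries.combined_datasets is project API, but the fallback semantics the HACK relies on can be sketched with plain pandas: earlier (higher-priority) entries win, and a later entry only fills locations the earlier ones lack. Illustrative only:

import pandas as pd

preferred = pd.Series({"loc1": 0.10})               # a higher-priority method's output
smoothed = pd.Series({"loc1": 0.50, "loc2": 0.20})  # SmoothedTests, appended last

# combine_first keeps `preferred` where present and falls back to `smoothed`.
combined = preferred.combine_first(smoothed)        # loc1 -> 0.10, loc2 -> 0.20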
Code example #14
    def make_dataset(cls) -> timeseries.MultiRegionDataset:
        """Default implementation of make_dataset that loads timeseries data from a CSV."""
        assert cls.COMMON_DF_CSV_PATH, f"No path in {cls}"
        data_root = dataset_utils.LOCAL_PUBLIC_DATA_PATH
        input_path = data_root / cls.COMMON_DF_CSV_PATH
        data = common_df.read_csv(input_path, set_index=False)
        data = cls._check_data(data)
        return MultiRegionDataset.from_fips_timeseries_df(
            data).add_provenance_all(cls.SOURCE_NAME)
Code example #15
def _latest_sorted_by_location_date(
    ts: timeseries.MultiRegionDataset, drop_na: bool
) -> pd.DataFrame:
    """Returns the latest data, sorted by LOCATION_ID."""
    df = ts.static_and_timeseries_latest_with_fips().sort_values(
        [CommonFields.LOCATION_ID], ignore_index=True
    )
    if drop_na:
        df = df.dropna(axis="columns", how="all")
    return df
Code example #16
def run_and_maybe_join_columns(mrts: timeseries.MultiRegionDataset,
                               log) -> timeseries.MultiRegionDataset:
    """Calculates test positivity and joins it with the input, if successful."""
    try:
        test_positivity_results = AllMethods.run(mrts)
    except TestPositivityException:
        log.exception("test_positivity failed")
        return mrts

    return mrts.join_columns(test_positivity_results.test_positivity)
Code example #17
def _fips_csv_to_one_region(csv_str: str,
                            region: Region,
                            latest=None) -> OneRegionTimeseriesDataset:
    df = read_csv_and_index_fips_date(csv_str).reset_index()
    # from_fips_timeseries_df adds the location_id column needed by get_one_region
    dataset = MultiRegionDataset.from_fips_timeseries_df(df).get_one_region(
        region)
    if latest:
        return dataclasses.replace(dataset, latest=latest)
    else:
        return dataset
Code example #18
    def _remove_stale_regions(self, all_output: MultiRegionDataset,
                              most_recent_date: pd.Timestamp) -> MethodOutput:
        """Filters the output of all regions, returning both the full output and the
        subset of regions with recent data."""
        assert self.recent_days >= 1
        # The oldest date that is considered recent/not-stale. If recent_days is 1 then this is
        # the most recent day in the input.
        recent_date_cutoff = most_recent_date + pd.to_timedelta(
            1 - self.recent_days, "D")
        return MethodOutput(
            all_output=all_output,
            recent=all_output.drop_stale_timeseries(recent_date_cutoff))
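A worked example of the cutoff arithmetic: with recent_days=3 and a most recent date of 2020-05-10, timeseries whose last real value falls on or after 2020-05-08 count as recent.

import pandas as pd

most_recent_date = pd.Timestamp("2020-05-10")
recent_days = 3
# 1 - recent_days == -2, so the cutoff is two days before the most recent date.
recent_date_cutoff = most_recent_date + pd.to_timedelta(1 - recent_days, "D")
assert recent_date_cutoff == pd.Timestamp("2020-05-08")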
Code example #19
def test_persist_and_load_dataset(tmp_path, nyc_fips):
    region = Region.from_fips(nyc_fips)
    dataset = combined_datasets.load_us_timeseries_dataset()
    timeseries_nyc = dataset.get_regions_subset([region])

    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)

    downloaded_dataset = MultiRegionDataset.read_from_pointer(pointer)
    differ_l = DatasetDiff.make(downloaded_dataset.timeseries)
    differ_r = DatasetDiff.make(timeseries_nyc.timeseries)
    differ_l.compare(differ_r)

    assert not len(differ_l.my_ts)
Code example #20
    def calculate(
        self, dataset: MultiRegionDataset, diff_days: int, most_recent_date: pd.Timestamp
    ) -> MethodOutput:
        numerator_delta = dataset.get_timeseries_not_bucketed_wide_dates(self._numerator).diff(
            periods=diff_days, axis=1
        )
        assert numerator_delta.index.names == [CommonFields.LOCATION_ID]
        assert numerator_delta.columns.names == [CommonFields.DATE]

        denominator_delta = dataset.get_timeseries_not_bucketed_wide_dates(self._denominator).diff(
            periods=diff_days, axis=1
        )
        assert denominator_delta.index.names == [CommonFields.LOCATION_ID]
        assert denominator_delta.columns.names == [CommonFields.DATE]

        # `/` is calculated for each region/state and date.
        wide_date_df = numerator_delta / denominator_delta

        all_output = _make_output_dataset(
            dataset, self.columns, wide_date_df, CommonFields.TEST_POSITIVITY,
        )
        return self._remove_stale_regions(all_output, most_recent_date)
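The core of this method is plain pandas: diff(periods=diff_days, axis=1) takes diff_days-day deltas along the date axis, and `/` divides the two aligned wide frames elementwise. A minimal sketch with illustrative data:

import pandas as pd

dates = pd.Index(pd.to_datetime(["2020-05-01", "2020-05-02", "2020-05-03"]), name="date")
positives = pd.DataFrame([[10.0, 15.0, 30.0]], index=["loc1"], columns=dates)
total = pd.DataFrame([[100.0, 150.0, 250.0]], index=["loc1"], columns=dates)

diff_days = 2
ratio = positives.diff(periods=diff_days, axis=1) / total.diff(periods=diff_days, axis=1)
# 2020-05-03: (30 - 10) / (250 - 100) ~= 0.133; the first diff_days columns are NaN.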
Code example #21
    def calculate(
        self, dataset: MultiRegionDataset, diff_days: int, most_recent_date: pd.Timestamp
    ) -> MethodOutput:
        wide_date_df = dataset.get_timeseries_not_bucketed_wide_dates(self._column)
        assert wide_date_df.index.names == [CommonFields.LOCATION_ID]
        assert wide_date_df.columns.names == [CommonFields.DATE]

        # Optional optimization: the following likely adds the variable/field/column name
        # back into the index which was just taken out. Consider skipping the reindexing.

        all_output = _make_output_dataset(
            dataset, self.columns, wide_date_df, CommonFields.TEST_POSITIVITY
        )
        return self._remove_stale_regions(all_output, most_recent_date)
Code example #22
File: data_source.py Project: epius/covid-data-model
    def make_dataset(cls) -> timeseries.MultiRegionDataset:
        """Default implementation of make_dataset that loads data from the parquet file."""
        ccd_dataset = cls._get_covid_county_dataset()
        rows, source_df = ccd_dataset.query_multiple_variables(
            # pylint: disable=E1101
            cls.VARIABLES,
            log_provider_coverage_warnings=True,
            source_type=cls.SOURCE_TYPE,
        )
        data = rows.unstack(PdFields.VARIABLE)
        data = cls._check_and_removed_unexpected_data(data)
        ds = MultiRegionDataset(timeseries_bucketed=data)
        if not source_df.empty:
            # For each LOCATION_ID-VARIABLE pair keep the source_url row with the last DATE.
            source_tag_df = (
                source_df.sort_values(CommonFields.DATE).groupby(
                    [CommonFields.LOCATION_ID, PdFields.VARIABLE],
                    sort=False).last().reset_index().drop(
                        columns=[CommonFields.DATE])
                # copy before calling tag_df_add_all_bucket_in_place, just to be safe.
                .copy())
            timeseries.tag_df_add_all_bucket_in_place(source_tag_df)
            ds = ds.append_tag_df(source_tag_df)
        return ds
Code example #23
    def calculate(self, dataset: MultiRegionDataset, diff_days: int,
                  most_recent_date: pd.Timestamp) -> MethodOutput:
        df = dataset.timeseries_wide_dates().reorder_levels(
            [PdFields.VARIABLE, CommonFields.LOCATION_ID])
        assert df.columns.names == [CommonFields.DATE]
        assert df.index.names == [PdFields.VARIABLE, CommonFields.LOCATION_ID]
        # df has the field name as the first level of the index. df.loc[field, :] returns a
        # DataFrame without the field label.
        wide_date_df = df.loc[self._column, :]
        # Optional optimization: the following likely adds the variable/field/column name
        # back into the index which was just taken out. Consider skipping the reindexing.

        all_output = _make_output_dataset(dataset, self.columns, wide_date_df,
                                          CommonFields.TEST_POSITIVITY)
        return self._remove_stale_regions(all_output, most_recent_date)
Code example #24
def aggregate_puerto_rico_from_counties(
    dataset: timeseries.MultiRegionDataset,
) -> timeseries.MultiRegionDataset:
    """Returns a dataset in which NA static values for the state PR are filled with
    values aggregated from its counties."""
    pr_counties = dataset.get_subset(AggregationLevel.COUNTY, state="PR")
    if pr_counties.location_ids.empty:
        return dataset
    aggregated = _aggregate_ignoring_nas(
        pr_counties.static.select_dtypes(include="number"))
    pr_location_id = pipeline.Region.from_state("PR").location_id

    patched_static = dataset.static.copy()
    for field, aggregated_value in aggregated.items():
        if pd.isna(patched_static.at[pr_location_id, field]):
            patched_static.at[pr_location_id, field] = aggregated_value

    return dataclasses.replace(dataset, static=patched_static)
Code example #25
def test_load_from_local_public_data():
    agg = statistical_areas.CountyToCBSAAggregator.from_local_public_data()
    agg = dataclasses.replace(agg, aggregations=[])  # Disable scaled aggregations

    assert agg.cbsa_title_map["43580"] == "Sioux City, IA-NE-SD"
    assert agg.county_map["48187"] == "41700"

    df_in = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,m1,date,foo\n"
        "48059,ZZ,county,North County,3,2020-05-03,33\n"
        "48253,ZZ,county,South County,4,2020-05-03,77\n"
        "48441,ZZ,county,Other County,2,2020-05-03,41\n"
    ).reset_index()
    ts_in = MultiRegionDataset.from_fips_timeseries_df(df_in)
    ts_out = agg.aggregate(ts_in)
    ts_cbsa = ts_out.get_one_region(Region.from_cbsa_code("10180"))
    assert ts_cbsa.date_indexed["m1"].to_dict() == {
        pd.to_datetime("2020-05-03"): 9,
    }
Code example #26
    def make_dataset(cls) -> timeseries.MultiRegionDataset:
        """Default implementation of make_dataset that loads data from the parquet file."""
        assert cls.VARIABLES
        ccd_dataset = CanScraperBase._get_covid_county_dataset()
        data, source_urls_df = ccd_dataset.query_multiple_variables(
            cls.VARIABLES, log_provider_coverage_warnings=True)
        data = cls.transform_data(data)
        data = cls._check_data(data)
        ds = MultiRegionDataset.from_fips_timeseries_df(
            data).add_provenance_all(cls.SOURCE_NAME)
        if not source_urls_df.empty:
            # For each FIPS-VARIABLE pair keep the source_url row with the last DATE.
            source_urls_df = (source_urls_df.sort_values(
                CommonFields.DATE).groupby(
                    [CommonFields.FIPS, PdFields.VARIABLE],
                    sort=False).last().reset_index().drop(
                        columns=[CommonFields.DATE]))
            source_urls_df[taglib.TagField.TYPE] = taglib.TagType.SOURCE_URL
            ds = ds.append_fips_tag_df(source_urls_df)
        return ds
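The sort-then-groupby(...).last() step is a common pandas idiom for keeping the newest row per group; a self-contained sketch of just that step with illustrative data:

import pandas as pd

urls = pd.DataFrame({
    "fips": ["55005", "55005"],
    "variable": ["cases", "cases"],
    "date": pd.to_datetime(["2020-05-01", "2020-05-02"]),
    "source_url": ["http://example.com/old", "http://example.com/new"],
})
# After sorting by date, last() within each (fips, variable) group keeps the
# newest source_url; the date column is then dropped.
latest = (urls.sort_values("date")
              .groupby(["fips", "variable"], sort=False).last()
              .reset_index().drop(columns=["date"]))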
Code example #27
def _make_output_dataset(
    dataset_in: MultiRegionDataset,
    source_columns: Collection[FieldName],
    wide_date_df: pd.DataFrame,
    output_metric: FieldName,
) -> MultiRegionDataset:
    """Returns a dataset with `wide_date_df` in a column named `output_metric` and provenance
    information copied from `dataset_in`.

    Args:
        dataset_in: original MultiRegionDataset.
        source_columns: columns that were a source for the output. tags are copied from these to
          the output.
        wide_date_df: the timeseries output, copied to the returned dataset
        output_metric: wide_date_df is copied to this metric/column in the returned dataset
    """
    assert wide_date_df.index.names == [CommonFields.LOCATION_ID]
    # Drop all-NA timeseries now, as done in from_timeseries_wide_dates_df. This makes sure
    # `locations` is used to build provenance information for only timeseries in the returned
    # MultiRegionDataset.
    wide_date_df = wide_date_df.dropna(axis="rows", how="all")
    locations = wide_date_df.index.get_level_values(CommonFields.LOCATION_ID)
    _append_variable_index_level(wide_date_df, output_metric)

    assert dataset_in.tag.index.names == [
        TagField.LOCATION_ID,
        TagField.VARIABLE,
        TagField.TYPE,
    ]
    dataset_out = MultiRegionDataset.from_timeseries_wide_dates_df(
        wide_date_df)
    if source_columns:
        source_tags = dataset_in.tag.loc[locations,
                                         list(source_columns)].reset_index()
        source_tags[TagField.VARIABLE] = output_metric
        # When there are two source_columns they usually contain the same provenance content.
        # Only keep one copy of it.
        output_tags = source_tags.drop_duplicates(ignore_index=True)
        dataset_out = dataset_out.append_tag_df(output_tags)

    return dataset_out
Code example #28
def run(dataset: timeseries.MultiRegionDataset,
        config: Config = CONFIG) -> timeseries.MultiRegionDataset:
    for filter_ in config.filters:
        filtered_dataset, passed_dataset = dataset.partition_by_region(
            filter_.regions_included, exclude=filter_.regions_excluded)
        if filtered_dataset.location_ids.empty:
            # TODO(tom): Find a cleaner way to refer to a filter in logs.
            _logger.info("No locations matched",
                         regions=str(filter_.regions_included))
            continue
        if filter_.drop_observations:
            filtered_dataset = drop_observations(filtered_dataset, filter_)
        else:
            ts_selected_fields, _ = _partition_by_fields(
                filtered_dataset.timeseries_bucketed_wide_dates,
                filter_.fields_included)
            filtered_dataset = filtered_dataset.add_tag_to_subset(
                filter_.tag, ts_selected_fields.index)

        dataset = filtered_dataset.append_regions(passed_dataset)

    return dataset
Code example #29
    def calculate(self, dataset: MultiRegionDataset, diff_days: int,
                  most_recent_date: pd.Timestamp) -> MethodOutput:
        delta_df = (dataset.timeseries_wide_dates().reorder_levels(
            [PdFields.VARIABLE,
             CommonFields.LOCATION_ID]).diff(periods=diff_days, axis=1))
        assert delta_df.columns.names == [CommonFields.DATE]
        assert delta_df.index.names == [
            PdFields.VARIABLE, CommonFields.LOCATION_ID
        ]
        # delta_df has the field name as the first level of the index. delta_df.loc[field, :] returns a
        # DataFrame without the field label so operators such as `/` are calculated for each
        # region/state and date.
        wide_date_df = delta_df.loc[self._numerator, :] / delta_df.loc[
            self._denominator, :]

        all_output = _make_output_dataset(
            dataset,
            self.columns,
            wide_date_df,
            CommonFields.TEST_POSITIVITY,
        )
        return self._remove_stale_regions(all_output, most_recent_date)
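The reorder_levels plus .loc[field, :] trick is worth isolating: moving VARIABLE to the first index level lets a scalar .loc select one variable and drop that level, so the numerator and denominator selections align on LOCATION_ID for the elementwise `/`. A minimal sketch with illustrative names:

import pandas as pd

idx = pd.MultiIndex.from_product(
    [["loc1"], ["positive_tests", "total_tests"]], names=["location_id", "variable"]
)
wide = pd.DataFrame([[10.0, 20.0], [100.0, 200.0]], index=idx,
                    columns=pd.to_datetime(["2020-05-01", "2020-05-02"]))
by_variable = wide.reorder_levels(["variable", "location_id"])
# Each .loc drops the variable level, leaving frames indexed by location_id only.
ratio = by_variable.loc["positive_tests", :] / by_variable.loc["total_tests", :]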
Code example #30
def drop_all_zero_timeseries(
        ds_in: timeseries.MultiRegionDataset,
        fields: Collection[CommonFields]) -> timeseries.MultiRegionDataset:
    """Returns a dataset with `fields` timeseries dropped if they contain only NA and 0.

    When first built this is dropping a timeseries in Loving County, TX which has so few people
    that the all-zero timeseries is likely accurate. It may be worth only applying this to
    locations with a population over some threshold. Or perhaps an automatic filter isn't worth
    the trouble after all :-(
    """
    ts_wide = ds_in.timeseries_wide_dates()

    # Separate into timeseries in `fields` and all others.
    variable_mask = ts_wide.index.get_level_values(
        PdFields.VARIABLE).isin(fields)
    ts_wide_other_variables = ts_wide.loc[~variable_mask]
    ts_wide_variables = ts_wide.loc[variable_mask]

    # Keep rows/timeseries that have at least one value that is not 0 or NA
    to_keep_mask = ts_wide_variables.replace(pd.NA, 0).any(axis=1)
    to_drop = ts_wide_variables.loc[~to_keep_mask].index
    if not to_drop.empty:
        # Maybe add filtering to not log about the known bad data in OH counties and Loving
        # County Texas using a RegionMask(level=County, state=OH) and some kind of RegionMask
        # representing counties with a small population.
        _log.info(DROPPING_TIMESERIES_WITH_ONLY_ZEROS, dropped=to_drop)
    ts_wide_kept = ts_wide_variables.loc[to_keep_mask]

    ts_wide_out = pd.concat([ts_wide_kept, ts_wide_other_variables])

    # Make a new dataset without the dropped timeseries. This does not drop the tags of the
    # dropped timeseries but keeping the provenance tags doesn't seem to be a problem. Maybe it'd
    # be cleaner to add a method 'MultiRegionDataset.drop_timeseries' similar to 'remove_regions' or
    # move this into 'MultiRegionDataset' similar to 'drop_stale_timeseries'.
    return dataclasses.replace(ds_in,
                               timeseries=ts_wide_out.stack().unstack(
                                   PdFields.VARIABLE).sort_index())
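The keep mask is plain pandas: after replacing NA-like values with 0, .any(axis=1) is True only for rows with at least one non-zero value. A minimal sketch mirroring the replace(pd.NA, 0) call above, with illustrative row labels:

import pandas as pd

ts_wide = pd.DataFrame(
    [[0.0, None], [1.0, 0.0], [None, None]],
    index=["all_zero_or_na", "has_value", "all_na"],
    columns=pd.to_datetime(["2020-05-01", "2020-05-02"]),
)
# replace() treats NA-like values interchangeably, so NaN also becomes 0 here.
to_keep_mask = ts_wide.replace(pd.NA, 0).any(axis=1)
kept = ts_wide.loc[to_keep_mask]  # only the "has_value" row survives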