Ejemplo n.º 1
0
def test_aggregate():
    """Aggregate two counties into one CBSA and check the per-date sums of `m1`.

    Both county FIPS codes are mapped to CBSA "10001". The input also contains
    state-level rows; the test expects exactly one region in the output.
    """
    csv_text = (
        "fips,state,aggregate_level,county,m1,date,foo\n"
        "55005,ZZ,county,North County,1,2020-05-01,11\n"
        "55005,ZZ,county,North County,2,2020-05-02,22\n"
        "55005,ZZ,county,North County,3,2020-05-03,33\n"
        "55005,ZZ,county,North County,0,2020-05-04,0\n"
        "55006,ZZ,county,South County,0,2020-05-01,0\n"
        "55006,ZZ,county,South County,0,2020-05-02,0\n"
        "55006,ZZ,county,South County,3,2020-05-03,44\n"
        "55006,ZZ,county,South County,4,2020-05-04,55\n"
        "55,ZZ,state,Grand State,41,2020-05-01,66\n"
        "55,ZZ,state,Grand State,43,2020-05-03,77\n"
    )
    df_in = read_csv_and_index_fips_date(csv_text).reset_index()
    ts_in = MultiRegionDataset.from_fips_timeseries_df(df_in)

    # An empty `aggregations` list is passed, as in the sibling tests.
    aggregator = statistical_areas.CountyToCBSAAggregator(
        county_map={"55005": "10001", "55006": "10001"},
        cbsa_title_map={"10001": "Stat Area 1"},
        aggregations=[],
    )
    ts_out = aggregator.aggregate(ts_in)

    # Only the single CBSA region is expected in the aggregated output.
    assert ts_out.groupby_region().ngroups == 1

    cbsa_region = Region.from_cbsa_code("10001")
    ts_cbsa = ts_out.get_one_region(cbsa_region)

    # Expected values are the sums of the two counties' m1 on each date.
    expected_m1 = {
        pd.to_datetime(day): total
        for day, total in (
            ("2020-05-01", 1),
            ("2020-05-02", 2),
            ("2020-05-03", 6),
            ("2020-05-04", 4),
        )
    }
    assert ts_cbsa.date_indexed["m1"].to_dict() == expected_m1
Ejemplo n.º 2
0
 def make_dataset(cls) -> timeseries.MultiRegionDataset:
     """Read the CSV at `cls.COMMON_DF_CSV_PATH` and return it as a dataset.

     The path is resolved relative to `dataset_utils.LOCAL_PUBLIC_DATA_PATH`.
     The loaded frame is passed through `cls._check_data` and the resulting
     dataset gets `cls.SOURCE_NAME` added as provenance.

     Raises:
         AssertionError: if `cls.COMMON_DF_CSV_PATH` is falsy.
     """
     assert cls.COMMON_DF_CSV_PATH, f"No path in {cls}"
     csv_path = dataset_utils.LOCAL_PUBLIC_DATA_PATH / cls.COMMON_DF_CSV_PATH
     checked = cls._check_data(common_df.read_csv(csv_path, set_index=False))
     dataset = MultiRegionDataset.from_fips_timeseries_df(checked)
     return dataset.add_provenance_all(cls.SOURCE_NAME)
def _fips_csv_to_one_region(csv_str: str,
                            region: Region,
                            latest=None) -> OneRegionTimeseriesDataset:
    """Parse `csv_str` into a multi-region dataset and return the data for `region`.

    Args:
        csv_str: CSV text accepted by `read_csv_and_index_fips_date`.
        region: the region to extract with `get_one_region`.
        latest: when truthy, replaces the `latest` attribute of the result.

    Returns:
        The one-region dataset, with `latest` overridden if one was given.
    """
    fips_df = read_csv_and_index_fips_date(csv_str).reset_index()
    multi_region = MultiRegionDataset.from_fips_timeseries_df(fips_df)
    one_region = multi_region.get_one_region(region)
    if not latest:
        # No (truthy) override supplied; return the dataset as loaded.
        return one_region
    return dataclasses.replace(one_region, latest=latest)
Ejemplo n.º 4
0
def test_load_from_local_public_data():
    """Load the aggregator from local public data and aggregate three counties.

    Spot-checks one CBSA title and one county mapping, then verifies that the
    `m1` values of the three input counties (3 + 4 + 2) appear summed under
    CBSA "10180".
    """
    loaded = statistical_areas.CountyToCBSAAggregator.from_local_public_data()
    # Disable scaled aggregations
    agg = dataclasses.replace(loaded, aggregations=[])

    assert agg.cbsa_title_map["43580"] == "Sioux City, IA-NE-SD"
    assert agg.county_map["48187"] == "41700"

    csv_text = (
        "fips,state,aggregate_level,county,m1,date,foo\n"
        "48059,ZZ,county,North County,3,2020-05-03,33\n"
        "48253,ZZ,county,South County,4,2020-05-03,77\n"
        "48441,ZZ,county,Other County,2,2020-05-03,41\n"
    )
    df_in = read_csv_and_index_fips_date(csv_text).reset_index()
    ts_out = agg.aggregate(MultiRegionDataset.from_fips_timeseries_df(df_in))

    cbsa_region = Region.from_cbsa_code("10180")
    m1_by_date = ts_out.get_one_region(cbsa_region).date_indexed["m1"].to_dict()
    expected_m1 = {pd.to_datetime("2020-05-03"): 9}
    assert m1_by_date == expected_m1
Ejemplo n.º 5
0
 def make_dataset(cls) -> timeseries.MultiRegionDataset:
     """Query `cls.VARIABLES` from the covid county dataset and build a dataset.

     The queried data is passed through `cls.transform_data` and
     `cls._check_data`, then given `cls.SOURCE_NAME` as provenance. When the
     query also returns source URL rows, one row per FIPS/variable pair is
     appended to the dataset as a SOURCE_URL tag.

     Raises:
         AssertionError: if `cls.VARIABLES` is falsy.
     """
     assert cls.VARIABLES
     county_dataset = CanScraperBase._get_covid_county_dataset()
     raw, url_rows = county_dataset.query_multiple_variables(
         cls.VARIABLES, log_provider_coverage_warnings=True)
     checked = cls._check_data(cls.transform_data(raw))
     dataset = MultiRegionDataset.from_fips_timeseries_df(checked)
     dataset = dataset.add_provenance_all(cls.SOURCE_NAME)
     if url_rows.empty:
         # No source URLs were returned, so there are no tags to attach.
         return dataset

     # Sort by date first so that `last()` keeps the most recent entry of
     # each FIPS-VARIABLE group; the date column is dropped afterwards.
     by_date = url_rows.sort_values(CommonFields.DATE)
     grouped = by_date.groupby(
         [CommonFields.FIPS, PdFields.VARIABLE], sort=False)
     latest_urls = grouped.last().reset_index()
     latest_urls = latest_urls.drop(columns=[CommonFields.DATE])
     latest_urls[taglib.TagField.TYPE] = taglib.TagType.SOURCE_URL
     return dataset.append_fips_tag_df(latest_urls)
Ejemplo n.º 6
0
 def make_dataset(cls) -> timeseries.MultiRegionDataset:
     """Build the dataset from `cls._load_data()`.

     The loaded data is passed through
     `cls._check_and_removed_unexpected_data`, converted to a
     `MultiRegionDataset`, and tagged with `cls.source_tag()`.
     """
     loaded = cls._load_data()
     cleaned = cls._check_and_removed_unexpected_data(loaded)
     dataset = MultiRegionDataset.from_fips_timeseries_df(cleaned)
     return dataset.add_tag_all_bucket(cls.source_tag())