Beispiel #1
0
def test_nyc_aggregation(are_boroughs_zero):
    """Exercise ``update_with_combined_new_york_counties`` on a tiny frame.

    The frame holds one New York County row, one row for the first NYC
    borough FIPS and one default row. Two rows are expected back and, after
    sorting by ``fips``, the second one is treated as the NYC row.

    Args:
        are_boroughs_zero: When truthy the borough row carries zero counts
            and the flag is forwarded to the aggregation unchanged.
    """
    county_cases = 10
    if are_boroughs_zero:
        borough_count = 0
    else:
        borough_count = 10

    frame = pd.DataFrame([
        default_timeseries_row(
            fips=custom_aggregations.NEW_YORK_COUNTY_FIPS,
            cases=county_cases,
        ),
        default_timeseries_row(
            fips=custom_aggregations.NYC_BOROUGH_FIPS[0],
            cases=borough_count,
            deaths=borough_count,
            recovered=borough_count,
        ),
        default_timeseries_row(),
    ])

    # Todo: figure out a better way to define these groups.
    group_keys = [
        'date',
        'source',
        'country',
        'aggregate_level',
        'state',
        'generated',
    ]
    combined = custom_aggregations.update_with_combined_new_york_counties(
        frame, group_keys, are_boroughs_zero=are_boroughs_zero)
    records = combined.sort_values('fips').to_dict(orient='records')

    assert len(records) == 2

    # With zeroed boroughs only the county count is expected; otherwise the
    # borough count is expected to be added on top of it.
    expected_cases = (county_cases if are_boroughs_zero
                      else county_cases + borough_count)
    assert records[1]['cases'] == expected_cases
Beispiel #2
0
def test_nyc_aggregation(are_boroughs_zero):
    """Exercise ``update_with_combined_new_york_counties`` on a tiny frame.

    The frame holds one New York County row, one row for the first NYC
    borough FIPS and one default row. Two rows are expected back and, after
    sorting by ``fips``, the second one is treated as the NYC row.

    Args:
        are_boroughs_zero: When truthy the borough row carries zero counts
            and the flag is forwarded to the aggregation unchanged.
    """
    county_cases = 10
    if are_boroughs_zero:
        borough_count = 0
    else:
        borough_count = 10

    frame = pd.DataFrame([
        default_timeseries_row(
            fips=custom_aggregations.NEW_YORK_COUNTY_FIPS,
            cases=county_cases,
        ),
        default_timeseries_row(
            fips=custom_aggregations.NYC_BOROUGH_FIPS[0],
            cases=borough_count,
            deaths=borough_count,
            recovered=borough_count,
        ),
        default_timeseries_row(),
    ])

    # Todo: figure out a better way to define these groups.
    group_keys = ["date", "source", "country", "aggregate_level", "state"]
    combined = custom_aggregations.update_with_combined_new_york_counties(
        frame, group_keys, are_boroughs_zero=are_boroughs_zero)
    records = combined.sort_values("fips").to_dict(orient="records")

    assert len(records) == 2
    nyc_record = records[1]

    if not are_boroughs_zero:
        # Borough counts are expected on top of the county count, and the
        # ICU column is expected to be missing rather than a number.
        assert nyc_record["cases"] == county_cases + borough_count
        assert pd.isna(nyc_record["current_icu"])
    else:
        assert nyc_record["cases"] == county_cases
    def from_source(cls,
                    source: "DataSource",
                    fill_missing_state: bool = True) -> "TimeseriesDataset":
        """Build a dataset from the rows held by ``source``.

        Steps, in order: combine the New York counties, optionally append
        state rows derived from county rows, attach county information from
        FIPS, set the FIPS of state-level rows from the state abbreviation,
        drop rows that still have no FIPS (logging them), reject duplicate
        keys and finally sort by date.

        Args:
            source: DataSource to standardize for timeseries dataset
            fill_missing_state: If True, backfills missing state data by
                calculating county level aggregates.

        Returns:
            An instance of ``cls`` wrapping the processed frame and carrying
            ``source.provenance``.

        Raises:
            DuplicateDataException: If any rows share the same values for
                ``COMMON_FIELDS_TIMESERIES_KEYS``.
        """
        frame = source.data
        frame = custom_aggregations.update_with_combined_new_york_counties(
            frame,
            [
                CommonFields.DATE,
                CommonFields.COUNTRY,
                CommonFields.AGGREGATE_LEVEL,
                CommonFields.STATE,
            ],
            are_boroughs_zero=source.HAS_AGGREGATED_NYC_BOROUGH,
        )

        if fill_missing_state:
            state_rows = dataset_utils.aggregate_and_get_nonmatching(
                frame,
                [CommonFields.DATE, CommonFields.COUNTRY, CommonFields.STATE],
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            )
            frame = pd.concat([frame, state_rows.reset_index()])

        frame = dataset_utils.add_county_using_fips(
            frame, dataset_utils.build_fips_data_frame())

        # State-level rows get their FIPS from the state abbreviation table.
        state_mask = (
            frame[CommonFields.AGGREGATE_LEVEL] == AggregationLevel.STATE.value)
        frame.loc[state_mask, CommonFields.FIPS] = frame.loc[
            state_mask, CommonFields.STATE].map(us_state_abbrev.ABBREV_US_FIPS)

        missing_fips = frame[CommonFields.FIPS].isnull()
        if missing_fips.any():
            _log.warning("Dropping rows without FIPS",
                         source=str(source),
                         rows=repr(frame.loc[missing_fips]))
            frame = frame.loc[~missing_fips]

        # keep=False flags every member of a duplicate group, so the
        # exception carries all offending rows.
        duplicated = frame.duplicated(COMMON_FIELDS_TIMESERIES_KEYS,
                                      keep=False)
        if duplicated.any():
            raise DuplicateDataException(f"Duplicates in {source}",
                                         frame.loc[duplicated])

        # Choosing to sort by date
        return cls(frame.sort_values(CommonFields.DATE),
                   provenance=source.provenance)
Beispiel #4
0
    def from_source(cls,
                    source: "DataSource",
                    fill_missing_state: bool = True) -> "TimeseriesDataset":
        """Build a dataset from the rows held by ``source``.

        Source columns are renamed to their common field names (and only
        those columns are kept), the New York counties are combined, state
        rows derived from county rows are optionally appended, county
        information is attached from FIPS, state-level rows get their FIPS
        from the state abbreviation, and the result is sorted by date.

        Args:
            source: DataSource to standardize for timeseries dataset
            fill_missing_state: If True, backfills missing state data by
                calculating county level aggregates.

        Returns:
            An instance of ``cls`` wrapping the processed frame.
        """
        raw = source.data
        # TODO(tom): Do this renaming upstream, when the source is loaded or when first copied from the third party.
        # all_fields_map() is keyed by common name; invert it so it can drive
        # the column rename.
        renames = {
            source_column: common_name
            for common_name, source_column in source.all_fields_map().items()
        }
        frame = raw.rename(columns=renames)[renames.values()]

        frame = custom_aggregations.update_with_combined_new_york_counties(
            frame,
            [
                CommonFields.DATE,
                CommonFields.COUNTRY,
                CommonFields.AGGREGATE_LEVEL,
                CommonFields.STATE,
            ],
            are_boroughs_zero=source.HAS_AGGREGATED_NYC_BOROUGH,
        )

        if fill_missing_state:
            state_rows = dataset_utils.aggregate_and_get_nonmatching(
                frame,
                [CommonFields.DATE, CommonFields.COUNTRY, CommonFields.STATE],
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            )
            frame = pd.concat([frame, state_rows.reset_index()])

        frame = dataset_utils.add_county_using_fips(
            frame, dataset_utils.build_fips_data_frame())

        # State-level rows get their FIPS from the state abbreviation table.
        state_mask = (
            frame[CommonFields.AGGREGATE_LEVEL] == AggregationLevel.STATE.value)
        frame.loc[state_mask, CommonFields.FIPS] = frame.loc[
            state_mask, CommonFields.STATE].map(us_state_abbrev.ABBREV_US_FIPS)

        # Choosing to sort by date
        return cls(frame.sort_values(CommonFields.DATE))
Beispiel #5
0
    def from_source(cls, source: "DataSource", fill_missing_state=True):
        """Build a beds dataset from the rows held by ``source``.

        Remaps columns from the source dataset, derives a max bed count,
        combines the New York counties, optionally fills in state rows by
        computing aggregates, and adds standardized county names from fips.

        Args:
            source: Data source.
            fill_missing_state: If True, fills in missing state level data by
                aggregating county level for a given state.

        Returns:
            An instance of ``cls`` wrapping the processed frame.

        Raises:
            ValueError: If ``source.BEDS_FIELD_MAP`` is empty or otherwise
                falsy.
        """
        field_map = source.BEDS_FIELD_MAP
        if not field_map:
            raise ValueError("Source must have beds field map.")

        fields = cls.Fields
        raw = source.data

        # The field map is keyed by common name; invert it so it can drive
        # the column rename, then keep only the renamed columns.
        renames = {
            source_column: common_name
            for common_name, source_column in field_map.items()
        }
        frame = raw.rename(columns=renames)[renames.values()]
        frame[fields.SOURCE] = source.SOURCE_NAME
        frame[fields.GENERATED] = False

        # Generating max bed count: row-wise maximum of staffed and licensed.
        bed_columns = [fields.STAFFED_BEDS, fields.LICENSED_BEDS]
        frame[fields.MAX_BED_COUNT] = frame[bed_columns].max(axis=1)

        # GENERATED is added to the grouping key so that, when grouping nyc
        # data, it is not counted as a value to sum.
        nyc_group_keys = cls.STATE_GROUP_KEY + [fields.GENERATED]
        frame = custom_aggregations.update_with_combined_new_york_counties(
            frame, nyc_group_keys, are_boroughs_zero=False)

        if fill_missing_state:
            generated_states = dataset_utils.aggregate_and_get_nonmatching(
                frame,
                cls.STATE_GROUP_KEY,
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            # Rows appended here are flagged as generated.
            generated_states[fields.GENERATED] = True
            frame = pd.concat([frame, generated_states])

        frame = dataset_utils.add_county_using_fips(
            frame, dataset_utils.build_fips_data_frame())
        return cls(frame)
Beispiel #6
0
    def from_source(cls, source: "DataSource", fill_missing_state=True):
        """Build a timeseries dataset from the rows held by ``source``.

        Source columns are renamed to their common field names (and only
        those columns are kept), SOURCE and GENERATED columns are added, the
        New York counties are combined, state rows derived from county rows
        are optionally appended and flagged as generated, county information
        is attached from FIPS, and the result is sorted by date.

        Args:
            source: Data source to load from.
            fill_missing_state: If True, appends state rows produced by
                ``dataset_utils.aggregate_and_get_nonmatching``.

        Returns:
            An instance of ``cls`` wrapping the processed frame.

        Raises:
            ValueError: If ``source.TIMESERIES_FIELD_MAP`` is empty or
                otherwise falsy.
        """
        field_map = source.TIMESERIES_FIELD_MAP
        if not field_map:
            raise ValueError("Source must have field timeseries field map.")

        fields = cls.Fields
        raw = source.data

        # The field map is keyed by common name; invert it so it can drive
        # the column rename, then keep only the renamed columns.
        renames = {
            source_column: common_name
            for common_name, source_column in field_map.items()
        }
        frame = raw.rename(columns=renames)[renames.values()]
        frame[fields.SOURCE] = source.SOURCE_NAME
        frame[fields.GENERATED] = False

        frame = custom_aggregations.update_with_combined_new_york_counties(
            frame,
            [
                fields.DATE,
                fields.SOURCE,
                fields.COUNTRY,
                fields.AGGREGATE_LEVEL,
                fields.STATE,
                fields.GENERATED,
            ],
            are_boroughs_zero=True,
        )

        if fill_missing_state:
            generated_states = dataset_utils.aggregate_and_get_nonmatching(
                frame,
                [fields.DATE, fields.SOURCE, fields.COUNTRY, fields.STATE],
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            # Rows appended here are flagged as generated.
            generated_states[fields.GENERATED] = True
            frame = pd.concat([frame, generated_states])

        frame = dataset_utils.add_county_using_fips(
            frame, dataset_utils.build_fips_data_frame())

        # Choosing to sort by date
        return cls(frame.sort_values(fields.DATE))
Beispiel #7
0
    def from_source(cls, source: "DataSource", fill_missing_state=True):
        """Build a beds dataset from the rows held by ``source``.

        Remaps columns from the source dataset, combines the New York
        counties, optionally fills in state rows by computing aggregates,
        and adds standardized county names from fips.

        Args:
            source: Data source to load from.
            fill_missing_state: If True, appends state rows produced by
                ``dataset_utils.aggregate_and_get_nonmatching``.

        Returns:
            An instance of ``cls`` wrapping the processed frame.

        Raises:
            ValueError: If ``source.BEDS_FIELD_MAP`` is empty or otherwise
                falsy.
        """
        field_map = source.BEDS_FIELD_MAP
        if not field_map:
            raise ValueError("Source must have beds field map.")

        fields = cls.Fields
        raw = source.data

        # The field map is keyed by common name; invert it so it can drive
        # the column rename, then keep only the renamed columns.
        renames = {
            source_column: common_name
            for common_name, source_column in field_map.items()
        }
        frame = raw.rename(columns=renames)[renames.values()]
        frame[fields.SOURCE] = source.SOURCE_NAME
        frame[fields.GENERATED] = False

        frame = custom_aggregations.update_with_combined_new_york_counties(
            frame,
            [
                fields.SOURCE,
                fields.AGGREGATE_LEVEL,
                fields.STATE,
                fields.GENERATED,
            ],
            are_boroughs_zero=False,
        )

        if fill_missing_state:
            generated_states = dataset_utils.aggregate_and_get_nonmatching(
                frame,
                [fields.SOURCE, fields.STATE],
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            # Rows appended here are flagged as generated.
            generated_states[fields.GENERATED] = True
            frame = pd.concat([frame, generated_states])

        frame = dataset_utils.add_county_using_fips(
            frame, dataset_utils.build_fips_data_frame())
        return cls(frame)
Beispiel #8
0
    def _aggregate_new_york_data(cls, data):
        """Combine the NYC rows and restore bed-weighted occupancy rates.

        Before the rows are combined, a bed-weighted mean is computed over
        the rows whose FIPS is in ``custom_aggregations.ALL_NYC_FIPS`` for
        each typical-occupancy-rate column present in ``data``. After
        ``update_with_combined_new_york_counties`` has run, those means are
        written onto the row(s) whose FIPS equals
        ``custom_aggregations.NEW_YORK_COUNTY_FIPS``.

        Args:
            data: Frame containing at least the ``cls.Fields.FIPS`` column.

        Returns:
            ``data`` itself when it holds no NYC rows, otherwise the frame
            returned by the aggregation with the weighted rates applied.
        """
        fields = cls.Fields
        nyc_data = data[data[fields.FIPS].isin(
            custom_aggregations.ALL_NYC_FIPS)]
        if nyc_data.empty:
            return data

        available_columns = data.columns

        def bed_weighted_rate(beds_field, rate_field):
            """Return the bed-weighted mean rate, or None if not in data."""
            if rate_field not in available_columns:
                return None
            beds = nyc_data[beds_field]
            return (beds * nyc_data[rate_field]).sum() / beds.sum()

        # Must be computed from the pre-aggregation rows: once the rows are
        # combined the per-row bed counts used as weights are gone.
        weighted_rates = [
            (fields.ALL_BED_TYPICAL_OCCUPANCY_RATE,
             bed_weighted_rate(fields.LICENSED_BEDS,
                               fields.ALL_BED_TYPICAL_OCCUPANCY_RATE)),
            (fields.ICU_TYPICAL_OCCUPANCY_RATE,
             bed_weighted_rate(fields.ICU_BEDS,
                               fields.ICU_TYPICAL_OCCUPANCY_RATE)),
        ]

        # NOTE(review): the previous comment here said the generated field
        # should not be counted as a value to sum when grouping NYC data;
        # the key passed is cls.STATE_GROUP_KEY - confirm it covers that.
        data = custom_aggregations.update_with_combined_new_york_counties(
            data, cls.STATE_GROUP_KEY, are_boroughs_zero=False)

        is_nyc_row = (
            data[fields.FIPS] == custom_aggregations.NEW_YORK_COUNTY_FIPS)
        for rate_field, rate in weighted_rates:
            # Compare against None rather than relying on truthiness: a
            # computed rate of 0.0 is a real value and must still replace
            # whatever the generic aggregation produced for this column.
            if rate is not None:
                data.loc[is_nyc_row, rate_field] = rate

        return data
Beispiel #9
0
    def from_source(cls,
                    source: "DataSource",
                    fill_missing_state: bool = True,
                    fill_na: bool = True) -> "Timeseries":
        """Build a timeseries dataset from the rows held by ``source``.

        Source columns are renamed to their common field names (and only
        those columns are kept), SOURCE and GENERATED columns are added, the
        New York counties are combined, state rows derived from county rows
        are optionally appended and flagged as generated, missing metric
        values are optionally replaced with 0.0, county information is
        attached from FIPS, and the result is sorted by date.

        Args:
            source: DataSource to standardize for timeseries dataset
            fill_missing_state: If True, backfills missing state data by
                calculating county level aggregates.
            fill_na: If True, fills in all NaN values for metrics columns.

        Returns:
            An instance of ``cls`` wrapping the processed frame.

        Raises:
            ValueError: If ``source.TIMESERIES_FIELD_MAP`` is empty or
                otherwise falsy.
        """
        field_map = source.TIMESERIES_FIELD_MAP
        if not field_map:
            raise ValueError("Source must have field timeseries field map.")

        fields = cls.Fields
        raw = source.data

        # The field map is keyed by common name; invert it so it can drive
        # the column rename, then keep only the renamed columns.
        renames = {
            source_column: common_name
            for common_name, source_column in field_map.items()
        }
        frame = raw.rename(columns=renames)[renames.values()]
        frame[fields.SOURCE] = source.SOURCE_NAME
        frame[fields.GENERATED] = False

        frame = custom_aggregations.update_with_combined_new_york_counties(
            frame,
            [
                fields.DATE,
                fields.SOURCE,
                fields.COUNTRY,
                fields.AGGREGATE_LEVEL,
                fields.STATE,
                fields.GENERATED,
            ],
            are_boroughs_zero=True,
        )

        if fill_missing_state:
            generated_states = dataset_utils.aggregate_and_get_nonmatching(
                frame,
                [fields.DATE, fields.SOURCE, fields.COUNTRY, fields.STATE],
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            # Rows appended here are flagged as generated.
            generated_states[fields.GENERATED] = True
            frame = pd.concat([frame, generated_states])

        if fill_na:
            # Only metric columns that actually exist in the dataset are
            # filled. It might be that we want all timeseries datasets to
            # have all of the metric columns; if so, initialization of the
            # missing columns should come earlier.
            present_metrics = [
                name for name in fields.metrics() if name in frame.columns
            ]
            frame[present_metrics] = frame[present_metrics].fillna(0.0)

        frame = dataset_utils.add_county_using_fips(
            frame, dataset_utils.build_fips_data_frame())

        # Choosing to sort by date
        return cls(frame.sort_values(fields.DATE))