コード例 #1
0
    def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
        """Add per-state "Unknown" county rows and state aggregate rows.

        For every distinct value in ``data.state`` a placeholder county row is
        added whose FIPS is looked up in ``ABBREV_US_UNKNOWN_COUNTY_FIPS``.
        All rows are then tagged with the county aggregate level and country
        "USA", and the rows returned by
        ``dataset_utils.aggregate_and_get_nonmatching`` (county -> state) are
        appended with a FIPS mapped from ``ABBREV_US_FIPS`` and no county.

        Args:
            data: Source frame; must expose a ``state`` column.

        Returns:
            A new frame with the input rows, the placeholder rows and the
            state aggregate rows.

        Raises:
            KeyError: If a state value has no entry in
                ``ABBREV_US_UNKNOWN_COUNTY_FIPS``.
        """
        unknown_county_rows = [
            {
                cls.Fields.STATE: state,
                cls.Fields.FIPS: ABBREV_US_UNKNOWN_COUNTY_FIPS[state],
                cls.Fields.POPULATION: None,
                cls.Fields.COUNTY: "Unknown",
            }
            for state in data.state.unique()
        ]

        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the supported replacement. Concatenating even when
        # there is nothing to add keeps the "work on a new frame" behaviour,
        # so the column assignments below never touch the caller's frame.
        frames = [data]
        if unknown_county_rows:
            frames.append(pd.DataFrame(unknown_county_rows))
        data = pd.concat(frames)

        # All DH data is aggregated at the county level
        data[cls.Fields.AGGREGATE_LEVEL] = AggregationLevel.COUNTY.value
        data[cls.Fields.COUNTRY] = "USA"

        states_aggregated = dataset_utils.aggregate_and_get_nonmatching(
            data,
            [cls.Fields.COUNTRY, cls.Fields.STATE, cls.Fields.AGGREGATE_LEVEL],
            AggregationLevel.COUNTY,
            AggregationLevel.STATE,
        ).reset_index()
        states_aggregated[cls.Fields.FIPS] = states_aggregated[
            cls.Fields.STATE].map(ABBREV_US_FIPS)
        states_aggregated[cls.Fields.COUNTY] = None

        return pd.concat([data, states_aggregated])
コード例 #2
0
    def from_source(cls, source: "DataSource", fill_missing_state=True):
        """Build a dataset from ``source`` using its beds field map.

        Source columns are renamed to the common field names and only those
        columns are kept. Every row is stamped with the source name and
        flagged as not generated. Optionally, the rows returned by the
        county -> state aggregation helper are appended and flagged as
        generated. Finally the FIPS lookup frame is used to add county data.

        Args:
            source: Data source exposing ``BEDS_FIELD_MAP``, ``data`` and
                ``SOURCE_NAME``.
            fill_missing_state: When true, append the rows returned by
                ``dataset_utils.aggregate_and_get_nonmatching``.

        Raises:
            ValueError: If ``source.BEDS_FIELD_MAP`` is falsy.
        """
        if not source.BEDS_FIELD_MAP:
            raise ValueError("Source must have beds field map.")

        raw = source.data
        # BEDS_FIELD_MAP goes common field -> source column; rename needs the
        # opposite direction.
        source_to_common = {
            source_column: common_field
            for common_field, source_column in source.BEDS_FIELD_MAP.items()
        }
        renamed = raw.rename(columns=source_to_common)
        data = renamed[source_to_common.values()]
        data[cls.Fields.SOURCE] = source.SOURCE_NAME
        data[cls.Fields.GENERATED] = False

        if fill_missing_state:
            generated_rows = dataset_utils.aggregate_and_get_nonmatching(
                data,
                [cls.Fields.SOURCE, cls.Fields.STATE],
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            generated_rows[cls.Fields.GENERATED] = True
            data = pd.concat([data, generated_rows])

        return cls(
            dataset_utils.add_county_using_fips(
                data, dataset_utils.build_fips_data_frame()))
コード例 #3
0
    def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
        """Add per-state "Unknown" county rows and state aggregate rows.

        For every distinct value in ``data.state`` a placeholder county row is
        added with FIPS ``enums.UNKNOWN_FIPS``. All rows are then tagged with
        the county aggregate level and country "USA", and the rows returned by
        ``dataset_utils.aggregate_and_get_nonmatching`` (county -> state) are
        appended with a FIPS mapped from ``us_state_abbrev.abbrev_us_fips``
        and no county.

        Args:
            data: Source frame; must expose a ``state`` column.

        Returns:
            A new frame with the input rows, the placeholder rows and the
            state aggregate rows.
        """
        unknown_county_rows = [
            {
                cls.Fields.STATE: state,
                # TODO(chris): Possibly separate fips out by state prefix
                cls.Fields.FIPS: enums.UNKNOWN_FIPS,
                cls.Fields.POPULATION: None,
                cls.Fields.COUNTY: 'Unknown',
            }
            for state in data.state.unique()
        ]

        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the supported replacement. Concatenating even when
        # there is nothing to add keeps the "work on a new frame" behaviour,
        # so the column assignments below never touch the caller's frame.
        frames = [data]
        if unknown_county_rows:
            frames.append(pd.DataFrame(unknown_county_rows))
        data = pd.concat(frames)

        # All DH data is aggregated at the county level
        data[cls.Fields.AGGREGATE_LEVEL] = AggregationLevel.COUNTY.value
        data[cls.Fields.COUNTRY] = "USA"

        states_aggregated = dataset_utils.aggregate_and_get_nonmatching(
            data,
            [cls.Fields.COUNTRY, cls.Fields.STATE, cls.Fields.AGGREGATE_LEVEL],
            AggregationLevel.COUNTY,
            AggregationLevel.STATE,
        ).reset_index()
        states_aggregated[cls.Fields.FIPS] = states_aggregated[
            cls.Fields.STATE].map(us_state_abbrev.abbrev_us_fips)
        states_aggregated[cls.Fields.COUNTY] = None

        return pd.concat([data, states_aggregated])
コード例 #4
0
    def from_source(cls,
                    source: "DataSource",
                    fill_missing_state: bool = True) -> "TimeseriesDataset":
        """Build a timeseries dataset from ``source``.

        Steps, in order: run the New York county update helper, optionally
        append county -> state aggregate rows, add county data from the FIPS
        lookup frame, set FIPS on state-level rows from the state
        abbreviation, drop (and log) rows that still have no FIPS, reject
        duplicated keys, and sort by date.

        Args:
            source: DataSource to standardize for the timeseries dataset.
            fill_missing_state: If True, backfills missing state data by
                calculating county level aggregates.

        Returns:
            Instance of ``cls`` built from the standardized frame and the
            source's provenance.

        Raises:
            DuplicateDataException: If any rows are duplicated on
                ``COMMON_FIELDS_TIMESERIES_KEYS``.
        """
        raw = source.data
        nyc_group_fields = [
            CommonFields.DATE,
            CommonFields.COUNTRY,
            CommonFields.AGGREGATE_LEVEL,
            CommonFields.STATE,
        ]
        data = custom_aggregations.update_with_combined_new_york_counties(
            raw,
            nyc_group_fields,
            are_boroughs_zero=source.HAS_AGGREGATED_NYC_BOROUGH,
        )

        if fill_missing_state:
            state_rows = dataset_utils.aggregate_and_get_nonmatching(
                data,
                [CommonFields.DATE, CommonFields.COUNTRY, CommonFields.STATE],
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            data = pd.concat([data, state_rows])

        data = dataset_utils.add_county_using_fips(
            data, dataset_utils.build_fips_data_frame())

        # State-level rows get their FIPS from the state abbreviation map.
        state_mask = (data[CommonFields.AGGREGATE_LEVEL]
                      == AggregationLevel.STATE.value)
        data.loc[state_mask, CommonFields.FIPS] = data.loc[
            state_mask, CommonFields.STATE].map(us_state_abbrev.ABBREV_US_FIPS)

        missing_fips = data[CommonFields.FIPS].isnull()
        if missing_fips.any():
            # Log the offending rows before they are removed.
            _log.warning("Dropping rows without FIPS",
                         source=str(source),
                         rows=repr(data.loc[missing_fips]))
            data = data.loc[~missing_fips]

        # keep=False marks every member of a duplicate group, not just the
        # later ones, so the exception carries all conflicting rows.
        duplicated = data.duplicated(COMMON_FIELDS_TIMESERIES_KEYS, keep=False)
        if duplicated.any():
            raise DuplicateDataException(f"Duplicates in {source}",
                                         data.loc[duplicated])

        # Choosing to sort by date
        return cls(data.sort_values(CommonFields.DATE),
                   provenance=source.provenance)
コード例 #5
0
    def from_source(cls,
                    source: "DataSource",
                    fill_missing_state: bool = True) -> "TimeseriesDataset":
        """Build a timeseries dataset from ``source``.

        Source columns are renamed to common field names (only mapped columns
        are kept), the New York county update helper is applied, county ->
        state aggregate rows are optionally appended, county data is added
        from the FIPS lookup frame, state-level rows get a FIPS from the
        state abbreviation, and the result is sorted by date.

        Args:
            source: DataSource to standardize for the timeseries dataset.
            fill_missing_state: If True, backfills missing state data by
                calculating county level aggregates.

        Returns:
            Instance of ``cls`` built from the standardized frame.
        """
        raw = source.data
        # TODO(tom): Do this renaming upstream, when the source is loaded or when first copied from the third party.
        source_to_common = {
            source_column: common_field
            for common_field, source_column in source.all_fields_map().items()
        }
        renamed = raw.rename(columns=source_to_common)
        data = renamed[source_to_common.values()]

        nyc_group_fields = [
            CommonFields.DATE,
            CommonFields.COUNTRY,
            CommonFields.AGGREGATE_LEVEL,
            CommonFields.STATE,
        ]
        data = custom_aggregations.update_with_combined_new_york_counties(
            data,
            nyc_group_fields,
            are_boroughs_zero=source.HAS_AGGREGATED_NYC_BOROUGH,
        )

        if fill_missing_state:
            state_rows = dataset_utils.aggregate_and_get_nonmatching(
                data,
                [CommonFields.DATE, CommonFields.COUNTRY, CommonFields.STATE],
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            data = pd.concat([data, state_rows])

        data = dataset_utils.add_county_using_fips(
            data, dataset_utils.build_fips_data_frame())

        # State-level rows get their FIPS from the state abbreviation map.
        state_mask = (data[CommonFields.AGGREGATE_LEVEL]
                      == AggregationLevel.STATE.value)
        data.loc[state_mask, CommonFields.FIPS] = data.loc[
            state_mask, CommonFields.STATE].map(us_state_abbrev.ABBREV_US_FIPS)

        # Choosing to sort by date
        return cls(data.sort_values(CommonFields.DATE))
コード例 #6
0
ファイル: beds.py プロジェクト: satharIBM/covid-data-model
    def from_source(cls, source: "DataSource", fill_missing_state=True):
        """Build a beds dataset from ``source``.

        Source columns are renamed to the common field names and only those
        columns are kept. Each row is stamped with the source name, flagged
        as not generated, and given a max bed count (row-wise maximum of the
        staffed and licensed bed columns). The New York county update helper
        is applied, county -> state aggregate rows are optionally appended
        and flagged as generated, and county data is added from the FIPS
        lookup frame.

        Args:
            source: Data source.
            fill_missing_state: If True, fills in missing state level data by
                aggregating county level for a given state.

        Raises:
            ValueError: If ``source.BEDS_FIELD_MAP`` is falsy.
        """
        if not source.BEDS_FIELD_MAP:
            raise ValueError("Source must have beds field map.")

        raw = source.data

        # BEDS_FIELD_MAP goes common field -> source column; rename needs the
        # opposite direction.
        source_to_common = {
            source_column: common_field
            for common_field, source_column in source.BEDS_FIELD_MAP.items()
        }
        renamed = raw.rename(columns=source_to_common)
        data = renamed[source_to_common.values()]
        data[cls.Fields.SOURCE] = source.SOURCE_NAME
        data[cls.Fields.GENERATED] = False

        # Max bed count is the larger of staffed and licensed beds per row.
        bed_columns = [cls.Fields.STAFFED_BEDS, cls.Fields.LICENSED_BEDS]
        data[cls.Fields.MAX_BED_COUNT] = data[bed_columns].max(axis=1)

        # GENERATED is part of the grouping key for the NYC step so that it
        # is not treated as a value column and summed.
        nyc_group_fields = cls.STATE_GROUP_KEY + [cls.Fields.GENERATED]
        data = custom_aggregations.update_with_combined_new_york_counties(
            data, nyc_group_fields, are_boroughs_zero=False)

        if fill_missing_state:
            generated_rows = dataset_utils.aggregate_and_get_nonmatching(
                data,
                cls.STATE_GROUP_KEY,
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            generated_rows[cls.Fields.GENERATED] = True
            data = pd.concat([data, generated_rows])

        return cls(
            dataset_utils.add_county_using_fips(
                data, dataset_utils.build_fips_data_frame()))
コード例 #7
0
    def from_source(cls, source: "DataSource", fill_missing_state=True):
        """Build a timeseries dataset from ``source``.

        Source columns are renamed to the common field names (only mapped
        columns are kept), rows are stamped with the source name and flagged
        as not generated, the New York county update helper is applied,
        county -> state aggregate rows are optionally appended and flagged
        as generated, county data is added from the FIPS lookup frame, and
        the result is sorted by date.

        Args:
            source: Data source exposing ``TIMESERIES_FIELD_MAP``, ``data``
                and ``SOURCE_NAME``.
            fill_missing_state: When true, append the rows returned by
                ``dataset_utils.aggregate_and_get_nonmatching``.

        Raises:
            ValueError: If ``source.TIMESERIES_FIELD_MAP`` is falsy.
        """
        if not source.TIMESERIES_FIELD_MAP:
            raise ValueError("Source must have field timeseries field map.")

        raw = source.data
        # TIMESERIES_FIELD_MAP goes common field -> source column; rename
        # needs the opposite direction.
        source_to_common = {
            source_column: common_field
            for common_field, source_column
            in source.TIMESERIES_FIELD_MAP.items()
        }
        renamed = raw.rename(columns=source_to_common)
        data = renamed[source_to_common.values()]
        data[cls.Fields.SOURCE] = source.SOURCE_NAME
        data[cls.Fields.GENERATED] = False

        nyc_group_fields = [
            cls.Fields.DATE,
            cls.Fields.SOURCE,
            cls.Fields.COUNTRY,
            cls.Fields.AGGREGATE_LEVEL,
            cls.Fields.STATE,
            cls.Fields.GENERATED,
        ]
        data = custom_aggregations.update_with_combined_new_york_counties(
            data, nyc_group_fields, are_boroughs_zero=True
        )

        if fill_missing_state:
            generated_rows = dataset_utils.aggregate_and_get_nonmatching(
                data,
                [
                    cls.Fields.DATE,
                    cls.Fields.SOURCE,
                    cls.Fields.COUNTRY,
                    cls.Fields.STATE,
                ],
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            generated_rows[cls.Fields.GENERATED] = True
            data = pd.concat([data, generated_rows])

        data = dataset_utils.add_county_using_fips(
            data, dataset_utils.build_fips_data_frame()
        )

        # Choosing to sort by date
        return cls(data.sort_values(cls.Fields.DATE))
コード例 #8
0
    def from_source(cls, source: "DataSource", fill_missing_state=True):
        """Build a dataset from ``source`` using all of its field maps.

        Source columns are renamed to the common field names and only those
        columns are kept. New York data is passed through
        ``cls._aggregate_new_york_data``, county -> state aggregate rows are
        optionally appended, county data is added from the FIPS lookup
        frame, and state-level rows get a FIPS from the state abbreviation.

        Args:
            source: Data source.
            fill_missing_state: If True, fills in missing state level data by
                aggregating county level for a given state.

        Raises:
            ValueError: If both ``source.COMMON_FIELD_MAP`` and
                ``source.INDEX_FIELD_MAP`` are falsy.
        """
        has_field_map = source.COMMON_FIELD_MAP or source.INDEX_FIELD_MAP
        if not has_field_map:
            raise ValueError("Source must have metadata field map.")

        raw = source.data
        # all_fields_map() goes common field -> source column; rename needs
        # the opposite direction.
        source_to_common = {
            source_column: common_field
            for common_field, source_column in source.all_fields_map().items()
        }
        renamed = raw.rename(columns=source_to_common)
        data = cls._aggregate_new_york_data(renamed[source_to_common.values()])

        if fill_missing_state:
            state_rows = dataset_utils.aggregate_and_get_nonmatching(
                data,
                cls.STATE_GROUP_KEY,
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            data = pd.concat([data, state_rows])

        data = dataset_utils.add_county_using_fips(
            data, dataset_utils.build_fips_data_frame())

        # Add state fips
        state_mask = (data[cls.Fields.AGGREGATE_LEVEL]
                      == AggregationLevel.STATE.value)
        data.loc[state_mask, cls.Fields.FIPS] = data.loc[
            state_mask, cls.Fields.STATE].map(us_state_abbrev.ABBREV_US_FIPS)

        return cls(data)
コード例 #9
0
    def from_source(cls,
                    source: "DataSource",
                    fill_missing_state: bool = True,
                    fill_na: bool = True) -> "Timeseries":
        """Build a timeseries dataset from ``source``.

        Source columns are renamed to the common field names (only mapped
        columns are kept), rows are stamped with the source name and flagged
        as not generated, the New York county update helper is applied,
        county -> state aggregate rows are optionally appended and flagged
        as generated, NaN metric values are optionally replaced with 0.0,
        county data is added from the FIPS lookup frame, and the result is
        sorted by date.

        Args:
            source: DataSource to standardize for timeseries dataset
            fill_missing_state: If True, backfills missing state data by
                calculating county level aggregates.
            fill_na: If True, fills in all NaN values for metrics columns.

        Returns: Timeseries object.

        Raises:
            ValueError: If ``source.TIMESERIES_FIELD_MAP`` is falsy.
        """
        if not source.TIMESERIES_FIELD_MAP:
            raise ValueError("Source must have field timeseries field map.")

        raw = source.data
        # TIMESERIES_FIELD_MAP goes common field -> source column; rename
        # needs the opposite direction.
        source_to_common = {
            source_column: common_field
            for common_field, source_column
            in source.TIMESERIES_FIELD_MAP.items()
        }
        renamed = raw.rename(columns=source_to_common)
        data = renamed[source_to_common.values()]
        data[cls.Fields.SOURCE] = source.SOURCE_NAME
        data[cls.Fields.GENERATED] = False

        nyc_group_fields = [
            cls.Fields.DATE,
            cls.Fields.SOURCE,
            cls.Fields.COUNTRY,
            cls.Fields.AGGREGATE_LEVEL,
            cls.Fields.STATE,
            cls.Fields.GENERATED,
        ]
        data = custom_aggregations.update_with_combined_new_york_counties(
            data, nyc_group_fields, are_boroughs_zero=True)

        if fill_missing_state:
            generated_rows = dataset_utils.aggregate_and_get_nonmatching(
                data,
                [
                    cls.Fields.DATE,
                    cls.Fields.SOURCE,
                    cls.Fields.COUNTRY,
                    cls.Fields.STATE,
                ],
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            generated_rows[cls.Fields.GENERATED] = True
            data = pd.concat([data, generated_rows])

        if fill_na:
            # Only metric columns that are actually present are filled. If
            # every timeseries dataset should carry all metric columns, the
            # missing ones would need to be initialized earlier than this.
            present_metrics = [
                metric for metric in cls.Fields.metrics()
                if metric in data.columns
            ]
            data[present_metrics] = data[present_metrics].fillna(0.0)

        data = dataset_utils.add_county_using_fips(
            data, dataset_utils.build_fips_data_frame())

        # Choosing to sort by date
        return cls(data.sort_values(cls.Fields.DATE))