Example no. 1
    def from_source(cls, source: "DataSource", fill_missing_state=True):
        """Loads data from a specific datasource.

        Remaps columns from source dataset, fills in missing data
        by computing aggregates, and adds standardized county names from fips.
        """
        if not source.BEDS_FIELD_MAP:
            raise ValueError("Source must have beds field map.")

        data = source.data
        to_common_fields = {
            value: key
            for key, value in source.BEDS_FIELD_MAP.items()
        }
        final_columns = to_common_fields.values()
        data = data.rename(columns=to_common_fields)[final_columns]
        data[cls.Fields.SOURCE] = source.SOURCE_NAME
        data[cls.Fields.GENERATED] = False

        if fill_missing_state:
            state_groupby_fields = [cls.Fields.SOURCE, cls.Fields.STATE]
            non_matching = dataset_utils.aggregate_and_get_nonmatching(
                data,
                state_groupby_fields,
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()

            non_matching[cls.Fields.GENERATED] = True
            data = pd.concat([data, non_matching])

        fips_data = dataset_utils.build_fips_data_frame()
        data = dataset_utils.add_county_using_fips(data, fips_data)

        return cls(data)
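Example no. 2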
    def from_source(cls,
                    source: "DataSource",
                    fill_missing_state: bool = True) -> "TimeseriesDataset":
        """Loads data from a specific datasource.

        Args:
            source: DataSource to standardize for timeseries dataset
            fill_missing_state: If True, backfills missing state data by
                calculating county level aggregates.

        Returns: Timeseries object.
        """
        data = source.data
        group = [
            CommonFields.DATE,
            CommonFields.COUNTRY,
            CommonFields.AGGREGATE_LEVEL,
            CommonFields.STATE,
        ]
        data = custom_aggregations.update_with_combined_new_york_counties(
            data, group, are_boroughs_zero=source.HAS_AGGREGATED_NYC_BOROUGH)

        if fill_missing_state:
            state_groupby_fields = [
                CommonFields.DATE,
                CommonFields.COUNTRY,
                CommonFields.STATE,
            ]
            non_matching = dataset_utils.aggregate_and_get_nonmatching(
                data,
                state_groupby_fields,
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            data = pd.concat([data, non_matching])

        fips_data = dataset_utils.build_fips_data_frame()
        data = dataset_utils.add_county_using_fips(data, fips_data)
        is_state = data[
            CommonFields.AGGREGATE_LEVEL] == AggregationLevel.STATE.value
        state_fips = data.loc[is_state, CommonFields.STATE].map(
            us_state_abbrev.ABBREV_US_FIPS)
        data.loc[is_state, CommonFields.FIPS] = state_fips

        no_fips = data[CommonFields.FIPS].isnull()
        if no_fips.any():
            _log.warning("Dropping rows without FIPS",
                         source=str(source),
                         rows=repr(data.loc[no_fips]))
            data = data.loc[~no_fips]

        dups = data.duplicated(COMMON_FIELDS_TIMESERIES_KEYS, keep=False)
        if dups.any():
            raise DuplicateDataException(f"Duplicates in {source}",
                                         data.loc[dups])

        # Choosing to sort by date
        data = data.sort_values(CommonFields.DATE)
        return cls(data, provenance=source.provenance)
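
Both from_source variants lean on dataset_utils.aggregate_and_get_nonmatching for the fill_missing_state step. Below is a minimal sketch of the behavior that usage implies (sum county rows up to state granularity per group key, then keep only groups with no existing state-level row); the column values and the helper body are assumptions for illustration, not the library's actual implementation:

import pandas as pd

def aggregate_county_to_missing_states(data: pd.DataFrame, group: list) -> pd.DataFrame:
    # Roll county rows up to state totals for each group key.
    counties = data[data["aggregate_level"] == "county"]
    rollup = counties.groupby(group).sum(numeric_only=True).reset_index()
    rollup["aggregate_level"] = "state"

    # Keep only groups that lack a pre-existing state-level row, mirroring
    # the "nonmatching" part of the real helper's name.
    existing = data.loc[data["aggregate_level"] == "state", group].drop_duplicates()
    merged = rollup.merge(existing, on=group, how="left", indicator=True)
    return merged[merged["_merge"] == "left_only"].drop(columns="_merge")

The real helper also takes source and target AggregationLevel enums; this sketch hard-codes county and state for brevity.
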
Example no. 3
    def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
        # All DH data is aggregated at the county level
        data[cls.Fields.AGGREGATE_LEVEL] = "county"

        # Backfilling FIPS data based on county names.
        # TODO: Fix all missing cases
        fips_data = dataset_utils.build_fips_data_frame()
        data = match_county_to_fips(data, fips_data)
        return data
Example no. 4
    def from_source(cls,
                    source: "DataSource",
                    fill_missing_state: bool = True) -> "TimeseriesDataset":
        """Loads data from a specific datasource.

        Args:
            source: DataSource to standardize for timeseries dataset
            fill_missing_state: If True, backfills missing state data by
                calculating county level aggregates.

        Returns: Timeseries object.
        """
        data = source.data
        # TODO(tom): Do this renaming upstream, when the source is loaded or when first copied from the third party.
        to_common_fields = {
            value: key
            for key, value in source.all_fields_map().items()
        }
        final_columns = to_common_fields.values()
        data = data.rename(columns=to_common_fields)[final_columns]
        group = [
            CommonFields.DATE,
            CommonFields.COUNTRY,
            CommonFields.AGGREGATE_LEVEL,
            CommonFields.STATE,
        ]
        data = custom_aggregations.update_with_combined_new_york_counties(
            data, group, are_boroughs_zero=source.HAS_AGGREGATED_NYC_BOROUGH)

        if fill_missing_state:
            state_groupby_fields = [
                CommonFields.DATE,
                CommonFields.COUNTRY,
                CommonFields.STATE,
            ]
            non_matching = dataset_utils.aggregate_and_get_nonmatching(
                data,
                state_groupby_fields,
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            data = pd.concat([data, non_matching])

        fips_data = dataset_utils.build_fips_data_frame()
        data = dataset_utils.add_county_using_fips(data, fips_data)
        is_state = data[
            CommonFields.AGGREGATE_LEVEL] == AggregationLevel.STATE.value
        state_fips = data.loc[is_state, CommonFields.STATE].map(
            us_state_abbrev.ABBREV_US_FIPS)
        data.loc[is_state, CommonFields.FIPS] = state_fips

        # Choosing to sort by date
        data = data.sort_values(CommonFields.DATE)
        return cls(data)
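
Each loader repeats the same column-remapping idiom: invert the field map from common-field -> source-column into source-column -> common-field, then rename and simultaneously restrict the frame to the mapped columns, dropping everything else. A standalone sketch with a made-up field map:

import pandas as pd

# Hypothetical field map: common field name -> source column name.
FIELD_MAP = {"date": "Date", "state": "Province_State", "cases": "Confirmed"}

raw = pd.DataFrame({
    "Date": ["2020-04-01"],
    "Province_State": ["MA"],
    "Confirmed": [7],
    "Extra": ["dropped"],
})

to_common_fields = {value: key for key, value in FIELD_MAP.items()}
final_columns = to_common_fields.values()
data = raw.rename(columns=to_common_fields)[final_columns]
print(list(data.columns))  # ['date', 'state', 'cases'] -- 'Extra' is gone
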
Example no. 5
    def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
        data = dataset_utils.strip_whitespace(data)

        data = cls.remove_duplicate_city_data(data)

        # CDS state level aggregates are identifiable by not having a city or county.
        only_county = data[cls.Fields.COUNTY].notnull() & data[cls.Fields.STATE].notnull()
        county_hits = numpy.where(only_county, "county", None)
        only_state = (
            data[cls.Fields.COUNTY].isnull()
            & data[cls.Fields.CITY].isnull()
            & data[cls.Fields.STATE].notnull()
        )
        only_country = (
            data[cls.Fields.COUNTY].isnull()
            & data[cls.Fields.CITY].isnull()
            & data[cls.Fields.STATE].isnull()
            & data[cls.Fields.COUNTRY].notnull()
        )

        state_hits = numpy.where(only_state, "state", None)
        county_hits[state_hits != None] = state_hits[state_hits != None]
        county_hits[only_country] = "country"
        data[cls.Fields.AGGREGATE_LEVEL] = county_hits

        # Backfilling FIPS data based on county names.
        # The following abbrev mapping only makes sense for the US
        # TODO: Fix all missing cases
        data = data[data["country"] == "United States"]
        data[CommonFields.COUNTRY] = "USA"
        data[CommonFields.STATE] = data[cls.Fields.STATE].apply(
            lambda x: US_STATE_ABBREV.get(x, x)
        )

        fips_data = dataset_utils.build_fips_data_frame()
        data = dataset_utils.add_fips_using_county(data, fips_data)
        no_fips = data[CommonFields.FIPS].isna()
        if no_fips.sum() > 0:
            logging.error(f"Removing {len(data.loc[no_fips])} rows without fips id")
            # logging.error(f"Removing rows without fips id: {str(data.loc[no_fips])}")
            data = data.loc[~no_fips]

        data.set_index(["date", "fips"], inplace=True)
        if data.index.has_duplicates:
            # Use keep=False when logging so the output contains all duplicated rows, not just the
            # first or last instance of each duplicate.
            logging.error(f"Removing duplicates: {str(data.index.duplicated(keep=False))}")
            data = data.loc[~data.index.duplicated(keep=False)]
        data.reset_index(inplace=True)

        # ADD Negative tests
        data[cls.Fields.NEGATIVE_TESTS] = data[cls.Fields.TESTED] - data[cls.Fields.CASES]

        return data
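
Two different dedup strategies appear in these examples: the timeseries loader raises on duplicates, while the snippet above drops every member of a duplicated (date, fips) group. Both depend on keep=False, which flags all occurrences rather than sparing the first or last; a quick illustration:

import pandas as pd

df = pd.DataFrame({
    "fips": ["25001", "25001", "25003"],
    "date": ["2020-04-01", "2020-04-01", "2020-04-01"],
    "cases": [10, 12, 5],
})

dups = df.duplicated(["fips", "date"], keep=False)
print(dups.tolist())   # [True, True, False] -- both conflicting rows are flagged
print(df.loc[~dups])   # only the unambiguous 25003 row survives
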
Example no. 6
    def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
        data = dataset_utils.strip_whitespace(data)

        # Don't want to return city data because it's duplicated in county
        # City data before 3-23 was not duplicated.
        # data = data[data[cls.Fields.CITY].isnull()]
        pre_march_23 = data[data.date < "2020-03-23"].copy()
        pre_march_23["county"] = pre_march_23.apply(fill_missing_county_with_city,
                                                    axis=1)
        split_data = [
            pre_march_23,
            data[(data.date >= "2020-03-23") & data[cls.Fields.CITY].isnull()],
        ]
        data = pd.concat(split_data)

        # CDS state level aggregates are identifiable by not having a city or county.
        only_county = (data[cls.Fields.COUNTY].notnull()
                       & data[cls.Fields.STATE].notnull())
        county_hits = numpy.where(only_county, "county", None)
        only_state = (data[cls.Fields.COUNTY].isnull()
                      & data[cls.Fields.CITY].isnull()
                      & data[cls.Fields.STATE].notnull())
        only_country = (data[cls.Fields.COUNTY].isnull()
                        & data[cls.Fields.CITY].isnull()
                        & data[cls.Fields.STATE].isnull()
                        & data[cls.Fields.COUNTRY].notnull())

        state_hits = numpy.where(only_state, "state", None)
        county_hits[state_hits != None] = state_hits[state_hits != None]
        county_hits[only_country] = "country"
        data[cls.Fields.AGGREGATE_LEVEL] = county_hits

        # Backfilling FIPS data based on county names.
        # The following abbrev mapping only makes sense for the US
        # TODO: Fix all missing cases
        data = data[data["country"] == "United States"]
        data["state_abbr"] = data[cls.Fields.STATE].apply(
            lambda x: US_STATE_ABBREV[x] if x in US_STATE_ABBREV else x)
        data["state_tmp"] = data["state"]
        data["state"] = data["state_abbr"]

        fips_data = dataset_utils.build_fips_data_frame()
        data = dataset_utils.add_fips_using_county(data, fips_data)

        # ADD Negative tests
        data[cls.Fields.NEGATIVE_TESTS] = (data[cls.Fields.TESTED] -
                                           data[cls.Fields.CASES])

        # put the state column back
        data["state"] = data["state_tmp"]

        return data
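
The county/state/country classification above builds two numpy.where arrays and patches one into the other with an elementwise != None comparison. numpy.select expresses the same priority-ordered choice in a single call; a behavior-preserving sketch, using plain column names in place of cls.Fields:

import numpy
import pandas as pd

def classify_aggregate_level(data: pd.DataFrame) -> pd.Series:
    # Conditions are evaluated in order; the first match wins per row.
    conditions = [
        data["county"].notnull() & data["state"].notnull(),
        data["county"].isnull() & data["city"].isnull()
        & data["state"].notnull(),
        data["county"].isnull() & data["city"].isnull()
        & data["state"].isnull() & data["country"].notnull(),
    ]
    choices = ["county", "state", "country"]
    return pd.Series(numpy.select(conditions, choices, default=None),
                     index=data.index)

Rows matching none of the conditions (for example city-level records) stay None, matching the original default.
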
Example no. 7
    def from_source(cls, source: "DataSource", fill_missing_state=True):
        """Loads data from a specific datasource.

        Remaps columns from source dataset, fills in missing data
        by computing aggregates, and adds standardized county names from fips.

        Args:
            source: Data source.
            fill_missing_state: If True, fills in missing state level data by
                aggregating county level for a given state.
        """
        if not source.BEDS_FIELD_MAP:
            raise ValueError("Source must have beds field map.")

        data = source.data

        to_common_fields = {
            value: key
            for key, value in source.BEDS_FIELD_MAP.items()
        }
        final_columns = to_common_fields.values()
        data = data.rename(columns=to_common_fields)[final_columns]
        data[cls.Fields.SOURCE] = source.SOURCE_NAME
        data[cls.Fields.GENERATED] = False

        # Generating max bed count.
        columns_to_consider = [
            cls.Fields.STAFFED_BEDS, cls.Fields.LICENSED_BEDS
        ]
        data[cls.Fields.MAX_BED_COUNT] = data[columns_to_consider].max(axis=1)

        # When grouping nyc data, we don't want to count the generated field
        # as a value to sum.
        group = cls.STATE_GROUP_KEY + [cls.Fields.GENERATED]
        data = custom_aggregations.update_with_combined_new_york_counties(
            data, group, are_boroughs_zero=False)
        if fill_missing_state:
            non_matching = dataset_utils.aggregate_and_get_nonmatching(
                data,
                cls.STATE_GROUP_KEY,
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()

            non_matching[cls.Fields.GENERATED] = True
            data = pd.concat([data, non_matching])

        fips_data = dataset_utils.build_fips_data_frame()
        data = dataset_utils.add_county_using_fips(data, fips_data)

        return cls(data)
Example no. 8
    def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
        # All DH data is aggregated at the county level
        data[cls.Fields.AGGREGATE_LEVEL] = "county"

        data[cls.Fields.COUNTRY] = "USA"

        # Backfilling FIPS data based on county names.
        fips_data = dataset_utils.build_fips_data_frame()
        data = match_county_to_fips(data, fips_data)

        # The virgin islands do not currently have associated fips codes.
        # if VI is supported in the future, this should be removed.
        is_virgin_islands = data[cls.Fields.STATE] == 'VI'
        return data[~is_virgin_islands]
Example no. 9
    def from_source(cls, source: "DataSource", fill_missing_state=True):
        """Loads data from a specific datasource."""
        if not source.TIMESERIES_FIELD_MAP:
            raise ValueError("Source must have field timeseries field map.")

        data = source.data
        to_common_fields = {
            value: key for key, value in source.TIMESERIES_FIELD_MAP.items()
        }
        final_columns = to_common_fields.values()
        data = data.rename(columns=to_common_fields)[final_columns]
        data[cls.Fields.SOURCE] = source.SOURCE_NAME
        data[cls.Fields.GENERATED] = False

        group = [
            cls.Fields.DATE,
            cls.Fields.SOURCE,
            cls.Fields.COUNTRY,
            cls.Fields.AGGREGATE_LEVEL,
            cls.Fields.STATE,
            cls.Fields.GENERATED,
        ]
        data = custom_aggregations.update_with_combined_new_york_counties(
            data, group, are_boroughs_zero=True
        )

        if fill_missing_state:
            state_groupby_fields = [
                cls.Fields.DATE,
                cls.Fields.SOURCE,
                cls.Fields.COUNTRY,
                cls.Fields.STATE,
            ]
            non_matching = dataset_utils.aggregate_and_get_nonmatching(
                data,
                state_groupby_fields,
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            non_matching[cls.Fields.GENERATED] = True
            data = pd.concat([data, non_matching])

        fips_data = dataset_utils.build_fips_data_frame()
        data = dataset_utils.add_county_using_fips(data, fips_data)

        # Choosing to sort by date
        data = data.sort_values(cls.Fields.DATE)
        return cls(data)
Example no. 10
    def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
        # All DH data is aggregated at the county level
        data[cls.Fields.AGGREGATE_LEVEL] = "county"
        data[cls.Fields.COUNTRY] = "USA"

        # Backfilling FIPS data based on county names.
        fips_data = dataset_utils.build_fips_data_frame()
        fips_data = fips_data[fips_data.aggregate_level == AggregationLevel.COUNTY.value]
        fips_data = fips_data[fips_data.fips != enums.UNKNOWN_FIPS]
        data = match_county_to_fips(data, fips_data)

        data[cls.Fields.MAX_BED_COUNT] = data[
            [cls.Fields.STAFFED_BEDS, cls.Fields.LICENSED_BEDS]
        ].max(axis=1)

        # The virgin islands do not currently have associated fips codes.
        # if VI is supported in the future, this should be removed.
        is_virgin_islands = data[cls.Fields.STATE] == "VI"
        return data[~is_virgin_islands]
Example no. 11
    def from_source(cls, source: "DataSource", fill_missing_state=True):
        """Loads data from a specific datasource.

        Remaps columns from source dataset, fills in missing data
        by computing aggregates, and adds standardized county names from fips.

        Args:
            source: Data source.
            fill_missing_state: If True, fills in missing state level data by
                aggregating county level for a given state.
        """
        if not source.COMMON_FIELD_MAP and not source.INDEX_FIELD_MAP:
            raise ValueError("Source must have metadata field map.")

        data = source.data
        fields = source.all_fields_map().items()
        to_common_fields = {value: key for key, value in fields}
        final_columns = to_common_fields.values()
        data = data.rename(columns=to_common_fields)[final_columns]

        data = cls._aggregate_new_york_data(data)
        if fill_missing_state:
            non_matching = dataset_utils.aggregate_and_get_nonmatching(
                data,
                cls.STATE_GROUP_KEY,
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()

            data = pd.concat([data, non_matching])

        fips_data = dataset_utils.build_fips_data_frame()
        data = dataset_utils.add_county_using_fips(data, fips_data)

        # Add state fips
        is_state = data[
            cls.Fields.AGGREGATE_LEVEL] == AggregationLevel.STATE.value
        state_fips = data.loc[is_state, cls.Fields.STATE].map(
            us_state_abbrev.ABBREV_US_FIPS)
        data.loc[is_state, cls.Fields.FIPS] = state_fips

        return cls(data)
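
The trailing block maps two-letter state abbreviations to state FIPS codes with Series.map, writing only into state-level rows through a boolean mask so county FIPS codes are untouched. A self-contained sketch with a truncated, illustrative lookup (the full table is us_state_abbrev.ABBREV_US_FIPS):

import pandas as pd

# Illustrative subset of the abbreviation -> state FIPS lookup.
ABBREV_US_FIPS = {"MA": "25", "NY": "36"}

data = pd.DataFrame({
    "aggregate_level": ["state", "county"],
    "state": ["MA", "MA"],
    "fips": [None, "25001"],
})

is_state = data["aggregate_level"] == "state"
data.loc[is_state, "fips"] = data.loc[is_state, "state"].map(ABBREV_US_FIPS)
print(data)  # the state row picks up fips "25"; the county row keeps "25001"
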
Example no. 12
    def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
        data = dataset_utils.strip_whitespace(data)

        data = cls.remove_duplicate_city_data(data)

        # CDS state level aggregates are identifiable by not having a city or county.
        only_county = data[cls.Fields.COUNTY].notnull() & data[
            cls.Fields.STATE].notnull()
        county_hits = numpy.where(only_county, "county", None)
        only_state = (data[cls.Fields.COUNTY].isnull()
                      & data[cls.Fields.CITY].isnull()
                      & data[cls.Fields.STATE].notnull())
        only_country = (data[cls.Fields.COUNTY].isnull()
                        & data[cls.Fields.CITY].isnull()
                        & data[cls.Fields.STATE].isnull()
                        & data[cls.Fields.COUNTRY].notnull())

        state_hits = numpy.where(only_state, "state", None)
        county_hits[state_hits != None] = state_hits[state_hits != None]
        county_hits[only_country] = "country"
        data[cls.Fields.AGGREGATE_LEVEL] = county_hits

        # Backfilling FIPS data based on county names.
        # The following abbrev mapping only makes sense for the US
        # TODO: Fix all missing cases
        data = data[data["country"] == "United States"]
        data["state_abbr"] = data[cls.Fields.STATE].apply(
            lambda x: US_STATE_ABBREV[x] if x in US_STATE_ABBREV else x)
        data["state_tmp"] = data["state"]
        data["state"] = data["state_abbr"]

        fips_data = dataset_utils.build_fips_data_frame()
        data = dataset_utils.add_fips_using_county(data, fips_data)

        # ADD Negative tests
        data[cls.Fields.NEGATIVE_TESTS] = (data[cls.Fields.TESTED] -
                                           data[cls.Fields.CASES])

        # put the state column back
        data["state"] = data["state_tmp"]

        return data
Example no. 13
    def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
        data = dataset_utils.strip_whitespace(data)

        # Don't want to return city data because it's duplicated in county
        # City data before 3-23 was not duplicated.
        # data = data[data[cls.Fields.CITY].isnull()]
        pre_march_23 = data[data.date < "2020-03-23"].copy()
        pre_march_23["county"] = pre_march_23.apply(fill_missing_county_with_city,
                                                    axis=1)
        split_data = [
            pre_march_23,
            data[(data.date >= "2020-03-23") & data[cls.Fields.CITY].isnull()],
        ]
        data = pd.concat(split_data)

        # CDS state level aggregates are identifiable by not having a city or county.
        only_county = (data[cls.Fields.COUNTY].notnull()
                       & data[cls.Fields.STATE].notnull())
        county_hits = numpy.where(only_county, "county", None)
        only_state = (data[cls.Fields.COUNTY].isnull()
                      & data[cls.Fields.CITY].isnull()
                      & data[cls.Fields.STATE].notnull())
        only_country = (data[cls.Fields.COUNTY].isnull()
                        & data[cls.Fields.CITY].isnull()
                        & data[cls.Fields.STATE].isnull()
                        & data[cls.Fields.COUNTRY].notnull())

        state_hits = numpy.where(only_state, "state", None)
        county_hits[state_hits != None] = state_hits[state_hits != None]
        county_hits[only_country] = "country"
        data[cls.Fields.AGGREGATE_LEVEL] = county_hits

        # Backfilling FIPS data based on county names.
        # TODO: Fix all missing cases
        fips_data = dataset_utils.build_fips_data_frame()
        data = dataset_utils.add_fips_using_county(data, fips_data)

        return data
Example no. 14
    def from_source(cls,
                    source: "DataSource",
                    fill_missing_state: bool = True,
                    fill_na: bool = True) -> "Timeseries":
        """Loads data from a specific datasource.

        Args:
            source: DataSource to standardize for timeseries dataset
            fill_missing_state: If True, backfills missing state data by
                calculating county level aggregates.
            fill_na: If True, fills in all NaN values for metrics columns.

        Returns: Timeseries object.
        """
        if not source.TIMESERIES_FIELD_MAP:
            raise ValueError("Source must have field timeseries field map.")

        data = source.data
        to_common_fields = {
            value: key
            for key, value in source.TIMESERIES_FIELD_MAP.items()
        }
        final_columns = to_common_fields.values()
        data = data.rename(columns=to_common_fields)[final_columns]
        data[cls.Fields.SOURCE] = source.SOURCE_NAME
        data[cls.Fields.GENERATED] = False

        group = [
            cls.Fields.DATE,
            cls.Fields.SOURCE,
            cls.Fields.COUNTRY,
            cls.Fields.AGGREGATE_LEVEL,
            cls.Fields.STATE,
            cls.Fields.GENERATED,
        ]
        data = custom_aggregations.update_with_combined_new_york_counties(
            data, group, are_boroughs_zero=True)

        if fill_missing_state:
            state_groupby_fields = [
                cls.Fields.DATE,
                cls.Fields.SOURCE,
                cls.Fields.COUNTRY,
                cls.Fields.STATE,
            ]
            non_matching = dataset_utils.aggregate_and_get_nonmatching(
                data,
                state_groupby_fields,
                AggregationLevel.COUNTY,
                AggregationLevel.STATE,
            ).reset_index()
            non_matching[cls.Fields.GENERATED] = True
            data = pd.concat([data, non_matching])

        if fill_na:
            # Filtering out metric columns that don't exist in the dataset.
            # It might be that we want all timeseries datasets to have all of the
            # metric columns. If so, initialization of the missing columns should come earlier.
            metric_columns = [
                field for field in cls.Fields.metrics()
                if field in data.columns
            ]
            data[metric_columns] = data[metric_columns].fillna(0.0)

        fips_data = dataset_utils.build_fips_data_frame()
        data = dataset_utils.add_county_using_fips(data, fips_data)

        # Choosing to sort by date
        data = data.sort_values(cls.Fields.DATE)
        return cls(data)
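
The fill_na branch intersects the frame's columns with the class's known metric fields before calling fillna, so sources that lack some metrics don't raise a KeyError. A minimal sketch, with a hypothetical stand-in for cls.Fields.metrics():

import pandas as pd

# Hypothetical stand-in for cls.Fields.metrics().
METRIC_FIELDS = ["cases", "deaths", "current_icu"]

data = pd.DataFrame({"state": ["MA"], "cases": [None], "deaths": [3]})

# Only touch metric columns that actually exist in this dataset.
metric_columns = [field for field in METRIC_FIELDS if field in data.columns]
data[metric_columns] = data[metric_columns].fillna(0.0)
print(data)  # cases becomes 0.0; current_icu is absent and simply skipped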