def _standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
        """Standardize a raw case dataset into the common-fields format.

        Strips whitespace, remaps inconsistent country names, splits
        combined "County, ST" values out of the state column, pads FIPS
        codes to 5 digits, labels rows as state- or county-level, drops
        FIPS from "Recovered" pseudo-state rows, and renames columns to
        the common fields.

        Args:
            data: Raw input DataFrame with columns named per ``cls.Fields``.

        Returns:
            DataFrame renamed to the common fields via
            ``cls._rename_to_common_fields``.
        """
        data = dataset_utils.strip_whitespace(data)
        # NOTE(review): removed a leftover debug filter that dropped all
        # non-US rows ("to speed up debugging") — the standardization
        # should apply to the full dataset, matching standardize_data.
        # TODO Figure out how to rename to some ISO standard.
        country_remap = {
            "Mainland China": "China",
            "Bahamas, The": "Bahamas",
            "Congo (Brazzaville)": "Congo",
            "Congo (Kinshasa)": "Congo",
            "Diamond Princess": "Cruise Ship",
            "Hong Kong SAR": "Hong Kong",
            "Iran (Islamic Republic of)": "Iran",
            "Korea, South": "South Korea",
            "Taiwan*": "Taiwan",
            "UK": "United Kingdom",
            "US": "USA",
        }
        data = data.replace({cls.Fields.COUNTRY: country_remap})
        states = data[cls.Fields.STATE].apply(dataset_utils.parse_state)

        # Some rows encode the county inside the state field; prefer an
        # explicit county value when one is already present.
        county_from_state = data[cls.Fields.STATE].apply(
            dataset_utils.parse_county_from_state)
        data[cls.Fields.COUNTY] = data[cls.Fields.COUNTY].combine_first(
            county_from_state)
        data[cls.Fields.STATE] = states
        data = cls._fill_incomplete_county_data(data)

        # Rows with neither a FIPS code nor a county are state-level
        # aggregates.
        state_only = data[cls.Fields.FIPS].isnull() & data[
            cls.Fields.COUNTY].isnull()
        # Pad fips values to 5 spots (leading zeros are commonly lost on
        # ingestion); leave non-string values (e.g. NaN) untouched.
        data[cls.Fields.FIPS] = data[cls.Fields.FIPS].apply(
            lambda x: x.zfill(5) if isinstance(x, str) else x)

        data[cls.Fields.AGGREGATE_LEVEL] = numpy.where(state_only, "state",
                                                       "county")
        data = cls._aggregate_fips_data(data)

        dataset_utils.assert_counties_have_fips(data,
                                                county_key=cls.Fields.COUNTY,
                                                fips_key=cls.Fields.FIPS)

        # Not including cases grouped in recovered state.
        # on 8/11 recovered cases were assigned to a specific fips which caused duplicates.
        is_recovered_state = data[cls.Fields.STATE] == "Recovered"
        data.loc[is_recovered_state, cls.Fields.FIPS] = None

        common_fields_data = cls._rename_to_common_fields(data)
        return common_fields_data
# Beispiel #2 (second example from the original listing)
    def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
        """Standardize a raw case dataset in place-compatible form.

        Strips whitespace, remaps inconsistent country names, splits
        combined "County, ST" values out of the state column, pads FIPS
        codes to 5 digits, and labels rows as state- or county-level.

        Args:
            data: Raw input DataFrame with columns named per ``cls.Fields``.

        Returns:
            The standardized DataFrame (original column names retained).
        """
        data = dataset_utils.strip_whitespace(data)
        # TODO Figure out how to rename to some ISO standard.
        country_remap = {
            "Mainland China": "China",
            "Bahamas, The": "Bahamas",
            "Congo (Brazzaville)": "Congo",
            "Congo (Kinshasa)": "Congo",
            "Diamond Princess": "Cruise Ship",
            "Hong Kong SAR": "Hong Kong",
            "Iran (Islamic Republic of)": "Iran",
            "Korea, South": "South Korea",
            "Taiwan*": "Taiwan",
            "UK": "United Kingdom",
            "US": "USA",
        }
        data = data.replace({cls.Fields.COUNTRY: country_remap})
        states = data[cls.Fields.STATE].apply(dataset_utils.parse_state)

        # Some rows encode the county inside the state field; prefer an
        # explicit county value when one is already present.
        county_from_state = data[cls.Fields.STATE].apply(
            dataset_utils.parse_county_from_state)
        data[cls.Fields.COUNTY] = data[cls.Fields.COUNTY].combine_first(
            county_from_state)
        data[cls.Fields.STATE] = states
        data = cls._fill_incomplete_county_data(data)

        # Rows with neither a FIPS code nor a county are state-level
        # aggregates.
        state_only = data[cls.Fields.FIPS].isnull() & data[
            cls.Fields.COUNTY].isnull()
        # Pad fips values to 5 spots (leading zeros are commonly lost on
        # ingestion); leave non-string values (e.g. NaN) untouched.
        data[cls.Fields.FIPS] = data[cls.Fields.FIPS].apply(
            lambda x: x.zfill(5) if isinstance(x, str) else x)

        data[cls.Fields.AGGREGATE_LEVEL] = numpy.where(state_only, "state",
                                                       "county")
        data = cls._aggregate_fips_data(data)

        dataset_utils.assert_counties_have_fips(data,
                                                county_key=cls.Fields.COUNTY,
                                                fips_key=cls.Fields.FIPS)

        return data