def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Normalize raw rows into the common county-level layout.

    Tags every row as USA county-level data, strips stray whitespace,
    canonicalizes state names, and patches "New York City" rows onto
    New York County / FIPS 36061.
    """
    data[cls.Fields.COUNTRY] = "USA"
    data = dataset_utils.strip_whitespace(data)
    data[cls.Fields.STATE] = data[cls.Fields.STATE].apply(dataset_utils.parse_state)

    # Super hacky way of filling in new york: fold the "New York City" rows
    # into New York County first, then stamp that county's FIPS code.
    is_nyc = data[cls.Fields.COUNTY] == "New York City"
    data.loc[is_nyc, "county"] = "New York County"
    # Recompute the mask after the rename so newly folded rows are included.
    is_ny_county = data[cls.Fields.COUNTY] == "New York County"
    data.loc[is_ny_county, "fips"] = "36061"

    data[cls.Fields.AGGREGATE_LEVEL] = "county"
    return data
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Standardize raw CDS rows.

    Tags each row's aggregate level (county/state/country), restricts to US
    data, backfills FIPS ids from county names, drops rows without a FIPS id
    or with a duplicated (date, fips) key, and derives negative test counts.

    Returns:
        The cleaned DataFrame.
    """
    data = dataset_utils.strip_whitespace(data)
    data = cls.remove_duplicate_city_data(data)

    # CDS state level aggregates are identifiable by not having a city or county.
    only_county = data[cls.Fields.COUNTY].notnull() & data[cls.Fields.STATE].notnull()
    county_hits = numpy.where(only_county, "county", None)
    only_state = (
        data[cls.Fields.COUNTY].isnull()
        & data[cls.Fields.CITY].isnull()
        & data[cls.Fields.STATE].notnull()
    )
    only_country = (
        data[cls.Fields.COUNTY].isnull()
        & data[cls.Fields.CITY].isnull()
        & data[cls.Fields.STATE].isnull()
        & data[cls.Fields.COUNTRY].notnull()
    )
    state_hits = numpy.where(only_state, "state", None)
    # `state_hits != None` is an elementwise comparison on an object array: it
    # selects exactly the rows classified as state-level aggregates.
    county_hits[state_hits != None] = state_hits[state_hits != None]
    county_hits[only_country] = "country"
    data[cls.Fields.AGGREGATE_LEVEL] = county_hits

    # Backfilling FIPS data based on county names.
    # The following abbrev mapping only makes sense for the US
    # TODO: Fix all missing cases
    data = data[data["country"] == "United States"]
    data[CommonFields.COUNTRY] = "USA"
    # Map full state names to abbreviations; unmapped values pass through as-is.
    data[CommonFields.STATE] = data[cls.Fields.STATE].apply(
        lambda x: US_STATE_ABBREV.get(x, x)
    )
    fips_data = dataset_utils.build_fips_data_frame()
    data = dataset_utils.add_fips_using_county(data, fips_data)

    no_fips = data[CommonFields.FIPS].isna()
    if no_fips.any():
        logging.error(f"Removing {no_fips.sum()} rows without fips id")
        data = data.loc[~no_fips]

    data.set_index(["date", "fips"], inplace=True)
    if data.index.has_duplicates:
        # Use keep=False when logging so the output contains all duplicated rows, not just the
        # first or last instance of each duplicate.
        duplicates = data.index.duplicated(keep=False)
        # Log the duplicated rows themselves; logging the raw boolean mask (as
        # before) gave no clue which (date, fips) pairs actually collided.
        logging.error(f"Removing duplicates: {str(data.loc[duplicates])}")
        # keep=False drops every copy of a duplicated key — with conflicting
        # values there is no principled way to pick one.
        data = data.loc[~duplicates]
    data.reset_index(inplace=True)

    # ADD Negative tests
    data[cls.Fields.NEGATIVE_TESTS] = data[cls.Fields.TESTED] - data[cls.Fields.CASES]
    return data
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Standardize CDS rows.

    Fills missing pre-2020-03-23 county values from city data, drops later
    city-level rows (duplicated in county data), tags aggregate levels,
    backfills FIPS ids from county names, and adds negative test counts.

    Returns:
        The cleaned DataFrame (the state_abbr/state_tmp helper columns remain
        in the result, matching the original behavior).
    """
    data = dataset_utils.strip_whitespace(data)

    # Don't want to return city data because it's duplicated in county
    # City data before 3-23 was not duplicated.
    # Work on an explicit copy: assigning into a boolean-mask slice of `data`
    # raises SettingWithCopyWarning and is not guaranteed to stick.
    pre_march_23 = data[data.date < "2020-03-23"].copy()
    # Use column indexing rather than attribute assignment (`df.county = ...`),
    # which would silently create a plain instance attribute if the column
    # were ever missing.
    pre_march_23["county"] = pre_march_23.apply(fill_missing_county_with_city, axis=1)
    split_data = [
        pre_march_23,
        data[(data.date >= "2020-03-23") & data[cls.Fields.CITY].isnull()],
    ]
    data = pd.concat(split_data)

    # CDS state level aggregates are identifiable by not having a city or county.
    only_county = data[cls.Fields.COUNTY].notnull() & data[cls.Fields.STATE].notnull()
    county_hits = numpy.where(only_county, "county", None)
    only_state = (
        data[cls.Fields.COUNTY].isnull()
        & data[cls.Fields.CITY].isnull()
        & data[cls.Fields.STATE].notnull()
    )
    only_country = (
        data[cls.Fields.COUNTY].isnull()
        & data[cls.Fields.CITY].isnull()
        & data[cls.Fields.STATE].isnull()
        & data[cls.Fields.COUNTRY].notnull()
    )
    state_hits = numpy.where(only_state, "state", None)
    # Elementwise comparison on an object array: selects the state-level rows.
    county_hits[state_hits != None] = state_hits[state_hits != None]
    county_hits[only_country] = "country"
    data[cls.Fields.AGGREGATE_LEVEL] = county_hits

    # Backfilling FIPS data based on county names.
    # The following abbrev mapping only makes sense for the US
    # TODO: Fix all missing cases
    data = data[data["country"] == "United States"]
    data["state_abbr"] = data[cls.Fields.STATE].apply(lambda x: US_STATE_ABBREV.get(x, x))
    # Temporarily swap abbreviations into the state column so the FIPS join
    # matches on abbreviated state names.
    data["state_tmp"] = data["state"]
    data["state"] = data["state_abbr"]
    fips_data = dataset_utils.build_fips_data_frame()
    data = dataset_utils.add_fips_using_county(data, fips_data)

    # ADD Negative tests
    data[cls.Fields.NEGATIVE_TESTS] = data[cls.Fields.TESTED] - data[cls.Fields.CASES]

    # put the state column back
    data["state"] = data["state_tmp"]
    return data
def _standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Standardize raw JHU rows restricted to the USA.

    Remaps country names, splits combined "County, State" values, fills
    incomplete county data, zero-pads FIPS codes, tags aggregate levels,
    clears FIPS on "Recovered" pseudo-state rows, and renames columns to the
    common field names.

    Returns:
        The cleaned DataFrame with common field names.
    """
    data = dataset_utils.strip_whitespace(data)
    # Drop all data outside the USA to speed up debugging
    data = data.loc[data[cls.Fields.COUNTRY].isin(["US", "USA"])]
    # TODO Figure out how to rename to some ISO standard.
    country_remap = {
        "Mainland China": "China",
        "Bahamas, The": "Bahamas",
        "Congo (Brazzaville)": "Congo",
        "Congo (Kinshasa)": "Congo",
        "Diamond Princess": "Cruise Ship",
        "Hong Kong SAR": "Hong Kong",
        "Iran (Islamic Republic of)": "Iran",
        "Korea, South": "South Korea",
        "Taiwan*": "Taiwan",
        "UK": "United Kingdom",
        "US": "USA",
    }
    data = data.replace({cls.Fields.COUNTRY: country_remap})

    # Older rows encode the county in the state column ("County, ST"); parse
    # both pieces before overwriting the state column.
    states = data[cls.Fields.STATE].apply(dataset_utils.parse_state)
    county_from_state = data[cls.Fields.STATE].apply(dataset_utils.parse_county_from_state)
    data[cls.Fields.COUNTY] = data[cls.Fields.COUNTY].combine_first(county_from_state)
    data[cls.Fields.STATE] = states
    data = cls._fill_incomplete_county_data(data)

    # Rows with neither FIPS nor county are state-level aggregates.
    state_only = data[cls.Fields.FIPS].isnull() & data[cls.Fields.COUNTY].isnull()
    # Pad fips values to 5 spots
    data[cls.Fields.FIPS] = data[cls.Fields.FIPS].apply(
        lambda x: x.zfill(5) if isinstance(x, str) else x
    )
    data[cls.Fields.AGGREGATE_LEVEL] = numpy.where(state_only, "state", "county")
    data = cls._aggregate_fips_data(data)

    dataset_utils.assert_counties_have_fips(
        data, county_key=cls.Fields.COUNTY, fips_key=cls.Fields.FIPS
    )

    # Not including cases grouped in recovered state.
    # on 8/11 recovered cases were assigned to a specific fips which caused duplicates.
    is_recovered_state = data[cls.Fields.STATE] == "Recovered"
    data.loc[is_recovered_state, cls.Fields.FIPS] = None

    common_fields_data = cls._rename_to_common_fields(data)
    return common_fields_data
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Standardize NYT county rows.

    Tags rows as USA county-level data, canonicalizes state names, patches
    New York City onto New York County, routes fips-less exception rows
    (Unknown / Kansas City / Joplin) through a sentinel FIPS, aggregates
    Missouri's sentinel rows into one record, and finally maps remaining
    sentinel FIPS values to per-state unknown-county FIPS codes.
    """
    data[cls.Fields.COUNTRY] = "USA"
    data[cls.Fields.AGGREGATE_LEVEL] = "county"
    data = dataset_utils.strip_whitespace(data)
    data[cls.Fields.STATE] = data[cls.Fields.STATE].apply(dataset_utils.parse_state)

    # Super hacky way of filling in new york.
    data.loc[data[cls.Fields.COUNTY] == "New York City", "county"] = "New York County"
    data.loc[data[cls.Fields.COUNTY] == "New York County", "fips"] = "36061"

    # UNKNOWN_FIPS is overwritten with values from ABBREV_US_UNKNOWN_COUNTY_FIPS below.
    data.loc[data[cls.Fields.COUNTY] == "Unknown", "fips"] = enums.UNKNOWN_FIPS

    # https://github.com/nytimes/covid-19-data/blob/master/README.md#geographic-exceptions
    # Both Joplin and Kansas City numbers are reported separately from the surrounding counties.
    # Until we figure out a better way to spread amongst counties they are in, combining all
    # data missing a fips into one unknown fips.
    is_kc = data[cls.Fields.COUNTY] == "Kansas City"
    is_joplin = data[cls.Fields.COUNTY] == "Joplin"
    data.loc[is_kc | is_joplin, cls.Fields.FIPS] = enums.UNKNOWN_FIPS

    # Collapse all Missouri rows carrying the sentinel FIPS into one
    # aggregated record per (aggregate_level, fips, date, country, state).
    is_missouri = data[cls.Fields.STATE] == "MO"
    is_unknown = data[cls.Fields.FIPS] == enums.UNKNOWN_FIPS
    missouri_unknown = data.loc[is_missouri & is_unknown, :]
    group_columns = [
        cls.Fields.AGGREGATE_LEVEL,
        cls.Fields.FIPS,
        cls.Fields.DATE,
        cls.Fields.COUNTRY,
        cls.Fields.STATE,
    ]
    # groupby().sum() adds the metric columns of the grouped rows together.
    missouri_unknown = missouri_unknown.groupby(group_columns).sum().reset_index()
    missouri_unknown[cls.Fields.COUNTY] = "Aggregated City and Unknown Data"
    # Replace the original Missouri sentinel rows with the aggregated ones.
    data = pd.concat([data.loc[~(is_missouri & is_unknown), :], missouri_unknown])

    # Change all the 99999 FIPS to per-state unknown
    unknown_fips = data[cls.Fields.FIPS] == enums.UNKNOWN_FIPS
    data.loc[unknown_fips, cls.Fields.FIPS] = data.loc[
        unknown_fips, cls.Fields.STATE
    ].map(ABBREV_US_UNKNOWN_COUNTY_FIPS)
    return data
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Standardize CDS rows.

    Removes duplicated city data, tags aggregate levels, backfills FIPS ids
    from county names, and derives negative test counts.

    Returns:
        The cleaned DataFrame (the state_abbr/state_tmp helper columns remain
        in the result, matching the original behavior).
    """
    data = dataset_utils.strip_whitespace(data)
    data = cls.remove_duplicate_city_data(data)

    # CDS state level aggregates are identifiable by not having a city or county.
    only_county = data[cls.Fields.COUNTY].notnull() & data[cls.Fields.STATE].notnull()
    county_hits = numpy.where(only_county, "county", None)
    only_state = (
        data[cls.Fields.COUNTY].isnull()
        & data[cls.Fields.CITY].isnull()
        & data[cls.Fields.STATE].notnull()
    )
    only_country = (
        data[cls.Fields.COUNTY].isnull()
        & data[cls.Fields.CITY].isnull()
        & data[cls.Fields.STATE].isnull()
        & data[cls.Fields.COUNTRY].notnull()
    )
    state_hits = numpy.where(only_state, "state", None)
    # Elementwise comparison on an object array: selects the state-level rows.
    county_hits[state_hits != None] = state_hits[state_hits != None]
    county_hits[only_country] = "country"
    data[cls.Fields.AGGREGATE_LEVEL] = county_hits

    # Backfilling FIPS data based on county names.
    # The following abbrev mapping only makes sense for the US
    # TODO: Fix all missing cases
    data = data[data["country"] == "United States"]
    # dict.get avoids the membership-test-plus-lookup double access; unmapped
    # names pass through unchanged.
    data["state_abbr"] = data[cls.Fields.STATE].apply(lambda x: US_STATE_ABBREV.get(x, x))
    # Temporarily swap abbreviations into the state column so the FIPS join
    # matches on abbreviated state names.
    data["state_tmp"] = data["state"]
    data["state"] = data["state_abbr"]
    fips_data = dataset_utils.build_fips_data_frame()
    data = dataset_utils.add_fips_using_county(data, fips_data)

    # ADD Negative tests
    data[cls.Fields.NEGATIVE_TESTS] = data[cls.Fields.TESTED] - data[cls.Fields.CASES]

    # put the state column back
    data["state"] = data["state_tmp"]
    return data
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Standardize raw JHU rows.

    Remaps country names, splits combined "County, State" values, fills
    incomplete county data, zero-pads FIPS codes, tags aggregate levels, and
    asserts every county row has a FIPS id.

    Returns:
        The cleaned DataFrame.
    """
    data = dataset_utils.strip_whitespace(data)
    # TODO Figure out how to rename to some ISO standard.
    country_remap = {
        "Mainland China": "China",
        "Bahamas, The": "Bahamas",
        "Congo (Brazzaville)": "Congo",
        "Congo (Kinshasa)": "Congo",
        "Diamond Princess": "Cruise Ship",
        "Hong Kong SAR": "Hong Kong",
        "Iran (Islamic Republic of)": "Iran",
        "Korea, South": "South Korea",
        "Taiwan*": "Taiwan",
        "UK": "United Kingdom",
        "US": "USA",
    }
    data = data.replace({cls.Fields.COUNTRY: country_remap})

    # Older rows encode the county in the state column ("County, ST"); parse
    # both pieces before overwriting the state column.
    states = data[cls.Fields.STATE].apply(dataset_utils.parse_state)
    county_from_state = data[cls.Fields.STATE].apply(dataset_utils.parse_county_from_state)
    data[cls.Fields.COUNTY] = data[cls.Fields.COUNTY].combine_first(county_from_state)
    data[cls.Fields.STATE] = states
    data = cls._fill_incomplete_county_data(data)

    # Rows with neither FIPS nor county are state-level aggregates.
    state_only = data[cls.Fields.FIPS].isnull() & data[cls.Fields.COUNTY].isnull()
    # Pad fips values to 5 spots
    data[cls.Fields.FIPS] = data[cls.Fields.FIPS].apply(
        lambda x: x.zfill(5) if isinstance(x, str) else x
    )
    data[cls.Fields.AGGREGATE_LEVEL] = numpy.where(state_only, "state", "county")
    data = cls._aggregate_fips_data(data)

    dataset_utils.assert_counties_have_fips(
        data, county_key=cls.Fields.COUNTY, fips_key=cls.Fields.FIPS
    )
    return data
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Standardize CDS rows (early variant).

    Fills missing pre-2020-03-23 county values from city data, drops later
    city-level rows (duplicated in county data), tags aggregate levels, and
    backfills FIPS ids from county names.

    Returns:
        The cleaned DataFrame.
    """
    data = dataset_utils.strip_whitespace(data)

    # Don't want to return city data because it's duplicated in county
    # City data before 3-23 was not duplicated.
    # Work on an explicit copy: assigning into a boolean-mask slice of `data`
    # raises SettingWithCopyWarning and is not guaranteed to stick.
    pre_march_23 = data[data.date < "2020-03-23"].copy()
    # Use column indexing rather than attribute assignment (`df.county = ...`),
    # which would silently create a plain instance attribute if the column
    # were ever missing.
    pre_march_23["county"] = pre_march_23.apply(fill_missing_county_with_city, axis=1)
    split_data = [
        pre_march_23,
        data[(data.date >= "2020-03-23") & data[cls.Fields.CITY].isnull()],
    ]
    data = pd.concat(split_data)

    # CDS state level aggregates are identifiable by not having a city or county.
    only_county = data[cls.Fields.COUNTY].notnull() & data[cls.Fields.STATE].notnull()
    county_hits = numpy.where(only_county, "county", None)
    only_state = (
        data[cls.Fields.COUNTY].isnull()
        & data[cls.Fields.CITY].isnull()
        & data[cls.Fields.STATE].notnull()
    )
    only_country = (
        data[cls.Fields.COUNTY].isnull()
        & data[cls.Fields.CITY].isnull()
        & data[cls.Fields.STATE].isnull()
        & data[cls.Fields.COUNTRY].notnull()
    )
    state_hits = numpy.where(only_state, "state", None)
    # Elementwise comparison on an object array: selects the state-level rows.
    county_hits[state_hits != None] = state_hits[state_hits != None]
    county_hits[only_country] = "country"
    data[cls.Fields.AGGREGATE_LEVEL] = county_hits

    # Backfilling FIPS data based on county names.
    # TODO: Fix all missing cases
    fips_data = dataset_utils.build_fips_data_frame()
    data = dataset_utils.add_fips_using_county(data, fips_data)
    return data