def from_source(cls, source: "DataSource", fill_missing_state=True):
    """Loads data from a specific datasource.

    Remaps columns from the source dataset, fills in missing state data by
    computing county aggregates, and adds standardized county names from fips.
    """
    if not source.BEDS_FIELD_MAP:
        raise ValueError("Source must have beds field map.")

    # Invert the field map so source column names map to common field names.
    rename_map = {
        source_col: common_col
        for common_col, source_col in source.BEDS_FIELD_MAP.items()
    }
    df = source.data.rename(columns=rename_map)[list(rename_map.values())]
    df[cls.Fields.SOURCE] = source.SOURCE_NAME
    df[cls.Fields.GENERATED] = False

    if fill_missing_state:
        # Synthesize state-level rows from county data where no state row exists
        # and mark them as generated.
        generated_states = dataset_utils.aggregate_and_get_nonmatching(
            df,
            [cls.Fields.SOURCE, cls.Fields.STATE],
            AggregationLevel.COUNTY,
            AggregationLevel.STATE,
        ).reset_index()
        generated_states[cls.Fields.GENERATED] = True
        df = pd.concat([df, generated_states])

    # Add standardized county names from FIPS codes.
    df = dataset_utils.add_county_using_fips(df, dataset_utils.build_fips_data_frame())
    return cls(df)
def from_source(cls, source: "DataSource",
                fill_missing_state: bool = True) -> "TimeseriesDataset":
    """Loads data from a specific datasource.

    Args:
        source: DataSource to standardize for timeseries dataset
        fill_missing_state: If True, backfills missing state data by
            calculating county level aggregates.

    Returns:
        Timeseries object.
    """
    data = source.data
    # Grouping keys used when combining the NYC boroughs into one county row.
    group = [
        CommonFields.DATE,
        CommonFields.COUNTRY,
        CommonFields.AGGREGATE_LEVEL,
        CommonFields.STATE,
    ]
    data = custom_aggregations.update_with_combined_new_york_counties(
        data, group, are_boroughs_zero=source.HAS_AGGREGATED_NYC_BOROUGH)
    if fill_missing_state:
        # Backfill state-level rows by aggregating county data; the helper
        # returns only rows not already present at the state level.
        state_groupby_fields = [
            CommonFields.DATE,
            CommonFields.COUNTRY,
            CommonFields.STATE,
        ]
        non_matching = dataset_utils.aggregate_and_get_nonmatching(
            data,
            state_groupby_fields,
            AggregationLevel.COUNTY,
            AggregationLevel.STATE,
        ).reset_index()
        data = pd.concat([data, non_matching])

    # Add standardized county names from FIPS codes.
    fips_data = dataset_utils.build_fips_data_frame()
    data = dataset_utils.add_county_using_fips(data, fips_data)

    # State-level rows derive their FIPS code from the state abbreviation.
    is_state = data[
        CommonFields.AGGREGATE_LEVEL] == AggregationLevel.STATE.value
    state_fips = data.loc[is_state, CommonFields.STATE].map(
        us_state_abbrev.ABBREV_US_FIPS)
    data.loc[is_state, CommonFields.FIPS] = state_fips

    # Rows that still lack a FIPS code cannot be keyed; log and drop them.
    no_fips = data[CommonFields.FIPS].isnull()
    if no_fips.any():
        _log.warning("Dropping rows without FIPS",
                     source=str(source),
                     rows=repr(data.loc[no_fips]))
        data = data.loc[~no_fips]

    # Duplicate timeseries keys are a hard error rather than a silent dedup.
    dups = data.duplicated(COMMON_FIELDS_TIMESERIES_KEYS, keep=False)
    if dups.any():
        raise DuplicateDataException(f"Duplicates in {source}", data.loc[dups])

    # Choosing to sort by date
    data = data.sort_values(CommonFields.DATE)
    return cls(data, provenance=source.provenance)
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Tag every DH row as county-level and backfill FIPS codes from county names."""
    # All DH data is aggregated at the county level.
    data[cls.Fields.AGGREGATE_LEVEL] = "county"
    # Backfilling FIPS data based on county names.
    # TODO: Fix all missing cases
    return match_county_to_fips(data, dataset_utils.build_fips_data_frame())
def from_source(cls, source: "DataSource",
                fill_missing_state: bool = True) -> "TimeseriesDataset":
    """Loads data from a specific datasource.

    Args:
        source: DataSource to standardize for timeseries dataset
        fill_missing_state: If True, backfills missing state data by
            calculating county level aggregates.

    Returns:
        Timeseries object.
    """
    # TODO(tom): Do this renaming upstream, when the source is loaded or when
    # first copied from the third party.
    rename_map = {
        source_col: common_col
        for common_col, source_col in source.all_fields_map().items()
    }
    df = source.data.rename(columns=rename_map)[list(rename_map.values())]

    # Combine the NYC boroughs into a single county row.
    ny_group_keys = [
        CommonFields.DATE,
        CommonFields.COUNTRY,
        CommonFields.AGGREGATE_LEVEL,
        CommonFields.STATE,
    ]
    df = custom_aggregations.update_with_combined_new_york_counties(
        df, ny_group_keys, are_boroughs_zero=source.HAS_AGGREGATED_NYC_BOROUGH)

    if fill_missing_state:
        # Backfill state-level rows by aggregating county data.
        generated_states = dataset_utils.aggregate_and_get_nonmatching(
            df,
            [CommonFields.DATE, CommonFields.COUNTRY, CommonFields.STATE],
            AggregationLevel.COUNTY,
            AggregationLevel.STATE,
        ).reset_index()
        df = pd.concat([df, generated_states])

    # Add standardized county names from FIPS codes.
    df = dataset_utils.add_county_using_fips(df, dataset_utils.build_fips_data_frame())

    # State-level rows derive their FIPS code from the state abbreviation.
    is_state = df[CommonFields.AGGREGATE_LEVEL] == AggregationLevel.STATE.value
    df.loc[is_state, CommonFields.FIPS] = df.loc[is_state, CommonFields.STATE].map(
        us_state_abbrev.ABBREV_US_FIPS)

    # Choosing to sort by date
    return cls(df.sort_values(CommonFields.DATE))
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Standardize CDS data.

    Classifies each row's aggregation level, restricts to US rows,
    normalizes country/state fields, backfills FIPS from county names,
    removes duplicate (date, fips) rows, and derives negative test counts.

    Args:
        data: Raw CDS dataframe.

    Returns:
        Standardized dataframe.
    """
    data = dataset_utils.strip_whitespace(data)
    data = cls.remove_duplicate_city_data(data)

    # CDS state level aggregates are identifiable by not having a city or county.
    only_county = data[cls.Fields.COUNTY].notnull() & data[cls.Fields.STATE].notnull()
    county_hits = numpy.where(only_county, "county", None)
    only_state = (
        data[cls.Fields.COUNTY].isnull()
        & data[cls.Fields.CITY].isnull()
        & data[cls.Fields.STATE].notnull()
    )
    only_country = (
        data[cls.Fields.COUNTY].isnull()
        & data[cls.Fields.CITY].isnull()
        & data[cls.Fields.STATE].isnull()
        & data[cls.Fields.COUNTRY].notnull()
    )
    state_hits = numpy.where(only_state, "state", None)
    # Elementwise `!= None` is deliberate here: these are numpy object arrays.
    county_hits[state_hits != None] = state_hits[state_hits != None]
    county_hits[only_country] = "country"
    data[cls.Fields.AGGREGATE_LEVEL] = county_hits

    # Backfilling FIPS data based on county names.
    # The following abbrev mapping only makes sense for the US.
    # TODO: Fix all missing cases
    data = data[data["country"] == "United States"]
    data[CommonFields.COUNTRY] = "USA"
    data[CommonFields.STATE] = data[cls.Fields.STATE].apply(
        lambda x: US_STATE_ABBREV[x] if x in US_STATE_ABBREV else x
    )

    fips_data = dataset_utils.build_fips_data_frame()
    data = dataset_utils.add_fips_using_county(data, fips_data)

    # Rows without a FIPS code cannot be keyed; log and drop them.
    no_fips = data[CommonFields.FIPS].isna()
    if no_fips.any():
        logging.error(f"Removing {no_fips.sum()} rows without fips id")
        data = data.loc[~no_fips]

    data.set_index(["date", "fips"], inplace=True)
    if data.index.has_duplicates:
        # Use keep=False when logging so the output contains all duplicated rows,
        # not just the first or last instance of each duplicate.
        duplicated = data.index.duplicated(keep=False)
        logging.error(f"Removing duplicates: {str(data.loc[duplicated])}")
        # NOTE(review): keep=False also drops *every* copy of a duplicated
        # (date, fips) pair rather than keeping one representative row —
        # confirm that dropping all ambiguous rows is the intended policy.
        data = data.loc[~duplicated]
    data.reset_index(inplace=True)

    # ADD Negative tests
    data[cls.Fields.NEGATIVE_TESTS] = data[cls.Fields.TESTED] - data[cls.Fields.CASES]
    return data
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Standardize CDS data.

    Fills missing county names from city data for rows before 2020-03-23,
    drops duplicated city rows from then on, classifies each row's
    aggregation level, restricts to US rows, backfills FIPS from county
    names, and derives negative test counts.

    Args:
        data: Raw CDS dataframe.

    Returns:
        Standardized dataframe.
    """
    data = dataset_utils.strip_whitespace(data)

    # Don't want to return city data because it's duplicated in county.
    # City data before 3-23 was not duplicated.
    # Take an explicit copy so the county assignment below writes to a real
    # frame, not a view (avoids SettingWithCopy / silently-lost writes), and
    # assign by column name rather than attribute for the same reason.
    pre_march_23 = data[data.date < "2020-03-23"].copy()
    pre_march_23["county"] = pre_march_23.apply(fill_missing_county_with_city, axis=1)
    split_data = [
        pre_march_23,
        data[(data.date >= "2020-03-23") & data[cls.Fields.CITY].isnull()],
    ]
    data = pd.concat(split_data)

    # CDS state level aggregates are identifiable by not having a city or county.
    only_county = (data[cls.Fields.COUNTY].notnull()
                   & data[cls.Fields.STATE].notnull())
    county_hits = numpy.where(only_county, "county", None)
    only_state = (data[cls.Fields.COUNTY].isnull()
                  & data[cls.Fields.CITY].isnull()
                  & data[cls.Fields.STATE].notnull())
    only_country = (data[cls.Fields.COUNTY].isnull()
                    & data[cls.Fields.CITY].isnull()
                    & data[cls.Fields.STATE].isnull()
                    & data[cls.Fields.COUNTRY].notnull())
    state_hits = numpy.where(only_state, "state", None)
    # Elementwise `!= None` is deliberate here: these are numpy object arrays.
    county_hits[state_hits != None] = state_hits[state_hits != None]
    county_hits[only_country] = "country"
    data[cls.Fields.AGGREGATE_LEVEL] = county_hits

    # Backfilling FIPS data based on county names.
    # The following abbrev mapping only makes sense for the US.
    # TODO: Fix all missing cases
    data = data[data["country"] == "United States"]

    # FIPS matching needs two-letter state abbreviations; swap them in via
    # scratch columns and restore the original values afterwards.
    data["state_abbr"] = data[cls.Fields.STATE].apply(
        lambda x: US_STATE_ABBREV[x] if x in US_STATE_ABBREV else x)
    data["state_tmp"] = data["state"]
    data["state"] = data["state_abbr"]

    fips_data = dataset_utils.build_fips_data_frame()
    data = dataset_utils.add_fips_using_county(data, fips_data)

    # ADD Negative tests
    data[cls.Fields.NEGATIVE_TESTS] = (data[cls.Fields.TESTED]
                                       - data[cls.Fields.CASES])

    # Put the state column back and drop the scratch columns so they don't
    # leak into the returned dataset.
    data["state"] = data["state_tmp"]
    data = data.drop(columns=["state_abbr", "state_tmp"])
    return data
def from_source(cls, source: "DataSource", fill_missing_state=True):
    """Loads data from a specific datasource.

    Remaps columns from the source dataset, fills in missing state data by
    computing county aggregates, and adds standardized county names from fips.

    Args:
        source: Data source.
        fill_missing_state: If True, fills in missing state level data by
            aggregating county level for a given state.
    """
    if not source.BEDS_FIELD_MAP:
        raise ValueError("Source must have beds field map.")

    # Invert the field map so source column names map to common field names.
    rename_map = {
        source_col: common_col
        for common_col, source_col in source.BEDS_FIELD_MAP.items()
    }
    df = source.data.rename(columns=rename_map)[list(rename_map.values())]
    df[cls.Fields.SOURCE] = source.SOURCE_NAME
    df[cls.Fields.GENERATED] = False

    # Max bed count is the larger of the staffed and licensed bed counts.
    bed_columns = [cls.Fields.STAFFED_BEDS, cls.Fields.LICENSED_BEDS]
    df[cls.Fields.MAX_BED_COUNT] = df[bed_columns].max(axis=1)

    # When grouping NYC data, the generated flag must be part of the group
    # key so it is not summed as a value.
    ny_group_keys = cls.STATE_GROUP_KEY + [cls.Fields.GENERATED]
    df = custom_aggregations.update_with_combined_new_york_counties(
        df, ny_group_keys, are_boroughs_zero=False)

    if fill_missing_state:
        # Synthesize state-level rows from county data and mark them generated.
        generated_states = dataset_utils.aggregate_and_get_nonmatching(
            df,
            cls.STATE_GROUP_KEY,
            AggregationLevel.COUNTY,
            AggregationLevel.STATE,
        ).reset_index()
        generated_states[cls.Fields.GENERATED] = True
        df = pd.concat([df, generated_states])

    # Add standardized county names from FIPS codes.
    df = dataset_utils.add_county_using_fips(df, dataset_utils.build_fips_data_frame())
    return cls(df)
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Tag DH rows as US county-level data, backfill FIPS, and drop territories
    without FIPS codes."""
    # All DH data is aggregated at the county level.
    data[cls.Fields.AGGREGATE_LEVEL] = "county"
    data[cls.Fields.COUNTRY] = "USA"

    # Backfilling FIPS data based on county names.
    data = match_county_to_fips(data, dataset_utils.build_fips_data_frame())

    # The Virgin Islands do not currently have associated fips codes.
    # If VI is supported in the future, this filter should be removed.
    return data[data[cls.Fields.STATE] != 'VI']
def from_source(cls, source: "DataSource", fill_missing_state=True):
    """Loads data from a specific datasource."""
    if not source.TIMESERIES_FIELD_MAP:
        raise ValueError("Source must have field timeseries field map.")

    # Invert the field map so source column names map to common field names.
    rename_map = {
        source_col: common_col
        for common_col, source_col in source.TIMESERIES_FIELD_MAP.items()
    }
    df = source.data.rename(columns=rename_map)[list(rename_map.values())]
    df[cls.Fields.SOURCE] = source.SOURCE_NAME
    df[cls.Fields.GENERATED] = False

    # Combine the NYC boroughs into a single county row; the generated flag
    # is part of the group key so it is not summed as a value.
    ny_group_keys = [
        cls.Fields.DATE,
        cls.Fields.SOURCE,
        cls.Fields.COUNTRY,
        cls.Fields.AGGREGATE_LEVEL,
        cls.Fields.STATE,
        cls.Fields.GENERATED,
    ]
    df = custom_aggregations.update_with_combined_new_york_counties(
        df, ny_group_keys, are_boroughs_zero=True)

    if fill_missing_state:
        # Synthesize state-level rows from county data and mark them generated.
        generated_states = dataset_utils.aggregate_and_get_nonmatching(
            df,
            [cls.Fields.DATE, cls.Fields.SOURCE, cls.Fields.COUNTRY, cls.Fields.STATE],
            AggregationLevel.COUNTY,
            AggregationLevel.STATE,
        ).reset_index()
        generated_states[cls.Fields.GENERATED] = True
        df = pd.concat([df, generated_states])

    # Add standardized county names from FIPS codes.
    df = dataset_utils.add_county_using_fips(df, dataset_utils.build_fips_data_frame())

    # Choosing to sort by date
    return cls(df.sort_values(cls.Fields.DATE))
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Tag DH beds rows as US county-level data, backfill FIPS, compute max bed
    counts, and drop territories without FIPS codes."""
    # All DH data is aggregated at the county level.
    data[cls.Fields.AGGREGATE_LEVEL] = "county"
    data[cls.Fields.COUNTRY] = "USA"

    # Backfill FIPS from county names, matching only against real county rows
    # (state-level and unknown-FIPS entries are excluded from the lookup).
    fips_data = dataset_utils.build_fips_data_frame()
    county_fips = fips_data[
        (fips_data.aggregate_level == AggregationLevel.COUNTY.value)
        & (fips_data.fips != enums.UNKNOWN_FIPS)
    ]
    data = match_county_to_fips(data, county_fips)

    # Max bed count is the larger of the staffed and licensed bed counts.
    bed_columns = [cls.Fields.STAFFED_BEDS, cls.Fields.LICENSED_BEDS]
    data[cls.Fields.MAX_BED_COUNT] = data[bed_columns].max(axis=1)

    # The virgin islands do not currently have associated fips codes.
    # If VI is supported in the future, this filter should be removed.
    return data[data[cls.Fields.STATE] != "VI"]
def from_source(cls, source: "DataSource", fill_missing_state=True):
    """Loads data from a specific datasource.

    Remaps columns from the source dataset, fills in missing state data by
    computing county aggregates, and adds standardized county names from fips.

    Args:
        source: Data source.
        fill_missing_state: If True, fills in missing state level data by
            aggregating county level for a given state.
    """
    if not source.COMMON_FIELD_MAP and not source.INDEX_FIELD_MAP:
        raise ValueError("Source must have metadata field map.")

    # Invert the field map so source column names map to common field names.
    rename_map = {
        source_col: common_col
        for common_col, source_col in source.all_fields_map().items()
    }
    df = source.data.rename(columns=rename_map)[list(rename_map.values())]

    df = cls._aggregate_new_york_data(df)

    if fill_missing_state:
        # Backfill state-level rows by aggregating county data.
        generated_states = dataset_utils.aggregate_and_get_nonmatching(
            df,
            cls.STATE_GROUP_KEY,
            AggregationLevel.COUNTY,
            AggregationLevel.STATE,
        ).reset_index()
        df = pd.concat([df, generated_states])

    # Add standardized county names from FIPS codes.
    df = dataset_utils.add_county_using_fips(df, dataset_utils.build_fips_data_frame())

    # State-level rows derive their FIPS code from the state abbreviation.
    is_state = df[cls.Fields.AGGREGATE_LEVEL] == AggregationLevel.STATE.value
    df.loc[is_state, cls.Fields.FIPS] = df.loc[is_state, cls.Fields.STATE].map(
        us_state_abbrev.ABBREV_US_FIPS)

    return cls(df)
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Standardize CDS data.

    Classifies each row's aggregation level, restricts to US rows,
    backfills FIPS from county names, and derives negative test counts.

    Args:
        data: Raw CDS dataframe.

    Returns:
        Standardized dataframe.
    """
    data = dataset_utils.strip_whitespace(data)
    data = cls.remove_duplicate_city_data(data)

    # CDS state level aggregates are identifiable by not having a city or county.
    only_county = data[cls.Fields.COUNTY].notnull() & data[
        cls.Fields.STATE].notnull()
    county_hits = numpy.where(only_county, "county", None)
    only_state = (data[cls.Fields.COUNTY].isnull()
                  & data[cls.Fields.CITY].isnull()
                  & data[cls.Fields.STATE].notnull())
    only_country = (data[cls.Fields.COUNTY].isnull()
                    & data[cls.Fields.CITY].isnull()
                    & data[cls.Fields.STATE].isnull()
                    & data[cls.Fields.COUNTRY].notnull())
    state_hits = numpy.where(only_state, "state", None)
    # Elementwise `!= None` is deliberate here: these are numpy object arrays.
    county_hits[state_hits != None] = state_hits[state_hits != None]
    county_hits[only_country] = "country"
    data[cls.Fields.AGGREGATE_LEVEL] = county_hits

    # Backfilling FIPS data based on county names.
    # The following abbrev mapping only makes sense for the US.
    # TODO: Fix all missing cases
    data = data[data["country"] == "United States"]

    # FIPS matching needs two-letter state abbreviations; swap them in via
    # scratch columns and restore the original values afterwards.
    data["state_abbr"] = data[cls.Fields.STATE].map(
        lambda x: US_STATE_ABBREV.get(x, x))
    data["state_tmp"] = data["state"]
    data["state"] = data["state_abbr"]

    fips_data = dataset_utils.build_fips_data_frame()
    data = dataset_utils.add_fips_using_county(data, fips_data)

    # ADD Negative tests
    data[cls.Fields.NEGATIVE_TESTS] = (data[cls.Fields.TESTED]
                                       - data[cls.Fields.CASES])

    # Put the state column back and drop the scratch columns so they don't
    # leak into the returned dataset.
    data["state"] = data["state_tmp"]
    data = data.drop(columns=["state_abbr", "state_tmp"])
    return data
def standardize_data(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Standardize CDS data.

    Fills missing county names from city data for rows before 2020-03-23,
    drops duplicated city rows from then on, classifies each row's
    aggregation level, and backfills FIPS from county names.

    Args:
        data: Raw CDS dataframe.

    Returns:
        Standardized dataframe.
    """
    data = dataset_utils.strip_whitespace(data)

    # Don't want to return city data because it's duplicated in county.
    # City data before 3-23 was not duplicated.
    # Take an explicit copy so the county assignment below writes to a real
    # frame, not a view (avoids SettingWithCopy / silently-lost writes), and
    # assign by column name rather than attribute for the same reason.
    pre_march_23 = data[data.date < "2020-03-23"].copy()
    pre_march_23["county"] = pre_march_23.apply(fill_missing_county_with_city, axis=1)
    split_data = [
        pre_march_23,
        data[(data.date >= "2020-03-23") & data[cls.Fields.CITY].isnull()],
    ]
    data = pd.concat(split_data)

    # CDS state level aggregates are identifiable by not having a city or county.
    only_county = (data[cls.Fields.COUNTY].notnull()
                   & data[cls.Fields.STATE].notnull())
    county_hits = numpy.where(only_county, "county", None)
    only_state = (data[cls.Fields.COUNTY].isnull()
                  & data[cls.Fields.CITY].isnull()
                  & data[cls.Fields.STATE].notnull())
    only_country = (data[cls.Fields.COUNTY].isnull()
                    & data[cls.Fields.CITY].isnull()
                    & data[cls.Fields.STATE].isnull()
                    & data[cls.Fields.COUNTRY].notnull())
    state_hits = numpy.where(only_state, "state", None)
    # Elementwise `!= None` is deliberate here: these are numpy object arrays.
    county_hits[state_hits != None] = state_hits[state_hits != None]
    county_hits[only_country] = "country"
    data[cls.Fields.AGGREGATE_LEVEL] = county_hits

    # Backfilling FIPS data based on county names.
    # TODO: Fix all missing cases
    fips_data = dataset_utils.build_fips_data_frame()
    data = dataset_utils.add_fips_using_county(data, fips_data)
    return data
def from_source(cls, source: "DataSource", fill_missing_state: bool = True,
                fill_na: bool = True) -> "Timeseries":
    """Loads data from a specific datasource.

    Args:
        source: DataSource to standardize for timeseries dataset
        fill_missing_state: If True, backfills missing state data by
            calculating county level aggregates.
        fill_na: If True, fills in all NaN values for metrics columns.

    Returns:
        Timeseries object.
    """
    if not source.TIMESERIES_FIELD_MAP:
        raise ValueError("Source must have field timeseries field map.")

    data = source.data
    # Invert the field map so source column names map to common field names,
    # then keep only the mapped columns.
    to_common_fields = {
        value: key for key, value in source.TIMESERIES_FIELD_MAP.items()
    }
    final_columns = to_common_fields.values()
    data = data.rename(columns=to_common_fields)[final_columns]
    data[cls.Fields.SOURCE] = source.SOURCE_NAME
    data[cls.Fields.GENERATED] = False

    # Combine the NYC boroughs into a single county row; the generated flag
    # is part of the group key so it is not summed as a value.
    group = [
        cls.Fields.DATE,
        cls.Fields.SOURCE,
        cls.Fields.COUNTRY,
        cls.Fields.AGGREGATE_LEVEL,
        cls.Fields.STATE,
        cls.Fields.GENERATED,
    ]
    data = custom_aggregations.update_with_combined_new_york_counties(
        data, group, are_boroughs_zero=True)

    if fill_missing_state:
        # Backfill state-level rows by aggregating county data and mark the
        # synthesized rows as generated.
        state_groupby_fields = [
            cls.Fields.DATE,
            cls.Fields.SOURCE,
            cls.Fields.COUNTRY,
            cls.Fields.STATE,
        ]
        non_matching = dataset_utils.aggregate_and_get_nonmatching(
            data,
            state_groupby_fields,
            AggregationLevel.COUNTY,
            AggregationLevel.STATE,
        ).reset_index()
        non_matching[cls.Fields.GENERATED] = True
        data = pd.concat([data, non_matching])

    if fill_na:
        # Filtering out metric columns that don't exist in the dataset.
        # It might be that we want all timeseries datasets to have all of the
        # metric columns. If so, initialization of the missing columns should
        # come earlier.
        metric_columns = [
            field for field in cls.Fields.metrics() if field in data.columns
        ]
        data[metric_columns] = data[metric_columns].fillna(0.0)

    # Add standardized county names from FIPS codes.
    fips_data = dataset_utils.build_fips_data_frame()
    data = dataset_utils.add_county_using_fips(data, fips_data)

    # Choosing to sort by date
    data = data.sort_values(cls.Fields.DATE)
    return cls(data)