def normalize(self, data: str) -> pd.DataFrame:
    """Normalize HHS facility-level weekly hospital capacity data.

    Takes the already-parsed report frame, attaches a week-ending ``dt``
    and an integer county-FIPS ``location`` (patching several retired
    Alaska/Virginia FIPS codes), converts the 7-day ``*_sum``/``*_coverage``
    column pairs into per-day averages, and returns a long-form county
    frame with CMU metadata, vintage, and location_type attached.
    """
    # Read the dataframe from the string csv
    df = data.copy()
    df.columns = [x.lower().strip() for x in df.columns]

    # Set date and fips code
    # NOTE: collection_week refers to the first day of the week, so add 6
    # days to get the last day.
    df.loc[:, "dt"] = pd.to_datetime(
        df["collection_week"]) + timedelta(days=6)

    # Filter out all of the columns without a fips code for now -- I
    # think that it is likely that we could reverse engineer these
    # either by looking them up or by mapping city to county
    df = df.loc[~df["fips_code"].isna(), :]

    # :see_no_evil:
    # Remap retired/incorrect FIPS codes to their current counterparts.
    df["location"] = (
        df["fips_code"].astype(int).replace({
            # 02120 corresponded to Kenai-Cook Inlet Division... It was
            # then the relevant piece became Kenai Peninsula Borough which
            # is 02122
            # https://data.nber.org/asg/ASG_release/County_City/FIPS/FIPS_Changes.pdf
            2120: 2122,
            # City associated with the hospital is Seward which is in the
            # Kenai Borough which is 02122 but I have no idea how this
            # ended up with fips code 02210???
            # https://en.wikipedia.org/wiki/Seward,_Alaska
            2210: 2122,
            # 02260 was fips code for Valdez-Chitina-Whittier Division... It
            # was then put into Valdez–Cordova Census Area which is
            # 02261, but 02261 was split in Jan 2019 and we'll need to change
            # this again if we update geographies
            # https://data.nber.org/asg/ASG_release/County_City/FIPS/FIPS_Changes.pdf
            2260: 2261,
            # 02280 corresponded to Wrangell-Petersburg but became the
            # Petersburg Borough 02195 in 2012
            # https://www.cdc.gov/nchs/nvss/bridged_race/county_geography-_changes2015.pdf
            2280: 2195,
            # City associated with the hospital is Cordova which is in the
            # Valdez-Cordova census area but I don't know which one this
            # ended up in after the split...
            # https://en.wikipedia.org/wiki/Cordova,_Alaska
            2080: 2261,
            # Source of change: https://www.cdc.gov/nchs/nvss/bridged_race/county_geography-_changes2015.pdf
            # page 6
            # Virginia, 2013: Bedford (independent) city (FIPS 51515) was changed to
            # town status and added to Bedford County (FIPS 51019) effective July 1st, 2013
            51515: 51019,
        }))

    # Set all missing values (-999999) to nan for all numeric columns
    # NOTE(review): the `x > 0` mask also nulls out legitimate zero
    # counts, not just the -999999 sentinel -- confirm this is intended.
    numeric_cols = list(df.select_dtypes("number"))
    df.loc[:, numeric_cols] = df.loc[:, numeric_cols].where(
        lambda x: x > 0, np.nan)

    # Variables that can be determined with "simple average":
    # per-day average = weekly sum / number of days reported (coverage).
    vars_to_compute_avg = [
        "inpatient_beds_7_day",
        "inpatient_beds_used_7_day",
        "total_staffed_adult_icu_beds_7_day",
        "staffed_adult_icu_bed_occupancy_7_day",
        "staffed_icu_adult_patients_confirmed_covid_7_day",
    ]
    for var in vars_to_compute_avg:
        df.loc[:, f"{var}_canavg"] = df.eval(f"{var}_sum / {var}_coverage")

    # Variables that require "more complicated average":
    # covid bed usage = adult average + pediatric average.
    aps = "total_adult_patients_hospitalized_confirmed_covid_7_day_sum"
    apc = "total_adult_patients_hospitalized_confirmed_covid_7_day_coverage"
    pps = "total_pediatric_patients_hospitalized_confirmed_covid_7_day_sum"
    ppc = "total_pediatric_patients_hospitalized_confirmed_covid_7_day_coverage"
    temp = df.eval(f"{aps} / {apc}")
    # Do the pediatric sum second so that we keep adult values if they're available
    # (while filling pediatric missing data with 0s) but if adult is missing then
    # it will stay as missing
    temp = temp + df.eval(f"{pps} / {ppc}").fillna(0.0)
    df.loc[:, "inpatient_beds_used_covid_7_day_canavg"] = temp.values

    # Map each computed average column onto its CMU variable.
    crename = {
        "inpatient_beds_7_day_canavg": CMU(
            category="hospital_beds_capacity",
            measurement="rolling_average_7_day",
            unit="beds",
        ),
        "inpatient_beds_used_7_day_canavg": CMU(
            category="hospital_beds_in_use",
            measurement="rolling_average_7_day",
            unit="beds",
        ),
        # This column is generated by summing adult and pediatric
        # beds -- Should be missing if either is missing
        "inpatient_beds_used_covid_7_day_canavg": CMU(
            category="hospital_beds_in_use_covid",
            measurement="rolling_average_7_day",
            unit="beds",
        ),
        "total_staffed_adult_icu_beds_7_day_canavg": CMU(
            category="adult_icu_beds_capacity",
            measurement="rolling_average_7_day",
            unit="beds",
        ),
        "staffed_adult_icu_bed_occupancy_7_day_canavg": CMU(
            category="adult_icu_beds_in_use",
            measurement="rolling_average_7_day",
            unit="beds",
        ),
        "staffed_icu_adult_patients_confirmed_covid_7_day_canavg": CMU(
            category="adult_icu_beds_in_use_covid",
            measurement="rolling_average_7_day",
            unit="beds",
        ),
    }

    # Reshape by putting into long form
    df_long = df.melt(id_vars=["dt", "location"],
                      value_vars=crename.keys()).dropna()
    # Strip thousands separators before converting to numbers.
    df_long.loc[:, "value"] = pd.to_numeric(
        df_long["value"].astype(str).str.replace(",", ""))

    # Add category, measurement, unit, age, sex, race
    df_long = self.extract_CMU(df_long, crename)

    # Group by relevant factors and sum -- many facilities roll up to one
    # county; skipna=False keeps a county missing if any facility is.
    identifier = [
        "dt",
        "location",
        "category",
        "measurement",
        "unit",
        "age",
        "sex",
        "race",
        "ethnicity",
    ]
    # TODO: We could do a different groupby and put this into states
    # or hospital regions
    out_county = (df_long.groupby(identifier)["value"].agg(
        pd.Series.sum, skipna=False).reset_index())

    # TODO: Throwing out territories because I don't remember which weren't
    # included in the census data :(
    out_county = out_county.query("location < 60_000").copy()

    # Add vintage
    out_county.loc[:, "vintage"] = self._retrieve_vintage()
    out_county.loc[:, "location_type"] = "county"

    cols_2_keep = identifier + ["vintage", "location_type", "value"]
    return out_county.loc[:, cols_2_keep]
def normalize(self, data) -> pd.DataFrame:
    """Build a long-form frame of county-level dose-1/dose-2 vaccine data.

    ``data`` is a sequence of six Google-chart columns: county names,
    supply, and administered counts for dose 1 (indexes 0-2), followed
    by the same trio for dose 2 (indexes 3-5).  Null cells arrive as a
    separate per-column index list and are re-inserted as zeros.
    """

    def filled(entry, column_kind):
        # Splice a 0 into every null position.  The null-index list is
        # sorted (in place, as upstream expects) first so that no insert
        # can land past the current end of the values list.
        values = entry[column_kind]["values"]
        entry["nullIndex"].sort()
        for position in entry["nullIndex"]:
            values.insert(position, 0)
        return values

    # Dose-1 frame: county name, supply, administered.
    dose1 = pd.DataFrame({
        "location_name": filled(data[0], "stringColumn"),
        "supplyVac1": filled(data[1], "doubleColumn"),
        "administeredVac1": filled(data[2], "doubleColumn"),
    })

    # Dose-2 frame, same layout.
    dose2 = pd.DataFrame({
        "location_name": filled(data[3], "stringColumn"),
        "supplyVac2": filled(data[4], "doubleColumn"),
        "administeredVac2": filled(data[5], "doubleColumn"),
    })

    # Combine both doses per county; outer join keeps counties present in
    # only one of the two result sets.
    merged = dose1.merge(dose2, on="location_name", how="outer")
    merged["dt"] = self.execution_dt
    merged["totalSupply"] = (
        merged["supplyVac2"] + merged["supplyVac1"])

    crename = {
        "totalSupply": CMU(
            category="total_vaccine_allocated",
            measurement="cumulative",
            unit="doses",
        ),
        "administeredVac1": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        ),
        "administeredVac2": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
    }

    # Long form + CMU metadata; drop the helper "variable" column.
    out = merged.melt(id_vars=["dt", "location_name"],
                      value_vars=crename.keys()).dropna()
    out["value"] = out["value"].astype(int)
    out["vintage"] = self._retrieve_vintage()
    out = self.extract_CMU(out, crename)
    return out.drop(["variable"], axis="columns")
def normalize(self, data) -> pd.DataFrame:
    """
    Cleans and normalizes the data we receive from the Google api.

    ``data`` holds Google-chart columns: dose-1 dates/supply/administered
    in indexes 0-2 and the dose-2 equivalents in indexes 3-5.  Null cells
    arrive as a separate per-column index list and are re-inserted as 0s.
    Returns long-form state-level data with CMU metadata attached.
    """
    datesListVac1 = data[0]["dateColumn"]["values"]
    nullValuesSupplyListVac1 = data[1]["nullIndex"]
    valuesSupplyListVac1 = data[1]["doubleColumn"]["values"]
    nullValuesAdminListVac1 = data[2]["nullIndex"]
    valuesAdminListVac1 = data[2]["doubleColumn"]["values"]

    # Sort the null-index lists so we don't get an out-of-bounds error
    # when we insert values of 0 in their places.
    nullValuesAdminListVac1.sort()
    nullValuesSupplyListVac1.sort()
    for value in nullValuesSupplyListVac1:
        valuesSupplyListVac1.insert(value, 0)
    for value in nullValuesAdminListVac1:
        valuesAdminListVac1.insert(value, 0)

    # creating dataframe for dose 1 data
    stateVaccineDataFrameVac1 = pd.DataFrame({
        "dt": datesListVac1,
        "supplyVac1": valuesSupplyListVac1,
        "administeredVac1": valuesAdminListVac1,
    })

    datesListVac2 = data[3]["dateColumn"]["values"]
    nullValuesSupplyListVac2 = data[4]["nullIndex"]
    valuesSupplyListVac2 = data[4]["doubleColumn"]["values"]
    nullValuesAdminListVac2 = data[5]["nullIndex"]
    valuesAdminListVac2 = data[5]["doubleColumn"]["values"]

    nullValuesAdminListVac2.sort()
    nullValuesSupplyListVac2.sort()
    for value in nullValuesSupplyListVac2:
        valuesSupplyListVac2.insert(value, 0)
    for value in nullValuesAdminListVac2:
        valuesAdminListVac2.insert(value, 0)

    # creating data frame for second dose of vaccines
    stateVaccineDataFrameVac2 = pd.DataFrame({
        "dt": datesListVac2,
        "supplyVac2": valuesSupplyListVac2,
        "administeredVac2": valuesAdminListVac2,
    })

    # merges two data frames together on date
    stateVaccineDataFrame = stateVaccineDataFrameVac1.merge(
        stateVaccineDataFrameVac2, on="dt", how="outer")

    # sums the first dose allocation and the second dose allocations together
    stateVaccineDataFrame["supplyTotal"] = (
        stateVaccineDataFrame["supplyVac2"] +
        stateVaccineDataFrame["supplyVac1"])

    # Cumulative vaccine supply: running total over the rows.
    # fillna(0) keeps parity with the previous per-row `.loc[0:x].sum()`
    # loop (Series.sum skips NaN), while cumsum() replaces that O(n^2)
    # loop with a single O(n) pass.
    stateVaccineDataFrame["supplyCumulative"] = (
        stateVaccineDataFrame["supplyTotal"].fillna(0).cumsum())

    stateVaccineDataFrame["location"] = self.state_fips
    stateVaccineDataFrame["dt"] = pd.to_datetime(
        stateVaccineDataFrame["dt"])

    crename = {
        "supplyCumulative": CMU(
            category="total_vaccine_allocated",
            measurement="cumulative",
            unit="doses",
        ),
        "administeredVac1": CMU(category="total_vaccine_initiated",
                                measurement="new",
                                unit="people"),
        "administeredVac2": CMU(category="total_vaccine_completed",
                                measurement="new",
                                unit="people"),
    }

    # Long form + CMU metadata; drop the helper "variable" column.
    out = stateVaccineDataFrame.melt(id_vars=["dt", "location"],
                                     value_vars=crename.keys()).dropna()
    out["value"] = out["value"].astype(int)
    out["vintage"] = self._retrieve_vintage()
    out = self.extract_CMU(out, crename)
    return out.drop(["variable"], axis="columns")
class GeorgiaCountyVaccineAge(GeorgiaCountyVaccine):
    """County-level cumulative vaccine initiations broken out by age band."""

    service = "Georgia_DPH_PUBLIC_Vaccination_Dashboard_V5_VIEW"
    sheet = 7
    # Column of the source table that carries the age-band label.
    column_names = ["AGE"]
    variables = {
        # Fixed: this bucket previously carried age="0-9", which overlapped
        # the "05_09" bucket below; "00-05" covers the under-5 population.
        "00-05": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="0-4",
        ),
        "05_09": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="5-9",
        ),
        "10_14": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="10-14",
        ),
        "15_19": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="15-19",
        ),
        "20_24": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="20-24",
        ),
        "25_34": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="25-34",
        ),
        "35_44": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="35-44",
        ),
        "45_54": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="45-54",
        ),
        "55_64": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="55-64",
        ),
        "65_74": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="65-74",
        ),
        "75_84": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="75-84",
        ),
        "85PLUS": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="85_plus",
        ),
    }

    def fetch(self):
        # Layer "6" of the dashboard sheet holds the per-age counts.
        return self.get_all_jsons(self.service, self.sheet, "6")

    def normalize(self, data):
        df = self.arcgis_jsons_to_df(data)
        # Pivot to a wide table: one row per county FIPS, one column per
        # age band, values are the counts.
        df = (df.pivot_table(index="COUNTYFIPS",
                             columns=self.column_names,
                             values="COUNTS").reset_index().rename_axis(
                                 None, axis=1))
        df = self._rename_or_add_date_and_location(
            df, location_column="COUNTYFIPS", timezone="US/Eastern")
        df = self._reshape_variables(df, self.variables)
        # Drop statewide / unknown placeholder locations.
        locs_to_drop = ["0", "00000", 0]
        df = df.query("location not in @locs_to_drop")
        return df
class LAVaccineCountyDemographics(LAVaccineCounty):
    """County-level vaccine initiation/completion split by race, age, sex.

    The source reports each demographic slice as a *percentage* column
    (``PercInt_*`` / ``PercComp_*``); ``normalize`` converts each into a
    person count (``*_value``) using the county 2018 population before
    reshaping.
    """

    # Keys are the derived count columns created in normalize() by
    # appending "_value" to the source percentage column names.
    variables = {
        # --- initiated, by race ---
        "PercInt_Black_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="black",
        ),
        "PercInt_White_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="white",
        ),
        "PercInt_Other_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="other",
        ),
        "PercInt_RaceUnk_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="unknown",
        ),
        # --- completed, by race ---
        "PercComp_Black_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            race="black",
        ),
        "PercComp_Other_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            race="other",
        ),
        "PercComp_RaceUnk_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            race="unknown",
        ),
        "PercComp_White_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            race="white",
        ),
        # --- initiated, by age ---
        "PercInt_5to17_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="5-17",
        ),
        "PercInt_18to29_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="18-29",
        ),
        "PercInt_30to39_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="30-39",
        ),
        "PercInt_40to49_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="40-49",
        ),
        "PercInt_50to59_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="50-59",
        ),
        "PercInt_60to69_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="60-69",
        ),
        "PercInt_70plus_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="70_plus",
        ),
        "PercInt_AgeUnk_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            age="unknown",
        ),
        # --- completed, by age ---
        # NOTE(review): there is no PercComp_AgeUnk entry mirroring
        # PercInt_AgeUnk -- confirm whether the source omits that column.
        "PercComp_5to17_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            age="5-17",
        ),
        "PercComp_18to29_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            age="18-29",
        ),
        "PercComp_30to39_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            age="30-39",
        ),
        "PercComp_40to49_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            age="40-49",
        ),
        "PercComp_50to59_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            age="50-59",
        ),
        "PercComp_60to69_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            age="60-69",
        ),
        "PercComp_70plus_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            age="70_plus",
        ),
        # --- initiated, by sex ---
        "PercInt_Female_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            sex="female",
        ),
        "PercInt_Male_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            sex="male",
        ),
        "PercInt_SexUnk_value": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            sex="unknown",
        ),
        # --- completed, by sex ---
        "PercComp_Female_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            sex="female",
        ),
        "PercComp_Male_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            sex="male",
        ),
        "PercComp_SexUnk_value": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            sex="unknown",
        ),
    }

    def normalize(self, data):
        """Convert per-county percentage columns into counts and reshape."""
        df = self.arcgis_jsons_to_df(data)

        # Multiply each of these columns by the population column to turn
        # a percentage into an (integer-floored) person count.
        init_col_names = [
            x.replace("_value", "") for x in self.variables.keys()
        ]
        for col in init_col_names:
            df[col + "_value"] = np.floor(
                (df[col] / 100) * df["Total_2018pop"])

        df = self._rename_or_add_date_and_location(
            df, location_column="PFIPS", timezone="US/Eastern")
        df = self._reshape_variables(df, self.variables)
        return df
class WisconsinVaccineAge(WisconsinVaccineCounty):
    """State-level vaccine initiations from the Wisconsin tableau dashboard,
    broken out by age band."""

    # Tableau worksheet holding the age-disaggregated data.
    data_tableau_table = "Age vax/unvax County"
    # age does not report missing/unknown entries
    missing_tableau_table = ""
    location_name_col = "AGG(Geography TT)-alias"
    location_type = "state"

    # map wide form column names into CMUs
    cmus = {
        "SUM(Initiation or completed count for TT)-alias": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        )
    }

    def _get_demographic(
        self, df: pd.DataFrame, demo: str, demo_col_name: str
    ) -> pd.DataFrame:
        """
        description: a general "normalize" function to avoid extra/copied code
        each demographic uses this in its respective normalize

        params:
            demo: the demographic as labeled according to CMU (age,sex,race, etc...)
            demo_col_name: the name of the demographic column from the fetched data

        returns: normalized data in long format
        """
        # county names (converted to title case)
        df["location_name"] = df[self.location_name_col].str.title()
        # fix county names mangled by the title-casing above
        df = df.replace(
            {"location_name": {"St Croix": "St. Croix", "Fond Du Lac": "Fond du Lac"}}
        )
        # parse out data columns; every configured CMU column must be present
        value_cols = list(set(df.columns) & set(self.cmus.keys()))
        assert len(value_cols) == len(self.cmus)

        df = (
            df.melt(id_vars=[demo_col_name, "location_name"], value_vars=value_cols)
            .dropna()
            .assign(
                dt=self._retrieve_dt(self.timezone),
                vintage=self._retrieve_vintage(),
                # strip thousands separators before numeric conversion
                value=lambda x: pd.to_numeric(
                    x["value"].astype(str).str.replace(",", "")
                ),
            )
            .pipe(self.extract_CMU, cmu=self.cmus)
        )
        # copy the raw demographic column into its CMU-named column
        df[demo] = df[demo_col_name]
        return df.drop(["variable", demo_col_name], axis=1)

    def fetch(self) -> pd.DataFrame:
        if self.missing_tableau_table:
            # extract both data table and missing data table
            dfs = [
                self.get_tableau_view().get(table)
                for table in [self.data_tableau_table, self.missing_tableau_table]
            ]
            return pd.concat(dfs)
        else:
            return self.get_tableau_view()[self.data_tableau_table]

    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
        df = self._get_demographic(df, "age", "Age-value")
        # standardize the open-ended age bucket label
        return df.replace({"age": {"65+": "65_plus"}})
def normalize(self, data) -> pd.DataFrame:
    """Build percent-fully-vaccinated county data for three age bands.

    ``data`` holds Google-chart columns in pairs: county names followed
    by the percent fully vaccinated, for all ages (indexes 0-1), 18+
    (2-3), and 65+ (4-5).

    Fixes a copy/paste bug in which the 65+ percentage list was
    zero-padded using the 18+ null-index list instead of its own.
    """

    def _filled(entry, column_kind):
        # Sort the null-index list in place, then splice a 0 into every
        # null position; sorting first guarantees each insert position is
        # within bounds.
        values = entry[column_kind]["values"]
        entry["nullIndex"].sort()
        for idx in entry["nullIndex"]:
            values.insert(idx, 0)
        return values

    def _age_frame(name_entry, pct_entry, age_label):
        # One frame per age band: county name + percent (scaled to 0-100).
        frame = pd.DataFrame({
            "location_name": _filled(name_entry, "stringColumn"),
            "TotalPer": _filled(pct_entry, "doubleColumn"),
        })
        frame["TotalPer"] = frame["TotalPer"] * 100
        frame["age"] = age_label
        return frame

    # Stack the three age bands on top of each other.  Each band now pads
    # with its *own* null indexes (the 65+ band previously reused 18+'s).
    combined = pd.concat([
        _age_frame(data[0], data[1], "all"),
        _age_frame(data[2], data[3], "18_plus"),
        _age_frame(data[4], data[5], "65_plus"),
    ])
    combined["dt"] = self.execution_dt

    crename = {
        "TotalPer": CMU(
            category="total_vaccine_completed",
            measurement="current",
            unit="percentage",
        ),
    }

    # Long form + CMU metadata; drop the helper "variable" column.
    out = combined.melt(
        id_vars=["dt", "location_name", "age"],
        value_vars=crename.keys()).dropna()
    out["value"] = out["value"].astype(int)
    out["vintage"] = self._retrieve_vintage()
    df = self.extract_CMU(
        out,
        crename,
        ["category", "measurement", "unit", "sex", "race", "ethnicity"],
    )
    return df.drop(["variable"], axis="columns")
def _make_cmu(cat):
    """Shorthand for a cumulative, people-denominated CMU of category *cat*."""
    return CMU(category=cat, measurement="cumulative", unit="people")
def normalize(self, data) -> pd.DataFrame:
    """Parse the cumulative-cases-by-age workbook into long CMU form."""
    # Load the spreadsheet and switch to our standard column names.
    cases = pd.read_excel(data.content, parse_dates=["DATE"]).rename(
        columns={
            "DATE": "dt",
            "COUNTY": "location_name",
            "AGE_GROUP": "age",
        })

    # Drop rows we don't track: pending ages, out-of-state/pending counties.
    age_not_keep = ["Pending"]
    loc_not_keep = ["Out of State", "Pending"]
    cases = cases.query(
        "(age not in @age_not_keep) & (location_name not in @loc_not_keep)"
    )

    # Fix a mis-capitalized county name, then map ages onto our buckets.
    cases = cases.replace({"location_name": {"Dekalb": "DeKalb"}})
    cases = self.translate_age(cases)

    crename = {
        "CASE_COUNT": CMU(category="cases",
                          measurement="cumulative",
                          unit="people"),
    }

    # Long form, then attach category/measurement/unit and demographics.
    cases = cases.melt(id_vars=["dt", "location_name", "age"],
                       value_vars=crename.keys()).dropna()
    cases = self.extract_CMU(
        cases,
        crename,
        ["category", "measurement", "unit", "race", "ethnicity", "sex"])

    cols_to_keep = [
        "dt",
        "location_name",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]
    out = cases.loc[:, cols_to_keep]
    out["value"] = out["value"].astype(int)
    out["vintage"] = self._retrieve_vintage()
    return out
def normalize(self, data: str) -> pd.DataFrame:
    """Normalize the HHS state-level hospital capacity timeseries.

    Lower-cases the column names, attaches a datetime ``dt`` and a numeric
    state-FIPS ``location``, then melts the staffing/capacity columns into
    long form with CMU metadata attached.
    """
    df = data.copy()
    df.columns = [x.lower().strip() for x in df.columns]

    # Set date and fips code
    df.loc[:, "dt"] = pd.to_datetime(df["date"])
    df.loc[:, "location"] = df["state"].map(
        lambda x: int(us.states.lookup(x).fips))

    crename = {
        "critical_staffing_shortage_today_yes": CMU(
            category="critical_staff_shortage_yes",
            measurement="current",
            unit="hospitals",
        ),
        "critical_staffing_shortage_today_no": CMU(
            category="critical_staff_shortage_no",
            measurement="current",
            unit="hospitals",
        ),
        # Fixed: this entry previously repeated the ..._today_yes key, so
        # the "yes" mapping above was silently overwritten and the
        # not-reported column was never pulled at all.
        "critical_staffing_shortage_today_not_reported": CMU(
            category="critical_staff_shortage_noreport",
            measurement="current",
            unit="hospitals",
        ),
        "critical_staffing_shortage_anticipated_within_week_yes": CMU(
            category="critical_staff_shortage_yes",
            measurement="anticipated_within_7_day",
            unit="hospitals",
        ),
        "critical_staffing_shortage_anticipated_within_week_no": CMU(
            category="critical_staff_shortage_no",
            measurement="anticipated_within_7_day",
            unit="hospitals",
        ),
        # Fixed: same duplicate-key problem as above.
        "critical_staffing_shortage_anticipated_within_week_not_reported": CMU(
            category="critical_staff_shortage_noreport",
            measurement="anticipated_within_7_day",
            unit="hospitals",
        ),
        "inpatient_beds": CMU(category="hospital_beds_capacity",
                              measurement="current",
                              unit="beds"),
        "inpatient_beds_used": CMU(category="hospital_beds_in_use",
                                   measurement="current",
                                   unit="beds"),
        "inpatient_beds_used_covid": CMU(
            category="hospital_beds_in_use_covid",
            measurement="current",
            unit="beds",
        ),
        "inpatient_beds_utilization": CMU(
            category="hospital_beds_in_use",
            measurement="current",
            unit="percentage",
        ),
        "total_staffed_adult_icu_beds": CMU(
            category="adult_icu_beds_capacity",
            measurement="current",
            unit="beds"),
        "staffed_adult_icu_bed_occupancy": CMU(
            category="adult_icu_beds_in_use",
            measurement="current",
            unit="beds"),
        "staffed_icu_adult_patients_confirmed_covid": CMU(
            category="adult_icu_beds_in_use_covid",
            measurement="current",
            unit="beds",
        ),
        # NOTE(review): this is the *covid* utilization share but is mapped
        # to the generic adult_icu_beds_in_use category -- confirm intended.
        "adult_icu_bed_covid_utilization": CMU(
            category="adult_icu_beds_in_use",
            measurement="current",
            unit="percentage",
        ),
    }

    # Put into long form
    out = df.melt(id_vars=["dt", "location"], value_vars=crename.keys())

    # Strip thousands separators; "nan" strings parse to NaN via to_numeric.
    out.loc[:, "value"] = pd.to_numeric(out["value"].astype(str).str.replace(
        ",", "").replace("nan", None))

    # Add category, measurement, unit, age, sex, race
    out = self.extract_CMU(out, crename)
    out["vintage"] = self._retrieve_vintage()

    cols_2_keep = [
        "vintage",
        "dt",
        "location",
        "category",
        "measurement",
        "unit",
        "age",
        "sex",
        "race",
        "value",
    ]
    # Fixed: cols_2_keep was computed but the full frame (including the
    # melt "variable" column) was returned; subset as clearly intended.
    return out.loc[:, cols_2_keep]
def normalize(self, data: pd.DataFrame) -> pd.DataFrame:
    """Normalize Michigan dose counts by county into cumulative long form.

    Builds a per-manufacturer/per-dose wide table, folds Detroit into
    Wayne county, fills missing dates with 0, and converts the daily
    counts into cumulative series before attaching CMU metadata.
    """
    # date is written out in first column name
    # Combine manufacturer + dose number into one variable label, e.g.
    # "Moderna" + "First Dose" -> "ModernaFirstDose".
    data["variable"] = data["Vaccine Type"] + data["Dose Number"]
    data["variable"] = data["variable"].str.replace(" ", "")

    def _make_cmu(cat):
        # Shorthand: all dose variables are cumulative people counts.
        return CMU(
            category=cat,
            measurement="cumulative",
            unit="people",
        )

    colnames = {
        "Person's Residence in County": "location_name",
        "Data as of": "dt",
        "Number of Doses": "value",
    }
    cmus = {
        # NOTE(review): J&J is single-dose, so its first dose counts as a
        # completion here (it is also added to total_initiated below).
        "J&JFirstDose": _make_cmu("janssen_vaccine_completed"),
        "ModernaFirstDose": _make_cmu("moderna_vaccine_initiated"),
        "ModernaSecondDose": _make_cmu("moderna_vaccine_completed"),
        "PfizerFirstDose": _make_cmu("pfizer_vaccine_initiated"),
        "PfizerSecondDose": _make_cmu("pfizer_vaccine_completed"),
        "total_initiated": _make_cmu("total_vaccine_initiated"),
        "total_completed": _make_cmu("total_vaccine_completed"),
        "total": CMU(
            category="total_vaccine_doses_administered",
            measurement="cumulative",
            unit="doses",
        ),
    }
    not_counties = ["No County", "Non-Michigan Resident"]  # noqa
    # need to sum over all the possible facility types for distribution
    df = (data.rename(
        columns=colnames
    ).loc[:, ["location_name", "dt", "variable", "value"]].query(
        "location_name not in @not_counties"
    ).assign(dt=lambda x: pd.to_datetime(x["dt"])).pivot_table(
        index=["dt", "location_name"],
        columns="variable",
        values="value",
        aggfunc="sum",
    ).fillna(0).astype(int).assign(
        total_initiated=lambda x: x.eval(
            "ModernaFirstDose + PfizerFirstDose") + x["J&JFirstDose"],
        total_completed=lambda x: x.eval(
            "ModernaSecondDose + PfizerSecondDose") + x["J&JFirstDose"],
    ).assign(total=lambda x: x.eval("total_initiated + total_completed"),
             ).loc[:, cmus.keys()])

    # Detroit data is reported separately from Wayne county. As detroit is not a real
    # county, combine data with Wayne county.
    is_wayne_county = df.index.get_level_values("location_name") == "Wayne"
    is_detroit = df.index.get_level_values("location_name") == "Detroit"
    renamed_detroit_data = df.loc[is_detroit, :].rename(
        index={"Detroit": "Wayne"}, level="location_name")

    # verify that indices are the same so that when adding data frames
    # no values are dropped
    assert renamed_detroit_data.index.equals(
        df.loc[is_wayne_county, :].index)
    df.loc[is_wayne_county, :] += renamed_detroit_data

    # Drop detroit data
    df = df.loc[~is_detroit, :]

    # now we need to reindex to fill in all dates -- fill missing with 0
    dates = pd.Series(df.index.get_level_values("dt")).agg(["min", "max"])
    new_index = pd.MultiIndex.from_product(
        [
            pd.date_range(*dates),
            df.index.get_level_values("location_name").unique(),
        ],
        names=["dt", "location_name"],
    )
    return (df.reindex(new_index, fill_value=0)  # fill in missing dates
            .sort_index()  # make sure we are sorted
            .unstack(level=[
                "location_name"
            ])  # make index=dt, columns=[variable,loc_name]
            .cumsum()  # compute cumulative sum
            .stack(level=[0, 1])  # long form Series
            .rename("value")  # name the series
            .reset_index()  # convert to long form df
            .assign(  # fill fix value
                value=lambda x: pd.to_numeric(x.loc[:, "value"]),
                vintage=self._retrieve_vintage(),
            ).pipe(self.extract_CMU, cmu=cmus)  # extract CMUs
            .drop(["variable"], axis=1)  # drop variable
            )
def normalize(self, data: pd.DataFrame) -> pd.DataFrame:
    """Melt a covidtracking-style state frame into long CMU form."""

    def cumulative(category, unit):
        # All but the two "currently" series are cumulative counts.
        return CMU(category=category, measurement="cumulative", unit=unit)

    column_map = {
        "death": cumulative("deaths", "people"),
        "hospitalizedCurrently": CMU(
            category="hospital_beds_in_use_covid",
            measurement="current",
            unit="beds",
        ),
        "inIcuCurrently": CMU(
            category="icu_beds_in_use_covid",
            measurement="current",
            unit="beds",
        ),
        "negative": cumulative("pcr_tests_negative", "unique_people"),
        "negativeTestsAntibody": cumulative(
            "antibody_tests_negative", "specimens"),
        "negativeTestsPeopleAntibody": cumulative(
            "antibody_tests_negative", "unique_people"),
        "negativeTestsViral": cumulative("pcr_tests_negative", "specimens"),
        "positive": cumulative("cases", "people"),
        "positiveCasesViral": cumulative(
            "pcr_tests_positive", "unique_people"),
        "positiveTestsAntibody": cumulative(
            "antibody_tests_positive", "specimens"),
        "positiveTestsAntigen": cumulative(
            "antigen_tests_positive", "specimens"),
        "positiveTestsPeopleAntibody": cumulative(
            "antibody_tests_positive", "unique_people"),
        "positiveTestsPeopleAntigen": cumulative(
            "antigen_tests_positive", "unique_people"),
        "positiveTestsViral": cumulative("pcr_tests_positive", "specimens"),
        "totalTestsAntigen": cumulative("antigen_tests_total", "specimens"),
        "totalTestsAntibody": cumulative("antibody_tests_total", "specimens"),
        "totalTestsPeopleAntibody": cumulative(
            "antibody_tests_total", "unique_people"),
        "totalTestsPeopleAntigen": cumulative(
            "antigen_tests_total", "unique_people"),
        "totalTestsPeopleViral": cumulative(
            "pcr_tests_total", "unique_people"),
        "totalTestsViral": cumulative("pcr_tests_total", "specimens"),
    }

    # Rename identifiers, melt to long form, then attach CMU metadata
    # along with the location type and vintage.
    renamed = data.rename(columns={"fips": "location", "date": "dt"})
    long_form = renamed.melt(
        id_vars=["dt", "location"],
        value_vars=column_map.keys(),
    ).dropna()
    long_form = self.extract_CMU(long_form, column_map)
    long_form = long_form.assign(
        location_type="state", vintage=self._retrieve_vintage())
    return long_form
def pre_normalize(self, data) -> pd.DataFrame:
    """Get icu and hospital usage by covid patients from the OpenDataCali api

    Parameters
    ----------
    data : List
        A list of json elements

    Returns
    -------
    df: pd.DataFrame
        A pandas DataFrame containing icu+hospital usage for each county
    """
    # Variable -> CMU descriptor for the columns we keep.
    variable_map = {
        "hospitalized_covid_patients": CMU(
            category="hospital_beds_in_use_covid",
            measurement="current",
            unit="beds",
        ),
        "all_hospital_beds": CMU(
            category="hospital_beds_capacity",
            measurement="current",
            unit="beds",
        ),
        "icu_covid_patients": CMU(
            category="icu_beds_in_use_covid",
            measurement="current",
            unit="beds",
        ),
    }

    frame = self.data_from_raw(data).rename(
        columns={"county": "location_name"}
    )

    # The API reports missing values as the string "None"; null those out
    # and coerce every coercible column to a numeric dtype.
    frame = frame.replace("None", None)
    frame = frame.apply(lambda col: pd.to_numeric(col, errors="ignore"))
    frame["dt"] = pd.to_datetime(frame["todays_date"])

    # Total covid ICU load = confirmed + suspected patients.
    frame["icu_covid_patients"] = (
        frame["icu_covid_confirmed_patients"]
        + frame["icu_suspected_covid_patients"]
    )

    # Reshape to long form and tag each row with its CMU descriptor.
    long_form = frame.melt(
        id_vars=["dt", "location_name"], value_vars=list(variable_map)
    ).dropna()
    long_form = self.extract_CMU(long_form, variable_map)

    keep = [
        "dt",
        "location_name",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]
    return long_form.loc[:, keep]
def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
    """Normalize the facility-level ICU table into one value per county.

    Cleans the string columns, derives combined adult+pediatric totals,
    melts to long form, sums the per-facility rows per county/variable,
    and tags each row with its CMU descriptor plus dt/vintage context.
    """
    # Strip junk characters out of the string columns before parsing numbers.
    junk_patterns = [r",", r"%", "nan"]
    text_columns = ["County", "FileNumber", "ProviderName"]
    df = self._clean_cols(df, junk_patterns, text_columns)
    df["location_name"] = df["County"].str.title()

    # Derived totals: adult + pediatric.
    df["ICU Census"] = df["Adult ICU Census"] + df["Pediatric ICU Census"]
    df["ICU Capacity"] = (
        df["Total AdultICU Capacity"] + df["Total PediatricICU Capacity"]
    )
    df["Available ICU"] = (
        df["Available Adult ICU"] + df["Available Pediatric ICU"]
    )

    # Every variable here is a current bed count, so the CMU map can be
    # built from a simple column -> category table.
    categories = {
        "Adult ICU Census": "adult_icu_beds_in_use",
        "Available Adult ICU": "adult_icu_beds_available",
        "Total AdultICU Capacity": "adult_icu_beds_capacity",
        "Pediatric ICU Census": "pediatric_icu_beds_in_use",
        "Available Pediatric ICU": "pediatric_icu_beds_available",
        "Total PediatricICU Capacity": "pediatric_icu_beds_capacity",
        "ICU Census": "icu_beds_in_use",
        "ICU Capacity": "icu_beds_capacity",
        "Available ICU": "icu_beds_available",
    }
    crename = {
        col: CMU(category=cat, measurement="current", unit="beds")
        for col, cat in categories.items()
    }

    # Drop the statewide "Grand Total" row, melt, and sum the facility
    # rows up to a single value per county/variable.
    long_form = (
        df.query("location_name != 'Grand Total'")
        .melt(id_vars=["location_name"], value_vars=list(crename))
        .dropna()
    )
    long_form["value"] = pd.to_numeric(long_form["value"])
    long_form = (
        long_form.groupby(["location_name", "variable"]).sum().reset_index()
    )
    long_form.loc[
        long_form["location_name"] == "Desoto", "location_name"
    ] = "DeSoto"

    # Extract category information and add other context.
    long_form = self.extract_CMU(long_form, crename)
    long_form["dt"] = self._retrieve_dt("US/Eastern")
    long_form["vintage"] = self._retrieve_vintage()
    self.clean_desoto(long_form)
    return long_form.loc[:, self.out_cols]
def normalize(self, data) -> pd.DataFrame:
    """Normalize the county vaccine JSON payload into long CMU format.

    Fix: the melted ``value`` column was cast with ``astype(int)``, which
    silently truncated the fractional part of the two percentage series
    (``percentPartiallyVaccinated`` / ``percentFullyVaccinated``).  Use
    ``pd.to_numeric`` instead so dose counts stay numeric and percentages
    keep their decimals.

    Raises
    ------
    ValueError
        If the JSON response does not contain a ``"data"`` key.
    """
    # Read data into data frame
    key = "data"
    if key not in data:
        raise ValueError(f"Expected to find {key} in JSON response")
    df = pd.DataFrame(data[key])

    # Determine what columns to keep
    cols_to_keep = [
        "county",
        "date",
        "modernaShipped",
        "pfizerShipped",
        "dosesAdministered",
        "totalShipped",
        "partiallyVaccinated",
        "fullyVaccinated",
        "percentPartiallyVaccinated",
        "percentFullyVaccinated",
    ]

    # Drop extraneous columns
    df = df.loc[:, cols_to_keep]

    # Rename columns
    df = df.rename(columns={"date": "dt", "county": "location_name"})

    # Convert date time column to a datetime
    df = df.assign(dt=lambda x: pd.to_datetime(x["dt"]))

    # Create dictionary for columns to map
    crename = {
        "modernaShipped": CMU(
            category="moderna_vaccine_distributed",
            measurement="cumulative",
            unit="doses",
        ),
        "pfizerShipped": CMU(
            category="pfizer_vaccine_distributed",
            measurement="cumulative",
            unit="doses",
        ),
        "dosesAdministered": CMU(
            category="total_vaccine_doses_administered",
            measurement="cumulative",
            unit="doses",
        ),
        "totalShipped": CMU(
            category="total_vaccine_distributed",
            measurement="cumulative",
            unit="doses",
        ),
        "partiallyVaccinated": variables.INITIATING_VACCINATIONS_ALL,
        "fullyVaccinated": variables.FULLY_VACCINATED_ALL,
        "percentPartiallyVaccinated": variables.PERCENTAGE_PEOPLE_INITIATING_VACCINE,
        "percentFullyVaccinated": variables.PERCENTAGE_PEOPLE_COMPLETING_VACCINE,
    }

    # Move things into long format
    df = df.melt(
        id_vars=["dt", "location_name"], value_vars=crename.keys()
    ).dropna()

    # Determine the category of each observation
    df = self.extract_CMU(df, crename)

    # Determine what columns to keep
    cols_to_keep = [
        "dt",
        "location_name",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]

    # Drop extraneous columns
    out = df.loc[:, cols_to_keep]

    # Convert value column -- NOT astype(int): that truncated the
    # percentage variables melted into this same column.
    out["value"] = pd.to_numeric(out["value"])

    # Add rows that don't change
    out["vintage"] = self._retrieve_vintage()

    return out
def normalize(self, data) -> pd.DataFrame:
    """Normalize the demographic cases/deaths workbook into long CMU format.

    Each source row carries exactly one demographic axis (RACE, ETHNICITY,
    or SEX) in ``Category``/``CAT_DETAIL``; the detail value is copied into
    the matching output column and the other axes are set to "all".
    """
    # Read data into data frame
    df = pd.read_excel(data.content, parse_dates=["Date"])

    # Rename columns
    df = df.rename(
        columns={
            "Date": "dt",
            "CAT_DETAIL": "category_detail",
            "Category": "category_name",
        }
    )

    # Drop the information that we won't be keeping track of
    cat_detail_not_keep = ["Pending"]
    df = df.query("category_detail not in @cat_detail_not_keep")

    # Translate race, ethnicity, and gender (sex) to standard names
    cat_detail_replace = {
        "American Indian or Alaska Native": "ai_an",
        "Asian": "asian",
        "Black or African American": "black",
        "White": "white",
        "Native Hawaiian or Other Pacific Islander": "pacific_islander",
        "Other/ Multiracial": "multiple_other",
        "Other/Multiracial": "multiple_other",
        "Hispanic": "hispanic",
        "Not Hispanic or Latino": "non-hispanic",
        "Female": "female",
        "Male": "male",
    }
    df = df.replace({"category_detail": cat_detail_replace})

    # Unpack category_name/category_detail into race/ethnicity/sex columns:
    # keep the detail where the row describes that axis, otherwise "all".
    for marker, column in (
        ("RACE", "race"),
        ("ETHNICITY", "ethnicity"),
        ("SEX", "sex"),
    ):
        df[column] = df["category_detail"].where(
            df["category_name"] == marker, "all"
        )

    # Create dictionary for columns to map
    crename = {
        "Cat_CaseCount": CMU(
            category="cases", measurement="cumulative", unit="people"
        ),
        "CAT_DEATHCOUNT": CMU(
            category="deaths", measurement="cumulative", unit="people"
        ),
    }

    # Move things into long format
    long_form = df.melt(
        id_vars=[
            "dt",
            "category_detail",
            "category_name",
            "race",
            "ethnicity",
            "sex",
        ],
        value_vars=list(crename),
    ).dropna()

    # Determine the category of each observation
    long_form = self.extract_CMU(
        long_form, crename, ["category", "measurement", "unit", "age"]
    )

    keep = [
        "dt",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]
    out = long_form.loc[:, keep]

    # Convert value column
    out["value"] = out["value"].astype(int)

    # Add rows that don't change
    out["location"] = self.state_fips
    out["vintage"] = self._retrieve_vintage()

    return out
class DelawareCountyVaccine(StateDashboard):
    """Scrape county-level vaccine counts from Delaware's My Healthy
    Community dashboard.

    Fix: ``_fetch_total_administered`` used ``raise "..."`` on a plain
    string, which is itself a ``TypeError`` in Python 3; it now raises
    ``ValueError`` with the typo corrected.  The per-county page scraping
    shared by both fetch helpers is factored into ``_chart_config`` /
    ``_series_frame``.
    """

    kent_fully_vaccinated_url = "https://myhealthycommunity.dhss.delaware.gov/locations/county-kent/covid19_vaccine_fully_vaccinated"
    new_castle_fully_vaccinated_url = "https://myhealthycommunity.dhss.delaware.gov/locations/county-new-castle/covid19_vaccine_fully_vaccinated"
    sussex_fully_vaccinated_url = "https://myhealthycommunity.dhss.delaware.gov/locations/county-sussex/covid19_vaccine_fully_vaccinated"
    kent_total_url = "https://myhealthycommunity.dhss.delaware.gov/locations/county-kent/covid19_vaccine_administrations"
    new_castle_total_url = "https://myhealthycommunity.dhss.delaware.gov/locations/county-new-castle/covid19_vaccine_administrations"
    sussex_total_url = "https://myhealthycommunity.dhss.delaware.gov/locations/county-sussex/covid19_vaccine_administrations"
    has_location = False
    location_type = "county"

    # Initialize
    source = "https://myhealthycommunity.dhss.delaware.gov"
    source_name = "Delaware Health and Social Services"
    state_fips = int(us.states.lookup("Delaware").fips)

    variables = {
        "FirstDose": CMU(
            category="total_vaccine_initiated", measurement="new", unit="doses"
        ),
        "SecondDose": CMU(
            category="total_vaccine_completed", measurement="new", unit="doses"
        ),
        "TotalDoses": CMU(
            category="total_vaccine_doses_administered", measurement="new", unit="doses"
        ),
    }

    @staticmethod
    def _chart_config(url, label, attr):
        """Fetch *url* and return the chart-config JSON stored on the div
        with aria-labelledby *label* under data attribute *attr*."""
        r = requests.get(url)
        soup = BeautifulSoup(r.text, features="lxml")
        return json.loads(soup.find("div", {"aria-labelledby": label})[attr])

    @staticmethod
    def _series_frame(series_data, start, column):
        """Build a daily-indexed single-column DataFrame from a chart series
        that starts on *start*."""
        idx = pd.date_range(start, periods=len(series_data), freq="d")
        return pd.DataFrame(data=series_data, columns=[column], index=idx)

    def _fetch_fully_vaccinated(self):
        """Return first/second-dose daily series for each county."""
        dfs = []
        pages = [
            ("Kent", self.kent_fully_vaccinated_url),
            ("New Castle", self.new_castle_fully_vaccinated_url),
            ("Sussex", self.sussex_fully_vaccinated_url),
        ]
        for county, url in pages:
            tdata = self._chart_config(
                url,
                "chart-covid-vaccine-fully-vaccinated-label",
                "data-charts--covid-vaccine-fully-vaccinated-config-value",
            )
            # Parse start date ([year, month, day])
            sd = tdata["startDate"]
            start = dt.datetime(sd[0], sd[1], sd[2])

            # series[0] is the first-dose data, series[1] the second-dose data
            df = self._series_frame(
                tdata["series"][0]["data"], start, "FirstDose"
            ).join(
                self._series_frame(tdata["series"][1]["data"], start, "SecondDose")
            )
            df["location_name"] = county
            dfs.append(df)

        return pd.concat(dfs)

    def _fetch_total_administered(self):
        """Return daily administered-dose counts for each county.

        Raises
        ------
        ValueError
            If a county page has no "Daily Count" series.
        """
        dfs = []
        pages = [
            ("Kent", self.kent_total_url),
            ("New Castle", self.new_castle_total_url),
            ("Sussex", self.sussex_total_url),
        ]
        for county, url in pages:
            tdata = self._chart_config(
                url,
                "chart-covid-vaccine-administrations-daily-label",
                "data-charts--covid-vaccine-administrations-daily-config-value",
            )
            # Parse start date ([year, month, day])
            sd = tdata["startDate"]
            start = dt.datetime(sd[0], sd[1], sd[2])

            total_df = None
            for srs in tdata["series"]:
                if srs["name"] == "Daily Count":
                    total_df = self._series_frame(srs["data"], start, "TotalDoses")
            if total_df is None:
                # was: `raise "..."` -- raising a str is a TypeError in py3
                raise ValueError("Couldn't get county total data")

            total_df["location_name"] = county
            dfs.append(total_df)

        return pd.concat(dfs)

    def fetch(self):
        """Fetch both dashboards and return the raw frames keyed by kind."""
        totals = self._fetch_total_administered()
        doses = self._fetch_fully_vaccinated()
        return {"totals": totals, "doses": doses}

    def normalize(self, data):
        """Join the totals and dose frames on (dt, county) and reshape."""
        totals = data["totals"].reset_index().rename(columns={"index": "dt"})
        doses = data["doses"].reset_index().rename(columns={"index": "dt"})
        df = (
            totals.set_index(["dt", "location_name"])
            .join(doses.set_index(["dt", "location_name"]))
            .reset_index()
        )
        # The two pages cover different date ranges; treat gaps as 0 doses.
        df = df.fillna(0)
        out = self._reshape_variables(df, self.variables)
        return out
class GeorgiaCountyVaccineRace(GeorgiaCountyVaccineAge):
    """Race-demographic county vaccine initiation counts, keyed on RACE_ID."""

    # Sheet index within the service queried by fetch() below.
    sheet = 6
    # The demographic column this subclass groups on.
    column_names = ["RACE_ID"]

    # RACE_ID -> CMU mapping.  The hyphenated keys look like HL7/CDCREC
    # race codes (1002-5 AI/AN, 2028-9 Asian, 2054-5 Black, 2076-8 Native
    # Hawaiian/Other Pacific Islander, 2106-3 White, 2131-1 Other).
    # NOTE(review): both "2076-8" and "2106-3" map to race="white", and
    # "2028-9" maps to "multiple" -- if the dashboard follows the standard
    # CDCREC codes these look mis-assigned (2076-8 -> pacific_islander,
    # 2028-9 -> asian).  Confirm against the source data before changing.
    variables = {
        "2054-5": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="black",
        ),
        "2076-8": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="white",
        ),
        "2106-3": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="white",
        ),
        "1002-5": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="ai_an",
        ),
        "2028-9": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="multiple",
        ),
        "ANHOPI": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="asian",
        ),
        "UNK": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="unknown",
        ),
        "2131-1": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
            race="other",
        ),
    }

    def fetch(self):
        # Query sheet 6 of the service; the third argument is the sheet's
        # string key expected by the helper.
        return self.get_all_jsons(self.service, self.sheet, "6")
class CDCCountyVaccine(FederalDashboard):
    """County-level vaccination data from the CDC covid data tracker.

    Fix: on a failed request, ``fetch`` passed ``response.content``
    (``bytes``) to ``textwrap.indent``, which only accepts ``str`` -- the
    error path raised ``TypeError`` instead of the intended
    ``RequestError``.  It now uses ``response.text``; the misplaced
    separator in the message ("\\n:") is corrected to ":\\n".
    """

    has_location = True
    location_type = "county"
    source = "https://covid.cdc.gov/covid-data-tracker/#county-view"
    url = ("https://covid.cdc.gov/covid-data-tracker/COVIDData/"
           "getAjaxData?id=vaccination_county_condensed_data")
    source_name = "Centers for Disease Control and Prevention"
    provider = "cdc"

    variables = {
        "Series_Complete_Yes": variables.FULLY_VACCINATED_ALL,
        "Series_Complete_18Plus": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            age="18_plus",
        ),
        "Series_Complete_65Plus": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
            age="65_plus",
        ),
        "Series_Complete_Pop_Pct": variables.PERCENTAGE_PEOPLE_COMPLETING_VACCINE,
        "Series_Complete_18PlusPop_Pct": CMU(
            category="total_vaccine_completed",
            measurement="current",
            unit="percentage",
            age="18_plus",
        ),
        "Series_Complete_65PlusPop_Pct": CMU(
            category="total_vaccine_completed",
            measurement="current",
            unit="percentage",
            age="65_plus",
        ),
    }

    def fetch(self):
        """Return the parsed JSON payload.

        Raises
        ------
        RequestError
            If the tracker endpoint responds with a non-OK status; the
            response body is included (indented) in the message.
        """
        response = requests.get(self.url)

        if not response.ok:
            msg = f"Failed to make request to {self.url}\n"
            msg += "Response from request was:\n"
            # Use .text (str) rather than .content (bytes):
            # textwrap.indent raises TypeError on bytes.
            msg += textwrap.indent(response.text, "\t")
            raise RequestError(msg)

        return response.json()

    def normalize(self, data):
        """Reshape the condensed county records into the long variable format,
        dropping rows with the unknown-location marker "UNK"."""
        df = pd.DataFrame.from_records(
            data["vaccination_county_condensed_data"])
        out = self._rename_or_add_date_and_location(
            df,
            location_column="FIPS",
            date_column="Date",
            locations_to_drop=["UNK"],
        )
        return self._reshape_variables(out, self.variables)
def normalize(self, data: str) -> pd.DataFrame:
    """Normalize the HHS facility-level hospitalization CSV (given as a
    string) into one long-format row per county/variable/week.

    NOTE(review): another variant of this loader in this file shifts
    ``collection_week`` forward 6 days (week start -> week end) and also
    remaps FIPS 51515 -> 51019 (Bedford city -> Bedford County).  Confirm
    whether this version should do the same.
    """
    # Read the dataframe from the string csv
    df = pd.read_csv(StringIO(data))
    df.columns = [x.lower().strip() for x in df.columns]

    # Set date and fips code
    df.loc[:, "dt"] = pd.to_datetime(df["collection_week"])

    # Filter out all of the columns without a fips code for now -- I
    # think that it is likely that we could reverse engineer these
    # either by looking them up or by mapping city to county
    df = df.loc[~df["fips_code"].isna(), :]

    # :see_no_evil:
    # Remap retired Alaska FIPS codes to their current counties.
    df["location"] = (
        df["fips_code"]
        .astype(int)
        .replace(
            {
                # 02120 corresponded to Kenai-Cook Inlet Division... It was
                # then the relevant piece became Kenai Peninsula Borough which
                # is 02122
                # https://data.nber.org/asg/ASG_release/County_City/FIPS/FIPS_Changes.pdf
                2120: 2122,
                # City associated with the hospital is Seward which is in the
                # Kenai Borough which is 02122 but I have no idea how this
                # ended up with fips code 02210???
                # https://en.wikipedia.org/wiki/Seward,_Alaska
                2210: 2122,
                # 02260 was fips code for Valdez-Chitina-Whittier Division... It
                # was then put into Valdez–Cordova Census Area which is
                # 02261, but 02261 was split in Jan 2019 and we'll need to change
                # this again if we update geographies
                # https://data.nber.org/asg/ASG_release/County_City/FIPS/FIPS_Changes.pdf
                2260: 2261,
                # 02280 corresponded to Wrangell-Petersburg but became the
                # Petersburg Borough 02195 in 2012
                # https://www.cdc.gov/nchs/nvss/bridged_race/county_geography-_changes2015.pdf
                2280: 2195,
                # City associated with the hospital is Cordova which is in the
                # Valdez-Cordova census area but I don't know which one this
                # ended up in after the split...
                # https://en.wikipedia.org/wiki/Cordova,_Alaska
                2080: 2261,
            }
        )
    )

    # Set all missing values (-999999) to nan for all numeric columns
    # NOTE(review): the mask keeps strictly positive values only, so
    # legitimate zero counts are also converted to NaN here.
    numeric_cols = list(df.select_dtypes("number"))
    df.loc[:, numeric_cols] = df.loc[:, numeric_cols].where(lambda x: x > 0, np.nan)

    # Create new columns that we need
    # Covid inpatient usage = adult + pediatric averages; if either side is
    # NaN the sum is NaN, so the row is dropped by the melt/dropna below.
    df["inpatient_beds_used_covid_7_day_avg"] = df.eval(
        "total_adult_patients_hospitalized_confirmed_covid_7_day_avg + "
        "total_pediatric_patients_hospitalized_confirmed_covid_7_day_avg"
    )

    crename = {
        "inpatient_beds_7_day_avg": CMU(
            category="hospital_beds_capacity",
            measurement="rolling_average_7_day",
            unit="beds",
        ),
        "inpatient_beds_used_7_day_avg": CMU(
            category="hospital_beds_in_use",
            measurement="rolling_average_7_day",
            unit="beds",
        ),
        # This column is generated by summing adult and pediatric
        # beds -- Should be missing if either is missing
        "inpatient_beds_used_covid_7_day_avg": CMU(
            category="hospital_beds_in_use_covid",
            measurement="rolling_average_7_day",
            unit="beds",
        ),
        "total_staffed_adult_icu_beds_7_day_avg": CMU(
            category="adult_icu_beds_capacity",
            measurement="rolling_average_7_day",
            unit="beds",
        ),
        "staffed_adult_icu_bed_occupancy_7_day_avg": CMU(
            category="adult_icu_beds_in_use",
            measurement="rolling_average_7_day",
            unit="beds",
        ),
        "staffed_icu_adult_patients_confirmed_covid_7_day_avg": CMU(
            category="adult_icu_beds_in_use_covid",
            measurement="rolling_average_7_day",
            unit="beds",
        ),
    }

    # Reshape by putting into long form
    df_long = df.melt(
        id_vars=["dt", "location"], value_vars=crename.keys()
    ).dropna()
    # Values may arrive as strings with thousands separators; strip the
    # commas before converting to numbers.
    df_long.loc[:, "value"] = pd.to_numeric(
        df_long["value"].astype(str).str.replace(",", "")
    )

    # Add category, measurement, unit, age, sex, race
    df_long = self.extract_CMU(df_long, crename)

    # Group by relevant factors and sum
    identifier = [
        "dt",
        "location",
        "category",
        "measurement",
        "unit",
        "age",
        "sex",
        "race",
    ]
    # TODO: We could do a different groupby and put this into states
    # or hospital regions
    # Sum the facility-level rows up to one value per county.
    out_county = df_long.groupby(identifier)["value"].sum().reset_index()

    # TODO: Throwing out territories because I don't remember which weren't
    # included in the census data :(
    out_county = out_county.query("location < 60_000")

    # Add vintage
    out_county["vintage"] = self._retrieve_vintage()
    out_county["location_type"] = "county"

    cols_2_keep = identifier + ["vintage", "location_type", "value"]
    return out_county.loc[:, cols_2_keep]