def normalize(self, resjson: dict) -> pd.DataFrame:
    """Convert the county-level response into reshaped vaccine variables.

    Parameters
    ----------
    resjson : dict
        Parsed response. Rows are read from
        ``results[0].result.data.dsr.DS[0].PH[1].DM1``.

    Returns
    -------
    pd.DataFrame
        Whatever ``self._reshape_variables`` returns, with a ``dt`` column
        added from ``self._retrieve_dt("US/Eastern")``.
    """
    payload = resjson["results"][0]["result"]["data"]
    records = payload["dsr"]["DS"][0]["PH"][1]["DM1"]
    # Keep only the records whose first key is "G0".
    records = [rec for rec in records if list(rec.keys())[0] == "G0"]

    # Fragment of the flattened key -> output column name.
    col_mapping = {
        "G0": "county",
        "M_0_DM2_0_C_0": "total_vaccine_administered",
        "M_0_DM2_0_C_1": "pfizer_moderna_first_dose",
        "M_0_DM2_0_C_2": "total_vaccine_completed",
        "M_1_DM3_0_C_1": "janssen_series",
        "M_0_DM2_0_C_4": "total_vaccine_completed_percent",
    }

    # For each record, take the first flattened key containing each fragment.
    rows = []
    for rec in records:
        flattened = flatten_dict(rec)
        parsed = {}
        for fragment, column in col_mapping.items():
            matches = [key for key in flattened.keys() if fragment in key]
            if matches:
                parsed[column] = flattened[matches[0]]
        rows.append(parsed)

    frame = pd.DataFrame.from_records(rows)

    # Initiated = first doses of the two-dose vaccines + single-dose series.
    first_doses = frame["pfizer_moderna_first_dose"]
    frame["total_vaccine_initiated"] = first_doses + frame["janssen_series"]

    # Strip the "County, ME" text and surrounding whitespace from the label.
    trimmed = frame["county"].str.replace("County, ME", "")
    frame["location_name"] = trimmed.str.strip()

    # Scale the completed share by 100.
    scaled = 100 * frame["total_vaccine_completed_percent"]
    frame["total_vaccine_completed_percent"] = scaled

    # Column name -> variable definition used for the reshape.
    variables = {
        "total_vaccine_administered": v.TOTAL_DOSES_ADMINISTERED_ALL,
        "total_vaccine_initiated": v.INITIATING_VACCINATIONS_ALL,
        "total_vaccine_completed": v.FULLY_VACCINATED_ALL,
        "total_vaccine_completed_percent": v.PERCENTAGE_PEOPLE_COMPLETING_VACCINE,
    }

    out = self._reshape_variables(frame, variables)
    out["dt"] = self._retrieve_dt("US/Eastern")
    return out
def normalize(self, resjson: dict) -> pd.DataFrame:
    """Merge every response chunk and reshape it with ``self.variables``.

    Parameters
    ----------
    resjson : dict
        Iterable of response chunks; each chunk's rows are read from
        ``results[0].result.data.dsr.DS[0].PH[1].DM1``.

    Returns
    -------
    pd.DataFrame
        Output of ``self._reshape_variables`` with a ``dt`` column added.
    """
    # Pool the rows of all chunks so they can be parsed in one pass.
    records = []
    for part in resjson:
        payload = part["results"][0]["result"]["data"]
        records.extend(payload["dsr"]["DS"][0]["PH"][1]["DM1"])

    # Fragment of the flattened key -> output column name.
    col_mapping = {
        "C_0": "location_name",
        "C_1": "total_vaccine_initiated",
        "C_2": "total_vaccine_completed",
    }

    # For each record, take the first flattened key containing each fragment.
    rows = []
    for rec in records:
        flattened = flatten_dict(rec)
        parsed = {}
        for fragment, column in col_mapping.items():
            matches = [key for key in flattened.keys() if fragment in key]
            if matches:
                parsed[column] = flattened[matches[0]]
        rows.append(parsed)

    frame = pd.DataFrame.from_records(rows).reset_index()

    out = self._reshape_variables(frame, self.variables)
    out["dt"] = self._retrieve_dt("US/Eastern")
    return out
def normalize(self, resjson):
    """Parse named groups of response chunks into the standard long layout.

    Parameters
    ----------
    resjson
        Mapping of a variable name to a list of response chunks; each
        chunk's rows are read from
        ``results[0].result.data.dsr.DS[0].PH[1].DM1``.

    Returns
    -------
    pd.DataFrame
        Columns ``vintage, dt, location_name, category, measurement, unit,
        age, race, ethnicity, sex, value``.
    """
    rows = []
    for name, chunks in resjson.items():
        # Pool the rows of every chunk that belongs to this variable.
        records = []
        for chunk in chunks:
            payload = chunk["results"][0]["result"]["data"]
            records.extend(payload["dsr"]["DS"][0]["PH"][1]["DM1"])

        # Take the first flattened key containing each mapped fragment.
        for rec in records:
            flattened = flatten_dict(rec)
            parsed = {}
            for fragment, column in self.col_mapping.items():
                matches = [key for key in flattened.keys() if fragment in key]
                if matches:
                    parsed[column] = flattened[matches[0]]
            # Tag the row with the variable it was fetched for.
            parsed["variable"] = name
            rows.append(parsed)

    # Rows missing any value are dropped before reshaping.
    frame = pd.DataFrame.from_records(rows).dropna()

    # Wide -> long; the melted column labels land in the demographic column.
    out = frame.melt(
        id_vars=["location_name", "variable"],
        var_name=self.demographic,
    )

    # Attach CMU columns, then dt / vintage and a numeric value.
    out = self.extract_CMU(
        out,
        self.variables,
        skip_columns=[self.demographic],
    )
    out = out.assign(
        dt=self._retrieve_dt("US/Eastern"),
        vintage=self._retrieve_vintage(),
        # Values may carry thousands separators; remove them before parsing.
        value=lambda x: pd.to_numeric(
            x["value"].astype(str).str.replace(",", "")
        ),
    )

    cols_to_keep = [
        "vintage",
        "dt",
        "location_name",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]
    return out.loc[:, cols_to_keep]
def normalize(self, resjson):
    """Template parser: map descriptor-named columns into the long layout.

    Parameters
    ----------
    resjson
        Parsed response. Column names come from
        ``results[0].result.data.descriptor.Select`` and rows from
        ``...dsr.DS[0].PH[0].DM0``.

    Returns
    -------
    pd.DataFrame
        Columns ``vintage, dt, location_name, category, measurement, unit,
        age, race, ethnicity, sex, value``.
    """
    # TODO: The order of these outputs may differ slightly in
    #       different dashboards
    payload = resjson["results"][0]["result"]["data"]
    descriptor = payload["descriptor"]["Select"]
    records = payload["dsr"]["DS"][0]["PH"][0]["DM0"]

    # Descriptor "Value" (key fragment) -> descriptor "Name" (column name).
    col_mapping = {item["Value"]: item["Name"] for item in descriptor}
    fragments = list(col_mapping.keys())

    # TODO: Figure out how to iterate through all of the rows and
    #       store relevant data
    rows = []
    for rec in records:
        flattened = flatten_dict(rec)
        parsed = {}
        for fragment in fragments:
            matches = [key for key in flattened.keys() if fragment in key]
            if matches:
                parsed[col_mapping[fragment]] = flattened[matches[0]]
        rows.append(parsed)

    # Rows missing any value are dropped.
    frame = pd.DataFrame.from_records(rows).dropna()

    # TODO: Manipulate data to desired shape

    # Wide -> long, then attach the CMU columns (mapping still empty here).
    crename = {}
    out = frame.melt(id_vars=["location_name"])
    out = self.extract_CMU(out, crename)

    # NOTE(review): "vintage" and "dt" are selected below but are not
    # assigned anywhere in this block -- confirm where they are meant to
    # come from when this template is filled in.
    cols_to_keep = [
        "vintage",
        "dt",
        "location_name",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]
    return out.loc[:, cols_to_keep]
def normalize(self, resjson: dict) -> pd.DataFrame:
    """Build one row per record and race group, then reshape.

    Parameters
    ----------
    resjson : dict
        Iterable of response chunks; each chunk's rows are read from
        ``results[0].result.data.dsr.DS[0].PH[1].DM1``.

    Returns
    -------
    pd.DataFrame
        Output of ``self._reshape_variables`` with "hispanic" moved from
        the ``race`` column into ``ethnicity`` and unknown-race rows also
        marked as unknown ethnicity.
    """
    records = []
    for part in resjson:
        payload = part["results"][0]["result"]["data"]
        records.extend(payload["dsr"]["DS"][0]["PH"][1]["DM1"])

    # One output row per (record, demographic group).
    rows = []
    for rec in records:
        flattened = flatten_dict(rec)
        for demo_key, demo in self.demographic_key.items():
            parsed = {"race": demo}
            for template, column in self.col_mapping.items():
                # Key templates carry a {demo} placeholder.
                fragment = template.format(demo=demo_key)
                matches = [key for key in flattened.keys() if fragment in key]
                if matches:
                    parsed[column] = flattened[matches[0]]
            rows.append(parsed)

    def without_l(column):
        # Render the column as text and delete every "L" character.
        return lambda x: x[column].astype(str).str.replace("L", "")

    frame = (
        pd.DataFrame.from_records(rows)
        .dropna()
        .assign(
            initiated=without_l("initiated"),
            completed=without_l("completed"),
            location_name=lambda x: x["location_name"].str.replace(" County", ""),
            dt=self._retrieve_dtm1d("US/Eastern"),
            vintage=self._retrieve_vintage(),
        )
    )

    out = self._reshape_variables(
        frame,
        self.variables,
        id_vars=["race"],
        skip_columns=["race"],
    )

    # "hispanic" is an ethnicity, not a race: move it over and set race
    # to "all" for those rows.
    is_hispanic = out["race"] == "hispanic"
    out.loc[is_hispanic, "ethnicity"] = "hispanic"
    out.loc[is_hispanic, "race"] = "all"

    # The unknown bucket is "unknown race/ethnicity", so flag both columns.
    is_unknown = out["race"] == "unknown"
    out.loc[is_unknown, "ethnicity"] = "unknown"

    return out
def normalize(self, resjson: dict) -> pd.DataFrame:
    """Build one row per record and demographic group, then reshape.

    Parameters
    ----------
    resjson : dict
        Iterable of response chunks; each chunk's rows are read from
        ``results[0].result.data.dsr.DS[0].PH[1].DM1``.

    Returns
    -------
    pd.DataFrame
        Output of ``self._reshape_variables`` keyed by ``self.demographic``.
    """
    records = []
    for part in resjson:
        payload = part["results"][0]["result"]["data"]
        records.extend(payload["dsr"]["DS"][0]["PH"][1]["DM1"])

    # One output row per (record, demographic group).
    rows = []
    for rec in records:
        flattened = flatten_dict(rec)
        for demo_key, demo in self.demographic_key.items():
            parsed = {self.demographic: demo}
            for template, column in self.col_mapping.items():
                # Key templates carry a {demo} placeholder.
                fragment = template.format(demo=demo_key)
                matches = [key for key in flattened.keys() if fragment in key]
                if matches:
                    parsed[column] = flattened[matches[0]]
            rows.append(parsed)

    frame = pd.DataFrame.from_records(rows)

    # J&J doses are reported in either the initiated or the completed
    # column; the other one is empty (NA), so treat the gap as zero.
    for jj_column in ("jj_complete", "jj_init"):
        frame[jj_column] = frame[jj_column].fillna(0)

    # Counties without an age 0-11 group return NA for the value; drop them.
    complete_rows = frame.dropna()

    enriched = complete_rows.assign(
        initiated_total=lambda x: x["first_dose_total"]
        + x["jj_complete"]
        + x["jj_init"],
        vintage=self._retrieve_vintage(),
        dt=self._retrieve_dt("US/Eastern"),
        location_name=lambda x: x["county"].str.replace(" County, ME", ""),
    )
    trimmed = enriched.drop(columns={"county"})

    out = self._reshape_variables(
        trimmed,
        variable_map=self.variables,
        skip_columns=[self.demographic],
        id_vars=[self.demographic],
    )
    return out
def normalize(self, resjson: dict) -> pd.DataFrame:
    """Build one row per record and demographic group, then reshape.

    Parameters
    ----------
    resjson : dict
        Iterable of response chunks; each chunk's rows are read from
        ``results[0].result.data.dsr.DS[0].PH[1].DM1``.

    Returns
    -------
    pd.DataFrame
        Reshaped data with title-cased ``location_name``, ``dt`` and
        ``vintage`` columns, and a few names re-capitalised.
    """
    records = []
    for part in resjson:
        payload = part["results"][0]["result"]["data"]
        records.extend(payload["dsr"]["DS"][0]["PH"][1]["DM1"])

    # One output row per (record, demographic group).
    rows = []
    for rec in records:
        flattened = flatten_dict(rec)
        for demo_key, demo in self.demographic_key.items():
            parsed = {self.demographic: demo}
            for template, column in self.col_mapping.items():
                # Key templates carry a {demo} placeholder.
                fragment = template.format(demo=demo_key)
                matches = [key for key in flattened.keys() if fragment in key]
                if matches:
                    parsed[column] = flattened[matches[0]]
            rows.append(parsed)

    frame = pd.DataFrame.from_records(rows)

    reshaped = self._reshape_variables(
        frame,
        self.variables,
        id_vars=[self.demographic],
        skip_columns=[self.demographic],
    )

    # str.title() mangles these names, so restore their capitalisation.
    name_fixes = {
        "Mcleod": "McLeod",
        "Lac Qui Parle": "Lac qui Parle",
        "Lake Of The Woods": "Lake of the Woods",
    }
    stamped = reshaped.assign(
        location_name=lambda x: x["location_name"].str.title(),
        dt=self._retrieve_dtm1d("US/Eastern"),
        vintage=self._retrieve_vintage(),
    )
    return stamped.replace(name_fixes)
def normalize(self, resjson):
    """Parse county rows plus the separately stored total-dose figures.

    Parameters
    ----------
    resjson
        Parsed response. Rows are read from
        ``results[0].result.data.dsr.DS[0].PH[0].DM0`` and total doses
        from ``...dsr.DS[0].ValueDicts.D0``.

    Returns
    -------
    pd.DataFrame
        Reshaped data with NA rows dropped and two location names
        re-capitalised.
    """
    payload = resjson["results"][0]["result"]["data"]
    dataset = payload["dsr"]["DS"][0]
    records = dataset["PH"][0]["DM0"]
    # Total doses administered live in a different part of the response,
    # aligned by position with the records.
    total_doses = dataset["ValueDicts"]["D0"]

    # Hand-written mapping: key fragment -> output column name.
    col_mapping = {
        "G0": "county",
        "M_0_DM1_0_X_2_A0": "at_least_one_dose",
    }

    rows = []
    for position, rec in enumerate(records):
        flattened = flatten_dict(rec)
        parsed = {}
        for fragment, column in col_mapping.items():
            matches = [key for key in flattened.keys() if fragment in key]
            if matches:
                parsed[column] = flattened[matches[0]]
        # Totals arrive as text with thousands separators.
        parsed["total_doses"] = int(total_doses[position].replace(",", ""))
        rows.append(parsed)

    frame = pd.DataFrame.from_records(rows)

    located = self._rename_or_add_date_and_location(
        frame,
        location_name_column="county",
        timezone="US/Central",
        location_names_to_drop=["Other"],
    )
    reshaped = self._reshape_variables(located, self.variables).dropna()

    name_fixes = {"Mccook": "McCook", "Mcpherson": "McPherson"}
    return reshaped.replace({"location_name": name_fixes})
def normalize(self, resjson: Any) -> pd.DataFrame:
    """Parse county rows into reshaped vaccine variables.

    Parameters
    ----------
    resjson : Any
        Parsed response. Rows are read from
        ``results[0].result.data.dsr.DS[0].PH[1].DM1``.

    Returns
    -------
    pd.DataFrame
        Output of ``self._reshape_variables`` using ``self.variables``.
    """
    payload = resjson["results"][0]["result"]["data"]
    records = payload["dsr"]["DS"][0]["PH"][1]["DM1"]

    # Fragment of the flattened key -> output column name.
    col_mapping = {
        "C_0": "county",
        "C_1": "at_least_one_dose",
        "C_2": "fully_vaccinated",
    }

    # The first entry does not contain any data, so start at the second.
    rows = []
    for rec in records[1:]:
        flattened = flatten_dict(rec)
        parsed = {}
        for fragment, column in col_mapping.items():
            matches = [key for key in flattened.keys() if fragment in key]
            if matches:
                parsed[column] = flattened[matches[0]]
        rows.append(parsed)

    # Names whose capitalisation needs correcting.
    name_fixes = {
        "Lamoure": "LaMoure",
        "Mchenry": "McHenry",
        "Mcintosh": "McIntosh",
        "Mckenzie": "McKenzie",
        "Mclean": "McLean",
    }
    located = self._rename_or_add_date_and_location(
        data=pd.DataFrame(rows),
        location_name_column="county",
        location_names_to_drop=["ND-COUNTY UNKNOWN"],
        location_names_to_replace=name_fixes,
        timezone="US/Central",
    )
    return located.pipe(self._reshape_variables, variable_map=self.variables)
def normalize(self, resjson):
    """Parse dose-by-demographic counts and derive initiated/completed totals.

    Parameters
    ----------
    resjson
        Sequence of response chunks. The demographic and dose labels are
        taken from the first chunk's ``ValueDicts`` ("D1" and "D0"); rows
        are read from every chunk's
        ``results[0].result.data.dsr.DS[0].PH[1].DM1``.

    Returns
    -------
    pd.DataFrame
        CMU-annotated rows for total_vaccine_initiated and
        total_vaccine_completed, with rows whose ``race`` is "hispanic"
        removed.
    """
    # Labels, keyed by the order in which they appear in the response.
    first_payload = resjson[0]["results"][0]["result"]["data"]
    value_dicts = first_payload["dsr"]["DS"][0]["ValueDicts"]
    demographic_values = value_dicts["D1"]
    dose_values = value_dicts["D0"]
    demo_by_index = dict(enumerate(demographic_values))
    dose_by_index = dict(enumerate(dose_values))

    # Pool the rows of every chunk.
    records = []
    for part in resjson:
        payload = part["results"][0]["result"]["data"]
        records.extend(payload["dsr"]["DS"][0]["PH"][1]["DM1"])

    # One row per (county record, dose type), one column per demographic.
    rows = []
    for rec in records:
        flattened = flatten_dict(rec)
        for dose_idx, dose_label in dose_by_index.items():
            parsed = {"county": flattened["G0"], "dose_type": dose_label}
            for demo_idx, demo_label in demo_by_index.items():
                key = f"M_1_DM3_{dose_idx}_M_1_DM5_{demo_idx}_C_1"
                if key in flattened.keys():
                    parsed[demo_label] = flattened.get(key)
                else:
                    # Absent key: placeholder, filled in by the pass below.
                    parsed[demo_label] = "REPEAT"
            rows.append(parsed)

    # Manual forward fill of the "REPEAT" placeholders.
    for row_idx, row in enumerate(rows):
        # For the very first row this wraps around to the last row.
        previous_row = rows[row_idx - 1]
        columns = list(row.keys())
        last_demo_idx = max(demo_by_index.keys())
        for col_idx, column in enumerate(columns):
            left_column = columns[col_idx - 1]
            if row[column] == "REPEAT":
                if column != demo_by_index[0]:
                    # Not the first demographic: copy the value to the left.
                    row[column] = row[left_column]
                else:
                    # First demographic: copy the last demographic's value
                    # from the previous row.
                    row[column] = previous_row[demo_by_index[last_demo_idx]]

    # Long layout: one row per county / dose type / demographic.
    frame = pd.DataFrame.from_records(rows)
    frame = frame.melt(id_vars=["county", "dose_type"], var_name=self.demographic)

    def summed(dose_filter, label):
        # Sum the selected dose types per county and demographic, then
        # label the result as a single variable.
        return (
            frame.query(dose_filter)
            .groupby(["county", self.demographic])
            .sum()
            .reset_index()
            .assign(dose_type=label)
        )

    init = summed(
        "dose_type in ['Pfizer - 1 dose', 'Moderna - 1 dose', 'Janssen - Series Complete']",
        "total_vaccine_initiated",
    )
    complete = summed(
        "dose_type in ['Moderna - Series Complete','Pfizer - Series Complete', 'Janssen - Series Complete']",
        "total_vaccine_completed",
    )

    out = self._rename_or_add_date_and_location(
        pd.concat([init, complete]),
        location_name_column="county",
        timezone="US/Central",
        location_names_to_drop=["Other"],
    )
    annotated = self.extract_CMU(
        out,
        self.variables,
        skip_columns=[self.demographic],
        var_name="dose_type",
    )
    out = annotated.assign(vintage=self._retrieve_vintage()).drop(
        columns="dose_type"
    )
    out[self.demographic] = out[self.demographic].str.lower()

    # Fix county capitalisation and rename demographic labels.
    replacements = {
        "Mccook": "McCook",
        "Mcpherson": "McPherson",
        "asian / pacific islander": "asian",
        "native american": "ai_an",
        "80+": "80_plus",
    }
    return out.replace(replacements).query("race != 'hispanic'")
def normalize(self, resjson):
    """Parse descriptor-named columns into the standard long layout.

    Parameters
    ----------
    resjson
        Parsed response. Column names come from
        ``results[0].result.data.descriptor.Select`` and rows from
        ``...dsr.DS[0].PH[0].DM0``.

    Returns
    -------
    pd.DataFrame
        Columns ``vintage, dt, location_name, category, measurement, unit,
        age, race, ethnicity, sex, value``.
    """
    payload = resjson["results"][0]["result"]["data"]
    descriptor = payload["descriptor"]["Select"]
    records = payload["dsr"]["DS"][0]["PH"][0]["DM0"]

    # Descriptor "Value" (key fragment) -> descriptor "Name" (column name).
    col_mapping = {item["Value"]: item["Name"] for item in descriptor}
    fragments = list(col_mapping.keys())

    # For each record, take the first flattened key containing each fragment.
    rows = []
    for rec in records:
        flattened = flatten_dict(rec)
        parsed = {}
        for fragment in fragments:
            matches = [key for key in flattened.keys() if fragment in key]
            if matches:
                parsed[col_mapping[fragment]] = flattened[matches[0]]
        rows.append(parsed)

    frame = (
        pd.DataFrame.from_records(rows)
        .dropna()
        # Drop blank and out-of-state locations.
        .query("location_name != '' & location_name != 'Out-of-State'")
        # Initiated is not "at least one dose" for PA -- it is a count of
        # individuals currently partially covered by a vaccine, so the
        # completed count is added to it.
        .assign(
            total_vaccine_initiated=lambda x: x.eval(
                "total_vaccine_initiated + total_vaccine_completed"
            )
        )
        # Make sure McKean follows the capitalisation used in the db.
        .replace({"location_name": {"Mckean": "McKean"}})
    )

    # Column name -> CMU definition.
    crename = {
        "total_vaccine_initiated": CMU(
            category="total_vaccine_initiated",
            measurement="cumulative",
            unit="people",
        ),
        "total_vaccine_completed": CMU(
            category="total_vaccine_completed",
            measurement="cumulative",
            unit="people",
        ),
    }

    # Wide -> long, then attach CMU, dt and vintage.
    out = frame.melt(id_vars=["location_name"])
    out = self.extract_CMU(out, crename)
    out["dt"] = self._retrieve_dt("US/Eastern")
    out["vintage"] = self._retrieve_vintage()

    cols_to_keep = [
        "vintage",
        "dt",
        "location_name",
        "category",
        "measurement",
        "unit",
        "age",
        "race",
        "ethnicity",
        "sex",
        "value",
    ]
    return out.loc[:, cols_to_keep]
def normalize(self, resjson):
    """Parse per-product dose buckets and combine them into totals.

    Parameters
    ----------
    resjson
        Iterable of response chunks; each chunk's rows are read from
        ``results[0].result.data.dsr.DS[0].PH[1].DM1``.

    Returns
    -------
    pd.DataFrame
        Reshaped data with NA rows dropped and two location names
        re-capitalised.
    """
    records = []
    for part in resjson:
        payload = part["results"][0]["result"]["data"]
        records.extend(payload["dsr"]["DS"][0]["PH"][1]["DM1"])

    # Hand-written mapping: key fragment -> output column name.
    col_mapping = {
        "G0": "county",
        "M_1_DM3_1_C_1": "janssen_series",
        "M_1_DM3_2_C_1": "janssen_booster",
        "M_1_DM3_3_C_1": "moderna_1_dose",
        "M_1_DM3_4_C_1": "moderna_complete",
        "M_1_DM3_5_C_1": "moderna_booster",
        "M_1_DM3_6_C_1": "pfizer_1_dose",
        "M_1_DM3_7_C_1": "pfizer_complete",
        "M_1_DM3_8_C_1": "pfizer_booster",
    }

    rows = []
    for rec in records:
        flattened = flatten_dict(rec)
        parsed = {}
        for fragment, column in col_mapping.items():
            matches = [key for key in flattened.keys() if fragment in key]
            if matches:
                parsed[column] = flattened[matches[0]]
        rows.append(parsed)

    frame = pd.DataFrame.from_records(rows)

    # SD moves individuals between buckets when they receive shots: e.g.
    # someone getting a second Moderna dose leaves the 1-dose bucket and
    # enters the 2-dose bucket. Our definitions are cumulative, so the
    # buckets have to be added back together.
    janssen_total = frame["janssen_series"] + frame["janssen_booster"]
    frame["total_vaccine_completed"] = (
        janssen_total
        + frame["moderna_complete"]
        + frame["pfizer_complete"]
        + frame["moderna_booster"]
        + frame["pfizer_booster"]
    )
    single_dose_only = frame["moderna_1_dose"] + frame["pfizer_1_dose"]
    frame["total_vaccine_initiated"] = (
        single_dose_only + frame["total_vaccine_completed"]
    )

    located = self._rename_or_add_date_and_location(
        frame,
        location_name_column="county",
        timezone="US/Central",
        location_names_to_drop=["Other"],
    )
    reshaped = self._reshape_variables(located, self.variables).dropna()

    name_fixes = {"Mccook": "McCook", "Mcpherson": "McPherson"}
    return reshaped.replace({"location_name": name_fixes})