def __parse_df_from_2021_01_18(source: Source, df: pd.DataFrame) -> Iterable[Vaccinated]:
    """Parse the per-location dose table of a sheet into ``Vaccinated`` records.

    Rows are skipped until the header row whose first cell reads
    "region of residence" or "nhs region of residence" (case-insensitive).
    Every following row up to the "Data quality notes:" row is expected to
    hold a location name followed by exactly three non-NaN numeric cells:
    first doses, second doses and the cumulative total, in that order.

    Args:
        source: Passed through unchanged to every yielded ``Vaccinated``.
        df: Raw sheet. Its "Unnamed: 0" column is dropped before parsing.

    Yields:
        Three ``Vaccinated`` records per data row, for ``Dose.DOSE_1``,
        ``Dose.DOSE_2`` and ``Dose.ALL`` respectively.

    Raises:
        ValueError: If a data row does not contain exactly three non-NaN
            values after the location cell.
    """
    rows = df.drop("Unnamed: 0", axis=1).iterrows()

    # Consume rows up to and including the header row. The data loop below
    # deliberately continues on the same iterator.
    for _, (title, *_unused) in rows:
        if isinstance(title, str) and re.match(r"^(nhs )?region of residence$", title.lower()):
            break

    for _, (location, *data) in rows:
        # isinstance (rather than an exact type comparison) also recognises
        # numpy float NaNs, which would otherwise reach re.match and fail.
        if isinstance(location, float) and math.isnan(location):
            continue
        if location == "Data quality notes:":
            break

        # Exactly three populated cells are required; anything else raises.
        dose_1, dose_2, cumulative = (d for d in data if not math.isnan(d))

        # "Total" (optionally followed by one digit) and "England" both denote
        # the aggregate over all locations.
        if re.match(r"^(Total\d?|England)$", location):
            location = ALL_LOCATIONS
        else:
            location = Location(location)

        yield Vaccinated(source, dose_1, Slice(location=location, dose=Dose.DOSE_1))
        yield Vaccinated(source, dose_2, Slice(location=location, dose=Dose.DOSE_2))
        yield Vaccinated(source, cumulative, Slice(location=location, dose=Dose.ALL))
def vaccinated_to_df(vaccinated: List[Vaccinated]) -> pd.DataFrame:
    """Flatten ``Vaccinated`` records into a DataFrame of plain columns.

    The nested ``source`` and ``slice`` columns are replaced by the top-level
    columns ``data_date``, ``real_date``, ``dose``, ``group`` and
    ``location``; the ``vaccinated`` column is cast to ``int``.

    Args:
        vaccinated: Records to convert.
            NOTE(review): assumes ``pd.DataFrame`` turns each record into
            dict-valued ``source`` / ``slice`` cells -- confirm against the
            ``Vaccinated`` definition.

    Returns:
        The flattened DataFrame.
    """
    frame = pd.DataFrame(vaccinated)

    # (new column, nested column it is derived from, extractor), listed in
    # the order the new columns are appended.
    derived_columns = (
        ("data_date", "source", lambda src: src["data_date"]),
        ("real_date", "source", lambda src: src["real_date"]),
        ("dose", "slice", lambda slc: slc["dose"].csv_str()),
        ("group", "slice", lambda slc: Group(**slc["group"]).csv_str()),
        ("location", "slice", lambda slc: Location(**slc["location"]).csv_str()),
    )
    for new_name, nested_name, extract in derived_columns:
        frame[new_name] = frame[nested_name].apply(extract)

    # The nested columns are no longer needed once their fields are lifted.
    frame = frame.drop("source", axis=1).drop("slice", axis=1)

    frame["vaccinated"] = frame["vaccinated"].astype(int)
    return frame
def __parse_df_weekly(source: Source, df: pd.DataFrame) -> Iterable[Vaccinated]:
    """Parse a weekly sheet into ``Vaccinated`` records summed per slice.

    The sheet is cut down to the rectangle that starts at the single
    "region of residence" / "nhs region of residence" cell and ends just
    above the single "data quality notes:" cell. All-NaN rows and columns
    are removed. Row 0 of the result holds dose headers (blank cells inherit
    the header to their left), row 1 holds group headers, column 0 holds
    locations and the remaining cells hold counts.

    Args:
        source: Passed through unchanged to every yielded ``Vaccinated``.
        df: Raw sheet.

    Yields:
        One ``Vaccinated`` per distinct ``Slice(dose, group, location)``,
        carrying the sum of all cells that map to that slice.

    Raises:
        ValueError: If the start or end marker cell is not found exactly
            once (the tuple unpacking of ``np.where`` requires one match).
        AssertionError: If a dose header is neither ignored nor recognised.
    """
    # Substrings of (lower-cased) dose headers whose columns are skipped.
    # Built once instead of once per cell.
    ignored_dose_markers = (
        # Ignore population estimates.
        "population estimates",
        # Ignore precalculated %
        "% who have had at least 1 dose",
        "% who have had both doses",
        # Ignore dose summaries.
        "total 1st doses",
        "total 2nd doses",
    )
    # A trailing digit is tolerated (presumably a footnote marker -- confirm).
    total_location = re.compile(r"^Total\d?$")

    def is_start(cell) -> bool:
        return isinstance(cell, str) and cell.lower() in (
            "region of residence",
            "nhs region of residence",
        )

    def is_end(cell) -> bool:
        return isinstance(cell, str) and cell.lower() == "data quality notes:"

    def is_nan(cell) -> bool:
        # isinstance also covers numpy float scalars, not just built-in floats.
        return isinstance(cell, float) and math.isnan(cell)

    a = df.to_numpy()

    # Trim.
    (start_y, ), (start_x, ) = np.where(np.vectorize(is_start)(a))
    (end_y, ), (_, ) = np.where(np.vectorize(is_end)(a))
    a = a[start_y:end_y, start_x:]

    # Remove NaNs.
    is_nans = np.vectorize(is_nan)(a)
    a = a[:, ~np.all(is_nans, axis=0)]
    a = a[~np.all(is_nans, axis=1), :]

    # Fill in dose row: a blank header cell inherits the last header seen to
    # its left.
    filled_in_doses = []
    current_dose = None
    for population in a[0, 1:]:
        if not is_nan(population):
            current_dose = population
        filled_in_doses.append(current_dose)
    a[0, 1:] = filled_in_doses

    vaccinated_by_slice: DefaultDict[Slice, int] = defaultdict(int)
    for y in range(2, a.shape[0]):
        for x in range(1, a.shape[1]):
            dose = a[0, x]
            group = a[1, x]
            location = a[y, 0]
            vaccinated = a[y, x]

            dose_lower = dose.lower()
            if any(marker in dose_lower for marker in ignored_dose_markers):
                continue
            if isinstance(group, str) and "percent of all" in group.lower():
                # Ignore percentage reports.
                continue

            is_dose_and_group_all = "cumulative total doses to date" in dose_lower
            if dose in ("1st dose", "1st dose5"):
                dose = Dose.DOSE_1
            elif dose in ("2nd dose", "2nd dose5"):
                dose = Dose.DOSE_2
            elif is_dose_and_group_all:
                dose = Dose.ALL
            else:
                raise AssertionError(
                    f"Unexpected dose {dose} in source {source}")

            # The cumulative column is not broken down by group.
            if is_dose_and_group_all:
                group = ALL_AGES
            else:
                group = Group.from_csv_str(group)

            if total_location.match(location):
                location = ALL_LOCATIONS
            else:
                location = Location(location)

            # Several columns may map to the same slice; their counts add up.
            vaccinated_by_slice[Slice(dose, group, location)] += vaccinated

    for slice_, vaccinated in vaccinated_by_slice.items():
        yield Vaccinated(source, vaccinated, slice_)