def read(self) -> pd.DataFrame:
    """Fetch the JSON payload and return the cumulative vaccination columns.

    The records stored under the payload's ``"data"`` key are loaded into a
    frame, the frame is handed to ``check_known_columns`` together with the
    expected column names, and only the date plus the cumulative
    vaccination/booster columns are returned.
    """
    expected_columns = [
        "date",
        "change_cases",
        "change_fatalities",
        "change_tests",
        "change_hospitalizations",
        "change_criticals",
        "change_recoveries",
        "change_vaccinations",
        "change_vaccinated",
        "change_boosters_1",
        "change_boosters_2",
        "change_vaccines_distributed",
        "total_cases",
        "total_fatalities",
        "total_tests",
        "total_hospitalizations",
        "total_criticals",
        "total_recoveries",
        "total_vaccinations",
        "total_vaccinated",
        "total_boosters_1",
        "total_boosters_2",
        "total_vaccines_distributed",
    ]
    kept_columns = [
        "date",
        "total_vaccinations",
        "total_vaccinated",
        "total_boosters_1",
        "total_boosters_2",
    ]
    payload = request_json(self.source_url)
    records = pd.DataFrame.from_records(payload["data"])
    check_known_columns(records, expected_columns)
    return records[kept_columns]
def read(self) -> pd.DataFrame:
    """Fetch the JSON payload and return it as a frame.

    The payload itself is used as the list of records. The frame is handed
    to ``check_known_columns`` with the expected column names and is then
    returned unchanged.
    """
    expected_columns = [
        "Day_Date",
        "vaccinated",
        "vaccinated_cum",
        "vaccinated_population_perc",
        "vaccinated_seconde_dose",
        "vaccinated_seconde_dose_cum",
        "vaccinated_seconde_dose_population_perc",
        "vaccinated_third_dose",
        "vaccinated_third_dose_cum",
        "vaccinated_third_dose_population_perc",
        "vaccinated_fourth_dose_population_perc",
        "vaccinated_fourth_dose",
        "vaccinated_validity_perc",
        "vaccinated_expired_perc",
        "not_vaccinated_perc",
        "vaccinated_fourth_dose_cum",
    ]
    payload = request_json(self.source_url)
    records = pd.DataFrame.from_records(payload)
    check_known_columns(records, expected_columns)
    return records
def read(self) -> pd.DataFrame:
    """Fetch the historical JSON data and return the vaccine columns.

    The mapping under ``"historicalData"`` becomes one row per key. After
    the column check, only ``vaccines`` and ``numberTotalDosesAdministered``
    are kept; the row keys are moved into an ``index`` column, rows with
    missing values are dropped, and the result is sorted by ``index``.
    """
    expected_columns = [
        "parsedOn",
        "parsedOnString",
        "fileName",
        "complete",
        "averageAge",
        "numberInfected",
        "numberCured",
        "numberDeceased",
        "percentageOfWomen",
        "percentageOfMen",
        "percentageOfChildren",
        "numberTotalDosesAdministered",
        "distributionByAge",
        "countyInfectionsNumbers",
        "incidence",
        "large_cities_incidence",
        "small_cities_incidence",
        "vaccines",
    ]
    payload = request_json(self.source_url)
    history = pd.DataFrame.from_dict(payload["historicalData"], orient="index")
    check_known_columns(history, expected_columns)

    vaccine_data = history[["vaccines", "numberTotalDosesAdministered"]]
    vaccine_data = vaccine_data.reset_index()
    vaccine_data = vaccine_data.dropna()
    return vaccine_data.sort_values(by="index")
def _parse_pdf_table(self) -> pd.DataFrame:
    """Extract the manufacturer totals from the PDF report.

    All tables of the PDF at ``self.source_url_ref["manufacturer"]`` are
    parsed with tabula. The first table that has a ``"Pfizer"`` column is
    kept, handed to ``check_known_columns`` with the expected column names,
    and reduced to the rows whose ``"Unnamed: 0"`` cell equals ``"Total"``.

    Returns:
        The matching row(s) as a DataFrame, without the ``"Unnamed: 0"``
        column. (The selection is a boolean-mask filter followed by
        ``drop(columns=...)``, so the result is a frame, not a Series.)

    Raises:
        IndexError: If none of the parsed tables has a ``"Pfizer"`` column.
    """
    pdf_url = self.source_url_ref["manufacturer"]
    print(pdf_url)
    tables = tabula.read_pdf(pdf_url, pages="all", stream=True)
    df = next((table for table in tables if "Pfizer" in table.columns), None)
    if df is None:
        # Same exception type the bare ``[0]`` lookup used to raise, but with
        # a message that says what was actually missing.
        raise IndexError(
            f"No table with a 'Pfizer' column found in {pdf_url} "
            f"({len(tables)} table(s) parsed)"
        )
    # Checks data
    check_known_columns(
        df,
        [
            "Unnamed: 0",
            "Covid Shield",
            "Unnamed: 1",
            "Verocell",
            "Unnamed: 2",
            "J & J",
            "Pfizer",
            "Unnamed: 3",
            "Unnamed: 4",
            "Unnamed: 5",
            "Moderna",
        ],
    )
    return df[df["Unnamed: 0"] == "Total"].drop(columns=["Unnamed: 0"])
def read(self) -> pd.DataFrame:
    """Load the source CSV and return the cumulative columns.

    The full frame is handed to ``check_known_columns`` with the expected
    column names; the returned frame keeps only the date, location,
    ``date_type`` and cumulative count columns.
    """
    expected_columns = [
        "Date",
        "MMWR_week",
        "Location",
        "Administered_Daily",
        "Administered_Cumulative",
        "Administered_7_Day_Rolling_Average",
        "Admin_Dose_1_Daily",
        "Admin_Dose_1_Cumulative",
        "Admin_Dose_1_Day_Rolling_Average",
        "date_type",
        "Administered_daily_change_report",
        "Administered_daily_change_report_7dayroll",
        "Series_Complete_Daily",
        "Series_Complete_Cumulative",
        "Series_Complete_Day_Rolling_Average",
        "Booster_Daily",
        "Booster_Cumulative",
        "Booster_7_Day_Rolling_Average",
    ]
    kept_columns = [
        "Date",
        "Location",
        "Administered_Cumulative",
        "Admin_Dose_1_Cumulative",
        "date_type",
        "Series_Complete_Cumulative",
        "Booster_Cumulative",
    ]
    raw = pd.read_csv(self.source_url)
    check_known_columns(raw, expected_columns)
    return raw[kept_columns]
def read(self) -> pd.DataFrame:
    """Download the zipped source and merge the primary and booster tables.

    The archive at ``self.source_url`` is extracted into a temporary
    directory. The two CSV files are read from it, each is handed to
    ``check_known_columns`` with its expected column names, and both are
    combined by ``self._merge_primary_and_boosters``.

    Returns:
        Whatever ``self._merge_primary_and_boosters`` returns.
        NOTE(review): the annotation was ``str``; it is now ``pd.DataFrame``
        on the assumption that the merge helper returns a frame — confirm
        against that helper.
    """
    with tempfile.TemporaryDirectory() as tf:
        r = requests.get(self.source_url)
        # Use the archive as a context manager so its handle is closed as
        # soon as extraction is done, instead of being left open.
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            z.extractall(tf)

        # Both files must be read before the temporary directory is removed.
        df_primary = pd.read_csv(
            os.path.join(tf, "primary-series-vaccination-take-up-by-population.csv")
        )
        check_known_columns(
            df_primary,
            [
                "vacc_date",
                "received_at_least_one_dose",
                "received_at_least_two_doses",
                "received_one_dose_pcttakeup",
                "received_two_doses_pcttakeup",
            ],
        )
        df_boosters = pd.read_csv(
            os.path.join(tf, "progress-of-vaccine-booster-programme.csv")
        )
        check_known_columns(
            df_boosters,
            [
                "vacc_date",
                "received_booster_or_three_doses",
                "booster_or_three_doses_pcttakeup",
            ],
        )
    df = self._merge_primary_and_boosters(df_primary, df_boosters)
    return df
def read(self) -> pd.DataFrame:
    """Query the token-protected endpoint and return totals per date.

    The JSON response is loaded as records and handed to
    ``check_known_columns`` with the expected column names. The cumulative
    columns are then renamed, and the metric columns are summed over all
    rows sharing the same ``date``.
    """
    expected_columns = [
        "area",
        "areaid",
        "dailydose1",
        "dailydose2",
        "dailydose3",
        "daydiff",
        "daytotal",
        "referencedate",
        "totaldistinctpersons",
        "totaldose1",
        "totaldose2",
        "totaldose3",
        "totalvaccinations",
    ]
    new_names = {
        "referencedate": "date",
        "totaldistinctpersons": "people_vaccinated",
        "totaldose2": "people_fully_vaccinated",
        "totaldose3": "total_boosters",
        "totalvaccinations": "total_vaccinations",
    }
    metric_columns = [
        "people_vaccinated",
        "people_fully_vaccinated",
        "total_boosters",
        "total_vaccinations",
        "date",
    ]

    auth_headers = {"Authorization": f"Token {self.token}"}
    payload = requests.get(self.source_url, headers=auth_headers).json()
    records = pd.DataFrame.from_records(payload)
    check_known_columns(records, expected_columns)

    metrics = records.rename(columns=new_names)[metric_columns]
    per_date = metrics.groupby("date").sum()
    return per_date.reset_index()
def pipe_checks(self, df: pd.DataFrame) -> pd.DataFrame:
    """Run sanity checks on the input frame and return it unchanged.

    Raises:
        ValueError: If the frame has more than 300 rows, or if the number
            of distinct ``DATE_UPDATED`` values per ``COUNTRY`` is not
            exactly one for every country (this includes the case where
            there are no country groups at all).
    """
    expected_columns = [
        "COUNTRY",
        "WHO_REGION",
        "ISO3",
        "PERSONS_VACCINATED_1PLUS_DOSE_PER100",
        "PERSONS_FULLY_VACCINATED",
        "DATA_SOURCE",
        "TOTAL_VACCINATIONS",
        "NUMBER_VACCINES_TYPES_USED",
        "TOTAL_VACCINATIONS_PER100",
        "FIRST_VACCINE_DATE",
        "PERSONS_FULLY_VACCINATED_PER100",
        "PERSONS_VACCINATED_1PLUS_DOSE",
        "VACCINES_USED",
        "DATE_UPDATED",
    ]
    check_known_columns(df, expected_columns)

    if len(df) > 300:
        raise ValueError(
            f"Check source, it may contain updates from several dates! Shape found was {df.shape}"
        )

    # Number of distinct update dates seen for each country.
    dates_per_country = df.groupby("COUNTRY").DATE_UPDATED.nunique()
    distinct_counts = dates_per_country.unique()
    # Valid only when every country shares the same count and that count is 1.
    if len(distinct_counts) != 1 or distinct_counts[0] != 1:
        raise ValueError("Countries have more than one date update!")
    return df
def pipe_initial_check(self, df: pd.DataFrame) -> pd.DataFrame:
    """Validate vaccine names and columns, then return the frame unchanged.

    Raises:
        ValueError: If ``df.Vaccine`` holds a value that is not present in
            ``self.vaccine_mapping``.
    """
    # Vaccines
    seen_vaccines = set(df.Vaccine)
    unknown_vaccines = seen_vaccines.difference(self.vaccine_mapping)
    if len(unknown_vaccines) > 0:
        raise ValueError(f"Unknown vaccines found. Check {unknown_vaccines}")
    # Columns
    check_known_columns(df, COLUMNS)
    return df
def read(self):
    """Load the source CSV, check its columns and return it unchanged."""
    expected_columns = [
        "granularity_time",
        "granularity_geo",
        "location_code",
        "border",
        "age",
        "sex",
        "year",
        "week",
        "yrwk",
        "season",
        "x",
        "date",
        "n_dose_1",
        "n_dose_2",
        "n_dose_3_all",
        "cum_n_dose_1",
        "cum_n_dose_2",
        "cum_n_dose_3_all",
        "cum_pr100_dose_1",
        "cum_pr100_dose_2",
        "cum_pr100_dose_3_all",
        "pop",
        "location_name",
        "date_of_publishing",
    ]
    raw = pd.read_csv(self.source_url)
    check_known_columns(raw, expected_columns)
    return raw
def read(self):
    """Load the source CSV and return every column except ``fecha_corte``.

    The full frame is first handed to ``check_known_columns`` with the
    expected column names.
    """
    kept_columns = [
        "fecha_vacunacion",
        "fabricante",
        "dosis",
        "n_reg",
        "flag_vacunacion_general",
    ]
    expected_columns = [
        "fecha_corte",
        "fecha_vacunacion",
        "fabricante",
        "dosis",
        "n_reg",
        "flag_vacunacion_general",
    ]
    raw = pd.read_csv(self.source_url)
    check_known_columns(raw, expected_columns)
    return raw[kept_columns]
def read(self) -> pd.DataFrame:
    """Load the source CSV and return the columns named in ``columns_rename``.

    The full frame is handed to ``check_known_columns`` with the expected
    column names; the returned frame is restricted to the keys of
    ``self.columns_rename``.
    """
    expected_columns = [
        "data",
        "doses",
        "doses_novas",
        "doses1",
        "doses1_novas",
        "doses2",
        "doses2_novas",
        "pessoas_vacinadas_completamente",
        "pessoas_vacinadas_completamente_novas",
        "pessoas_vacinadas_parcialmente",
        "pessoas_vacinadas_parcialmente_novas",
        "pessoas_inoculadas",
        "pessoas_inoculadas_novas",
        "vacinas",
        "vacinas_novas",
        "pessoas_vacinadas_completamente_continente",
        "pessoas_vacinadas_completamente_continente_novas",
        "pessoas_reforço",
        "pessoas_reforço_novas",
        "pessoas_reforço_continente",
        "pessoas_reforço_continente_novas",
        "pessoas_gripe",
        "pessoas_gripe_novas",
        "vacinas_reforço_e_gripe_novas",
        "reforço_80mais",
        "reforço_80mais_novas",
        "reforço_70_79",
        "reforço_70_79_novas",
        "reforço_65_69",
        "reforço_65_69_novas",
        "reforço_60_69",
        "reforço_60_69_novas",
        "reforço_50_59",
        "reforço_50_59_novas",
        "vacinação_iniciada_05_11",
        "vacinação_iniciada_05_11_novas",
        "pessoas_inoculadas_12mais",
        "reforço_40_49",
        "reforço_40_49_novas",
        "vacinação_completa_05_11_novas",
        "reforço_30_39_novas",
        "vacinação_completa_05_11",
        "reforço_18_29",
        "reforço_18_29_novas",
        "reforço_30_39",
    ]
    raw = pd.read_csv(self.source_url)
    check_known_columns(raw, expected_columns)
    wanted_columns = self.columns_rename.keys()
    return raw[wanted_columns]
def pipe_calculate_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
    """Turn the per-dose columns into the vaccination metric columns.

    After the column check, missing values in the whole frame are replaced
    by 0. The four metrics are computed from the dose columns, which are
    then dropped from the result.
    """
    dose_columns = ["Primera", "Refuerzo", "Segunda", "Unica", "Cuarta"]
    check_known_columns(
        df,
        ["Region", "date", "Primera", "Refuerzo", "Segunda", "Unica", "Cuarta"],
    )
    filled = df.fillna(0)
    first = filled["Primera"]
    booster = filled["Refuerzo"]
    second = filled["Segunda"]
    single = filled["Unica"]
    fourth = filled["Cuarta"]

    metrics = {
        "people_vaccinated": first + single,
        "people_fully_vaccinated": second + single,
        "total_vaccinations": first + booster + second + single + fourth,
        "total_boosters": booster + fourth,
    }
    with_metrics = filled.assign(**metrics)
    return with_metrics.drop(columns=dose_columns)
def read(self) -> pd.DataFrame:
    """Load the source Excel file, check its columns and return it unchanged."""
    expected_columns = [
        "Date",
        "Nombre de dose 1",
        "Nombre de dose 2",
        "Nombre de Dose complémentaire par rapport à schéma complet",
        "Nombre total de doses",
    ]
    sheet = pd.read_excel(self.source_url)
    check_known_columns(sheet, expected_columns)
    return sheet
def read(self) -> pd.DataFrame:
    """Load the source CSV, clean its header and return it.

    Any ``"\\ufeff"`` character is removed from the column names before the
    frame is handed to ``check_known_columns`` with the expected names.
    """
    expected_columns = [
        "Date of Vaccination",
        "Total Vaccination Doses",
        "Fully vaccinated (2 of 2 or 1 of 1)",
        "Received one dose (1 of 2 or 1 of 1)",
        "Total Booster doses",
    ]
    raw = pd.read_csv(self.source_url)
    # Temporary fix: strip the byte-order-mark character from the header.
    cleaned_names = []
    for name in raw.columns:
        cleaned_names.append(name.replace("\ufeff", ""))
    raw.columns = cleaned_names
    check_known_columns(raw, expected_columns)
    return raw
def read(source: str) -> pd.DataFrame:
    """Load the CSV at ``source``, check its columns and return it unchanged."""
    expected_columns = [
        "id",
        "datum",
        "vakcina",
        "kraj_nuts_kod",
        "kraj_nazev",
        "vekova_skupina",
        "prvnich_davek",
        "druhych_davek",
        "celkem_davek",
    ]
    raw = pd.read_csv(source)
    check_known_columns(raw, expected_columns)
    return raw
def read(self):
    """Fetch the JSON payload and return its feature attributes as a frame.

    One row is built from the ``"attributes"`` entry of every element in
    the payload's ``"features"`` list. The frame is handed to
    ``check_known_columns`` with the expected column names and returned.
    """
    expected_columns = [
        "Reportdt",
        "Total_Vaccinations",
        "Total_Individuals",
        "LastValue",
        "ObjectId",
        "Elderly",
        "FirstDose",
        "SecondDose",
        "BoosterDose",
    ]
    payload = request_json(self.source_url)
    # Kept as a lazy generator, exactly as the records were passed before.
    attribute_rows = (feature["attributes"] for feature in payload["features"])
    records = pd.DataFrame.from_records(attribute_rows)
    check_known_columns(records, expected_columns)
    return records
def read(self):
    """Load the semicolon-separated CSV, check its columns and return it."""
    expected_columns = [
        "iso_week",
        "iso_year",
        "week",
        "vaccine",
        "gender",
        "AgeGroup",
        "region",
        "district",
        "district_code",
        "dose",
        "doses_administered",
    ]
    raw = pd.read_csv(self.source_url, sep=";")
    check_known_columns(raw, expected_columns)
    return raw
def read(self) -> pd.DataFrame:
    """Load the source CSV, check its columns and return it unchanged."""
    expected_columns = [
        "date",
        "daily_partial",
        "daily_full",
        "daily",
        "daily_partial_child",
        "daily_full_child",
        "daily_booster",
        "cumul_partial",
        "cumul_full",
        "cumul",
        "cumul_partial_child",
        "cumul_full_child",
        "cumul_booster",
        "pfizer1",
        "pfizer2",
        "pfizer3",
        "sinovac1",
        "sinovac2",
        "sinovac3",
        "astra1",
        "astra2",
        "astra3",
        "sinopharm1",
        "sinopharm2",
        "sinopharm3",
        "cansino",
        "cansino3",
        "pending1",
        "pending2",
        "pending3",
        "daily_partial_adol",
        "daily_full_adol",
        "cumul_full_adol",
        "cumul_partial_adol",
    ]
    raw = pd.read_csv(self.source_url)
    check_known_columns(raw, expected_columns)
    return raw
def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
    """Run the full processing chain on the input frame.

    After the column check the frame is piped, in order, through the
    rename, metrics, checks, date and vaccines steps. ``location`` and
    ``source_url`` are then attached, the output columns are selected, the
    rows are sorted by ``date``, and the result is piped through
    ``self.pipe_exclude_dp`` and ``make_monotonic``.
    """
    check_known_columns(
        df,
        ["fecha", "dosis_total", "primera_dosis", "segunda_dosis", "dosis_unica", "refuerzo"],
    )
    output_columns = [
        "location",
        "date",
        "vaccine",
        "source_url",
        "total_vaccinations",
        "people_vaccinated",
        "people_fully_vaccinated",
        "total_boosters",
    ]

    processed = df.pipe(self.pipe_column_rename)
    processed = processed.pipe(self.pipe_metrics)
    processed = processed.pipe(self.pipe_checks)
    processed = processed.pipe(self.pipe_date)
    processed = processed.pipe(self.pipe_vaccines)

    processed = processed.assign(
        location=self.location,
        source_url=self.source_url_ref,
    )
    processed = processed[output_columns].sort_values("date")
    processed = processed.pipe(self.pipe_exclude_dp)
    return processed.pipe(make_monotonic)
def read(self) -> pd.DataFrame:
    """Load the source CSV and return the columns listed in ``self.columns``.

    The full frame is first handed to ``check_known_columns`` with the
    expected column names.
    """
    expected_columns = [
        "data_somministrazione",
        "fornitore",
        "area",
        "fascia_anagrafica",
        "sesso_maschile",
        "sesso_femminile",
        "prima_dose",
        "seconda_dose",
        "pregressa_infezione",
        "dose_addizionale_booster",
        "codice_NUTS1",
        "codice_NUTS2",
        "codice_regione_ISTAT",
        "nome_area",
        "booster_immuno",
    ]
    raw = pd.read_csv(self.source_url)
    check_known_columns(raw, expected_columns)
    return raw[self.columns]
def pipeline_manufacturer(self, df: pd.DataFrame) -> pd.DataFrame:
    """Run the manufacturer processing chain on the input frame.

    After the column check the frame is piped, in order, through the
    manufacturer rename, aggregate, vaccine-check and date steps.
    ``location`` is then attached, the rows are sorted by ``vaccine`` and
    ``date``, and the four output columns are selected.
    """
    expected_columns = [
        "zona",
        "fabricante",
        "dosis_total",
        "primera_dosis",
        "segunda_dosis",
        "dosis_unica",
        "dosis_refuerzo",
        "administered_at",
    ]
    check_known_columns(df, expected_columns)

    processed = df.pipe(self.pipe_manuf_rename_cols)
    processed = processed.pipe(self.pipe_manuf_aggregate)
    processed = processed.pipe(self.pipe_manuf_vaccine_checks)
    processed = processed.pipe(self.pipe_manuf_date)

    processed = processed.assign(location=self.location)
    processed = processed.sort_values(["vaccine", "date"])
    return processed[["location", "date", "vaccine", "total_vaccinations"]]
def read(self) -> pd.DataFrame:
    """Download the source CSV, check its columns and return it unchanged.

    The response body is decoded as UTF-8 and parsed from an in-memory
    text buffer.
    """
    expected_columns = [
        "Date",
        "Age Group",
        "Sex",
        "Sinovac 1st dose",
        "Sinovac 2nd dose",
        "Sinovac 3rd dose",
        "Sinovac 4th dose",
        "Sinovac 5th dose",
        "Sinovac 6th dose",
        "BioNTech 1st dose",
        "BioNTech 2nd dose",
        "BioNTech 3rd dose",
        "BioNTech 4th dose",
        "BioNTech 5th dose",
        "BioNTech 6th dose",
    ]
    raw_bytes = requests.get(self.source_url).content
    csv_text = raw_bytes.decode("utf-8")
    table = pd.read_csv(io.StringIO(csv_text))
    check_known_columns(table, expected_columns)
    return table
def read(self) -> pd.DataFrame:
    """Load the CSV at ``self.source_url["main"]``, check its columns and return it."""
    expected_columns = ["date", "dose_1", "dose_2", "dose_3"]
    main_url = self.source_url["main"]
    table = read_csv_from_url(main_url)
    check_known_columns(table, expected_columns)
    return table
def read(self) -> pd.DataFrame:
    """Load the source CSV and return its ``DATE``, ``DOSE`` and ``COUNT`` columns.

    The full frame is first handed to ``check_known_columns`` with the
    expected column names.
    """
    expected_columns = [
        "DATE",
        "REGION",
        "AGEGROUP",
        "SEX",
        "BRAND",
        "DOSE",
        "COUNT",
    ]
    raw = pd.read_csv(self.source_url)
    check_known_columns(raw, expected_columns)
    kept_columns = ["DATE", "DOSE", "COUNT"]
    return raw[kept_columns]
def read(self) -> pd.DataFrame:
    """Load the semicolon-separated source and return all but ``state_id``.

    The file is fetched through ``read_csv_from_url`` with
    ``ciphers_low=True``, and the full frame is handed to
    ``check_known_columns`` before the column selection.
    """
    expected_columns = [
        "date",
        "state_id",
        "state_name",
        "vaccine",
        "dose_number",
        "doses_administered_cumulative",
    ]
    kept_columns = [
        "date",
        "state_name",
        "vaccine",
        "dose_number",
        "doses_administered_cumulative",
    ]
    table = read_csv_from_url(self.source_url, sep=";", ciphers_low=True)
    check_known_columns(table, expected_columns)
    return table[kept_columns]
def export(self):
    """Build and export the overall and per-manufacturer vaccination series.

    The semicolon-separated source is read and checked against the expected
    column names. Cumulative dose counts per vaccine code and day are kept
    for the codes in ``vaccine_mapping`` with a positive first-dose count,
    and the metrics are derived per vaccine:

    * ``total_vaccinations``: sum of cumulative doses 1 to 4.
    * ``people_vaccinated``: cumulative dose 1.
    * ``people_fully_vaccinated`` / ``total_boosters``: dose 2 and doses
      3 + 4 for the vaccines not in ``one_dose_vaccines``; dose 1 and doses
      2 + 3 + 4 for those in it.

    The per-vaccine frame is exported as manufacturer data. The same
    metrics are summed per date, passed through ``build_vaccine_timeline``
    with each vaccine's earliest date, and exported as the main series via
    ``self.export_datafile``.

    Raises:
        AssertionError: If the vaccine codes left after filtering are not
            exactly the keys of ``vaccine_mapping``.
    """
    vaccine_mapping = {
        1: "Pfizer/BioNTech",
        2: "Moderna",
        3: "Oxford/AstraZeneca",
        4: "Johnson&Johnson",
    }
    one_dose_vaccines = ["Johnson&Johnson"]

    df = read_csv_from_url(self.source_url, sep=";")
    check_known_columns(
        df,
        [
            "fra",
            "vaccin",
            "jour",
            "n_dose1",
            "n_dose2",
            "n_dose3",
            "n_dose4",
            "n_rappel",
            "n_cum_dose1",
            "n_cum_dose2",
            "n_cum_dose3",
            "n_cum_dose4",
            "n_cum_rappel",
        ],
    )
    df = df[["vaccin", "jour", "n_cum_dose1", "n_cum_dose2", "n_cum_dose3", "n_cum_dose4"]]
    df = df.rename(
        columns={
            "vaccin": "vaccine",
            "jour": "date",
        }
    )

    # Map vaccine names
    # Copy the filtered rows: new columns are assigned to this frame below,
    # and writing into a slice of another frame triggers pandas'
    # chained-assignment warning.
    df = df[(df.vaccine.isin(vaccine_mapping.keys())) & (df.n_cum_dose1 > 0)].copy()
    assert set(df["vaccine"].unique()) == set(vaccine_mapping.keys()), (
        "Vaccine codes in the source do not match the known vaccine mapping"
    )
    df["vaccine"] = df.vaccine.replace(vaccine_mapping)
    df["total_vaccinations"] = df.n_cum_dose1 + df.n_cum_dose2 + df.n_cum_dose3 + df.n_cum_dose4
    df["people_vaccinated"] = df.n_cum_dose1

    # `~` is the logical negation for boolean masks; unary `-` only works on
    # a boolean Series through a pandas special case.
    is_one_dose = df.vaccine.isin(one_dose_vaccines)

    # 2-dose vaccines
    df.loc[~is_one_dose, "people_fully_vaccinated"] = df.n_cum_dose2
    df.loc[~is_one_dose, "total_boosters"] = df.n_cum_dose3 + df.n_cum_dose4

    # 1-dose vaccines
    df.loc[is_one_dose, "people_fully_vaccinated"] = df.n_cum_dose1
    df.loc[is_one_dose, "total_boosters"] = df.n_cum_dose2 + df.n_cum_dose3 + df.n_cum_dose4

    df = df.drop(columns=["n_cum_dose1", "n_cum_dose2", "n_cum_dose3", "n_cum_dose4"])

    df_man = df[["date", "total_vaccinations", "vaccine"]].assign(location="France")

    # Earliest date seen for each vaccine.
    approval_timeline = df_man[["vaccine", "date"]].groupby("vaccine").min().to_dict()["date"]

    df = (
        df.groupby("date", as_index=False)
        .agg(
            {
                "total_vaccinations": "sum",
                "people_vaccinated": "sum",
                "people_fully_vaccinated": "sum",
                "total_boosters": "sum",
            }
        )
        .pipe(build_vaccine_timeline, approval_timeline)
    )

    df = df.assign(
        location=self.location,
        source_url=self.source_url_ref,
    )

    self.export_datafile(
        df,
        df_manufacturer=df_man,
        meta_manufacturer={"source_name": "Public Health France", "source_url": self.source_url},
    )
def parse_metrics_from_pdf(self, pdf_path):
    """Read the first table of the PDF and return the vaccination metrics.

    The counts are taken from the last row whose second column is not
    null. That row's non-null cells are split on whitespace into exactly
    11 tokens, which are mapped to vaccines and dose numbers purely by
    position.

    Returns:
        A ``pd.Series`` with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``total_boosters``.
    """
    print(pdf_path)
    tables = tabula.read_pdf(pdf_path)
    # Fix header
    table = self._fix_header(tables[0])

    # All calculations below assume a fixed shape of the PDF's table, and a specific order for
    # the columns and vaccines. If the following test fails, then the table should be checked
    # for potential changes.
    expected_columns = [
        "දිය",
        "ේ ාවිෂීල්ඩඩ් Covishield",
        "ටයිේයාෆාම් Sinopharm",
        # "ටුට්නිව් - V",
        "ට ට්නිව් - V",
        "Sputnik - V",
        "Unnamed: 0",
        "ෆයිසර්Pfizer",
        "Unnamed: 1",
        "Unnamed: 2",
        "ම ොඩර්ර්ො Moderna",
    ]
    check_known_columns(table, expected_columns)

    # Last row that has a value in the second column.
    last_row = table.index[table.iloc[:, 1].notnull()].max()
    row_cells = table.iloc[last_row].values.flatten()
    tokens = [
        token
        for cell in row_cells
        if not pd.isnull(cell)
        for token in cell.split()
    ]
    assert len(tokens) == 11

    # Token positions per vaccine, in the order covishield, sinopharm,
    # sputnik, pfizer, moderna (pfizer is the only one with a third dose).
    first_dose_positions = (0, 2, 4, 6, 9)
    second_dose_positions = (1, 3, 5, 7, 10)
    third_dose_positions = (8,)

    first_doses = [clean_count(tokens[pos]) for pos in first_dose_positions]
    second_doses = [clean_count(tokens[pos]) for pos in second_dose_positions]
    third_doses = [clean_count(tokens[pos]) for pos in third_dose_positions]

    people_vaccinated = sum(first_doses)
    people_fully_vaccinated = sum(second_doses)
    total_boosters = sum(third_doses)
    total_vaccinations = people_vaccinated + people_fully_vaccinated + total_boosters

    metrics = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_boosters": total_boosters,
    }
    return pd.Series(metrics)