def main():
    """Scrape Norwegian regional vaccination counts and update OUTPUT_FILE.

    Reads the existing CSV, fetches dose-1 and dose-2 counts per region from
    the FHI chart-data API, merges them, stamps the report date scraped from
    DATA_URL, and rewrites OUTPUT_FILE with the combined history.
    """
    # Load current file
    df_source = pd.read_csv(OUTPUT_FILE)

    # Get report date from the FHI page. Fail loudly with the original cause
    # attached; a bare `except:` would also have masked KeyboardInterrupt /
    # SystemExit and discarded the underlying traceback.
    driver = load_driver(DATA_URL)
    try:
        date = load_date(driver)
    except Exception as exc:
        raise Exception("Date not found!") from exc

    # Load dose 1 data
    url = "https://www.fhi.no/api/chartdata/api/99112"
    dix = json.loads(requests.get(url).content)
    df_dose1 = pd.DataFrame(dix, columns=["region", "people_vaccinated"])

    # Load dose 2 data
    url = "https://www.fhi.no/api/chartdata/api/99111"
    dix = json.loads(requests.get(url).content)
    df_dose2 = pd.DataFrame(dix, columns=["region", "people_fully_vaccinated"])

    # Drop the aggregate "Fylke" row from each frame. BUGFIX: the original
    # filtered df_dose1 with df_dose2's boolean mask, which is only correct
    # when both API responses happen to list regions in the same order —
    # filter each frame by its own "region" column instead.
    df_dose1 = df_dose1.loc[df_dose1["region"] != "Fylke"]
    df_dose2 = df_dose2.loc[df_dose2["region"] != "Fylke"]

    # Merge dose-1 and dose-2 counts on region
    df = df_dose1.merge(df_dose2, on="region", how="left")

    # Normalize region names to the project's canonical spellings
    df.loc[:, "region"] = df.loc[:, "region"].replace(REGION_RENAMING)

    # Add metadata columns
    df.loc[:, "date"] = date
    df.loc[:, "location"] = COUNTRY
    df.loc[:, "total_vaccinations"] = (
        df.loc[:, "people_fully_vaccinated"] + df.loc[:, "people_vaccinated"]
    )

    # Add ISO codes
    df = ISODB().merge(df, country_iso=COUNTRY_ISO)

    # Drop any pre-existing rows for this report date, then append history
    df_source = df_source.loc[~(df_source.loc[:, "date"] == date)]
    df = pd.concat([df, df_source])

    # Export in canonical column order
    df = df[[
        "location", "region", "date", "location_iso", "region_iso",
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]]
    df = keep_min_date(df)
    cols = ["total_vaccinations", "people_vaccinated", "people_fully_vaccinated"]
    # Nullable Int64 keeps missing counts as <NA> instead of coercing to float
    df[cols] = df[cols].astype("Int64").fillna(pd.NA)
    df = df.sort_values(by=["region", "date"])
    df.to_csv(OUTPUT_FILE, index=False)

    # Tracking
    update_country_tracking(
        country=COUNTRY,
        url=DATA_URL_REFERENCE,
        last_update=df["date"].max(),
    )
def _postprocess(self, df):
    """Attach ISO codes, order chronologically, and cumsum configured fields.

    Merges region/country ISO identifiers into *df*, sorts by date, then
    converts every column listed in ``self.do_cumsum_fields`` from per-day
    counts into per-region running totals.
    """
    out = ISODB().merge(df, country_iso=self.country_iso, mode=self.mode_iso_merge)
    out = out.sort_values(by="date")
    for column in self.do_cumsum_fields:
        out[column] = out.groupby("region")[column].cumsum().values
    # TODO: Insert here population info (need path to population.csv as
    # class attribute)
    return out
def main():
    """Build cumulative regional vaccination series for Czechia and export.

    Downloads the per-record CSV from DATA_URL, validates the upstream
    schema, aggregates doses per (date, region), sanity-checks the totals,
    converts daily counts to per-region running totals, and exports.
    """
    # Load data
    df = pd.read_csv(DATA_URL)

    # Check 1: guard against upstream schema changes
    expected = [
        "datum", "vakcina", "kraj_nuts_kod", "kraj_nazev",
        "vekova_skupina", "prvnich_davek", "druhych_davek", "celkem_davek",
    ]
    if not set(expected).issubset(df.columns):
        raise Exception("API changed")

    # Rename Czech column names to the project's canonical ones
    df = df.rename(columns={
        "datum": "date",
        "kraj_nazev": "region",
        "prvnich_davek": "people_vaccinated",
        "druhych_davek": "people_fully_vaccinated",
        "celkem_davek": "total_vaccinations",
    })

    # Aggregate dose counts per day and region
    df = (
        df.groupby(by=["date", "region"])
        .agg(
            people_vaccinated=("people_vaccinated", "sum"),
            people_fully_vaccinated=("people_fully_vaccinated", "sum"),
            total_vaccinations=("total_vaccinations", "sum"),
        )
        .reset_index()
    )

    # Check 2: totals must equal the sum of first and second doses
    consistent = (
        df["total_vaccinations"]
        == df["people_vaccinated"] + df["people_fully_vaccinated"]
    )
    if not consistent.all():
        raise Exception("Error in columns. dose_1 + dose_2 != total_doses")

    # Rename regions and tag the country
    df.loc[:, "region"] = df.loc[:, "region"].replace(REGION_RENAMING)
    df.loc[:, "location"] = COUNTRY

    # ISO codes
    df = ISODB().merge(df, country_iso=COUNTRY_ISO)

    # Convert daily counts into per-region cumulative sums
    df = df.sort_values(by="date")
    for column in ("total_vaccinations", "people_vaccinated", "people_fully_vaccinated"):
        df.loc[:, column] = df.groupby("region")[column].cumsum().values

    # Export
    export_data(
        df=df,
        data_url_reference=DATA_URL_REFERENCE,
        output_file=OUTPUT_FILE,
    )
def main():
    """Scrape Belgian regional vaccination counts and update OUTPUT_FILE.

    Parses three region boxes from the DATA_URL page, joins per-region report
    dates pulled from a published spreadsheet, merges the result into the
    existing CSV history, and rewrites OUTPUT_FILE.
    """
    # Load current data
    df_source = pd.read_csv(OUTPUT_FILE)
    # Fetch the page; a custom User-Agent header is sent (presumably the
    # default agent is blocked — TODO confirm)
    page_content = requests.get(DATA_URL, headers={
        'User-Agent': 'Custom'
    }).content
    soup = BeautifulSoup(page_content, "html.parser")
    # Get new data: one Bootstrap-grid box per region. If the page does not
    # have exactly 3 boxes, new_data stays empty and an empty frame is built.
    boxes = soup.findAll(class_="col-12 col-md-6 col-xl-4")
    new_data = []
    if len(boxes) == 3:
        for box in boxes:
            fields = box.findAll(class_="col-12")
            if len(fields) == 4:
                region = fields[0].text.strip()
                if "Vaccines administered" in fields[1].text:
                    # Two right-aligned cells: national total and the
                    # regional figures; only the regional cell is used.
                    total, regional = fields[1].findAll(
                        class_="col-auto text-end")
                    # Regional cell holds dose-1 and dose-2 counts on
                    # separate lines, with "," as thousands separator.
                    dose_1, dose_2 = list(
                        map(lambda x: int(x.replace(",", "")),
                            regional.text.strip().split("\n")))
                    new_data.append([region, dose_1, dose_2])
    df = pd.DataFrame(
        new_data,
        columns=["region", "people_vaccinated", "people_fully_vaccinated"])
    # Derive totals and tag the country
    df.loc[:, "total_vaccinations"] = df.loc[:, "people_vaccinated"] + df.loc[:, "people_fully_vaccinated"]
    df.loc[:, "location"] = COUNTRY
    # Join with per-region report dates taken from the published spreadsheet
    url = "https://covid-vaccinatie.be/en/vaccines-administered.xlsx"
    df_dates = get_date(url)
    df = df.merge(df_dates, left_on="region", right_on="Region", how="left")
    # ISO codes
    df = ISODB().merge(df, country_iso=COUNTRY_ISO)
    # Drop rows already covered by this report, then append the history.
    # NOTE(review): df_dates.index is assumed to hold region names matching
    # df_source["region"] while the merge above used its "Region" column —
    # verify get_date() returns a region-indexed frame.
    region = df_dates.index.tolist()
    date = df_dates.date.tolist()
    df_source = df_source.loc[~(df_source["region"].isin(region) & df_source["date"].isin(date))]
    df = pd.concat([df, df_source])
    df = df[[
        "location", "region", "date", "location_iso", "region_iso",
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]]
    cols = [
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]
    # Avoid repeating reports
    df = keep_min_date(df)
    # Export; nullable Int64 keeps missing counts as <NA> instead of floats
    df[cols] = df[cols].astype("Int64").fillna(pd.NA)
    df = df.sort_values(by=["region", "date"])
    df.to_csv(OUTPUT_FILE, index=False)
    # Tracking
    update_country_tracking(country=COUNTRY,
                            url=DATA_URL_REFERENCE,
                            last_update=df["date"].max())
def main():
    """Parse the newest Danish regional vaccination PDF and update OUTPUT_FILE.

    Locates the latest PDF linked from DATA_URL, extracts its tables with
    tabula, pulls per-region dose counts from the hardcoded second table,
    merges them into the existing CSV history, and rewrites OUTPUT_FILE.
    """
    # Load current data
    df_source = pd.read_csv(OUTPUT_FILE)

    # Locate newest pdf
    html_page = urllib.request.urlopen(DATA_URL)
    soup = BeautifulSoup(html_page, "html.parser")
    pdf_path = soup.find('a', text="Download her").get("href")  # Path to newest pdf

    # Extract all tables ONCE (the original parsed the same PDF twice with
    # identical arguments). Force dtype=str so the "." thousands separator
    # is not misread as a decimal point.
    kwargs = {'pandas_options': {'dtype': str, 'header': None}}
    dfs_from_pdf = tabula.read_pdf(pdf_path, pages="all", **kwargs)

    # Report date is derived from the extracted tables
    date = get_date(dfs_from_pdf)

    df = dfs_from_pdf[1]  # Hardcoded table index

    # Guard against layout changes in the PDF
    if df.shape != (11, 7):
        raise Exception("Shape of table changed!")
    if not all(region in df[0].tolist() for region in regions):
        raise Exception("Region missing!")

    # Drop the header rows and the trailing total row
    df = df.drop([0, 1, 2, 3, len(df) - 1])

    # Rename the relevant positional columns
    df = df.rename(columns={
        0: "region",
        2: "people_vaccinated",
        4: "people_fully_vaccinated"
    })
    df = df.astype(str)

    def _parse_count(x):
        # Strip the "." thousands separator; after astype(str) a missing
        # cell appears as the literal string 'nan' — treat it as 0.
        # BUGFIX: the original applied int() first and called .fillna(0) on
        # the already-converted result (a no-op), so a 'nan' cell would have
        # raised ValueError instead of becoming 0.
        return 0 if x == 'nan' else int(x.replace(".", ""))

    df.loc[:, "people_vaccinated"] = (
        df.loc[:, "people_vaccinated"].apply(_parse_count).astype(int)
    )
    df.loc[:, "people_fully_vaccinated"] = (
        df.loc[:, "people_fully_vaccinated"].apply(_parse_count).astype("Int64")
    )

    # Normalize region names to the project's canonical spellings
    df.loc[:, "region"] = df.loc[:, "region"].replace(REGION_RENAMING)

    # Derive totals and stamp metadata
    df.loc[:, "total_vaccinations"] = df.loc[:, "people_vaccinated"] + df.loc[:, "people_fully_vaccinated"]
    df.loc[:, "location"] = COUNTRY
    df.loc[:, "date"] = date

    # Add ISO codes; the synthetic "Others" row gets the country ISO
    df = ISODB().merge(df, country_iso=COUNTRY_ISO)
    df.loc[df["region"] == "Others", "location_iso"] = COUNTRY_ISO

    # Drop any pre-existing rows for this report date, then append history
    df_source = df_source.loc[~(df_source.loc[:, "date"] == date)]
    df = pd.concat([df, df_source])

    # Export in canonical column order
    cols = [
        "location", "region", "date", "location_iso", "region_iso",
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]
    df = keep_min_date(df[cols])[cols]
    df = df.sort_values(by=["region", "date"])
    df.to_csv(OUTPUT_FILE, index=False)

    # Tracking
    update_country_tracking(country=COUNTRY,
                            url=DATA_URL_REFERENCE,
                            last_update=df["date"].max())