def read(source_daily: str, source_weekly: str) -> pd.DataFrame:
    """Scrape daily and weekly vaccination figures and combine them into one frame.

    Args:
        source_daily: URL of the page with the daily dose counter.
        source_weekly: URL of the page with the weekly per-dose breakdown.

    Returns:
        pd.DataFrame: two records — the weekly one carries the dose breakdown,
        the daily one only total doses.

    Raises:
        ValueError: if the daily page no longer contains the dose block.
    """
    # Daily: locate the sibling block that follows the dose-counter label
    soup_daily = get_soup(source_daily)
    dose_block = None
    for div in soup_daily.find_all("div"):
        if div.text == "Vaccine doses administered":
            dose_block = div.parent.findChildren()[1]
            break
    if dose_block is None:
        # Fail loudly instead of hitting a NameError below when layout changes
        raise ValueError("Daily page layout changed: dose block not found")
    date_daily = parse_date_daily(dose_block)
    total_vaccinations_d = parse_data_daily(dose_block)
    # Weekly
    soup_weekly = get_soup(source_weekly)
    date_weekly = parse_date_weekly(soup_weekly)
    total_vaccinations_w, people_vaccinated, people_fully_vaccinated = parse_data_weekly(soup_weekly)
    return pd.DataFrame.from_records([
        {
            "date": date_weekly,
            "total_vaccinations": total_vaccinations_w,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "source_url": source_weekly,
        },
        {
            "date": date_daily,
            "total_vaccinations": total_vaccinations_d,
            "source_url": source_daily,
        },
    ])
def read(self, last_update: str) -> pd.DataFrame:
    """Fetch the newest monthly report linked from the yearly page and parse it."""
    yearly_page = get_soup(self.source_url)
    # The first anchor in the content container points at the latest monthly report
    content = yearly_page.find("div", class_="col-lg-12", id="content-detail")
    monthly_url = content.find("a")["href"]
    monthly_page = get_soup(monthly_url)
    return self._parse_data(monthly_page, last_update)
def read(source: str) -> pd.Series:
    """Follow the Infogram embed on *source* and extract vaccination metrics."""
    page = get_soup(source)
    embed_url = parse_infogram_link(page)
    embed_soup = get_soup(embed_url)
    data = parse_infogram_data(embed_soup)
    record = {"date": parse_infogram_date(data), "source_url": source}
    record.update(parse_infogram_vaccinations(data))
    return pd.Series(record)
def read(source: str) -> pd.Series:
    """Follow the Infogram embed on *source* and extract people-vaccinated data."""
    page = get_soup(source)
    embed_url = parse_infogram_link(page)
    embed_soup = get_soup(embed_url)
    data = parse_infogram_data(embed_soup)
    return pd.Series({
        "people_vaccinated": parse_infogram_people_vaccinated(data),
        "date": parse_infogram_date(data),
        "source_url": source,
    })
def read(source: str):
    """Locate the PDF report linked from *source* and parse its metrics.

    Raises:
        ValueError: if the discovered link is not a PDF.
    """
    page = get_soup(source)
    pdf_url = parse_pdf_link(page, source)
    if not pdf_url.endswith(".pdf"):
        raise ValueError(f"File reporting metrics is not a PDF: {pdf_url}!")
    return pd.Series(parse_data(pdf_url))
def parse_vaccinations(elem) -> dict:
    """Open the news article linked from *elem* and extract vaccination counts.

    Returns:
        dict: any of ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` that matched in the article text
        (a key is omitted when its pattern is absent).
    """
    # Follow the link of the enclosing card to the full news text
    url = elem.find_parent(class_="card").find("a").get("href")
    soup = get_soup(url)
    text = "\n".join(p.text for p in soup.find("article").find_all("p"))
    # One regex per metric, matched independently against the article text.
    # Dead commented-out alternates of earlier phrasings removed.
    patterns = {
        "total_vaccinations": r"疫苗劑數為(?P<count>[\d,]*)劑",
        "people_vaccinated": r"已接種人數共有(?P<count>[\d,]*)人",
        "people_fully_vaccinated": r"已完成接種2劑有(?P<count>[\d,]*)人",
    }
    metrics = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            metrics[key] = clean_count(match.group(1))
    return metrics
def read(self) -> pd.DataFrame:
    """Load data from the 'Date' sheet of the workbook linked on the source page.

    Returns:
        pd.DataFrame: contents of the "Date" sheet.
    """
    soup = utils.get_soup(self.source_url)
    link = self._parse_file_link(soup)
    # Removed leftover debug print(link) and commented-out per-age read
    return utils.read_xlsx_from_url(link, sheet_name="Date")
def read(source: str) -> pd.Series:
    """Find the PDF report on the listing page and parse its metrics table."""
    listing_url = f"{source}/Category/Page/9jFXNbCe-sFK9EImRRi2Og"
    soup = get_soup(listing_url)
    pdf_url = parse_pdf_link(source, soup)
    table = parse_table(pdf_url)
    return pd.Series({
        "total_vaccinations": parse_total_vaccinations(table),
        "date": parse_date(table),
    })
def _parse_pdf_link(self, soup) -> str:
    """Resolve the final PDF link behind the download button.

    The download page sometimes needs several fetches before the viewer
    button appears, so the fetch is retried up to 10 times.

    Returns:
        str: absolute URL of the PDF viewer target.

    Raises:
        ValueError: if the viewer button never appears after the retries.
    """
    anchor = soup.find(class_="download").find("a")
    url_pdf = f"{self.source_url}{anchor['href']}"
    button = None
    for _ in range(10):
        soup = get_soup(url_pdf)
        button = soup.find(class_="viewer-button")
        if button is not None:
            break
    if button is None:
        # Previously this fell through to an opaque TypeError on None['href']
        raise ValueError(f"Viewer button not found after 10 attempts: {url_pdf}")
    return f"{self.source_url}{button['href']}"
def read(source: str) -> pd.Series:
    """Scrape vaccination totals and the report date from *source*."""
    soup = get_soup(source)
    total_vaccinations, people_fully_vaccinated = parse_data(soup)
    metrics = {
        "total_vaccinations": total_vaccinations,
        "people_fully_vaccinated": people_fully_vaccinated,
        "source_url": source,
        "date": parse_date(soup),
    }
    return pd.Series(metrics)
def read_1(self):
    """Read the single HTML table on the first source page into a DataFrame.

    Raises:
        ValueError: if the page does not contain exactly one table.
    """
    soup = get_soup(self.source_url_1)
    tables = pd.read_html(str(soup), header=0)
    if len(tables) != 1:
        raise ValueError(
            f"Only one table should be present. {len(tables)} tables detected."
        )
    return tables[0]
def read(self) -> pd.Series:
    """Scrape all three dose metrics and the report date into a Series."""
    soup = get_soup(self.source_url)
    total, first_dose, second_dose = self._parse_metrics(soup)
    record = {
        "total_vaccinations": total,
        "people_vaccinated": first_dose,
        "people_fully_vaccinated": second_dose,
        "source_url": self.source_url,
        "date": self._parse_date(soup),
    }
    return pd.Series(record)
def read(self, last_update: str) -> pd.DataFrame:
    """Walk paginated listings, collecting records newer than *last_update*."""
    records = []
    # Pagination advances by an offset of 5 items per page
    for offset in range(0, 5 * self._num_max_pages, 5):
        page = get_soup(f"{self.source_url}/(offset)/{offset}/")
        new_records, proceed = self.parse_data(page, last_update)
        records.extend(new_records)
        if not proceed:
            break
    return pd.DataFrame(records)
def read(self) -> pd.Series:
    """Scrape dose counts and report date into a Series.

    ``total_vaccinations`` is derived as first + second doses.
    """
    soup = get_soup(self.source_url)
    people_vaccinated, people_fully_vaccinated = self.parse_vaccinated(soup)
    # Fixed: parse_date was called twice (its first result was unused), and
    # the resulting Series was redundantly wrapped in pd.Series again.
    return pd.Series({
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_vaccinations": people_vaccinated + people_fully_vaccinated,
        "date": self.parse_date(soup),
    })
def vaccines_approved(self, location: str = None, original_names: bool = False) -> list:
    """Get list of approved vaccines in a country (or all if None specified).

    Args:
        location (str, optional): Country name. If None, retrieves all approved
            vaccines. Defaults to None.
        original_names (bool, optional): Set to True to keep vaccine from web.
            Defaults to False.

    Returns:
        list: Approved vaccines, or None when the country cannot be resolved.
    """
    if location:
        try:
            country_soup = get_soup(self.get_country_url(location))
            return self._parse_vaccines_location(country_soup, original_names)
        except ValueError:
            # Unknown country (or failed fetch raising ValueError) → no data
            return None
    all_soup = get_soup(self.all_vaccines_url)
    return self._parse_vaccines_all(all_soup, original_names)
def parse_metrics(self) -> tuple:
    """Read the two dose counters; higher value is dose 1, lower is dose 2.

    Returns:
        tuple: (dose_1, dose_2) counts.

    Raises:
        ValueError: if the layout no longer holds exactly two counters.
    """
    soup = get_soup(self.source_url)
    elems = soup.find(class_="vacunometro-cifras").find_all("td")
    if len(elems) != 2:
        # Fixed message: it previously referenced the wrong class
        # ('vacunados'), contained a typo, and said "more than two"
        # even when fewer elements were found.
        raise ValueError(
            "Something changed in source layout. Expected two <td> elements "
            f"within class='vacunometro-cifras', found {len(elems)}."
        )
    values = [clean_count(elem.text) for elem in elems]
    # First doses are assumed to be the larger of the two counters
    return max(values), min(values)
def connect_parse_data(source: str) -> pd.Series:
    """Scrape the single counter on *source*; total doses equal people vaccinated."""
    soup = get_soup(source)
    count = clean_count(soup.find(class_="count-up").text)
    return pd.Series(data={
        "total_vaccinations": count,
        "people_vaccinated": count,
    })
def read(source: str) -> pd.Series:
    """Locate the 'Total vaccins administrés' block and parse dose metrics.

    Raises:
        ValueError: if the expected label is not present on the page.
    """
    soup = get_soup(source)
    container = None
    # No break: the original scanned all labels, so the LAST match wins
    for label in soup.find_all(class_="number-label"):
        if label.text == "Total vaccins administrés":
            container = label.parent.parent
    if container is None:
        # Previously this fell through to a NameError on 'container'
        raise ValueError(
            "Label 'Total vaccins administrés' not found; source layout may have changed"
        )
    return pd.Series(data={
        "total_vaccinations": parse_total_vaccinations(container),
        "people_vaccinated": parse_people_vaccinated(container),
        "people_fully_vaccinated": parse_people_fully_vaccinated(container),
        "source_url": source,
    })
def read(source_daily: str, source_weekly: str) -> pd.DataFrame:
    """Combine the daily counter page and the weekly breakdown page into one frame."""
    # Daily figures: date + total doses only
    soup_daily = get_soup(source_daily)
    daily_record = {
        "date": parse_date_daily(soup_daily),
        "total_vaccinations": parse_data_daily(soup_daily),
        "source_url": source_daily,
    }
    # Weekly figures carry the per-dose breakdown
    soup_weekly = get_soup(source_weekly)
    total_w, people_vaccinated, people_fully = parse_data_weekly(soup_weekly)
    weekly_record = {
        "date": parse_date_weekly(soup_weekly),
        "total_vaccinations": total_w,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully,
        "source_url": source_weekly,
    }
    # Weekly row first, matching the original record order
    return pd.DataFrame.from_records([weekly_record, daily_record])
def parse_data(self, soup: BeautifulSoup, last_update: str) -> tuple:
    """Collect one record per news item strictly newer than *last_update*.

    Returns:
        tuple: (records, proceed) — *proceed* is False as soon as an item at
        or before *last_update* is seen, telling the caller to stop paging.
    """
    records = []
    for elem in self.get_elements(soup):
        if elem["date"] <= last_update:
            # Listing reached already-processed dates: stop here
            return records, False
        news_soup = get_soup(elem["link"])
        records.append({
            "source_url": elem["link"],
            "date": elem["date"],
            **self.parse_data_news_page(news_soup),
        })
    return records, True
def parse_metrics(self) -> tuple:
    """Extract first- and second-dose counts from the two 'vacunados' widgets.

    Returns:
        tuple: (dose_1, dose_2) cleaned counts.

    Raises:
        ValueError: if the layout or the dose labels changed.
    """
    soup = get_soup(self.source_url)
    elems = soup.find_all(class_="vacunados")
    if len(elems) != 2:
        # Fixed typo ('elemnts') and wording: fewer than two also triggers this
        raise ValueError(
            "Something changed in source layout. Expected two elements "
            f"with class='vacunados', found {len(elems)}."
        )
    dose_1 = dose_2 = None
    for elem in elems:
        spans = elem.find_all("span")
        if spans[0].text == "1ra Dosis":
            dose_1 = spans[1].text
        elif spans[0].text == "2da Dosis":
            dose_2 = spans[1].text
        else:
            raise ValueError(
                "Something changed in source layout. Name different than '1ra Dosis' or '2da Dosis'"
            )
    if dose_1 is None or dose_2 is None:
        # Guards against a duplicated label leaving one dose unset (was a NameError)
        raise ValueError(
            "Something changed in source layout. '1ra Dosis' or '2da Dosis' missing."
        )
    return clean_count(dose_1), clean_count(dose_2)
def read(source: str, last_update: str, num_pages_limit: int = 10):
    """Scrape paginated records newer than *last_update*; None when nothing new."""
    records = []
    for page_nr in range(1, num_pages_limit):
        page = get_soup(f"{source}/{page_nr}/")
        page_records = parse_data(page)
        if page_records:
            records.extend(page_records)
            # Once any record on this page is at or before last_update,
            # older pages cannot contain anything newer: stop paging.
            if any(r["date"] <= last_update for r in page_records):
                break
    if records:
        records = [r for r in records if r["date"] >= last_update]
    if records:
        return postprocess(pd.DataFrame(records))
    return None
def read(source: str) -> pd.Series:
    """Parse the headline counter; vaccinated people equal total doses here."""
    soup = get_soup(source)
    total_vaccinations = clean_count(soup.find(class_="stats-decoration-title").text)
    raw_date = soup.find(class_="stats-decoration-text").text
    date_text = re.search(r"\d+ \w+ 202\d", raw_date).group(0)
    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": total_vaccinations,
        "people_fully_vaccinated": 0,
        "date": str(pd.to_datetime(date_text).date()),
    })
def read(source: str) -> pd.Series:
    """Read dose counts from the info boxes; total is the sum of both doses."""
    soup = get_soup(source)
    boxes = soup.find_all(class_="info-box-number")
    # Boxes 2 and 3 carry the leading numeric dose counts
    people_vaccinated = clean_count(re.search(r"^[\d,]+", boxes[2].text).group(0))
    people_fully_vaccinated = clean_count(re.search(r"^[\d,]+", boxes[3].text).group(0))
    return pd.Series(data={
        "total_vaccinations": people_vaccinated + people_fully_vaccinated,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": localdate("Asia/Dhaka"),
    })
def read(source: str) -> pd.Series:
    """Parse the three counters and the last-update date from the page."""
    soup = get_soup(source)
    counters = soup.find_all(class_="counter")
    partially = clean_count(counters[0].text)
    fully = clean_count(counters[1].text)
    total = clean_count(counters[2].text)
    raw_date = soup.find("span", id="last-update").text
    date = clean_date(re.search(r"\d+.*202\d", raw_date).group(0), "%d %B, %Y")
    return pd.Series(data={
        "total_vaccinations": total,
        "people_vaccinated": partially + fully,
        "people_fully_vaccinated": fully,
        "date": date,
        "source_url": source,
    })
def connect_parse_data(source: str) -> pd.Series:
    """Scrape dose counters and the report date from the dashboard."""
    soup = get_soup(source)
    counters = soup.find_all(class_="repart-stlucia")
    people_vaccinated = clean_count(counters[0].text)
    people_fully_vaccinated = clean_count(counters[1].text)
    header_text = soup.find(class_="h2-blue").text
    date = clean_date(
        re.search(r"\w+ +\d+, +202\d", header_text).group(0), "%B %d, %Y"
    )
    return pd.Series(data={
        "total_vaccinations": people_vaccinated + people_fully_vaccinated,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    })
def read(source: str) -> pd.Series:
    """Read dose counts from the info boxes and the footer's last-updated date."""
    soup = get_soup(source)
    boxes = soup.find_all(class_="info-box-number")
    people_vaccinated = clean_count(re.search(r"^[\d,]+", boxes[2].text).group(0))
    people_fully_vaccinated = clean_count(re.search(r"^[\d,]+", boxes[3].text).group(0))
    # Footer span reads "Last updated: <date>"; strip the prefix
    date = soup.find(class_="main_foot").find("span").text.replace("Last updated: ", "")
    return pd.Series(data={
        "total_vaccinations": people_vaccinated + people_fully_vaccinated,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    })
def parse_data(source: str) -> pd.Series:
    """Parse the cumulative-dose table (third table on the page) into a Series.

    Returns:
        pd.Series: people_vaccinated, people_fully_vaccinated,
        total_vaccinations and the source URL.

    Raises:
        ValueError: if the vaccine table grew beyond the expected six rows.
    """
    soup = get_soup(source)
    html_table = str(soup.find_all("table")[2])
    df = pd.read_html(html_table, header=0)[0]
    # Was an assert — stripped under `python -O`; raise explicitly instead
    if len(df) > 6:
        raise ValueError("New rows in the vaccine table!")
    astrazeneca = df.loc[df["백신"] == "아스트라제네카", "누적 접종(C)"].values.astype(int)
    pfizer = df.loc[df["백신"] == "화이자", "누적 접종(C)"].values.astype(int)
    johnson = df.loc[df["백신"] == "얀센2)", "누적 접종(C)"].values.astype(int)
    # The single Janssen figure is counted toward both partially and fully
    # vaccinated (presumably because it is a one-dose vaccine — verify)
    total_vaccinations = astrazeneca.sum() + pfizer.sum() + johnson[0]
    people_vaccinated = astrazeneca[0] + pfizer[0] + johnson[0]
    people_fully_vaccinated = astrazeneca[1] + pfizer[1] + johnson[0]
    return pd.Series(data={
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_vaccinations": total_vaccinations,
        "source_url": source,
    })
def read(source: str) -> pd.Series:
    """Extract total/first/second dose counts from the dashboard blocks."""
    soup = get_soup(source)
    blocks = soup.find(class_="main").find_all(class_="w3-center")
    for block in blocks:
        paragraphs = block.find_all("p")
        title = paragraphs[0].text
        if title == "ΣΥΝΟΛΟ ΕΜΒΟΛΙΑΣΜΩΝ":
            # Total block also carries the report date in its third paragraph
            total_vaccinations = clean_count(paragraphs[1].text)
            date_match = re.search(r"[\d/]{8,10}", paragraphs[2].text)
            date = clean_date(date_match.group(0), "%d/%m/%Y")
        if title == "ΣΥΝΟΛΟ 1ης ΔΟΣΗΣ":
            people_vaccinated = clean_count(paragraphs[1].text)
        if title == "ΣΥΝΟΛΟ 2ης ΔΟΣΗΣ":
            people_fully_vaccinated = clean_count(paragraphs[1].text)
    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
        "source_url": source,
    })
def load_data(self) -> pd.DataFrame:
    """Load original data from the 'Date' sheet of the linked workbook."""
    page = utils.get_soup(self.source_url)
    file_url = self._parse_file_link(page)
    return utils.read_xlsx_from_url(file_url, sheet_name="Date")