def read(self, last_update: str) -> pd.DataFrame:
    yearly_report_page = get_soup(self.source_url)
    # Get the newest monthly report page
    monthly_report_link = yearly_report_page.find("div", class_="col-lg-12", id="content-detail").find("a")["href"]
    monthly_report_page = get_soup(monthly_report_link)
    # Get links and parse records newer than last_update
    df = self._parse_data(monthly_report_page, last_update)
    return df
def read(source: str) -> pd.Series:
    soup = get_soup(source)
    link = parse_infogram_link(soup)
    soup = get_soup(link)
    infogram_data = parse_infogram_data(soup)
    return pd.Series({
        "date": parse_infogram_date(infogram_data),
        "source_url": source,
        **parse_infogram_vaccinations(infogram_data),
    })
def read(source: str):
    soup = get_soup(source)
    url = parse_pdf_link(soup, source)
    if not url.endswith(".pdf"):
        raise ValueError(f"File reporting metrics is not a PDF: {url}!")
    ds = pd.Series(parse_data(url))
    return ds
def connect_parse_data(source: str) -> pd.Series:
    soup = get_soup(source)
    counters = soup.find_all(class_="elementor-counter-number")
    assert len(counters) == 4, "New counter in dashboard?"
    total_vaccinations = clean_count(counters[0]["data-to-value"])
    first_doses = clean_count(counters[1]["data-to-value"])
    second_doses = clean_count(counters[2]["data-to-value"])
    unique_doses = clean_count(counters[3]["data-to-value"])
    # Single-shot ("unique") doses count towards both headline metrics
    people_vaccinated = first_doses + unique_doses
    people_fully_vaccinated = second_doses + unique_doses
    date = localdate("America/Jamaica")
    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "date": date,
        }
    )
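The dose arithmetic above treats single-shot doses as counting towards both headline metrics. A worked example with made-up numbers, for illustration only:

# Hypothetical counter values, not taken from the dashboard.
first_doses, second_doses, unique_doses = 100_000, 60_000, 5_000
people_vaccinated = first_doses + unique_doses           # 105_000 received at least one dose
people_fully_vaccinated = second_doses + unique_doses    # 65_000 completed a course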
def parse_vaccinations(elem) -> dict:
    # Get the news article text
    url = elem.find_parent(class_="card").find("a").get("href")
    soup = get_soup(url, verify=False)
    text = "\n".join([p.text for p in soup.find("article").find_all("p")])
    # Find metrics. The wording of the articles changes over time, so these
    # patterns have been revised repeatedly; each metric is optional.
    metrics = dict()
    total_vaccinations = re.search(r"疫苗劑數為(?P<count>[\d,]*)劑", text)
    people_vaccinated = re.search(r"已接種人數共有(?P<count>[\d,]*)人", text)
    people_fully_vaccinated = re.search(r"已接種第2劑的?有([\d,]{6,})", text)
    if total_vaccinations:
        metrics["total_vaccinations"] = clean_count(total_vaccinations.group(1))
    if people_vaccinated:
        metrics["people_vaccinated"] = clean_count(people_vaccinated.group(1))
    if people_fully_vaccinated:
        metrics["people_fully_vaccinated"] = clean_count(people_fully_vaccinated.group(1))
    return metrics
def read(self) -> pd.DataFrame:
    """Load data."""
    soup = utils.get_soup(self.source_url)
    link = self._parse_file_link(soup)
    df = utils.read_xlsx_from_url(link, sheet_name="Date")
    # An age/gender breakdown is also available in the sheet
    # "Ethnicity Age Gender by dose", but is not read for now.
    return df
def _parse_pdf_link(self, soup) -> str:
    a = soup.find(class_="download").find("a")
    url_pdf = f"{self.source_url}{a['href']}"
    # The viewer page may not be rendered on the first request; retry a few times.
    for i in range(10):
        soup = get_soup(url_pdf)
        a = soup.find(class_="viewer-button")
        if a is not None:
            break
    else:
        raise ValueError(f"PDF viewer button not found at {url_pdf}")
    return f"{self.source_url}{a['href']}"
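The retry loop above refetches the viewer page up to ten times with no pause between attempts. If the button is rendered asynchronously, a short delay between retries is usually what makes such polling converge. A minimal sketch of that variant, assuming a blocking time.sleep is acceptable here; poll_for_element is a hypothetical helper, not part of the original code:

import time

def poll_for_element(url: str, css_class: str, attempts: int = 10, delay: float = 2.0):
    """Refetch url until an element with css_class appears, or raise."""
    for _ in range(attempts):
        soup = get_soup(url)
        elem = soup.find(class_=css_class)
        if elem is not None:
            return elem
        time.sleep(delay)  # give the page time to render the element
    raise ValueError(f"Element '{css_class}' not found at {url} after {attempts} attempts")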
def read(self, last_update: str) -> pd.DataFrame:
    data = []
    # News pages are paginated in steps of 5 via an "(offset)" URL segment.
    for cnt in range(0, 5 * self._num_max_pages, 5):
        url = f"{self.source_url}/(offset)/{cnt}/"
        soup = get_soup(url)
        data_, proceed = self.parse_data(soup, last_update)
        data.extend(data_)
        if not proceed:
            break
    return pd.DataFrame(data)
def read(self) -> pd.Series:
    soup = get_soup(self.source_url)
    people_vaccinated, people_fully_vaccinated = self.parse_vaccinated(soup)
    date_str = self.parse_date(soup)
    return pd.Series({
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_vaccinations": people_vaccinated + people_fully_vaccinated,
        "date": date_str,
    })
def read(source: str) -> pd.Series:
    soup = get_soup(source)
    counters = soup.find_all(class_="text-brand-blue")
    dose_1 = clean_count(counters[1].text)
    dose_2 = clean_count(counters[2].text)
    assert dose_1 >= dose_2
    # The page shows e.g. "Updated 5. March" without a year; append the current year.
    date = soup.find(class_="text-gray-500").text
    date = date.replace("Updated ", "") + str(datetime.date.today().year)
    date = clean_date(date, fmt="%d. %B%Y", lang="en")
    return pd.Series({"people_vaccinated": dose_1, "people_fully_vaccinated": dose_2, "date": date})
def vaccines_approved(self, location: str = None, original_names: bool = False) -> list:
    """Get the list of approved vaccines in a country (or in all countries if None).

    Args:
        location (str, optional): Country name. If None, retrieves all approved vaccines.
            Defaults to None.
        original_names (bool, optional): Set to True to keep the vaccine names as they
            appear on the source website. Defaults to False.

    Returns:
        list: Approved vaccines
    """
    if location:
        try:
            url = self.get_country_url(location)
            soup = get_soup(url)
            return self._parse_vaccines_location(soup, original_names)
        except ValueError:
            return None
    else:
        soup = get_soup(self.all_vaccines_url)
        return self._parse_vaccines_all(soup, original_names)
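Calls to the method above might look like the following sketch; the tracker instance and country name are illustrative, not taken from the source:

# Hypothetical usage of vaccines_approved.
all_vaccines = tracker.vaccines_approved()                            # all countries
fr_vaccines = tracker.vaccines_approved(location="France")            # one country
raw_names = tracker.vaccines_approved("France", original_names=True)  # names as scraped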
def connect_parse_data(source: str) -> pd.Series:
    soup = get_soup(source)
    people_vaccinated = soup.find(class_="count-up").text
    people_vaccinated = clean_count(people_vaccinated)
    total_vaccinations = people_vaccinated
    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
        }
    )
def read(source: str) -> pd.Series:
    soup = get_soup(source)
    container = None
    for label in soup.find_all(class_="number-label"):
        if label.text == "Total vaccins administrés":
            container = label.parent.parent
    if container is None:
        raise ValueError("Label 'Total vaccins administrés' not found")
    return pd.Series(
        data={
            "total_vaccinations": parse_total_vaccinations(container),
            "people_vaccinated": parse_people_vaccinated(container),
            "people_fully_vaccinated": parse_people_fully_vaccinated(container),
            "source_url": source,
        }
    )
def read(self) -> pd.Series:
    soup = get_soup(self.source_url)
    (
        total_vaccinations,
        people_vaccinated,
        people_fully_vaccinated,
        total_boosters,
    ) = self._parse_metrics(soup)
    return pd.Series({
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_boosters": total_boosters,
        "source_url": self.source_url,
        "date": self._parse_date(soup),
    })
def parse_data(self, soup: BeautifulSoup, last_update: str) -> tuple:
    # Elements are ordered newest first; stop at the first one at or before last_update.
    elems = self.get_elements(soup)
    records = []
    for elem in elems:
        if elem["date"] > last_update:
            news_page = get_soup(elem["link"])
            records.append({
                "source_url": elem["link"],
                "date": elem["date"],
                **self.parse_data_news_page(news_page),
            })
        else:
            return records, False
    return records, True
def connect_parse_data(source: str) -> pd.Series:
    soup = get_soup(source)
    counters = soup.find_all(class_="repart-stlucia")
    people_vaccinated = clean_count(counters[0].text)
    people_fully_vaccinated = clean_count(counters[1].text)
    total_vaccinations = people_vaccinated + people_fully_vaccinated
    date = localdate("America/St_Lucia")
    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    })
def read(source: str, last_update: str, num_pages_limit: int = 10):
    records = []
    # Note: the upper bound is exclusive, so at most num_pages_limit - 1 pages are scanned.
    for page_nr in range(1, num_pages_limit):
        url = f"{source}/{page_nr}/"
        soup = get_soup(url, verify=False)
        # Get data (if any)
        records_sub = parse_data(soup)
        if records_sub:
            records.extend(records_sub)
            # Stop paginating once a page contains dates at or before last_update
            if any(record["date"] <= last_update for record in records_sub):
                break
    if pd.Series([r.get("total_vaccinations") for r in records]).notnull().any():
        records = [record for record in records if record["date"] >= last_update]
        if len(records) > 0:
            return postprocess(pd.DataFrame(records))
    return None
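An invocation of this paginated reader might look like the sketch below; the URL and cutoff date are placeholders:

# Hypothetical call: scan news pages, keeping only records newer than last_update.
df = read("https://example.org/news", last_update="2021-06-01")
if df is not None:
    print(df.head())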
def read(source: str) -> pd.Series:
    soup = get_soup(source)
    # The info boxes show counts followed by extra text; keep the leading number only.
    people_vaccinated = clean_count(
        re.search(r"^[\d,]+", soup.find_all(class_="info-box-number")[2].text).group(0)
    )
    people_fully_vaccinated = clean_count(
        re.search(r"^[\d,]+", soup.find_all(class_="info-box-number")[3].text).group(0)
    )
    total_vaccinations = people_vaccinated + people_fully_vaccinated
    date = localdate("Asia/Dhaka")
    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "date": date,
        }
    )
def connect_parse_data(source: str) -> pd.Series:
    soup = get_soup(source)
    df = pd.read_html(str(soup.find(class_="vaccination-count")))[0]
    assert df.shape == (3, 7)
    values = df.iloc[:, 2].values
    total_vaccinations = values[0]
    people_vaccinated = values[1]
    people_fully_vaccinated = values[2]
    assert total_vaccinations == people_vaccinated + people_fully_vaccinated
    # The on-page date (class "aly_tx_center") is ignored; use the local date instead.
    date = localdate("Asia/Tokyo")
    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    })
def read(source: str) -> pd.Series:
    soup = get_soup(source)
    for block in soup.find(class_="main").find_all(class_="w3-center"):
        if block.find("p").text == "ΣΥΝΟΛΟ ΕΜΒΟΛΙΑΣΜΩΝ":  # total vaccinations
            total_vaccinations = clean_count(block.find_all("p")[1].text)
            date = re.search(r"[\d/]{8,10}", block.find_all("p")[2].text)
            date = clean_date(date.group(0), "%d/%m/%Y")
        if block.find("p").text == "ΣΥΝΟΛΟ 1ης ΔΟΣΗΣ":  # total first doses
            people_vaccinated = clean_count(block.find_all("p")[1].text)
        if block.find("p").text == "ΣΥΝΟΛΟ 2ης ΔΟΣΗΣ":  # total second doses
            people_fully_vaccinated = clean_count(block.find_all("p")[1].text)
    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
        "source_url": source,
    })
def read(self) -> pd.Series:
    soup = get_soup(self.source_url, verify=False)
    return self.parse_data(soup)
def read(self) -> pd.DataFrame:
    soup = get_soup(self.source_url)
    return self.parse_data(soup)
def read(self) -> pd.Series:
    soup = get_soup(self.source_data_url)
    url_pdf = self._parse_pdf_link(soup)
    dfs = self._parse_tables(url_pdf)
    data = self.parse_data(dfs, soup)
    return data
def _parse_link_zip(self):
    soup = get_soup(self.source_url_ref)
    # "Download her" is the literal (Danish) link text on the source page
    url = soup.find("a", string="Download her").get("href")
    return url
def pipe_total_vax_bfill(self, df: pd.DataFrame, n_days: int) -> pd.DataFrame:
    # Backfill total_vaccinations from the n_days most recent zip files
    soup = get_soup(self.source_url_ref)
    links = self._get_zip_links(soup)
    links = links[:n_days]
    df = self._backfill_total_vaccinations(df, links)
    return df
def _get_file_link(self):
    soup = get_soup(self.source_url)
    # Take the last "resource-url-analytics" link on the page
    file_url = soup.find_all("a", class_="resource-url-analytics")[-1]["href"]
    return file_url
def read(self) -> pd.Series:
    soup = get_soup(self.source_url)
    data = self._parse_data(soup)
    return pd.Series(data)
def read(self):
    soup = get_soup(self.source_url)
    links = self._parse_links_pdfs(soup)
    # For now, only get the most recent link
    link = links[0]
    return self._parse_data(link)
def read(self):
    soup = get_soup(self.source_url)
    data = self.parse_data(soup)
    return pd.Series(data=data)
def read(source: str) -> pd.Series:
    soup = get_soup(source)
    return parse_data(soup)
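All of the scrapers above lean on a small set of shared helpers: get_soup, clean_count, clean_date, and localdate. Their implementations are not shown in this section; the sketch below gives plausible minimal versions, assuming requests, BeautifulSoup, and pytz are available (clean_date is omitted because localized date parsing varies by scraper):

import datetime
import re

import pytz
import requests
from bs4 import BeautifulSoup


def get_soup(url: str, verify: bool = True) -> BeautifulSoup:
    """Fetch a page and parse it into a BeautifulSoup tree."""
    response = requests.get(url, verify=verify)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def clean_count(count: str) -> int:
    """Strip thousands separators and any other non-digit characters."""
    return int(re.sub(r"[^\d]", "", count))


def localdate(tz: str) -> str:
    """Today's date in the given timezone, as YYYY-MM-DD."""
    return datetime.datetime.now(pytz.timezone(tz)).strftime("%Y-%m-%d")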