def export(self):
    """Add the latest cumulative test count to the stored series and export it.

    The existing series is read from ``self.output_path`` and the current
    figure is scraped from the CDCMOH landing page. A new row is prepended
    only when both the scraped count and the date returned by
    ``localdatenow("Asia/Phnom_Penh")`` are strictly greater than the maxima
    already stored. The resulting data (unchanged when there is nothing new)
    is passed to ``self.export_datafile``.
    """
    data = pd.read_csv(self.output_path)

    url = "http://cdcmoh.gov.kh/"
    soup = get_soup(url)

    # The figure is the text of the first element matching this selector.
    count = clean_count(soup.select("p+ div strong:nth-child(1)")[0].text)
    date_str = localdatenow("Asia/Phnom_Penh")

    if count > data["Cumulative total"].max() and date_str > data["Date"].max():
        new = pd.DataFrame(
            {
                "Country": self.location,
                "Date": [date_str],
                "Cumulative total": count,
                "Source URL": url,
                "Source label": "CDCMOH",
                "Units": "tests performed",
            }
        )
        # Newest row first, followed by the stored history.
        data = pd.concat([new, data], sort=False)

    self.export_datafile(data)
def read(self) -> pd.DataFrame:
    """Load the "Date" sheet of the workbook linked from the reference page.

    The page at ``self.source_url_ref`` is fetched once. It is first handed to
    ``self._read_latest`` (whose return value is not used here) and then to
    ``self._parse_file_link`` to obtain the workbook URL.
    """
    page = get_soup(self.source_url_ref)
    self._read_latest(page)
    workbook_url = self._parse_file_link(page)
    return read_xlsx_from_url(workbook_url, sheet_name="Date")
def _parse_data_url(self):
    """Return the href of the first ``<h5>`` link in the second accordion body.

    The reference page must contain exactly four elements with class
    ``accordion-body``; any other count trips the assertion.
    """
    page = get_soup(self.source_url_ref)
    accordion_bodies = page.find_all(class_="accordion-body")
    assert len(accordion_bodies) == 4

    headings = accordion_bodies[1].find_all("h5")
    first_heading = headings[0]
    return first_heading.a.get("href")
def export(self):
    """Scrape the current cumulative count and update the stored CSV.

    A single-row frame is built from the figure found on the statistics page.
    What happens next depends on ``self.output_path``:

    * no file yet: the single row is written as the initial series;
    * file exists and both the count and today's date are strictly greater
      than the stored maxima: the row is merged with the history (sorted by
      date, newest first, exact duplicates dropped) and written back;
    * file exists but nothing is new: the file is left untouched, so the
      stored history is never replaced by a lone row.
    """
    url = "https://guineasalud.org/estadisticas/"
    soup = get_soup(url)
    rows = soup.find_all("tr")
    # The figure is read from the last cell of the tenth table row.
    count = clean_count(rows[9].find_all("td")[-1].text)
    date_str = date.today().strftime("%Y-%m-%d")

    df = pd.DataFrame(
        {
            "Country": self.location,
            "Date": [date_str],
            "Cumulative total": count,
            "Source URL": url,
            "Source label": "Ministerio de Sanidad y Bienestar Social",
            "Units": "tests performed",
            "Notes": pd.NA,
        }
    )

    if os.path.isfile(self.output_path):
        existing = pd.read_csv(self.output_path)
        is_update = (
            count > existing["Cumulative total"].max()
            and date_str > existing["Date"].max()
        )
        if not is_update:
            # Nothing new to record: keep the stored history as it is.
            return
        df = (
            pd.concat([df, existing])
            .sort_values("Date", ascending=False)
            .drop_duplicates()
        )

    df.to_csv(self.output_path, index=False)
def export(self):
    """Add the latest cumulative test count to the stored series and export it.

    The count is the sum of the first two ``<h2>`` elements carrying the class
    string ``"h1 adapt white regular"`` on the government page. A new row is
    prepended only when both the count and the date returned by
    ``localdatenow("Africa/Porto-Novo")`` are strictly greater than the stored
    maxima. The resulting data (unchanged when there is nothing new) is passed
    to ``self.export_datafile``.
    """
    data = pd.read_csv(self.output_path)

    url = "https://www.gouv.bj/coronavirus/"
    soup = get_soup(url)

    # ``attrs`` must be a mapping of attribute name to value. The previous
    # set literal ``{"class", "..."}`` only matched by accident.
    stats = soup.find_all("h2", attrs={"class": "h1 adapt white regular"})
    count = int(stats[0].text) + int(stats[1].text)
    date_str = localdatenow("Africa/Porto-Novo")

    if count > data["Cumulative total"].max() and date_str > data["Date"].max():
        new = pd.DataFrame(
            {
                "Country": self.location,
                "Date": [date_str],
                "Cumulative total": count,
                "Source URL": url,
                "Source label": "Government of Benin",
                "Units": "tests performed",
            }
        )
        # Rebind ``data`` so the export below always has a defined frame,
        # whether or not a new row was added.
        data = pd.concat([new, data], sort=False)

    self.export_datafile(data)
def _parse_link(self, soup: BeautifulSoup) -> str:
    """Resolve the URL of the article link, following one intermediate page.

    The first anchor in ``soup`` whose text matches ``self.regex["title"]``
    leads to an intermediate page (its href is appended to
    ``self.source_url``). On that page the *last* anchor matching the same
    pattern supplies the final href, which is appended to ``self._base_url``.

    Raises:
        ValueError: if either anchor, or its ``href``, cannot be found.
    """
    title_pattern = re.compile(self.regex["title"])

    # ``find`` returns None on a miss, so check the element before reading
    # its attribute; otherwise the ValueError below is never reached.
    anchor = soup.find("a", text=title_pattern)
    page_href = anchor.get("href") if anchor is not None else None
    if not page_href:
        raise ValueError("Article page not found, please update the script")

    soup_page = get_soup(f"{self.source_url}{page_href}")
    anchors = soup_page.find_all("a", text=title_pattern)
    href = anchors[-1].get("href") if anchors else None
    if not href:
        raise ValueError("Article link not found, please update the script")

    return f"{self._base_url}{href}"
def _get_elems(self) -> list:
    """Return the ``<tr>`` elements of the source page that mention the report title."""
    marker = "Progress Report of COVID - 19 Immunization"
    rows = get_soup(self.source_url).find_all("tr")
    return [row for row in rows if marker in row.text]
def _get_file_links_bfill(self, index=None, date_limit=None):
    """Return the ``<h5>`` link hrefs that precede a cut-off position.

    Args:
        index: Position at which to cut the list of links. When ``None`` it is
            taken as the position of the first link containing ``date_limit``.
        date_limit: Substring looked for in the hrefs when ``index`` is
            ``None``. Defaults to ``self.date_limit_one_dose_ddmmyyyy``.

    Returns:
        The hrefs before (and excluding) the cut-off position.
    """
    page = get_soup(self.source_url_ref)
    links = [heading.a.get("href") for heading in page.find_all("h5")]

    if index is None:
        if date_limit is None:
            date_limit = self.date_limit_one_dose_ddmmyyyy
        positions = [pos for pos, href in enumerate(links) if date_limit in href]
        index = positions[0]

    return links[:index]
def _get_list_pdf_urls(self) -> list:
    """Collect the hrefs of all anchors whose text matches the report title."""
    title_pattern = re.compile("MINISTRY OF HEALTH KENYA COVID-19")
    page = get_soup(self.source_url, verify=False)
    return [anchor.get("href") for anchor in page.findAll("a", text=title_pattern)]
def read(self) -> pd.DataFrame:
    """Build a one-row frame holding the date parsed from the reference page."""
    page = get_soup(self.source_url_ref, verify=False)
    report_date = self._parse_date(page)
    return pd.DataFrame({"Date": [report_date]})
def read(self) -> pd.DataFrame:
    """Walk the paginated source until ``_parse_data`` signals to stop.

    Pages ``1 .. self._num_max_pages`` are fetched in order. For each page
    ``self._parse_data`` is called up to ``self._num_rows_per_page`` times; as
    soon as it reports that processing should not proceed, the data it
    returned on that call is wrapped in a DataFrame and returned. ``None`` is
    returned when every page is exhausted without that signal.
    """
    data = []
    for page_number in range(1, self._num_max_pages + 1):
        page = get_soup(f"{self.source_url}{page_number}/")
        for _ in range(self._num_rows_per_page):
            data, proceed = self._parse_data(page)
            if proceed:
                continue
            return pd.DataFrame(data)
    return None
def _parse_data(self, soup: BeautifulSoup) -> pd.DataFrame:
    """Follow the embedded dashboard iframe and build a frame from its metrics.

    Args:
        soup: Parsed page expected to embed an ``<iframe>`` titled
            "Covid Dashboard".

    Returns:
        A DataFrame built from the mapping returned by ``self._parse_metrics``.

    Raises:
        ValueError: if the iframe, or its ``src``, cannot be found.
    """
    # ``find`` returns None on a miss, so check the element before reading
    # its attribute; otherwise the ValueError below is never reached.
    iframe = soup.find("iframe", {"title": "Covid Dashboard"})
    link = iframe.get("src") if iframe is not None else None
    if not link:
        raise ValueError("Dashboard not found, please update the script")

    # The metrics are parsed from the dashboard page itself.
    soup = get_soup(link)
    metrics = self._parse_metrics(soup)

    df = pd.DataFrame(
        {
            **metrics,
        }
    )
    return df
def _parse_data(self, soup: BeautifulSoup) -> pd.DataFrame:
    """Follow the article link and build a one-row frame from the article.

    The first anchor whose text matches ``self.regex["title"]`` is taken as
    the article link. Its URL is stored in ``self.source_url_ref`` before the
    article is fetched and parsed.

    Returns:
        A one-row DataFrame with a ``date`` column plus one column per key of
        the mapping returned by ``self._parse_metrics``.

    Raises:
        ValueError: if the anchor, or its ``href``, cannot be found.
    """
    # ``find`` returns None on a miss, so check the element before reading
    # its attribute; otherwise the ValueError below is never reached.
    anchor = soup.find("a", text=re.compile(self.regex["title"]))
    link = anchor.get("href") if anchor is not None else None
    if not link:
        raise ValueError("Article not found, please update the script")

    self.source_url_ref = link
    soup = get_soup(link)

    metrics = self._parse_metrics(soup)
    date = self._parse_date(soup)

    df = pd.DataFrame(
        {
            "date": [date],
            **metrics,
        }
    )
    return df
def _parse_data(self, soup: BeautifulSoup) -> pd.DataFrame:
    """Follow the first report article and build a one-row frame from it.

    The first ``<article>`` with class ``category-izvestuvanja`` is located
    and the href of its first anchor is taken as the article link. That URL is
    stored in ``self.source_url_ref`` before the article is fetched and parsed.

    Returns:
        A one-row DataFrame with ``Date`` and ``Cumulative total`` columns.

    Raises:
        ValueError: if the article, its anchor, or the anchor's ``href``
            cannot be found.
    """
    # Each lookup may return None, so walk the chain step by step; otherwise
    # a miss raises AttributeError/TypeError before the ValueError below.
    article = soup.find("article", {"class": "category-izvestuvanja"})
    anchor = article.find("a") if article is not None else None
    link = anchor.get("href") if anchor is not None else None
    if not link:
        raise ValueError("Article not found, please update the script")

    self.source_url_ref = link
    soup = get_soup(link)

    count = self._parse_metrics(soup)
    date = self._parse_date(soup)

    df = pd.DataFrame(
        {
            "Date": [date],
            "Cumulative total": [count],
        }
    )
    return df
def _parse_data(self, soup: BeautifulSoup) -> pd.DataFrame:
    """Locate the report PDF via the article page and parse count and date.

    The article link is resolved from ``soup`` with ``self.regex["title"]``.
    The PDF link is resolved from the article page with ``self.regex["pdf"]``
    and stored in ``self.source_url_ref`` before
    ``self._extract_text_from_pdf`` is called. The count and the date are both
    parsed from the extracted text.
    """
    article_url = self._parse_link(soup, self.regex["title"])
    article_soup = get_soup(article_url)

    self.source_url_ref = self._parse_link(article_soup, self.regex["pdf"])
    pdf_text = self._extract_text_from_pdf()

    total = self._parse_metrics(pdf_text)
    report_date = self._parse_date(pdf_text)

    return pd.DataFrame({"Cumulative total": [total], "Date": [report_date]})
def _parse_link_zip(self) -> str:
    """Return the href of the "Download her" anchor on the reference page."""
    page = get_soup(self.source_url_ref)
    download_anchor = page.find("a", string="Download her")
    return download_anchor.get("href")
def read(self) -> pd.Series:
    """Read the source page and return its parsed JSON data as a Series."""
    page = get_soup(self.source_url)
    payload = self._get_json_data(page)
    return pd.Series(self._parse_data(payload))
def read(self) -> pd.Series:
    """Fetch the source page and return whatever ``_parse_data`` extracts from it."""
    page = get_soup(self.source_url)
    parsed = self._parse_data(page)
    return parsed
def read(self) -> pd.Series:
    """Fetch the source page and wrap its parsed data in a Series."""
    page = get_soup(self.source_url)
    return pd.Series(self._parse_data(page))
def read(self):
    """Resolve the PDF link from the source page and parse the data behind it."""
    page = get_soup(self.source_page)
    pdf_link = self._parse_link_pdf(page)
    parsed = self._parse_data(pdf_link)
    return parsed
def _get_text_from_url(self, url: str) -> str:
    """Return the page text, flattened and normalised for number parsing.

    Newlines become spaces, non-breaking spaces are removed, the text is
    lower-cased, and a comma sitting between two digits is dropped.
    """
    raw_text = get_soup(url).get_text()
    flattened = raw_text.replace("\n", " ").replace("\xa0", "").lower()
    return re.sub(r"(\d),(\d)", r"\1\2", flattened)
def read(self) -> pd.Series:
    """Fetch the source page, parse it and return the result as a Series."""
    page = get_soup(self.source_url)
    parsed = self._parse_data(page)
    return pd.Series(data=parsed)
def read(self) -> pd.DataFrame:
    """Fetch the reference page and return what ``_parse_data`` extracts from it."""
    page = get_soup(self.source_url_ref)
    return self._parse_data(page)
def read(self):
    """Return the parsed data and the per-manufacturer data from the source page.

    Both results are derived from the same JSON payload extracted from the
    page; they are returned as a ``(data, manufacturer_data)`` tuple.
    """
    page = get_soup(self.source_url)
    payload = self._get_json_data(page)
    return self._parse_data(payload), self._parse_data_manufacturer(payload)
def read(self) -> pd.DataFrame:
    """Fetch the source page and return the frame parsed from it."""
    page = get_soup(self.source_url)
    parsed = self._parse_data(page)
    return parsed
def read(self):
    """Load the CSV behind the last ``data-link`` element of the source page.

    Only the ``date`` and ``total tests`` columns are read.
    """
    page = get_soup(self.source_url)
    data_links = page.find_all(class_="data-link")
    csv_url = data_links[-1]["href"]
    return pd.read_csv(csv_url, usecols=["date", "total tests"])