def _get_text_from_url(self, url: str) -> str:
    """Return the page-description text of *url* with thousands commas stripped."""
    # NOTE(review): verify=False disables TLS certificate validation —
    # presumably the site serves a broken certificate; confirm before removing.
    soup = get_soup(url, verify=False)
    description = soup.find("div", class_="pageDescription")
    return description.get_text(strip=True).replace(",", "")
def read(self):
    """Fetch the source page and return its parsed count and date."""
    soup = get_soup(self.source_url)
    return {
        "count": self._parse_count(soup),
        "date": self._parse_date(soup),
    }
def find_article(self) -> str:
    """Return the first feed-item child containing the target article slug.

    NOTE(review): falls through to an implicit ``None`` when no item matches,
    despite the ``str`` annotation — confirm callers handle that case.
    """
    soup = get_soup(self.feed_url)
    for item in soup.find_all("item"):
        for child in item.children:
            if "local-covid-19-situation" in child:
                return child
def main():
    """Scrape Nigeria's cumulative testing total and prepend it to the CSV."""
    path = os.path.join(
        get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Nigeria.csv"
    )
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "http://covid19.ncdc.gov.ng/"
    soup = get_soup(source_url)
    counter = soup.find("div", class_="col-xl-3").find("span")
    cumulative_total = clean_count(counter.text)

    # Only record a new row when the site reports a strictly higher total.
    if cumulative_total > data["Cumulative total"].max():
        row = pd.DataFrame(
            {
                "Date": [localdate("Africa/Lagos")],
                "Cumulative total": cumulative_total,
                "Country": "Nigeria",
                "Units": "samples tested",
                "Source URL": source_url,
                "Source label": "Nigeria Centre for Disease Control",
            }
        )
        pd.concat([row, data], sort=False).to_csv(path, index=False)
def main():
    """Scrape Azerbaijan's cumulative test count and prepend it to the CSV."""
    path = os.path.join(
        get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Azerbaijan.csv"
    )
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "https://koronavirusinfo.az/az/page/statistika/azerbaycanda-cari-veziyyet"
    soup = get_soup(source_url)
    # The sixth statistic box on the page holds the cumulative test counter.
    counter = soup.find_all("div", class_="gray_little_statistic")[5].find("strong")
    cumulative_total = clean_count(counter.text)

    # Only record a new row when the site reports a strictly higher total.
    if cumulative_total > data["Cumulative total"].max():
        row = pd.DataFrame(
            {
                "Cumulative total": cumulative_total,
                "Date": [localdate("Asia/Baku")],
                "Country": "Azerbaijan",
                "Units": "tests performed",
                "Source URL": source_url,
                "Source label": "Cabinet of Ministers of Azerbaijan",
            }
        )
        pd.concat([row, data], sort=False).to_csv(path, index=False)
def main():
    """Scrape Kenya's cumulative sample count and prepend it to the CSV."""
    path = os.path.join(
        get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Kenya.csv"
    )
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "http://covidkenya.org/"
    soup = get_soup(source_url)
    counter = soup.find("div", class_="elementor-element-b36fad5").find(class_="elementor-text-editor")
    cumulative_total = clean_count(counter.text)

    # The page omits the year, so it is pinned explicitly via replace_year.
    date_raw = soup.select(".elementor-element-75168b2 p")[0].text
    date = extract_clean_date(
        date_raw,
        regex=r"\[Updated on ([A-Za-z]+ \d+) \[\d\d:\d\d\]",
        date_format="%B %d",
        replace_year=2021,
    )

    # Only record a new row when the site reports a strictly higher total.
    if cumulative_total > data["Cumulative total"].max():
        row = pd.DataFrame(
            {
                "Cumulative total": cumulative_total,
                "Date": [date],
                "Country": "Kenya",
                "Units": "samples tested",
                "Source URL": source_url,
                "Source label": "Kenya Ministry of Health",
            }
        )
        pd.concat([row, data], sort=False).to_csv(path, index=False)
def main():
    """Scrape Lebanon's cumulative test count and prepend it to the CSV."""
    path = os.path.join(
        get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Lebanon.csv"
    )
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "https://corona.ministryinfo.gov.lb/"
    soup = get_soup(source_url)
    counter = soup.find("h1", class_="s-counter3")
    cumulative_total = clean_count(counter.text)

    # The page omits the year, so it is pinned explicitly via replace_year.
    date_raw = soup.select(".last-update strong")[0].text
    date = extract_clean_date(
        date_raw,
        regex=r"([A-Za-z]+ \d+)",
        date_format="%b %d",
        replace_year=2021,
    )

    # Only record a new row when the site reports a strictly higher total.
    if cumulative_total > data["Cumulative total"].max():
        row = pd.DataFrame(
            {
                "Cumulative total": cumulative_total,
                "Date": [date],
                "Country": "Lebanon",
                "Units": "tests performed",
                "Source URL": source_url,
                "Source label": "Lebanon Ministry of Health",
            }
        )
        pd.concat([row, data], sort=False).to_csv(path, index=False)
def main():
    """Scrape Tunisia's cumulative people-tested count and prepend it to the CSV."""
    path = os.path.join(
        get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Tunisia.csv"
    )
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "https://onmne.tn"
    soup = get_soup(source_url)
    # The counter widget stores its target value as JSON in a data attribute.
    milestone = soup.find("span", class_="vcex-milestone-time")
    cumulative_total = json.loads(milestone.attrs["data-options"])["endVal"]

    # NOTE(review): parsing "%d %B %Y" against French month names relies on the
    # process locale — confirm the runtime locale is French.
    date_raw = soup.select("p span")[0].text.replace("Chiffres clés mis à jour le ", "")
    date = pd.to_datetime(date_raw, format="%d %B %Y").strftime("%Y-%m-%d")

    # Only record a new row when the site reports a strictly higher total.
    if cumulative_total > data["Cumulative total"].max():
        row = pd.DataFrame(
            {
                "Cumulative total": cumulative_total,
                "Date": [date],
                "Country": "Tunisia",
                "Units": "people tested",
                "Source URL": source_url,
                "Source label": "Tunisia Ministry of Health",
            }
        )
        pd.concat([row, data], sort=False).to_csv(path, index=False)
def read(source: str) -> pd.Series:
    """Parse vaccination counters from the free-text statistics paragraph."""
    soup = get_soup(source)
    text = soup.find("div", id="data").find("p").text

    date = clean_date(re.search(r"На сегодня \(([\d\.]{8})\)", text).group(1), "%d.%m.%y")

    def _count(pattern: str) -> int:
        # Every metric follows the same "<number> <unit> - <label>" shape.
        return clean_count(re.search(pattern, text).group(1))

    return pd.Series({
        "total_vaccinations": _count(r"([\d\s]+) шт\. - всего прививок сделано"),
        "people_vaccinated": _count(
            r"([\d\s]+) чел\. \([\d\.]+% от населения[^)]*\) - привито хотя бы одним компонентом вакцины"
        ),
        "people_fully_vaccinated": _count(
            r"([\d\s]+) чел\. \([\d\.]+% от населения,?[^)]*\) - полностью привито"
        ),
        "total_boosters": _count(r"([\d\s]+) чел\. - прошли ревакцинацию"),
        "date": date,
    })
def _get_text_from_url(self, url: str) -> str:
    """Extract whitespace-normalized text from URL."""
    text = get_soup(url).get_text()
    # Join digit groups split by a dot separator, then collapse all whitespace.
    text = re.sub(r"(\d)\.(\d)", r"\1\2", text)
    return re.sub(r"\s+", " ", text)
def _get_text_and_date_from_url(self, url: str) -> tuple:
    """Return ``(text, date)`` scraped from *url*."""
    soup = get_soup(url)
    date = self._parse_date(soup)
    raw = soup.find(class_="news-detail").text
    # Flatten newlines / non-breaking spaces, then merge split digit groups.
    text = raw.replace("\n", " ").replace("\xa0", "")
    text = re.sub(r'(\d)\s+(\d)', r'\1\2', text)
    return text, date
def _parse_count(self, soup):
    """Return the cumulative "Общо" (total) value from the RT PCR table.

    Scans every HTML table on the source page for one carrying the expected
    Bulgarian column headers and an "RT PCR" row.

    Raises:
        ValueError: if no matching table is found (page format changed).

    NOTE(review): the ``soup`` argument is immediately overwritten by a fresh
    fetch of ``self.source_url`` — confirm whether the parameter could be used
    directly to avoid the duplicate request.
    """
    soup = get_soup(self.source_url)
    tables = pd.read_html(str(soup))
    required_columns = {"Тип", "Общо", "Нови"}
    for table in tables:
        if not required_columns.difference(table.columns) and "RT PCR" in table["Тип"].tolist():
            return table.loc[table["Тип"] == "Общо", "Общо"].item()
    # Fix: the original raised with an f-string that had no placeholders (F541).
    raise ValueError("Table not found! It may have changed its format.")
def _get_relevant_table(self, url: str) -> str:
    """Get the table with the relevant data.

    Returns the matching table serialized as an HTML string — the original
    ``element.Tag`` annotation was inaccurate, since the function returns
    ``str(table)``.

    Raises ``IndexError`` when no table has a "Tests COVID-19" caption, and
    ``AttributeError`` when a table lacks a ``<caption>`` child.
    """
    soup = get_soup(url)
    tables = soup.find_all("table")
    # Keep only the table whose caption matches; [0] assumes at least one match.
    table = [
        table for table in tables if table.findChild("caption").text == "Tests COVID-19"
    ][0]
    return str(table)
def _get_records(self, url: str) -> dict:
    """Build one record per data table found inside the news-content element.

    NOTE(review): returns a list of dicts despite the ``dict`` annotation.
    """
    container = get_soup(url).find(id="newsContent")
    records = []
    for table in container.find_all("table"):
        records.append({
            "Date": self._parse_date(table),
            "Cumulative total": self._parse_metric(table),
        })
    return records
def _load_data(self, data_id):
    """Load the embedded JSON payload for *data_id* from the source site.

    Raises ``ValueError`` when the expected element is missing from the page.
    """
    page = str(get_soup(f"{self.source_url}{data_id}"))
    match = re.search(self.regex["element"], page)
    if match is None:
        raise ValueError(
            "Website Structure Changed, please update the script")
    return json.loads(match.group(1))
def _parse_data(self):
    """Scrape the cumulative count and its date from the dashboard widgets."""
    soup = get_soup(self.source_url)
    count_text = soup.select(".bg-success:nth-child(1) .info-box-number")[0].text
    date_text = soup.select("span+ span")[0].text
    return {
        "count": clean_count(count_text),
        "date": clean_date(date_text, "%d/%m/%Y"),
    }
def _parse_data(self):
    """Extract the cumulative count and last-updated date from the page."""
    soup = get_soup(self.source_url)
    count = clean_count(soup.select_one("#renderbody table th span").text)
    raw_date = soup.select_one("#lastupdated ul li").text
    date = extract_clean_date(raw_date, regex=r"(\d+/\d+/20\d+).*", date_format="%d/%m/%Y")
    return {"count": count, "date": date}
def read(self) -> pd.Series:
    """Walk the paginated listing until the parser signals a stop."""
    data = []
    for page in range(1, self._num_max_pages + 1):
        soup = get_soup(f"{self._base_url}{self._url_subdirectory}{page}")
        # _parse_data returns the (replacement) data plus a continue flag.
        data, proceed = self._parse_data(soup)
        if not proceed:
            break
    return pd.Series(data)
def read(source: str) -> pd.Series:
    """Fetch *source* and parse the page into a data series."""
    # Fix: removed a browser-like ``headers`` dict that was built but never
    # passed to get_soup (dead code with no effect on the request).
    soup = get_soup(source)
    return parse_data(soup)
def read(self) -> pd.Series:
    """Read data from source.

    Walks up to ``_num_max_pages`` listing pages, attempting up to
    ``_num_rows_per_page`` parses per page, and returns the accumulated data
    as soon as the parser signals a stop.

    NOTE(review): returns ``None`` (not a Series) when all pages/rows are
    consumed without ``proceed`` ever going falsy — confirm callers handle
    that. Each inner iteration re-parses the same ``soup``; presumably
    ``_parse_data`` keeps internal position state between calls — verify.
    """
    data = []
    for cnt in range(1, self._num_max_pages + 1):
        url = f"{self.source_url}{cnt}/"
        soup = get_soup(url)
        # One parse attempt per expected row on the page.
        for _ in range(self._num_rows_per_page):
            data, proceed = self._parse_data(soup)
            if not proceed:
                return pd.Series(data)
    return None
def main():
    """Build the weekly hospital-admissions dataframe from the stats widget."""
    soup = get_soup(METADATA["source_url_ref"])
    # The <cv-stats-virus> component carries its chart data as a JSON attribute.
    records = json.loads(soup.find("cv-stats-virus")[":charts-data"])
    df = pd.DataFrame.from_records(records, columns=["date", "hospitalized"])
    df = df.rename(columns={"hospitalized": "value"})
    df["entity"] = METADATA["entity"]
    df["indicator"] = "Weekly new hospital admissions"
    df["date"] = clean_date_series(df.date, "%d.%m.%Y")
    df = df[df.value > 0].sort_values("date")
    # Daily values -> rolling 7-day totals; drop the warm-up rows.
    df["value"] = df.value.rolling(7).sum()
    df = df.dropna(subset=["value"])
    return df, METADATA
def _parse_data(self, soup: BeautifulSoup) -> dict:
    """Assemble the scraped record (source url, date, count) from the page."""
    url = self._get_relevant_link(soup)
    text = self._get_text_from_url(url)
    # The date is parsed from the linked page itself, so fetch it separately.
    date = self._parse_date_from_text(get_soup(url))
    return {
        "source_url": url,
        "date": date,
        "count": self._parse_metrics(text),
    }
def read(self) -> pd.DataFrame:
    """Read daily test counts (total and negative) from the counter widgets."""
    body = str(get_soup(self.source_url))

    def _counter_before(marker: str) -> int:
        # The value immediately precedes the marker as data-counter-value="N";
        # missing markers yield 0.
        if marker not in body:
            return 0
        return int(body.split(marker)[0].split('data-counter-value="')[-1].split('"')[0])

    count = _counter_before("Totaal Testen")
    negative = _counter_before("Totaal negatieve")
    return pd.DataFrame(
        {
            "Date": [localdate("America/Paramaribo")],
            "Daily change in cumulative total": [count],
            "positive": [count - negative],
        }
    )
def _load_data(self, data_id: str) -> pd.DataFrame:
    """Load chart data for *data_id* from the embedded Infogram JSON.

    Args:
        data_id: Infogram data identifier appended to ``self.source_url``.

    Returns:
        DataFrame with a "Date" column plus one column per series.

    Raises:
        ValueError: if the expected script payload is missing from the page.
    """
    url = f"{self.source_url}{data_id}"
    soup = get_soup(url)
    match = re.search(self.regex["element"], str(soup))
    if not match:
        raise ValueError(
            "Website Structure Changed, please update the script")
    data = json.loads(match.group(1))
    data = data["elements"]["content"]["content"]["entities"]
    # Keep the first entity whose values match the configured title regex.
    data = [
        data[idx] for idx in data
        if re.search(self.regex["title"], str(data[idx].values()))
    ][0]
    data_list = data["props"]["chartData"]["data"]
    # Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0 —
    # collect the frames and concatenate once instead.
    frames = []
    for frame in data_list:
        col = frame.pop(0)
        col[0] = "Date"
        frames.append(pd.DataFrame(frame, columns=col))
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return df
def _get_data_id_from_source(self, source_url: str) -> str:
    """Return the Infogram embed's ``data-id`` attribute from the source page."""
    embed = get_soup(source_url).find(class_="infogram-embed")
    return embed["data-id"]
def read(self) -> pd.Series:
    """Read data from source."""
    parsed = self._parse_data(get_soup(self.source_url))
    return pd.Series(parsed)
def read(source: str) -> pd.Series:
    """Fetch *source* and delegate parsing of the page."""
    return parse_data(get_soup(source))
def main(paths):
    """Scrape Iceland vaccination totals (overall and by manufacturer) from Infogram."""
    url = "https://e.infogram.com/c3bc3569-c86d-48a7-9d4c-377928f102bf"
    soup = get_soup(url)
    # The Infogram page embeds its full dataset as a JS assignment in a <script>;
    # NOTE(review): json_data is unbound if no script matches — would raise NameError.
    for script in soup.find_all("script"):
        if "infographicData" in str(script):
            json_data = str(script).replace("<script>window.infographicData=", "").replace(";</script>", "")
            json_data = json.loads(json_data)
            break
    # Infogram entity IDs for each headline metric widget.
    metric_entities = {
        "total_vaccinations": "7287c058-7921-4abc-a667-ce298827c969",
        "people_vaccinated": "8d14f33a-d482-4176-af55-71209314b07b",
        "people_fully_vaccinated": "16a69e30-01fd-4806-920c-436f8f29e9bf",
        "total_boosters": "209af2de-9927-4c51-a704-ddc85e28bab9",
    }
    data = {}
    for metric, entity in metric_entities.items():
        value = json_data["elements"]["content"]["content"]["entities"][
            entity]["props"]["chartData"]["data"][0][0][0]
        # The cell is an HTML snippet; the number follows the 18px style attribute.
        value = re.search(r'18px;">([\d\.]+)', value).group(1)
        value = clean_count(value)
        data[metric] = value
    date = json_data["updatedAt"][:10]
    increment(
        paths=paths,
        location="Iceland",
        total_vaccinations=data["total_vaccinations"],
        people_vaccinated=data["people_vaccinated"],
        people_fully_vaccinated=data["people_fully_vaccinated"],
        total_boosters=data["total_boosters"],
        date=date,
        source_url="https://www.covid.is/tolulegar-upplysingar-boluefni",
        vaccine=", ".join(sorted(VACCINE_MAPPING.values())),
    )
    # By manufacturer: first row is the header (date + vaccine names).
    data = json_data["elements"]["content"]["content"]["entities"][
        "e329559c-c3cc-48e9-8b7b-1a5f87ea7ad3"]["props"]["chartData"]["data"][0]
    df = pd.DataFrame(data[1:]).reset_index(drop=True)
    df.columns = ["date"] + data[0][1:]
    df = df.melt("date", var_name="vaccine", value_name="total_vaccinations")
    df["date"] = pd.to_datetime(df["date"], format="%d.%m.%y").astype(str)
    df["total_vaccinations"] = pd.to_numeric(df["total_vaccinations"], errors="coerce").fillna(0)
    # Daily doses -> cumulative totals per vaccine.
    df["total_vaccinations"] = df.sort_values("date").groupby(
        "vaccine", as_index=False)["total_vaccinations"].cumsum()
    df["location"] = "Iceland"
    # NOTE(review): assert is stripped under -O; consider raising instead.
    assert set(df["vaccine"].unique()) == set(VACCINE_MAPPING.keys(
    )), f"Vaccines present in data: {df['vaccine'].unique()}"
    df = df.replace(VACCINE_MAPPING)
    df.to_csv(paths.tmp_vax_out_man("Iceland"), index=False)
    export_metadata(df, "Ministry of Health", url, paths.tmp_vax_metadata_man)
def _parse_date(self):
    """Extract the report date (DD-MM-YYYY) from the source page text.

    Fixes: removed a leftover debug ``print`` of the source URL, and made the
    regex a raw string (the original plain string contained invalid escape
    sequences, which are a DeprecationWarning/SyntaxWarning in modern Python).
    """
    soup = get_soup(self.source_url)
    return extract_clean_date(
        soup.text,
        r"Reporte (?:(?:V|v)acunación|COVID\-19) (\d\d\-\d\d\-20\d\d)",
        "%d-%m-%Y",
    )
def read(self) -> pd.Series:
    """Fetch the source page and delegate parsing."""
    page = get_soup(self.source_url)
    return self._parse_data(page)