def _parse_date(self, dose1_elem, dose2_elem):
    """Extract the report date from both dose widgets and ensure they agree.

    Each element is expected to contain an <h2> with the raw date text.

    Raises:
        ValueError: if the two widgets report different dates.
    """
    dates = []
    for elem in (dose1_elem, dose2_elem):
        raw = elem.find("h2").text
        dates.append(extract_clean_date(raw, self.regex["date"], "%B %d, %Y", minus_days=1, lang="en"))
    if dates[0] != dates[1]:
        raise ValueError("Dates in first and second doses are not aligned")
    return dates[0]
def _parse_data(self) -> dict:
    """Scrape the vaccination dashboard with Selenium.

    Returns:
        dict with "total_vaccinations", "people_fully_vaccinated" and "date".

    Raises:
        ValueError: if total_vaccinations < people_fully_vaccinated (sanity check).
    """
    with get_driver() as driver:
        driver.get(self.source_url)
        time.sleep(2)  # allow the JS-rendered counters to populate
        spans = [
            span for span in driver.find_elements_by_tag_name("span") if span.get_attribute("data-text")
        ]
        # Date. Raw string fixes invalid-escape SyntaxWarning ("\(" in a non-raw
        # literal); the pattern value is unchanged. Dashboard abbreviates "Sept".
        date = extract_clean_date(
            spans[6].text.replace("Sept", "Sep"),
            r"\(as of ([a-zA-Z]+)\.\s?(\d{1,2}), (20\d{2})\)",
            "%b %d %Y",
            lang="en",
        )
        # Metrics — positional span indices depend on the dashboard layout.
        total_vaccinations = clean_count(spans[8].text)
        people_fully_vaccinated = clean_count(spans[15].text)
        if total_vaccinations < people_fully_vaccinated:
            raise ValueError(
                "Check values for:\n"
                f"total_vaccinations\t\t{total_vaccinations}\npeople_fully_vaccinated\t\t{people_fully_vaccinated}"
            )
        return {
            "total_vaccinations": total_vaccinations,
            # "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "date": date,
        }
def main():
    """Update the Lebanon testing CSV with the latest cumulative total."""
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Lebanon.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "https://corona.ministryinfo.gov.lb/"
    soup = get_soup(source_url)

    counter = soup.find("h1", class_="s-counter3")
    cumulative_total = clean_count(counter.text)

    date_raw = soup.select(".last-update strong")[0].text
    date = extract_clean_date(date_raw, regex=r"([A-Za-z]+ \d+)", date_format="%b %d", replace_year=2021)

    # Append only when the site reports a value beyond the recorded maximum.
    if cumulative_total > data["Cumulative total"].max():
        row = pd.DataFrame(
            {
                "Cumulative total": cumulative_total,
                "Date": [date],
                "Country": "Lebanon",
                "Units": "tests performed",
                "Source URL": source_url,
                "Source label": "Lebanon Ministry of Health",
            }
        )
        pd.concat([row, data], sort=False).to_csv(path, index=False)
def main():
    """Update the Kenya testing CSV with the latest cumulative sample count."""
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Kenya.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "http://covidkenya.org/"
    soup = get_soup(source_url)

    counter = soup.find("div", class_="elementor-element-b36fad5").find(class_="elementor-text-editor")
    cumulative_total = clean_count(counter.text)

    date_raw = soup.select(".elementor-element-75168b2 p")[0].text
    date = extract_clean_date(
        date_raw,
        regex=r"\[Updated on ([A-Za-z]+ \d+) \[\d\d:\d\d\]",
        date_format="%B %d",
        replace_year=2021,
    )

    # Append only when the site reports a value beyond the recorded maximum.
    if cumulative_total > data["Cumulative total"].max():
        row = pd.DataFrame({
            "Cumulative total": cumulative_total,
            "Date": [date],
            "Country": "Kenya",
            "Units": "samples tested",
            "Source URL": source_url,
            "Source label": "Kenya Ministry of Health",
        })
        pd.concat([row, data], sort=False).to_csv(path, index=False)
def parse_data_news_page(self, soup: BeautifulSoup):
    r"""Parse vaccinated / fully vaccinated / booster counts from a news page.

    (Raw docstring: it quotes a regex containing ``\d``, which otherwise emits
    an invalid-escape SyntaxWarning.)

    2021-09-10
    We received confirmation from the International Communications Office, State
    Secretariat for International Communications and Relations, that the part of
    the report referring to people who received the 2nd dose
    ("közülük ([\d ]+) fő már a második oltását is megkapt") also included those
    who have received the J&J vaccine. On the other hand, we cannot estimate the
    number of vaccinations administered, as adding the two reported metrics
    would count J&J vaccines twice.
    """
    text = clean_string(soup.find(class_="page_body").text)
    match = re.search(self.regex["metrics"], text)
    people_vaccinated = clean_count(match.group(1))
    people_fully_vaccinated = clean_count(match.group(2))
    total_boosters = clean_count(match.group(3))
    return {
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_boosters": total_boosters,
        "date": extract_clean_date(
            soup.find("p").text,
            # Raw string: fixes invalid-escape SyntaxWarning; value unchanged.
            regex=r"(202\d. .* \d+.) - .*",
            date_format="%Y. %B %d.",
            loc="hu_HU.UTF-8",
            minus_days=1,
        ),
    }
def _parse_data(self, driver, url):
    """Load *url* and scrape date and cumulative doses from the #xw_box element."""
    driver.get(url)
    text = driver.find_element_by_id("xw_box").text
    total = clean_count(re.search(self.regex["total_vaccinations"], text).group(1))
    return {
        "date": extract_clean_date(text, self.regex["date"], "%Y %m %d"),
        "total_vaccinations": total,
        "source_url": url,
    }
def parse_infogram_date(self, infogram_data: dict) -> str:
    """Extract the report date (DD-MON-YY, Spanish) from the infogram payload.

    NOTE(review): ``field_id`` looks like two concatenated UUIDs — confirm it
    matches the infogram entity id (a sibling scraper uses a single 36-char id).
    """
    field_id = "d58d673d-f6f7-44d2-8825-8f83ea806a695bbc73d2-f5d6-493e-890b-b7499db493a2"
    x = self._get_infogram_value(infogram_data, field_id, join_text=True)
    # Raw string: fixes invalid-escape SyntaxWarning for \s and \d; value unchanged.
    dt = extract_clean_date(x, r"RESUMEN DE VACUNACIÓN\s?(\d+-[A-Z]+-2\d)\s?", "%d-%b-%y", lang="es")
    return dt
def parse_date(self, soup):
    """Extract the Spanish report date ("Datos: a DD <month> de YYYY") from the page."""
    pattern = r"Datos: a (\d+ \w+ de 20\d{2})"
    return extract_clean_date(
        text=soup.text, regex=pattern, date_format="%d %B de %Y", lang="es", unicode_norm=True
    )
def _parse_data_date(self, soup) -> dict:
    """Build the date field; the page omits the year, so append the current one.

    NOTE(review): appending today's year misdates reports scraped just after
    New Year — confirm this is acceptable for this source.
    """
    raw = soup.find(class_="text-gray-500").text.strip()
    raw = f"{raw}{datetime.date.today().year}"
    return {"date": extract_clean_date(raw, self.regex["date"], "%d. %B%Y", lang="en")}
def parse_infogram_date(self, infogram_data: dict) -> str:
    """Extract the report date (DD-MON-YY, Spanish) from the infogram payload."""
    x = self._get_infogram_value(infogram_data, "d58d673d-f6f7-44d2-8825-8f83ea806a69", join_text=True)
    # Raw string: fixes invalid-escape SyntaxWarning for \s and \d; value unchanged.
    dt = extract_clean_date(x, r"RESUMEN DE VACUNACIÓN\s?(\d+-[A-Z]+-2\d)\s?", "%d-%b-%y", lang="es")
    return dt
def _parse_data(self):
    """Scrape the cumulative count and last-updated date from the source page."""
    soup = get_soup(self.source_url)
    count_text = soup.select_one("#renderbody table th span").text
    date_text = soup.select_one("#lastupdated ul li").text
    return {
        "count": clean_count(count_text),
        "date": extract_clean_date(date_text, regex=r"(\d+/\d+/20\d+).*", date_format="%d/%m/%Y"),
    }
def _parse_data(self, data: dict) -> pd.DataFrame:
    """Parse the API payload into a one-row DataFrame (Date, daily change)."""
    count = clean_count(data["samples_collected"])
    raw_date = data["screen_updated_times"]["toplevel_page_acf-options-statistics"]
    date = extract_clean_date(raw_date, self.regex["date"], "%d/%m/%Y")
    return pd.DataFrame({"Date": [date], "Daily change in cumulative total": [count]})
def parse_data(self, soup):
    """Extract the report date and total doses from the page's first HTML table."""
    first_table = soup.find_all("table")[0]
    table = pd.read_html(str(first_table))[0].squeeze()
    date = extract_clean_date(text=str(soup.text), regex=self._regex_date, date_format="%d %B %Y", lang="en")
    total = clean_count(table.loc[table[0] == "Total doses", 1].values[0])
    return pd.Series({"date": date, "total_vaccinations": total})
def parse_data(self, soup: BeautifulSoup) -> pd.Series:
    """Walk the <h6> headers: dose counters first, then the date header.

    NOTE(review): relies on the dose headers appearing before the date header;
    if the date header came first, people_vaccinated / people_fully_vaccinated
    would be unbound at return — confirm the page order is stable.
    Implicitly returns None when no header matches the date regex.
    """
    h6 = soup.find_all("h6")
    for i, h in enumerate(h6):
        # print(i)
        text = h.text.strip()
        if text == "1ière dose":
            # First-dose counter lives in a sibling <h3> of the header.
            people_vaccinated = clean_count(h.parent.find("h3").text)
        elif text == "2ième dose":
            people_fully_vaccinated = clean_count(h.parent.find("h3").text)
        else:
            # Any other header is a candidate for the report date.
            match = re.search(self.regex["date"], text)
            if match:
                date_str = extract_clean_date(text, self.regex["date"], "%d-%m-%Y")
                return pd.Series({
                    "people_vaccinated": people_vaccinated,
                    "people_fully_vaccinated": people_fully_vaccinated,
                    "date": date_str,
                })
def _parse_date(self, driver) -> pd.Series:
    """Read the reporting-period end date from the dashboard's data-set label."""
    label = driver.find_element_by_class_name("full_data_set").text
    pattern = r"Time period: 29 January 2020 - (\d{2} [a-zA-Z]+ 202\d)"
    return extract_clean_date(label, pattern, "%d %B %Y", lang="en")
def _parse_date(self, df_list: list) -> str:
    """Find the frame that carries the date column and parse its first cell."""
    date_col = self.columns_to_check["date"]
    candidates = [df for df in df_list if date_col in df.columns]
    raw = candidates[0].iat[0, 0]
    return extract_clean_date(raw.lower(), regex=self.regex["date"], date_format="%d %B %Y")
def _parse_date(self, text: str) -> str:
    """Extract a '%b %d %Y' date from *text* (lower-cased before matching)."""
    normalized = text.lower()
    return extract_clean_date(normalized, self.regex["date"], "%b %d %Y")
def _parse_date_from_soup(self, soup: BeautifulSoup) -> str:
    """Locate the last-update element under the matching header and parse its date."""
    header = soup.find(text=self.regex["header"])
    last_update = header.parent.findChild(id="last-update")
    return extract_clean_date(last_update.text, self.regex["date"], "%d %b, %Y")
def _parse_date_from_text(self, soup) -> str:
    """Parse a DD/MM/YYYY date from the first .detail-time div."""
    raw = soup.select(".detail-time div")[0].text
    return extract_clean_date(raw, r"(\d{2}\/\d{2}\/\d{4})", "%d/%m/%Y")
def _parse_date(self):
    """Scrape the source page and extract the report date (DD-MM-YYYY).

    Fixes: removed a leftover debug ``print(self.source_url)``; raw string for
    the regex silences invalid-escape SyntaxWarnings (pattern value unchanged).
    """
    soup = get_soup(self.source_url)
    return extract_clean_date(
        soup.text, r"Reporte (?:(?:V|v)acunación|COVID\-19) (\d\d\-\d\d\-20\d\d)", "%d-%m-%Y"
    )
def _parse_date(self, element):
    """Get the date from the report file title, e.g. '... (Last updated: 01/02/2021 ...)'."""
    pattern = r".* \(Last updated: (\d\d\/\d\d\/20\d\d) .*\)"
    return extract_clean_date(element.text, pattern, "%d/%m/%Y")
def _parse_date(self):
    """Scrape and parse the 'DD Month YYYY' date from the italic pane element."""
    soup = get_soup(self.source_url)
    raw = soup.select_one(".pane-content .georgia-italic").text
    # Raw-string literal yields the identical pattern the original built with
    # doubled backslashes.
    return extract_clean_date(raw, regex=r"(\d+ \w+ 202\d)", date_format="%d %B %Y")
def _parse_date(self, text: str) -> str:
    """Extract a Portuguese-locale date ('%d %B %Y') from the element text."""
    lowered = text.lower()
    return extract_clean_date(lowered, self.regex["date"], "%d %B %Y", lang="pt")
def _parse_date(self, link: str) -> str:
    """Extract the DD-MM-YYYY date embedded in *link*."""
    date_pattern = self.regex["date"]
    return extract_clean_date(link, date_pattern, "%d-%m-%Y")
def _parse_date(self, soup: BeautifulSoup) -> str:
    """Parse the DD-MM-YYYY date from the first <span> inside the table."""
    span = soup.find("table").find("span")
    return extract_clean_date(span.text, self.regex["date"], "%d-%m-%Y")
def _parse_date(self, url: str) -> str:
    """Parse the date from the PDF filename embedded in *url*.

    Raw string fixes invalid-escape SyntaxWarnings (\-, \d, \w); pattern value
    is unchanged. NOTE(review): the "." before "pdf" is an unescaped any-char —
    kept as-is to preserve matching behavior; confirm whether ``\.pdf`` was meant.
    """
    rex = r".*\-(\d+)\w+\-(\w+)\-(20\d\d).*.pdf"
    return extract_clean_date(url, rex, "%d %B %Y")