def _parse_date(self, dose1_elem, dose2_elem):
    """Extract the report date from both dose elements and verify they agree."""
    dates = [
        extract_clean_date(elem.find("h2").text, self.regex["date"], "%B %d, %Y", minus_days=1, lang="en")
        for elem in (dose1_elem, dose2_elem)
    ]
    if dates[0] != dates[1]:
        raise ValueError("Dates in first and second doses are not aligned")
    return dates[0]
Example #2
0
 def _parse_data(self) -> dict:
     """Scrape the report date and vaccination metrics from the source page.

     Returns a dict with total_vaccinations, people_fully_vaccinated and date.
     Raises ValueError if total_vaccinations < people_fully_vaccinated.
     """
     with get_driver() as driver:
         driver.get(self.source_url)
         time.sleep(2)  # allow JS-rendered spans to load before querying them
         spans = [
             span for span in driver.find_elements_by_tag_name("span")
             if span.get_attribute("data-text")
         ]
         # Date — raw string: `\(`, `\s`, `\d` are invalid escape sequences in a plain literal
         date = extract_clean_date(
             spans[6].text.replace("Sept", "Sep"),  # normalize non-standard month abbreviation
             r"\(as of ([a-zA-Z]+)\.\s?(\d{1,2}), (20\d{2})\)",
             "%b %d %Y",
             lang="en",
         )
         # Metrics — span positions are fixed by the page layout
         total_vaccinations = clean_count(spans[8].text)
         people_fully_vaccinated = clean_count(spans[15].text)
     # Sanity check: completed courses can never exceed doses administered
     if total_vaccinations < people_fully_vaccinated:
         raise ValueError(
             "Check values for:\n"
             f"total_vaccinations\t\t{total_vaccinations}\npeople_fully_vaccinated\t\t{people_fully_vaccinated}"
         )
     return {
         "total_vaccinations": total_vaccinations,
         # "people_vaccinated": people_vaccinated,
         "people_fully_vaccinated": people_fully_vaccinated,
         "date": date,
     }
Example #3
0
def main():
    """Scrape Lebanon's cumulative testing figure and prepend it to the CSV sheet."""
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Lebanon.csv")
    existing = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "https://corona.ministryinfo.gov.lb/"
    soup = get_soup(source_url)

    counter_elem = soup.find("h1", class_="s-counter3")
    cumulative_total = clean_count(counter_elem.text)

    date_raw = soup.select(".last-update strong")[0].text
    date = extract_clean_date(date_raw, regex=r"([A-Za-z]+ \d+)", date_format="%b %d", replace_year=2021)

    # Only write when the scraped counter has moved past what the sheet already holds
    if cumulative_total > existing["Cumulative total"].max():
        row = pd.DataFrame(
            {
                "Cumulative total": cumulative_total,
                "Date": [date],
                "Country": "Lebanon",
                "Units": "tests performed",
                "Source URL": source_url,
                "Source label": "Lebanon Ministry of Health",
            }
        )
        pd.concat([row, existing], sort=False).to_csv(path, index=False)
Example #4
0
def main():
    """Scrape Kenya's cumulative sample count and prepend it to the CSV sheet."""
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing",
                        "automated_sheets", "Kenya.csv")
    existing = pd.read_csv(path).sort_values(by="Date", ascending=False)

    source_url = "http://covidkenya.org/"
    soup = get_soup(source_url)

    counter_elem = soup.find("div", class_="elementor-element-b36fad5").find(
        class_="elementor-text-editor")
    cumulative_total = clean_count(counter_elem.text)

    date_raw = soup.select(".elementor-element-75168b2 p")[0].text
    date = extract_clean_date(
        date_raw,
        regex=r"\[Updated on ([A-Za-z]+ \d+) \[\d\d:\d\d\]",
        date_format="%B %d",
        replace_year=2021)

    # Only write when the scraped counter has moved past what the sheet already holds
    if cumulative_total > existing["Cumulative total"].max():
        row = pd.DataFrame({
            "Cumulative total": cumulative_total,
            "Date": [date],
            "Country": "Kenya",
            "Units": "samples tested",
            "Source URL": source_url,
            "Source label": "Kenya Ministry of Health",
        })
        pd.concat([row, existing], sort=False).to_csv(path, index=False)
Example #5
0
    def parse_data_news_page(self, soup: BeautifulSoup):
        r"""Parse vaccination counts and the report date from a news page.

        2021-09-10
        We received confirmation from the International Communications Office, State Secretariat
        for International Communications and Relations, that the part of the report referring to
        people who received the 2nd dose ("közülük ([\d ]+) fő már a második oltását is megkapt")
        also included those who have received the J&J vaccine.
        On the other hand, we cannot estimate the number of vaccinations administered, as adding
        the two reported metrics would count J&J vaccines twice.
        """
        # NOTE: the docstring above is a raw string because it embeds a regex
        # containing `\d`, which is an invalid escape sequence in a plain literal.
        text = clean_string(soup.find(class_="page_body").text)
        match = re.search(self.regex["metrics"], text)

        people_vaccinated = clean_count(match.group(1))
        people_fully_vaccinated = clean_count(match.group(2))
        total_boosters = clean_count(match.group(3))

        return {
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_boosters": total_boosters,
            # Raw string: `\d` is an invalid escape sequence in a plain literal
            "date": extract_clean_date(
                soup.find("p").text,
                regex=r"(202\d. .* \d+.) - .*",
                date_format="%Y. %B %d.",
                loc="hu_HU.UTF-8",
                minus_days=1,
            ),
        }
Example #6
0
 def _parse_data(self, driver, url):
     """Load *url* in the driver and scrape the report date and total vaccinations."""
     driver.get(url)
     text = driver.find_element_by_id("xw_box").text
     total = clean_count(re.search(self.regex["total_vaccinations"], text).group(1))
     return {
         "date": extract_clean_date(text, self.regex["date"], "%Y %m %d"),
         "total_vaccinations": total,
         "source_url": url,
     }
Example #7
0
 def parse_infogram_date(self, infogram_data: dict) -> str:
     """Extract the report date from the Infogram summary field."""
     field_id = "d58d673d-f6f7-44d2-8825-8f83ea806a695bbc73d2-f5d6-493e-890b-b7499db493a2"
     x = self._get_infogram_value(infogram_data, field_id, join_text=True)
     # Raw string: `\s` and `\d` are invalid escape sequences in a plain literal
     dt = extract_clean_date(x,
                             r"RESUMEN DE VACUNACIÓN\s?(\d+-[A-Z]+-2\d)\s?",
                             "%d-%b-%y",
                             lang="es")
     return dt
 def parse_date(self, soup):
     """Parse the 'Datos: a ...' report date out of the page text."""
     pattern = r"Datos: a (\d+ \w+ de 20\d{2})"
     return extract_clean_date(
         text=soup.text,
         regex=pattern,
         date_format="%d %B de %Y",
         lang="es",
         unicode_norm=True,
     )
Example #9
0
 def _parse_data_date(self, soup) -> dict:
     """Extract the report date (published without a year on the page) from soup."""
     raw = soup.find(class_="text-gray-500").text.strip()
     # The page omits the year, so append the current year before parsing
     raw = raw + str(datetime.date.today().year)
     return {"date": extract_clean_date(raw, self.regex["date"], "%d. %B%Y", lang="en")}
Example #10
0
 def parse_infogram_date(self, infogram_data: dict) -> str:
     """Extract the report date from the Infogram summary field."""
     x = self._get_infogram_value(infogram_data,
                                  "d58d673d-f6f7-44d2-8825-8f83ea806a69",
                                  join_text=True)
     # Raw string: `\s` and `\d` are invalid escape sequences in a plain literal
     dt = extract_clean_date(x,
                             r"RESUMEN DE VACUNACIÓN\s?(\d+-[A-Z]+-2\d)\s?",
                             "%d-%b-%y",
                             lang="es")
     return dt
Example #11
0
 def _parse_data(self):
     """Scrape the cumulative count and report date from the source page."""
     soup = get_soup(self.source_url)
     count = clean_count(soup.select_one("#renderbody table th span").text)
     date_raw = soup.select_one("#lastupdated ul li").text
     date = extract_clean_date(date_raw, regex=r"(\d+/\d+/20\d+).*", date_format="%d/%m/%Y")
     return {"count": count, "date": date}
Example #12
0
 def _parse_data(self, data: dict) -> pd.DataFrame:
     """Build a one-row DataFrame with the sample count and report date."""
     date = extract_clean_date(
         data["screen_updated_times"]["toplevel_page_acf-options-statistics"],
         self.regex["date"],
         "%d/%m/%Y",
     )
     count = clean_count(data["samples_collected"])
     return pd.DataFrame({
         "Date": [date],
         "Daily change in cumulative total": [count],
     })
Example #13
0
 def parse_data(self, soup):
     """Parse the report date and total doses from the first HTML table in *soup*.

     Returns a pd.Series with keys "date" and "total_vaccinations".
     """
     # Get table
     tables = soup.find_all("table")
     # squeeze() collapses a single-column frame to a Series; here the result is
     # still indexed by integer column labels 0 (label) and 1 (value) —
     # assumes the table keeps at least two columns. TODO confirm against source page.
     ds = pd.read_html(str(tables[0]))[0].squeeze()
     # Rename, add/remove columns
     return pd.Series(
         {
             # Date is matched against the whole page text, not the table
             "date": extract_clean_date(
                 text=str(soup.text), regex=self._regex_date, date_format="%d %B %Y", lang="en"
             ),
             # Row whose first column reads "Total doses"; second column holds the count
             "total_vaccinations": clean_count(
                 ds.loc[ds[0] == "Total doses", 1].values[0],
             ),
         }
     )
Example #14
0
 def parse_data(self, soup: BeautifulSoup) -> pd.Series:
     """Parse dose counts and report date from the page's <h6> headers.

     Each metric lives in an <h3> next to a French <h6> label; the date is
     matched against any other <h6> text via self.regex["date"].

     NOTE(review): if an expected header or the date pattern is absent, the
     variables below stay unbound and the return raises UnboundLocalError —
     presumably acceptable as a loud scrape failure; confirm with maintainers.
     """
     h6 = soup.find_all("h6")
     for i, h in enumerate(h6):
         text = h.text.strip()
         if text == "1ière dose":
             people_vaccinated = clean_count(h.parent.find("h3").text)
         elif text == "2ième dose":
             people_fully_vaccinated = clean_count(h.parent.find("h3").text)
         else:
             # Any other header may carry the report date
             match = re.search(self.regex["date"], text)
             if match:
                 date_str = extract_clean_date(text, self.regex["date"],
                                               "%d-%m-%Y")
     return pd.Series({
         "people_vaccinated": people_vaccinated,
         "people_fully_vaccinated": people_fully_vaccinated,
         "date": date_str,
     })
 def _parse_date(self, driver) -> pd.Series:
     """Extract the end date of the reporting period from the dashboard text."""
     raw = driver.find_element_by_class_name("full_data_set").text
     pattern = r"Time period: 29 January 2020 - (\d{2} [a-zA-Z]+ 202\d)"
     return extract_clean_date(raw, pattern, "%d %B %Y", lang="en")
Example #16
0
 def _parse_date(self, df_list: list) -> str:
     """Parse the report date from the first DataFrame carrying the date column."""
     matching = [df for df in df_list if self.columns_to_check["date"] in df.columns]
     raw = matching[0].iat[0, 0]
     return extract_clean_date(raw.lower(), regex=self.regex["date"], date_format="%d %B %Y")
 def _parse_date(self, text: str) -> str:
     """Extract the report date from *text* (lower-cased before matching)."""
     lowered = text.lower()
     return extract_clean_date(lowered, self.regex["date"], "%b %d %Y")
Example #18
0
 def _parse_date_from_soup(self, soup: BeautifulSoup) -> str:
     """Locate the last-update element next to the header and parse its date."""
     header_node = soup.find(text=self.regex["header"])
     date_elem = header_node.parent.findChild(id="last-update")
     return extract_clean_date(date_elem.text, self.regex["date"], "%d %b, %Y")
Example #19
0
 def _parse_date_from_text(self, soup) -> str:
     """Extract a dd/mm/yyyy date from the first .detail-time div."""
     raw = soup.select(".detail-time div")[0].text
     return extract_clean_date(raw, r"(\d{2}\/\d{2}\/\d{4})", "%d/%m/%Y")
Example #20
0
 def _parse_date(self):
     """Scrape the report date from the source page text."""
     # Removed leftover debug print(self.source_url).
     soup = get_soup(self.source_url)
     # Raw string: `\-` and `\d` are invalid escape sequences in a plain literal
     return extract_clean_date(soup.text, r"Reporte (?:(?:V|v)acunación|COVID\-19) (\d\d\-\d\d\-20\d\d)", "%d-%m-%Y")
Example #21
0
 def _parse_date(self, element):
     """Get data from report file title."""
     pattern = r".* \(Last updated: (\d\d\/\d\d\/20\d\d) .*\)"
     return extract_clean_date(element.text, pattern, "%d/%m/%Y")
Example #22
0
 def _parse_date(self):
     """Scrape the report date from the italicised pane text."""
     soup = get_soup(self.source_url)
     raw = soup.select_one(".pane-content .georgia-italic").text
     return extract_clean_date(raw, regex=r"(\d+ \w+ 202\d)", date_format="%d %B %Y")
Example #23
0
 def _parse_date(self, text: str) -> str:
     """Get date from relevant element (Portuguese month names)."""
     return extract_clean_date(
         text.lower(), self.regex["date"], "%d %B %Y", lang="pt"
     )
Example #24
0
 def _parse_date(self, link: str) -> str:
     """Extract the dd-mm-yyyy date embedded in *link*."""
     date_format = "%d-%m-%Y"
     return extract_clean_date(link, self.regex["date"], date_format)
Example #25
0
 def _parse_date(self, soup: BeautifulSoup) -> str:
     """Parse the report date from the first span inside the page's table."""
     span = soup.find("table").find("span")
     return extract_clean_date(span.text, self.regex["date"], "%d-%m-%Y")
Example #26
0
 def _parse_date(self, url: str) -> str:
     """Parse the report date embedded in a PDF file name (e.g. '...-1st-january-2021...pdf')."""
     # Raw string fixes invalid escape sequences (`\-`, `\d`, `\w`); the final
     # dot is escaped so it matches the literal '.' of '.pdf', not any character.
     rex = r".*\-(\d+)\w+\-(\w+)\-(20\d\d).*\.pdf"
     return extract_clean_date(url, rex, "%d %B %Y")