Ejemplo n.º 1
0
def parse_date(soup: BeautifulSoup) -> str:
    """Extract the report date from the "Vaccination Data" heading.

    The ``<h3>`` elements of ``soup`` are scanned in document order and the
    first one whose text contains "Vaccination Data" is used. The date is
    taken from the "as of <day> <month> <year>" fragment of that heading and
    handed to ``clean_date`` with the input format ``%d %b %Y``.

    Args:
        soup: Parsed page to search.

    Returns:
        Whatever ``clean_date`` returns for the extracted date string.

    Raises:
        ValueError: If no heading contains "Vaccination Data", or the
            matching heading has no "as of ..." fragment.
    """
    for h3 in soup.find_all("h3"):
        if "Vaccination Data" in h3.text:
            heading = h3.text
            break
    else:
        # Without this guard the last <h3> on the page (or none at all) would
        # be used silently.
        raise ValueError('No <h3> heading containing "Vaccination Data" found.')

    match = re.search(r"as of (\d+ \w+ \d+)", heading)
    if match is None:
        raise ValueError(f"No 'as of <date>' fragment in heading: {heading!r}")
    return clean_date(match.group(1), "%d %b %Y")
Ejemplo n.º 2
0
def connect_parse_data(source: str) -> pd.Series:
    """Download ``source`` and scrape the vaccination total and report date.

    The page is requested with browser-like headers and parsed with
    ``html.parser``. The count comes from the element with class
    ``repart-stlucia`` and the date from the element with class ``h2-blue``.

    Args:
        source: URL of the page to scrape.

    Returns:
        A Series with the entries ``total_vaccinations`` and ``date``.
    """
    request_headers = {
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    response = requests.get(source, headers=request_headers)
    soup = BeautifulSoup(response.content, "html.parser")

    # Count first, then date: same lookup order as the page is consumed.
    count_text = soup.find(class_="repart-stlucia").text
    total_vaccinations = clean_count(count_text)

    heading_text = soup.find(class_="h2-blue").text
    date_text = re.search(r"\w+ +\d+, +202\d", heading_text).group(0)
    report_date = clean_date(date_text, "%B %d, %Y")

    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "date": report_date,
    })
Ejemplo n.º 3
0
def main():
    """Scrape the Guatemala dashboard and hand the figures to ``increment``.

    A headless Chrome session opens the dashboard, clicks the element with
    class ``fa-syringe`` and reads the text of the ``logo`` element plus the
    ``<h3>`` counters inside ``dosisaplicadas1`` and ``dosisaplicadas2``.
    ``total_vaccinations`` is the sum of the two cleaned counters.
    """
    location = "Guatemala"
    source_url = "https://gtmvigilanciacovid.shinyapps.io/3869aac0fb95d6baf2c80f19f2da5f98"
    vaccine = "Moderna, Oxford/AstraZeneca"

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    with webdriver.Chrome(options=chrome_options) as browser:
        browser.maximize_window()
        # Element lookups below may wait up to 20 seconds.
        browser.implicitly_wait(20)
        browser.get(source_url)
        browser.find_element_by_class_name("fa-syringe").click()
        logo_text = browser.find_element_by_class_name("logo").text
        first_dose_box = browser.find_element_by_id("dosisaplicadas1")
        first_dose_text = first_dose_box.find_element_by_tag_name("h3").text
        second_dose_box = browser.find_element_by_id("dosisaplicadas2")
        second_dose_text = second_dose_box.find_element_by_tag_name("h3").text

    people_vaccinated = clean_count(first_dose_text)
    people_fully_vaccinated = clean_count(second_dose_text)
    total_vaccinations = people_vaccinated + people_fully_vaccinated

    # The logo text is expected to contain a day/month/year date.
    date_text = re.search(r"\d+/\d+/202\d", logo_text).group(0)
    report_date = clean_date(date_text, "%d/%m/%Y")

    increment(
        location=location,
        total_vaccinations=total_vaccinations,
        people_vaccinated=people_vaccinated,
        people_fully_vaccinated=people_fully_vaccinated,
        date=report_date,
        source_url=source_url,
        vaccine=vaccine,
    )
Ejemplo n.º 4
0
def connect_parse_data(source: str) -> pd.Series:
    """Read first- and second-dose counters and the date from ``source``.

    The page is opened in headless Chrome. The date is the text of the
    ``<span>`` inside the ``as_of`` element. Each ``counter_block`` element
    whose text contains "1 ДОЗУ" or "2 ДОЗИ" supplies the matching counter
    from its ``<h2>``.

    Args:
        source: URL of the page to open.

    Returns:
        A Series with ``people_vaccinated``, ``people_fully_vaccinated`` and
        ``date``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    with webdriver.Chrome(options=chrome_options) as browser:
        browser.get(source)
        # Fixed pause; presumably lets the page finish rendering.
        time.sleep(10)

        as_of = browser.find_element_by_class_name("as_of")
        as_of_text = as_of.find_element_by_tag_name("span").text
        report_date = clean_date(as_of_text, "%d.%m.%Y")

        for counter in browser.find_elements_by_class_name("counter_block"):
            if "1 ДОЗУ" in counter.text:
                first_dose_text = counter.find_element_by_tag_name("h2").text
            if "2 ДОЗИ" in counter.text:
                second_dose_text = counter.find_element_by_tag_name("h2").text

    return pd.Series(data={
        "people_vaccinated": clean_count(first_dose_text),
        "people_fully_vaccinated": clean_count(second_dose_text),
        "date": report_date,
    })
Ejemplo n.º 5
0
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Download the linked situation-report PDF and extract its figures.

    The first link inside the ``rt-article`` element whose ``href`` contains
    "sitrep-sl-en" is resolved against ``https://www.epid.gov.lk``. Only the
    first page of the PDF is read. ``people_vaccinated`` is set to the same
    value as ``total_vaccinations``.

    Args:
        soup: Parsed index page containing the report links.

    Returns:
        A Series with ``total_vaccinations``, ``people_vaccinated``, ``date``
        and ``source_url`` (the PDF URL).

    Raises:
        ValueError: If no link to a situation report is found.
    """
    # Get path to newest pdf (the first matching link is taken).
    for link in soup.find(class_="rt-article").find_all("a"):
        if "sitrep-sl-en" in link["href"]:
            pdf_path = "https://www.epid.gov.lk" + link["href"]
            break
    else:
        raise ValueError('No link containing "sitrep-sl-en" found in the article.')

    # The context manager closes and removes the temporary file even when
    # parsing fails. Reading through the same handle avoids reopening the
    # file by name.
    with tempfile.NamedTemporaryFile() as tf:
        tf.write(requests.get(pdf_path).content)
        tf.seek(0)
        reader = PyPDF2.PdfFileReader(tf)
        page = reader.getPage(0)
        text = page.extractText().replace("\n", "")

    regex = r"COVID-19\s+Total\s+Vaccinated\s+(\d+)"
    total_vaccinations = re.search(regex, text).group(1)
    total_vaccinations = clean_count(total_vaccinations)

    people_vaccinated = total_vaccinations

    regex = r"Situation Report\s+([\d\.]{10})"
    date = re.search(regex, text).group(1)
    date = clean_date(date, "%d.%m.%Y")

    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "date": date,
        "source_url": pdf_path,
    })
Ejemplo n.º 6
0
def connect_parse_data(source: str) -> pd.Series:
    """Read total and second-dose counters and the cut-off date from ``source``.

    The page is opened in headless Chrome. The date follows the text
    "Fecha de corte : " in the page source. Counters come from ``unselectable``
    elements identified by their ``aria-label``. ``people_vaccinated`` is the
    total minus the second-dose count.

    Args:
        source: URL of the page to open.

    Returns:
        A Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    with webdriver.Chrome(options=chrome_options) as browser:
        browser.get(source)
        # Fixed pause; presumably lets the page finish rendering.
        time.sleep(10)

        date_match = re.search(r"Fecha de corte : ([\d/]{10})",
                               browser.page_source)
        cutoff_date = date_match.group(1)

        for card in browser.find_elements_by_class_name("unselectable"):
            label = card.get_attribute("aria-label")
            if label == "Dosis aplicadas Card":
                value_text = card.find_element_by_class_name("value").text
                total_vaccinations = clean_count(value_text)
            elif label == "Segundas dosis aplicadas Card":
                value_text = card.find_element_by_class_name("value").text
                people_fully_vaccinated = clean_count(value_text)

    people_vaccinated = total_vaccinations - people_fully_vaccinated
    report_date = clean_date(cutoff_date, "%d/%m/%Y")

    return pd.Series({
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": report_date,
    })
Ejemplo n.º 7
0
def connect_parse_data(source: str, source_old: str) -> pd.Series:
    """Read the three counters from ``source`` and the date from ``source_old``.

    Both pages are opened in the same headless Chrome session. The counters
    ``counter1``–``counter3`` are read from ``source``. The date is read from
    the ``pupdateddate`` element of ``source_old``, but only after checking
    that ``counter1`` shows the same text on both pages.

    Args:
        source: URL of the dashboard providing the counters.
        source_old: URL of the dashboard providing the update date.

    Returns:
        A Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date``.

    Raises:
        ValueError: If ``counter1`` differs between the two dashboards.
    """
    op = Options()
    op.add_argument("--headless")

    with webdriver.Chrome(options=op) as driver:
        driver.get(source)
        # Fixed pause; presumably lets the page finish rendering.
        time.sleep(5)

        total_vaccinations = driver.find_element_by_id("counter1").text
        people_vaccinated = driver.find_element_by_id("counter2").text
        people_fully_vaccinated = driver.find_element_by_id("counter3").text

        driver.get(source_old)
        time.sleep(5)

        # Sanity check: the date is only trusted if both dashboards report
        # the same total.
        total_vaccinations_old = driver.find_element_by_id("counter1").text
        if total_vaccinations != total_vaccinations_old:
            raise ValueError(
                "Both dashboards may not be synced and hence may refer to "
                "different timestamps. Consider introducing the timestamp "
                "manually."
            )
        date = driver.find_element_by_id("pupdateddate").text
        date = clean_date(date.replace("Updated ", ""), "%d %b, %Y")

    data = {
        "total_vaccinations": clean_count(total_vaccinations),
        "people_vaccinated": clean_count(people_vaccinated),
        "people_fully_vaccinated": clean_count(people_fully_vaccinated),
        "date": date,
    }
    return pd.Series(data=data)
Ejemplo n.º 8
0
def parse_data(data: dict) -> pd.Series:
    """Build a Series from the ``updated`` and ``progress`` entries of ``data``.

    Args:
        data: Mapping with an ``updated`` date string (passed to
            ``clean_date`` with format ``%Y/%m/%d``) and a ``progress`` value.

    Returns:
        A Series with ``date`` and ``total_vaccinations``.
    """
    record = {
        "date": clean_date(data["updated"], "%Y/%m/%d"),
        "total_vaccinations": data["progress"],
    }
    return pd.Series(data=record)
Ejemplo n.º 9
0
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read three odometer counters and the date from ``soup``.

    The first three elements with class ``odometer`` supply, in order, the
    total, first-dose and fully-vaccinated counts via their ``data-count``
    attribute. The date is the first ten-character run of digits and dots in
    the ``counter`` element.

    Args:
        soup: Parsed page.

    Returns:
        A Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date``.
    """
    odometers = soup.find_all(class_="odometer")

    counter_text = soup.find(class_="counter").text
    date_text = re.search(r"[\d\.]{10}", counter_text).group(0)
    report_date = clean_date(date_text, "%d.%m.%Y")

    field_names = (
        "total_vaccinations",
        "people_vaccinated",
        "people_fully_vaccinated",
    )
    record = {
        name: int(odometers[position]["data-count"])
        for position, name in enumerate(field_names)
    }
    record["date"] = report_date
    return pd.Series(data=record)
Ejemplo n.º 10
0
def parse_data(data: dict) -> pd.Series:
    """Build a Series from the ``updated``, ``progress`` and ``completed`` entries.

    Args:
        data: Mapping with an ``updated`` date string (passed to
            ``clean_date`` with format ``%Y/%m/%d``), a ``progress`` value and
            a ``completed`` value. It is also handed to ``_get_vaccine_names``.

    Returns:
        A Series with ``date``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``vaccine`` (names joined by ", ").
    """
    report_date = clean_date(data["updated"], "%Y/%m/%d")
    first_dose = data["progress"]
    fully = data["completed"]
    vaccine_names = _get_vaccine_names(data, translate=True)

    record = {
        "date": report_date,
        "people_vaccinated": first_dose,
        "people_fully_vaccinated": fully,
        "vaccine": ", ".join(vaccine_names),
    }
    return pd.Series(data=record)
Ejemplo n.º 11
0
def connect_parse_data(source: str) -> pd.Series:
    """Read the date and two counters from fixed cells of a Google Sheet.

    Cell ``C44`` holds the date, ``K16`` the total doses and ``K27`` the
    fully-vaccinated count. ``people_vaccinated`` is the total minus the
    fully-vaccinated count.

    Args:
        source: Sheet identifier passed to ``open_google_sheet``.

    Returns:
        A Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date``.
    """
    sheet = open_google_sheet(source)

    def cell_text(address: str) -> str:
        # First value of the requested range, without surrounding whitespace.
        return sheet.get(address).first().strip()

    def cell_int(address: str) -> int:
        # Counters use "," as thousands separator.
        return int(cell_text(address).replace(',', ''))

    raw_date = cell_text('C44')
    total_vaccinations = cell_int('K16')
    people_fully_vaccinated = cell_int('K27')
    people_vaccinated = total_vaccinations - people_fully_vaccinated

    report_date = clean_date(raw_date, "%d/%m/%Y")
    return pd.Series({
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": report_date,
    })
Ejemplo n.º 12
0
def connect_parse_data(source: str) -> pd.Series:
    """Read the total counter and the update date from ``source``.

    The page is opened in headless Chrome. The total is the text of
    ``counter1`` and the date is the text of ``pupdateddate`` with the
    "Updated " prefix removed.

    Args:
        source: URL of the page to open.

    Returns:
        A Series with ``total_vaccinations`` and ``date``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    with webdriver.Chrome(options=chrome_options) as browser:
        browser.get(source)
        # Fixed pause; presumably lets the page finish rendering.
        time.sleep(5)

        total_text = browser.find_element_by_id("counter1").text

        updated_text = browser.find_element_by_id("pupdateddate").text
        date_text = updated_text.replace("Updated ", "")
        report_date = clean_date(date_text, "%d %b, %Y")

    return pd.Series(data={
        "total_vaccinations": clean_count(total_text),
        "date": report_date,
    })
Ejemplo n.º 13
0
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Download the linked situation-report PDF and sum doses per vaccine.

    The first link inside the ``rt-article`` element whose ``href`` contains
    "sitrep-sl-en" is resolved against ``https://www.epid.gov.lk``. Only the
    first page of the PDF is read. First and second doses are extracted for
    the "Covishield Vaccine" and "Sinopharm Vaccine (Chinese Nationals)" rows
    and added up.

    Args:
        soup: Parsed index page containing the report links.

    Returns:
        A Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated``, ``date`` and ``source_url`` (the PDF URL).

    Raises:
        ValueError: If no link to a situation report is found.
    """
    # Get path to newest pdf (the first matching link is taken).
    for link in soup.find(class_="rt-article").find_all("a"):
        if "sitrep-sl-en" in link["href"]:
            pdf_path = "https://www.epid.gov.lk" + link["href"]
            break
    else:
        raise ValueError('No link containing "sitrep-sl-en" found in the article.')

    # The context manager closes and removes the temporary file even when
    # parsing fails. Reading through the same handle avoids reopening the
    # file by name.
    with tempfile.NamedTemporaryFile() as tf:
        tf.write(requests.get(pdf_path).content)
        tf.seek(0)
        reader = PyPDF2.PdfFileReader(tf)
        page = reader.getPage(0)
        text = page.extractText().replace("\n", "")

    covishield_data = re.search(r"Covishield Vaccine (\d+) (\d+)", text)
    covishield_dose1 = clean_count(covishield_data.group(1))
    covishield_dose2 = clean_count(covishield_data.group(2))

    sinopharm_data = re.search(
        r"Sinopharm Vaccine \(Chinese Nationals\) (\d+) (\d+)", text)
    sinopharm_dose1 = clean_count(sinopharm_data.group(1))
    sinopharm_dose2 = clean_count(sinopharm_data.group(2))

    total_vaccinations = covishield_dose1 + covishield_dose2 + sinopharm_dose1 + sinopharm_dose2
    people_vaccinated = covishield_dose1 + sinopharm_dose1
    people_fully_vaccinated = covishield_dose2 + sinopharm_dose2

    regex = r"Situation Report\s+([\d\.]{10})"
    date = re.search(regex, text).group(1)
    date = clean_date(date, "%d.%m.%Y")

    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "date": date,
            "source_url": pdf_path,
        })
Ejemplo n.º 14
0
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read three ``counter`` elements and the date from ``soup``.

    The first three elements with class ``counter`` are parsed as integers
    for, in order, total doses, first doses and fully vaccinated. The date is
    the ``dd-mm-yyyy`` fragment in the ``fuente`` element.

    Args:
        soup: Parsed page.

    Returns:
        A Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date``.

    Raises:
        AssertionError: If the counters are not in non-increasing order.
    """
    counters = soup.find_all(class_="counter")
    total_vaccinations = int(counters[0].text)
    people_vaccinated = int(counters[1].text)
    people_fully_vaccinated = int(counters[2].text)
    # Plausibility checks on the scraped figures.
    assert total_vaccinations >= people_vaccinated
    assert people_vaccinated >= people_fully_vaccinated

    source_text = soup.find(class_="fuente").text
    date_text = re.search(r"\d{2}-\d{2}-\d{4}", source_text).group(0)
    report_date = clean_date(date_text, "%d-%m-%Y")

    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": report_date,
    })
Ejemplo n.º 15
0
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read two ``count`` elements and the report date from ``soup``.

    The ``data-count`` attributes of the first two elements with class
    ``count`` give first doses and fully vaccinated. ``total_vaccinations``
    is their sum. The date is the "<day> <month> 202x" fragment in the
    ``reportdate`` element.

    Args:
        soup: Parsed page.

    Returns:
        A Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date``.

    Raises:
        AssertionError: If the fully-vaccinated count exceeds the first-dose
            count.
    """
    counts = soup.find_all(class_="count")
    people_vaccinated = int(counts[0]["data-count"])
    people_fully_vaccinated = int(counts[1]["data-count"])
    # Plausibility check on the scraped figures.
    assert people_vaccinated >= people_fully_vaccinated
    total_vaccinations = people_vaccinated + people_fully_vaccinated

    report_text = soup.find(class_="reportdate").text
    date_text = re.search(r"\d+ \w+ 202\d", report_text).group(0)
    report_date = clean_date(date_text, "%d %b %Y")

    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": report_date,
    })
Ejemplo n.º 16
0
def read(source: str) -> pd.Series:
    """Download ``source`` and extract the date and three counters.

    The page is requested with browser-like headers and parsed with
    ``html.parser``. All values are pulled by regular expression from the
    first ``<p>`` inside the ``<div id="data">`` element.

    Args:
        source: URL of the page to scrape.

    Returns:
        A Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date``.
    """
    request_headers = {
        "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    response = requests.get(source, headers=request_headers)
    soup = BeautifulSoup(response.content, "html.parser")

    text = soup.find("div", id="data").find("p").text

    def first_group(pattern: str) -> str:
        # Captured value of the first match of ``pattern`` in the paragraph.
        return re.search(pattern, text).group(1)

    # The date is written with a two-digit year, hence %y.
    report_date = clean_date(
        first_group(r"На сегодня \(([\d\.]{8})\)"), "%d.%m.%y")

    people_vaccinated = clean_count(first_group(
        r"([\d\s]+) чел\. \([\d\.]+% от населения\) - привито хотя бы одним компонентом вакцины"))

    people_fully_vaccinated = clean_count(first_group(
        r"([\d\s]+) чел\. \([\d\.]+% от населения\) - полностью привито"))

    total_vaccinations = clean_count(
        first_group(r"([\d\s]+) шт\. - всего прививок сделано"))

    return pd.Series({
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": report_date,
    })
Ejemplo n.º 17
0
def read(source: str) -> pd.Series:
    """Scrape three ``counter`` elements and the last-update date from ``source``.

    The first three elements with class ``counter`` give, in order, partially
    vaccinated, fully vaccinated and total doses. ``people_vaccinated`` is
    the sum of the partially and fully vaccinated counts.

    Args:
        source: URL passed to ``get_soup``; also returned as ``source_url``.

    Returns:
        A Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated``, ``date`` and ``source_url``.
    """
    soup = get_soup(source)

    counter_tags = soup.find_all(class_="counter")
    partially = clean_count(counter_tags[0].text)
    fully = clean_count(counter_tags[1].text)
    total = clean_count(counter_tags[2].text)
    at_least_one_dose = partially + fully

    update_text = soup.find("span", id="last-update").text
    # Keep everything from the first digit up to the last "202x" year.
    date_text = re.search(r"\d+.*202\d", update_text).group(0)
    report_date = clean_date(date_text, "%d %B, %Y")

    return pd.Series(data={
        "total_vaccinations": total,
        "people_vaccinated": at_least_one_dose,
        "people_fully_vaccinated": fully,
        "date": report_date,
        "source_url": source,
    })
Ejemplo n.º 18
0
def read(source: str) -> pd.Series:
    """Scrape the total, first-dose and second-dose blocks from ``source``.

    Every ``w3-center`` block inside the ``main`` element is identified by
    the text of its first ``<p>``. The second ``<p>`` holds the counter. The
    total block also carries the date in its third ``<p>``.

    Args:
        source: URL passed to ``get_soup``; also returned as ``source_url``.

    Returns:
        A Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated``, ``date`` and ``source_url``.
    """
    soup = get_soup(source)

    for block in soup.find(class_="main").find_all(class_="w3-center"):
        title = block.find("p").text

        if title == "ΣΥΝΟΛΟ ΕΜΒΟΛΙΑΣΜΩΝ":
            total_vaccinations = clean_count(block.find_all("p")[1].text)
            date_match = re.search(r"[\d/]{8,10}", block.find_all("p")[2].text)
            report_date = clean_date(date_match.group(0), "%d/%m/%Y")
        elif title == "ΣΥΝΟΛΟ 1ης ΔΟΣΗΣ":
            people_vaccinated = clean_count(block.find_all("p")[1].text)
        elif title == "ΣΥΝΟΛΟ 2ης ΔΟΣΗΣ":
            people_fully_vaccinated = clean_count(block.find_all("p")[1].text)

    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": report_date,
        "source_url": source,
    })
Ejemplo n.º 19
0
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Download the linked PDF report and extract counters and date.

    The report URL is the ``href`` of the first link inside the
    ``<div id="content-detail" class="col-lg-12">`` element. The rendered
    PDF text has private-use code points mapped back to Thai characters
    before the figures are pulled out by regular expression. The Thai date is
    converted to an English month name and a Common Era year before it is
    handed to ``clean_date``.

    Args:
        soup: Parsed page containing the report link.

    Returns:
        A Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated``, ``date`` and ``source_url`` (the PDF URL).
    """
    # Get Newest PDF Report Link (the first link in the container is taken).
    latest_report_link = soup.find("div",
                                   class_="col-lg-12",
                                   id="content-detail").find("a")["href"]

    # The context manager closes and removes the temporary file even when
    # rendering fails. Reading through the same handle avoids reopening the
    # file by name.
    with tempfile.NamedTemporaryFile() as tf:
        tf.write(requests.get(latest_report_link).content)
        tf.seek(0)
        viewer = SimplePDFViewer(tf)
        viewer.render()
        raw_text = "".join(viewer.canvas.strings)

    special_char_replace = {
        '\uf701': u'\u0e34',
        '\uf702': u'\u0e35',
        '\uf703': u'\u0e36',
        '\uf704': u'\u0e37',
        '\uf705': u'\u0e48',
        '\uf706': u'\u0e49',
        '\uf70a': u'\u0e48',
        '\uf70b': u'\u0e49',
        '\uf70e': u'\u0e4c',
        '\uf710': u'\u0e31',
        '\uf712': u'\u0e47',
        '\uf713': u'\u0e48',
        '\uf714': u'\u0e49'
    }

    # Correct Thai special character errors. Every key is a single code
    # point, so one translate() pass replaces the escape/regex round trip.
    text = raw_text.translate(str.maketrans(special_char_replace))

    total_vaccinations_regex = r"ผู้ที่ได้รับวัคซีนสะสม .{1,100} ทั้งหมด[^\d]+([\d,]+) โดส"
    total_vaccinations = re.search(total_vaccinations_regex, text).group(1)
    total_vaccinations = clean_count(total_vaccinations)

    people_vaccinated_regex = r"ผู้ได้รับวัคซีนเข็มที่ 1 .{1,3}นวน[^\d]+([\d,]+) ร.{1,3}ย"
    people_vaccinated = re.search(people_vaccinated_regex, text).group(1)
    people_vaccinated = clean_count(people_vaccinated)

    people_fully_vaccinated_regex = (
        r"นวนผู้ได้รับวัคซีนครบต.{1,2}มเกณฑ์ \(ได้รับวัคซีน 2 เข็ม\) .{1,3}นวน[^\d]+([\d,]+)"
    )
    people_fully_vaccinated = re.search(people_fully_vaccinated_regex,
                                        text).group(1)
    people_fully_vaccinated = clean_count(people_fully_vaccinated)

    thai_date_regex = r"\( ข้อมูล ณ วันที่ (.{1,30}) เวล(.{1,3}) (.{1,10}) น. \)"
    # NOTE(review): the character swap presumably undoes an extraction
    # artefact so month names match the table below; confirm against a
    # sample PDF.
    thai_date = re.search(thai_date_regex, text).group(1).replace("ำ", "า")
    thai_months = {
        "มกราคม": "January",
        "กุมภาพันธ์": "February",
        "มีนาคม": "March",
        "เมษายน": "April",
        "พฤษภาคม": "May",
        "มิถุนายน": "June",
        "กรกฎาคม": "July",
        "สิงหาคม": "August",
        "กันยายน": "September",
        "ตุลาคม": "October",
        "พฤศจิกายน": "November",
        "ธันวาคม": "December",
    }

    # Replace Thai Date Format with Standard Date Time Format. Years are
    # matched generically (25xx) rather than through a fixed table, so
    # reports dated after 2567 BE keep working.
    date_pattern = re.compile(
        "|".join(re.escape(month) for month in thai_months) + r"|25[0-9]{2}")

    def _to_english(match):
        token = match.group(0)
        if token in thai_months:
            return thai_months[token]
        # Buddhist Era year -> Common Era year (BE = CE + 543).
        return str(int(token) - 543)

    date = date_pattern.sub(_to_english, thai_date)
    date = clean_date(date, "%d %B %Y")

    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "date": date,
            "source_url": latest_report_link,
        })
Ejemplo n.º 20
0
def parse_date(soup, year: int = 2021) -> str:
    """Extract the report date from the ``download`` element.

    The page only provides four digits (used as month and day) in front of
    the text "COVID-19疫苗日報表". The year is not on the page and is supplied
    by the caller.

    Args:
        soup: Parsed page containing an element with class ``download``.
        year: Four-digit year to combine with the scraped month and day.
            Defaults to 2021, the value that was previously hard-coded.

    Returns:
        Whatever ``clean_date`` returns for the ``%Y%m%d`` string.
    """
    date_raw = soup.find(class_="download").text
    regex = r"(\d{4})\sCOVID-19疫苗日報表"
    month_day = re.search(regex, date_raw).group(1)
    return clean_date(f"{year}{month_day}", "%Y%m%d")
Ejemplo n.º 21
0
def parse_date(df: str) -> str:
    """Extract the date following "Dati aggiornati al" from a text.

    Args:
        df: Text to search. The parameter name is kept for compatibility; the
            value is passed straight to ``re.search``, so it must be a
            string, not a DataFrame.

    Returns:
        Whatever ``clean_date`` returns for the ``dd/mm/yyyy`` date found.
    """
    match = re.search(r"Dati aggiornati al (\d{2}/\d{2}/\d{4})", df)
    return clean_date(match.group(1), "%d/%m/%Y")
Ejemplo n.º 22
0
def parse_date(df: dict) -> str:
    """Extract the date from the first value of the ``Unnamed: 1`` column.

    The "Journée du " prefix is removed from the column before its first
    value is handed to ``clean_date`` with format ``%d.%m.%Y``.

    Args:
        df: Table-like object; ``df["Unnamed: 1"]`` must provide a
            pandas-style ``.str`` accessor.

    Returns:
        Whatever ``clean_date`` returns for the extracted date string.
    """
    column = df["Unnamed: 1"]
    without_prefix = column.str.replace("Journée du ", "")
    first_value = without_prefix.values[0]
    return clean_date(first_value, "%d.%m.%Y")
Ejemplo n.º 23
0
def parse_date(soup: BeautifulSoup) -> str:
    """Extract the "Week ending" date from the page text.

    Args:
        soup: Parsed page whose text contains
            "Data applies to: Week ending <day> <month> <year>".

    Returns:
        Whatever ``clean_date`` returns for the extracted date string.
    """
    pattern = r"Data applies to: Week ending (\d[\w\s]+\d{4})"
    match = re.search(pattern, soup.text)
    week_ending = str(match.group(1))
    return clean_date(week_ending, "%d %B %Y")
Ejemplo n.º 24
0
def parse_date(soup: BeautifulSoup) -> str:
    """Read the post date from the ``field-name-post-date`` element.

    Args:
        soup: Parsed page containing an element with class
            ``field-name-post-date``.

    Returns:
        Whatever ``clean_date`` returns for the element text, read with
        format ``%d.%m.%Y``.
    """
    post_date_text = soup.find(class_="field-name-post-date").text
    return clean_date(post_date_text, "%d.%m.%Y")