def _parse_data(self) -> dict:
    """Scrape the report date and vaccination counts from ``self.source_url``.

    The page is opened with the driver returned by ``get_driver()``; only
    ``<span>`` elements carrying a ``data-text`` attribute are kept and the
    values are then read by position in that list.

    Returns:
        dict with keys ``total_vaccinations``, ``people_fully_vaccinated``
        and ``date``.

    Raises:
        ValueError: if ``total_vaccinations`` is lower than
            ``people_fully_vaccinated``.
    """
    with get_driver() as driver:
        driver.get(self.source_url)
        # Fixed pause before querying the DOM (presumably lets the page render).
        time.sleep(2)
        spans = [span for span in driver.find_elements_by_tag_name("span") if span.get_attribute("data-text")]
        # Date. "Sept" is rewritten to "Sep" so that it matches the "%b" format.
        # The pattern is a raw string: "\(", "\s" and "\d" are regex escapes,
        # not Python string escapes.
        date = extract_clean_date(
            spans[6].text.replace("Sept", "Sep"),
            r"\(as of ([a-zA-Z]+)\.\s?(\d{1,2}), (20\d{2})\)",
            "%b %d %Y",
            lang="en",
        )
        # Metrics (positional: the 9th and 16th "data-text" spans).
        total_vaccinations = clean_count(spans[8].text)
        people_fully_vaccinated = clean_count(spans[15].text)
    # Sanity check: total doses can never be below the fully-vaccinated count.
    if total_vaccinations < people_fully_vaccinated:
        raise ValueError(
            "Check values for:\n"
            f"total_vaccinations\t\t{total_vaccinations}\npeople_fully_vaccinated\t\t{people_fully_vaccinated}"
        )
    return {
        "total_vaccinations": total_vaccinations,
        # "people_vaccinated" is intentionally not reported by this parser.
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    }
def read(source: str) -> pd.Series:
    """Scrape first-, second- and third-dose counters from the dashboard at ``source``.

    Every element with class ``kpimetric`` is inspected; blocks whose text
    contains ``%`` are ignored and the remaining ones are matched by label.

    Args:
        source: URL of the dashboard page.

    Returns:
        Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated``, ``total_boosters`` and ``date``.

    Raises:
        ValueError: if any of the three expected counters is not found
            (previously this surfaced as an ``UnboundLocalError``).
    """
    # Label fragment -> metric name; order matters (first match wins).
    labels = {
        "1ste dosis": "people_partly_vaccinated",
        "2de dosis": "people_fully_vaccinated",
        "3de dosis": "total_boosters",
    }
    metrics = {}
    with get_driver() as driver:
        driver.get(source)
        # Fixed pause before querying the DOM (presumably lets the dashboard render).
        time.sleep(10)
        for block in driver.find_elements_by_class_name("kpimetric"):
            # Read the text once: each access is a separate WebDriver call.
            text = block.text
            if "%" in text:
                continue
            for label, name in labels.items():
                if label in text:
                    metrics[name] = clean_count(block.find_element_by_class_name("valueLabel").text)
                    break

    missing = [name for name in labels.values() if name not in metrics]
    if missing:
        raise ValueError(f"Could not find metrics on the page: {', '.join(missing)}")

    people_fully_vaccinated = metrics["people_fully_vaccinated"]
    people_vaccinated = metrics["people_partly_vaccinated"] + people_fully_vaccinated
    return pd.Series(
        data={
            "total_vaccinations": people_vaccinated + people_fully_vaccinated,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_boosters": metrics["total_boosters"],
            "date": localdate("America/Paramaribo"),
        }
    )
def _parse_date(self, text: str):
    """Extract a date written with a Thai month name from ``text``.

    ``text`` is searched with ``self.regex_date``; groups 1, 2 and 3 are read
    as day, month name and year. ``self._year_difference_conversion`` is
    subtracted from the parsed year before building the date.

    Returns:
        Whatever ``clean_date`` returns for the resulting ``datetime``.
    """
    # Month name -> month number. Some months are listed under two spellings.
    month_numbers = {
        "มกราคม": 1,
        "กุมภาพันธ์": 2,
        "มีนาคม": 3,
        "เมษายน": 4,
        "พฤษภาคม": 5,
        "พฤษภำคม": 5,
        "มิถุนายน": 6,
        "มิถุนำยน": 6,
        "กรกฎาคม": 7,
        "กรกฎำคม": 7,
        "สิงหาคม": 8,
        "สิงหำคม": 8,
        "กันยายน": 9,
        "ตุลาคม": 10,
        "พฤศจิกายน": 11,
        "ธันวาคม": 12,
    }
    found = re.search(self.regex_date, text)
    day_number = clean_count(found.group(1))
    month_number = month_numbers[found.group(2)]
    year_number = clean_count(found.group(3)) - self._year_difference_conversion
    parsed = datetime(year=year_number, month=month_number, day=day_number)
    return clean_date(parsed)
def _parse_metrics(self, soup: BeautifulSoup):
    """Extract four counts from ``soup.text`` using ``self.regex["metrics"]``.

    Returns:
        Tuple ``(total_vaccinations, people_vaccinated,
        people_fully_vaccinated, total_boosters)`` taken from regex groups
        1 to 4, each passed through ``clean_count``.
    """
    found = re.search(self.regex["metrics"], soup.text)
    # Groups are cleaned one at a time, in group order.
    return tuple(clean_count(found.group(index)) for index in range(1, 5))
def _parse_data(self) -> pd.Series:
    """Read first- and second-dose counts from the dashboard cards.

    The iframe URL from ``self._get_iframe_url()`` is opened in a headless
    driver; the counts are parsed from the ``aria-label`` attribute of the
    elements with class ``card``.

    Returns:
        Series with ``people_vaccinated`` and ``people_fully_vaccinated``.

    Raises:
        ValueError: if either card is not found (previously this surfaced as
            an ``UnboundLocalError``).
    """
    people_vaccinated = None
    people_fully_vaccinated = None
    with get_driver(headless=True) as driver:
        # Main page
        driver.get(self._get_iframe_url())
        time.sleep(5)
        data_blocks = WebDriverWait(driver, 30).until(
            EC.visibility_of_all_elements_located((By.CLASS_NAME, "card"))
        )
        for block in data_blocks:
            # get_attribute returns None when the attribute is absent; fall back
            # to an empty string so the membership tests below cannot raise.
            block_title = block.get_attribute("aria-label") or ""
            if "first dose" in block_title:
                people_vaccinated = re.search(r"first dose +(\d+)\.", block_title).group(1)
            elif "sec dose" in block_title:
                people_fully_vaccinated = re.search(r"sec dose +(\d+)\.", block_title).group(1)

    missing = [
        name
        for name, value in (
            ("people_vaccinated", people_vaccinated),
            ("people_fully_vaccinated", people_fully_vaccinated),
        )
        if value is None
    ]
    if missing:
        raise ValueError(f"Could not find metrics on the page: {', '.join(missing)}")

    return pd.Series(
        {
            "people_vaccinated": clean_count(people_vaccinated),
            "people_fully_vaccinated": clean_count(people_fully_vaccinated),
        }
    )
def parse_data_news_page(self, soup: BeautifulSoup):
    r"""Extract vaccination metrics and the report date from a news page.

    The text of the element with class ``page_body`` is matched against
    ``self.regex["metrics"]``; groups 1-3 are read as people vaccinated,
    people fully vaccinated and boosters. The date is parsed from the first
    ``<p>`` element and shifted back by one day (``minus_days=1``).

    2021-09-10: We received confirmation from the International
    Communications Office, State Secretariat for International Communications
    and Relations, that the part of the report referring to people who
    received the 2nd dose ("közülük ([\d ]+) fő már a második oltását is
    megkapt") also included those who have received the J&J vaccine. On the
    other hand, we cannot estimate the number of vaccinations administered,
    as adding the two reported metrics would count J&J vaccines twice.

    Returns:
        dict with ``people_vaccinated``, ``people_fully_vaccinated``,
        ``total_boosters`` and ``date``.
    """
    text = clean_string(soup.find(class_="page_body").text)
    match = re.search(self.regex["metrics"], text)
    people_vaccinated = clean_count(match.group(1))
    people_fully_vaccinated = clean_count(match.group(2))
    total_boosters = clean_count(match.group(3))
    return {
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_boosters": total_boosters,
        "date": extract_clean_date(
            soup.find("p").text,
            # Raw string: "\d" is a regex escape, not a Python string escape.
            regex=r"(202\d. .* \d+.) - .*",
            date_format="%Y. %B %d.",
            loc="hu_HU.UTF-8",
            minus_days=1,
        ),
    }
def read(source: str) -> pd.Series:
    """Scrape vaccination counters from the page at ``source`` with headless Chrome.

    Each ``<h5>`` heading is matched by label and the value is read from the
    ``<div>`` selected by ``./preceding-sibling::div``.

    Args:
        source: URL of the page to scrape.

    Returns:
        Series with ``date``, ``people_vaccinated``,
        ``people_fully_vaccinated``, ``total_vaccinations`` and
        ``total_boosters``.

    Raises:
        ValueError: if any expected heading is not found (previously this
            surfaced as an ``UnboundLocalError``).
    """
    # Heading fragment -> metric name; order matters (first match wins).
    labels = {
        "Primera dosis": "people_vaccinated",
        "Total dosis aplicadas": "total_vaccinations",
        "Población completamente vacunada": "people_fully_vaccinated",
        "Dosis refuerzo": "total_boosters",
    }
    metrics = {}
    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.get(source)
        # Fixed pause before querying the DOM (presumably lets the page render).
        time.sleep(3)
        for h5 in driver.find_elements_by_tag_name("h5"):
            # Read the text once: each access is a separate WebDriver call.
            heading = h5.text
            for label, name in labels.items():
                if label in heading:
                    metrics[name] = clean_count(h5.find_element_by_xpath("./preceding-sibling::div").text)
                    break

    missing = [name for name in labels.values() if name not in metrics]
    if missing:
        raise ValueError(f"Could not find metrics on the page: {', '.join(missing)}")

    data = {
        "date": localdate("America/Santo_Domingo"),
        "people_vaccinated": metrics["people_vaccinated"],
        "people_fully_vaccinated": metrics["people_fully_vaccinated"],
        "total_vaccinations": metrics["total_vaccinations"],
        "total_boosters": metrics["total_boosters"],
    }
    return pd.Series(data=data)
def _parse_data(self, worksheet):
    """Locate the vaccination totals in ``worksheet`` and build the metrics.

    Every cell of every row returned by ``worksheet.values()`` is inspected.
    When a known label is found, the last cell of that row is taken as the
    value. The report date is parsed from the end of the "Total dosis
    aplicadas al ..." label.

    Returns:
        Series with ``date``, ``total_vaccinations``,
        ``people_fully_vaccinated`` and ``people_vaccinated`` (computed as
        total - fully vaccinated + single-dose count).

    Raises:
        ValueError: if any of the expected labels is not found.
    """

    def _as_count(cell):
        # Cells that already hold an int are used as-is; anything else is
        # normalised through clean_count.
        return cell if isinstance(cell, int) else clean_count(cell)

    # Initialise everything so the "not found" check below actually works
    # (without this, a missing label raised UnboundLocalError instead).
    total_vaccinations = None
    people_fully_vaccinated = None
    unique_doses = None
    date_str = None

    for row in worksheet.values():
        for value in row:
            label = str(value)
            if "Total dosis aplicadas al " in label:
                total_vaccinations = _as_count(row[-1])
                # The label ends with the date, e.g. "... al 31-12-2021".
                date_raw = re.search(r"[\d-]{10}$", label).group(0)
                date_str = clean_date(date_raw, "%d-%m-%Y")
            elif value == "Esquemas completos segundas + únicas dosis":
                people_fully_vaccinated = _as_count(row[-1])
            elif value == "Total únicas dosis acumuladas":
                unique_doses = _as_count(row[-1])

    found = {
        "date": date_str,
        "total_vaccinations": total_vaccinations,
        "people_fully_vaccinated": people_fully_vaccinated,
        "unique_doses": unique_doses,
    }
    missing = [name for name, value in found.items() if value is None]
    if missing:
        raise ValueError(
            f"Data is not where it is expected to be! Check worksheet (missing: {', '.join(missing)})"
        )

    return pd.Series(
        {
            "date": date_str,
            "total_vaccinations": total_vaccinations,
            "people_fully_vaccinated": people_fully_vaccinated,
            "people_vaccinated": total_vaccinations - people_fully_vaccinated + unique_doses,
        }
    )
def parse_vaccinations(elem) -> dict:
    """Fetch the news article linked from ``elem``'s card and extract metrics.

    The link is taken from the first ``<a>`` inside the parent element with
    class ``card``. The article text is the newline-joined text of all
    ``<p>`` elements inside ``<article>``.

    Returns:
        dict containing any of ``total_vaccinations``, ``people_vaccinated``
        and ``people_fully_vaccinated`` whose pattern matched; a metric
        without a match is omitted.
    """
    # Get news text
    news_url = elem.find_parent(class_="card").find("a").get("href")
    article = get_soup(news_url, verify=False).find("article")
    text = "\n".join([paragraph.text for paragraph in article.find_all("p")])

    # Patterns currently in use. Wordings tried earlier, kept for reference:
    #   total_vaccinations:      r"疫苗共有(?P<count>[\d,]*)人次"
    #   people_vaccinated:       r"1劑疫苗共有(?P<count>[\d,]*)人次"
    #   people_fully_vaccinated: r"2劑疫苗共有(?P<count>[\d,]*)人次",
    #                            r"已完成接種2劑有(?P<count>[\d,]*)人",
    #                            r"接種2劑有(?P<count>[\d,]*)人"
    patterns = (
        ("total_vaccinations", r"疫苗劑數為(?P<count>[\d,]*)劑"),
        ("people_vaccinated", r"已接種人數共有(?P<count>[\d,]*)人"),
        ("people_fully_vaccinated", r"已接種第2劑的?有([\d,]{6,})"),
    )

    # Find metrics
    metrics = {}
    for name, pattern in patterns:
        found = re.search(pattern, text)
        if found:
            metrics[name] = clean_count(found.group(1))
    return metrics
def connect_parse_data(self) -> pd.Series:
    """Open ``self.source_url`` in headless Chrome and read the dose counters.

    Returns:
        Series with ``total_vaccinations`` (element ``counter1``),
        ``total_boosters`` (element ``counter4``) and ``date``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    with webdriver.Chrome(options=chrome_options) as driver:
        driver.get(self.source_url)
        time.sleep(5)
        doses = clean_count(driver.find_element_by_id("counter1").text)
        boosters = clean_count(driver.find_element_by_id("counter4").text)

    # NOTE: an earlier, now disabled, approach estimated people_vaccinated and
    # people_fully_vaccinated from the "One dose" / "Two doses" percentage
    # counters ("counter4" / "counter4a") by splitting total_vaccinations
    # proportionally. That logic is only valid as long as Qatar *exclusively*
    # uses 2-dose vaccines, so those two metrics are not reported here.
    return pd.Series(
        data={
            "total_vaccinations": doses,
            "total_boosters": boosters,
            "date": localdate("Asia/Qatar"),
        }
    )
def read(source: str) -> pd.Series:
    """Scrape vaccination totals from the page at ``source``.

    Inside the element with class ``main``, every ``w3-center`` block is
    inspected: its first ``<p>`` is the label and its second ``<p>`` the
    value. For the total-vaccinations block, the third ``<p>`` also carries
    the report date.

    Args:
        source: URL of the page; also stored in the result as ``source_url``.

    Returns:
        Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated``, ``total_boosters``, ``date`` and
        ``source_url``.

    Raises:
        ValueError: if any expected block is not found (previously this
            surfaced as an ``UnboundLocalError``).
    """
    # Exact label text -> metric name.
    labels = {
        "ΣΥΝΟΛΟ ΕΜΒΟΛΙΑΣΜΩΝ": "total_vaccinations",
        "ΣΥΝΟΛΟ 1ης ΔΟΣΗΣ": "people_vaccinated",
        "ΣΥΝΟΛΟ 2ης ΔΟΣΗΣ": "people_fully_vaccinated",
        "ΣΥΝΟΛΟ 3ης ΔΟΣΗΣ": "total_boosters",
    }
    # Pre-seeded so the output keeps a fixed key order and gaps are detectable.
    data = dict.fromkeys([*labels.values(), "date"])

    soup = get_soup(source)
    for block in soup.find(class_="main").find_all(class_="w3-center"):
        paragraphs = block.find_all("p")
        if not paragraphs:
            # Blocks without any <p> carry no metric; skip instead of crashing.
            continue
        name = labels.get(paragraphs[0].text)
        if name is None:
            continue
        data[name] = clean_count(paragraphs[1].text)
        if name == "total_vaccinations":
            date = re.search(r"[\d/]{8,10}", paragraphs[2].text)
            data["date"] = clean_date(date.group(0), "%d/%m/%Y")

    missing = [name for name, value in data.items() if value is None]
    if missing:
        raise ValueError(f"Could not find metrics on the page: {', '.join(missing)}")

    data["source_url"] = source
    return pd.Series(data=data)
def read(source: str) -> pd.Series:
    """Parse the summary paragraph of the page at ``source``.

    The text of the first ``<p>`` inside ``<div id="data">`` is searched with
    one regular expression per field; the first capture group of each match
    is used.

    Returns:
        Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated``, ``total_boosters`` and ``date``.
    """
    text = get_soup(source).find("div", id="data").find("p").text

    def capture(pattern: str) -> str:
        # First capture group of ``pattern`` in the summary text.
        return re.search(pattern, text).group(1)

    date = clean_date(capture(r"На сегодня \(([\d\.]{8})\)"), "%d.%m.%y")

    people_vaccinated = clean_count(
        capture(r"([\d\s]+) чел\. \([\d\.]+% от населения[^)]*\) - привито хотя бы одним компонентом вакцины")
    )
    people_fully_vaccinated = clean_count(
        capture(r"([\d\s]+) чел\. \([\d\.]+% от населения,?[^)]*\) - полностью привито")
    )
    total_vaccinations = clean_count(capture(r"([\d\s]+) шт\. - всего прививок сделано"))
    total_boosters = clean_count(capture(r"([\d\s]+) чел\. - прошли ревакцинацию"))

    return pd.Series(
        {
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_boosters": total_boosters,
            "date": date,
        }
    )
def pipe_metrics(self, ds: pd.Series) -> pd.Series:
    """Add ``people_vaccinated``, ``people_fully_vaccinated`` and ``total_vaccinations``.

    The first two are the cleaned ``first_vaccine_number`` and
    ``second_vaccine_number`` entries of ``ds``; the total is their sum.
    Each value is attached with ``enrich_data``.
    """
    first_doses = clean_count(ds["first_vaccine_number"])
    ds = enrich_data(ds, "people_vaccinated", first_doses)

    second_doses = clean_count(ds["second_vaccine_number"])
    ds = enrich_data(ds, "people_fully_vaccinated", second_doses)

    doses_sum = ds["people_vaccinated"] + ds["people_fully_vaccinated"]
    return enrich_data(ds, "total_vaccinations", doses_sum)
def _parse_table(self, soup):
    """Read the "Total" row of the table with id ``content-table3``.

    Returns:
        Tuple ``(total_vaccinations, people_vaccinated,
        people_fully_vaccinated, total_boosters)`` taken from the columns
        "Total dosis", "Dosis 1", "Dosis 2" and "Dosis 3".
    """
    table_html = str(soup.find("table", id="content-table3"))
    frame = pd.read_html(table_html)[0]
    totals = frame[frame["Región"] == "Total"]
    columns = ("Total dosis", "Dosis 1", "Dosis 2", "Dosis 3")
    # .item() requires exactly one matching row.
    return tuple(clean_count(totals[column].item()) for column in columns)
def _parse_boosters(self, infogram_data: dict) -> int:
    """Return the sum of the two booster counters found in ``infogram_data``.

    Each counter is looked up by its entity id with
    ``self._get_infogram_value`` and cleaned with ``clean_count``.
    """
    entity_ids = (
        "2fbd1738-f9c3-49ad-8855-c933c83abc18",
        "20d53fe7-91f8-4778-a13f-c938f18dd8fe",
    )
    return sum(clean_count(self._get_infogram_value(infogram_data, entity_id)) for entity_id in entity_ids)
def _parse_people_fully_vaccinated(self, infogram_data: dict) -> int:
    """Return the sum of the two fully-vaccinated counters in ``infogram_data``.

    Each counter is looked up by its entity id with
    ``self._get_infogram_value`` and cleaned with ``clean_count``.
    """
    total = 0
    for entity_id in (
        "7b45d34f-b8d0-47d7-8c3a-35c89a4d4cdf",
        "9eee1f41-c398-4a15-81aa-2588250e53cb",
    ):
        raw_value = self._get_infogram_value(infogram_data, entity_id)
        total += clean_count(raw_value)
    return total
def _parse_people_vaccinated(self, infogram_data: dict) -> int:
    """Return the sum of the two people-vaccinated counters in ``infogram_data``.

    Each counter is looked up by its entity id with
    ``self._get_infogram_value`` and cleaned with ``clean_count``.
    """
    first_id = "4275cc3f-7ae8-4af3-9c5a-ef94203d47d7"
    second_id = "8a007cb6-7384-4af1-9f92-c41699d77aab"
    counts = [clean_count(self._get_infogram_value(infogram_data, entity_id)) for entity_id in (first_id, second_id)]
    return counts[0] + counts[1]
def _parse_metrics_raw(self, soup, raise_err=True):
    """Read first- and second-dose counters from the ``ttip`` elements of ``soup``.

    NOTE(review): the visible body assigns ``dose1`` / ``dose2`` but never
    returns them, and neither ``has_d3`` nor ``raise_err`` is used below.
    The definition appears to be cut off at this point; confirm against the
    complete source before relying on it.
    """
    elems = soup.find_all(class_="ttip")
    has_d3 = False
    for e in elems:
        # Only elements that contain a <p> label are considered.
        if p := e.find("p"):
            # The label text decides which counter the element's <span> holds.
            if (text := p.text.strip()) == "1st doses administered":
                dose1 = clean_count(e.span.text)
            elif text == "2nd doses administered":
                dose2 = clean_count(e.span.text)
def _parse_metrics(self, pdf_text: str):
    """Extract the three headline counts from the report text.

    Returns:
        Tuple ``(total_vaccinations, people_vaccinated,
        people_fully_vaccinated)`` taken from regex groups 1 to 3, each
        passed through ``clean_count``.
    """
    pattern = (
        r"total doses administered ([\d,]+) "
        r"total partially vaccinated ([\d,]+) "
        r"total fully vaccinated ([\d,]+)"
    )
    found = re.search(pattern, pdf_text)
    return tuple(clean_count(found.group(index)) for index in (1, 2, 3))
def _parse_text_who(self, soup):
    """Extract the WHO EUL doses and individuals covered from ``soup.text``.

    Returns:
        Tuple ``(who_doses, who_people_vaccinated)``.
    """
    pattern = (
        r"In addition, ([\d,]+) doses of other vaccines recognised in the World Health Organization.s Emergency"
        r" Use Listing \(WHO EUL\) have been administered, covering ([\d,]+) individuals\."
    )
    doses_raw, people_raw = re.search(pattern, soup.text).groups()
    return clean_count(doses_raw), clean_count(people_raw)
def _parse_text_summary(self, soup):
    """Extract the report date and the three coverage percentages from ``soup.text``.

    Returns:
        Tuple ``(date, share_fully_vaccinated, share_vaccinated,
        share_boosters)``.
    """
    pattern = (
        r"As of ([\d]+ [A-Za-z]+ 20\d{2}), (\d+)% of our population has completed their full regimen/"
        r" received two doses of COVID-19 vaccines, (\d+)% has received at least one dose,"
        r" and (\d+)% ha(?:ve|s) received (?:their )?booster(?:s)?"
    )
    date_raw, fully_raw, one_dose_raw, boosters_raw = re.search(pattern, soup.text).groups()
    date = clean_date(date_raw, fmt="%d %B %Y", lang="en")
    share_fully_vaccinated = clean_count(fully_raw)
    share_vaccinated = clean_count(one_dose_raw)
    share_boosters = clean_count(boosters_raw)
    return date, share_fully_vaccinated, share_vaccinated, share_boosters
def _parse_text_national(self, soup):
    """Extract the national-programme figures from ``soup.text``.

    Returns:
        Tuple ``(national_doses, national_boosters,
        national_people_vaccinated)``. Note that boosters come second in the
        returned tuple although they are the third group in the pattern.
    """
    pattern = (
        r"We have administered a total of ([\d,]+) doses of COVID-19 vaccines under the.*"
        r"In total, ([\d,]+) individuals have received at least one dose of vaccine under the national vaccination"
        r" programme,.* ([\d,]+) (?:individuals )?have (?:received|taken) their booster shots"
    )
    groups = re.search(pattern, soup.text).groups()
    doses = clean_count(groups[0])
    people = clean_count(groups[1])
    boosters = clean_count(groups[2])
    return doses, boosters, people
def _parse_metrics(self, news_info: dict):
    """Download the news item at ``news_info["link"]`` and extract its metrics.

    The cleaned page text is matched against ``self.regex["metrics"]``;
    groups 1-3 are total doses, people vaccinated and people fully
    vaccinated.

    Returns:
        dict with the three metrics plus ``source_url`` and ``date`` copied
        from ``news_info``.
    """
    page = get_soup(news_info["link"])
    text = clean_string(page.text)
    total_raw, first_raw, full_raw = re.search(self.regex["metrics"], text).group(1, 2, 3)
    result = {}
    result["total_vaccinations"] = clean_count(total_raw)
    result["people_vaccinated"] = clean_count(first_raw)
    result["people_fully_vaccinated"] = clean_count(full_raw)
    result["source_url"] = news_info["link"]
    result["date"] = news_info["date"]
    return result
def parse_metrics(self, soup):
    """Read the totals table at ``self.source_url`` and derive the metrics.

    ``soup`` is not used; the table is loaded directly with
    ``pd.read_html``. The number of fully vaccinated people is computed as
    total doses minus first doses.

    Returns:
        dict with ``total_vaccinations``, ``people_vaccinated`` and
        ``people_fully_vaccinated``.
    """
    # The "Totales" column is kept as text so clean_count sees the raw value.
    tables = pd.read_html(self.source_url, converters={"Totales": str})
    table = tables[0].rename(columns={"Unnamed: 0": "metric"})
    table = table.set_index("metric")

    first_doses = clean_count(table.loc["Total Vacunados 1ª dosis", "Totales"])
    all_doses = clean_count(table.loc["Total dosis administradas", "Totales"])

    return {
        "total_vaccinations": all_doses,
        "people_vaccinated": first_doses,
        "people_fully_vaccinated": all_doses - first_doses,
    }
def _parse_data_metrics(self, soup) -> dict:
    """Extract first- and second-dose counts from the page counters.

    The second and third elements with class ``text-brand-blue`` are used;
    for each, the number following "Innuttaasut" in the last ``<dd>`` of its
    parent is parsed.

    Returns:
        dict with ``people_vaccinated`` and ``people_fully_vaccinated``.

    Raises:
        ValueError: if the first-dose count is lower than the second-dose
            count.
    """
    counters = soup.find_all(class_="text-brand-blue")
    dose_1, dose_2 = (
        clean_count(re.search(r"Innuttaasut ([\d\.]+)", counter.parent.find_all("dd")[-1].text).group(1))
        for counter in (counters[1], counters[2])
    )
    if dose_1 < dose_2:
        # The message now matches the condition (it previously said "higher").
        raise ValueError("dose_1 cannot be lower than dose_2")
    return {"people_vaccinated": dose_1, "people_fully_vaccinated": dose_2}
def parse_data(self, soup):
    """Read total doses and fully-vaccinated counts from the page widgets.

    The first two elements with class ``textwidget`` are used, in that
    order. ``people_vaccinated`` is computed as total doses minus fully
    vaccinated.

    Returns:
        Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date``.
    """
    text_widgets = soup.find_all(class_="textwidget")
    doses = clean_count(text_widgets[0].text)
    fully = clean_count(text_widgets[1].text)
    record = {
        "total_vaccinations": doses,
        "people_vaccinated": doses - fully,
        "people_fully_vaccinated": fully,
        "date": localdate("Asia/Tbilisi"),
    }
    return pd.Series(record)
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read the two counters inside the element with id ``stats``.

    The first ``<span>`` is taken as total doses and the second as people
    fully vaccinated.

    Returns:
        Series with ``total_vaccinations`` and ``people_fully_vaccinated``.
    """
    stat_spans = soup.find(id="stats").find_all("span")
    return pd.Series(
        data={
            "total_vaccinations": clean_count(stat_spans[0].text),
            "people_fully_vaccinated": clean_count(stat_spans[1].text),
        }
    )
def _parse_metrics(self, text: str) -> int:
    """Parse the PCR and antigen counts from ``text`` and return their sum.

    Args:
        text: text searched with ``self.regex["pcr"]`` and
            ``self.regex["ag"]``; the first capture group of each is used.

    Returns:
        Sum of the two cleaned counts.

    Raises:
        ValueError: if either pattern fails to match.
    """
    pcr = re.search(self.regex["pcr"], text)
    ag = re.search(self.regex["ag"], text)
    # Both values are needed for the sum, so fail if *either* is missing.
    # (The previous ``and`` let a single missing match through and crashed
    # with AttributeError on ``None.group``.)
    if not pcr or not ag:
        raise ValueError("Unable to extract data from text, please update the regex.")
    return clean_count(pcr.group(1)) + clean_count(ag.group(1))
def _parse_data(self, soup):
    """Collect vaccination metrics from the ``<h3 class="ml-4">`` counters.

    Each heading holds a value; the label is the text of the ``<p>`` inside
    the heading's parent. Headings with an unknown label are ignored.

    Returns:
        dict containing any of ``total_vaccinations``, ``people_vaccinated``
        and ``people_fully_vaccinated`` that were found.
    """
    # Label text -> metric name.
    metric_by_title = {
        "Всего вакцинаций": "total_vaccinations",
        "Количество вакцинированных 1 дозой": "people_vaccinated",
        "Количество лиц, прошедших полный курс вакцинации": "people_fully_vaccinated",
    }
    data = {}
    for heading in soup.find_all("h3", class_="ml-4"):
        metric = metric_by_title.get(heading.parent.p.text.strip())
        if metric is not None:
            data[metric] = clean_count(heading.text)
    return data
def _parse_people_fully_vaccinated(self, infogram_data: dict) -> int:
    """Return the sum of the two fully-vaccinated counters in ``infogram_data``.

    Each counter is looked up by its entity id with
    ``self._get_infogram_value`` and cleaned with ``clean_count``.
    """
    entity_ids = (
        "7b45d34f-b8d0-47d7-8c3a-35c89a4d4cdfbeb30c31-b4de-45fc-bdc5-78e39d37039b",
        "9eee1f41-c398-4a15-81aa-2588250e53cbbc920ff4-fff6-4ef4-a174-af1915c4b1a7",
    )
    return sum(clean_count(self._get_infogram_value(infogram_data, entity_id)) for entity_id in entity_ids)