def parse_vaccinated(self, soup):
    """Extract first-dose and second-dose counts from the page text.

    Looks in ``soup.text`` for the Spanish sentence that reports how many of
    the vaccinated people have already received their second dose.

    Args:
        soup: Parsed page; only its ``text`` attribute is read.

    Returns:
        Tuple ``(people_vaccinated, people_fully_vaccinated)``, each value
        passed through ``clean_count``.

    Raises:
        AttributeError: If the sentence is not present (no regex match).
    """
    pattern = r"De los ([\d\.]+) vacunados un total de ([\d\.]+) \(([\d\.]+)%\) ya han recibido la 2ª dosis"
    # Group 3 (the percentage) is captured by the pattern but not used.
    first_dose_raw, second_dose_raw = re.search(pattern, soup.text).group(1, 2)
    return clean_count(first_dose_raw), clean_count(second_dose_raw)
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Parse vaccination metrics from two sentences in the page text.

    The first sentence gives the total number of vaccinations. The second
    gives the number of people with at least one dose together with two
    percentages (at least one dose / completed course).

    ``people_fully_vaccinated`` is computed as
    ``round(total_vaccinations * completed_course_percentage / 100)``.

    Args:
        soup: Parsed page; only its ``text`` attribute is read.

    Returns:
        Series with ``total_vaccinations``, ``people_vaccinated`` and
        ``people_fully_vaccinated``.

    Raises:
        AttributeError: If either sentence is not found.
        AssertionError: If the completed-course percentage exceeds the
            at-least-one-dose percentage.
    """
    page_text = soup.text

    total_pattern = r"There have been ([\d,]+) Covid-19 vaccinations given in total in the Cayman Islands."
    total_vaccinations = clean_count(re.search(total_pattern, page_text).group(1))

    breakdown_pattern = (
        r"Of these, ([\d,]+) \(([\d,]+)% of (?:[a-zA-Z0-9,]+)\) have had at least one dose of a COVID-19 "
        r"vaccine and ([\d,]+)% have completed the two dose course."
    )
    breakdown = re.search(breakdown_pattern, page_text)
    people_vaccinated = clean_count(breakdown.group(1))
    share_dose1 = clean_count(breakdown.group(2))
    share_dose2 = clean_count(breakdown.group(3))

    # Sanity check: nobody can complete the course without a first dose.
    assert share_dose1 >= share_dose2

    people_fully_vaccinated = round(total_vaccinations * share_dose2 / 100)
    metrics = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
    }
    return pd.Series(metrics)
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read first-round and second-round counts from the status info area.

    Both figures come from the ``big`` element nested under ``round1`` /
    ``round2`` inside the ``status_infoArea`` element. The total is their sum
    and the date is yesterday's date in the ``Asia/Seoul`` time zone.

    Args:
        soup: Parsed page containing the ``status_infoArea`` element.

    Returns:
        Series with ``date``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``total_vaccinations``.
    """

    def _round_count(round_class):
        # Walk status_infoArea -> <round class> -> big and clean the number.
        area = soup.find(class_="status_infoArea")
        return clean_count(area.find(class_=round_class).find(class_="big").text)

    people_vaccinated = _round_count("round1")
    people_fully_vaccinated = _round_count("round2")

    seoul_now = datetime.datetime.now(pytz.timezone("Asia/Seoul"))
    yesterday = (seoul_now - datetime.timedelta(days=1)).date()

    return pd.Series(
        data={
            "date": str(yesterday),
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_vaccinations": people_vaccinated + people_fully_vaccinated,
        }
    )
def connect_parse_data(source: str, source_old: str) -> pd.Series:
    """Scrape vaccination counters from one dashboard and the date from another.

    The three counters (``counter1``..``counter3``) are read from ``source``.
    The update date is read from ``source_old`` (element ``pupdateddate``),
    but only after checking that ``counter1`` shows the same text on both
    pages, so the date is known to refer to the same figures.

    Args:
        source: URL of the dashboard providing the counters.
        source_old: URL of the dashboard providing the update date.

    Returns:
        Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date`` (ISO string).

    Raises:
        ValueError: If ``counter1`` differs between the two dashboards.
    """
    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.get(source)
        time.sleep(5)  # give the page time to render its counters
        total_vaccinations = driver.find_element_by_id("counter1").text
        people_vaccinated = driver.find_element_by_id("counter2").text
        people_fully_vaccinated = driver.find_element_by_id("counter3").text

        driver.get(source_old)
        time.sleep(5)
        # Sanity check: both dashboards must display the same total.
        total_vaccinations_old = driver.find_element_by_id("counter1").text
        if total_vaccinations != total_vaccinations_old:
            # Fixed: the two literals used to be joined without a space
            # ("ConsiderIntroducing").
            raise ValueError(
                "Both dashboards may not be synced and hence may refer to different timestamps. "
                "Consider introducing the timestamp manually."
            )
        date = driver.find_element_by_id("pupdateddate").text.replace("Updated ", "")

    # The date text is parsed day-first.
    date = str(pd.to_datetime(date, dayfirst=True).date())
    data = {
        "total_vaccinations": clean_count(total_vaccinations),
        "people_vaccinated": clean_count(people_vaccinated),
        "people_fully_vaccinated": clean_count(people_fully_vaccinated),
        "date": date,
    }
    return pd.Series(data=data)
def parse_data(url: str) -> pd.Series:
    """Extract the "India" row from the PDF table of vaccinated beneficiaries.

    All tables are read from the PDF with tabula (as strings, no header). The
    first table containing the cell ``"Beneficiaries vaccinated"`` is used;
    if no table contains it, the last table read is used (the original
    fall-through behaviour). The last three columns of the ``"India"`` row are
    taken as people vaccinated, people fully vaccinated and total
    vaccinations, in that order.

    Args:
        url: Location of the PDF report.

    Returns:
        Series with ``date`` (yesterday in ``Asia/Kolkata``),
        ``people_vaccinated``, ``people_fully_vaccinated``,
        ``total_vaccinations`` and ``source_url``.

    Raises:
        ValueError: If the selected table does not contain exactly one
            ``"India"`` row (raised by ``Series.item``).
    """
    kwargs = {"pandas_options": {"dtype": str, "header": None}}
    dfs_from_pdf = tabula.read_pdf(url, pages="all", **kwargs)
    for df in dfs_from_pdf:
        # Fixed: inspect the current table, not always the first one.
        if "Beneficiaries vaccinated" in df.values.flatten():
            break

    df = df[df[0] == "India"]
    ncols = df.shape[1]

    people_vaccinated = clean_count(df[ncols - 3].item())
    people_fully_vaccinated = clean_count(df[ncols - 2].item())
    total_vaccinations = clean_count(df[ncols - 1].item())

    yesterday = (datetime.datetime.now(pytz.timezone("Asia/Kolkata")) - datetime.timedelta(days=1)).date()
    return pd.Series({
        "date": str(yesterday),
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_vaccinations": total_vaccinations,
        "source_url": url,
    })
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Parse total, first-dose and completed-course counts from the page text.

    Two sentences are matched in ``soup.text``: one with the total number of
    vaccinations given, and one ("Of these, ...") with the number of people
    with at least one dose and, in parentheses, the number who completed the
    two-dose course.

    Args:
        soup: Parsed page; only its ``text`` attribute is read.

    Returns:
        Series with ``total_vaccinations``, ``people_vaccinated`` and
        ``people_fully_vaccinated``.

    Raises:
        AttributeError: If either sentence is not found.
    """
    page_text = soup.text

    regex_1 = r"([\d,]+) C(ovid|OVID)-19 vaccinations (?:had|have) been given in total in the Cayman Islands"
    total_vaccinations = clean_count(re.search(regex_1, page_text).group(1))

    # "have" and "approximately" are optional words. The space after each is
    # made optional too, so the sentence still matches with single spacing
    # when the word is absent. Everything the old pattern matched is still
    # matched.
    regex_2 = (
        r"Of these,? ([\d,]+) \((?:[\d,]+)% of (?:[\d,]+)\) (?:have)? ?had at least one dose of a C(?:ovid|OVID)-19 "
        r"vaccine and (?:approximately)? ?(?:[\d,]+)% \(([\d,]+)\) have completed the two-dose course\."
    )
    matches = re.search(regex_2, page_text)
    people_vaccinated = clean_count(matches.group(1))
    people_fully_vaccinated = clean_count(matches.group(2))

    return pd.Series({
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
    })
def read(source: str) -> pd.Series:
    """Scrape dose counters and the cut-off date from ``<h5>`` headings.

    Each ``<h5>`` on the page is matched against a known Spanish label. For
    the three counter labels, the number is taken from the ``div`` that
    precedes the heading. For the "Acumulados al" heading, the date is parsed
    from the heading text itself.

    Args:
        source: URL of the page to load in headless Chrome.

    Returns:
        Series with ``date``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``total_vaccinations``.
    """

    def _count_before(heading):
        # The figure sits in the div preceding the heading.
        return clean_count(heading.find_element_by_xpath("./preceding-sibling::div").text)

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    with webdriver.Chrome(options=chrome_options) as driver:
        driver.get(source)
        time.sleep(1)
        for heading in driver.find_elements_by_tag_name("h5"):
            label = heading.text
            if "Primera dosis" in label:
                people_vaccinated = _count_before(heading)
            elif "Total dosis aplicadas" in label:
                total_vaccinations = _count_before(heading)
            elif "Población completamente vacunada" in label:
                people_fully_vaccinated = _count_before(heading)
            elif "Acumulados al" in label:
                raw_date = heading.text.replace("Acumulados al ", "")
                date = str(dateparser.parse(raw_date, languages=["es"]).date())

    return pd.Series(
        data={
            "date": date,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_vaccinations": total_vaccinations,
        }
    )
def _parse_data(self) -> pd.Series:
    """Scrape first-dose and second-dose counts from the embedded report.

    Loads ``self.source_url``, follows the ``src`` of the iframe found at
    ``/html/body/section[2]/iframe``, waits (up to 20 s) for all elements of
    class ``card`` to be visible, and extracts the numbers from each card's
    ``aria-label`` attribute.

    Returns:
        Series with ``people_vaccinated`` and ``people_fully_vaccinated``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    with webdriver.Chrome(options=chrome_options) as driver:
        # Main page
        driver.get(self.source_url)
        # The report itself lives in an iframe; open it directly.
        iframe = driver.find_element_by_xpath("/html/body/section[2]/iframe")
        driver.get(iframe.get_attribute("src"))

        cards_visible = EC.visibility_of_all_elements_located((By.CLASS_NAME, "card"))
        cards = WebDriverWait(driver, 20).until(cards_visible)
        for card in cards:
            aria_label = card.get_attribute("aria-label")
            if "first dose" in aria_label:
                people_vaccinated = re.search(r"first dose +(\d+)\.", aria_label).group(1)
            elif "sec dose" in aria_label:
                people_fully_vaccinated = re.search(r"sec dose +(\d+)\.", aria_label).group(1)

    people_vaccinated = clean_count(people_vaccinated)
    people_fully_vaccinated = clean_count(people_fully_vaccinated)
    metrics = {
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
    }
    return pd.Series(metrics)
def connect_parse_data(source: str) -> pd.Series:
    """Scrape one-dose and two-dose counters and the "as of" date.

    The date is read from the ``span`` inside the ``as_of`` element and parsed
    with the format ``%d.%m.%Y``. Counters are read from the ``h2`` of each
    ``counter_block`` whose text contains the matching label.

    Args:
        source: URL of the page to load in headless Chrome.

    Returns:
        Series with ``people_vaccinated``, ``people_fully_vaccinated`` and
        ``date``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    with webdriver.Chrome(options=chrome_options) as driver:
        driver.get(source)
        time.sleep(10)

        as_of = driver.find_element_by_class_name("as_of")
        date = clean_date(as_of.find_element_by_tag_name("span").text, "%d.%m.%Y")

        for counter in driver.find_elements_by_class_name("counter_block"):
            # Two independent checks: a block is tested against both labels.
            if "1 ДОЗУ" in counter.text:
                people_vaccinated = counter.find_element_by_tag_name("h2").text
            if "2 ДОЗИ" in counter.text:
                people_fully_vaccinated = counter.find_element_by_tag_name("h2").text

    return pd.Series(
        data={
            "people_vaccinated": clean_count(people_vaccinated),
            "people_fully_vaccinated": clean_count(people_fully_vaccinated),
            "date": date,
        }
    )
def parse_vaccinations(elem) -> dict:
    """Collect the vaccination metrics mentioned in a news article.

    The article URL is taken from the first link of the ``card`` element that
    contains ``elem``. The text of all ``<p>`` tags inside the ``<article>``
    is joined with newlines and searched once per metric.

    Args:
        elem: Element located inside the news card.

    Returns:
        Dict with any of ``total_vaccinations``, ``people_vaccinated`` and
        ``people_fully_vaccinated``; a metric whose pattern does not match is
        left out.
    """
    # Get news text
    url = elem.find_parent(class_="card").find("a").get("href")
    article = get_soup(url).find("article")
    text = "\n".join(paragraph.text for paragraph in article.find_all("p"))

    # Each pattern captures its number in the named group "count".
    patterns = {
        "total_vaccinations": r"疫苗劑數為(?P<count>[\d,]*)劑",
        "people_vaccinated": r"已接種人數共有(?P<count>[\d,]*)人",
        "people_fully_vaccinated": r"已完成接種2劑有(?P<count>[\d,]*)人",
    }
    metrics = {}
    for metric_name, pattern in patterns.items():
        found = re.search(pattern, text)
        if found:
            metrics[metric_name] = clean_count(found.group("count"))
    return metrics
def parse_data(soup: BeautifulSoup):
    """Extract the total and the revaccination count from the page text.

    The first captured number follows "Укупно вакцинација" and the second
    follows "од тога ревакцинација"; the second is returned as the number of
    fully vaccinated people.

    Args:
        soup: Parsed page; only its ``text`` attribute is read.

    Returns:
        Tuple ``(total_vaccinations, people_fully_vaccinated)``.

    Raises:
        AttributeError: If the sentence is not found.
    """
    pattern = r"Укупно вакцинација: ([\d.]+), од тога ревакцинација: ([\d.]+)"
    total_raw, revaccinated_raw = re.search(pattern, soup.text).group(1, 2)
    return clean_count(total_raw), clean_count(revaccinated_raw)
def connect_parse_data(source: str) -> pd.Series:
    """Download a page and extract first-dose and second-dose counts.

    The page is requested with browser-like headers, parsed with
    ``html.parser``, and its text is searched for the Spanish sentence
    "De los N vacunados un total de M".

    Args:
        source: URL of the page to download.

    Returns:
        Series with ``people_vaccinated`` and ``people_fully_vaccinated``.

    Raises:
        AttributeError: If the sentence is not found.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    response = requests.get(source, headers=request_headers)
    soup = BeautifulSoup(response.content, "html.parser")

    found = re.search(r"De los ([\d\.]+) vacunados un total de ([\d\.]+)", soup.text)
    people_vaccinated = clean_count(found.group(1))
    people_fully_vaccinated = clean_count(found.group(2))

    return pd.Series(
        data={
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
        }
    )
def main():
    """Scrape the Guatemala dashboard and record the latest figures.

    Opens the dashboard in headless Chrome, clicks the syringe icon, reads the
    two dose counters and the text of the ``logo`` element (which contains the
    date), and hands the result to ``increment``. Total vaccinations is the
    sum of the two dose counters.
    """
    location = "Guatemala"
    source_url = "https://gtmvigilanciacovid.shinyapps.io/3869aac0fb95d6baf2c80f19f2da5f98"
    vaccine = "Moderna, Oxford/AstraZeneca"

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    with webdriver.Chrome(options=chrome_options) as driver:
        driver.maximize_window()
        driver.implicitly_wait(20)  # element lookups wait up to 20 seconds
        driver.get(source_url)
        driver.find_element_by_class_name("fa-syringe").click()
        logo_text = driver.find_element_by_class_name("logo").text
        dose1_text = driver.find_element_by_id("dosisaplicadas1").find_element_by_tag_name("h3").text
        dose2_text = driver.find_element_by_id("dosisaplicadas2").find_element_by_tag_name("h3").text

    people_vaccinated = clean_count(dose1_text)
    people_fully_vaccinated = clean_count(dose2_text)
    total_vaccinations = people_vaccinated + people_fully_vaccinated

    # The date appears inside the logo text as d/m/202x.
    raw_date = re.search(r"\d+/\d+/202\d", logo_text).group(0)
    date = clean_date(raw_date, "%d/%m/%Y")

    increment(
        location=location,
        total_vaccinations=total_vaccinations,
        people_vaccinated=people_vaccinated,
        people_fully_vaccinated=people_fully_vaccinated,
        date=date,
        source_url=source_url,
        vaccine=vaccine,
    )
def connect_parse_data(source: str) -> pd.Series:
    """Scrape total and second-dose counts from dashboard cards.

    The cut-off date is taken from the page source ("Fecha de corte : ...").
    Cards are the elements of class ``unselectable``, identified by their
    ``aria-label``; the number is the text of the nested ``value`` element.
    People vaccinated is derived as total doses minus second doses.

    Args:
        source: URL of the dashboard to load in headless Chrome.

    Returns:
        Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date``.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    with webdriver.Chrome(options=chrome_options) as driver:
        driver.get(source)
        time.sleep(10)

        raw_date = re.search(r"Fecha de corte : ([\d/]{10})", driver.page_source).group(1)

        for card in driver.find_elements_by_class_name("unselectable"):
            card_label = card.get_attribute("aria-label")
            if card_label == "Dosis aplicadas Card":
                total_vaccinations = clean_count(card.find_element_by_class_name("value").text)
            elif card_label == "Segundas dosis aplicadas Card":
                people_fully_vaccinated = clean_count(card.find_element_by_class_name("value").text)

    people_vaccinated = total_vaccinations - people_fully_vaccinated
    metrics = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": clean_date(raw_date, "%d/%m/%Y"),
    }
    return pd.Series(metrics)
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Parse dose counters from labelled paragraphs and the date from ``<h6>``.

    Every ``<p>`` is checked against three Spanish labels. The counter is the
    first run of at least six digits/commas in that paragraph. The date comes
    from the first ``<h6>``, after removing the "Acumulados al " prefix.

    Args:
        soup: Parsed page.

    Returns:
        Series with ``date``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``total_vaccinations``.
    """

    def _first_count(text):
        # First run of 6+ digit/comma characters in the paragraph.
        return clean_count(re.search(r"[\d,]{6,}", text).group(0))

    for paragraph in soup.find_all("p"):
        content = paragraph.text
        if "Primera dosis" in content:
            people_vaccinated = _first_count(content)
        elif "Total dosis aplicadas" in content:
            total_vaccinations = _first_count(content)
        elif "Población completamente vacunada" in content:
            people_fully_vaccinated = _first_count(content)

    raw_date = soup.find("h6").text.replace("Acumulados al ", "")
    date = str(dateparser.parse(raw_date, languages=["es"]).date())

    return pd.Series(
        data={
            "date": date,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_vaccinations": total_vaccinations,
        }
    )
def parse_data(data: dict) -> pd.Series:
    """Build the metrics series from a decoded API payload.

    Reads ``vakdose1`` and ``vakdose2`` from the first entry of
    ``data["data"]`` and the ``updated`` timestamp from the top level. Total
    vaccinations is the sum of both doses.

    Args:
        data: Payload with a ``data`` list and an ``updated`` field. The
            timestamp is integer-divided by 1000 before conversion
            (presumably epoch milliseconds; confirm against the API).

    Returns:
        Series with ``date`` (``%Y-%m-%d``, in the machine's local time zone),
        ``people_vaccinated``, ``people_fully_vaccinated`` and
        ``total_vaccinations``.
    """
    first_doses = clean_count(data["data"][0]["vakdose1"])
    second_doses = clean_count(data["data"][0]["vakdose2"])
    updated_at = datetime.fromtimestamp(data["updated"] // 1000)
    return pd.Series({
        "date": updated_at.strftime("%Y-%m-%d"),
        "people_vaccinated": first_doses,
        "people_fully_vaccinated": second_doses,
        "total_vaccinations": first_doses + second_doses,
    })
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read total and fully-vaccinated figures from the "cifras" block.

    The figures are the ``cifra`` elements inside the ``cifras-coronavirus``
    element: index 1 is used as total vaccinations and index 2 as people
    fully vaccinated. The date comes from ``set_date()``.

    Args:
        soup: Parsed page.

    Returns:
        Series with ``total_vaccinations``, ``people_fully_vaccinated`` and
        ``date``.
    """
    figures = soup.find(class_="cifras-coronavirus").find_all(class_="cifra")
    total_vaccinations = clean_count(figures[1].text)
    people_fully_vaccinated = clean_count(figures[2].text)
    metrics = {
        "total_vaccinations": total_vaccinations,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": set_date(),
    }
    return pd.Series(data=metrics)
def parse_data(self, soup):
    """Read total and fully-vaccinated counts from the page's text widgets.

    The first ``textwidget`` element holds total vaccinations and the second
    holds people fully vaccinated. People vaccinated is derived as their
    difference. The date is ``localdate("Asia/Tbilisi")``.

    Args:
        soup: Parsed page.

    Returns:
        Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date``.
    """
    text_widgets = soup.find_all(class_="textwidget")
    doses_total = clean_count(text_widgets[0].text)
    fully_vaccinated = clean_count(text_widgets[1].text)
    metrics = {
        "total_vaccinations": doses_total,
        "people_vaccinated": doses_total - fully_vaccinated,
        "people_fully_vaccinated": fully_vaccinated,
        "date": localdate("Asia/Tbilisi"),
    }
    return pd.Series(metrics)
def _parse_metrics(self, soup: BeautifulSoup):
    """Extract the three metrics using the patterns in ``self.regex``.

    Each pattern is searched in ``soup.text`` and its first capture group is
    passed through ``clean_count``.

    Args:
        soup: Parsed page; only its ``text`` attribute is read.

    Returns:
        Tuple ``(total_vaccinations, people_vaccinated,
        people_fully_vaccinated)``.

    Raises:
        AttributeError: If any of the patterns does not match.
    """
    page_text = soup.text
    metric_names = ("total_vaccinations", "people_vaccinated", "people_fully_vaccinated")
    return tuple(
        clean_count(re.search(self.regex[name], page_text).group(1))
        for name in metric_names
    )
def parse_data(self, soup):
    """Parse administered doses and first-dose recipients from the page text.

    The Catalan sentence provides the number of doses administered and the
    number of people with at least one dose. People fully vaccinated is
    derived as doses minus people with at least one dose.

    Args:
        soup: Parsed page; also passed to ``self.parse_date``.

    Returns:
        Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date``.

    Raises:
        AttributeError: If the sentence is not found.
    """
    pattern = r"ja s’han administrat ([\d\.]+) dosis i ([\d\.]+) persones han rebut, com a mínim, una dosi del vaccí"
    found = re.search(pattern, soup.text)

    doses_given = clean_count(found.group(1))
    at_least_one_dose = clean_count(found.group(2))

    metrics = {
        "total_vaccinations": doses_given,
        "people_vaccinated": at_least_one_dose,
        "people_fully_vaccinated": doses_given - at_least_one_dose,
        "date": self.parse_date(soup),
    }
    return pd.Series(metrics)
def parse_data_news_page(self, soup: BeautifulSoup):
    """Collect the people-vaccinated metrics found on a news page.

    Uses the ``people_vaccinated`` and ``people_fully_vaccinated`` patterns
    from ``self.regex``; the first capture group of each match is cleaned.

    Args:
        soup: Parsed news page; only its ``text`` attribute is read.

    Returns:
        Dict containing only the metrics whose pattern matched.
    """
    page_text = soup.text
    metrics = {}
    for metric_name in ("people_vaccinated", "people_fully_vaccinated"):
        found = re.search(self.regex[metric_name], page_text)
        if found:
            metrics[metric_name] = clean_count(found.group(1))
    return metrics
def _parse_metrics(self, text: str):
    """Extract and cross-check the three vaccination metrics from ``text``.

    ``self.regex_vax`` must capture, in order: people vaccinated, people
    fully vaccinated and total vaccinations.

    Args:
        text: Text to search.

    Returns:
        Dict with ``total_vaccinations``, ``people_vaccinated`` and
        ``people_fully_vaccinated``.

    Raises:
        AttributeError: If the pattern does not match.
        ValueError: If the total is not the sum of the other two figures.
    """
    captured = re.search(self.regex_vax, text).groups()
    first_dose = clean_count(captured[0])
    full_course = clean_count(captured[1])
    doses_total = clean_count(captured[2])

    if first_dose + full_course == doses_total:
        return {
            "total_vaccinations": doses_total,
            "people_vaccinated": first_dose,
            "people_fully_vaccinated": full_course,
        }
    raise ValueError(
        "total_vaccinations != people_vaccinated + people_fully_vaccinated"
    )
def parse_infogram_vaccinations(infogram_data: dict) -> dict:
    """Extract the three vaccination metrics from decoded Infogram data.

    Each metric is looked up by its Infogram entity id via
    ``_get_infogram_value`` and passed through ``clean_count``.

    Args:
        infogram_data: Decoded Infogram payload.

    Returns:
        Dict with ``total_vaccinations``, ``people_vaccinated`` and
        ``people_fully_vaccinated``. (The annotation used to say ``int``,
        which did not match the value actually returned.)
    """
    total_vaccinations = clean_count(
        _get_infogram_value(infogram_data, "4f66ed81-151f-4b97-aa3c-4927bde058b2"))
    people_vaccinated = clean_count(
        _get_infogram_value(infogram_data, "4048eac1-24ba-4e24-b081-61dfa0281a0e"))
    people_fully_vaccinated = clean_count(
        _get_infogram_value(infogram_data, "50a2486f-7dca-4afd-a551-bd24665d7314"))
    return {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
    }
def parse_data(self, soup):
    """Parse total, one-dose and two-dose figures from the page text.

    The Catalan sentence reports, in order: total vaccines administered,
    people with one dose, and people with both doses.

    Args:
        soup: Parsed page; also passed to ``self.parse_date``.

    Returns:
        Series with ``total_vaccinations``, ``people_vaccinated``,
        ``people_fully_vaccinated`` and ``date``.

    Raises:
        AttributeError: If the sentence is not found.
    """
    pattern = (
        r"s’han administrat un total de ([\d\.]+) vacunes, ([\d\.]+) persones (?:han rebut|tenen) una dosi del "
        r"vaccí,? i ([\d\.]+) (persones )?(en )?tenen les dues")
    found = re.search(pattern, soup.text)

    # Groups 4 and 5 are optional filler words and are not used.
    total_raw, one_dose_raw, two_doses_raw = found.group(1, 2, 3)
    total_vaccinations = clean_count(total_raw)
    people_vaccinated = clean_count(one_dose_raw)
    people_fully_vaccinated = clean_count(two_doses_raw)

    metrics = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": self.parse_date(soup),
    }
    return pd.Series(metrics)
def connect_parse_data(source: str) -> pd.Series:
    """Download a page and extract the total vaccinations and report date.

    The total is the text of the ``repart-stlucia`` element. The date is the
    first "Month D, 202x" expression in the ``h2-blue`` element, parsed with
    the format ``%B %d, %Y``.

    Args:
        source: URL of the page to download.

    Returns:
        Series with ``total_vaccinations`` and ``date``.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    response = requests.get(source, headers=request_headers)
    soup = BeautifulSoup(response.content, "html.parser")

    total_vaccinations = clean_count(soup.find(class_="repart-stlucia").text)

    heading_text = soup.find(class_="h2-blue").text
    raw_date = re.search(r"\w+ +\d+, +202\d", heading_text).group(0)

    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "date": clean_date(raw_date, "%B %d, %Y"),
        }
    )
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Download the newest situation-report PDF and parse its first page.

    The PDF is the first link inside the ``rt-article`` element whose
    ``href`` contains ``"sitrep-sl-en"``. The total vaccinated figure and the
    report date are extracted from the text of page 0. People vaccinated is
    set equal to total vaccinations.

    Args:
        soup: Parsed index page listing the reports.

    Returns:
        Series with ``total_vaccinations``, ``people_vaccinated``, ``date``
        and ``source_url`` (the PDF URL).

    Raises:
        AttributeError: If the count or the date is not found in the PDF text.
    """
    # Get path to newest pdf
    links = soup.find(class_="rt-article").find_all("a")
    for link in links:
        if "sitrep-sl-en" in link["href"]:
            pdf_path = "https://www.epid.gov.lk" + link["href"]
            break

    # Write and read through one context-managed handle. The file used to be
    # reopened by name while still open and was never closed explicitly.
    with tempfile.NamedTemporaryFile() as tf:
        tf.write(requests.get(pdf_path).content)
        tf.seek(0)
        reader = PyPDF2.PdfFileReader(tf)
        page = reader.getPage(0)
        text = page.extractText().replace("\n", "")

    regex = r"COVID-19\s+Total\s+Vaccinated\s+(\d+)"
    total_vaccinations = clean_count(re.search(regex, text).group(1))
    people_vaccinated = total_vaccinations

    regex = r"Situation Report\s+([\d\.]{10})"
    date = clean_date(re.search(regex, text).group(1), "%d.%m.%Y")

    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "date": date,
        "source_url": pdf_path,
    })
def _parse_stats(self, df: pd.DataFrame) -> dict:
    """Extract dose totals from the per-brand table.

    The table must have four columns, ``"廠牌"`` and ``"劑次"`` in the first
    two cells of its first row, and ``"總計"`` in the first column of its last
    or second-to-last row. For each dose, the value is the last column of the
    last row whose second column equals that dose's label.

    Args:
        df: Table with integer column labels (column ``1`` holds the dose
            label).

    Returns:
        Dict with ``total_vaccinations`` (dose 1 + dose 2) and
        ``people_vaccinated`` (dose 1). (The annotation used to say ``int``.)

    Raises:
        ValueError: If the table layout does not match the expected format.
    """
    has_total_row = df.iloc[-1, 0] == "總計" or df.iloc[-2, 0] == "總計"
    if (
        df.shape[1] != 4
        or df.iloc[0, 0] != "廠牌"
        or df.iloc[0, 1] != "劑次"
        or not has_total_row
    ):
        raise ValueError("Table 1: format has changed!")

    num_dose1 = clean_count(df[df[1] == "第 1劑"].tail(1).values[0][-1])
    num_dose2 = clean_count(df[df[1] == "第 2劑"].tail(1).values[0][-1])
    return {
        "total_vaccinations": (num_dose1 + num_dose2),
        "people_vaccinated": num_dose1,
    }
def parse_total_vaccinations(soup: BeautifulSoup) -> int:
    """Return the "Vaccines Administered" counter from the page.

    Finds the ``counter-box-content`` element whose string matches
    "Vaccines Administered", then reads the ``data-value`` attribute of the
    ``display-counter`` element under the same parent.

    Args:
        soup: Parsed page.

    Returns:
        The counter value passed through ``clean_count``. The annotation used
        to say ``str``; it now says ``int``, on the assumption that
        ``clean_count`` returns an integer count (confirm against the helper).

    Raises:
        AttributeError: If the label element is not found.
    """
    label = soup.find(class_="counter-box-content", string=re.compile("Vaccines Administered"))
    counter = label.parent.find(class_="display-counter")
    return clean_count(counter["data-value"])
def parse_data(self, soup: BeautifulSoup) -> pd.Series:
    """Parse date and vaccination metrics using ``self.regex`` patterns.

    Two independent searches run on ``soup.text``:

    * ``self.regex["title"]``: group 1 is the day and month (the current year
      is appended before parsing with ``%d de %B %Y`` in Spanish); group 2 is
      total vaccinations.
    * ``self.regex["data"]``: group 1 is people vaccinated and group 3 is
      people fully vaccinated.

    Args:
        soup: Parsed page; only its ``text`` attribute is read.

    Returns:
        Series holding only the fields whose pattern matched; it may be empty.
    """
    page_text = soup.text
    fields = {}

    title_match = re.search(self.regex["title"], page_text)
    if title_match:
        # The page gives no year, so the current one is assumed.
        day_and_month = title_match.group(1)
        fields["date"] = clean_date(f"{day_and_month} {datetime.now().year}", "%d de %B %Y", lang="es")
        fields["total_vaccinations"] = clean_count(title_match.group(2))

    data_match = re.search(self.regex["data"], page_text)
    if data_match:
        fields["people_vaccinated"] = clean_count(data_match.group(1))
        fields["people_fully_vaccinated"] = clean_count(data_match.group(3))

    return pd.Series(fields)
def parse_metric(soup: BeautifulSoup, description: str) -> int:
    """Return the figure from the last table row under a labelled block.

    Finds the ``<strong>`` element whose string equals ``description``, climbs
    four parent levels, and cleans the text of the last ``<tr>`` found there.

    Args:
        soup: Parsed page.
        description: Exact text of the ``<strong>`` label to look for.

    Returns:
        The last row's text passed through ``clean_count``.

    Raises:
        AttributeError: If no matching ``<strong>`` element exists.
    """
    label = soup.find("strong", string=description)
    container = label.parent.parent.parent.parent
    last_row = container.find_all("tr")[-1]
    return clean_count(last_row.text)