def _propose_df(self):
    """Build a DataFrame of vaccination figures parsed from self.tweets.

    Three patterns are tried per tweet, most specific first:
    regex_1 (date + first/second-dose counts), regex_2 (date only) and
    regex_3 (bare keyword, no date). Iteration stops as soon as a dated
    tweet older than the search window is seen (self.stop_search).
    """
    # Most specific pattern: date plus first- and second-dose counts.
    regex_1 = (
        r"COVID-19 Vaccination Update:\n\n1st and second dose — (([a-zA-Z]+) (\d{1,2})(?:th|nd|rd|st) (202\d)), in 36 States \+ the FCT\. \n\n([0-9,]+) eligible "
        r"Nigerians have been vaccinated with first dose while ([0-9,]+) of Nigerians vaccinated with 1st dose have collected their 2nd dose\."
    )
    # Date only, no counts.
    regex_2 = r"COVID-19 Vaccination Update for (([a-zA-Z]+) (\d{1,2})(?:th|nd|rd|st),? (202\d)), in 36 States \+ the FCT\. "
    # Fallback: keyword only; matching rows carry no date.
    regex_3 = r"COVID-19 Vaccination Update"
    data = []
    for tweet in self.tweets:
        match_1 = re.search(regex_1, tweet.full_text)
        match_2 = re.search(regex_2, tweet.full_text)
        match_3 = re.search(regex_3, tweet.full_text)
        if match_1:
            people_vaccinated = clean_count(match_1.group(5))
            people_fully_vaccinated = clean_count(match_1.group(6))
            # Groups 2-4 are month name, day and year.
            dt = clean_date(" ".join(match_1.group(2, 3, 4)), "%B %d %Y")
            if self.stop_search(dt):
                break
            data.append({
                "date": dt,
                # Total doses = first doses + second doses.
                "total_vaccinations": people_vaccinated + people_fully_vaccinated,
                "people_vaccinated": people_vaccinated,
                "people_fully_vaccinated": people_fully_vaccinated,
                "text": tweet.full_text,
                "source_url": self.build_post_url(tweet.id),
                "media_url": tweet.extended_entities["media"][0]["media_url_https"],
            })
        elif match_2:
            dt = clean_date(" ".join(match_2.group(2, 3, 4)), "%B %d %Y")
            if self.stop_search(dt):
                break
            data.append({
                "date": dt,
                "text": tweet.full_text,
                "source_url": self.build_post_url(tweet.id),
                "media_url": tweet.extended_entities["media"][0]["media_url_https"],
            })
        elif match_3:
            # No date available; record the tweet for manual inspection.
            data.append({
                "text": tweet.full_text,
                "source_url": self.build_post_url(tweet.id),
                "media_url": tweet.extended_entities["media"][0]["media_url_https"],
            })
    df = pd.DataFrame(data)
    return df
def parse_data(self, soup: BeautifulSoup) -> pd.Series:
    """Extract vaccination metrics from the most recent situation-report PDF."""
    # Locate and read the newest PDF linked from the page.
    pdf_url = self._parse_last_pdf_link(soup)
    pdf_text = self._extract_text_from_pdf(pdf_url)
    # Report date appears as "Situation Report dd.mm.yyyy".
    raw_date = re.search(r"Situation Report\s+([\d\.]{10})", pdf_text).group(1)
    report_date = clean_date(raw_date, "%d.%m.%Y")
    # Per-vaccine dose table extracted from the PDF text.
    table = self._parse_vaccines_table_as_df(pdf_text)
    dose1_total = table.doses_1.sum()
    dose2_total = table.doses_2.sum()
    return pd.Series(
        data={
            "total_vaccinations": dose1_total + dose2_total,
            "people_vaccinated": dose1_total,
            "people_fully_vaccinated": dose2_total,
            "date": report_date,
            "source_url": pdf_url,
            "vaccine": ", ".join(table.vaccine.map(vaccines_mapping)),
            "location": self.location,
        })
def read(source: str) -> pd.Series:
    """Scrape the dashboard at *source* with headless Chrome and return the metrics."""
    opts = Options()
    opts.add_argument("--headless")
    with webdriver.Chrome(options=opts) as driver:
        driver.get(source)
        time.sleep(3)  # give the JS dashboard time to render

        def value_before(header):
            # Each metric value sits in the <div> immediately preceding its <h5> label.
            return clean_count(header.find_element_by_xpath("./preceding-sibling::div").text)

        for header in driver.find_elements_by_tag_name("h5"):
            label = header.text
            if "Primera dosis" in label:
                people_vaccinated = value_before(header)
            elif "Total dosis aplicadas" in label:
                total_vaccinations = value_before(header)
            elif "Población completamente vacunada" in label:
                people_fully_vaccinated = value_before(header)
            elif "Acumulados al" in label:
                date = clean_date(label, "Acumulados al %d de %B de %Y", "es")
    return pd.Series(data={
        "date": date,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_vaccinations": total_vaccinations,
    })
def connect_parse_data(source: str) -> pd.Series:
    """Load the dashboard headlessly and parse the date and dose counters."""
    options = Options()
    options.add_argument("--headless")
    with webdriver.Chrome(options=options) as driver:
        driver.get(source)
        time.sleep(10)  # dashboard renders client-side
        # "As of" date shown above the counters, dd.mm.yyyy.
        raw_date = driver.find_element_by_class_name(
            "as_of").find_element_by_tag_name("span").text
        date = clean_date(raw_date, "%d.%m.%Y")
        for block in driver.find_elements_by_class_name("counter_block"):
            if "1 ДОЗУ" in block.text:
                people_vaccinated = block.find_element_by_tag_name("h2").text
            if "2 ДОЗИ" in block.text:
                people_fully_vaccinated = block.find_element_by_tag_name("h2").text
    return pd.Series(data={
        "people_vaccinated": clean_count(people_vaccinated),
        "people_fully_vaccinated": clean_count(people_fully_vaccinated),
        "date": date,
    })
def connect_parse_data(source: str, source_old: str) -> pd.Series:
    """Read counters from the new dashboard and cross-check them against the old one.

    Raises:
        ValueError: if the two dashboards disagree on total vaccinations,
            which indicates they refer to different timestamps.
    """
    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.get(source)
        time.sleep(5)
        total_vaccinations = driver.find_element_by_id("counter1").text
        people_vaccinated = driver.find_element_by_id("counter2").text
        people_fully_vaccinated = driver.find_element_by_id("counter3").text
        driver.get(source_old)
        time.sleep(5)
        # Sanity check: the old dashboard must report the same total.
        total_vaccinations_old = driver.find_element_by_id("counter1").text
        if total_vaccinations != total_vaccinations_old:
            # Fix: the two string literals previously joined as
            # "...ConsiderIntroducing..." — missing separator.
            raise ValueError(
                "Both dashboards may not be synced and hence may refer to different timestamps. "
                "Consider introducing the timestamp manually.")
        date = driver.find_element_by_id("pupdateddate").text
        date = clean_date(date, "Updated %d %b, %Y")
    data = {
        "total_vaccinations": clean_count(total_vaccinations),
        "people_vaccinated": clean_count(people_vaccinated),
        "people_fully_vaccinated": clean_count(people_fully_vaccinated),
        "date": date,
    }
    return pd.Series(data=data)
def _propose_df(self):
    """Build a DataFrame of cumulative-vaccination updates parsed from tweets.

    The row date is taken from the tweet's creation timestamp; the date
    embedded in the tweet text only serves to recognise the update format.
    """
    regex = r"COVID-19 update: As at (\d{1,2} [a-zA-Z]+ 202\d), .* a total of ([\d ]+) people have been vaccinated"
    data = []
    for tweet in self.tweets:
        match = re.search(regex, tweet.full_text)
        if match:
            total_vaccinations = clean_count(match.group(2))
            # Fix: a clean_date() result parsed from group(1) was assigned
            # here and immediately overwritten — dead code removed.
            dt = tweet.created_at.strftime("%Y-%m-%d")
            if self.stop_search(dt):
                break
            data.append({
                "date": dt,
                # NOTE(review): the cumulative count is stored under
                # "people_vaccinated", mirroring the original behaviour —
                # confirm this key choice is intentional.
                "people_vaccinated": total_vaccinations,
                "text": tweet.full_text,
                "source_url": self.build_post_url(tweet.id),
                "media_url": tweet.entities["media"][0]["media_url_https"] if "media" in tweet.entities else None,
            })
    return pd.DataFrame(data)
def _parse_data(self, worksheet):
    """Scan the worksheet cells for the dose metrics and return them as a Series.

    Raises:
        ValueError: if any of the expected cells could not be located.
    """
    # Fix: initialise so a missing cell raises the intended ValueError
    # below rather than a NameError on first use.
    date_str = total_vaccinations = people_fully_vaccinated = unique_doses = None
    for row in worksheet.values():
        for value in row:
            if "Total dosis aplicadas al " in str(value):
                total_vaccinations = row[-1]
                # The label embeds the cut-off date, e.g. "... al 01-02-2021".
                date_raw = re.search(r"[\d-]{10}$", value).group(0)
                date_str = clean_date(date_raw, "%d-%m-%Y")
            elif value == "Esquemas completos segundas + únicas dosis":
                people_fully_vaccinated = row[-1]
            elif value == "Total únicas dosis acumuladas":
                unique_doses = row[-1]
    # Fix: also validate unique_doses, which the arithmetic below needs.
    if total_vaccinations is None or people_fully_vaccinated is None or unique_doses is None:
        raise ValueError(
            "Expected cells not found! Check worksheet")
    return pd.Series({
        "date": date_str,
        "total_vaccinations": total_vaccinations,
        "people_fully_vaccinated": people_fully_vaccinated,
        # NOTE(review): single ("única") doses appear to count toward both
        # metrics, hence they are added back — confirm against the source.
        "people_vaccinated": total_vaccinations - people_fully_vaccinated + unique_doses,
    })
def parse_date(soup: BeautifulSoup) -> str:
    """Return the "as of" date from the "Vaccination Data" section header.

    Raises:
        ValueError: if no <h3> mentioning "Vaccination Data" is present.
    """
    for h3 in soup.find_all("h3"):
        if "Vaccination Data" in h3.text:
            break
    else:
        # Fix: previously the loop fell through and the last <h3> (or an
        # unbound name) was searched, failing with an obscure error.
        raise ValueError("'Vaccination Data' header not found — page layout may have changed")
    date = re.search(r"as of (\d+ \w+ \d+)", h3.text).group(1)
    date = clean_date(date, "%d %b %Y")
    return date
def parse_date(filename):
    """Extract the report date (dd.mm.yyyy) from page 1 of the PDF at *filename*."""
    with open(filename, mode="rb") as fh:
        first_page = PyPDF2.PdfFileReader(fh).getPage(0)
        raw_text = first_page.extractText()
    # The date sits on its own line, e.g. "\n01.02.2021\n".
    found = re.search(r"\n(?P<count>\d{1,2}.\d{1,2}.\d{4})\n", raw_text)
    return clean_date(found.group(1), "%d.%m.%Y")
def read(self) -> pd.Series:
    """Fetch the latest record and shape it into the standard output Series."""
    record = self._parse_data()
    return pd.Series({
        "total_vaccinations": record["Doses_Administered"],
        "people_vaccinated": record["Administered_Dose1_Recip"],
        "people_fully_vaccinated": record["Series_Complete_Yes"],
        "date": clean_date(record["Date"], "%Y-%m-%d"),
        "vaccine": self._parse_vaccines(record),
    })
def _parse_date(self, soup: BeautifulSoup) -> str:
    """Find the single paragraph carrying the update date and parse it.

    Raises:
        ValueError: if zero or several paragraphs match the date regex,
            i.e. the page layout changed.
    """
    pattern = re.compile(self.regex["date"])
    matches = [p for p in soup.find_all("p") if p.find(text=pattern)]
    # Fix: also reject the zero-match case, which previously surfaced as
    # an IndexError instead of a clear error message.
    if len(matches) != 1:
        raise ValueError("Format of source has changed")
    date_str = clean_date(matches[0].text, "ажурирано %d.%m.%Y")
    return date_str
def read(self):
    """Query the ArcGIS endpoint and return the latest attributes as a Series."""
    attrs = requests.get(
        self.source_url).json()["features"][0]["attributes"]
    # The timestamp arrives in epoch milliseconds.
    report_date = clean_date(datetime.fromtimestamp(attrs["Date"] / 1000))
    return pd.Series({
        "total_vaccinations": attrs["Vaccine_total"],
        # NOTE(review): the field name suggests a last-24h figure — confirm
        # it really holds the fully-vaccinated total.
        "people_fully_vaccinated": attrs["Vaccine_total_last24"],
        "date": report_date,
    })
def _parse_date(self, driver):
    """Drive the Tableau UI to open the data download and read its date.

    The sleeps give the dashboard time to react to each click.
    """
    driver.find_element_by_id("tabZoneId87").click()
    time.sleep(1)
    driver.find_element_by_id("download-ToolbarButton").click()
    time.sleep(2)
    # Fix: dropped a pointless f-string prefix on a literal with no placeholders.
    driver.find_element_by_xpath(
        "//button[contains(text(),'Data')]").click()
    time.sleep(4)
    # The download view opens in a new window; switch to it.
    window_after = driver.window_handles[1]
    driver.switch_to.window(window_after)
    time.sleep(2)
    date_str = driver.find_element_by_tag_name("tbody").text
    return clean_date(date_str, "%m/%d/%Y")
def parse_data(data: dict) -> pd.Series:
    """Map the raw API payload onto the standard vaccination Series."""
    return pd.Series(
        data={
            "date": clean_date(data["updated"], "%Y/%m/%d"),
            "people_vaccinated": data["progress"],
            "people_fully_vaccinated": data["completed"],
            "vaccine": ", ".join(_get_vaccine_names(data, translate=True)),
        })
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read the three odometer counters and the dd.mm.yyyy date from the page."""
    odometers = soup.find_all(class_="odometer")
    raw_date = re.search(r"[\d\.]{10}", soup.find(class_="counter").text).group(0)
    # Counter order on the page: total, first dose, second dose.
    total, dose1, dose2 = (int(o["data-count"]) for o in odometers[:3])
    return pd.Series(
        data={
            "total_vaccinations": total,
            "people_vaccinated": dose1,
            "people_fully_vaccinated": dose2,
            "date": clean_date(raw_date, "%d.%m.%Y"),
        })
def read(source: str) -> pd.Series:
    """Fetch the JSON stats endpoint and return the headline vaccination block."""
    payload = requests.get(source).json()
    vax = payload["topBlock"]["vaccination"]
    return pd.Series({
        "date": clean_date(payload["timestamp"], "%Y-%m-%d %H:%M:%S"),
        "people_vaccinated": vax["tot_dose_1"],
        "people_fully_vaccinated": vax["tot_dose_2"],
        "total_vaccinations": vax["total_doses"],
    })
def parse_data_pdf(self, link) -> dict:
    """Parse the headline vaccination count and report date from the PDF at *link*."""
    text = self._get_pdf_text(link)
    pattern = (
        r"([\d,]+) people have been vaccinated against COVID-19 as of (\d{1,2})(?:th|nd|st|rd) ([a-zA-Z]+) (202\d)"
    )
    found = re.search(pattern, text)
    vaccinated = clean_count(found.group(1))
    # Groups 2-4: day, month name, year.
    report_date = clean_date(" ".join(found.group(2, 3, 4)), "%d %B %Y")
    return {
        "total_vaccinations": vaccinated,
        "people_vaccinated": vaccinated,
        "people_fully_vaccinated": 0,
        "date": report_date,
        "source_url": link,
    }
def parse_data(self, soup: BeautifulSoup) -> pd.Series:
    """Parse date and dose metrics out of the page text via the instance regexes."""
    data = {}
    title_match = re.search(self.regex["title"], soup.text)
    if title_match:
        # The page omits the year, so the current year is assumed.
        date_str = title_match.group(1)
        data["date"] = clean_date(f"{date_str} {datetime.now().year}",
                                  "%d de %B %Y",
                                  lang="es")
        data["total_vaccinations"] = clean_count(title_match.group(2))
    body_match = re.search(self.regex["data"], soup.text)
    if body_match:
        data["people_vaccinated"] = clean_count(body_match.group(1))
        data["people_fully_vaccinated"] = clean_count(body_match.group(3))
    return pd.Series(data)
def parse_date(df: dict) -> str:
    """Find the single "Journée du dd.mm.yyyy" cell in the header row and return its date.

    Raises:
        ValueError: if not exactly one column matches the expected pattern.
    """
    colnames = df.loc[0]
    # Fix: removed a stale assignment that read "Unnamed: 1" and was
    # immediately discarded; it could also KeyError when that column
    # was absent.
    candidates = [
        re.search(r"Journée du (\d{1,2}.\d{1,2}.\d{4})", col)
        for col in colnames.astype(str)
    ]
    matches = [m for m in candidates if m is not None]
    if len(matches) != 1:
        raise ValueError("Something changed in the columns!")
    date = clean_date(matches[0].group(1), "%d.%m.%Y")
    return date
def _propose_df(self):
    """Collect dated vaccination-update tweets into a DataFrame."""
    pattern = r"COVID-19 : Vaccination Updates\n\n(\d{1,2}\.\d{1,2}\.202\d).*"
    rows = []
    for tweet in self.tweets:
        found = re.search(pattern, tweet.full_text)
        if not found:
            continue
        day = clean_date(found.group(1), "%d.%m.%Y")
        # Stop as soon as we pass the search window.
        if self.stop_search(day):
            break
        rows.append({
            "date": day,
            "text": tweet.full_text,
            "source_url": self.build_post_url(tweet.id),
            "media_url": tweet.entities["media"][0]["media_url_https"] if "media" in tweet.entities else None,
        })
    return pd.DataFrame(rows)
def _propose_df(self):
    """Collect dated situation-report tweets (French dates) into a DataFrame."""
    pattern = r"Recevez la situation .* au (\d{1,2} [a-z]+ 202\d)\."
    rows = []
    for tweet in self.tweets:
        found = re.search(pattern, tweet.full_text)
        if not found:
            continue
        day = clean_date(found.group(1), "%d %B %Y", lang="fr")
        # Stop as soon as we pass the search window.
        if self.stop_search(day):
            break
        rows.append({
            "date": day,
            "text": tweet.full_text,
            "source_url": self.build_post_url(tweet.id),
            "media_url": tweet.entities["media"][0]["media_url_https"] if "media" in tweet.entities else None,
        })
    return pd.DataFrame(rows)
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read the three counters and the dd-mm-yyyy source date from the page."""
    counters = soup.find_all(class_="counter")
    total_vaccinations = int(counters[0].text)
    people_vaccinated = int(counters[1].text)
    people_fully_vaccinated = int(counters[2].text)
    # Basic monotonicity checks on the scraped values.
    assert total_vaccinations >= people_vaccinated
    assert people_vaccinated >= people_fully_vaccinated
    raw = soup.find(class_="fuente").text
    day = clean_date(re.search(r"\d{2}-\d{2}-\d{4}", raw).group(0), "%d-%m-%Y")
    return pd.Series(data={
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": day,
    })
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Read dose counters from the page; total doses = dose1 + dose2."""
    counts = soup.find_all(class_="count")
    people_vaccinated = int(counts[0]["data-count"])
    people_fully_vaccinated = int(counts[1]["data-count"])
    # Sanity: first doses can never trail completed schedules.
    assert people_vaccinated >= people_fully_vaccinated
    raw = soup.find(class_="reportdate").text
    day = clean_date(re.search(r"\d+ \w+ 202\d", raw).group(0), "%d %b %Y")
    return pd.Series(data={
        "total_vaccinations": people_vaccinated + people_fully_vaccinated,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": day,
    })
def parse_data(soup: BeautifulSoup) -> pd.Series:
    """Download the newest situation-report PDF and parse per-vaccine dose counts."""
    # Locate the newest situation-report PDF link.
    for anchor in soup.find(class_="rt-article").find_all("a"):
        if "sitrep-sl-en" in anchor["href"]:
            pdf_path = "https://www.epid.gov.lk" + anchor["href"]
            break
    # Download to a temp file and extract the text of page 1.
    tf = tempfile.NamedTemporaryFile()
    with open(tf.name, mode="wb") as f:
        f.write(requests.get(pdf_path).content)
    with open(tf.name, mode="rb") as f:
        text = PyPDF2.PdfFileReader(f).getPage(0).extractText().replace("\n", "")

    def doses_for(label):
        # Each table row reads "<name> Vaccine <dose1> <dose2>".
        found = re.search(label + r" Vaccine +(\d+) (\d+)", text)
        return clean_count(found.group(1)), clean_count(found.group(2))

    covishield_dose1, covishield_dose2 = doses_for("Covishield")
    sinopharm_dose1, sinopharm_dose2 = doses_for("Sinopharm")
    date = clean_date(
        re.search(r"Situation Report\s+([\d\.]{10})", text).group(1),
        "%d.%m.%Y")
    return pd.Series(
        data={
            "total_vaccinations": covishield_dose1 + covishield_dose2 + sinopharm_dose1 + sinopharm_dose2,
            "people_vaccinated": covishield_dose1 + sinopharm_dose1,
            "people_fully_vaccinated": covishield_dose2 + sinopharm_dose2,
            "date": date,
            "source_url": pdf_path,
        })
def _parse_data(self, source: str) -> pd.Series:
    """Aggregate the raw dose table into totals, the latest date and vaccine list."""
    raw = self._get_data_raw(source)
    # The "合計" (total) row carries the aggregate columns.
    total_doses = raw.loc["合計", "接種回数"].item()
    first_doses = raw.loc["合計", "内1回目"].sum()
    second_doses = raw.loc["合計", "内2回目"].sum()
    # The index mixes dates with other labels; keep only the newest datetime.
    latest = clean_date(
        max(dt for dt in raw.index.values if isinstance(dt, datetime)))
    return pd.Series(
        data={
            "total_vaccinations": total_doses,
            "people_vaccinated": first_doses,
            "people_fully_vaccinated": second_doses,
            "date": latest,
            "vaccine": self._parse_vaccines(raw),
        })
def read(source: str) -> pd.Series:
    """Fetch *source* with browser-like headers and parse Russian-language stats.

    The figures are pulled from free text with Cyrillic regexes; numbers use
    spaces as thousands separators, which clean_count normalises.
    """
    # Browser-like headers — presumably the site rejects default
    # requests user agents (TODO confirm).
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    soup = BeautifulSoup(
        requests.get(source, headers=headers).content, "html.parser")
    text = soup.find("div", id="data").find("p").text
    # "На сегодня (dd.mm.yy)" — two-digit year.
    date = re.search(r"На сегодня \(([\d\.]{8})\)", text).group(1)
    date = clean_date(date, "%d.%m.%y")
    # "... чел. (x% от населения) - привито хотя бы одним компонентом" —
    # people with at least one dose.
    people_vaccinated = re.search(
        r"([\d\s]+) чел\. \([\d\.]+% от населения[^)]*\) - привито хотя бы одним компонентом вакцины",
        text).group(1)
    people_vaccinated = clean_count(people_vaccinated)
    # "... полностью привито" — fully vaccinated people.
    people_fully_vaccinated = re.search(
        r"([\d\s]+) чел\. \([\d\.]+% от населения,?[^)]*\) - полностью привито",
        text).group(1)
    people_fully_vaccinated = clean_count(people_fully_vaccinated)
    # "... шт. - всего прививок сделано" — total doses administered.
    total_vaccinations = re.search(r"([\d\s]+) шт\. - всего прививок сделано",
                                   text).group(1)
    total_vaccinations = clean_count(total_vaccinations)
    return pd.Series({
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    })
def _propose_df(self):
    """Build a DataFrame of vaccination reports parsed from tweets.

    people_fully_vaccinated is derived as doses administered minus people
    vaccinated; pd.NA propagates when either figure is missing.
    """
    regex = r"VACUNACIÓN #COVID19 \| Reporte del (\d{1,2}\.\d{1,2}\.202\d) - \d{1,2}:\d{1,2}"
    data = []
    for tweet in self.tweets:
        match = re.search(regex, tweet.full_text)
        if match:
            regex_doses = r"Total Dosis Administradas: ([\d\.]+)"
            total_vaccinations = re.search(regex_doses, tweet.full_text)
            if total_vaccinations:
                total_vaccinations = clean_count(total_vaccinations.group(1))
            else:
                total_vaccinations = pd.NA
            regex_people = r"Total personas vacunadas: ([\d\.]+)"
            people_vaccinated = re.search(regex_people, tweet.full_text)
            if people_vaccinated:
                people_vaccinated = clean_count(people_vaccinated.group(1))
            else:
                people_vaccinated = pd.NA
            people_fully_vaccinated = total_vaccinations - people_vaccinated
            dt = clean_date(match.group(1), "%d.%m.%Y")
            if self.stop_search(dt):
                break
            data.append({
                "date": dt,
                "total_vaccinations": total_vaccinations,
                "people_vaccinated": people_vaccinated,
                "people_fully_vaccinated": people_fully_vaccinated,
                "text": tweet.full_text,
                # Fix: was hardcoded to 1 with the real call left commented
                # out; use the same builder as the sibling scrapers.
                "source_url": self.build_post_url(tweet.id),
                "media_url": tweet.extended_entities["media"][0]["media_url_https"],
            })
    df = pd.DataFrame(data)
    return df
def read(source: str) -> pd.Series:
    """Scrape the dose counters and last-update date from the page at *source*."""
    soup = get_soup(source)
    counters = soup.find_all(class_="counter")
    partially = clean_count(counters[0].text)
    fully = clean_count(counters[1].text)
    total = clean_count(counters[2].text)
    raw = soup.find("span", id="last-update").text
    day = clean_date(re.search(r"\d+.*202\d", raw).group(0), "%d %B, %Y")
    return pd.Series(data={
        "total_vaccinations": total,
        # At least one dose = partially + fully vaccinated.
        "people_vaccinated": partially + fully,
        "people_fully_vaccinated": fully,
        "date": day,
        "source_url": source,
    })
def _propose_df(self):
    """Collect the minister's dated #COVID19 update tweets into a DataFrame."""
    pattern = r"Minister of Health Lizzie Nkosi's #COVID19 update on (\d{1,2} [a-zA-Z]+ 202\d)"
    rows = []
    for tweet in self.tweets:
        found = re.search(pattern, tweet.full_text)
        if not found:
            continue
        day = clean_date(found.group(1), "%d %B %Y")
        # Stop as soon as we pass the search window.
        if self.stop_search(day):
            break
        rows.append({
            "date": day,
            "text": tweet.full_text,
            "source_url": self.build_post_url(tweet.id),
            "media_url": tweet.extended_entities["media"][0]["media_url_https"],
        })
    return pd.DataFrame(rows)
def connect_parse_data(source: str) -> pd.Series:
    """Scrape the two dose counters and the report date from the page."""
    soup = get_soup(source)
    blocks = soup.find_all(class_="repart-stlucia")
    dose1 = clean_count(blocks[0].text)
    dose2 = clean_count(blocks[1].text)
    raw = soup.find(class_="h2-blue").text
    day = clean_date(re.search(r"\w+ +\d+, +202\d", raw).group(0), "%B %d, %Y")
    return pd.Series(data={
        "total_vaccinations": dose1 + dose2,
        "people_vaccinated": dose1,
        "people_fully_vaccinated": dose2,
        "date": day,
    })