def _parse_metrics(self, soup):
    """Extract vaccination counters from the page text via the configured regexes."""
    page_text = soup.text
    # The title regex carries total_vaccinations in its second capture group.
    title_match = re.search(self.regex["title"], page_text)
    data = {"total_vaccinations": clean_count(title_match.group(2))}
    for metric in ("people_vaccinated", "people_fully_vaccinated", "total_boosters"):
        metric_match = re.search(self.regex[metric], page_text)
        data[metric] = clean_count(metric_match.group(1))
    return data
def _parse_data_metrics(self, soup) -> dict:
    """Parse first- and second-dose counters from the page.

    Raises:
        ValueError: if people_fully_vaccinated (dose_2) exceeds
            people_vaccinated (dose_1), which indicates inconsistent source data.
    """
    counters = soup.find_all(class_="text-brand-blue")
    dose_1 = clean_count(
        re.search(r"Innuttaasut ([\d\.]+)", counters[1].parent.find_all("dd")[-1].text).group(1))
    dose_2 = clean_count(
        re.search(r"Innuttaasut ([\d\.]+)", counters[2].parent.find_all("dd")[-1].text).group(1))
    if dose_1 < dose_2:
        # Fixed message: this guard fires when dose_2 exceeds dose_1,
        # not the other way around as the old message claimed.
        raise ValueError("dose_2 cannot be higher than dose_1")
    return {"people_vaccinated": dose_1, "people_fully_vaccinated": dose_2}
def _parse_data(self) -> tuple:
    """Parses the data from the source.

    Queries the endpoint once per vaccine/dose payload, extracts the counter
    from each response, and delegates dataframe construction to _build_df.
    The five copy-pasted request stanzas are collapsed into one loop.
    """
    payload_names = [
        "Pfizer_first",
        "Pfizer_second",
        "Pfizer_booster",
        "Johnson_first",
        "Johnson_booster",
    ]
    metrics = []
    date = None
    for name in payload_names:
        response = self._request_payload(name)
        value = response["results"][0]["result"]["data"]["dsr"]["DS"][0]["PH"][0]["DM0"][0]["M0"]
        metrics.append(clean_count(value))
        if date is None:
            # As before, the timestamp is taken from the first response only.
            date = response["results"][0]["result"]["data"]["timestamp"]
    # parse date
    date = self._parse_date(date)
    # build dataframe
    df_main, df_manufacturer = self._build_df(metrics, date)
    return df_main, df_manufacturer

def _request_payload(self, name: str) -> dict:
    """POST the named payload to the source URL and return the decoded JSON body."""
    return json.loads(
        requests.post(
            self.source_url, headers=self.headers, data=json.dumps(self._payload(name))
        ).content
    )
def _parse_metrics(self, text: str) -> tuple:
    """Parse metrics from text.

    Runs the `doses` regex once (the original re-ran the same search five
    times) and returns the five counters in source order. Raises
    AttributeError if the pattern does not match, same as before.
    """
    match = re.search(self.regex["doses"], text)
    return (
        clean_count(match.group(1)),  # total_vaccinations
        clean_count(match.group(2)),  # people_vaccinated
        clean_count(match.group(3)),  # people_fully_vaccinated
        clean_count(match.group(4)),  # total_boosters
        clean_count(match.group(5)),  # dose_after_positive
    )
def _parse_metrics(self, soup: BeautifulSoup) -> dict:
    """Parse metrics from soup.

    Returns a dict of single-element lists (fixed: the old annotation said
    ``int`` but the method has always returned a dict).
    """
    text = soup.get_text()
    # Remove commas used as thousands separators so the regexes see plain digits.
    text = re.sub(r"(\d),(\d)", r"\1\2", text)
    people_vaccinated = clean_count(
        re.search(self.regex["people_vaccinated"], text).group(1))
    # NOTE(review): group(2) here vs group(1) above — presumably the
    # people_fully_vaccinated pattern has an extra group; confirm against self.regex.
    people_fully_vaccinated = clean_count(
        re.search(self.regex["people_fully_vaccinated"], text).group(2))
    total_vaccinations = people_vaccinated + people_fully_vaccinated
    df = {
        "people_vaccinated": [people_vaccinated],
        "people_fully_vaccinated": [people_fully_vaccinated],
        "total_vaccinations": [total_vaccinations],
    }
    return df
def _parse_metrics(self, elem: element.Tag) -> int:
    """Parse metrics from element"""
    title = elem.find_previous_sibling("div", class_="t192__title")
    # The counter uses spaces as thousands separators; strip them before parsing.
    return clean_count(title.text.replace(" ", ""))
def export(self):
    """Scrape the cumulative testing count and refresh the output CSV."""
    url = "https://guineasalud.org/estadisticas/"
    soup = get_soup(url)
    rows = soup.find_all("tr")
    count = clean_count(rows[9].find_all("td")[-1].text)
    date_str = date.today().strftime("%Y-%m-%d")
    df = pd.DataFrame({
        "Country": self.location,
        "Date": [date_str],
        "Cumulative total": count,
        "Source URL": url,
        "Source label": "Ministerio de Sanidad y Bienestar Social",
        "Units": "tests performed",
        "Notes": pd.NA,
    })
    if os.path.isfile(self.output_path):
        history = pd.read_csv(self.output_path)
        # Merge with history only when both the count and the date moved forward.
        if count > history["Cumulative total"].max() and date_str > history["Date"].max():
            df = pd.concat([df, history]).sort_values(
                "Date", ascending=False).drop_duplicates()
    df.to_csv(self.output_path, index=False)
def export(self):
    """Scrape the latest cumulative test count and append it to the dataset.

    Appends a new row only when both the count and the local date have
    advanced past the current maxima. (Removed a leftover debug ``print``
    that dumped an unrelated selector on every run.)
    """
    data = pd.read_csv(self.output_path)
    url = "http://cdcmoh.gov.kh/"
    soup = get_soup(url)
    count = clean_count(soup.select("p+ div strong:nth-child(1)")[0].text)
    date_str = localdatenow("Asia/Phnom_Penh")
    if count > data["Cumulative total"].max() and date_str > data["Date"].max():
        new = pd.DataFrame({
            "Country": self.location,
            "Date": [date_str],
            "Cumulative total": count,
            "Source URL": url,
            "Source label": "CDCMOH",
            "Units": "tests performed",
        })
        data = pd.concat([new, data], sort=False)
    self.export_datafile(data)
def _parse_data(self) -> dict:
    """Fetch the cumulative test count and pair it with the local date.

    Fixed: the annotation said ``pd.Series`` but a plain dict has always been
    returned; also replaced deprecated positional ``data[0]`` access on a
    string-labeled Series with explicit label access.
    """
    stats = request_json(self.source_url)["stats"]
    record = pd.DataFrame.from_records(stats, columns=["tested"]).iloc[0]
    return {
        "count": clean_count(record["tested"]),
        "date": localdate("Atlantic/Faeroe"),
    }
def _parse_metrics(self, text: str) -> int:
    """Get metrics from news text."""
    match = re.search(self.regex["booster"], text)
    if match is None:
        raise TypeError("Website Structure Changed, please update the script")
    return clean_count(match.group(1))
def _parse_metrics(self, json_data: dict) -> dict:
    """Parses metrics from JSON"""
    # All metrics live under the same entities branch of the payload.
    entities = json_data["elements"]["content"]["content"]["entities"]
    return {
        metric: clean_count(
            entities[entity]["props"]["content"]["blocks"][0]["text"])
        for metric, entity in self.metric_entities.items()
    }
def _parse_metrics(self, json_data):
    """Extract one counter per configured metric from the chart JSON."""
    entities = json_data["elements"]["content"]["content"]["entities"]
    data = {}
    for metric, entity in self.metric_entities.items():
        raw = entities[entity]["props"]["chartData"]["data"][0][0][0]
        # The value is embedded in an HTML snippet, e.g. '...18px;">1.234'.
        data[metric] = clean_count(re.search(r'18px;">([\d\.]+)', raw).group(1))
    return data
def _parse_data(self, url: str) -> pd.Series:
    """Parse the data from the pdf url"""
    text = self._get_text_from_pdf(url)

    def first_group(key):
        # Every configured regex exposes its value in capture group 1.
        return re.search(self.regex[key], text).group(1)

    data = {
        "total_vaccinations": clean_count(first_group("total_vaccinations")),
        "people_vaccinated": clean_count(first_group("people_vaccinated")),
        "people_fully_vaccinated": clean_count(first_group("people_fully_vaccinated")),
        "date": clean_date(first_group("date"), "%d %B, %Y", lang="es"),
    }
    self._check_data(data)
    return pd.Series(data)
def _parse_metrics(self, soup: BeautifulSoup) -> dict:
    """Parse metrics from the soup.

    Returns a dict of single-element lists (fixed: the old annotation said
    ``int`` but the method has always returned a dict).

    Raises:
        ValueError: if any of the four labels cannot be found on the page.
    """
    total = soup.find(text=self.regex["Total"])
    dose2 = soup.find(text=self.regex["Dose2"])
    dose3 = soup.find(text=self.regex["Dose3"])
    boosters = soup.find(text=self.regex["Boosters"])
    if not total or not dose2 or not dose3 or not boosters:
        raise ValueError("Metrics not found, please update the script")
    # Each label's value sits in the element immediately following its parent.
    total_vaccinations = clean_count(total.parent.find_next().text)
    people_fully_vaccinated = clean_count(dose2.parent.find_next().text)
    # Third doses and boosters are reported separately; combine them.
    total_boosters = clean_count(
        dose3.parent.find_next().text) + clean_count(
            boosters.parent.find_next().text)
    # First doses are derived, not reported directly.
    people_vaccinated = total_vaccinations - people_fully_vaccinated - total_boosters
    df = {
        "people_vaccinated": [people_vaccinated],
        "people_fully_vaccinated": [people_fully_vaccinated],
        "total_boosters": [total_boosters],
        "total_vaccinations": [total_vaccinations],
    }
    return df
def _parse_metrics_from_soup(self, soup: BeautifulSoup) -> list:
    """Get metrics from soup.

    Returns the four counters as a list in fixed order:
    [total_vaccinations, people_vaccinated, people_fully_vaccinated,
    total_boosters]. (Fixed: the old annotation said ``tuple`` but a list
    has always been returned.)
    """
    metrics = [
        "total_vaccinations",
        "people_vaccinated",
        "people_fully_vaccinated",
        "total_boosters",
    ]
    # Each label cell is followed by a sibling cell holding its value.
    count = [
        clean_count(
            soup.find(text=re.compile(self.regex[metric])).find_parent(
                "div", class_="wptb-text-container").find_next_sibling(
                    "div", class_="wptb-text-container").text)
        for metric in metrics
    ]
    return count
def _parse_metrics(self, elem: element.Tag) -> int:
    """Parse metrics from element"""
    sibling = elem.find_next_sibling("p", class_="case-Number")
    return clean_count(sibling.text)
def _parse_metrics(self, soup: BeautifulSoup) -> int:
    """Parse metrics from soup"""
    # Drop dots used as thousands separators before matching the counter.
    text = re.sub(r"(\d)\.(\d)", r"\1\2", soup.get_text())
    return clean_count(re.search(self.regex["count"], text).group(1))
def _parse_metrics(self, elem: element.Tag) -> int:
    """Parse metrics from element"""
    # The animated counter widget stores its final value in data-options.
    options = json.loads(elem.attrs["data-options"])
    return clean_count(options["endVal"])
def _df_builder(self, count: str) -> pd.DataFrame:
    """Builds dataframe from the text data"""
    return pd.DataFrame({"Cumulative total": [clean_count(count)]})
def _parse_metrics(self, elem: element.Tag) -> int:
    """Parse metrics from element"""
    raw = elem.find_next_sibling("td").text
    return clean_count(raw)
def _parse_metrics(self, elem: element.Tag) -> int:
    """Parse metrics from element"""
    raw = elem.find_next_sibling().text
    # Keep only the digits before parsing.
    digits = re.sub(r"\D", "", raw)
    return clean_count(digits)
def _parse_metrics(self, elem: element.Tag) -> int:
    """Parse metrics from element"""
    sibling = elem.find_previous_sibling("strong")
    return clean_count(sibling.text)
def _parse_metrics(self, elem: element.Tag) -> int:
    """Parse metrics from element"""
    # The counter widget exposes its value via a data attribute.
    counter = elem.find(class_="stats-number")
    return clean_count(counter["data-counter-value"])
def pipe_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
    """Parse metrics from source"""
    payload = request_json(self.source_url)
    count = payload["features"][0]["attributes"]["value"]
    return df.assign(**{"Cumulative total": clean_count(count)})
def _parse_metrics(self, text: str) -> int:
    """Parse metrics from text"""
    match = re.search(self.regex["count"], text)
    return clean_count(match.group(1))
def _parse_metrics(self, elem: element.Tag) -> int:
    """Parse metrics from element"""
    # Keep only the digits from the element's text.
    digits = re.sub(r"\D", "", elem.text)
    return clean_count(digits)
def _parse_metrics(self, soup: BeautifulSoup) -> int:
    """Parse metrics from soup"""
    node = soup.find("div", class_="test-stlucia")
    return clean_count(node.text)
def _parse_metrics(self, t_scraper: TableauScraper) -> int:
    """Parse metrics from TableauScraper"""
    worksheet = t_scraper.getWorksheet("Resumen")
    count = worksheet.data.loc[0, "SUM(Cantidad Pruebas)-alias"]
    return clean_count(int(count))
def _parse_metrics(self, soup: BeautifulSoup) -> int:
    """Parse metrics from soup"""
    raw = soup.find("table").find_all("span")[1].text
    # Strip everything the configured pattern matches before parsing.
    cleaned = re.sub(self.regex["count"], "", raw)
    return clean_count(cleaned)
def _parse_metrics(self, table: pd.DataFrame) -> int:
    """Parse metrics from table"""
    last_row = table.iloc[-1]
    return clean_count(last_row[0])