def connect_parse_data(self) -> pd.Series:
    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.get(self.source_url)
        time.sleep(5)
        total_vaccinations = clean_count(driver.find_element_by_id("counter1").text)
        total_boosters = clean_count(driver.find_element_by_id("counter4").text)
        # people_vaccinated_share = driver.find_element_by_id("counter4").text
        # assert "One dose" in people_vaccinated_share
        # people_fully_vaccinated_share = driver.find_element_by_id("counter4a").text
        # assert "Two doses" in people_fully_vaccinated_share
        # This logic is only valid as long as Qatar *exclusively* uses 2-dose vaccines
        # people_vaccinated_share = float(re.search(r"[\d.]+", people_vaccinated_share).group(0))
        # people_fully_vaccinated_share = float(re.search(r"[\d.]+", people_fully_vaccinated_share).group(0))
        # vaccinated_proportion = people_vaccinated_share / (people_vaccinated_share + people_fully_vaccinated_share)
        # people_vaccinated = round(total_vaccinations * vaccinated_proportion)
        # people_fully_vaccinated = total_vaccinations - people_vaccinated
    date = localdate("Asia/Qatar")
    data = {
        "total_vaccinations": total_vaccinations,
        "total_boosters": total_boosters,
        # "people_vaccinated": people_vaccinated,
        # "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    }
    return pd.Series(data=data)
def read(source: str) -> pd.Series:
    op = Options()
    op.add_argument("--headless")
    with webdriver.Chrome(options=op) as driver:
        driver.get(source)
        time.sleep(3)
        for h5 in driver.find_elements_by_tag_name("h5"):
            if "Primera dosis" in h5.text:
                people_vaccinated = clean_count(h5.find_element_by_xpath("./preceding-sibling::div").text)
            elif "Total dosis aplicadas" in h5.text:
                total_vaccinations = clean_count(h5.find_element_by_xpath("./preceding-sibling::div").text)
            elif "Población completamente vacunada" in h5.text:
                people_fully_vaccinated = clean_count(h5.find_element_by_xpath("./preceding-sibling::div").text)
            elif "Dosis refuerzo" in h5.text:
                total_boosters = clean_count(h5.find_element_by_xpath("./preceding-sibling::div").text)
    data = {
        "date": localdate("America/Santo_Domingo"),
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "total_vaccinations": total_vaccinations,
        "total_boosters": total_boosters,
    }
    return pd.Series(data=data)
def _export_log_info(df_exec, t_sec_1, t_sec_2):
    # print(len(df_new), len(MODULES_NAME), len(df_new) == len(MODULES_NAME))
    if len(df_exec) == len(MODULES_NAME):
        print("EXPORTING LOG DETAILS")
        details = system_details()
        date_now = localdate(force_today=True)
        machine = details["id"]
        # Export timings per country (drop rows already logged today by this machine)
        df_exec = df_exec.reset_index().assign(date=date_now, machine=machine)
        df = obj_from_s3(LOG_GET_COUNTRIES)
        df = df[df.date + df.machine != date_now + machine]
        df = pd.concat([df, df_exec])
        obj_to_s3(df, LOG_GET_COUNTRIES)
        # Export machine info (register this machine if it is not listed yet)
        data = obj_from_s3(LOG_MACHINES)
        if machine not in data:
            data = {**data, machine: details["info"]}
        obj_to_s3(data, LOG_MACHINES)
        # Export overall timing
        report = {"machine": machine, "date": date_now, "t_sec": t_sec_1, "t_sec_retry": t_sec_2}
        df_new = pd.DataFrame([report])
        df = obj_from_s3(LOG_GET_GLOBAL)
        df = df[df.date + df.machine != date_now + machine]
        df = pd.concat([df, df_new])
        obj_to_s3(df, LOG_GET_GLOBAL)
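# Toy illustration of the concatenated-key dedupe used above (hypothetical
# data, not part of the pipeline): rows whose date+machine key equals today's
# key are dropped before fresh rows are appended, so re-runs on the same day
# replace that day's entries instead of duplicating them.
import pandas as pd

log = pd.DataFrame({"date": ["2021-09-01", "2021-09-02"], "machine": ["m1", "m1"]})
kept = log[log.date + log.machine != "2021-09-02" + "m1"]
assert len(kept) == 1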
def _parse_date(self, soup):
    match = re.search(self.regex["title"], soup.text)
    date_str = match.group(1)
    date = clean_date(f"{date_str} {datetime.now().year}", "%d de %B %Y", lang="es")
    # The source omits the year; if assuming the current year puts the date in
    # the future, the report must be from last year.
    if date > localdate("America/Havana", force_today=True):
        date = clean_date(f"{date_str} {datetime.now().year - 1}", "%d de %B %Y", lang="es")
    return date
def _parse_data(self) -> dict:
    data = request_json(self.source_url)["stats"]
    data = pd.DataFrame.from_records(data, columns=["tested"]).iloc[0]
    return {
        "count": clean_count(data["tested"]),
        "date": localdate("Atlantic/Faeroe"),
    }
def main():
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Azerbaijan.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)
    source_url = "https://koronavirusinfo.az/az/page/statistika/azerbaycanda-cari-veziyyet"
    soup = get_soup(source_url)
    element = soup.find_all("div", class_="gray_little_statistic")[5].find("strong")
    cumulative_total = clean_count(element.text)
    if cumulative_total > data["Cumulative total"].max():
        new = pd.DataFrame({
            "Cumulative total": cumulative_total,
            "Date": [localdate("Asia/Baku")],
            "Country": "Azerbaijan",
            "Units": "tests performed",
            "Source URL": source_url,
            "Source label": "Cabinet of Ministers of Azerbaijan",
        })
        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
def main():
    path = os.path.join(get_project_dir(), "scripts", "scripts", "testing", "automated_sheets", "Nigeria.csv")
    data = pd.read_csv(path).sort_values(by="Date", ascending=False)
    source_url = "http://covid19.ncdc.gov.ng/"
    soup = get_soup(source_url)
    element = soup.find("div", class_="col-xl-3").find("span")
    cumulative_total = clean_count(element.text)
    if cumulative_total > data["Cumulative total"].max():
        new = pd.DataFrame({
            "Date": [localdate("Africa/Lagos")],
            "Cumulative total": cumulative_total,
            "Country": "Nigeria",
            "Units": "samples tested",
            "Source URL": source_url,
            "Source label": "Nigeria Centre for Disease Control",
        })
        df = pd.concat([new, data], sort=False)
        df.to_csv(path, index=False)
def connect_parse_data(source: str) -> pd.Series:
    soup = get_soup(source)
    counters = soup.find_all(class_="elementor-counter-number")
    assert len(counters) == 6, "New counter in dashboard?"
    total_vaccinations = clean_count(counters[0]["data-to-value"])
    first_doses = clean_count(counters[1]["data-to-value"])
    second_doses = clean_count(counters[2]["data-to-value"])
    unique_doses = clean_count(counters[3]["data-to-value"])
    booster_shots = clean_count(counters[4]["data-to-value"])
    immunocompromised_doses = clean_count(counters[5]["data-to-value"])
    # Single-shot ("unique") doses count towards both metrics
    people_vaccinated = first_doses + unique_doses
    people_fully_vaccinated = second_doses + unique_doses
    total_boosters = booster_shots + immunocompromised_doses
    date = localdate("America/Jamaica")
    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_boosters": total_boosters,
            "date": date,
        }
    )
def read(source: str) -> pd.Series:
    with get_driver() as driver:
        driver.get(source)
        time.sleep(10)
        for block in driver.find_elements_by_class_name("kpimetric"):
            if "1ste dosis" in block.text and "%" not in block.text:
                people_partly_vaccinated = clean_count(block.find_element_by_class_name("valueLabel").text)
            elif "2de dosis" in block.text and "%" not in block.text:
                people_fully_vaccinated = clean_count(block.find_element_by_class_name("valueLabel").text)
            elif "3de dosis" in block.text and "%" not in block.text:
                total_boosters = clean_count(block.find_element_by_class_name("valueLabel").text)
    people_vaccinated = people_partly_vaccinated + people_fully_vaccinated
    return pd.Series(
        data={
            # Total doses administered: first + second + booster doses
            "total_vaccinations": people_vaccinated + people_fully_vaccinated + total_boosters,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_boosters": total_boosters,
            "date": localdate("America/Paramaribo"),
        })
def _check_last_update(path, country):
    metadata = S3().get_metadata(path)
    last_update = metadata["LastModified"]
    now = localdate(force_today=True, as_datetime=True)
    num_days = (now - last_update).days
    if num_days > 4:  # Allow maximum 4 days delay
        raise FileExistsError(
            f"ICE File for {country} is too old ({num_days} days old)! Please check cowidev.vax.icer"
        )
def parse_data(self, soup):
    widgets = soup.find_all(class_="textwidget")
    total_vaccinations = clean_count(widgets[0].text)
    people_fully_vaccinated = clean_count(widgets[1].text)
    # Only valid while exclusively 2-dose vaccines are in use
    people_vaccinated = total_vaccinations - people_fully_vaccinated
    return pd.Series({
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": localdate("Asia/Tbilisi"),
    })
def read(self) -> pd.Series:
    soup = get_soup(self.source_url, timeout=30)
    metrics = self._parse_metrics(soup)
    vaccines = self._parse_vaccines(soup)
    date = localdate("Asia/Dhaka")
    return pd.Series(
        data={
            **metrics,
            "date": date,
            "vaccine": vaccines,
        }
    )
def _parse_data(self, soup: BeautifulSoup) -> pd.DataFrame:
    """Parse data from soup"""
    # Get the element (check the match before climbing to its parent tag,
    # since find() returns None when the text is missing)
    elem = soup.find(text="Müayinə aparılıb")
    if not elem:
        raise ValueError("Element not found, please update the script")
    elem = elem.parent
    # Get the metrics
    count = self._parse_metrics(elem)
    df = pd.DataFrame({
        "Date": [localdate("Asia/Baku")],
        "Cumulative total": [count],
    })
    return df
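# Toy illustration of the find(text=...) pattern above (hypothetical markup,
# not the real page structure): BeautifulSoup returns the matching
# NavigableString, and .parent climbs to the enclosing tag.
from bs4 import BeautifulSoup

toy = BeautifulSoup("<div><b>Müayinə aparılıb</b> 123</div>", "html.parser")
node = toy.find(text="Müayinə aparılıb")
assert node is not None and node.parent.name == "b"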
def _parse_data_date(self, soup) -> dict:
    date_raw = soup.find(class_="text-gray-500").text
    date = extract_clean_date(date_raw.strip() + str(datetime.now().year), self.regex["date"], "%d. %B%Y", lang="en")
    # The source omits the year; if assuming the current year yields a future
    # date, the report must be from last year.
    if date > localdate("America/Havana", force_today=True):
        date = extract_clean_date(date_raw.strip() + str(datetime.now().year - 1), self.regex["date"], "%d. %B%Y", lang="en")
    return {"date": date}
def read(self) -> pd.DataFrame:
    """Read data from source"""
    body = str(get_soup(self.source_url))
    # Get count
    count = 0
    if "Totaal Testen" in body:
        count = int(body.split("Totaal Testen")[0].split('data-counter-value="')[-1].split('"')[0])
    # Get negative results
    negative = 0
    if "Totaal negatieve" in body:
        negative = int(body.split("Totaal negatieve")[0].split('data-counter-value="')[-1].split('"')[0])
    df = pd.DataFrame(
        {
            "Date": [localdate("America/Paramaribo")],
            "Daily change in cumulative total": [count],
            "positive": [count - negative],
        }
    )
    return df
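# Quick standalone check of the split-based extraction above (toy markup):
# the counter value sits in the last data-counter-value attribute that
# precedes the label text.
toy_body = '<span data-counter-value="123"></span>Totaal Testen'
value = int(toy_body.split("Totaal Testen")[0].split('data-counter-value="')[-1].split('"')[0])
assert value == 123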
def read(source: str) -> pd.Series:
    soup = get_soup(source)
    people_vaccinated = clean_count(
        re.search(r"^[\d,]+", soup.find_all(class_="info-box-number")[2].text).group(0))
    people_fully_vaccinated = clean_count(
        re.search(r"^[\d,]+", soup.find_all(class_="info-box-number")[3].text).group(0))
    total_vaccinations = people_vaccinated + people_fully_vaccinated
    date = localdate("Asia/Dhaka")
    return pd.Series(
        data={
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "date": date,
        })
def connect_parse_data(source: str) -> pd.Series:
    soup = get_soup(source)
    df = pd.read_html(str(soup.find(class_="vaccination-count")))[0]
    assert df.shape == (3, 7)
    values = df.iloc[:, 2].values
    total_vaccinations = values[0]
    people_vaccinated = values[1]
    people_fully_vaccinated = values[2]
    assert total_vaccinations == people_vaccinated + people_fully_vaccinated
    date = localdate("Asia/Tokyo")
    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    }
    return pd.Series(data=data)
def connect_parse_data(source: str) -> pd.Series:
    soup = get_soup(source)
    az_dose1 = clean_count(soup.find_all(class_="yellow")[0].text)
    az_dose2 = clean_count(soup.find_all(class_="yellow")[1].text)
    assert az_dose1 >= az_dose2
    pfizer_dose1 = clean_count(soup.find_all(class_="yellow")[2].text)
    pfizer_dose2 = clean_count(soup.find_all(class_="yellow")[3].text)
    assert pfizer_dose1 >= pfizer_dose2
    people_vaccinated = az_dose1 + pfizer_dose1
    people_fully_vaccinated = az_dose2 + pfizer_dose2
    total_vaccinations = people_vaccinated + people_fully_vaccinated
    date = localdate("America/St_Lucia")
    data = {
        "total_vaccinations": total_vaccinations,
        "people_vaccinated": people_vaccinated,
        "people_fully_vaccinated": people_fully_vaccinated,
        "date": date,
    }
    return pd.Series(data=data)
def pipe_date(self, ds: pd.Series) -> pd.Series:
    date = localdate("Europe/Isle_of_Man")
    return enrich_data(ds, "date", date)
def _weekday_to_date(self, d):
    # Resolve "YYYY-Www" to a concrete date: Friday ("+5") by default, falling
    # back to Tuesday ("+2") if that Friday has not happened yet.
    new_date = clean_date(d + "+5", "%Y-W%W+%w")
    if new_date > localdate("Europe/London"):
        new_date = clean_date(d + "+2", "%Y-W%W+%w")
    return new_date
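# Quick standalone check of the "%Y-W%W+%w" format used above (hypothetical
# values, independent of the scraper): strptime resolves year + week number +
# weekday digit to a concrete date, "+5" being Friday.
from datetime import datetime

assert datetime.strptime("2021-W33+5", "%Y-W%W+%w").strftime("%Y-%m-%d") == "2021-08-20"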
def format_date(ds: pd.Series) -> pd.Series:
    date = localdate("Europe/Chisinau")
    return enrich_data(ds, "date", date)
def format_date(ds: pd.Series) -> pd.Series:
    date = localdate("Asia/Beirut")
    return enrich_data(ds, "date", date)
def pipe_date(self, ds: pd.Series) -> pd.Series:
    return enrich_data(ds, "date", localdate("Asia/Bishkek"))
def pipe_date(self, ds: pd.Series) -> pd.Series:
    date = localdate("Europe/Sofia")
    return enrich_data(ds, "date", date)
def pipe_age_date(self, df: pd.DataFrame) -> pd.DataFrame:
    df = df.rename(columns={"last_day_of_epi_week": "date"})
    # Use today's date for the current (still incomplete) epidemiological week
    df.loc[df.complete_epi_week == 0, "date"] = localdate("America/Lima")
    return df
def pipe_date(self, ds: pd.Series) -> pd.Series:
    date = localdate("Asia/Amman")
    return enrich_data(ds, "date", date)
def enrich_date(ds: pd.Series) -> pd.Series:
    date_str = localdate("America/Curacao")
    return enrich_data(ds, "date", date_str)
class Thailand:
    location: str = "Thailand"
    source_url: str = "https://ddc.moph.go.th/dcd/pagecontent.php?page=643&dept=dcd"
    base_url_template: str = "https://ddc.moph.go.th/vaccine-covid19/diaryReportMonth/{}/9/2021"
    regex_date: str = r"\s?ข้อมูล ณ วันที่ (\d{1,2}) (.*) (\d{4})"
    _year_difference_conversion = 543
    _current_month = localdate("Asia/Bangkok", date_format="%m")

    @property
    def regex_vax(self):
        regex_aux = r"\((?:รา|รำ)ย\)"
        regex_vax = (
            r" ".join([f"เข็มที่ {i} {regex_aux}" for i in range(1, 4)])
            + r" รวม \(โดส\)\s+([\d,]+)\s+([\d,]+)\s+([\d,]+)\s+([\d,]+)"
        )
        return regex_vax

    def read(self, last_update: str) -> pd.DataFrame:
        # Get Newest Month Report Page
        url_month = self.base_url_template.format(self._current_month)
        soup_month = get_soup(url_month)
        # Get links
        df = self._parse_data(soup_month, last_update)
        return df

    def _parse_data(self, soup: BeautifulSoup, last_update: str):
        links = self._get_month_links(soup)
        records = []
        for link in links:
            # print(link["date"])
            if link["date"] <= last_update:
                break
            records.append(self._parse_metrics(link))
        return pd.DataFrame(records)

    def _get_month_links(self, soup):
        links = soup.find_all("a", class_="selectModelMedia")
        links = [
            {
                "link": link.get("href"),
                "date": self._parse_date_from_link_title(link.parent.parent.text.strip()),
            }
            for link in links
        ]
        return sorted(links, key=lambda x: x["date"], reverse=True)

    def _parse_date_from_link_title(self, title):
        match = re.search(r".*สรุปวัคซีน ประจำวันที่\s+(\d+) .* (25\d\d)", title).group(1, 2)
        # Convert Buddhist-calendar year to Gregorian
        year = int(match[1]) - self._year_difference_conversion
        return clean_date(f"{year}-{self._current_month}-{match[0]}", "%Y-%m-%d")

    def _parse_metrics(self, link: dict):
        raw_text = self._text_from_pdf(link["link"])
        text = self._substitute_special_chars(raw_text)
        record = self._parse_variables(text)
        record["date"] = link["date"]
        record["source_url"] = link["link"].replace(" ", "%20")
        return record

    def _text_from_pdf(self, pdf_link: str):
        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, mode="wb") as f:
                f.write(requests.get(pdf_link).content)
            with open(tf.name, mode="rb") as f:
                viewer = SimplePDFViewer(f)
                viewer.render()
                raw_text = "".join(viewer.canvas.strings)
        return raw_text

    def _substitute_special_chars(self, raw_text: str):
        """Correct Thai Special Character Error."""
        special_char_replace = {
            "\uf701": "\u0e34",
            "\uf702": "\u0e35",
            "\uf703": "\u0e36",
            "\uf704": "\u0e37",
            "\uf705": "\u0e48",
            "\uf706": "\u0e49",
            "\uf70a": "\u0e48",
            "\uf70b": "\u0e49",
            "\uf70e": "\u0e4c",
            "\uf710": "\u0e31",
            "\uf712": "\u0e47",
            "\uf713": "\u0e48",
            "\uf714": "\u0e49",
        }
        special_char_replace = dict((re.escape(k), v) for k, v in special_char_replace.items())
        pattern = re.compile("|".join(special_char_replace.keys()))
        text = pattern.sub(lambda m: special_char_replace[re.escape(m.group(0))], raw_text)
        return text

    def _parse_variables(self, text: str):
        metrics = re.search(self.regex_vax, text).groups()
        people_vaccinated = clean_count(metrics[0])
        people_fully_vaccinated = clean_count(metrics[1])
        total_boosters = clean_count(metrics[2])
        total_vaccinations = clean_count(metrics[3])
        return {
            "total_vaccinations": total_vaccinations,
            "people_vaccinated": people_vaccinated,
            "people_fully_vaccinated": people_fully_vaccinated,
            "total_boosters": total_boosters,
        }

    def _parse_date(self, text: str):
        thai_date_replace = {
            # Months
            "มกราคม": 1,
            "กุมภาพันธ์": 2,
            "มีนาคม": 3,
            "เมษายน": 4,
            "พฤษภาคม": 5,
            "พฤษภำคม": 5,
            "มิถุนายน": 6,
            "มิถุนำยน": 6,
            "กรกฎาคม": 7,
            "กรกฎำคม": 7,
            "สิงหาคม": 8,
            "สิงหำคม": 8,
            "กันยายน": 9,
            "ตุลาคม": 10,
            "พฤศจิกายน": 11,
            "ธันวาคม": 12,
        }
        date_raw = re.search(self.regex_date, text)
        day = clean_count(date_raw.group(1))
        month = thai_date_replace[date_raw.group(2)]
        year = clean_count(date_raw.group(3)) - self._year_difference_conversion
        return clean_date(datetime(year, month, day))

    def pipe_location(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(location=self.location)

    def pipe_vaccine(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(vaccine="Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing, Sinovac")

    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.pipe(self.pipe_location).pipe(self.pipe_vaccine)

    def to_csv(self, paths):
        output_file = paths.tmp_vax_out(self.location)
        last_update = pd.read_csv(output_file).date.max()
        df = self.read(last_update)
        if not df.empty:
            df = df.pipe(self.pipeline)
            df = merge_with_current_data(df, output_file)
            df.to_csv(output_file, index=False)
def _week_to_date(self, week: int) -> str:
    """Converts week to date."""
    year = localdate("Asia/Amman", as_datetime=True).isocalendar().year
    date = clean_date(f"{year} {week} +5", "%Y %W +%w")
    return date
class Jordan(CountryTestBase):
    location: str = "Jordan"
    units: str = "tests performed"
    source_label: str = "Ministry of Health"
    week: int = localdate("Asia/Amman", as_datetime=True).isocalendar().week
    notes: str = ""
    source_url: str = (
        "https://wabi-west-europe-d-primary-api.analysis.windows.net/public/reports/querydata?synchronous=true"
    )
    source_url_ref: str = "https://corona.moh.gov.jo/ar"

    def read(self) -> pd.DataFrame:
        """Reads the data from the source"""
        try:
            count = self._request()
            return self._df_builder(count)
        except KeyError:
            raise KeyError("No value found. Please modify the payload and headers.")

    @property
    def headers(self):
        """Headers for the request"""
        return {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:85.0) Gecko/20100101 Firefox/85.0",
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "en-US",
            "X-PowerBI-ResourceKey": "f29483dd-2cd3-4be1-9fbd-6c67f0ca1037",
            "Content-Type": "application/json;charset=UTF-8",
            "Origin": "https://app.powerbi.com",
            "Referer": "https://app.powerbi.com/",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
        }

    def payload(self, week: str = None) -> dict:
        """Request payload"""
        data = {
            "version": "1.0.0",
            "queries": [
                {
                    "Query": {
                        "Commands": [
                            {
                                "SemanticQueryDataShapeCommand": {
                                    "Query": {
                                        "Version": 2,
                                        "From": [{"Name": "w", "Entity": "weekly data", "Type": 0}],
                                        "Select": [
                                            {
                                                "Aggregation": {
                                                    "Expression": {
                                                        "Column": {
                                                            "Expression": {"SourceRef": {"Source": "w"}},
                                                            "Property": "مجموع الفحوصات المخبرية التراكمي",
                                                        }
                                                    },
                                                    "Function": 0,
                                                },
                                                "Name": "Sum(weekly data.مجموع الفحوصات المخبرية التراكمي)",
                                            }
                                        ],
                                        "Where": [
                                            {
                                                "Condition": {
                                                    "In": {
                                                        "Expressions": [
                                                            {
                                                                "Column": {
                                                                    "Expression": {"SourceRef": {"Source": "w"}},
                                                                    "Property": "week",
                                                                }
                                                            }
                                                        ],
                                                        "Values": [[{"Literal": {"Value": f"{week}L"}}]],
                                                    }
                                                }
                                            },
                                        ],
                                    },
                                    "ExecutionMetricsKind": 1,
                                }
                            }
                        ]
                    },
                    "QueryId": "",
                    "ApplicationContext": {"DatasetId": "805d8b47-2e08-46cc-b1cd-7937fe585c59"},
                }
            ],
            "cancelQueries": [],
            "modelId": 1187812,
        }
        return data

    def _request(self) -> dict:
        """Requests data from source, stepping back one week at a time until a value is found."""
        response = json.loads(
            requests.post(self.source_url, headers=self.headers, data=json.dumps(self.payload(str(self.week)))).content
        )["results"][0]["result"]["data"]["dsr"]["DS"][0]["PH"][0]["DM0"][0]
        if "M0" in response.keys():
            response = response["M0"]
        else:
            self.week -= 1
            response = self._request()
        return response

    def _week_to_date(self, week: int) -> str:
        """Converts week to date."""
        year = localdate("Asia/Amman", as_datetime=True).isocalendar().year
        date = clean_date(f"{year} {week} +5", "%Y %W +%w")
        return date

    def _df_builder(self, count: str) -> pd.DataFrame:
        """Builds dataframe from the text data"""
        df = pd.DataFrame({"Cumulative total": [clean_count(count)]})
        return df

    def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
        """Pipes date."""
        return df.assign(Date=self._week_to_date(self.week))

    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        """Pipeline for data."""
        return df.pipe(self.pipe_date).pipe(self.pipe_metadata)

    def export(self):
        """Exports data to CSV."""
        df = self.read().pipe(self.pipeline)
        # Export to CSV
        self.export_datafile(df, attach=True)
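# Minimal usage sketch (hypothetical entry point; the repo's own runner and
# CountryTestBase wiring may differ):
if __name__ == "__main__":
    Jordan().export()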