def parse_weekly_pdf():
    """Extract isolation/quarantine numbers from the VD weekly PDF.

    Example sentence in the PDF (29.07.2020):
        Concernant le traçage des contacts de cas positifs, le 27 juillet,
        83 personnes étaient en isolement, 633 en quarantaine de contacts
        étroits et 901 en quarantaine de retour de voyage.

    Prints one DayData record for isolation / contact quarantine and a
    second one for travel-return quarantine. Returns early (with a note
    on stderr) when the expected sentence is missing from the PDF.

    Fix: all regex patterns are now raw strings; the previous plain
    strings relied on invalid escape sequences (\\d, \\s), which raise
    SyntaxWarning/DeprecationWarning on recent Python versions.
    """
    pdf_url = svc.get_weekly_pdf_url()
    pdf = sc.pdfdownload(pdf_url, silent=True)

    dd = sc.DayData(canton='VD', url=pdf_url)

    # The year is only printed in the "Situation au ..." header line.
    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    date = sc.find(
        r'Concernant le traçage des contacts de cas positifs, le (\d+.*),',
        pdf)
    if not date:
        print("isolated/quarantined numbers missing in weekly PDF of VD",
              file=sys.stderr)
        return
    dd.datetime = date + ' ' + year
    dd.isolated = sc.find(
        r'(\d+)\s(personnes|cas\spositifs)\sétaient\sen\sisolement', pdf)
    # Counts may contain a thousands separator, hence text_to_int().
    dd.quarantined = text_to_int(
        sc.find(r'(\d.\d+|\d+)\scontacts\sétroits\sen\squarantaine\.', pdf))
    print(dd)
    print('-' * 10)

    # Travel-return quarantine is reported for its own (possibly
    # different) date later in the same paragraph.
    dd = sc.DayData(canton='VD', url=pdf_url)
    date = sc.find(r'quarantaine. Le (\d+ .*),', pdf)
    dd.datetime = date + ' ' + year
    dd.quarantine_riskareatravel = text_to_int(
        sc.find(
            r', (\d.\d+|\d+)\spersonnes\sétaient\sen\squarantaines?\ssuite\sà\sun\sretour\sde\svoyage.',
            pdf))
    print(dd)
    print('-' * 10)
def parse_weekly_pdf():
    """Locate and scrape the weekly epidemiological PDF on infosan.vd.ch.

    Navigates: landing page -> "point epidemiologique" article -> PDF link,
    then extracts isolation / quarantine numbers.

    Example sentence in the PDF (29.07.2020):
        Concernant le traçage des contacts de cas positifs, le 27 juillet,
        83 personnes étaient en isolement, 633 en quarantaine de contacts
        étroits et 901 en quarantaine de retour de voyage.

    Fix: all regex patterns are now raw strings; the previous plain
    strings relied on invalid escape sequences (\\d, \\s, \\.), which
    raise SyntaxWarning/DeprecationWarning on recent Python versions.
    """
    sc.add_cert_to_bundle()
    base_url = 'https://www.infosan.vd.ch'
    d = sc.download(base_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    html_url = base_url + soup.find(
        href=re.compile(r"/publications/covid-19-point-epidemiologique")
    ).get('href')
    d = sc.download(html_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    pdf_url = base_url + soup.find(href=re.compile(r"\.pdf$")).get('href')
    pdf = sc.pdfdownload(pdf_url, silent=True)

    dd = sc.DayData(canton='VD', url=pdf_url)
    # The year is only printed in the "Situation au ..." header line.
    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    date = sc.find(
        r'Concernant le traçage des contacts de cas positifs, le (\d+.*),',
        pdf)
    dd.datetime = date + ' ' + year
    dd.isolated = sc.find(
        r'(\d+)\s(personnes|cas\spositifs)\sétaient\sen\sisolement', pdf)
    # Counts may contain a thousands separator, hence text_to_int().
    dd.quarantined = text_to_int(
        sc.find(r'(\d.\d+|\d+)\scontacts\sétroits\sen\squarantaine\.', pdf))
    print(dd)
    print('-' * 10)

    # Travel-return quarantine is reported for its own date.
    dd = sc.DayData(canton='VD', url=pdf_url)
    date = sc.find(r'quarantaine. Le (\d+ .*),', pdf)
    dd.datetime = date + ' ' + year
    dd.quarantine_riskareatravel = text_to_int(
        sc.find(
            r', (\d.\d+|\d+)\spersonnes\sétaient\sen\squarantaines\ssuite\sà\sun\sretour\sde\svoyage.',
            pdf))
    print(dd)
    print('-' * 10)
def parse_weekly_pdf():
    """Extract total cases, hospitalized and ICU counts from the VD weekly PDF.

    Prints a single DayData record. The "." wildcards in the patterns stand
    in for accented characters ("épidémiologique", "été testées", ...) that
    the PDF text extraction does not reproduce reliably.

    Fix: all regex patterns are now raw strings; the previous plain strings
    relied on invalid escape sequences (\\d, \\s), which raise
    SyntaxWarning/DeprecationWarning on recent Python versions.
    """
    pdf_url = svc.get_weekly_pdf_url()
    pdf = sc.pdfdownload(pdf_url, silent=True)

    dd = sc.DayData(canton='VD', url=pdf_url)
    dd.datetime = sc.find(r'Point .pid.miologique au (\d+\s+\w+\s+\d{4})',
                          pdf)
    # Counts may contain a thousands separator, hence text_to_int().
    dd.cases = text_to_int(
        sc.find(
            r'\s(\d+.\d+)\s+personnes ont .t. test.es positives au SARS-CoV-2.',
            pdf))
    dd.hospitalized = sc.find(
        r'(\d+)\s+patients\s+sont\s+actuellement\s+hospitalis.s', pdf)
    dd.icu = sc.find(r'dont\s+(\d+)\s+en\s+soins\s+intensifs', pdf)
    # NOTE(review): assert is stripped under "python -O"; kept for
    # behavioral compatibility, but an explicit check would be sturdier.
    assert dd
    print(dd)
    print('-' * 10)
#!/usr/bin/env python3 import scrape_common as sc import re # get latest from list with all press releases d = sc.download('https://www.regierung.li/coronavirus', silent=True) pdf_url = sc.find( r'<a.*?href="([^"]+\.pdf)[^"]*"[^>]*?>[^<]+?Situationsbericht[^<]+?<\/a>', d) assert pdf_url, "PDF URL not found" # download latest PDF d = sc.pdfdownload(pdf_url, raw=True, silent=True) # extract case numbers reported for previous days d = d.replace(u'\xa0', u' ') # data from the most recent press release dd = sc.DayData(canton='FL', url=pdf_url) dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', d) dd.cases = sc.find(r'insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle', d) m = re.search(r'Bisher\s+trat(en)?\s+(\S+)\s+(Todesfall|Todesfälle)', d, flags=re.I) if m: dd.deaths = sc.int_or_word(m[2]) if re.search( 'Alle\s+weiteren\s+Erkrankten\s+sind\s+in\s+der\s+Zwischenzeit\s+genesen',
#!/usr/bin/env python3
"""Scrape the daily AG (Aargau) COVID-19 bulletin PDF and print key figures."""
import scrape_common as sc
import re

print('AG')

# get latest from list with all bulletins
d = sc.download('https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp')
url = sc.find(r'<a [^>]*href="([^"]+\.pdf)">.+Bulletin.+</a>', d)
# Fail loudly if the page layout changed; otherwise the concatenation
# below would die with an opaque TypeError on None.
assert url, "PDF URL not found"

# download latest PDF
d = sc.pdfdownload('https://www.ag.ch' + url, raw=True)
sc.timestamp()

print('Date and time:', sc.find(r'Aarau, (.+? Uhr)', d))
print('Confirmed cases:', sc.find(r'zurzeit\s+([0-9]+)\s+bestätigte\s+Fälle', d))
print('Recovered:', sc.find(r'([0-9]+)\s+Personen.*?als\s+geheilt', d))
print('Hospitalized:', sc.find(r'([0-9]+)\s+Person(en)?\s+sind\s+zurzeit\s+hospitalisiert', d))
print('ICU:', sc.find(r'([0-9]+)\s+Person(en)?.*?auf\s+Intensivstationen', d))
print('Vent:', sc.find(r'([0-9]+)\s+Person(en)?\s+künstlich\s+beatmet', d))
print('Deaths:', sc.find(r'([0-9]+)\s+Person(en)?\s+an\s+den\s+Folgen\s+des\s+Coronavirus\s+verstorben', d))
import datetime import re from bs4 import BeautifulSoup import scrape_common as sc # parse weekly data for isolated and quarantined numbers base_url = 'https://www.vs.ch' stat_url = base_url + '/de/web/coronavirus/statistiques' content = sc.download(stat_url, silent=True) soup = BeautifulSoup(content, 'html.parser') res = soup.find( string=re.compile(r'Synthese COVID19 VS Woche\d+')).find_previous('a') weekly_pdf_url = base_url + res.attrs['href'] weekly_pdf_url = weekly_pdf_url.replace(' ', '%20') content = sc.pdfdownload(weekly_pdf_url, silent=True) # add isolated/quarantined to the existing DayData item week_end_date = sc.find(r'vom (\d+)\. bis (\d+\.\d+\.20\d{2})', content, group=2) week_end_date = sc.date_from_text(week_end_date).isoformat() dd = sc.DayData(canton='VS', url=weekly_pdf_url) dd.datetime = week_end_date dd.isolated = sc.find( r'befanden\ssich\s(\d+)\spositive\sF.lle\snoch\simmer\sin\sIsolation', content) dd.quarantined = sc.find(r'Isolation\sund\s(\d+)\sKontakte\sin\sQuarant.ne', content) dd.quarantine_riskareatravel = sc.find(r'\s(\d+)\sReisende\sin\sQuarant.ne',
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape isolation/quarantine figures for TI (Ticino) from the cantonal PDF."""
from bs4 import BeautifulSoup
import re
import scrape_common as sc

# get pdf and xlsx URL from covid19 page of TI
main_url = 'https://www4.ti.ch/dss/dsp/covid19/home/'
d = sc.download(main_url, silent=True)
soup = BeautifulSoup(d, 'html.parser')

pdf_url = soup.find('a', string=re.compile(r'Dati stato.*')).get('href')
pdf_url = f'https://www4.ti.ch/{pdf_url}'
pdf_content = sc.pdfdownload(pdf_url, silent=True, raw=True)

dd = sc.DayData(canton='TI', url=pdf_url)
dd.datetime = sc.find(r'(?:Stato )?(\d+\.\d+\.20\d{2})', pdf_content)
dd.isolated = sc.find(r'(\d+)\sPersone\sin\sisolamento', pdf_content)
dd.quarantined = sc.find(r'(\d+)\sPersone\sin\squarantena', pdf_content)

is_first = True
if dd:
    print(dd)
    is_first = False

# Fix: the pattern was a plain string ("\.xlsx$"), i.e. an invalid escape
# sequence that warns on recent Python versions; use a raw string.
xls_url = soup.find(href=re.compile(r"\.xlsx$")).get('href')
assert xls_url, "URL is empty"
if not xls_url.startswith('http'):
    xls_url = f'https://www4.ti.ch/{xls_url}'
#!/usr/bin/env python3 # -*- coding: utf-8 -*- import re import sys import datetime from bs4 import BeautifulSoup import scrape_common as sc d = sc.download( 'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948', silent=True) soup = BeautifulSoup(d, 'html.parser') pdf_url = soup.find('a', string=re.compile(r'Medienmitteilung vom'))['href'] pdf_content = sc.pdfdownload(pdf_url, layout=True, silent=True) date = sc.find(r'Stand: (\d+\. .* 20\d{2})', pdf_content) res = re.search(r'.*\s+\d+\s+\d+\s+\d+\s+(\d+)\s+(\d+)\s+(\d+)\s+', pdf_content) is_first = True if res is not None: dd = sc.DayData(canton='SZ', url=pdf_url) dd.datetime = date dd.hospitalized = res[1] dd.quarantined = res[2] dd.quarantine_riskareatravel = res[3] print(dd) is_first = False try: xls_url = soup.find(
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape the number of PCR tests from the GE (Geneva) weekly PDF."""
import re
import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import scrape_common as sc
import scrape_ge_common as sgc

is_first = True

# parse tested from PDF
pdf_url = sgc.get_latest_ge_weekly_pdf_url()
pdf = sc.pdfdownload(pdf_url, silent=True)
week_number = sc.find(r'Situation semaine (\d+)', pdf)
if week_number:
    # Fix: the year was hard-coded as '2021', which silently mis-dates the
    # data every other year; derive it from the current date instead.
    # NOTE(review): around New Year the last ISO week of the previous year
    # could still be attributed to the wrong year — confirm whether runs
    # can happen in early January.
    year = datetime.date.today().year
    # %G-W%V-%u = ISO year / ISO week / ISO weekday; day 7 = Sunday,
    # i.e. the end of the reported week.
    week_end_date = datetime.datetime.strptime(
        str(year) + '-W' + week_number + '-7', '%G-W%V-%u').date()
    number_of_tests = sc.find(r"Au total, (\d+\'\d+) tests PCR ont", pdf)
    if number_of_tests is not None:
        # Strip the thousands separator ("5'432" -> "5432").
        number_of_tests = number_of_tests.replace('\'', '')
        dd_test = sc.DayData(canton='GE', url=pdf_url)
        dd_test.datetime = week_end_date.isoformat()
        dd_test.tested = number_of_tests
        print(dd_test)
        is_first = False
#!/usr/bin/env python3 import scrape_common as sc print('NE') d = sc.pdfdownload( 'https://www.ne.ch/autorites/DFS/SCSP/medecin-cantonal/maladies-vaccinations/Documents/Covid-19-Statistiques/COVID19_PublicationInternet.pdf', layout=True) sc.timestamp() # Heavily inspired by code by https://github.com/maekke97 # Magic column fix (don't know if this is stable). d = d.replace('avr\n il', 'avril') d = d.replace('avr\n il', 'avril') # Find the start of the table on page 5. d = d[d.find('1mars2020'):] # d # Example rows. """ 18mars2020 32 146 15 3 18 3 3 1 2 19mars2020 29 175 16 3 19 3 3 1 3 20mars2020 13 188 17 4 21 4 2 6 3 21mars2020 12 200 15 5 20 5 1 6 1 4 22mars2020 16 216 22 6 28 6 1 7 4 23mars2020 31 247 22 5 6 33 5 6 0 11 1 5 24mars2020 18 265 24 2 6 32 2 6 3 11 1 6 25mars2020 15 280 31 3 7 41 3 7 2 12 3 9 26mars2020 19 299 33 2 7 42 2 7 1 10 2 11 1avril2020 18 420 52 6 8 66 6 8 4 18 2 23
def strip_value(value): if value: return re.sub(r'[^0-9]', '', value) return None base_url = 'https://www.vs.ch' url = f'{base_url}/web/coronavirus/statistiques' content = sc.download(url, silent=True) soup = BeautifulSoup(content, 'html.parser') pdf_url = soup.find('a', string=re.compile(r'2020.*Sit Epid.*')).get('href') pdf_url = f'{base_url}{pdf_url}' content = sc.pdfdownload(pdf_url, silent=True, layout=True, page=1) dd = sc.DayData(canton='VS', url=pdf_url) dd.datetime = sc.find(r'(\d{2}/\d{2}/20\d{2})', content) dd.datetime = re.sub(r'/', '.', dd.datetime) dd.cases = strip_value( sc.find(r'.*Cumul cas positifs.*\s+(\d+.\d+)\s+', content)) dd.deaths = strip_value(sc.find(r'.*Cumul d.c.s.*\s+(\d+.\d+)\s+', content)) dd.hospitalized = strip_value( sc.find(r'.*Hospitalisations en cours de cas COVID-19.*\s+(\d+)\s+', content)) dd.icu = strip_value(sc.find(r'.*SI en cours.*\s+(\d+)\s+', content)) dd.vent = strip_value(sc.find(r'.*Intubation en cours.*\s+(\d+)\s+', content)) is_first = True if dd:
# Download list of PDFs with statistics updated daily d = sc.download('https://www.vs.ch/de/web/coronavirus/statistiques', silent=True) # 2020-04-02 (but also earlier) """ ... ... <ul> <li><a href="/documents/6756452/7008787/2020 04 02 Sit Epid - État Stand.pdf" target="_blank">2020 04 02 Sit Epid - État Stand.pdf</a></li> <li><a href="/documents/6756452/7008787/2020 04 01 Sit Epid - État Stand" target="_blank">2020 04 01 Sit Epid - État Stand</a></li> <li> """ # Note, these are PDFs, but not all of them have pdf "extension". url = sc.find(r'<li>\s*<a href="([^"]+)"[^>]*>[^<]*Stand(?:\.pdf)?<', d) assert url, "Can't find latest PDF URL" full_url = 'https://www.vs.ch' + urllib.parse.quote(url) dd.url = full_url d = sc.pdfdownload(full_url, raw=True, silent=True) # 2020-03-29 """ État au – Stand : 29.03.2020 15.00h Nombre de cas positifs COVID-19 - Anzahl positive COVID-19 Fälle Total de cas positifs Total positive Fälle ∆ J-1 Incidence cumulée pour 100'000 habitants Kumulierte Inzidenz pro 100'000 Einwohner 964 +62 278.1 ... Nombre de décès – Anzahl Todesfälle Total ∆ J-1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Collect TestData for VD (Vaud) from every available weekly PDF."""
import datetime
import re

import scrape_common as sc
import scrape_vd_common as svc

pdf_urls = svc.get_all_weekly_pdf_urls()
for pdf_url in pdf_urls:
    pdf = sc.pdfdownload(pdf_url, silent=True, page=1)

    # Normalize the extracted text before matching: drop thousands
    # separators (both apostrophe variants) and the French ordinal
    # suffix ("1er" -> "1").
    cleanups = (
        (r'(\d+)\'(\d+)', r'\1\2'),
        (r'(\d+)’(\d+)', r'\1\2'),
        (r'(\d)er', r'\1'),
    )
    for pattern, repl in cleanups:
        pdf = re.sub(pattern, repl, pdf)

    td = sc.TestData(canton='VD', url=pdf_url)

    # "." wildcards cover accented characters the PDF extraction mangles.
    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    date = sc.find(r'Point .pid.miologique au (\d+\s+\w+\s+20\d{2})', pdf)

    # The reporting period appears in several textual variants; try each.
    res = re.search(
        r'Entre\s+(?P<et>et\s+)?le\s+(?P<start>\d+\s+\w+)\s+et\s+le\s+(?P<end>\d+\s+\w+)(?P<year>\s+\d{4})?,',
        pdf,
        flags=re.I | re.UNICODE)
    res_with_year = re.search(
        r'Entre\s+le\s+(?P<start>\d+\s+\w+\s+\d{4})\s+et\s+le\s+(?P<end>\d+\s+\w+\s+\d{4}),',
        pdf,
        flags=re.I | re.UNICODE)
    res_no_month = re.search(
        r'Entre\s+le\s+(?P<start>\d+)\s+et\s+le\s+(?P<end>\d+\s+\w+),',
        pdf,
        flags=re.I | re.UNICODE)
# Download list of PDFs with statistics updated daily d = sc.download('https://www.vs.ch/de/web/coronavirus/statistiques') # 2020-04-02 (but also earlier) """ ... ... <ul> <li><a href="/documents/6756452/7008787/2020 04 02 Sit Epid - État Stand.pdf" target="_blank">2020 04 02 Sit Epid - État Stand.pdf</a></li> <li><a href="/documents/6756452/7008787/2020 04 01 Sit Epid - État Stand" target="_blank">2020 04 01 Sit Epid - État Stand</a></li> <li> """ # Note, these are PDFs, but not all of them have pdf "extension". url = sc.find(r'<li>\s*<a href="([^"]+)"[^>]*>[^<]*Stand(?:\.pdf)?<', d) assert url, "Can't find latest PDF URL" import urllib.parse full_url = 'https://www.vs.ch' + urllib.parse.quote(url) d = sc.pdfdownload(full_url, raw=True) # 2020-03-29 """ État au – Stand : 29.03.2020 15.00h Nombre de cas positifs COVID-19 - Anzahl positive COVID-19 Fälle Total de cas positifs Total positive Fälle ∆ J-1 Incidence cumulée pour 100'000 habitants Kumulierte Inzidenz pro 100'000 Einwohner 964 +62 278.1 ... Nombre de décès – Anzahl Todesfälle Total ∆ J-1
#!/usr/bin/env python3 import scrape_common as sc print('GE') d = sc.pdfdownload( 'https://www.ge.ch/document/point-coronavirus-maladie-covid-19/telecharger' ) sc.timestamp() #d = sc.filter(r'Dans le canton de Genève|Actuellement.*cas ont|décédées|hospitalisés', d) # + 1 line. # 2020-03-23 """ Cette fiche destinée à la population générale dresse un état des lieux de la situation au 23 mars 2020. Chiffres clés au 22 mars 2020 (OMS, OFSP et DGS pour la Suisse et Genève) Chine 81'498 cas 3'267 décès Europe 151'293 cas 7'426 décès
#!/usr/bin/env python3
"""Scrape case/death figures for FL (Liechtenstein) from the latest PDF."""
import scrape_common as sc
import re

# The press-release overview page links every "Situationsbericht" PDF;
# grab the first (most recent) one.
d = sc.download('https://www.regierung.li/coronavirus', silent=True)
pdf_url = sc.find(
    r'<a.*?href="([^"]+\.pdf)[^"]*"[^>]*?>[^<]+?Situationsbericht[^<]+?<\/a>',
    d)
assert pdf_url, "PDF URL not found"

# Fetch the PDF text itself.
d = sc.pdfdownload(pdf_url, raw=True, silent=True)

# Non-breaking spaces would break the \s-free patterns below.
d = d.replace('\xa0', ' ')

# Figures from the most recent press release.
dd = sc.DayData(canton='FL', url=pdf_url)
dd.cases = sc.find(r'insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle', d)
dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', d)

# Deaths are sometimes written as a word ("drei Todesfälle").
m = re.search(r'Bisher\s+trat(en)?\s+(\S+)\s+(Todesfall|Todesfälle)',
              d, flags=re.I)
if m:
    dd.deaths = sc.int_or_word(m.group(2))

print(dd)