def parse_weekly_pdf():
    """Print the contact-tracing figures from the latest weekly VD PDF.

    Follows the "covid-19-point-epidemiologique" publication link on the
    infosan.vd.ch landing page, takes the first link ending in ``.pdf`` on
    that page and runs the patterns below over what ``sc.pdfdownload``
    returns for it.

    Two ``sc.DayData`` records are printed, each followed by a separator
    line: one with the isolated/quarantined counts and one with the
    travel-quarantine count.

    If the year or one of the two dates needed to time-stamp a record is
    not found, a message is written to stderr and the function returns
    early instead of failing while concatenating the timestamp.
    """
    import sys  # local import: only needed for the stderr diagnostics below

    sc.add_cert_to_bundle()
    base_url = 'https://www.infosan.vd.ch'
    landing_page = sc.download(base_url, silent=True)
    soup = BeautifulSoup(landing_page, 'html.parser')
    html_url = base_url + soup.find(
        href=re.compile("/publications/covid-19-point-epidemiologique")
    ).get('href')
    publication_page = sc.download(html_url, silent=True)
    soup = BeautifulSoup(publication_page, 'html.parser')
    pdf_url = base_url + soup.find(href=re.compile(r"\.pdf$")).get('href')
    pdf = sc.pdfdownload(pdf_url, silent=True)

    # Sample wording, report of 29.07.2020:
    #   Concernant le traçage des contacts de cas positifs, le 27 juillet,
    #   83 personnes étaient en isolement, 633 en quarantaine de contacts
    #   étroits et 901 en quarantaine de retour de voyage.
    # NOTE(review): the patterns below expect a different sentence layout
    # ("... contacts étroits en quarantaine.") than this sample - confirm
    # against a current PDF.

    # -- record 1: isolation and contact quarantine --------------------------
    dd = sc.DayData(canton='VD', url=pdf_url)
    # presumably sc.find yields None when nothing matches - hence the guards
    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    if not year:
        print("report year missing in weekly PDF of VD", file=sys.stderr)
        return
    date = sc.find(
        r'Concernant le traçage des contacts de cas positifs, le (\d+.*),',
        pdf)
    if not date:
        print("isolated/quarantined numbers missing in weekly PDF of VD",
              file=sys.stderr)
        return
    dd.datetime = date + ' ' + year
    dd.isolated = sc.find(
        r'(\d+)\s(personnes|cas\spositifs)\sétaient\sen\sisolement', pdf)
    dd.quarantined = text_to_int(
        sc.find(r'(\d.\d+|\d+)\scontacts\sétroits\sen\squarantaine\.', pdf))
    print(dd)
    print('-' * 10)

    # -- record 2: quarantine after returning from travel --------------------
    dd = sc.DayData(canton='VD', url=pdf_url)
    date = sc.find(r'quarantaine. Le (\d+ .*),', pdf)
    if not date:
        print("travel quarantine numbers missing in weekly PDF of VD",
              file=sys.stderr)
        return
    dd.datetime = date + ' ' + year
    dd.quarantine_riskareatravel = text_to_int(
        sc.find(
            r', (\d.\d+|\d+)\spersonnes\sétaient\sen\squarantaines\ssuite\sà\sun\sretour\sde\svoyage.',
            pdf))
    print(dd)
    print('-' * 10)
def parse_weekly_pdf():
    """Print the contact-tracing figures from the latest weekly VD PDF.

    The PDF location comes from ``svc.get_weekly_pdf_url()``; the patterns
    below are run over what ``sc.pdfdownload`` returns for it.

    Two ``sc.DayData`` records are printed, each followed by a separator
    line: one with the isolated/quarantined counts and one with the
    travel-quarantine count.

    If the year or one of the two dates needed to time-stamp a record is
    not found, a message is written to stderr and the function returns
    early instead of failing while concatenating the timestamp.
    """
    pdf_url = svc.get_weekly_pdf_url()
    pdf = sc.pdfdownload(pdf_url, silent=True)

    # Sample wording, report of 29.07.2020:
    #   Concernant le traçage des contacts de cas positifs, le 27 juillet,
    #   83 personnes étaient en isolement, 633 en quarantaine de contacts
    #   étroits et 901 en quarantaine de retour de voyage.
    # NOTE(review): the patterns below expect a different sentence layout
    # ("... contacts étroits en quarantaine.") than this sample - confirm
    # against a current PDF.

    # -- record 1: isolation and contact quarantine --------------------------
    dd = sc.DayData(canton='VD', url=pdf_url)
    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    if not year:
        # without the year neither record can be time-stamped
        print("report year missing in weekly PDF of VD", file=sys.stderr)
        return
    date = sc.find(
        r'Concernant le traçage des contacts de cas positifs, le (\d+.*),',
        pdf)
    if not date:
        print("isolated/quarantined numbers missing in weekly PDF of VD",
              file=sys.stderr)
        return
    dd.datetime = date + ' ' + year
    dd.isolated = sc.find(
        r'(\d+)\s(personnes|cas\spositifs)\sétaient\sen\sisolement', pdf)
    dd.quarantined = text_to_int(
        sc.find(r'(\d.\d+|\d+)\scontacts\sétroits\sen\squarantaine\.', pdf))
    print(dd)
    print('-' * 10)

    # -- record 2: quarantine after returning from travel --------------------
    dd = sc.DayData(canton='VD', url=pdf_url)
    date = sc.find(r'quarantaine. Le (\d+ .*),', pdf)
    if not date:
        # the first record has already been printed; only this one is skipped
        print("travel quarantine numbers missing in weekly PDF of VD",
              file=sys.stderr)
        return
    dd.datetime = date + ' ' + year
    dd.quarantine_riskareatravel = text_to_int(
        sc.find(
            r', (\d.\d+|\d+)\spersonnes\sétaient\sen\squarantaines?\ssuite\sà\sun\sretour\sde\svoyage.',
            pdf))
    print(dd)
    print('-' * 10)
def parse_table(title_pattern, data_url, column_count, parse_fn):
    """Print one DayData record per data row of the table below a heading.

    The table is located through the module-level ``soup``: the first string
    matching ``title_pattern`` -> its enclosing ``h2`` -> the next ``div`` ->
    the first ``table`` inside it.

    Args:
        title_pattern: regular expression identifying the heading text.
        data_url: URL stored on every ``sc.DayData`` record.
        column_count: number of ``td`` cells every data row must have.
        parse_fn: callable ``(dd, value, header) -> dd`` invoked once per
            cell; its return value becomes the record for the next cell.

    Raises:
        AssertionError: if a data row does not have ``column_count`` cells.
    """
    title_node = soup.find(string=re.compile(title_pattern))
    table = title_node.find_parent('h2').find_next('div').find('table')

    # column titles come from the th cells of the first row
    header_row = table.find('tr')
    headers = [" ".join(th.stripped_strings) for th in header_row.find_all('th')]

    data_rows = table.find_all('tr')[1:]
    for row in data_rows:
        dd = sc.DayData(canton='AG', url=data_url)
        cells = row.find_all(['td'])
        assert len(cells) == column_count, \
            f"Number of columns changed: {len(cells)} != {column_count}"

        for col_num, cell in enumerate(cells):
            header = headers[col_num]
            # drop both apostrophe variants from the cell text
            value = cell.string.replace("’", "").replace("'", "")
            dd = parse_fn(dd, value, header)

        print('-' * 10)
        print(dd)
def parse_weekly_pdf():
    """Print the case and hospitalisation figures from the weekly VD PDF.

    The PDF location comes from ``svc.get_weekly_pdf_url()``; the patterns
    below are run over what ``sc.pdfdownload`` returns for it.  A single
    ``sc.DayData`` record is printed, followed by a separator line.

    Raises:
        AssertionError: if the filled record evaluates as false.
    """
    pdf_url = svc.get_weekly_pdf_url()
    pdf = sc.pdfdownload(pdf_url, silent=True)

    dd = sc.DayData(canton='VD', url=pdf_url)
    # The '.' wildcards sit where the French wording has accented letters -
    # presumably to tolerate differences in how the PDF text is extracted.
    dd.datetime = sc.find(
        r'Point .pid.miologique au (\d+\s+\w+\s+\d{4})', pdf)
    dd.cases = text_to_int(
        sc.find(
            r'\s(\d+.\d+)\s+personnes ont .t. test.es positives au SARS-CoV-2.',
            pdf))
    dd.hospitalized = sc.find(
        r'(\d+)\s+patients\s+sont\s+actuellement\s+hospitalis.s', pdf)
    dd.icu = sc.find(r'dont\s+(\d+)\s+en\s+soins\s+intensifs', pdf)

    # NOTE(review): this check disappears under `python -O`.
    assert dd
    print(dd)
    print('-' * 10)
# Locate the spreadsheet link on the already parsed page; give up with exit
# status 1 when the link is not there (subscripting None raises TypeError).
try:
    xls_link = soup.find(
        'a', string=re.compile(r'Coronaf.lle\s*im\s*Kanton\s*Schwyz'))
    xls_url = xls_link['href']
except TypeError:
    print("Unable to determine xls url", file=sys.stderr)
    sys.exit(1)

xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls)
for row in rows:
    stamp = row['Datum']
    # only rows carrying a real datetime are turned into records
    if not isinstance(stamp, datetime.datetime):
        continue

    # separator between records, never before the first one
    if not is_first:
        print('-' * 10)
    is_first = False

    day = stamp.date().isoformat()

    # TODO: remove when source is fixed
    # handle wrong value on 2020-03-25, see issue #631
    if day == '2020-03-25':
        row['Bestätigte Fälle (kumuliert)'] = ''

    dd = sc.DayData(canton='SZ', url=url)
    dd.datetime = day
    if row['Zeit']:
        # append the time of day when the sheet provides one
        dd.datetime += ' ' + row['Zeit'].time().isoformat()
    dd.cases = row['Bestätigte Fälle (kumuliert)']
    dd.deaths = row['Todesfälle (kumuliert)']
    dd.recovered = row['Genesene (kumuliert)']
    print(dd)
text = re.sub(r'\s\s+', ' ', text) return text.split(' ') # weekly pdf pdf_url = sgc.get_gl_pdf_url() pdf = sc.download_content(pdf_url, silent=True) content = sc.pdftotext(pdf, page=1) pdf_date = sc.find(r'Stand: (\d{2}\.\d{2}.\d{4})', content) pdf_date = sc.date_from_text(pdf_date) number_of_tests = sc.find(r'PCR-Tests\sKanton Glarus\s(\d+\'?\d+)\s', content).replace('\'', '') is_first = True if number_of_tests: dd = sc.DayData(canton='GL', url=pdf_url) dd.datetime = pdf_date dd.tested = number_of_tests is_first = False print(dd) content = sc.pdftotext(pdf, page=2, raw=True) dates = split_whitespace( sc.find(r'\n(\d+\.\d+\s+\d+\.\d+\s+.*)\nMassenquarant.ne', content)) travel_q = split_whitespace( sc.find(r'\nEinreisequarant.ne\s+(\d.*)\n', content)) isolation = split_whitespace(sc.find(r'\nIsolation\s+(\d.*)\n', content)) quarantined = split_whitespace(sc.find(r'\nQuarant.ne\s+(\d.*)\n', content)) ips = split_whitespace(sc.find(r'\nCovid Patienten in IPS\s+(\d.*)\n', content))
<td icms="">0</td> <td icms="">47</td> <td icms="">7</td> <td icms=""> </td> </tr> </tbody> </table> """ soup = BeautifulSoup(d, 'html.parser') data_table = soup.find(string=re.compile( r'Positiv\s+getestete\s+Erkrankungsfälle')).find_parent('table') assert data_table, "Can't find data table" dd = sc.DayData(canton='UR', url=url) dd.datetime = sc.find(r'Stand[A-Za-z ]*[:,]? ([^<)]+ Uhr)<', d) rows = data_table.find_all('tr') assert len(rows) == 2, f"Number of rows changed, {len(rows)} != 2" headers = rows[0].find_all('td') or rows[0].find_all('th') assert len( headers) == 6, f"Number of header columns changed, {len(headers)} != 6" assert headers[0].text.strip() == "Aktive Fälle" assert headers[1].text == "Positiv getestete Erkrankungsfälle" assert headers[2].text == "Hospitalisiert" assert headers[3].text == "Quarantäne" assert headers[4].text == "Verstorben" cells = rows[1].find_all('td')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import scrape_common as sc

# Download the NW spreadsheet and print one DayData record per row,
# with a separator line between consecutive records.
xls_url = 'http://www.nw.ch/coronastatistik'
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=3)

for index, row in enumerate(rows):
    if index > 0:
        print('-' * 10)

    dd = sc.DayData(canton='NW', url=xls_url)
    # column 'A' holds the date of the row
    dd.datetime = row['A'].date().isoformat()
    dd.cases = row['Positiv getestete Personen (kumuliert)']
    dd.hospitalized = row['Hospitalisierte Personen']
    dd.icu = row['Davon auf der Intensivstation']
    dd.deaths = row['Verstorbene Personen']
    print(dd)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import scrape_common as sc

# Download the TI spreadsheet and print one DayData record per row,
# with a separator line between consecutive records.
xls_url = 'https://www4.ti.ch/fileadmin/DSS/DSP/UMC/malattie_infettive/Coronavirus/dati/COVID19_Dati_TI_per_github.xlsx'
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=0)

for index, row in enumerate(rows):
    if index > 0:
        print('-' * 10)

    dd = sc.DayData(canton='TI', url=xls_url)
    dd.datetime = row['date'].date().isoformat()
    if row['time']:
        # ISO-style "T" separator between date and time of day
        dd.datetime += 'T' + row['time'].time().isoformat()
    dd.cases = row['ncumul_conf']
    dd.hospitalized = row['current_hosp']
    dd.icu = row['current_icu']
    dd.vent = row['current_vent']
    dd.recovered = row['ncumul_released']
    dd.deaths = row['ncumul_deceased']
    print(dd)
#!/usr/bin/env python3 import csv from io import StringIO import re from bs4 import BeautifulSoup import scrape_common as sc # hospitalized url_hospitalized = 'https://stada.sg.ch/covid/C19_Faelle_hospitalisiert.html' soup = BeautifulSoup(sc.download(url_hospitalized, silent=True), 'html.parser') dd_hosp = sc.DayData(canton='SG', url=url_hospitalized) hosp_table = soup.find('table') hosp_date = hosp_table.find_next(string=re.compile("Stand")).string dd_hosp.datetime = sc.find(r'Stand:?\s*(.+[0-9]{4})', hosp_date) rows = hosp_table.find_all('tr') headers = rows[0].find_all('td') or rows[0].find_all('th') assert len( headers) == 2, f"Number of header columns changed, {len(headers)} != 2" assert headers[1].text.strip() == "Anzahl" for i in range(1, len(rows)): cells = rows[i].find_all('td') if cells[0].text.strip() == 'Total Covid-19 Patienten': dd_hosp.hospitalized = cells[1].text elif cells[0].text.strip() == '...davon auf Intensivstation ohne Beatmung': dd_hosp.icu = int(cells[1].text) elif cells[0].text.strip() == '...davon auf Intensivstation mit Beatmung':
is_first = True # parse tested from PDF pdf_url = sgc.get_latest_ge_weekly_pdf_url() pdf = sc.pdfdownload(pdf_url, silent=True) week_number = sc.find(r'Situation semaine (\d+)', pdf) if week_number: week_end_date = datetime.datetime.strptime('2021-W' + week_number + '-7', '%G-W%V-%u').date() number_of_tests = sc.find(r'Au total, (\d+\'\d+) tests PCR ont', pdf) if number_of_tests is not None: number_of_tests = number_of_tests.replace('\'', '') dd_test = sc.DayData(canton='GE', url=pdf_url) dd_test.datetime = week_end_date.isoformat() dd_test.tested = number_of_tests print(dd_test) is_first = False # get hospitalized number hosp_url = 'https://www.hug.ch/coronavirus-maladie-covid-19/situation-aux-hug' d = sc.download(hosp_url, silent=True) d = d.replace(' ', ' ') soup = BeautifulSoup(d, 'html.parser') content = soup.find(string=re.compile( "Evolution du nombre de malades.*")).find_previous('p').text dd_hosp = sc.DayData(canton='GE', url=hosp_url) hosp_date = sc.find(r'^Au (\d+\s*(:?\w+)?\s+\w+)\s+à\s+\d+h',
pdf_url = f'{base_url}{pdf_url}' content = sc.pdfdownload(pdf_url, layout=True, silent=True) """ Hospitalisationen im Kanton Anzahl Personen in Isolation davon Kontakte in Quarantäne Anzahl zusätzlicher Personen in Quarantäne nach Rückkehr aus Risikoland Re- Wert*** 6 (6) 120 (71) 280 (189) 388 (280) 1.46 (1.1) """ rows = [] date = sc.find(r'S\s?tand: (\d+\.\d+\.20\d{2})', content) res = re.search( r'Hospitalisationen im Kanton.*\d+ \(\d+\)\s+(\d+) \(\d+\)\s+(\d+) \(\d+\)\s+(\d+) \(\d+\)\s+\d\.\d+ \(\d\.\d+\)', content, re.DOTALL) if res is not None: data = sc.DayData(canton='SO', url=pdf_url) data.datetime = date data.isolated = res[1] data.quarantined = res[2] data.quarantine_riskareatravel = res[3] rows.append(data) url = f"{base_url}/index.php?id=27979" d = sc.download(url, silent=True) d = d.replace(" ", " ") soup = BeautifulSoup(d, 'html.parser') data_table = soup.find( 'h2', text=re.compile("Situation Kanton Solothurn")).find_next("table") if data_table: headers = [cell.string for cell in data_table.find('tr').find_all('th')]
import scrape_common as sc import scrape_ag_common as sac xls_url = sac.get_ag_xls_url() xls = sc.xlsdownload(xls_url, silent=True) is_first = True # quarantine_riskareatravel rows = sc.parse_xls(xls, sheet_name='5. Quarantäne nach Einreise', header_row=2) for row in rows: if not isinstance(row['A'], datetime.datetime): continue dd = sc.DayData(canton='AG', url=xls_url) dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}" dd.quarantine_riskareatravel = row['Gesamtzahl aktuell betreuter Personen'] if dd: if not is_first: print('-' * 10) is_first = False print(dd) # quarantine + isolation rows = sc.parse_xls(xls, sheet_name='2. Contact Tracing', header_row=2) for row in rows: if not isinstance(row['A'], datetime.datetime): continue dd = sc.DayData(canton='AG', url=xls_url)
def fix_lu_date(date): res = re.match(r'(20\d{2})/(\d+)/(\d+)', date) assert res, 'date could not be matched!' date = datetime.date(int(res[1]), int(res[2]) + 1, int(res[3])) return date.isoformat() hosp_url = 'https://www.lustat.ch/analysen/gesundheit/corona-reporting/hospitalisationen' hosp_csv = 'https://www.lustat.ch/files_ftp/daten/covid/cov_hospitalisationen.csv' is_first = True data = sc.download(hosp_csv, silent=True, encoding='utf-8-sig') reader = csv.DictReader(StringIO(data), delimiter=';') for row in reader: dd = sc.DayData(canton='LU', url=hosp_url) dd.datetime = fix_lu_date(row['utcdatum']) dd.hospitalized = row['current_hosp'] dd.vent = row['current_vent'] if dd: if not is_first: print('-' * 10) is_first = False print(dd) cases_url = 'https://www.lustat.ch/analysen/gesundheit/corona-reporting/entwicklungen-seit-maerz-2020' cases_csv = 'https://www.lustat.ch/files_ftp/daten/covid/cov_faelle_g2.csv' data = sc.download(cases_csv, silent=True, encoding='utf-8-sig') reader = csv.DictReader(StringIO(data), delimiter=';') for row in reader:
assert xls_url, "URL is empty" if not xls_url.startswith('http'): xls_url = f'https://www.jura.ch{xls_url}' xls = sc.xlsdownload(xls_url, silent=True) rows = sc.parse_xls(xls, header_row=0) for i, row in enumerate(rows): if not isinstance(row['Date'], datetime.datetime): continue if not is_first: print('-' * 10) is_first = False dd = sc.DayData(canton='JU', url=xls_url) dd.datetime = row['Date'].date().isoformat() dd.cases = row['Cumul des cas confimés'] dd.hospitalized = row.get('Nb cas actuellement hospitalisés') dd.icu = row.get('Nb cas actuellement en SI') if sc.represents_int(row.get('Nombre de nouveaux décès')): dd.deaths = sum(r['Nombre de nouveaux décès'] for r in rows[:i + 1]) print(dd) data_table = soup.find( 'caption', string=re.compile( r'Evolution du nombre de cas.*Jura')).find_parent('table') if data_table: headers = [ " ".join(cell.stripped_strings)
#!/usr/bin/env python3 import scrape_common as sc url = 'https://www.ar.ch/verwaltung/departement-gesundheit-und-soziales/amt-fuer-gesundheit/informationsseite-coronavirus/' d = sc.download(url, silent=True) d = d.replace(' ', ' ') # Contact Tracing with its own timestamp dd_ct = sc.DayData(canton='AR', url=url) t = sc.find(r'Contact\s+tracing\s+\(?.*?Stand\:?\s+([^\)]+)(Uhr)?.*?\)?', d) or \ sc.find(r'Contact\s+tracing\s+\(?.*?Stand\:?\s+([0-9]+\.[0-9]+\.? \/ [0-9]+h).*?\)?', d) dd_ct.datetime = t dd_ct.isolated = sc.find( r'Aktuell\s+COVID-19-Erkrankte\s+in\s+Isolation:\s+<strong>(\d+)</strong>', d) dd_ct.quarantined = sc.find( r'Aktuell\s+im\s+Kanton\s+wohnhafte\s+Kontaktpersonen\s+in\s+Quarantäne:\s+<strong>(\d+)</strong>', d) print(dd_ct) print('-' * 10) # cases dd = sc.DayData(canton='AR', url=url) # d = sc.filter('Aktuelle Informationen: Zahlen', d)
#!/usr/bin/env python3 import re import datetime from bs4 import BeautifulSoup import scrape_common as sc url = 'https://gesundheit.lu.ch/themen/Humanmedizin/Infektionskrankheiten/Coronavirus' d = sc.download(url, silent=True) dd = sc.DayData(canton='LU', url=url) # 2020-04-01 """ <p><strong>Aktuelle Fallzahlen im Kanton Luzern </strong>(Stand: 1. April 2020, 11:00 Uhr)</p> <table border="0" cellspacing="0" cellpadding="0"> <tbody> <tr> <td valign="top" style="width: 151px;"> <p><strong></strong>Bestätigte Fälle: </p> </td> <td valign="top" style="width: 47px;"> <p style="text-align: right;">401</p> </td> </tr> <tr> <td valign="top" style="width: 151px;"> <p>Hospitalisiert:</p> </td> <td valign="top" style="width: 47px;"> <p style="text-align: right;">57</p> </td>
#!/usr/bin/env python3 import csv import re from io import StringIO from bs4 import BeautifulSoup import scrape_common as sc url = "https://www.zh.ch/de/gesundheit/coronavirus.html#-1310230111" # get quarantined and isolated from website dd_iso_q = sc.DayData(canton='ZH', url=url) d = sc.download(url, silent=True) # 2020-07-08 """ <div class="mdl-richtext "> <h2 class="atm-heading" id="-1310230111" tabindex="-1">Gesundheitliche Lage</h2> <p class="atm-paragraph">Personen mit Wohnsitz im Kanton Zürich<br> </p> <h4 class="atm-heading" id="-718243468">23</h4> <p class="atm-paragraph">neue positive Fälle in den letzten 24 Stunden</p> <h4 class="atm-heading" id="-718243501">11</h4> <p class="atm-paragraph">in Spitalbehandlung</p> <h4 class="atm-heading" id="808114848">3</h4> <p class="atm-paragraph">davon mit künstlicher Beatmung</p> <h4 class="atm-heading" id="-790711940">131</h4> <p class="atm-paragraph">Total Verstorbene seit Pandemiebeginn (78 in Alters- und Pflegeheimen, 51 im Spital, 2 Zuhause)</p> <h4 class="atm-heading" id="-790711785">181</h4> <p class="atm-paragraph">in Isolation</p> <h4 class="atm-heading" id="-790704311">914</h4> <p class="atm-paragraph">in Quarantäne </p>
reader = csv.DictReader(StringIO(d_csv), delimiter=',') data = collections.defaultdict(dict) for row in reader: if row['Typ'] == 'NA' or row['Datum'] == 'NA': continue date = sc.date_from_text(row['Datum']) data[date.isoformat()][row['Typ']] = row['Anzahl'] days = list(data.keys()) is_first = True for day in days: if not is_first: print('-' * 10) is_first = False dd = sc.DayData(canton='ZG', url=main_url) dd.datetime = day dd.isolated = data[day]['Isolation'] dd.quarantined = data[day]['Quarantäne aus Contact Tracing'] dd.quarantine_riskareatravel = data[day].get( 'Quarantäne nach Rückkehr aus Risikoland') dd.quarantine_total = data[day].get('Quarantäne Total') print(dd) cases_csv_url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-01-e1.csv' d_csv = sc.download(cases_csv_url, silent=True) """ "Typ","Datum","Anzahl","Stand","Meta","Type","Content" "Fallzahl","22.04.2020","176","2020-04-22 08:00:00",NA,NA,NA "Fallzahl","23.04.2020","178","2020-04-23 08:00:00",NA,NA,NA
#!/usr/bin/env python3 from bs4 import BeautifulSoup import scrape_common as sc # fetch latest data from HTML table url = 'https://www.ag.ch/de/themen_1/coronavirus_2/coronavirus.jsp' d = sc.download(url, silent=True) d = d.replace("’", "") d = d.replace("'", "") dd = sc.DayData(canton='AG', url=url) date = sc.find( r'Stand: (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), (.+? Uhr)', d) dd.datetime = date soup = BeautifulSoup(d, 'html.parser') rows = [] for t in soup.find_all('table'): headers = [ " ".join(cell.stripped_strings) for cell in t.find('tr').find_all('th') ] for row in [r for r in t.find_all('tr') if r.find_all('td')]: cells = row.find_all(['td']) col_num = 0 for cell in cells:
stat_url = base_url + '/de/web/coronavirus/statistiques' content = sc.download(stat_url, silent=True) soup = BeautifulSoup(content, 'html.parser') res = soup.find( string=re.compile(r'Synthese COVID19 VS Woche\d+')).find_previous('a') weekly_pdf_url = base_url + res.attrs['href'] weekly_pdf_url = weekly_pdf_url.replace(' ', '%20') content = sc.pdfdownload(weekly_pdf_url, silent=True) # add isolated/quarantined to the existing DayData item week_end_date = sc.find(r'vom (\d+)\. bis (\d+\.\d+\.20\d{2})', content, group=2) week_end_date = sc.date_from_text(week_end_date).isoformat() dd = sc.DayData(canton='VS', url=weekly_pdf_url) dd.datetime = week_end_date dd.isolated = sc.find( r'befanden\ssich\s(\d+)\spositive\sF.lle\snoch\simmer\sin\sIsolation', content) dd.quarantined = sc.find(r'Isolation\sund\s(\d+)\sKontakte\sin\sQuarant.ne', content) dd.quarantine_riskareatravel = sc.find(r'\s(\d+)\sReisende\sin\sQuarant.ne', content) print(dd) xls_url = 'https://raw.githubusercontent.com/statistikZH/covid19_drop/master/Chiffres%20%20COVID-19%20Valais.xlsx' main_url = 'https://www.vs.ch/de/web/coronavirus' xls = sc.xlsdownload(xls_url, silent=True) rows = sc.parse_xls(xls, header_row=1) for i, row in enumerate(rows):
'table', { 'summary': 'Laufend aktualisierte Zahlen zu den Corona-Erkrankungen im Kanton Bern' }): headers = [ " ".join(cell.stripped_strings) for cell in t.find('tr').find_all('th') ] is_first = True for row in [r for r in t.find_all('tr') if r.find_all('td')]: if not is_first: print('-' * 10) is_first = False dd = sc.DayData(canton='BE', url=html_url) for col_num, cell in enumerate(row.find_all(['td'])): value = " ".join(cell.stripped_strings) if value: value = value.replace("'", "") if value and '*' in value and not '**' in value: # the asteriks (*) indicates a not-current value # ** means "Datenkorrektur" continue if value and '(' in value: value = sc.find(r'(\d+)([\s<>br\w]*\(.*\))?', value) if headers[col_num] == 'Datum': date_string = "".join(list(cell.stripped_strings)[0:-1]) time_string = list(cell.stripped_strings)[-1]
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import re
import scrape_common as sc
import scrape_bl_common as sbc
from collections import OrderedDict, defaultdict
from datetime import datetime

# --- isolation / quarantine figures from the latest bulletin ---------------
bulletin_url = sbc.get_latest_bl_bulletin_url()
bulletin_content = sc.download(bulletin_url, silent=True)
soup = BeautifulSoup(bulletin_content, 'html.parser')
content = soup.find('strong', string=re.compile(r'Per heute .*')).string
# strip unwanted characters
# NOTE: this drops every non-ASCII character, umlauts included, so the
# patterns below must not rely on them ("Quarantäne" arrives as "Quarantne").
content = content.encode("ascii", errors="ignore").decode()

dd = sc.DayData(canton='BL', url=bulletin_url)
# NOTE(review): a month such as "März" reaches this pattern as "Mrz" -
# confirm that whatever consumes dd.datetime copes with that.
dd.datetime = sc.find(r'Per heute \w+, (\d+\. \w+ 20\d{2})', content)
# The lazy '.*?' lets the group capture the whole number; a greedy '.*'
# would swallow its leading digits and leave only the last two.
# '\s?' allows one blank inside the number (e.g. as thousands separator).
dd.isolated = sc.find(
    r'Aktuell befinden sich.*?(\d+\s?\d*) Personen in Isolation', content)
# '.?' stands in for the umlaut removed by the ASCII stripping above.
dd.quarantined = sc.find(
    r'Aktuell befinden sich.*?(\d+\s?\d*) Personen in Quarant.?ne', content)

is_first = True
if dd:
    print(dd)
    is_first = False

main_url = "https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/covid-19-faelle-kanton-basel-landschaft"
main_site = sc.download(main_url, silent=True)

# 2020-04-08, two iframes
rows[key]['hospitalized'] = int( float(c[1] or 0) + float(c[2] or 0) + float(c[3] or 0)) rows[key]['icu'] = int( float(c[2] or 0) + float(c[3] or 0)) rows[key]['vent'] = c[3] # order dict by key to ensure the most recent entry is last ordered_rows = OrderedDict(sorted(rows.items())) is_first = True for row_date, row in ordered_rows.items(): if not is_first: print('-' * 10) is_first = False dd = sc.DayData(canton='BL', url=main_url) dd.datetime = row['date'] dd.cases = sc.safeint(row['cases']) try: dd.hospitalized = sc.safeint(row['hospitalized']) except KeyError: pass try: dd.icu = sc.safeint(row['icu']) except KeyError: pass try: dd.vent = sc.safeint(row['vent']) except KeyError: pass dd.deaths = sc.safeint(row['deaths'])
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import datetime
import scrape_common as sc

# Download the VS spreadsheet and print one DayData record per dated row,
# with a separator line between consecutive records.
xls_url = 'https://raw.githubusercontent.com/statistikZH/covid19_drop/master/Chiffres%20%20COVID-19%20Valais.xlsx'
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=1)

# rows whose 'Date' cell is not a datetime are skipped; the generator keeps
# the filtering lazy, row by row
dated_rows = (r for r in rows if isinstance(r['Date'], datetime.datetime))
for index, row in enumerate(dated_rows):
    if index > 0:
        print('-' * 10)

    dd = sc.DayData(canton='VS', url=xls_url)
    dd.datetime = row['Date'].date().isoformat()
    dd.cases = row['Cumul cas positifs']
    dd.hospitalized = row['Total hospitalisations COVID-19']
    dd.icu = row['Patients COVID-19 aux SI total']
    dd.vent = row['Patients COVID-19 intubés']
    dd.deaths = row['Cumul décès COVID-19']
    print(dd)
#!/usr/bin/env python3 import scrape_common as sc url = "https://gd.zh.ch/internet/gesundheitsdirektion/de/themen/coronavirus.html" dd = sc.DayData(canton='ZH', url=url) d = sc.download(url, silent=True) d = d.replace(' ', ' ') d = d.replace('<strong>', ' ').replace('</strong>', ' ') # d = sc.filter(r"Im Kanton Zürich sind zurzeit|\(Stand|Total ([0-9]+) Todesfälle|Spitalbehandlung|beatmet", d) # <h2>Aktuelle Situation im Kanton Zürich (24.3.2020, 9.30 Uhr)</h2> # # # # <p>Im Kanton Zürich sind zurzeit 1211 Personen positiv auf das Coronavirus getestet worden. Total 5 Todesfälle (78-jährig, 80, 88, 96, 97).</p> # <p>(Stand 24.3.2020, 9.30 Uhr)</p> # 2020-03-26 """ <h2>Aktuelle Situation im Kanton Zürich (26.3.2020, 9.30 Uhr)</h2> <p>Im Kanton Zürich sind zurzeit 1476 Personen positiv auf das Coronavirus getestet worden.</p> <p>152 positiv Getestete befinden sich in Spitalbehandlung, davon werden 32 künstlich beatmet.</p> <p>Total 9 Todesfälle (78-jährig, 78, 80, 80, 85, 88, 90, 96, 97).</p> <p>Die Gesundheitsdirektion beschafft sich eine Maschine, die täglich automatisch bis zu 32'000 FFP2-Schutzmasken herstellen kann. In der zweiten Hälfte des Monats April ist die Maschine betriebsbereit. </p> <p>(Stand 26.3.2020, 9.30 Uhr)</p> """
import scrape_common as sc # The list of articles is also available on https://www.gd.bs.ch/medienseite/medienmitteilungen.html URL = sc.download("https://www.gd.bs.ch/", silent=True) URL = sc.filter( r'Tagesbulletin.*Corona.*\d+\s*bestätigte\s*(Fälle|Infektionen)', URL) # 2020-03-25, List of sub-articles: """ <a href="/nm/2020-tagesbulletin-coronavirus-466-bestaetigte-faelle-im-kanton-basel-stadt-gd.html" target="_self">Tagesbulletin Coronavirus: 466 bestätigte Fälle im Kanton Basel-Stadt</a> <a href="/nm/2020-tagesbulletin-coronavirus-414-bestaetigte-faelle-im-kanton-basel-stadt-gd.html" target="_self">Tagesbulletin Coronavirus: 414 bestätigte Fälle im Kanton Basel-Stadt</a> <a href="/nm/2020-tagesbulletin-coronavirus-376-bestaetigte-faelle-im-kanton-basel-stadt-gd.html" target="_self">Tagesbulletin Coronavirus: 376 bestätigte Fälle im Kanton Basel-Stadt</a> """ url = 'https://www.gd.bs.ch/' + sc.filter(r'href', URL).split('"')[1] dd = sc.DayData(canton='BS', url=url) d = sc.download(url, silent=True) d = d.replace('ä', 'ä') d = d.replace('ö', 'ö') d = d.replace(' ', ' ') # 2020-03-25 """ <p>Das Gesundheitsdepartement Basel-Stadt meldet mit Stand Mittwoch, 25. März 2020, 10 Uhr, insgesamt 466 positive Fälle von Personen mit Wohnsitz im Kanton Basel-Stadt sowie drei weitere Todesfälle. </p> """ # There are some extra (or repeated) information in the previous / next paragraphs: # 2020-03-25 """
from bs4 import BeautifulSoup
import scrape_common as sc

d = sc.download(
    'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948',
    silent=True)
soup = BeautifulSoup(d, 'html.parser')

# --- figures from the press-release PDF ------------------------------------
pdf_url = soup.find('a', string=re.compile(r'Medienmitteilung vom'))['href']
pdf_content = sc.pdfdownload(pdf_url, layout=True, silent=True)
date = sc.find(r'Stand: (\d+\. .* 20\d{2})', pdf_content)
# six whitespace-separated numbers in a row; the last three are captured
res = re.search(r'.*\s+\d+\s+\d+\s+\d+\s+(\d+)\s+(\d+)\s+(\d+)\s+', pdf_content)

is_first = True
if res is not None:
    hospitalized, quarantined, riskarea_travel = res.groups()
    dd = sc.DayData(canton='SZ', url=pdf_url)
    dd.datetime = date
    dd.hospitalized = hospitalized
    dd.quarantined = quarantined
    dd.quarantine_riskareatravel = riskarea_travel
    print(dd)
    is_first = False

# --- spreadsheet with the case history -------------------------------------
# Subscripting the None returned for a missing link raises TypeError.
try:
    xls_link = soup.find(
        'a', string=re.compile(r'Coronaf.lle\s*im\s*Kanton\s*Schwyz'))
    xls_url = xls_link['href']
except TypeError:
    print("Unable to determine xls url", file=sys.stderr)
    sys.exit(1)
xls = sc.xlsdownload(xls_url, silent=True)
# get latest from list with all press releases
d = sc.download('https://www.regierung.li/coronavirus', silent=True)
pdf_url = sc.find(
    r'<a.*?href="([^"]+\.pdf)[^"]*"[^>]*?>[^<]+?Situationsbericht[^<]+?<\/a>',
    d)
assert pdf_url, "PDF URL not found"

# download latest PDF
d = sc.pdfdownload(pdf_url, raw=True, silent=True)
# replace non-breaking spaces with ordinary spaces
d = d.replace(u'\xa0', u' ')

# data from the most recent press release
dd = sc.DayData(canton='FL', url=pdf_url)
dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', d)
dd.cases = sc.find(r'insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle', d)

# the number of deaths may be written as digits or as a word
m = re.search(r'Bisher\s+trat(en)?\s+(\S+)\s+(Todesfall|Todesfälle)', d, flags=re.I)
if m:
    dd.deaths = sc.int_or_word(m[2])

# "all other patients have recovered": recovered = cases - deaths.
# Only computable when both figures were found; int(None) would raise.
# NOTE(review): assumes DayData leaves unset fields as None - confirm.
if re.search(
        r'Alle\s+weiteren\s+Erkrankten\s+sind\s+in\s+der\s+Zwischenzeit\s+genesen',
        d):
    if dd.cases is not None and dd.deaths is not None:
        dd.recovered = int(dd.cases) - int(dd.deaths)

print(dd)
base_url = 'https://www.lustat.ch' url = f'{base_url}/daten?id=28177' d = sc.download(url, silent=True) soup = BeautifulSoup(d, 'html.parser') xls_url = soup.find('a', href=re.compile(r'.*\.xlsx')).get('href') if not xls_url.startswith('http'): xls_url = f'{base_url}{xls_url}' xls = sc.xlsdownload(xls_url, silent=True) rows = sc.parse_xls(xls, header_row=5) total_cases = 0 total_deaths = 0 is_first = True for row in rows: dd = sc.DayData(canton='LU', url=xls_url) dd.datetime = row['Datum'] dd.cases = sc.int_or_word(row.search(r'Neue\s+Fälle')) if dd.cases: total_cases += dd.cases dd.cases = total_cases dd.deaths = sc.int_or_word(row['Verstorbene']) if dd.deaths: total_deaths += dd.deaths dd.deaths = total_deaths dd.hospitalized = sc.int_or_word(row['Total']) dd.vent = sc.int_or_word(row.search(r'davon\s+beatmet')) dd.isolated = sc.int_or_word(row.search(r'in\s+Isolation')) dd.quarantined = sc.int_or_word(row.search(r'in\s+Quarantäne')) dd.quarantine_riskareatravel = sc.int_or_word(row.search(r'Reiserückkehrer\s+in\s+Quarantäne')) if dd: