def parse_weekly_pdf():
    sc.add_cert_to_bundle()

    base_url = 'https://www.infosan.vd.ch'
    d = sc.download(base_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    html_url = base_url + soup.find(href=re.compile(r'/publications/covid-19-point-epidemiologique')).get('href')

    d = sc.download(html_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    pdf_url = base_url + soup.find(href=re.compile(r'\.pdf$')).get('href')
    pdf = sc.pdfdownload(pdf_url, silent=True)

    # Sample wording from the 2020-07-29 bulletin that the patterns below are
    # written against ("on 27 July, 83 people were in isolation, 633 in
    # quarantine as close contacts and 901 in quarantine after returning from
    # travel"):
    """
    29.07.2020
    Concernant le traçage des contacts de cas positifs, le 27 juillet,
    83 personnes étaient en isolement, 633 en quarantaine de contacts
    étroits et 901 en quarantaine de retour de voyage.
    """

    dd = sc.DayData(canton='VD', url=pdf_url)
    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    date = sc.find(r'Concernant le traçage des contacts de cas positifs, le (\d+.*),', pdf)
    dd.datetime = date + ' ' + year
    dd.isolated = sc.find(r'(\d+)\s(personnes|cas\spositifs)\sétaient\sen\sisolement', pdf)
    dd.quarantined = text_to_int(sc.find(r'(\d.\d+|\d+)\scontacts\sétroits\sen\squarantaine\.', pdf))
    print(dd)
    print('-' * 10)

    dd = sc.DayData(canton='VD', url=pdf_url)
    date = sc.find(r'quarantaine. Le (\d+ .*),', pdf)
    dd.datetime = date + ' ' + year
    dd.quarantine_riskareatravel = text_to_int(sc.find(r', (\d.\d+|\d+)\spersonnes\sétaient\sen\squarantaines\ssuite\sà\sun\sretour\sde\svoyage.', pdf))
    print(dd)
    print('-' * 10)
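
# `text_to_int` is used by parse_weekly_pdf() above but is not defined in
# this section; it is assumed to be a small local helper. A minimal sketch,
# assuming it only needs to drop the thousands separator seen in the VD PDFs
# (e.g. "1'023" or "1 023") before converting:
def text_to_int(text):
    # remove a single non-digit separator between digit groups, then convert
    return int(re.sub(r'(\d)\D(\d)', r'\1\2', text))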
def parse_xlsx():
    html_url = 'https://www.vd.ch/toutes-les-actualites/hotline-et-informations-sur-le-coronavirus/point-de-situation-statistique-dans-le-canton-de-vaud/'
    d = sc.download(html_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    xls_url = soup.find(href=re.compile(r'\.xlsx$')).get('href')
    assert xls_url, "URL is empty"
    xls = sc.xlsdownload(xls_url, silent=True)
    rows = sc.parse_xls(xls, header_row=2)
    is_first = True
    for row in rows:
        if not isinstance(row['Date'], datetime.datetime):
            continue
        if not is_first:
            print('-' * 10)
        is_first = False
        print('VD')
        sc.timestamp()
        print('Downloading:', html_url)
        print('Date and time:', row['Date'].date().isoformat())
        print('Confirmed cases:', row['Nombre total de cas confirmés positifs'])
        print('Hospitalized:', row['Hospitalisation en cours'])
        print('ICU:', row['Dont soins intensifs'])
        print('Deaths:', row['Décès'])
def scrape_zg():
    data = []
    date = None
    url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-11.csv'
    content = sc.download(url)
    reader = csv.DictReader(StringIO(content), delimiter=',')
    for row in reader:
        source = row['Vermutete Ansteckungsquelle']  # suspected source of infection
        if source != 'NA':
            isd = sc.InfectionSourceData('ZG', url)
            isd.source = source
            isd.count = row['Anzahl']  # count
            data.append(isd)
        if row['Type'] == 'subtitle':
            content = row['Content']
            res = re.search('Datenstand: (.*)$', content)  # "Datenstand" = data as of
            if res:
                date = parse_date(res[1])
    assert date is not None
    for item in data:
        item.date = date.date().isoformat()
        item.time = date.time().isoformat()
        print(item)
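
# `parse_date` is called by scrape_zg() above and scrape_zh() below but is
# not defined in this section. A minimal sketch of the assumed behaviour,
# turning strings like "25.03.2020 13:30" into datetime objects (the exact
# accepted formats are an assumption; `import datetime` as in the other
# scripts here):
def parse_date(text):
    for fmt in ('%d.%m.%Y %H:%M', '%d.%m.%Y'):
        try:
            return datetime.datetime.strptime(text.strip(), fmt)
        except ValueError:
            continue
    raise ValueError(f'unparseable date: {text!r}')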
def get_nw_page():
    url = 'https://www.nw.ch/gesundheitsamtdienste/6044'
    content = sc.download(url, silent=True)
    # normalize non-breaking spaces and drop apostrophe thousands separators
    content = content.replace('\xa0', ' ')
    content = re.sub(r"(\d+)'(\d+)", r'\1\2', content)
    soup = BeautifulSoup(content, 'html.parser')
    return url, soup
def get_ag_xls_url():
    data_url = 'https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp'
    d = sc.download(data_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    xls_url = soup.find('a', href=re.compile(r'\.xlsx$'))['href']
    if not xls_url.startswith('http'):
        xls_url = f'https://www.ag.ch{xls_url}'
    return xls_url
def scrape_bs():
    base_url = 'https://www.coronavirus.bs.ch'
    content = sc.download(base_url)
    content = BeautifulSoup(content, 'html.parser')
    bulletin = content.find(string=re.compile('Coronavirus: .*-Bulletin')).find_parent('a')
    url = base_url + bulletin.get('href')
    parse_weekly_bulletin(url)
def get_gl_pdf_url():
    d = sc.download(
        'https://www.gl.ch/verwaltung/finanzen-und-gesundheit/gesundheit/coronavirus.html/4817',
        silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    # weekly PDF
    pdf_url = soup.find(href=re.compile(r'Sentinella.*\.pdf$')).get('href')
    return pdf_url
def get_all_bl_bulletin_urls():
    news_url = 'https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/medienmitteilungen-1'
    news_content = sc.download(news_url, silent=True)
    soup = BeautifulSoup(news_content, 'html.parser')
    bulletins = soup.find_all('a', href=re.compile(r'.*/coronavirus-wochenbulletin.*'))
    bulletin_urls = []
    for bulletin in bulletins:
        bulletin_urls.append(bulletin.get('href'))
    return bulletin_urls
def get_all_weekly_pdf_urls():
    base_url = 'https://www.infosan.vd.ch'
    d = sc.download(base_url, silent=True)
    urls = re.findall(r"window\.open\('(.*_epidemio\.pdf)'", d)
    result = []
    for url in urls:
        if not url.startswith('http'):
            url = f'{base_url}/{url}'
        result.append(url)
    return result
def get_vs_weekly_pdf_urls():
    base_url = 'https://www.vs.ch'
    url = base_url + '/de/web/coronavirus/statistiques'
    content = sc.download(url, silent=True)
    soup = BeautifulSoup(content, 'html.parser')
    links = soup.find_all(href=re.compile(r'Synthese.*Woche'))
    result = []
    for link in links:
        url = base_url + link['href'].replace(' ', '%20')
        result.append(url)
    return result
def scrape_zh():
    url = 'https://raw.githubusercontent.com/openZH/covid_19_contact_tracing_ZH/master/data/Ansteckungswege_2021.csv'
    content = sc.download(url)
    reader = csv.DictReader(StringIO(content), delimiter=',')
    for row in reader:
        isd = sc.InfectionSourceData('ZH', url)
        isd.date_from = parse_date(row['from'])
        isd.date_to = parse_date(row['until'])
        isd.source = f"{row['context_cat']} ({row['context_bool']})"
        isd.count = row['n_conf']
        print(isd)
def get_weekly_bulletins():
    base_url = 'https://www.vs.ch'
    url = base_url + '/de/web/coronavirus/statistiques'
    content = sc.download(url)
    content = BeautifulSoup(content, 'html.parser')
    items = content.find_all(string=re.compile(r'Synthese.*Woche'))
    result = []
    for item in items:
        link = item.find_previous('a')
        result.append(base_url + link.attrs['href'])
    return result
def get_all_weekly_pdf_urls():
    base_url = 'https://www.infosan.vd.ch'
    url = f'{base_url}/resultat-de-la-recherche/search/covid/?tx_solr[sort]=changed_asc asc'
    d = sc.download(url, silent=True)
    urls = re.findall(r"window\.open\('(.*\.pdf)'", d)
    result = []
    for url in urls:
        if not url.startswith('http'):
            url = f'{base_url}/{url}'
        result.append(url)
    return result
def get_all_weekly_pdf_urls():
    base_url = 'https://corona.so.ch'
    url = f'{base_url}/bevoelkerung/daten/woechentlicher-situationsbericht/'
    d = sc.download(url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    links = soup.find_all(href=re.compile(r'\.pdf$'))
    result = []
    for link in links:
        file_ref = link.get('href')
        url = f'{base_url}{file_ref}'
        if url not in result:
            result.append(url)
    return result
def get_fr_xls():
    d = sc.download(
        'https://www.fr.ch/sante/covid-19/coronavirus-statistiques-evolution-de-la-situation-dans-le-canton',
        silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    xls_url = soup.find(href=re.compile(r'\.xlsx$')).get('href')
    assert xls_url, "URL is empty"
    if not xls_url.startswith('http'):
        xls_url = f'https://www.fr.ch{xls_url}'
    xls = sc.xlsdownload(xls_url, silent=True)
    return xls_url, xls
def get_fr_xls():
    d = sc.download(
        'https://www.fr.ch/de/gesundheit/covid-19/coronavirus-statistik-ueber-die-entwicklung-im-kanton',
        silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    xls_url = soup.find(href=re.compile(r'\.xlsx$')).get('href')
    assert xls_url, "URL is empty"
    if not xls_url.startswith('http'):
        xls_url = f'https://www.fr.ch{xls_url}'
    xls = sc.xlsdownload(xls_url, silent=True)
    return xls_url, xls
def get_ge_weekly_pdf_urls():
    d = sc.download('https://www.ge.ch/document/covid-19-bilan-epidemiologique-hebdomadaire', silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    links = soup.find_all('a', title=re.compile(r'\.pdf$'))
    result = []
    for link in links:
        pdf_url = link.get('href')
        assert pdf_url, "pdf URL is empty"
        if not pdf_url.startswith('http'):
            pdf_url = f'https://www.ge.ch{pdf_url}'
        if pdf_url not in result:
            result.append(pdf_url)
    return result
def get_fr_xls():
    main_url = 'https://www.fr.ch/de/gesundheit/covid-19/coronavirus-statistik-ueber-die-entwicklung-im-kanton'
    d = sc.download(main_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    item = soup.find(
        'span', text=re.compile(r'Statistik .ber die Entwicklungen im Kanton.*'))
    item = item.find_parent('a')
    xls_url = item.get('href')
    assert xls_url, "URL is empty"
    if not xls_url.startswith('http'):
        xls_url = f'https://www.fr.ch{xls_url}'
    xls = sc.xlsdownload(xls_url, silent=True)
    return xls_url, xls, main_url
def parse_weekly_bulletin(url):
    content = sc.download(url)
    content = BeautifulSoup(content, 'html.parser')
    content = content.find(string=re.compile('([Ii]m )?Zeitraum vom ')).find_parent('p').text
    # print(content)

    res = re.match(r'.*([Ii]m )?Zeitraum vom (\d.*20\d{2}|\d.*|\d+\.) bis (\d.*20\d{2})', content, re.DOTALL)
    start_date = None
    if res is not None:
        end_date = parse_bs_date(res[3]).date()
        try:
            start_date = parse_bs_date(res[2]).date()
        except arrow.parser.ParserMatchError:
            try:
                start_date = parse_bs_short_date(f'{res[2]}{end_date.month}.{end_date.year}').date()
            except arrow.parser.ParserMatchError:
                start_date = parse_bs_date(f'{res[2]} {end_date.year}').date()
    assert start_date
    assert end_date

    total_infections = int(
        sc.match(r'.* wurden (\d+) Neuinfektionen', content, mode=re.DOTALL))
    known_infections = int(
        sc.match(r'.* Dabei konnten.* \(oder (\d+) F.lle\)', content, mode=re.DOTALL))
    unknown_infections = total_infections - known_infections

    infection_sources = parse_infection_sources(content, known_infections)
    infection_sources.append((unknown_infections, 'Unbekannt'))
    for infection_source in infection_sources:
        isd = sc.InfectionSourceData('BS', url)
        isd.date_from = start_date.isoformat()
        isd.date_to = end_date.isoformat()
        isd.source = infection_source[1]
        isd.count = str(infection_source[0])
        print(isd)
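
# `parse_bs_date` and `parse_bs_short_date` are not defined in this section.
# Since parse_weekly_bulletin() catches arrow.parser.ParserMatchError, both
# are assumed to be thin wrappers around arrow; the formats below are an
# assumption based on the bulletin wording ("2. November 2020" / "2.11.2020"):
import arrow

def parse_bs_date(text):
    # long form with German month names
    return arrow.get(text, 'D. MMMM YYYY', locale='de')

def parse_bs_short_date(text):
    # short numeric form
    return arrow.get(text, 'D.M.YYYY')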
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import sys
import datetime
from bs4 import BeautifulSoup
import scrape_common as sc

url = 'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948'
d = sc.download(url, silent=True)
soup = BeautifulSoup(d, 'html.parser')
is_first = True

"""
Disabled for now, the PDFs from October 2020 contained hospitalized and quarantined data

pdfs = soup.find_all('a', string=re.compile(r'Medienmitteilung vom'))
for pdf in pdfs:
    pdf_url = pdf['href']
    pdf_content = sc.pdfdownload(pdf_url, layout=True, silent=True)
    date = sc.find(r'Stand:\s(\d+\.\s.*\s20\d{2})', pdf_content)
    res = re.search(r'.*\s+(?P<iso>\d+)\s+\d+\s+\d+\s+(?P<hosp>\d+)\s+(?P<quar>\d+)\s+(?P<qtravel>\d+)\s+', pdf_content)
    if not date or not res:
        continue
    if not is_first:
        print('-' * 10)
    is_first = False
    dd = sc.DayData(canton='SZ', url=pdf_url)
    dd.datetime = date.replace('\n', ' ')
"""
#!/usr/bin/env python3
import scrape_common as sc
import re

# get latest from list with all press releases
d = sc.download('https://www.regierung.li/coronavirus', silent=True)
pdf_url = sc.find(r'<a.*?href="([^"]+\.pdf)[^"]*"[^>]*?>[^<]+?Situationsbericht[^<]+?<\/a>', d)
assert pdf_url, "PDF URL not found"

# download latest PDF
d = sc.pdfdownload(pdf_url, raw=True, silent=True)

# extract case numbers reported for previous days
d = d.replace('\xa0', ' ')

# data from the most recent press release
dd = sc.DayData(canton='FL', url=pdf_url)
dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', d)
dd.cases = sc.find(r'insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle', d)
m = re.search(r'Bisher\s+trat(en)?\s+(\S+)\s+(Todesfall|Todesfälle)', d, flags=re.I)
if m:
    dd.deaths = sc.int_or_word(m[2])
if re.search(r'Alle\s+weiteren\s+Erkrankten\s+sind\s+in\s+der\s+Zwischenzeit\s+genesen',
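
# `sc.int_or_word` lives in scrape_common and is not shown here. The call
# site above implies it accepts either a digit string ("2") or a spelled-out
# German count ("einen", "zwei", ...); a minimal sketch under that assumption:
def int_or_word(text):
    words = {'kein': 0, 'ein': 1, 'einen': 1, 'eine': 1,
             'zwei': 2, 'drei': 3, 'vier': 4, 'fünf': 5}
    text = text.strip().lower()
    return int(text) if text.isdigit() else words[text]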
        dd.quarantined = qua
        dd.icu = ip
        if not is_first:
            print('-' * 10)
        is_first = False
        print(dd)
else:
    print('PDF data is inconsistent!', file=sys.stderr)
    print(f'dates: {len(dates)}, travel quarantined: {len(travel_q)}, '
          f'isolation: {len(isolation)}, quarantined: {len(quarantined)}, '
          f'IPS: {len(ips)}', file=sys.stderr)

# CSV from Google Spreadsheets
main_url = 'https://docs.google.com/spreadsheets/d/1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k/edit#gid=0'
csv_url = 'https://docs.google.com/spreadsheets/d/1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k/export?format=csv&id=1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k&gid=0'
d_csv = sc.download(csv_url, silent=True)
reader = csv.DictReader(StringIO(d_csv), delimiter=',')
for row in reader:
    if row['Datum'] == '':
        continue
    if not is_first:
        print('-' * 10)
    is_first = False
    dd = sc.DayData(canton='GL', url=main_url)
    dd.datetime = row['Datum']
    dd.cases = row['Fallzahlen Total']
    dd.hospitalized = row['Personen in Spitalpflege']
    dd.deaths = row['Todesfälle (kumuliert)']
    print(dd)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import sys
import scrape_common as sc

# The list of articles is also available on https://www.gd.bs.ch/medienseite/medienmitteilungen.html
URL = sc.download("https://www.gd.bs.ch/", silent=True)
URL = sc.filter(r'Tagesbulletin.*Corona.*\d+\s*bestätigte\s*(Fälle|Infektionen)', URL)

# 2020-03-25, List of sub-articles:
"""
<a href="/nm/2020-tagesbulletin-coronavirus-466-bestaetigte-faelle-im-kanton-basel-stadt-gd.html" target="_self">Tagesbulletin Coronavirus: 466 bestätigte Fälle im Kanton Basel-Stadt</a>
<a href="/nm/2020-tagesbulletin-coronavirus-414-bestaetigte-faelle-im-kanton-basel-stadt-gd.html" target="_self">Tagesbulletin Coronavirus: 414 bestätigte Fälle im Kanton Basel-Stadt</a>
<a href="/nm/2020-tagesbulletin-coronavirus-376-bestaetigte-faelle-im-kanton-basel-stadt-gd.html" target="_self">Tagesbulletin Coronavirus: 376 bestätigte Fälle im Kanton Basel-Stadt</a>
"""

url = 'https://www.gd.bs.ch/' + sc.filter(r'href', URL).split('"')[1]
dd = sc.DayData(canton='BS', url=url)

d = sc.download(url, silent=True)
# normalize decomposed umlauts (vowel + combining diaeresis) and non-breaking spaces
d = d.replace('a\u0308', 'ä')
d = d.replace('o\u0308', 'ö')
d = d.replace('\xa0', ' ')

# 2020-03-25
"""
<p>Das Gesundheitsdepartement Basel-Stadt meldet mit Stand Mittwoch, 25. März 2020, 10 Uhr, insgesamt 466 positive Fälle von Personen mit Wohnsitz im Kanton Basel-Stadt sowie drei weitere Todesfälle. </p>
"""
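
# `sc.filter` comes from scrape_common and is not shown in this file. From
# its use above (first narrowing the page to the bulletin links, then picking
# the line carrying the href) it is assumed to return every line of `text`
# matching the pattern, joined by newlines. A minimal sketch under that
# assumption (the name mirrors scrape_common and shadows the builtin):
def filter(pattern, text):
    return '\n'.join(line for line in text.splitlines()
                     if re.search(pattern, line))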
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import datetime
import sys
from bs4 import BeautifulSoup
import scrape_common as sc

d = sc.download('https://www.fr.ch/sante/covid-19/coronavirus-statistiques-evolution-de-la-situation-dans-le-canton', silent=True)
soup = BeautifulSoup(d, 'html.parser')
xls_url = soup.find(href=re.compile(r'\.xlsx$')).get('href')
assert xls_url, "URL is empty"
if not xls_url.startswith('http'):
    xls_url = f'https://www.fr.ch{xls_url}'
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=0, sheet_name='Données sites internet')
is_first = True
for row in rows:
    if not isinstance(row['Date'], datetime.datetime):
        print(f"WARNING: {row['Date']} is not a valid date, skipping.", file=sys.stderr)
        continue
    if not is_first:
        print('-' * 10)
    is_first = False
    print('FR')
    sc.timestamp()
#!/usr/bin/env python3
import scrape_common as sc
import re

print('AG')

# get latest from list with all bulletins
d = sc.download('https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp')
url = sc.find(r'<a [^>]*href="([^"]+\.pdf)">.+Bulletin.+</a>', d)

# download latest PDF
d = sc.pdfdownload('https://www.ag.ch' + url, raw=True)

sc.timestamp()
print('Date and time:', sc.find(r'Aarau, (.+? Uhr)', d))
print('Confirmed cases:', sc.find(r'zurzeit\s+([0-9]+)\s+bestätigte\s+Fälle', d))
print('Recovered:', sc.find(r'([0-9]+)\s+Personen.*?als\s+geheilt', d))
print('Hospitalized:', sc.find(r'([0-9]+)\s+Person(en)?\s+sind\s+zurzeit\s+hospitalisiert', d))
print('ICU:', sc.find(r'([0-9]+)\s+Person(en)?.*?auf\s+Intensivstationen', d))
print('Vent:', sc.find(r'([0-9]+)\s+Person(en)?\s+künstlich\s+beatmet', d))
print('Deaths:', sc.find(r'([0-9]+)\s+Person(en)?\s+an\s+den\s+Folgen\s+des\s+Coronavirus\s+verstorben', d))
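
# `sc.find` is scrape_common's regex helper, used throughout these scripts
# but not defined here. From the call sites it is assumed to return the first
# capture group of the first match, or None when nothing matches (whether the
# real helper also sets re.DOTALL is an open assumption):
def find(pattern, text):
    match = re.search(pattern, text)
    return match[1] if match else None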
#!/usr/bin/env python3
import scrape_common as sc

print('GL')
d = sc.download('https://www.gl.ch/verwaltung/finanzen-und-gesundheit/gesundheit/coronavirus.html/4817')
sc.timestamp()

# normalize non-breaking spaces and decomposed umlauts
d = d.replace('\xa0', ' ')
d = d.replace('a\u0308', 'ä')

d = sc.filter(r'Fallzahlen\s*Kanton\s*Glarus.+Update|Bestätigte\s*Fälle|Wahrscheinliche\s*Fälle|Hospitalisierungen|Verstorbene', d)

# <li><strong><a href="#Fallzahlen">Fallzahlen Kanton Glarus</a> (Update 22.03.2020, 13.30 Uhr)</strong></li>
# ...
# <h2><strong><a id="Fallzahlen" name="Fallzahlen"></a>Coronavirus: Update Kanton Glarus</strong></h2>
# <h2>Bestätigte Fälle: <strong>31</strong> </h2>
# <h2>Wahrscheinliche Fälle: <strong>--</strong></h2>
# <h2>Hospitalisierungen: <strong>3</strong> </h2>

# 2020-03-26
"""
<h2><strong><a id="Fallzahlen" name="Fallzahlen"></a>Coronavirus: Update Kanton Glarus</strong><br />
(Stand: 25.3.2020, 13:30 Uhr)</h2>
<h2>Bestätigte Fälle: <strong>40 </strong>(Vortag: 33) <br />
Hospitalisierungen: <strong>2</strong> (Vortag: 3)</h2>
<p>Die Zahl der bestätigten Fälle umfasst die seit Messbeginn erfassten Personen, die positiv auf COVID-19 getestet wurden. Bereits wieder genesene Personen sind in diesen Zahlen ebenfalls enthalten.</p>
"""

# 2020-04-03
# Note that it misses numbers for hospitalized on this day / time.
"""
<h2><strong><a id="Fallzahlen" name="Fallzahlen"></a>Coronavirus: Update Kanton Glarus</strong><br />
(Stand: 3.4.2020, 13:30 Uhr)</h2>
<h2>Bestätigte Fälle: <strong>59 </strong>(+1) <br />
Personen in Spitalpflege: <strong>5 </strong>(+/-0) <br />
Verstorbene Personen: <strong>2 </strong>(+/-0)</h2>
"""
#!/usr/bin/env python3
import scrape_common as sc
from bs4 import BeautifulSoup
import datetime
import re

print('LU')
d = sc.download('https://gesundheit.lu.ch/themen/Humanmedizin/Infektionskrankheiten/Coronavirus')
sc.timestamp()

# 2020-04-01
"""
<p><strong>Aktuelle Fallzahlen im Kanton Luzern </strong>(Stand: 1. April 2020, 11:00 Uhr)</p>
<table border="0" cellspacing="0" cellpadding="0">
    <tbody>
        <tr>
            <td valign="top" style="width: 151px;">
                <p><strong></strong>Bestätigte Fälle: </p>
            </td>
            <td valign="top" style="width: 47px;">
                <p style="text-align: right;">401</p>
            </td>
        </tr>
        <tr>
            <td valign="top" style="width: 151px;">
                <p>Hospitalisiert:</p>
            </td>
            <td valign="top" style="width: 47px;">
"""
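
# The LU script is truncated above. A minimal sketch, an assumption rather
# than the repository's actual continuation, of how a labelled number can be
# pulled out of a table like the one in the sample HTML:
soup = BeautifulSoup(d, 'html.parser')
label = soup.find(string=re.compile(r'Bestätigte Fälle'))
value = label.find_parent('td').find_next('td').get_text(strip=True)
print('Confirmed cases:', value)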
#!/usr/bin/env python3
import scrape_common as sc
import sys
import re
from bs4 import BeautifulSoup

# get the daily bulletins
base_url = 'https://www.regierung.li'
d = sc.download(f'{base_url}/ministerien/ministerium-fuer-gesellschaft/medienmitteilungen/', silent=True)
soup = BeautifulSoup(d, 'html.parser')
is_first = True
bulletins = soup.find_all('a', text=re.compile(r'.*Situationsbericht.*'))
for bulletin in bulletins:
    url = f"{base_url}{bulletin.get('href')}"
    bulletin_d = sc.download(url, silent=True)
    bulletin_soup = BeautifulSoup(bulletin_d, 'html.parser')
    dd = sc.DayData(canton='FL', url=url)
    title = bulletin_soup.find('h1', text=re.compile(r'.*Situationsbericht.*'))
    dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', title.text)
    content = title.find_next('div').text
    # strip apostrophe thousands separators ("1’234" -> "1234")
    content = re.sub(r'(\d+)’(\d+)', r'\1\2', content)
    dd.cases = sc.find(r"insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle", content)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
import scrape_common as sc
import scrape_bl_common as sbc
from collections import OrderedDict, defaultdict
from datetime import datetime

bulletin_url = sbc.get_latest_bl_bulletin_url()
bulletin_content = sc.download(bulletin_url, silent=True)
soup = BeautifulSoup(bulletin_content, 'html.parser')
content = soup.find('strong', string=re.compile(r'Per heute .*')).string
# strip unwanted characters
content = content.encode("ascii", errors="ignore").decode()

dd = sc.DayData(canton='BL', url=bulletin_url)
dd.datetime = sc.find(r'Per heute \w+, (\d+\. \w+ 20\d{2})', content)
dd.isolated = sc.find(r'Aktuell befinden sich.*(\d+\s?\d+) Personen in Isolation', content)
dd.quarantined = sc.find(r'Aktuell befinden sich.*(\d+\s?\d+) Personen in Quarantäne', content)

is_first = True
if dd:
    print(dd)
    is_first = False

main_url = "https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/covid-19-faelle-kanton-basel-landschaft"
main_site = sc.download(main_url, silent=True)
xls_url = 'https://www.jgk.be.ch/jgk/de/index/gemeinden/gemeinden/gemeindedaten.assetref/dam/documents/JGK/AGR/de/Gemeinden/Gemeindedaten/agr_gemeinden_gemeindedaten_gemeinden_rk_de.xlsx'
xls = sc.xlsdownload(xls_url, silent=True)
xls_data = sc.parse_xls(xls, header_row=1, columns_to_parse=9)
communes = {}
for item in xls_data:
    commune = item['Gemeinde / Commune']
    # kind of expected in this context
    commune = commune.replace(' (BE)', '')
    commune = commune.replace(' BE', '')
    district = item['Verwaltungskreis / Arrondissement administratif']
    communes[commune] = district
    assert district in district_ids, f'District {district} is unknown!'

# start getting and parsing the data
html_url = 'https://www.besondere-lage.sites.be.ch/besondere-lage_sites/de/index/corona/index.html'
d = sc.download(html_url, silent=True)
d = d.replace('\xa0', ' ')
soup = BeautifulSoup(d, 'html.parser')
tbody = soup.find('table', {'summary': 'Laufend aktualisierte Zahlen zu den Corona-Erkrankungen im Kanton Bern'}).find_next('tbody')
for row in tbody.find_all('tr'):
    tds = row.find_all('td')
    date_str = sc.find(r'(\d+\.\d+\.\d+)', tds[0].text)
    date = sc.date_from_text(date_str)
    dds = {}
    # district_ids and inhabitants (defined earlier in this script) are
    # assumed to share the same key order
    for (district, d_id), (district, population) in zip(district_ids.items(), inhabitants.items()):
        dd = sc.DistrictData(district=district, canton='BE')
        dd.url = html_url
        dd.district_id = d_id
        dd.population = population
        dd.date = date.isoformat()