def get_vs_weekly_general_data(pdf):
    """Extract the reporting week and year from a VS weekly bulletin PDF.

    Reads page 1, pulls the epidemiological week number and the week's end
    date, then derives the year from the week's *start* (end minus 7 days)
    so that weeks spanning a year boundary are attributed correctly.

    Returns a (week, year) tuple; week is the matched string (or None).
    """
    text = sc.pdftotext(pdf, page=1)
    week = sc.find(r'Epidemiologische Situation Woche (\d+)', text)
    last_day = sc.find(r'(\d+\.\d+\.\d{4})', text)
    last_day = sc.date_from_text(last_day)
    first_day = last_day - datetime.timedelta(days=7)
    return week, first_day.year
import scrape_gl_common as sgc


def split_whitespace(text):
    # Collapse runs of whitespace into single spaces, then split into tokens.
    # Returns an empty list for None/empty input.
    if not text:
        return []
    text = re.sub(r'\s\s+', ' ', text)
    return text.split(' ')


# weekly pdf
pdf_url = sgc.get_gl_pdf_url()
pdf = sc.download_content(pdf_url, silent=True)
content = sc.pdftotext(pdf, page=1)
pdf_date = sc.find(r'Stand: (\d{2}\.\d{2}.\d{4})', content)
pdf_date = sc.date_from_text(pdf_date)
# Test count may use an apostrophe as thousands separator (e.g. 1'234) -> strip it.
number_of_tests = sc.find(r'PCR-Tests\sKanton Glarus\s(\d+\'?\d+)\s', content).replace('\'', '')
is_first = True
if number_of_tests:
    dd = sc.DayData(canton='GL', url=pdf_url)
    dd.datetime = pdf_date
    dd.tested = number_of_tests
    is_first = False
    print(dd)

# Page 2 holds the quarantine table; raw=True keeps the column layout
# so the whitespace-separated values can be tokenized below.
content = sc.pdftotext(pdf, page=2, raw=True)
dates = split_whitespace(
    sc.find(r'\n(\d+\.\d+\s+\d+\.\d+\s+.*)\nMassenquarant.ne', content))
travel_q = split_whitespace(
    # Tail of a loop over commune records: normalize the commune name by
    # dropping canton suffixes, then map it to its administrative district.
    commune = commune.replace(' (BE)', '')
    commune = commune.replace(' BE', '')
    district = item['Verwaltungskreis / Arrondissement administratif']
    communes[commune] = district
    assert district in district_ids, f'District {district} is unknown!'

# start getting and parsing the data
html_url = 'https://www.besondere-lage.sites.be.ch/besondere-lage_sites/de/index/corona/index.html'
d = sc.download(html_url, silent=True)
# NOTE(review): this replace looks like a no-op (space -> space); the first
# argument was presumably a non-breaking space (U+00A0) lost in transit -- confirm.
d = d.replace(' ', ' ')
soup = BeautifulSoup(d, 'html.parser')
tbody = soup.find('table', {'summary': 'Laufend aktualisierte Zahlen zu den Corona-Erkrankungen im Kanton Bern'}).find_next('tbody')
for row in tbody.find_all('tr'):
    tds = row.find_all('td')
    date_str = sc.find(r'(\d+\.\d+\.\d+)', tds[0].text)
    date = sc.date_from_text(date_str)
    # Pre-create one DistrictData per district with 0 new cases.
    # NOTE(review): `district` is bound twice in the tuple unpacking; this
    # relies on district_ids and inhabitants having identical key order -- verify.
    dds = {}
    for (district, d_id), (district, population) in zip(district_ids.items(), inhabitants.items()):
        dd = sc.DistrictData(district=district, canton='BE')
        dd.url = html_url
        dd.district_id = d_id
        dd.population = population
        dd.date = date.isoformat()
        dd.new_cases = 0
        dds[district] = dd
    content = tds[2].text.strip()
    # fix Munchen-<br />\nbuchsee stuff
    content = re.sub(r'-\n(\w)', r'-\1', content)
    # fix <br /> without - from above, but no number on the next line...
# parse weekly data for isolated and quarantined numbers base_url = 'https://www.vs.ch' stat_url = base_url + '/de/web/coronavirus/statistiques' content = sc.download(stat_url, silent=True) soup = BeautifulSoup(content, 'html.parser') res = soup.find( string=re.compile(r'Synthese COVID19 VS Woche\d+')).find_previous('a') weekly_pdf_url = base_url + res.attrs['href'] weekly_pdf_url = weekly_pdf_url.replace(' ', '%20') content = sc.pdfdownload(weekly_pdf_url, silent=True) # add isolated/quarantined to the existing DayData item week_end_date = sc.find(r'vom (\d+)\. bis (\d+\.\d+\.20\d{2})', content, group=2) week_end_date = sc.date_from_text(week_end_date).isoformat() dd = sc.DayData(canton='VS', url=weekly_pdf_url) dd.datetime = week_end_date dd.isolated = sc.find( r'befanden\ssich\s(\d+)\spositive\sF.lle\snoch\simmer\sin\sIsolation', content) dd.quarantined = sc.find(r'Isolation\sund\s(\d+)\sKontakte\sin\sQuarant.ne', content) dd.quarantine_riskareatravel = sc.find(r'\s(\d+)\sReisende\sin\sQuarant.ne', content) print(dd) xls_url = 'https://raw.githubusercontent.com/statistikZH/covid19_drop/master/Chiffres%20%20COVID-19%20Valais.xlsx' main_url = 'https://www.vs.ch/de/web/coronavirus' xls = sc.xlsdownload(xls_url, silent=True)
"Positiv getestete Personen","10.03.2020","7",NA,NA,NA "Positiv getestete Personen","11.03.2020","6",NA,NA,NA "Positiv getestete Personen","12.03.2020","6",NA,NA,NA "Positiv getestete Personen","13.03.2020","8",NA,NA,NAh "Positiv getestete Personen","14.03.2020","10",NA,NA,NA "Positiv getestete Personen","15.03.2020","11",NA,NA,NA "Positiv getestete Personen","16.03.2020","19",NA,NA,NA "Positiv getestete Personen","17.03.2020","22",NA,NA,NA """ reader = csv.DictReader(StringIO(d_csv), delimiter=',') data = collections.defaultdict(dict) for row in reader: if row['Typ'] == 'NA' or row['Datum'] == 'NA': continue date = sc.date_from_text(row['Datum']) data[date.isoformat()][row['Typ']] = row['Anzahl'] days = list(data.keys()) is_first = True for day in days: if not is_first: print('-' * 10) is_first = False dd = sc.DayData(canton='ZG', url=main_url) dd.datetime = day dd.isolated = data[day]['Isolation'] dd.quarantined = data[day]['Quarantäne aus Contact Tracing'] dd.quarantine_riskareatravel = data[day].get( 'Quarantäne nach Rückkehr aus Risikoland')
<p>Intensivpflege (aktuell):</p>
    </td>
    <td style="text-align: right; vertical-align: top;">4</td>
  </tr>
"""

include_hosp = True
include_cases = True
include_isolated = True
# The page carries three separately dated sections; extract each "Stand" date.
case_date_str = sc.find(
    r'Fallzahlen\s*im\s*Kanton\s*Luzern.*\(Stand:\s*(.+?)\,', d)
hosp_date_str = sc.find(r'Hospitalisierungen.*\(Stand:\s*(.+?)\,', d)
isolated_date_str = sc.find(r'Isolation.*\(Stand:\s*(.+?)\,', d)
case_date = sc.date_from_text(case_date_str)
hosp_date = sc.date_from_text(hosp_date_str)
isolated_date = sc.date_from_text(isolated_date_str)
max_date = max(hosp_date, case_date, isolated_date)
# Skip any section that is older than the newest date; dd.datetime ends up
# set from whichever up-to-date section is checked last.
if max_date > hosp_date:
    include_hosp = False
else:
    dd.datetime = hosp_date_str
if max_date > case_date:
    include_cases = False
else:
    dd.datetime = case_date_str
if max_date > isolated_date:
    include_isolated = False
else:
from bs4 import BeautifulSoup
import re
import scrape_common as sc
import scrape_bl_common as sbc
from datetime import timedelta

# weekly data
bulletin_urls = sbc.get_all_bl_bulletin_urls()
for bulletin_url in bulletin_urls:
    bulletin_content = sc.download(bulletin_url, silent=True)
    soup = BeautifulSoup(bulletin_content, 'html.parser')
    # The bulletin paragraph always starts with "Per heute ...".
    content = soup.find(string=re.compile(r'Per heute .*')).string
    content = sbc.strip_bl_bulletin_numbers(content)
    date = sc.find(r'Per heute \w+, (\d+\. \w+ 20\d{2})', content)
    date = sc.date_from_text(date)
    # previous week -- the bulletin reports last week's test figures.
    date = date - timedelta(days=7)
    td = sc.TestData(canton='BL', url=bulletin_url)
    td.week = date.isocalendar()[1]
    td.year = date.year
    td.total_tests = sc.find(r'In der Vorwoche wurden (\d+) PCR-Tests', content)
    # Rate may use a decimal comma (e.g. "4,2"); normalized to a dot below.
    td.positivity_rate = sc.find(
        r'von diesen waren (\d+\.?,?\d?) Prozent positiv', content)
    # Only publish when both figures were found.
    if td.total_tests and td.positivity_rate:
        td.positivity_rate = td.positivity_rate.replace(',', '.')
        print(td)

# daily data
<td colspan="2">70</td> <td colspan="2">+8</td> </tr><tr class="even" ><td height="20">Verstorbene (kumuliert)</td> <td colspan="2">9</td> <td colspan="2">unverändert</td> </tr></tbody></table> """ include_hosp = True include_cases = True dates = re.findall(r'<h4>Stand ([0-9]+\.\s*[A-Za-z]*\s*[0-9]{4}).*<\/h4>', d) if len(dates) == 1: dd.datetime = dates[0] elif len(dates) >= 2: d1 = sc.date_from_text(dates[0]) d2 = sc.date_from_text(dates[1]) if d1 > d2: include_hosp = False dd.datetime = dates[0] elif d2 > d1: include_cases = False dd.datetime = dates[1] else: dd.datetime = dates[0] else: print("Error: Date not found.", file=sys.stderr) if include_cases: dd.cases = sc.find(r'Laborbest.+?tigte\s*F.+?lle\s*\(kumuliert\)<\/t[hd]>\s*<t[hd][^>]*>([0-9]+)<\/t[hd]>', d.replace("\n", "")) dd.deaths = sc.find(r'>Verstorbene\s*\(kumuliert\)<\/td>\s*<td[^>]*>([0-9]+)[ <]', d.replace("\n", ""))
    flags=re.I | re.UNICODE)
# Fallback patterns for the reporting-period sentence; the wording varies:
# both dates with year, start day only (same month as end), etc.
res_with_year = re.search(
    r'Entre\s+le\s+(?P<start>\d+\s+\w+\s+\d{4})\s+et\s+le\s+(?P<end>\d+\s+\w+\s+\d{4}),',
    pdf, flags=re.I | re.UNICODE)
res_no_month = re.search(
    r'Entre\s+le\s+(?P<start>\d+)\s+et\s+le\s+(?P<end>\d+\s+\w+),',
    pdf, flags=re.I | re.UNICODE)
res_no_month_with_year = re.search(
    r'Entre(?P<et>\s+et)?\s+le\s+(?P<start>\d+)\s+et\s+le\s+(?P<end>\d+\s+\w+\s+\d{4}),',
    pdf, flags=re.I | re.UNICODE)
if res:
    # Dates without a year: append the year extracted elsewhere.
    start_date = sc.date_from_text(f"{res['start']} {year}")
    end_date = sc.date_from_text(f"{res['end']} {year}")
elif res_with_year:
    start_date = sc.date_from_text(res_with_year['start'])
    end_date = sc.date_from_text(res_with_year['end'])
elif res_no_month:
    # Start given as a bare day number: borrow month from the end date.
    end_date = sc.date_from_text(f"{res_no_month['end']} {year}")
    start_date = sc.date_from_text(
        f"{res_no_month['start']}.{end_date.month}.{year}")
elif res_no_month_with_year:
    end_date = sc.date_from_text(res_no_month_with_year['end'])
    start_date = sc.date_from_text(
        f"{res_no_month_with_year['start']}.{end_date.month}.{end_date.year}"
    )
elif date:
    print(date)
# -*- coding: utf-8 -*- import re from bs4 import BeautifulSoup import scrape_common as sc url = 'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948' content = sc.download(url, silent=True) soup = BeautifulSoup(content, 'html.parser') pdf_url = soup.find('a', text=re.compile(r'Coronafälle pro Gemeinde')).get('href') content = sc.pdfdownload(pdf_url, layout=True, silent=True) date = sc.find(r'Stand\W+(\d+\.\d+\.20\d{2})', content) date = sc.date_from_text(date).isoformat() district_data = re.findall(r'^Bezirk\W+(\w+)\s+(≤?\s?\d+)', content, re.MULTILINE) # https://www.bfs.admin.ch/bfs/de/home/statistiken/kataloge-datenbanken/karten.assetdetail.5688189.html district_ids = { 'Einsiedeln': 501, 'Gersau': 502, 'Höfe': 503, 'Küssnacht': 504, 'March': 505, 'Schwyz': 506, } # https://www.sz.ch/kanton/bezirke/schwyz.html/72-210-112-106 population = { 'Einsiedeln': 16027,
#
# Scrape weekly COVID-19 test counts for canton ZG from the statistics
# office's CSV and print one TestData record per reporting week.
#
# Fixes: `collections` was used below without being imported, and
# defaultdict(dict) was the wrong factory (the code then manually
# initialised entries to 0); defaultdict(int) does that directly.
import collections
import csv
import datetime
from io import StringIO
import scrape_common as sc

csv_url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-07-i2-k4-b1.csv'
d_csv = sc.download(csv_url, silent=True)

"""
"Woche","Geschlecht","Anzahl Fälle","Meta","Type","Content"
2020-05-25,"männlich","151",NA,NA,NA
2020-06-01,"männlich","117",NA,NA,NA
"""

reader = csv.DictReader(StringIO(d_csv), delimiter=',')
# Sum the case counts over all genders, keyed by the week's start date.
data = collections.defaultdict(int)
for row in reader:
    if row['Woche'] == 'NA':
        continue
    date = sc.date_from_text(row['Woche'])
    data[date] += int(row['Anzahl Fälle'])

for day in data:
    td = sc.TestData(canton='ZG', url=csv_url)
    td.start_date = day.isoformat()
    # A reporting week runs from its start date through the following 6 days.
    td.end_date = (day + datetime.timedelta(days=6)).isoformat()
    td.total_tests = data[day]
    print(td)
#!/usr/bin/env python3
"""Print daily COVID-19 test figures for canton BE.

Reads the openDataBE mirror CSV and emits one TestData record per row;
each record covers a single day (start_date == end_date).
"""
import csv
from io import StringIO
import scrape_common as sc

url = 'https://covid-kennzahlen.apps.be.ch/#/de/cockpit'
csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/vortag_tests.csv'
csv_text = sc.download(csv_url, silent=True)

for record in csv.DictReader(StringIO(csv_text), delimiter=','):
    day_iso = sc.date_from_text(record['datum']).isoformat()
    td = sc.TestData(canton='BE', url=url)
    td.start_date = day_iso
    td.end_date = day_iso
    td.total_tests = record['durchgefuehrte_tests']
    td.positive_tests = record['positive_tests']
    td.positivity_rate = record['positivitaetsrate']
    print(td)
    # Tail of the inhabitants/district-id tables (BFS district numbers).
    'Rheintal': 1723,
    'Werdenberg': 1724,
    'Sarganserland': 1725,
    'See-Gaster': 1726,
    'Toggenburg': 1727,
    'Wil': 1728,
}

url = 'https://www.sg.ch/ueber-den-kanton-st-gallen/statistik/covid-19/_jcr_content/Par/sgch_downloadlist/DownloadListPar/sgch_download.ocFile/KantonSG_C19-Faelle_download.csv'
d = sc.download(url, silent=True)

# strip the "header" / description lines
d = "\n".join(d.split("\n")[5:])

reader = csv.DictReader(StringIO(d), delimiter=';')
for row in reader:
    # Week is encoded as e.g. "W12" in the Kalenderwoche column.
    week = sc.find(r'W(\d+)', row['Kalenderwoche'])
    date = sc.date_from_text(row['Falldatum'])
    # Emit one DistrictData per district for this date; the per-district
    # columns are named "Wahlkreis <name>" / "Wahlkreis <name> (kumuliert)".
    for key, value in inhabitants.items():
        dd = sc.DistrictData(canton='SG', district=key)
        dd.url = url
        dd.week = week
        dd.year = date.year
        dd.date = date.isoformat()
        dd.district_id = district_ids[key]
        dd.new_cases = row['Wahlkreis ' + key]
        dd.total_cases = row['Wahlkreis ' + key + ' (kumuliert)']
        dd.population = value
        print(dd)
import scrape_common as sc
import scrape_vd_common as svc

# NOTE(review): `re` is used below but not imported in this fragment -- it is
# presumably imported above this view; confirm.
pdf_urls = svc.get_all_weekly_pdf_urls()
for pdf_url in pdf_urls:
    pdf = sc.pdfdownload(pdf_url, silent=True, page=1)
    # Strip thousands separators, both apostrophe and typographic variants
    # (1'234 / 1’234 -> 1234).
    pdf = re.sub(r'(\d+)\'(\d+)', r'\1\2', pdf)
    pdf = re.sub(r'(\d+)’(\d+)', r'\1\2', pdf)

    td = sc.TestData(canton='VD', url=pdf_url)

    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    # Preferred form: "Entre le <d month> et le <d month>," (no year in text).
    res = re.search(r'Entre\s+le\s+(\d+\s+\w+)\s+et\s+le\s+(\d+\s+\w+),', pdf)
    if res:
        start_date = sc.date_from_text(f'{res[1]} {year}')
        end_date = sc.date_from_text(f'{res[2]} {year}')
    else:
        # Fallback: start given as a bare day number -> borrow the end's month.
        res = re.search(r'Entre\s+le\s+(\d+)\s+et\s+le\s+(\d+\s+\w+),', pdf)
        if res:
            end_date = sc.date_from_text(f'{res[2]} {year}')
            start_date = sc.date_from_text(f'{res[1]}.{end_date.month}.{year}')
    # NOTE(review): if neither pattern matched, start_date/end_date may be
    # unbound here (NameError before the assert fires) unless set on an
    # earlier loop iteration -- confirm whether that is acceptable.
    assert start_date and end_date, f'failed to extract start and end dates from {pdf_url}'
    td.start_date = start_date
    td.end_date = end_date

    # Total tests = reported daily average of swabs * number of days in range.
    res = re.search(r'une\s+moyenne\s+de\s+(\d+)\s+frottis\s+SARS-CoV(-)?2', pdf)
    if res:
        days = (end_date - start_date).days
        td.total_tests = days * int(res[1])
<tr>
  <td valign="top">
    <p>Intensivpflege (aktuell):</p>
  </td>
  <td style="text-align: right; vertical-align: top;">4</td>
</tr>
"""

include_hosp = True
include_cases = True
# The page carries two separately dated sections; extract each "Stand" date.
case_date_str = sc.find(
    r'Fallzahlen\s*im\s*Kanton\s*Luzern.*\(Stand:\s*(.+?)\,', d)
hosp_date_str = sc.find(r'Hospitalisierungen.*\(Stand:\s*(.+?)\,', d)
case_date = sc.date_from_text(case_date_str)
hosp_date = sc.date_from_text(hosp_date_str)
# Keep only the section(s) with the most recent date; dd.datetime is set to
# the newer of the two date strings.
if case_date > hosp_date:
    include_hosp = False
    dd.datetime = case_date_str
elif hosp_date > case_date:
    include_cases = False
    dd.datetime = hosp_date_str
else:
    dd.datetime = case_date_str

soup = BeautifulSoup(d, 'html.parser')
# Collect all table rows under the "Informationen des Kantons" list item.
rows = []
for table in soup.find(string=re.compile(
        r'Informationen\s*des\s*Kantons')).find_parent('li').find_all('table'):
    rows += table.find_all('tr')
if td: print(td) # daily tests for t in soup.find( 'caption', string=re.compile( 'Corona-Erkrankungen im Kanton Bern')).find_parents('table'): headers = [ " ".join(cell.stripped_strings) for cell in t.find('tr').find_all('th') ] for row in [r for r in t.find_all('tr') if r.find_all('td')]: td = sc.TestData(canton='BE', url=html_url) for col_num, cell in enumerate(row.find_all(['td'])): value = " ".join(cell.stripped_strings) if value: value = re.sub(r'[^\d\.\ ]', '', value) if sc.find(r'^(Datum)', headers[col_num]) is not None: dateArr = re.search(r'(\d{2}).(\d{2}).(\d{2})', value) value = dateArr.group(0) date = sc.date_from_text(value).isoformat() td.start_date = date td.end_date = date elif sc.find(r'^(Durch-)', headers[col_num]): td.total_tests = int(value) if td: print(td)