import csv
import re
from io import StringIO

import arrow
import xlrd
from bs4 import BeautifulSoup

# `sc` is the scrapers' shared helper module (download, pdf_to_text,
# InfectionSourceData, ...).
import scrape_common as sc


def scrape_zg():
    # ZG publishes infection sources as a CSV; a 'subtitle' row carries the
    # "Datenstand" (as-of) timestamp that applies to all records.
    data = []
    date = None
    url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-11.csv'
    content = sc.download(url)
    reader = csv.DictReader(StringIO(content), delimiter=',')
    for row in reader:
        source = row['Vermutete Ansteckungsquelle']
        if source != 'NA':
            isd = sc.InfectionSourceData('ZG', url)
            isd.source = source
            isd.count = row['Anzahl']
            data.append(isd)
        if row['Type'] == 'subtitle':
            content = row['Content']
            res = re.search('Datenstand: (.*)$', content)
            if res:
                date = parse_date(res[1])
    assert date is not None
    for item in data:
        item.date = date.date().isoformat()
        item.time = date.time().isoformat()
        print(item)
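
# `sc.InfectionSourceData` is defined in the shared module, not in this
# excerpt. Judging from the call sites it behaves like a plain record; a
# minimal sketch under that assumption (field names taken from usage above):
class InfectionSourceData:
    def __init__(self, canton, url):
        self.canton = canton
        self.url = url
        self.source = None     # infection source / category label
        self.count = None      # case count, kept as a string
        self.date = None       # single-day records (ZG, AG)
        self.time = None
        self.date_from = None  # reporting-period records (ZH, BS, VS, SZ, SH, GE)
        self.date_to = None

    def __str__(self):
        # print(isd) above presumably emits a delimited record like this.
        return ','.join(str(v or '') for v in vars(self).values())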
def parse_data(url, pdf, content):
    start_date, end_date = parse_dates(content)
    categories = get_categories_from_diagram(pdf)
    for category, count in categories.items():
        isd = sc.InfectionSourceData('GE', url)
        isd.source = category
        isd.count = str(count or '')
        isd.date_from = start_date.isoformat()
        isd.date_to = end_date.isoformat()
        print(isd)
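
# Neither `parse_dates` nor `get_categories_from_diagram` appears in this
# excerpt. A sketch of the date parsing only, assuming the GE bulletin states
# the period as e.g. "du 11 janvier 2021 au 17 janvier 2021" (hypothetical
# format; the real bulletin may abbreviate the first date):
def parse_dates(content):
    res = re.search(r'du (\d+ \w+ 20\d{2}) au (\d+ \w+ 20\d{2})', content)
    assert res, 'reporting period not found'
    start_date = arrow.get(res[1], 'D MMMM YYYY', locale='fr').date()
    end_date = arrow.get(res[2], 'D MMMM YYYY', locale='fr').date()
    return start_date, end_date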
def scrape_zh():
    url = 'https://raw.githubusercontent.com/openZH/covid_19_contact_tracing_ZH/master/data/Ansteckungswege_2021.csv'
    content = sc.download(url)
    reader = csv.DictReader(StringIO(content), delimiter=',')
    for row in reader:
        isd = sc.InfectionSourceData('ZH', url)
        isd.date_from = parse_date(row['from'])
        isd.date_to = parse_date(row['until'])
        isd.source = f"{row['context_cat']} ({row['context_bool']})"
        isd.count = row['n_conf']
        print(isd)
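
# `parse_date` here is the ZH scraper's own helper (distinct from the ZG one,
# which must return an object with .date() and .time()). Since the other
# scrapers store ISO strings in date_from/date_to, a minimal sketch, assuming
# the CSV's `from`/`until` columns hold ISO-like dates:
def parse_date(text):
    return arrow.get(text).date().isoformat()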
def parse_weekly_bulletin(url):
    content = sc.download(url)
    content = BeautifulSoup(content, 'html.parser')
    content = content.find(
        string=re.compile('([Ii]m )?Zeitraum vom ')).find_parent('p').text
    res = re.match(
        r'.*([Ii]m )?Zeitraum vom (\d.*20\d{2}|\d.*|\d+\.) bis (\d.*20\d{2})',
        content, re.DOTALL)
    start_date = None
    end_date = None  # initialized so a failed match trips the assert below
    if res is not None:
        end_date = parse_bs_date(res[3]).date()
        # The start date may be a full date, a short "d.m." form, or lack the
        # year; fall back step by step, borrowing month/year from the end date.
        try:
            start_date = parse_bs_date(res[2]).date()
        except arrow.parser.ParserMatchError:
            try:
                start_date = parse_bs_short_date(
                    f'{res[2]}{end_date.month}.{end_date.year}').date()
            except arrow.parser.ParserMatchError:
                start_date = parse_bs_date(f'{res[2]} {end_date.year}').date()
    assert start_date
    assert end_date
    total_infections = int(
        sc.match(r'.* wurden (\d+) Neuinfektionen', content, mode=re.DOTALL))
    known_infections = int(
        sc.match(r'.* Dabei konnten.* \(oder (\d+) F.lle\)', content,
                 mode=re.DOTALL))
    unknown_infections = total_infections - known_infections
    infection_sources = parse_infection_sources(content, known_infections)
    infection_sources.append((unknown_infections, 'Unbekannt'))
    for infection_source in infection_sources:
        isd = sc.InfectionSourceData('BS', url)
        isd.date_from = start_date.isoformat()
        isd.date_to = end_date.isoformat()
        isd.source = infection_source[1]
        isd.count = str(infection_source[0])
        print(isd)
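
# `parse_infection_sources`, `parse_bs_date`, and `parse_bs_short_date` live
# elsewhere. The first must return (count, source) tuples whose counts sum to
# the known infections; a rough sketch, assuming bulletin wording such as
# "12 Fälle im familiären Umfeld" (hypothetical phrasing):
def parse_infection_sources(content, known_infections):
    sources = [
        (int(res[1]), res[2].strip())
        for res in re.finditer(r'(\d+) F.lle (?:im|in der|am) ([\w\s]+?)[,;.]',
                               content)
    ]
    assert sum(count for count, _ in sources) == known_infections
    return sources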
def parse_vs_data(url, pdf):
    start_date, end_date = parse_vs_dates(pdf)
    content = sc.pdf_to_text(pdf, page=5)
    sources = {}
    # Bullet-style entries: "<count> (<percent>%) <source>;"
    for res in re.finditer(r'\n(\d+) \([\d\.]+%\) ([\w\s\.-]+)(;|\.)\n', content):
        sources[strip_source(res[2])] = int(res[1])
    # Prose-style entries, in German and French ("cas dans"), since VS
    # publishes bilingually:
    for res in re.finditer(
            r'(\d+) (neue|neuer)?\s?(F.lle|F.llen|Fall beim Ausbruch|cas dans),? \(?([\w\s]+)\s?(und|\.|;|\(|\))',
            content):
        sources[strip_source(res[4])] = int(res[1])
    for source, count in sources.items():
        isd = sc.InfectionSourceData('VS', url)
        isd.source = source
        isd.count = str(count or '')
        isd.date_from = start_date.isoformat()
        isd.date_to = end_date.isoformat()
        print(isd)
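
# `strip_source` and `parse_vs_dates` are not shown. The former is assumed to
# normalize the free-text category captured by the regexes above; a minimal
# sketch:
def strip_source(source):
    return source.strip(' .;-\n')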
def parse_sz_data(url, pdf):
    content = sc.pdf_to_text(pdf, page=3, layout=True)
    date_from, date_to = parse_sz_dates(content)
    sources = {}
    # The table of interest sits between the "Prozentualer Anteil" header and
    # the "Total" row; each data line ends in a count and a percentage.
    start_str = 'Prozentualer Anteil'
    start_pos = content.find(start_str) + len(start_str)
    end_pos = content.find('\n Total')
    content = content[start_pos:end_pos]
    for line in content.split('\n'):
        res = re.match(r'^\s(.*)\s\s(\d+)\s+(\d+\.\d{2})$', line)
        if res is not None:
            sources[strip_sz_source(res[1])] = int(res[2])
    for source, count in sources.items():
        isd = sc.InfectionSourceData('SZ', url)
        isd.source = source
        isd.count = str(count)
        isd.date_from = date_from.isoformat()
        isd.date_to = date_to.isoformat()
        print(isd)
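
# `parse_sz_dates` and `strip_sz_source` are not part of this excerpt. A
# sketch of the former, assuming the page header carries a range such as
# "01.03.2021 - 07.03.2021" (hypothetical format):
def parse_sz_dates(content):
    res = re.search(
        r'(\d{2}\.\d{2}\.\d{4})\s*(?:-|bis)\s*(\d{2}\.\d{2}\.\d{4})', content)
    assert res, 'reporting period not found'
    return (arrow.get(res[1], 'DD.MM.YYYY').date(),
            arrow.get(res[2], 'DD.MM.YYYY').date())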
def scrape_ag():
    url = 'https://www.ag.ch/media/kanton_aargau/themen_1/coronavirus_1/daten_excel/Covid-19-Daten_Kanton_Aargau.xlsx'
    content = sc.download_content(url)
    xls = xlrd.open_workbook(file_contents=content)
    xls_datemode = xls.datemode
    sheet = xls.sheet_by_name('3. Ansteckungsorte')
    # Category labels sit in row 1, one per pair of columns; fall back to the
    # Excel column name if the header cell is empty.
    categories = {
        c: str(sheet.cell_value(1, c) or xlrd.formula.colname(c))
        for c in range(1, sheet.ncols, 2)
    }
    for row in range(56, sheet.nrows):
        date = sheet.cell_value(row, 0)
        if date == '':
            return
        date = xlrd.xldate_as_datetime(date, xls_datemode).date()
        for col, cat in categories.items():
            # or should we use the total count?
            count = sheet.cell_value(row, col)
            if count != '':
                count = int(count)
                isd = sc.InfectionSourceData('AG', url)
                isd.date = date.isoformat()
                isd.source = cat
                isd.count = str(count)
                print(isd)
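
# Note: xlrd 2.x dropped .xlsx support, so scrape_ag() needs xlrd<2.0 pinned.
# A hedged alternative for newer environments using openpyxl (sketch only;
# cell addressing and date handling would need to be ported as well):
def open_ag_sheet(content):
    from io import BytesIO

    from openpyxl import load_workbook
    wb = load_workbook(BytesIO(content), read_only=True, data_only=True)
    # openpyxl parses date cells to datetime directly, so no datemode juggling.
    return wb['3. Ansteckungsorte']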
def parse_sh_data(url, pdf):
    found_data = False
    content = sc.pdf_to_text(pdf, page=1)
    year = parse_sh_year(content)
    for page in [11, 12, 13, 14, 15, 16, 17, 18]:
        content = sc.pdf_to_text(pdf, page=page)
        # Only pages titled "Lage Schaffhausen - Ansteckungsorte" carry the
        # source breakdown; re.DOTALL lets the match cross line breaks.
        if re.match(r'.*Lage Schaffhausen . Ansteckungsorte.*', content,
                    re.DOTALL):
            start_date, end_date = parse_sh_dates(content, year)
            categories = get_categories(content)
            count = get_count(content)
            if len(categories) == len(count):
                found_data = True
                for cat, cnt in zip(categories, count):
                    isd = sc.InfectionSourceData('SH', url)
                    isd.source = cat
                    isd.count = cnt
                    isd.date_from = start_date.isoformat()
                    isd.date_to = end_date.isoformat()
                    print(isd)
    assert found_data, f'No infection source data found in {url}!'
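
# `parse_sh_year`, `parse_sh_dates`, `get_categories`, and `get_count` are
# defined elsewhere. A sketch of the first, assuming the title page carries a
# four-digit report year (hypothetical layout):
def parse_sh_year(content):
    return int(sc.match(r'.*\b(20\d{2})\b', content, mode=re.DOTALL))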