Esempio n. 1
0
def scrape_zg():
    """Scrape and print the infection source data published for 'ZG'.

    Downloads the CSV file, creates one ``sc.InfectionSourceData`` entry for
    every row whose 'Vermutete Ansteckungsquelle' column is not 'NA', and
    stamps each entry with the date and time parsed from the
    'Datenstand: ...' text of the subtitle row before printing it.

    Raises:
        AssertionError: if no 'Datenstand: ...' subtitle is found, i.e. the
            date of the data cannot be determined.
    """
    url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-11.csv'
    content = sc.download(url)
    reader = csv.DictReader(StringIO(content), delimiter=',')

    data = []
    date = None
    for row in reader:
        source = row['Vermutete Ansteckungsquelle']
        if source != 'NA':
            isd = sc.InfectionSourceData('ZG', url)
            isd.source = source
            isd.count = row['Anzahl']
            data.append(isd)

        if row['Type'] == 'subtitle':
            # The date of the data is only available as free text in the
            # subtitle row; read it without rebinding ``content``.
            res = re.search('Datenstand: (.*)$', row['Content'])
            if res:
                date = parse_date(res[1])

    # Raise explicitly instead of using ``assert`` so the check is still
    # performed when Python runs with optimizations (-O) enabled.
    if date is None:
        raise AssertionError(f'No date (Datenstand) found in {url}')

    # The date is only known once the whole file has been read, so it is
    # applied to the collected entries afterwards.
    iso_date = date.date().isoformat()
    iso_time = date.time().isoformat()
    for item in data:
        item.date = iso_date
        item.time = iso_time
        print(item)
Esempio n. 2
0
def parse_data(url, pdf, content):
    """Print one 'GE' infection source entry per category of the diagram.

    The reporting period is parsed from ``content`` and the per-category
    counts are read from the diagram in ``pdf``. ``url`` is passed on to
    every created ``sc.InfectionSourceData`` entry.
    """
    period_start, period_end = parse_dates(content)
    counts_by_category = get_categories_from_diagram(pdf)

    for source, amount in counts_by_category.items():
        entry = sc.InfectionSourceData('GE', url)
        entry.source = source
        # Falsy counts (e.g. None or 0) are reported as an empty string.
        entry.count = str(amount) if amount else ''
        entry.date_from = period_start.isoformat()
        entry.date_to = period_end.isoformat()
        print(entry)
Esempio n. 3
0
def scrape_zh():
    """Scrape and print the infection source data published for 'ZH'.

    Every row of the downloaded CSV file becomes one
    ``sc.InfectionSourceData`` entry covering the period given by the 'from'
    and 'until' columns.
    """
    url = 'https://raw.githubusercontent.com/openZH/covid_19_contact_tracing_ZH/master/data/Ansteckungswege_2021.csv'
    csv_text = sc.download(url)

    for row in csv.DictReader(StringIO(csv_text), delimiter=','):
        entry = sc.InfectionSourceData('ZH', url)
        entry.date_from = parse_date(row['from'])
        entry.date_to = parse_date(row['until'])
        # The source combines the context category with its boolean flag.
        entry.source = f"{row['context_cat']} ({row['context_bool']})"
        entry.count = row['n_conf']
        print(entry)
Esempio n. 4
0
def parse_weekly_bulletin(url):
    """Parse a weekly bulletin for 'BS' and print its infection sources.

    The paragraph containing 'Zeitraum vom ...' is extracted from the
    downloaded HTML page. The reporting period, the total number of new
    infections and the number of infections with a known source are read
    from that paragraph. The difference between the two numbers is reported
    as an additional 'Unbekannt' source.

    Raises:
        AssertionError: if the reporting period cannot be determined.
    """
    soup = BeautifulSoup(sc.download(url), 'html.parser')
    period_marker = soup.find(string=re.compile('([Ii]m )?Zeitraum vom '))
    content = period_marker.find_parent('p').text

    period = re.match(
        r'.*([Ii]m )?Zeitraum vom (\d.*20\d{2}|\d.*|\d+\.) bis (\d.*20\d{2})',
        content, re.DOTALL)
    start_date = None
    if period is not None:
        raw_start = period[2]
        end_date = parse_bs_date(period[3]).date()
        try:
            start_date = parse_bs_date(raw_start).date()
        except arrow.parser.ParserMatchError:
            # The start of the period is not a complete date: retry with the
            # month and year of the end date appended, then with the year
            # only.
            try:
                start_date = parse_bs_short_date(
                    f'{raw_start}{end_date.month}.{end_date.year}').date()
            except arrow.parser.ParserMatchError:
                start_date = parse_bs_date(
                    f'{raw_start} {end_date.year}').date()
    assert start_date
    assert end_date

    total_infections = int(
        sc.match(r'.* wurden (\d+) Neuinfektionen', content, mode=re.DOTALL))
    known_infections = int(
        sc.match(r'.* Dabei konnten.* \(oder (\d+) F.lle\)',
                 content,
                 mode=re.DOTALL))

    infection_sources = parse_infection_sources(content, known_infections)
    # Everything that is not covered by the known sources counts as unknown.
    infection_sources.append(
        (total_infections - known_infections, 'Unbekannt'))

    for infection_source in infection_sources:
        # Each item holds the count at index 0 and the source at index 1.
        entry = sc.InfectionSourceData('BS', url)
        entry.date_from = start_date.isoformat()
        entry.date_to = end_date.isoformat()
        entry.source = infection_source[1]
        entry.count = str(infection_source[0])
        print(entry)
Esempio n. 5
0
def parse_vs_data(url, pdf):
    """Extract the infection sources from page 5 of a 'VS' PDF and print them.

    Two patterns are applied to the page text: lines of the form
    '<count> (<percent>%) <source>' and sentences mentioning a number of
    cases together with a source. Matches of the second pattern overwrite
    matches of the first one that yield the same source name.
    """
    start_date, end_date = parse_vs_dates(pdf)
    page_text = sc.pdf_to_text(pdf, page=5)

    counts = {}

    listed = re.finditer(r'\n(\d+) \([\d\.]+%\) ([\w\s\.-]+)(;|\.)\n',
                         page_text)
    for hit in listed:
        amount = int(hit[1])
        counts[strip_source(hit[2])] = amount

    mentioned = re.finditer(
        r'(\d+) (neue|neuer)?\s?(F.lle|F.llen|Fall beim Ausbruch|cas dans),? \(?([\w\s]+)\s?(und|\.|;|\(|\))',
        page_text)
    for hit in mentioned:
        amount = int(hit[1])
        counts[strip_source(hit[4])] = amount

    for name, amount in counts.items():
        entry = sc.InfectionSourceData('VS', url)
        entry.source = name
        # A count of 0 is reported as an empty string.
        entry.count = str(amount) if amount else ''
        entry.date_from = start_date.isoformat()
        entry.date_to = end_date.isoformat()
        print(entry)
Esempio n. 6
0
def parse_sz_data(url, pdf):
    """Extract the infection sources from page 3 of an 'SZ' PDF and print them.

    The table is expected between the 'Prozentualer Anteil' header and the
    ' Total' line. Every table line of the form
    '<source>  <count> <percentage>' becomes one ``sc.InfectionSourceData``
    entry for the period returned by ``parse_sz_dates``.
    """
    content = sc.pdf_to_text(pdf, page=3, layout=True)
    date_from, date_to = parse_sz_dates(content)

    # str.find() returns -1 when a marker is missing. Used unchecked, that
    # would start the slice at an arbitrary offset or cut off the last
    # character of the text, so fall back to the full text range instead.
    start_str = 'Prozentualer Anteil'
    start_pos = content.find(start_str)
    start_pos = 0 if start_pos == -1 else start_pos + len(start_str)
    end_pos = content.find('\n Total')
    if end_pos == -1:
        end_pos = len(content)
    table = content[start_pos:end_pos]

    sources = {}
    for line in table.split('\n'):
        # <source>, at least two whitespace characters, <count>, <percentage>
        res = re.match(r'^\s(.*)\s\s(\d+)\s+(\d+\.\d{2})$', line)
        if res is not None:
            sources[strip_sz_source(res[1])] = int(res[2])

    for source, count in sources.items():
        isd = sc.InfectionSourceData('SZ', url)
        isd.source = source
        isd.count = str(count)
        isd.date_from = date_from.isoformat()
        isd.date_to = date_to.isoformat()
        print(isd)
Esempio n. 7
0
def scrape_ag():
    """Scrape and print the infection source data published for 'AG'.

    Reads the sheet '3. Ansteckungsorte' of the downloaded Excel workbook.
    Starting at row index 56, every non-empty cell of a category column
    becomes one ``sc.InfectionSourceData`` entry for the date found in the
    first column of that row. Processing stops at the first row without a
    date.
    """
    url = 'https://www.ag.ch/media/kanton_aargau/themen_1/coronavirus_1/daten_excel/Covid-19-Daten_Kanton_Aargau.xlsx'
    workbook = xlrd.open_workbook(file_contents=sc.download_content(url))
    datemode = workbook.datemode
    sheet = workbook.sheet_by_name('3. Ansteckungsorte')

    # Categories are taken from row index 1 of every second column; a column
    # without a title falls back to its name from xlrd.formula.colname().
    categories = {}
    for col in range(1, sheet.ncols, 2):
        title = sheet.cell_value(1, col) or xlrd.formula.colname(col)
        categories[col] = str(title)

    for row_idx in range(56, sheet.nrows):
        raw_date = sheet.cell_value(row_idx, 0)
        if raw_date == '':
            # An empty date cell marks the end of the data.
            break
        day = xlrd.xldate_as_datetime(raw_date, datemode).date()

        for col, category in categories.items():
            # or should we use total count?
            value = sheet.cell_value(row_idx, col)
            if value == '':
                continue
            count_text = str(int(value))
            entry = sc.InfectionSourceData('AG', url)
            entry.date = day.isoformat()
            entry.source = category
            entry.count = count_text
            print(entry)
Esempio n. 8
0
def parse_sh_data(url, pdf):
    """Search pages 11-18 of an 'SH' PDF for infection sources and print them.

    The year is parsed from page 1. Every page whose text matches the
    'Lage Schaffhausen . Ansteckungsorte' pattern is evaluated; a page is
    only used if it yields as many counts as categories.

    Raises:
        AssertionError: if none of the pages yields usable data.
    """
    year = parse_sh_year(sc.pdf_to_text(pdf, page=1))

    found_data = False
    for page in range(11, 19):
        content = sc.pdf_to_text(pdf, page=page)
        if not re.match(r'.*Lage Schaffhausen . Ansteckungsorte.*', content):
            continue

        start_date, end_date = parse_sh_dates(content, year)
        categories = get_categories(content)
        counts = get_count(content)
        if len(categories) != len(counts):
            # Categories and counts cannot be paired up; skip this page.
            continue

        found_data = True
        for category, amount in zip(categories, counts):
            entry = sc.InfectionSourceData('SH', url)
            entry.source = category
            entry.count = amount
            entry.date_from = start_date.isoformat()
            entry.date_to = end_date.isoformat()
            print(entry)

    assert found_data, f'No infection source data found in {url}!'