Example #1
def parse_weekly_pdf():
    sc.add_cert_to_bundle()
    base_url = 'https://www.infosan.vd.ch'
    d = sc.download(base_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    html_url = base_url + soup.find(href=re.compile("/publications/covid-19-point-epidemiologique")).get('href')
    d = sc.download(html_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    pdf_url = base_url + soup.find(href=re.compile(r"\.pdf$")).get('href')
    pdf = sc.pdfdownload(pdf_url, silent=True)

    """
    29.07.2020
    Concernant le traçage des contacts de cas positifs, le 27 juillet, 83 personnes étaient en isolement, 633 en quarantaine de contacts étroits et 901 en quarantaine de retour de voyage.
    """

    dd = sc.DayData(canton='VD', url=pdf_url)
    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    date = sc.find(r'Concernant le traçage des contacts de cas positifs, le (\d+.*),', pdf)
    dd.datetime = date + ' ' + year
    dd.isolated = sc.find(r'(\d+)\s(personnes|cas\spositifs)\sétaient\sen\sisolement', pdf)
    dd.quarantined = text_to_int(sc.find(r'(\d.\d+|\d+)\scontacts\sétroits\sen\squarantaine\.', pdf))
    print(dd)
    print('-' * 10)

    dd = sc.DayData(canton='VD', url=pdf_url)
    date = sc.find(r'quarantaine. Le (\d+ .*),', pdf)
    dd.datetime = date + ' ' + year
    dd.quarantine_riskareatravel = text_to_int(sc.find(r', (\d.\d+|\d+)\spersonnes\sétaient\sen\squarantaines\ssuite\sà\sun\sretour\sde\svoyage.', pdf))
    print(dd)
    print('-' * 10)
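The text_to_int helper used above is not part of this excerpt; a minimal sketch of a plausible implementation, assuming it only needs to strip thousands separators (spaces, apostrophes, dots) from values such as "1 234" before converting:

import re

def text_to_int(text):
    # hypothetical helper, not the original: drop everything but digits
    # so that "1 234", "1'234" or "1.234" all become 1234
    if text is None:
        return None
    return int(re.sub(r'\D', '', str(text)))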
Example #2
def parse_xlsx():
    html_url = 'https://www.vd.ch/toutes-les-actualites/hotline-et-informations-sur-le-coronavirus/point-de-situation-statistique-dans-le-canton-de-vaud/'
    d = sc.download(html_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    xls_url = soup.find(href=re.compile(r"\.xlsx$")).get('href')
    assert xls_url, "URL is empty"
    xls = sc.xlsdownload(xls_url, silent=True)
    rows = sc.parse_xls(xls, header_row=2)
    is_first = True
    for row in rows:
        if not isinstance(row['Date'], datetime.datetime):
            continue

        if not is_first:
            print('-' * 10)
        is_first = False

        print('VD')
        sc.timestamp()
        print('Downloading:', html_url)
        print('Date and time:', row['Date'].date().isoformat())
        print('Confirmed cases:',
              row['Nombre total de cas confirmés positifs'])
        print('Hospitalized:', row['Hospitalisation en cours'])
        print('ICU:', row['Dont soins intensifs'])
        print('Deaths:', row['Décès'])
Example #3
def scrape_zg():
    data = []
    date = None

    url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-11.csv'
    content = sc.download(url)
    reader = csv.DictReader(StringIO(content), delimiter=',')
    for row in reader:
        source = row['Vermutete Ansteckungsquelle']
        if source != 'NA':
            isd = sc.InfectionSourceData('ZG', url)
            isd.source = source
            isd.count = row['Anzahl']
            data.append(isd)

        if row['Type'] == 'subtitle':
            content = row['Content']
            res = re.search('Datenstand: (.*)$', content)
            if res:
                date = parse_date(res[1])

    assert date is not None
    for item in data:
        item.date = date.date().isoformat()
        item.time = date.time().isoformat()
        print(item)
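parse_date is likewise not shown in this excerpt; a minimal sketch, assuming the 'Datenstand' value is a day-first timestamp and that python-dateutil is an acceptable parser. The caller above only needs .date() and .time() on the result, which a datetime provides.

from dateutil import parser as dateutil_parser

def parse_date(text):
    # hypothetical helper, not the original: parse e.g. '14.03.2021 11:00'
    # dayfirst=True matches the Swiss day.month.year order
    return dateutil_parser.parse(text, dayfirst=True)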
Example #4
def get_nw_page():
    url = 'https://www.nw.ch/gesundheitsamtdienste/6044'
    content = sc.download(url, silent=True)
    content = content.replace('\xa0', ' ')  # normalize non-breaking spaces
    content = re.sub(r'(\d+)\'(\d+)', r'\1\2', content)  # strip apostrophe thousands separators, e.g. 1'000 -> 1000
    soup = BeautifulSoup(content, 'html.parser')
    return url, soup
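A possible use of get_nw_page, illustrating why non-breaking spaces and apostrophe separators are normalised first: once removed, a plain \d+ is enough to pick up figures such as 1'234. The keyword searched for here is only an assumption:

import re

url, soup = get_nw_page()
# illustrative only: grab the first number that precedes the word 'Fälle'
m = re.search(r'(\d+)\s*Fälle', soup.get_text())
if m:
    print('NW (illustrative):', m.group(1))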
Example #5
def get_ag_xls_url():
    data_url = 'https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp'
    d = sc.download(data_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    xls_url = soup.find('a', href=re.compile(r'\.xlsx$'))['href']
    if not xls_url.startswith('http'):
        xls_url = f'https://www.ag.ch{xls_url}'
    return xls_url
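The returned URL is typically fed to sc.xlsdownload and sc.parse_xls as in the other examples in this file (assuming scrape_common is imported as sc); the header_row below is a placeholder, the real value depends on the AG workbook layout:

xls_url = get_ag_xls_url()
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=0)  # header_row=0 is a placeholder
for row in rows:
    print(row)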
Example #6
def scrape_bs():
    base_url = 'https://www.coronavirus.bs.ch'
    content = sc.download(base_url)
    content = BeautifulSoup(content, 'html.parser')

    bulletin = content.find(
        string=re.compile('Coronavirus: .*-Bulletin')).find_parent('a')
    url = base_url + bulletin.get('href')
    parse_weekly_bulletin(url)
Example #7
def get_gl_pdf_url():
    d = sc.download(
        'https://www.gl.ch/verwaltung/finanzen-und-gesundheit/gesundheit/coronavirus.html/4817',
        silent=True)
    soup = BeautifulSoup(d, 'html.parser')

    # weekly pdf
    pdf_url = soup.find(href=re.compile(r'Sentinella.*\.pdf$')).get('href')
    return pdf_url
Example #8
def get_all_bl_bulletin_urls():
    news_url = 'https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/medienmitteilungen-1'
    news_content = sc.download(news_url, silent=True)
    soup = BeautifulSoup(news_content, 'html.parser')

    bulletins = soup.find_all('a', href=re.compile(r'.*/coronavirus-wochenbulletin.*'))
    bulletin_urls = []
    for bulletin in bulletins:
        bulletin_urls.append(bulletin.get('href'))
    return bulletin_urls
Example #9
def get_all_weekly_pdf_urls():
    base_url = 'https://www.infosan.vd.ch'
    d = sc.download(base_url, silent=True)

    urls = re.findall(r"window.open\('(.*_epidemio\.pdf)'", d)
    result = []
    for url in urls:
        if not url.startswith('http'):
            url = f'{base_url}/{url}'
        result.append(url)
    return result
Example #10
def get_vs_weekly_pdf_urls():
    base_url = 'https://www.vs.ch'
    url = base_url + '/de/web/coronavirus/statistiques'
    content = sc.download(url, silent=True)
    soup = BeautifulSoup(content, 'html.parser')
    links = soup.find_all(href=re.compile(r'Synthese.*Woche'))
    result = []
    for link in links:
        url = base_url + link['href'].replace(' ', '%20')
        result.append(url)
    return result
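Replacing spaces by hand only fixes one problem character; an equivalent sketch that percent-encodes the whole path with urllib (assumed to behave the same for these links):

from urllib.parse import quote, urljoin

def build_vs_pdf_url(base_url, href):
    # quote() percent-encodes spaces and other unsafe characters in the path,
    # urljoin() copes with both absolute and relative hrefs
    return urljoin(base_url, quote(href))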
Example #11
def scrape_zh():
    url = 'https://raw.githubusercontent.com/openZH/covid_19_contact_tracing_ZH/master/data/Ansteckungswege_2021.csv'
    content = sc.download(url)
    reader = csv.DictReader(StringIO(content), delimiter=',')
    for row in reader:
        isd = sc.InfectionSourceData('ZH', url)
        isd.date_from = parse_date(row['from'])
        isd.date_to = parse_date(row['until'])
        isd.source = f"{row['context_cat']} ({row['context_bool']})"
        isd.count = row['n_conf']
        print(isd)
Example #12
def get_weekly_bulletins():
    base_url = 'https://www.vs.ch'
    url = base_url + '/de/web/coronavirus/statistiques'
    content = sc.download(url)
    content = BeautifulSoup(content, 'html.parser')

    items = content.find_all(string=re.compile(r'Synthese.*Woche'))
    result = []
    for item in items:
        link = item.find_previous('a')
        result.append(base_url + link.attrs['href'])
    return result
Example #13
def get_all_weekly_pdf_urls():
    base_url = 'https://www.infosan.vd.ch'
    url = f'{base_url}/resultat-de-la-recherche/search/covid/?tx_solr[sort]=changed_asc asc'
    d = sc.download(url, silent=True)

    urls = re.findall(r"window.open\('(.*\.pdf)'", d)
    result = []
    for url in urls:
        if not url.startswith('http'):
            url = f'{base_url}/{url}'
        result.append(url)
    return result
Example #14
def get_all_weekly_pdf_urls():
    base_url = 'https://corona.so.ch'
    url = f'{base_url}/bevoelkerung/daten/woechentlicher-situationsbericht/'
    d = sc.download(url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    links = soup.find_all(href=re.compile(r'\.pdf$'))
    result = []
    for link in links:
        file_ref = link.get('href')
        url = f'{base_url}{file_ref}'
        if url not in result:
            result.append(url)
    return result
Example #15
def get_fr_xls():
    d = sc.download(
        'https://www.fr.ch/sante/covid-19/coronavirus-statistiques-evolution-de-la-situation-dans-le-canton',
        silent=True)

    soup = BeautifulSoup(d, 'html.parser')
    xls_url = soup.find(href=re.compile(r"\.xlsx$")).get('href')
    assert xls_url, "URL is empty"
    if not xls_url.startswith('http'):
        xls_url = f'https://www.fr.ch{xls_url}'

    xls = sc.xlsdownload(xls_url, silent=True)
    return xls_url, xls
Example #16
def get_fr_xls():
    d = sc.download(
        'https://www.fr.ch/de/gesundheit/covid-19/coronavirus-statistik-ueber-die-entwicklung-im-kanton',
        silent=True)

    soup = BeautifulSoup(d, 'html.parser')
    xls_url = soup.find(href=re.compile(r"\.xlsx$")).get('href')
    assert xls_url, "URL is empty"
    if not xls_url.startswith('http'):
        xls_url = f'https://www.fr.ch{xls_url}'

    xls = sc.xlsdownload(xls_url, silent=True)
    return xls_url, xls
Example #17
def get_ge_weekly_pdf_urls():
    d = sc.download('https://www.ge.ch/document/covid-19-bilan-epidemiologique-hebdomadaire', silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    links = soup.find_all('a', title=re.compile(r"\.pdf$"))
    result = []
    for link in links:
        pdf_url = link.get('href')
        assert pdf_url, "pdf URL is empty"
        if not pdf_url.startswith('http'):
            pdf_url = f'https://www.ge.ch{pdf_url}'
        if pdf_url not in result:
            result.append(pdf_url)
    return result
Example #18
def get_fr_xls():
    main_url = 'https://www.fr.ch/de/gesundheit/covid-19/coronavirus-statistik-ueber-die-entwicklung-im-kanton'
    d = sc.download(main_url, silent=True)

    soup = BeautifulSoup(d, 'html.parser')
    item = soup.find(
        'span',
        text=re.compile(r"Statistik .ber die Entwicklungen im Kanton.*"))
    item = item.find_parent('a')
    xls_url = item.get('href')
    assert xls_url, "URL is empty"
    if not xls_url.startswith('http'):
        xls_url = f'https://www.fr.ch{xls_url}'

    xls = sc.xlsdownload(xls_url, silent=True)
    return xls_url, xls, main_url
Example #19
def parse_weekly_bulletin(url):
    content = sc.download(url)
    content = BeautifulSoup(content, 'html.parser')
    content = content.find(
        string=re.compile('([Ii]m )?Zeitraum vom ')).find_parent('p').text
    # print(content)

    res = re.match(
        r'.*([Ii]m )?Zeitraum vom (\d.*20\d{2}|\d.*|\d+\.) bis (\d.*20\d{2})',
        content, re.DOTALL)
    start_date = None
    if res is not None:
        end_date = parse_bs_date(res[3]).date()
        try:
            start_date = parse_bs_date(res[2]).date()
        except arrow.parser.ParserMatchError:
            try:
                start_date = parse_bs_short_date(
                    f'{res[2]}{end_date.month}.{end_date.year}').date()
            except arrow.parser.ParserMatchError:
                start_date = parse_bs_date(f'{res[2]} {end_date.year}').date()
    assert start_date
    assert end_date

    total_infections = int(
        sc.match(r'.* wurden (\d+) Neuinfektionen', content, mode=re.DOTALL))
    known_infections = int(
        sc.match(r'.* Dabei konnten.* \(oder (\d+) F.lle\)',
                 content,
                 mode=re.DOTALL))
    unknown_infections = total_infections - known_infections

    infection_sources = parse_infection_sources(content, known_infections)
    infection_sources.append((unknown_infections, 'Unbekannt'))

    for infection_source in infection_sources:
        isd = sc.InfectionSourceData('BS', url)
        isd.date_from = start_date.isoformat()
        isd.date_to = end_date.isoformat()
        isd.source = infection_source[1]
        isd.count = str(infection_source[0])
        print(isd)
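parse_bs_date and parse_bs_short_date are not shown; a minimal sketch of plausible implementations, assuming arrow with a German locale (the except clauses above catch arrow.parser.ParserMatchError, so arrow is clearly the parser in use):

import arrow

def parse_bs_date(text):
    # hypothetical: full German date such as '7. Dezember 2020'
    return arrow.get(text, 'D. MMMM YYYY', locale='de')

def parse_bs_short_date(text):
    # hypothetical: purely numeric date such as '7.12.2020'
    return arrow.get(text, 'D.M.YYYY')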
Example #20
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import sys
import datetime
from bs4 import BeautifulSoup
import scrape_common as sc

url = 'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948'
d = sc.download(url, silent=True)
soup = BeautifulSoup(d, 'html.parser')

is_first = True
"""
Disabled for now; the PDFs from October 2020 contained hospitalized and quarantined data

pdfs = soup.find_all('a', string=re.compile(r'Medienmitteilung vom'))
for pdf in pdfs:
    pdf_url = pdf['href']
    pdf_content = sc.pdfdownload(pdf_url, layout=True, silent=True)
    date = sc.find(r'Stand:\s(\d+\.\s.*\s20\d{2})', pdf_content)
    res = re.search(r'.*\s+(?P<iso>\d+)\s+\d+\s+\d+\s+(?P<hosp>\d+)\s+(?P<quar>\d+)\s+(?P<qtravel>\d+)\s+', pdf_content)
    if not date or not res:
        continue

    if not is_first:
        print('-' * 10)
    is_first = False
    dd = sc.DayData(canton='SZ', url=pdf_url)
    dd.datetime = date.replace('\n', ' ')
Example #21
#!/usr/bin/env python3

import scrape_common as sc
import re

# get latest from list with all press releases
d = sc.download('https://www.regierung.li/coronavirus', silent=True)

pdf_url = sc.find(
    r'<a.*?href="([^"]+\.pdf)[^"]*"[^>]*?>[^<]+?Situationsbericht[^<]+?<\/a>',
    d)
assert pdf_url, "PDF URL not found"

# download latest PDF
d = sc.pdfdownload(pdf_url, raw=True, silent=True)
# extract case numbers reported for previous days
d = d.replace(u'\xa0', u' ')

# data from the most recent press release
dd = sc.DayData(canton='FL', url=pdf_url)
dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', d)

dd.cases = sc.find(r'insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle', d)
m = re.search(r'Bisher\s+trat(en)?\s+(\S+)\s+(Todesfall|Todesfälle)',
              d,
              flags=re.I)
if m:
    dd.deaths = sc.int_or_word(m[2])

if re.search(
        r'Alle\s+weiteren\s+Erkrankten\s+sind\s+in\s+der\s+Zwischenzeit\s+genesen',
Example #22
        dd.quarantined = qua
        dd.icu = ip
        if not is_first:
            print('-' * 10)
        is_first = False
        print(dd)
else:
    print('PDF data is inconsistent!', file=sys.stderr)
    print(
        f'dates: {len(dates)}, travel quarantined: {len(travel_q)}, isolation: {len(isolation)}, quarantined: {len(quarantined)}, IPS: {len(ips)}',
        file=sys.stderr)

# CSV from Google Spreadsheets
main_url = 'https://docs.google.com/spreadsheets/d/1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k/edit#gid=0'
csv_url = 'https://docs.google.com/spreadsheets/d/1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k/export?format=csv&id=1Q7VoxM6wvbdsC84DLWrzyNymkcxUKqIXHy6BpB2Ez0k&gid=0'
d_csv = sc.download(csv_url, silent=True)

reader = csv.DictReader(StringIO(d_csv), delimiter=',')
for row in reader:
    if row['Datum'] == '':
        continue
    if not is_first:
        print('-' * 10)
    is_first = False
    dd = sc.DayData(canton='GL', url=main_url)
    dd.datetime = row['Datum']
    dd.cases = row['Fallzahlen Total']
    dd.hospitalized = row['Personen in Spitalpflege']
    dd.deaths = row['Todesfälle (kumuliert)']
    print(dd)
Example #23
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import sys
import scrape_common as sc

# The list of articles is also available on https://www.gd.bs.ch/medienseite/medienmitteilungen.html
URL = sc.download("https://www.gd.bs.ch/", silent=True)
URL = sc.filter(
    r'Tagesbulletin.*Corona.*\d+\s*bestätigte\s*(Fälle|Infektionen)', URL)

# 2020-03-25, List of sub-articles:
"""
    <a href="/nm/2020-tagesbulletin-coronavirus-466-bestaetigte-faelle-im-kanton-basel-stadt-gd.html" target="_self">Tagesbulletin Coronavirus: 466 bestätigte Fälle im Kanton Basel-Stadt</a>
    <a href="/nm/2020-tagesbulletin-coronavirus-414-bestaetigte-faelle-im-kanton-basel-stadt-gd.html" target="_self">Tagesbulletin Coronavirus: 414 bestätigte Fälle im Kanton Basel-Stadt</a>
    <a href="/nm/2020-tagesbulletin-coronavirus-376-bestaetigte-faelle-im-kanton-basel-stadt-gd.html" target="_self">Tagesbulletin Coronavirus: 376 bestätigte Fälle im Kanton Basel-Stadt</a>
"""

url = 'https://www.gd.bs.ch/' + sc.filter(r'href', URL).split('"')[1]
dd = sc.DayData(canton='BS', url=url)
d = sc.download(url, silent=True)

d = d.replace('&auml;', 'ä')
d = d.replace('&ouml;', 'ö')
d = d.replace('&nbsp;', ' ')

# 2020-03-25
"""
                        <p>Das Gesundheitsdepartement Basel-Stadt meldet mit Stand Mittwoch, 25. März 2020, 10 Uhr, insgesamt 466 positive Fälle von Personen mit Wohnsitz im Kanton Basel-Stadt sowie drei weitere Todesfälle. </p>
"""
Example #24
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import datetime
import sys
from bs4 import BeautifulSoup
import scrape_common as sc

d = sc.download('https://www.fr.ch/sante/covid-19/coronavirus-statistiques-evolution-de-la-situation-dans-le-canton', silent=True)

soup = BeautifulSoup(d, 'html.parser')
xls_url = soup.find(href=re.compile(r"\.xlsx$")).get('href')
assert xls_url, "URL is empty"
if not xls_url.startswith('http'):
    xls_url = f'https://www.fr.ch{xls_url}'

xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=0, sheet_name='Données sites internet')
is_first = True
for row in rows:
    if not isinstance(row['Date'], datetime.datetime):
        print(f"WARNING: {row['Date']} is not a valid date, skipping.", file=sys.stderr)
        continue

    if not is_first:
        print('-' * 10)
    is_first = False

    print('FR')
    sc.timestamp()
Example #25
#!/usr/bin/env python3

import scrape_common as sc
import re

print('AG')

# get latest from list with all bulletins
d = sc.download('https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp')

url = sc.find(r'<a [^>]*href="([^"]+\.pdf)">.+Bulletin.+</a>', d)

# download latest PDF
d = sc.pdfdownload('https://www.ag.ch' + url, raw=True)

sc.timestamp()

print('Date and time:', sc.find(r'Aarau, (.+? Uhr)', d))

print('Confirmed cases:', sc.find(r'zurzeit\s+([0-9]+)\s+bestätigte\s+Fälle', d))

print('Recovered:', sc.find(r'([0-9]+)\s+Personen.*?als\s+geheilt', d))

print('Hospitalized:', sc.find(r'([0-9]+)\s+Person(en)?\s+sind\s+zurzeit\s+hospitalisiert', d))

print('ICU:', sc.find(r'([0-9]+)\s+Person(en)?.*?auf\s+Intensivstationen', d))

print('Vent:', sc.find(r'([0-9]+)\s+Person(en)?\s+künstlich\s+beatmet', d))

print('Deaths:', sc.find(r'([0-9]+)\s+Person(en)?\s+an\s+den\s+Folgen\s+des\s+Coronavirus\s+verstorben', d))
Example #26
#!/usr/bin/env python3

import scrape_common as sc

print('GL')
d = sc.download('https://www.gl.ch/verwaltung/finanzen-und-gesundheit/gesundheit/coronavirus.html/4817')
sc.timestamp()
d = d.replace('&nbsp;', ' ')
d = d.replace('&auml;', 'ä')

d = sc.filter(r'Fallzahlen\s*Kanton\s*Glarus.+Update|Bestätigte\s*Fälle|Wahrscheinliche\s*Fälle|Hospitalisierungen|Verstorbene', d)

#      <li><strong><a href="#Fallzahlen">Fallzahlen Kanton Glarus</a> (Update 22.03.2020, 13.30 Uhr)</strong></li> 
#...
#      <h2><strong><a id="Fallzahlen" name="Fallzahlen"></a>Coronavirus: Update Kanton Glarus</strong></h2> 
#      <h2>Bestätigte Fälle:&nbsp;<strong>31</strong>&nbsp;</h2> 
#      <h2>Wahrscheinliche Fälle:&nbsp;<strong>--</strong></h2> 
#      <h2>Hospitalisierungen:&nbsp;<strong>3</strong>&nbsp;</h2> 

# 2020-03-26
"""
      <h2><strong><a id="Fallzahlen" name="Fallzahlen"></a>Coronavirus: Update Kanton Glarus</strong><br /> (Stand: 25.3.2020, 13:30 Uhr)</h2> 
      <h2>Bestätigte Fälle: <strong>40&nbsp;</strong>(Vortag: 33)&nbsp;<br /> Hospitalisierungen: <strong>2</strong>&nbsp;(Vortag: 3)</h2> 
      <p>Die Zahl der bestätigten Fälle umfasst die seit Messbeginn erfassten Personen, die positiv auf COVID-19 getestet wurden. Bereits wieder genesene Personen sind in diesen Zahlen ebenfalls enthalten.</p> 
"""

# 2020-04-03
# Note, that it misses numbers for hospitalized on this day / time.
"""
      <h2><strong><a id="Fallzahlen" name="Fallzahlen"></a>Coronavirus: Update Kanton Glarus</strong><br /> (Stand: 3.4.2020, 13:30 Uhr)</h2> 
      <h2>Bestätigte Fälle: <strong>59&nbsp;</strong>(+1)&nbsp;<br /> Personen in Spitalpflege: <strong>5 </strong>(+/-0)&nbsp;<br /> Verstorbene Personen: <strong>2 </strong>(+/-0)</h2> 
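The excerpt ends inside the sample markup; against the 2020-04-03 fragment above, an illustrative extraction could be (patterns are assumptions, not the original ones):

# illustrative patterns for the 2020-04-03 markup shown above
print('Date and time:', sc.find(r'Stand: (\d+\.\d+\.20\d{2}, \d+:\d+ Uhr)', d))
print('Confirmed cases:', sc.find(r'Bestätigte Fälle: <strong>(\d+)', d))
print('Hospitalized:', sc.find(r'Personen in Spitalpflege: <strong>(\d+)', d))
print('Deaths:', sc.find(r'Verstorbene Personen: <strong>(\d+)', d))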
Example #27
#!/usr/bin/env python3

import scrape_common as sc
from bs4 import BeautifulSoup
import datetime
import re

print('LU')
d = sc.download(
    'https://gesundheit.lu.ch/themen/Humanmedizin/Infektionskrankheiten/Coronavirus'
)
sc.timestamp()

# 2020-04-01
"""
<p><strong>Aktuelle Fallzahlen im Kanton Luzern&nbsp;</strong>(Stand: 1. April 2020, 11:00 Uhr)</p>
<table border="0" cellspacing="0" cellpadding="0">
    <tbody>
        <tr>
            <td valign="top" style="width: 151px;">
            <p><strong></strong>Bestätigte Fälle: </p>
            </td>
            <td valign="top" style="width: 47px;">
            <p style="text-align: right;">401</p>
            </td>
        </tr>
        <tr>
            <td valign="top" style="width: 151px;">
            <p>Hospitalisiert:</p>
            </td>
            <td valign="top" style="width: 47px;">
Example #28
#!/usr/bin/env python3

import scrape_common as sc
import sys
import re
from bs4 import BeautifulSoup

# get the daily bulletins
base_url = 'https://www.regierung.li'
d = sc.download(
    f'{base_url}/ministerien/ministerium-fuer-gesellschaft/medienmitteilungen/',
    silent=True)
soup = BeautifulSoup(d, 'html.parser')

is_first = True
bulletins = soup.find_all('a', text=re.compile(r'.*Situationsbericht.*'))
for bulletin in bulletins:
    url = f"{base_url}{bulletin.get('href')}"
    bulletin_d = sc.download(url, silent=True)
    bulletin_soup = BeautifulSoup(bulletin_d, 'html.parser')

    dd = sc.DayData(canton='FL', url=url)

    title = bulletin_soup.find('h1', text=re.compile(r'.*Situationsbericht.*'))
    dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', title.text)

    content = title.find_next('div').text
    content = re.sub(r'(\d+)’(\d+)', r'\1\2', content)

    dd.cases = sc.find(r"insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle",
                       content)
Example #29
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import re
import scrape_common as sc
import scrape_bl_common as sbc
from collections import OrderedDict, defaultdict
from datetime import datetime

bulletin_url = sbc.get_latest_bl_bulletin_url()
bulletin_content = sc.download(bulletin_url, silent=True)
soup = BeautifulSoup(bulletin_content, 'html.parser')
content = soup.find('strong', string=re.compile(r'Per heute .*')).string
# strip unwanted characters
content = content.encode("ascii", errors="ignore").decode()
dd = sc.DayData(canton='BL', url=bulletin_url)
dd.datetime = sc.find(r'Per heute \w+, (\d+\. \w+ 20\d{2})', content)
dd.isolated = sc.find(
    r'Aktuell befinden sich.*(\d+\s?\d+) Personen in Isolation', content)
dd.quarantined = sc.find(
    r'Aktuell befinden sich.*(\d+\s?\d+) Personen in Quarantäne', content)

is_first = True
if dd:
    print(dd)
    is_first = False

main_url = "https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/covid-19-faelle-kanton-basel-landschaft"
main_site = sc.download(main_url, silent=True)
Example #30
xls_url = 'https://www.jgk.be.ch/jgk/de/index/gemeinden/gemeinden/gemeindedaten.assetref/dam/documents/JGK/AGR/de/Gemeinden/Gemeindedaten/agr_gemeinden_gemeindedaten_gemeinden_rk_de.xlsx'
xls = sc.xlsdownload(xls_url, silent=True)
xls_data = sc.parse_xls(xls, header_row=1, columns_to_parse=9)
communes = {}
for item in xls_data:
    commune = item['Gemeinde / Commune']
    # kind of expected in this context
    commune = commune.replace(' (BE)', '')
    commune = commune.replace(' BE', '')
    district = item['Verwaltungskreis / Arrondissement administratif']
    communes[commune] = district
    assert district in district_ids, f'District {district} is unknown!'

# start getting and parsing the data
html_url = 'https://www.besondere-lage.sites.be.ch/besondere-lage_sites/de/index/corona/index.html'
d = sc.download(html_url, silent=True)
d = d.replace('&nbsp;', ' ')
soup = BeautifulSoup(d, 'html.parser')
tbody = soup.find('table', {'summary': 'Laufend aktualisierte Zahlen zu den Corona-Erkrankungen im Kanton Bern'}).find_next('tbody')
for row in tbody.find_all('tr'):
    tds = row.find_all('td')
    date_str = sc.find(r'(\d+\.\d+\.\d+)', tds[0].text)
    date = sc.date_from_text(date_str)

    dds = {}
    for (district, d_id), (district, population) in zip(district_ids.items(), inhabitants.items()):
        dd = sc.DistrictData(district=district, canton='BE')
        dd.url = html_url
        dd.district_id = d_id
        dd.population = population
        dd.date = date.isoformat()