Python DayData Examples, scrape_common.DayData Python Examples

Example #1

0

Show file

def parse_weekly_pdf():
    """Print isolation and quarantine figures from the weekly VD PDF.

    Downloads the infosan.vd.ch landing page, follows the first link whose
    href matches the "point epidemiologique" publication path, then downloads
    the first link on that page ending in ``.pdf``.  Two ``DayData`` records
    are printed, each followed by a separator line: one with the isolated and
    quarantined counts, one with the travel-quarantine count.

    All patterns are raw strings so that regex escapes such as ``\\d`` and
    ``\\s`` are not treated as (invalid) Python string escapes.
    """
    sc.add_cert_to_bundle()
    base_url = 'https://www.infosan.vd.ch'
    d = sc.download(base_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    html_url = base_url + soup.find(href=re.compile("/publications/covid-19-point-epidemiologique")).get('href')
    d = sc.download(html_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    pdf_url = base_url + soup.find(href=re.compile(r"\.pdf$")).get('href')
    pdf = sc.pdfdownload(pdf_url, silent=True)

    # Sample text from the PDF of 29.07.2020:
    # Concernant le traçage des contacts de cas positifs, le 27 juillet, 83 personnes étaient en isolement, 633 en quarantaine de contacts étroits et 901 en quarantaine de retour de voyage.

    # First record: isolated / quarantined contacts.
    dd = sc.DayData(canton='VD', url=pdf_url)
    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    date = sc.find(r'Concernant le traçage des contacts de cas positifs, le (\d+.*),', pdf)
    # The sentence only carries day and month; the year comes from the
    # "Situation au ..." match above.
    dd.datetime = date + ' ' + year
    dd.isolated = sc.find(r'(\d+)\s(personnes|cas\spositifs)\sétaient\sen\sisolement', pdf)
    dd.quarantined = text_to_int(sc.find(r'(\d.\d+|\d+)\scontacts\sétroits\sen\squarantaine\.', pdf))
    print(dd)
    print('-' * 10)

    # Second record: quarantine after returning from travel, with its own date.
    dd = sc.DayData(canton='VD', url=pdf_url)
    date = sc.find(r'quarantaine. Le (\d+ .*),', pdf)
    dd.datetime = date + ' ' + year
    dd.quarantine_riskareatravel = text_to_int(sc.find(r', (\d.\d+|\d+)\spersonnes\sétaient\sen\squarantaines\ssuite\sà\sun\sretour\sde\svoyage.', pdf))
    print(dd)
    print('-' * 10)

Example #2

0

Show file

File: scrape_vd.py Project: robi-ch/covid_19

def parse_weekly_pdf():
    """Print isolation and quarantine figures from the weekly VD PDF.

    The PDF URL comes from ``svc.get_weekly_pdf_url()``.  Up to two
    ``DayData`` records are printed, each followed by a separator line: one
    with the isolated and quarantined counts, one with the travel-quarantine
    count.  When the sentence carrying a record's date is not found, a
    message is written to stderr and the function returns without printing
    that record.

    All patterns are raw strings so that regex escapes such as ``\\d`` and
    ``\\s`` are not treated as (invalid) Python string escapes.
    """
    pdf_url = svc.get_weekly_pdf_url()
    pdf = sc.pdfdownload(pdf_url, silent=True)
    # Sample text from the PDF of 29.07.2020:
    # Concernant le traçage des contacts de cas positifs, le 27 juillet, 83 personnes étaient en isolement, 633 en quarantaine de contacts étroits et 901 en quarantaine de retour de voyage.

    # First record: isolated / quarantined contacts.
    dd = sc.DayData(canton='VD', url=pdf_url)
    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    date = sc.find(
        r'Concernant le traçage des contacts de cas positifs, le (\d+.*),', pdf)
    if not date:
        print("isolated/quarantined numbers missing in weekly PDF of VD",
              file=sys.stderr)
        return
    # The sentence only carries day and month; the year comes from the
    # "Situation au ..." match above.
    dd.datetime = date + ' ' + year
    dd.isolated = sc.find(
        r'(\d+)\s(personnes|cas\spositifs)\sétaient\sen\sisolement', pdf)
    dd.quarantined = text_to_int(
        sc.find(r'(\d.\d+|\d+)\scontacts\sétroits\sen\squarantaine\.', pdf))
    print(dd)
    print('-' * 10)

    # Second record: quarantine after returning from travel, with its own date.
    dd = sc.DayData(canton='VD', url=pdf_url)
    date = sc.find(r'quarantaine. Le (\d+ .*),', pdf)
    if not date:
        # Same handling as for the first record: report and stop instead of
        # failing on the string concatenation below.
        print("travel quarantine numbers missing in weekly PDF of VD",
              file=sys.stderr)
        return
    dd.datetime = date + ' ' + year
    dd.quarantine_riskareatravel = text_to_int(
        sc.find(
            r', (\d.\d+|\d+)\spersonnes\sétaient\sen\squarantaines?\ssuite\sà\sun\sretour\sde\svoyage.',
            pdf))
    print(dd)
    print('-' * 10)

Example #3

0

Show file

def parse_table(title_pattern, data_url, column_count, parse_fn):
    """Print one AG ``DayData`` record for every body row of a titled table.

    The table is located through a ``soup`` object defined outside this
    function: the string matching ``title_pattern`` -> its enclosing ``h2``
    -> the next ``div`` -> the ``table`` inside it.  The first row supplies
    the column headers; every following row must have exactly
    ``column_count`` ``td`` cells.

    ``parse_fn(dd, value, header)`` is called once per cell with the cell
    text (typographic and ASCII apostrophes removed) and must return the
    record to continue with.
    """
    heading = soup.find(string=re.compile(title_pattern)).find_parent('h2')
    table = heading.find_next('div').find('table')
    header_cells = table.find('tr').find_all('th')
    headers = [" ".join(th.stripped_strings) for th in header_cells]

    # Skip the header row itself.
    body_rows = table.find_all('tr')[1:]
    for row in body_rows:
        dd = sc.DayData(canton='AG', url=data_url)
        cells = row.find_all(['td'])

        assert len(cells) == column_count, \
            f"Number of columns changed: {len(cells)} != {column_count}"

        for col_num, cell in enumerate(cells):
            header = headers[col_num]
            # Remove both apostrophe variants from the cell text.
            value = cell.string.replace("’", "").replace("'", "")
            dd = parse_fn(dd, value, header)

        print('-' * 10)
        print(dd)

Example #4

0

Show file

File: scrape_vd.py Project: openZH/covid_19

def parse_weekly_pdf():
    """Print cases, hospitalised and ICU counts from the weekly VD PDF.

    The PDF URL comes from ``svc.get_weekly_pdf_url()``.  One ``DayData``
    record is filled from the PDF text and printed, followed by a separator
    line.  The record is asserted to be truthy before printing.

    All patterns are raw strings so that regex escapes such as ``\\d`` and
    ``\\s`` are not treated as (invalid) Python string escapes.  Accented
    letters are matched with ``.`` in the patterns.
    """
    pdf_url = svc.get_weekly_pdf_url()
    pdf = sc.pdfdownload(pdf_url, silent=True)

    dd = sc.DayData(canton='VD', url=pdf_url)
    dd.datetime = sc.find(r'Point .pid.miologique au (\d+\s+\w+\s+\d{4})', pdf)
    dd.cases = text_to_int(
        sc.find(
            r'\s(\d+.\d+)\s+personnes ont .t. test.es positives au SARS-CoV-2.',
            pdf))
    dd.hospitalized = sc.find(
        r'(\d+)\s+patients\s+sont\s+actuellement\s+hospitalis.s', pdf)
    dd.icu = sc.find(r'dont\s+(\d+)\s+en\s+soins\s+intensifs', pdf)
    assert dd
    print(dd)
    print('-' * 10)

Example #5

0

Show file

File: scrape_sz.py Project: robi-ch/covid_19

# Find the spreadsheet link on the already parsed page; a missing link makes
# the subscript fail with TypeError, which ends the script with exit code 1.
try:
    xls_link = soup.find(
        'a', string=re.compile(r'Coronaf.lle\s*im\s*Kanton\s*Schwyz'))
    xls_url = xls_link['href']
except TypeError:
    print("Unable to determine xls url", file=sys.stderr)
    sys.exit(1)
xls = sc.xlsdownload(xls_url, silent=True)

rows = sc.parse_xls(xls)
for row in rows:
    day = row['Datum']
    # Rows whose 'Datum' is not a datetime are ignored.
    if not isinstance(day, datetime.datetime):
        continue

    # Separator goes between records, never before the first one.
    if is_first:
        is_first = False
    else:
        print('-' * 10)

    # TODO: remove when source is fixed
    # handle wrong value on 2020-03-25, see issue #631
    if day.date().isoformat() == '2020-03-25':
        row['Bestätigte Fälle (kumuliert)'] = ''

    dd = sc.DayData(canton='SZ', url=url)
    dd.datetime = day.date().isoformat()
    # Append the time of day only when the sheet provides one.
    if row['Zeit']:
        dd.datetime += ' ' + row['Zeit'].time().isoformat()
    dd.cases = row['Bestätigte Fälle (kumuliert)']
    dd.deaths = row['Todesfälle (kumuliert)']
    dd.recovered = row['Genesene (kumuliert)']
    print(dd)

Example #6

0

Show file

File: scrape_gl.py Project: donut1996/covid_19

    text = re.sub(r'\s\s+', ' ', text)
    return text.split(' ')


# weekly pdf
pdf_url = sgc.get_gl_pdf_url()
pdf = sc.download_content(pdf_url, silent=True)

# Page 1: report date and total number of PCR tests.
content = sc.pdftotext(pdf, page=1)
pdf_date = sc.find(r'Stand: (\d{2}\.\d{2}.\d{4})', content)
pdf_date = sc.date_from_text(pdf_date)

number_of_tests = sc.find(r'PCR-Tests\sKanton Glarus\s(\d+\'?\d+)\s',
                          content)
is_first = True
if number_of_tests:
    # Drop the thousands separator only after a match was confirmed;
    # calling .replace() on a failed lookup would raise AttributeError.
    number_of_tests = number_of_tests.replace('\'', '')
    dd = sc.DayData(canton='GL', url=pdf_url)
    dd.datetime = pdf_date
    dd.tested = number_of_tests
    is_first = False
    print(dd)

# Page 2 (raw text): each labelled line is captured and split on whitespace
# into one list per label.
content = sc.pdftotext(pdf, page=2, raw=True)
dates = split_whitespace(
    sc.find(r'\n(\d+\.\d+\s+\d+\.\d+\s+.*)\nMassenquarant.ne', content))
travel_q = split_whitespace(
    sc.find(r'\nEinreisequarant.ne\s+(\d.*)\n', content))
isolation = split_whitespace(sc.find(r'\nIsolation\s+(\d.*)\n', content))
quarantined = split_whitespace(sc.find(r'\nQuarant.ne\s+(\d.*)\n', content))
ips = split_whitespace(sc.find(r'\nCovid Patienten in IPS\s+(\d.*)\n',
                               content))

Example #7

0

Show file

File: scrape_ur.py Project: sinead90/covid_19

			<td icms="">0</td>
			<td icms="">47</td>
			<td icms="">7</td>
			<td icms="">&nbsp;</td>
		</tr>
	</tbody>
</table>
"""

soup = BeautifulSoup(d, 'html.parser')
data_table = soup.find(string=re.compile(
    r'Positiv\s+getestete\s+Erkrankungsfälle')).find_parent('table')

assert data_table, "Can't find data table"

dd = sc.DayData(canton='UR', url=url)
dd.datetime = sc.find(r'Stand[A-Za-z ]*[:,]? ([^<)]+ Uhr)<', d)

rows = data_table.find_all('tr')
assert len(rows) == 2, f"Number of rows changed, {len(rows)} != 2"

headers = rows[0].find_all('td') or rows[0].find_all('th')
assert len(
    headers) == 6, f"Number of header columns changed, {len(headers)} != 6"
assert headers[0].text.strip() == "Aktive Fälle"
assert headers[1].text == "Positiv getestete Erkrankungsfälle"
assert headers[2].text == "Hospitalisiert"
assert headers[3].text == "Quarantäne"
assert headers[4].text == "Verstorben"

cells = rows[1].find_all('td')

Example #8

0

Show file

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import scrape_common as sc

xls_url = 'http://www.nw.ch/coronastatistik'
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=3)

# DayData attribute -> spreadsheet column it is filled from.
nw_columns = (
    ('cases', 'Positiv getestete Personen (kumuliert)'),
    ('hospitalized', 'Hospitalisierte Personen'),
    ('icu', 'Davon auf der Intensivstation'),
    ('deaths', 'Verstorbene Personen'),
)

is_first = True
for row in rows:
    # Separator goes between records, never before the first one.
    if is_first:
        is_first = False
    else:
        print('-' * 10)

    dd = sc.DayData(canton='NW', url=xls_url)
    # Column 'A' holds the date of the row.
    dd.datetime = row['A'].date().isoformat()
    for attribute, column in nw_columns:
        setattr(dd, attribute, row[column])
    print(dd)

Example #9

0

Show file

File: scrape_ti.py Project: yangzhe1990/covid_19

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import scrape_common as sc

xls_url = 'https://www4.ti.ch/fileadmin/DSS/DSP/UMC/malattie_infettive/Coronavirus/dati/COVID19_Dati_TI_per_github.xlsx'
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=0)

# DayData attribute -> spreadsheet column it is filled from.
ti_columns = (
    ('cases', 'ncumul_conf'),
    ('hospitalized', 'current_hosp'),
    ('icu', 'current_icu'),
    ('vent', 'current_vent'),
    ('recovered', 'ncumul_released'),
    ('deaths', 'ncumul_deceased'),
)

is_first = True
for row in rows:
    # Separator goes between records, never before the first one.
    if is_first:
        is_first = False
    else:
        print('-' * 10)

    dd = sc.DayData(canton='TI', url=xls_url)
    dd.datetime = row['date'].date().isoformat()
    # Append "T<time>" only when the row carries a time value.
    if row['time']:
        dd.datetime += 'T' + row['time'].time().isoformat()
    for attribute, column in ti_columns:
        setattr(dd, attribute, row[column])
    print(dd)

Example #10

0

Show file

#!/usr/bin/env python3

import csv
from io import StringIO
import re
from bs4 import BeautifulSoup
import scrape_common as sc

# hospitalized
url_hospitalized = 'https://stada.sg.ch/covid/C19_Faelle_hospitalisiert.html'
soup = BeautifulSoup(sc.download(url_hospitalized, silent=True), 'html.parser')
dd_hosp = sc.DayData(canton='SG', url=url_hospitalized)
hosp_table = soup.find('table')

hosp_date = hosp_table.find_next(string=re.compile("Stand")).string
dd_hosp.datetime = sc.find(r'Stand:?\s*(.+[0-9]{4})', hosp_date)

rows = hosp_table.find_all('tr')

headers = rows[0].find_all('td') or rows[0].find_all('th')
assert len(
    headers) == 2, f"Number of header columns changed, {len(headers)} != 2"
assert headers[1].text.strip() == "Anzahl"

for i in range(1, len(rows)):
    cells = rows[i].find_all('td')
    if cells[0].text.strip() == 'Total Covid-19 Patienten':
        dd_hosp.hospitalized = cells[1].text
    elif cells[0].text.strip() == '...davon auf Intensivstation ohne Beatmung':
        dd_hosp.icu = int(cells[1].text)
    elif cells[0].text.strip() == '...davon auf Intensivstation mit Beatmung':

Example #11

0

Show file

File: scrape_ge.py Project: robi-ch/covid_19

is_first = True

# parse tested from PDF
pdf_url = sgc.get_latest_ge_weekly_pdf_url()
pdf = sc.pdfdownload(pdf_url, silent=True)

week_number = sc.find(r'Situation semaine (\d+)', pdf)
if week_number:
    week_end_date = datetime.datetime.strptime('2021-W' + week_number + '-7',
                                               '%G-W%V-%u').date()
    number_of_tests = sc.find(r'Au total, (\d+\'\d+) tests PCR ont', pdf)

    if number_of_tests is not None:
        number_of_tests = number_of_tests.replace('\'', '')

        dd_test = sc.DayData(canton='GE', url=pdf_url)
        dd_test.datetime = week_end_date.isoformat()
        dd_test.tested = number_of_tests
        print(dd_test)
        is_first = False

# get hospitalized number
hosp_url = 'https://www.hug.ch/coronavirus-maladie-covid-19/situation-aux-hug'
d = sc.download(hosp_url, silent=True)
d = d.replace('&nbsp;', ' ')
soup = BeautifulSoup(d, 'html.parser')
content = soup.find(string=re.compile(
    "Evolution du nombre de malades.*")).find_previous('p').text

dd_hosp = sc.DayData(canton='GE', url=hosp_url)
hosp_date = sc.find(r'^Au (\d+\s*(:?\w+)?\s+\w+)\s+à\s+\d+h',

Example #12

0

Show file

pdf_url = f'{base_url}{pdf_url}'

content = sc.pdfdownload(pdf_url, layout=True, silent=True)
"""
Hospitalisationen im Kanton  Anzahl Personen in Isolation  davon Kontakte in Quarantäne  Anzahl zusätzlicher Personen in Quarantäne nach Rückkehr aus Risikoland  Re- Wert***
6 (6)                        120 (71)                      280 (189)                     388 (280)                                                                1.46 (1.1)
"""

rows = []

date = sc.find(r'S\s?tand: (\d+\.\d+\.20\d{2})', content)
res = re.search(
    r'Hospitalisationen im Kanton.*\d+ \(\d+\)\s+(\d+) \(\d+\)\s+(\d+) \(\d+\)\s+(\d+) \(\d+\)\s+\d\.\d+ \(\d\.\d+\)',
    content, re.DOTALL)
if res is not None:
    data = sc.DayData(canton='SO', url=pdf_url)
    data.datetime = date
    data.isolated = res[1]
    data.quarantined = res[2]
    data.quarantine_riskareatravel = res[3]
    rows.append(data)

url = f"{base_url}/index.php?id=27979"
d = sc.download(url, silent=True)
d = d.replace("&nbsp;", " ")

soup = BeautifulSoup(d, 'html.parser')
data_table = soup.find(
    'h2', text=re.compile("Situation Kanton Solothurn")).find_next("table")
if data_table:
    headers = [cell.string for cell in data_table.find('tr').find_all('th')]

Example #13

0

Show file

import scrape_common as sc
import scrape_ag_common as sac

xls_url = sac.get_ag_xls_url()
xls = sc.xlsdownload(xls_url, silent=True)
is_first = True

# quarantine_riskareatravel
rows = sc.parse_xls(xls,
                    sheet_name='5. Quarantäne nach Einreise',
                    header_row=2)
for row in rows:
    if not isinstance(row['A'], datetime.datetime):
        continue

    dd = sc.DayData(canton='AG', url=xls_url)
    dd.datetime = f"{row['A'].date().isoformat()} {row['A'].time().isoformat()}"
    dd.quarantine_riskareatravel = row['Gesamtzahl aktuell betreuter Personen']
    if dd:
        if not is_first:
            print('-' * 10)
        is_first = False
        print(dd)

# quarantine + isolation
rows = sc.parse_xls(xls, sheet_name='2. Contact Tracing', header_row=2)
for row in rows:
    if not isinstance(row['A'], datetime.datetime):
        continue

    dd = sc.DayData(canton='AG', url=xls_url)

Example #14

0

Show file

def fix_lu_date(date):
    """Convert a ``20YY/M/D`` string into an ISO date with the month shifted by one.

    The pattern is matched at the start of ``date``; anything after the day
    number is ignored.  The month number is incremented by one before the
    date is built (presumably the source counts months from zero - confirm
    against the data).

    Raises AssertionError when ``date`` does not start with the expected
    pattern, and ValueError when the shifted values do not form a valid date.
    """
    match = re.match(r'(20\d{2})/(\d+)/(\d+)', date)
    assert match, 'date could not be matched!'
    year, month, day = (int(part) for part in match.groups())
    return datetime.date(year, month + 1, day).isoformat()


hosp_url = 'https://www.lustat.ch/analysen/gesundheit/corona-reporting/hospitalisationen'
hosp_csv = 'https://www.lustat.ch/files_ftp/daten/covid/cov_hospitalisationen.csv'

is_first = True
data = sc.download(hosp_csv, silent=True, encoding='utf-8-sig')
reader = csv.DictReader(StringIO(data), delimiter=';')
for row in reader:
    dd = sc.DayData(canton='LU', url=hosp_url)
    dd.datetime = fix_lu_date(row['utcdatum'])
    dd.hospitalized = row['current_hosp']
    dd.vent = row['current_vent']
    if dd:
        if not is_first:
            print('-' * 10)
        is_first = False
        print(dd)

cases_url = 'https://www.lustat.ch/analysen/gesundheit/corona-reporting/entwicklungen-seit-maerz-2020'
cases_csv = 'https://www.lustat.ch/files_ftp/daten/covid/cov_faelle_g2.csv'

data = sc.download(cases_csv, silent=True, encoding='utf-8-sig')
reader = csv.DictReader(StringIO(data), delimiter=';')
for row in reader:

Example #15

0

Show file

    assert xls_url, "URL is empty"
    if not xls_url.startswith('http'):
        xls_url = f'https://www.jura.ch{xls_url}'

    xls = sc.xlsdownload(xls_url, silent=True)

    rows = sc.parse_xls(xls, header_row=0)
    for i, row in enumerate(rows):
        if not isinstance(row['Date'], datetime.datetime):
            continue

        if not is_first:
            print('-' * 10)
        is_first = False

        dd = sc.DayData(canton='JU', url=xls_url)
        dd.datetime = row['Date'].date().isoformat()
        dd.cases = row['Cumul des cas confimés']
        dd.hospitalized = row.get('Nb cas actuellement hospitalisés')
        dd.icu = row.get('Nb cas actuellement en SI')
        if sc.represents_int(row.get('Nombre de nouveaux décès')):
            dd.deaths = sum(r['Nombre de nouveaux décès']
                            for r in rows[:i + 1])
        print(dd)

data_table = soup.find(
    'caption', string=re.compile(
        r'Evolution du nombre de cas.*Jura')).find_parent('table')
if data_table:
    headers = [
        " ".join(cell.stripped_strings)

Example #16

0

Show file

File: scrape_ar.py Project: jcblemai/R0_BBRC_experiment

#!/usr/bin/env python3

import scrape_common as sc

url = 'https://www.ar.ch/verwaltung/departement-gesundheit-und-soziales/amt-fuer-gesundheit/informationsseite-coronavirus/'
# Replace &nbsp; entities with plain spaces before any matching is done.
d = sc.download(url, silent=True).replace('&nbsp;', ' ')

# Contact Tracing with its own timestamp

dd_ct = sc.DayData(canton='AR', url=url)

# Two timestamp layouts are tried in turn; the second one is only consulted
# when the first yields nothing.
t = sc.find(r'Contact\s+tracing\s+\(?.*?Stand\:?\s+([^\)]+)(Uhr)?.*?\)?', d)
if not t:
    t = sc.find(r'Contact\s+tracing\s+\(?.*?Stand\:?\s+([0-9]+\.[0-9]+\.? \/ [0-9]+h).*?\)?', d)
dd_ct.datetime = t

isolation_pattern = r'Aktuell\s+COVID-19-Erkrankte\s+in\s+Isolation:\s+<strong>(\d+)</strong>'
dd_ct.isolated = sc.find(isolation_pattern, d)

quarantine_pattern = r'Aktuell\s+im\s+Kanton\s+wohnhafte\s+Kontaktpersonen\s+in\s+Quarantäne:\s+<strong>(\d+)</strong>'
dd_ct.quarantined = sc.find(quarantine_pattern, d)

print(dd_ct)
print('-' * 10)

# cases

dd = sc.DayData(canton='AR', url=url)
# d = sc.filter('Aktuelle Informationen: Zahlen', d)

Example #17

0

Show file

#!/usr/bin/env python3

import re
import datetime
from bs4 import BeautifulSoup
import scrape_common as sc

url = 'https://gesundheit.lu.ch/themen/Humanmedizin/Infektionskrankheiten/Coronavirus'
d = sc.download(url, silent=True)
dd = sc.DayData(canton='LU', url=url)

# 2020-04-01
"""
<p><strong>Aktuelle Fallzahlen im Kanton Luzern&nbsp;</strong>(Stand: 1. April 2020, 11:00 Uhr)</p>
<table border="0" cellspacing="0" cellpadding="0">
    <tbody>
        <tr>
            <td valign="top" style="width: 151px;">
            <p><strong></strong>Bestätigte Fälle: </p>
            </td>
            <td valign="top" style="width: 47px;">
            <p style="text-align: right;">401</p>
            </td>
        </tr>
        <tr>
            <td valign="top" style="width: 151px;">
            <p>Hospitalisiert:</p>
            </td>
            <td valign="top" style="width: 47px;">
            <p style="text-align: right;">57</p>
            </td>

Example #18

0

Show file

#!/usr/bin/env python3

import csv
import re
from io import StringIO
from bs4 import BeautifulSoup
import scrape_common as sc

url = "https://www.zh.ch/de/gesundheit/coronavirus.html#-1310230111"

# get quarantined and isolated from website
dd_iso_q = sc.DayData(canton='ZH', url=url)
d = sc.download(url, silent=True)

# 2020-07-08
"""
<div class="mdl-richtext ">
<h2 class="atm-heading" id="-1310230111" tabindex="-1">Gesundheitliche Lage</h2>
<p class="atm-paragraph">Personen mit Wohnsitz im Kanton Zürich<br> </p>
<h4 class="atm-heading" id="-718243468">23</h4>
<p class="atm-paragraph">neue positive Fälle in den letzten 24 Stunden</p>
<h4 class="atm-heading" id="-718243501">11</h4>
<p class="atm-paragraph">in Spitalbehandlung</p>
<h4 class="atm-heading" id="808114848">3</h4>
<p class="atm-paragraph">davon mit künstlicher Beatmung</p>
<h4 class="atm-heading" id="-790711940">131</h4>
<p class="atm-paragraph">Total Verstorbene seit Pandemiebeginn (78 in Alters- und Pflegeheimen, 51 im Spital, 2 Zuhause)</p>
<h4 class="atm-heading" id="-790711785">181</h4>
<p class="atm-paragraph">in Isolation</p>
<h4 class="atm-heading" id="-790704311">914</h4>
<p class="atm-paragraph">in Quarantäne &nbsp;</p>

Example #19

0

Show file

reader = csv.DictReader(StringIO(d_csv), delimiter=',')
data = collections.defaultdict(dict)
for row in reader:
    if row['Typ'] == 'NA' or row['Datum'] == 'NA':
        continue
    date = sc.date_from_text(row['Datum'])
    data[date.isoformat()][row['Typ']] = row['Anzahl']

days = list(data.keys())
is_first = True
for day in days:
    if not is_first:
        print('-' * 10)
    is_first = False

    dd = sc.DayData(canton='ZG', url=main_url)
    dd.datetime = day
    dd.isolated = data[day]['Isolation']
    dd.quarantined = data[day]['Quarantäne aus Contact Tracing']
    dd.quarantine_riskareatravel = data[day].get(
        'Quarantäne nach Rückkehr aus Risikoland')
    dd.quarantine_total = data[day].get('Quarantäne Total')

    print(dd)

cases_csv_url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-01-e1.csv'
d_csv = sc.download(cases_csv_url, silent=True)
"""
"Typ","Datum","Anzahl","Stand","Meta","Type","Content"
"Fallzahl","22.04.2020","176","2020-04-22 08:00:00",NA,NA,NA
"Fallzahl","23.04.2020","178","2020-04-23 08:00:00",NA,NA,NA

Example #20

0

Show file

File: scrape_ag.py Project: Departing/CHCovid-19

#!/usr/bin/env python3

from bs4 import BeautifulSoup
import scrape_common as sc

# fetch latest data from HTML table
url = 'https://www.ag.ch/de/themen_1/coronavirus_2/coronavirus.jsp'
d = sc.download(url, silent=True)
d = d.replace("’", "")
d = d.replace("'", "")

dd = sc.DayData(canton='AG', url=url)

date = sc.find(
    r'Stand: (?:Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag), (.+? Uhr)',
    d)
dd.datetime = date

soup = BeautifulSoup(d, 'html.parser')
rows = []
for t in soup.find_all('table'):
    headers = [
        " ".join(cell.stripped_strings) for cell in t.find('tr').find_all('th')
    ]

    for row in [r for r in t.find_all('tr') if r.find_all('td')]:

        cells = row.find_all(['td'])

        col_num = 0
        for cell in cells:

Example #21

0

Show file

File: scrape_vs.py Project: matsf/covid_19

stat_url = base_url + '/de/web/coronavirus/statistiques'
content = sc.download(stat_url, silent=True)
soup = BeautifulSoup(content, 'html.parser')
res = soup.find(
    string=re.compile(r'Synthese COVID19 VS Woche\d+')).find_previous('a')
weekly_pdf_url = base_url + res.attrs['href']
weekly_pdf_url = weekly_pdf_url.replace(' ', '%20')
content = sc.pdfdownload(weekly_pdf_url, silent=True)

# add isolated/quarantined to the existing DayData item
week_end_date = sc.find(r'vom (\d+)\. bis (\d+\.\d+\.20\d{2})',
                        content,
                        group=2)
week_end_date = sc.date_from_text(week_end_date).isoformat()

dd = sc.DayData(canton='VS', url=weekly_pdf_url)
dd.datetime = week_end_date
dd.isolated = sc.find(
    r'befanden\ssich\s(\d+)\spositive\sF.lle\snoch\simmer\sin\sIsolation',
    content)
dd.quarantined = sc.find(r'Isolation\sund\s(\d+)\sKontakte\sin\sQuarant.ne',
                         content)
dd.quarantine_riskareatravel = sc.find(r'\s(\d+)\sReisende\sin\sQuarant.ne',
                                       content)
print(dd)

xls_url = 'https://raw.githubusercontent.com/statistikZH/covid19_drop/master/Chiffres%20%20COVID-19%20Valais.xlsx'
main_url = 'https://www.vs.ch/de/web/coronavirus'
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=1)
for i, row in enumerate(rows):

Example #22

0

Show file

File: scrape_be.py Project: doerfli/covid_19

        'table',
    {
        'summary':
        'Laufend aktualisierte Zahlen zu den Corona-Erkrankungen im Kanton Bern'
    }):
    headers = [
        " ".join(cell.stripped_strings) for cell in t.find('tr').find_all('th')
    ]

    is_first = True
    for row in [r for r in t.find_all('tr') if r.find_all('td')]:
        if not is_first:
            print('-' * 10)
        is_first = False

        dd = sc.DayData(canton='BE', url=html_url)

        for col_num, cell in enumerate(row.find_all(['td'])):
            value = " ".join(cell.stripped_strings)
            if value:
                value = value.replace("'", "")
            if value and '*' in value and not '**' in value:
                # the asteriks (*) indicates a not-current value
                # ** means "Datenkorrektur"
                continue
            if value and '(' in value:
                value = sc.find(r'(\d+)([\s<>br\w]*\(.*\))?', value)

            if headers[col_num] == 'Datum':
                date_string = "".join(list(cell.stripped_strings)[0:-1])
                time_string = list(cell.stripped_strings)[-1]

Example #23

0

Show file

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import re
import scrape_common as sc
import scrape_bl_common as sbc
from collections import OrderedDict, defaultdict
from datetime import datetime

# Latest bulletin: isolation / quarantine numbers from the "Per heute ..."
# paragraph.
bulletin_url = sbc.get_latest_bl_bulletin_url()
bulletin_content = sc.download(bulletin_url, silent=True)
soup = BeautifulSoup(bulletin_content, 'html.parser')
content = soup.find('strong', string=re.compile(r'Per heute .*')).string
# strip unwanted characters
# NOTE: this drops every non-ASCII character, umlauts included, so the
# patterns below must not contain any (e.g. "Quarantäne" becomes "Quarantne").
content = content.encode("ascii", errors="ignore").decode()
dd = sc.DayData(canton='BL', url=bulletin_url)
dd.datetime = sc.find(r'Per heute \w+, (\d+\. \w+ 20\d{2})', content)
# ".*?" is non-greedy on purpose: a greedy ".*" would swallow the leading
# digits and leave only the last two for the capture group.
dd.isolated = sc.find(
    r'Aktuell befinden sich.*?(\d+\s?\d+) Personen in Isolation', content)
# "Quarant.?ne" matches the word with or without its (stripped) umlaut.
dd.quarantined = sc.find(
    r'Aktuell befinden sich.*?(\d+\s?\d+) Personen in Quarant.?ne', content)

is_first = True
if dd:
    print(dd)
    is_first = False

main_url = "https://www.baselland.ch/politik-und-behorden/direktionen/volkswirtschafts-und-gesundheitsdirektion/amt-fur-gesundheit/medizinische-dienste/kantonsarztlicher-dienst/aktuelles/covid-19-faelle-kanton-basel-landschaft"
main_site = sc.download(main_url, silent=True)

# 2020-04-08, two iframes

Example #24

0

Show file

File: scrape_bl.py Project: zc94589523/covid_19

                            rows[key]['hospitalized'] = int(
                                float(c[1] or 0) + float(c[2] or 0) +
                                float(c[3] or 0))
                        rows[key]['icu'] = int(
                            float(c[2] or 0) + float(c[3] or 0))
                        rows[key]['vent'] = c[3]

# order dict by key to ensure the most recent entry is last
ordered_rows = OrderedDict(sorted(rows.items()))
is_first = True
for row_date, row in ordered_rows.items():
    if not is_first:
        print('-' * 10)
    is_first = False

    dd = sc.DayData(canton='BL', url=main_url)
    dd.datetime = row['date']
    dd.cases = sc.safeint(row['cases'])
    try:
        dd.hospitalized = sc.safeint(row['hospitalized'])
    except KeyError:
        pass
    try:
        dd.icu = sc.safeint(row['icu'])
    except KeyError:
        pass
    try:
        dd.vent = sc.safeint(row['vent'])
    except KeyError:
        pass
    dd.deaths = sc.safeint(row['deaths'])

Example #25

0

Show file

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import datetime
import scrape_common as sc

xls_url = 'https://raw.githubusercontent.com/statistikZH/covid19_drop/master/Chiffres%20%20COVID-19%20Valais.xlsx'
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=1)

# DayData attribute -> spreadsheet column it is filled from.
vs_columns = (
    ('cases', 'Cumul cas positifs'),
    ('hospitalized', 'Total hospitalisations COVID-19'),
    ('icu', 'Patients COVID-19 aux SI total'),
    ('vent', 'Patients COVID-19 intubés'),
    ('deaths', 'Cumul décès COVID-19'),
)

is_first = True
for row in rows:
    day = row['Date']
    # Rows whose 'Date' is not a datetime are ignored.
    if not isinstance(day, datetime.datetime):
        continue

    # Separator goes between records, never before the first one.
    if is_first:
        is_first = False
    else:
        print('-' * 10)

    dd = sc.DayData(canton='VS', url=xls_url)
    dd.datetime = day.date().isoformat()
    for attribute, column in vs_columns:
        setattr(dd, attribute, row[column])
    print(dd)

Example #26

0

Show file

#!/usr/bin/env python3

import scrape_common as sc

url = "https://gd.zh.ch/internet/gesundheitsdirektion/de/themen/coronavirus.html"
dd = sc.DayData(canton='ZH', url=url)
d = sc.download(url, silent=True)
d = d.replace('&nbsp;', ' ')
d = d.replace('<strong>', ' ').replace('</strong>', ' ')
# d = sc.filter(r"Im Kanton Zürich sind zurzeit|\(Stand|Total ([0-9]+) Todesfälle|Spitalbehandlung|beatmet", d)
#                                 <h2>Aktuelle Situation im Kanton Zürich (24.3.2020, 9.30 Uhr)</h2>
#                         
#                         
#                         
#                         <p>Im Kanton Zürich sind zurzeit 1211 Personen positiv auf das Coronavirus getestet worden. Total 5 Todesfälle (78-jährig, 80, 88, 96, 97).</p>
# <p>(Stand 24.3.2020, 9.30 Uhr)</p>


# 2020-03-26
"""
				<h2>Aktuelle Situation im Kanton Zürich (26.3.2020, 9.30 Uhr)</h2>
			
			
			
			<p>Im Kanton Zürich sind zurzeit 1476 Personen positiv auf das Coronavirus getestet worden.</p>
<p>152 positiv Getestete befinden sich in Spitalbehandlung, davon werden 32 künstlich beatmet.</p>
<p>Total 9 Todesfälle (78-jährig, 78, 80, 80, 85, 88, 90, 96, 97).</p>
<p>Die Gesundheitsdirektion beschafft sich eine Maschine, die täglich automatisch bis zu 32'000 FFP2-Schutzmasken herstellen kann. In der zweiten Hälfte des Monats April ist die Maschine betriebsbereit.&nbsp;</p>
<p>(Stand 26.3.2020, 9.30 Uhr)</p>
"""

Example #27

0

Show file

File: scrape_bs.py Project: RiLeone/covid_19

import scrape_common as sc

# The list of articles is also available on https://www.gd.bs.ch/medienseite/medienmitteilungen.html
URL = sc.download("https://www.gd.bs.ch/", silent=True)
URL = sc.filter(
    r'Tagesbulletin.*Corona.*\d+\s*bestätigte\s*(Fälle|Infektionen)', URL)

# 2020-03-25, List of sub-articles:
"""
    <a href="/nm/2020-tagesbulletin-coronavirus-466-bestaetigte-faelle-im-kanton-basel-stadt-gd.html" target="_self">Tagesbulletin Coronavirus: 466 bestätigte Fälle im Kanton Basel-Stadt</a>
    <a href="/nm/2020-tagesbulletin-coronavirus-414-bestaetigte-faelle-im-kanton-basel-stadt-gd.html" target="_self">Tagesbulletin Coronavirus: 414 bestätigte Fälle im Kanton Basel-Stadt</a>
    <a href="/nm/2020-tagesbulletin-coronavirus-376-bestaetigte-faelle-im-kanton-basel-stadt-gd.html" target="_self">Tagesbulletin Coronavirus: 376 bestätigte Fälle im Kanton Basel-Stadt</a>
"""

url = 'https://www.gd.bs.ch/' + sc.filter(r'href', URL).split('"')[1]
dd = sc.DayData(canton='BS', url=url)
d = sc.download(url, silent=True)

d = d.replace('&auml;', 'ä')
d = d.replace('&ouml;', 'ö')
d = d.replace('&nbsp;', ' ')

# 2020-03-25
"""
                        <p>Das Gesundheitsdepartement Basel-Stadt meldet mit Stand Mittwoch, 25. März 2020, 10 Uhr, insgesamt 466 positive Fälle von Personen mit Wohnsitz im Kanton Basel-Stadt sowie drei weitere Todesfälle. </p>
"""

# There are some extra (or repeated) information in the previous / next paragraphs:

# 2020-03-25
"""

Example #28

0

Show file

File: scrape_sz.py Project: ahirlinger/covid_19

from bs4 import BeautifulSoup
import scrape_common as sc

# Press-release overview page; it links both the latest PDF and the
# spreadsheet used further down.
d = sc.download(
    'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948',
    silent=True)
soup = BeautifulSoup(d, 'html.parser')

pdf_link = soup.find('a', string=re.compile(r'Medienmitteilung vom'))
pdf_url = pdf_link['href']
pdf_content = sc.pdfdownload(pdf_url, layout=True, silent=True)
date = sc.find(r'Stand: (\d+\. .* 20\d{2})', pdf_content)
# Six whitespace-separated numbers are expected; only the last three are
# captured and used.
res = re.search(r'.*\s+\d+\s+\d+\s+\d+\s+(\d+)\s+(\d+)\s+(\d+)\s+',
                pdf_content)
is_first = True
if res is not None:
    hospitalized, quarantined, travel_quarantined = res.groups()
    dd = sc.DayData(canton='SZ', url=pdf_url)
    dd.datetime = date
    dd.hospitalized = hospitalized
    dd.quarantined = quarantined
    dd.quarantine_riskareatravel = travel_quarantined
    print(dd)
    is_first = False

# Find the spreadsheet link; a missing link makes the subscript fail with
# TypeError, which ends the script with exit code 1.
try:
    xls_link = soup.find(
        'a', string=re.compile(r'Coronaf.lle\s*im\s*Kanton\s*Schwyz'))
    xls_url = xls_link['href']
except TypeError:
    print("Unable to determine xls url", file=sys.stderr)
    sys.exit(1)
xls = sc.xlsdownload(xls_url, silent=True)

Example #29

0

Show file

File: scrape_fl.py Project: jcblemai/R0_BBRC_experiment

# get latest from list with all press releases
d = sc.download('https://www.regierung.li/coronavirus', silent=True)

pdf_url = sc.find(
    r'<a.*?href="([^"]+\.pdf)[^"]*"[^>]*?>[^<]+?Situationsbericht[^<]+?<\/a>',
    d)
assert pdf_url, "PDF URL not found"

# download latest PDF
d = sc.pdfdownload(pdf_url, raw=True, silent=True)
# extract case numbers reported for previous days
d = d.replace(u'\xa0', u' ')

# data from the most recent press release
dd = sc.DayData(canton='FL', url=pdf_url)
dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', d)

cases = sc.find(r'insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle', d)
dd.cases = cases

# The death count may be written as a digit or as a word, hence int_or_word.
deaths = None
m = re.search(r'Bisher\s+trat(en)?\s+(\S+)\s+(Todesfall|Todesfälle)',
              d,
              flags=re.I)
if m:
    deaths = sc.int_or_word(m[2])
    dd.deaths = deaths

# "All other patients have recovered": recovered = cases - deaths.  This is
# only computed when both numbers were actually found; otherwise int(None)
# would raise TypeError.
if re.search(
        r'Alle\s+weiteren\s+Erkrankten\s+sind\s+in\s+der\s+Zwischenzeit\s+genesen',
        d) and cases is not None and deaths is not None:
    dd.recovered = int(cases) - int(deaths)

print(dd)

Example #30

0

Show file

base_url = 'https://www.lustat.ch'
url = f'{base_url}/daten?id=28177'
d = sc.download(url, silent=True)
soup = BeautifulSoup(d, 'html.parser')

xls_url = soup.find('a', href=re.compile(r'.*\.xlsx')).get('href')
if not xls_url.startswith('http'):
    xls_url = f'{base_url}{xls_url}'
xls = sc.xlsdownload(xls_url, silent=True)
rows = sc.parse_xls(xls, header_row=5)
total_cases = 0
total_deaths = 0
is_first = True
for row in rows:
    dd = sc.DayData(canton='LU', url=xls_url)
    dd.datetime = row['Datum']
    dd.cases = sc.int_or_word(row.search(r'Neue\s+Fälle'))
    if dd.cases:
        total_cases += dd.cases
        dd.cases = total_cases
    dd.deaths = sc.int_or_word(row['Verstorbene'])
    if dd.deaths:
        total_deaths += dd.deaths
        dd.deaths = total_deaths
    dd.hospitalized = sc.int_or_word(row['Total'])
    dd.vent = sc.int_or_word(row.search(r'davon\s+beatmet'))
    dd.isolated = sc.int_or_word(row.search(r'in\s+Isolation'))
    dd.quarantined = sc.int_or_word(row.search(r'in\s+Quarantäne'))
    dd.quarantine_riskareatravel = sc.int_or_word(row.search(r'Reiserückkehrer\s+in\s+Quarantäne'))
    if dd: