def get_vs_weekly_general_data(pdf):
    """Extract the reporting week and year from a VS weekly bulletin PDF.

    Reads page 1, pulls the epidemiological week number and the week's end
    date, then derives the year from the week's *start* (end minus 7 days)
    so that weeks spanning a year boundary are attributed correctly.

    Returns a (week, year) tuple; week is the matched string (or None).
    """
    text = sc.pdftotext(pdf, page=1)
    week = sc.find(r'Epidemiologische Situation Woche (\d+)', text)
    last_day = sc.find(r'(\d+\.\d+\.\d{4})', text)
    last_day = sc.date_from_text(last_day)
    first_day = last_day - datetime.timedelta(days=7)
    return week, first_day.year
import scrape_gl_common as sgc


def split_whitespace(text):
    # Collapse runs of whitespace into single spaces, then split into tokens.
    # Returns an empty list for None/empty input.
    if not text:
        return []
    text = re.sub(r'\s\s+', ' ', text)
    return text.split(' ')


# weekly pdf
pdf_url = sgc.get_gl_pdf_url()
pdf = sc.download_content(pdf_url, silent=True)
content = sc.pdftotext(pdf, page=1)
pdf_date = sc.find(r'Stand: (\d{2}\.\d{2}.\d{4})', content)
pdf_date = sc.date_from_text(pdf_date)
# Test count may use an apostrophe as thousands separator (e.g. 1'234) -> strip it.
number_of_tests = sc.find(r'PCR-Tests\sKanton Glarus\s(\d+\'?\d+)\s', content).replace('\'', '')
is_first = True
if number_of_tests:
    dd = sc.DayData(canton='GL', url=pdf_url)
    dd.datetime = pdf_date
    dd.tested = number_of_tests
    is_first = False
    print(dd)

# Page 2 holds the quarantine table; raw=True keeps the column layout
# so the whitespace-separated values can be tokenized below.
content = sc.pdftotext(pdf, page=2, raw=True)
dates = split_whitespace(
    sc.find(r'\n(\d+\.\d+\s+\d+\.\d+\s+.*)\nMassenquarant.ne', content))
travel_q = split_whitespace(
    # Tail of a loop over commune records: normalize the commune name by
    # dropping canton suffixes, then map it to its administrative district.
    commune = commune.replace(' (BE)', '')
    commune = commune.replace(' BE', '')
    district = item['Verwaltungskreis / Arrondissement administratif']
    communes[commune] = district
    assert district in district_ids, f'District {district} is unknown!'

# start getting and parsing the data
html_url = 'https://www.besondere-lage.sites.be.ch/besondere-lage_sites/de/index/corona/index.html'
d = sc.download(html_url, silent=True)
# NOTE(review): this replace looks like a no-op (space -> space); the first
# argument was presumably a non-breaking space (U+00A0) lost in transit -- confirm.
d = d.replace(' ', ' ')
soup = BeautifulSoup(d, 'html.parser')
tbody = soup.find('table', {'summary': 'Laufend aktualisierte Zahlen zu den Corona-Erkrankungen im Kanton Bern'}).find_next('tbody')
for row in tbody.find_all('tr'):
    tds = row.find_all('td')
    date_str = sc.find(r'(\d+\.\d+\.\d+)', tds[0].text)
    date = sc.date_from_text(date_str)
    # Pre-create one DistrictData per district with 0 new cases.
    # NOTE(review): `district` is bound twice in the tuple unpacking; this
    # relies on district_ids and inhabitants having identical key order -- verify.
    dds = {}
    for (district, d_id), (district, population) in zip(district_ids.items(), inhabitants.items()):
        dd = sc.DistrictData(district=district, canton='BE')
        dd.url = html_url
        dd.district_id = d_id
        dd.population = population
        dd.date = date.isoformat()
        dd.new_cases = 0
        dds[district] = dd
    content = tds[2].text.strip()
    # fix Munchen-<br />\nbuchsee stuff
    content = re.sub(r'-\n(\w)', r'-\1', content)
    # fix <br /> without - from above, but no number on the next line...
# parse weekly data for isolated and quarantined numbers base_url = 'https://www.vs.ch' stat_url = base_url + '/de/web/coronavirus/statistiques' content = sc.download(stat_url, silent=True) soup = BeautifulSoup(content, 'html.parser') res = soup.find( string=re.compile(r'Synthese COVID19 VS Woche\d+')).find_previous('a') weekly_pdf_url = base_url + res.attrs['href'] weekly_pdf_url = weekly_pdf_url.replace(' ', '%20') content = sc.pdfdownload(weekly_pdf_url, silent=True) # add isolated/quarantined to the existing DayData item week_end_date = sc.find(r'vom (\d+)\. bis (\d+\.\d+\.20\d{2})', content, group=2) week_end_date = sc.date_from_text(week_end_date).isoformat() dd = sc.DayData(canton='VS', url=weekly_pdf_url) dd.datetime = week_end_date dd.isolated = sc.find( r'befanden\ssich\s(\d+)\spositive\sF.lle\snoch\simmer\sin\sIsolation', content) dd.quarantined = sc.find(r'Isolation\sund\s(\d+)\sKontakte\sin\sQuarant.ne', content) dd.quarantine_riskareatravel = sc.find(r'\s(\d+)\sReisende\sin\sQuarant.ne', content) print(dd) xls_url = 'https://raw.githubusercontent.com/statistikZH/covid19_drop/master/Chiffres%20%20COVID-19%20Valais.xlsx' main_url = 'https://www.vs.ch/de/web/coronavirus' xls = sc.xlsdownload(xls_url, silent=True)
"Positiv getestete Personen","10.03.2020","7",NA,NA,NA "Positiv getestete Personen","11.03.2020","6",NA,NA,NA "Positiv getestete Personen","12.03.2020","6",NA,NA,NA "Positiv getestete Personen","13.03.2020","8",NA,NA,NAh "Positiv getestete Personen","14.03.2020","10",NA,NA,NA "Positiv getestete Personen","15.03.2020","11",NA,NA,NA "Positiv getestete Personen","16.03.2020","19",NA,NA,NA "Positiv getestete Personen","17.03.2020","22",NA,NA,NA """ reader = csv.DictReader(StringIO(d_csv), delimiter=',') data = collections.defaultdict(dict) for row in reader: if row['Typ'] == 'NA' or row['Datum'] == 'NA': continue date = sc.date_from_text(row['Datum']) data[date.isoformat()][row['Typ']] = row['Anzahl'] days = list(data.keys()) is_first = True for day in days: if not is_first: print('-' * 10) is_first = False dd = sc.DayData(canton='ZG', url=main_url) dd.datetime = day dd.isolated = data[day]['Isolation'] dd.quarantined = data[day]['Quarantäne aus Contact Tracing'] dd.quarantine_riskareatravel = data[day].get( 'Quarantäne nach Rückkehr aus Risikoland')
<p>Intensivpflege (aktuell):</p>
    </td>
    <td style="text-align: right; vertical-align: top;">4</td>
  </tr>
"""

include_hosp = True
include_cases = True
include_isolated = True
# The page carries three separately dated sections; extract each "Stand" date.
case_date_str = sc.find(
    r'Fallzahlen\s*im\s*Kanton\s*Luzern.*\(Stand:\s*(.+?)\,', d)
hosp_date_str = sc.find(r'Hospitalisierungen.*\(Stand:\s*(.+?)\,', d)
isolated_date_str = sc.find(r'Isolation.*\(Stand:\s*(.+?)\,', d)
case_date = sc.date_from_text(case_date_str)
hosp_date = sc.date_from_text(hosp_date_str)
isolated_date = sc.date_from_text(isolated_date_str)
max_date = max(hosp_date, case_date, isolated_date)
# Skip any section that is older than the newest date; dd.datetime ends up
# set from whichever up-to-date section is checked last.
if max_date > hosp_date:
    include_hosp = False
else:
    dd.datetime = hosp_date_str
if max_date > case_date:
    include_cases = False
else:
    dd.datetime = case_date_str
if max_date > isolated_date:
    include_isolated = False
else:
from bs4 import BeautifulSoup
import re
import scrape_common as sc
import scrape_bl_common as sbc
from datetime import timedelta

# weekly data
bulletin_urls = sbc.get_all_bl_bulletin_urls()
for bulletin_url in bulletin_urls:
    bulletin_content = sc.download(bulletin_url, silent=True)
    soup = BeautifulSoup(bulletin_content, 'html.parser')
    # The bulletin paragraph always starts with "Per heute ...".
    content = soup.find(string=re.compile(r'Per heute .*')).string
    content = sbc.strip_bl_bulletin_numbers(content)
    date = sc.find(r'Per heute \w+, (\d+\. \w+ 20\d{2})', content)
    date = sc.date_from_text(date)
    # previous week -- the bulletin reports last week's test figures.
    date = date - timedelta(days=7)
    td = sc.TestData(canton='BL', url=bulletin_url)
    td.week = date.isocalendar()[1]
    td.year = date.year
    td.total_tests = sc.find(r'In der Vorwoche wurden (\d+) PCR-Tests', content)
    # Rate may use a decimal comma (e.g. "4,2"); normalized to a dot below.
    td.positivity_rate = sc.find(
        r'von diesen waren (\d+\.?,?\d?) Prozent positiv', content)
    # Only publish when both figures were found.
    if td.total_tests and td.positivity_rate:
        td.positivity_rate = td.positivity_rate.replace(',', '.')
        print(td)

# daily data
<td colspan="2">70</td> <td colspan="2">+8</td> </tr><tr class="even" ><td height="20">Verstorbene (kumuliert)</td> <td colspan="2">9</td> <td colspan="2">unverändert</td> </tr></tbody></table> """ include_hosp = True include_cases = True dates = re.findall(r'<h4>Stand ([0-9]+\.\s*[A-Za-z]*\s*[0-9]{4}).*<\/h4>', d) if len(dates) == 1: dd.datetime = dates[0] elif len(dates) >= 2: d1 = sc.date_from_text(dates[0]) d2 = sc.date_from_text(dates[1]) if d1 > d2: include_hosp = False dd.datetime = dates[0] elif d2 > d1: include_cases = False dd.datetime = dates[1] else: dd.datetime = dates[0] else: print("Error: Date not found.", file=sys.stderr) if include_cases: dd.cases = sc.find(r'Laborbest.+?tigte\s*F.+?lle\s*\(kumuliert\)<\/t[hd]>\s*<t[hd][^>]*>([0-9]+)<\/t[hd]>', d.replace("\n", "")) dd.deaths = sc.find(r'>Verstorbene\s*\(kumuliert\)<\/td>\s*<td[^>]*>([0-9]+)[ <]', d.replace("\n", ""))
    flags=re.I | re.UNICODE)
# Fallback patterns for the reporting-period sentence; the wording varies:
# both dates with year, start day only (same month as end), etc.
res_with_year = re.search(
    r'Entre\s+le\s+(?P<start>\d+\s+\w+\s+\d{4})\s+et\s+le\s+(?P<end>\d+\s+\w+\s+\d{4}),',
    pdf, flags=re.I | re.UNICODE)
res_no_month = re.search(
    r'Entre\s+le\s+(?P<start>\d+)\s+et\s+le\s+(?P<end>\d+\s+\w+),',
    pdf, flags=re.I | re.UNICODE)
res_no_month_with_year = re.search(
    r'Entre(?P<et>\s+et)?\s+le\s+(?P<start>\d+)\s+et\s+le\s+(?P<end>\d+\s+\w+\s+\d{4}),',
    pdf, flags=re.I | re.UNICODE)
if res:
    # Dates without a year: append the year extracted elsewhere.
    start_date = sc.date_from_text(f"{res['start']} {year}")
    end_date = sc.date_from_text(f"{res['end']} {year}")
elif res_with_year:
    start_date = sc.date_from_text(res_with_year['start'])
    end_date = sc.date_from_text(res_with_year['end'])
elif res_no_month:
    # Start given as a bare day number: borrow month from the end date.
    end_date = sc.date_from_text(f"{res_no_month['end']} {year}")
    start_date = sc.date_from_text(
        f"{res_no_month['start']}.{end_date.month}.{year}")
elif res_no_month_with_year:
    end_date = sc.date_from_text(res_no_month_with_year['end'])
    start_date = sc.date_from_text(
        f"{res_no_month_with_year['start']}.{end_date.month}.{end_date.year}"
    )
elif date:
    print(date)
# -*- coding: utf-8 -*- import re from bs4 import BeautifulSoup import scrape_common as sc url = 'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948' content = sc.download(url, silent=True) soup = BeautifulSoup(content, 'html.parser') pdf_url = soup.find('a', text=re.compile(r'Coronafälle pro Gemeinde')).get('href') content = sc.pdfdownload(pdf_url, layout=True, silent=True) date = sc.find(r'Stand\W+(\d+\.\d+\.20\d{2})', content) date = sc.date_from_text(date).isoformat() district_data = re.findall(r'^Bezirk\W+(\w+)\s+(≤?\s?\d+)', content, re.MULTILINE) # https://www.bfs.admin.ch/bfs/de/home/statistiken/kataloge-datenbanken/karten.assetdetail.5688189.html district_ids = { 'Einsiedeln': 501, 'Gersau': 502, 'Höfe': 503, 'Küssnacht': 504, 'March': 505, 'Schwyz': 506, } # https://www.sz.ch/kanton/bezirke/schwyz.html/72-210-112-106 population = { 'Einsiedeln': 16027,
#
# Scrape weekly COVID-19 test counts for canton ZG from the statistics
# office's CSV and print one TestData record per reporting week.
#
# Fixes: `collections` was used below without being imported, and
# defaultdict(dict) was the wrong factory (the code then manually
# initialised entries to 0); defaultdict(int) does that directly.
import collections
import csv
import datetime
from io import StringIO
import scrape_common as sc

csv_url = 'https://www.zg.ch/behoerden/gesundheitsdirektion/statistikfachstelle/daten/themen/result-themen-14-03-07-i2-k4-b1.csv'
d_csv = sc.download(csv_url, silent=True)

"""
"Woche","Geschlecht","Anzahl Fälle","Meta","Type","Content"
2020-05-25,"männlich","151",NA,NA,NA
2020-06-01,"männlich","117",NA,NA,NA
"""

reader = csv.DictReader(StringIO(d_csv), delimiter=',')
# Sum the case counts over all genders, keyed by the week's start date.
data = collections.defaultdict(int)
for row in reader:
    if row['Woche'] == 'NA':
        continue
    date = sc.date_from_text(row['Woche'])
    data[date] += int(row['Anzahl Fälle'])

for day in data:
    td = sc.TestData(canton='ZG', url=csv_url)
    td.start_date = day.isoformat()
    # A reporting week runs from its start date through the following 6 days.
    td.end_date = (day + datetime.timedelta(days=6)).isoformat()
    td.total_tests = data[day]
    print(td)
#!/usr/bin/env python3
"""Print daily COVID-19 test figures for canton BE.

Reads the openDataBE mirror CSV and emits one TestData record per row;
each record covers a single day (start_date == end_date).
"""
import csv
from io import StringIO
import scrape_common as sc

url = 'https://covid-kennzahlen.apps.be.ch/#/de/cockpit'
csv_url = 'https://raw.githubusercontent.com/openDataBE/covid19Data/develop/vortag_tests.csv'
csv_text = sc.download(csv_url, silent=True)

for record in csv.DictReader(StringIO(csv_text), delimiter=','):
    day_iso = sc.date_from_text(record['datum']).isoformat()
    td = sc.TestData(canton='BE', url=url)
    td.start_date = day_iso
    td.end_date = day_iso
    td.total_tests = record['durchgefuehrte_tests']
    td.positive_tests = record['positive_tests']
    td.positivity_rate = record['positivitaetsrate']
    print(td)
    # Tail of the inhabitants/district-id tables (BFS district numbers).
    'Rheintal': 1723,
    'Werdenberg': 1724,
    'Sarganserland': 1725,
    'See-Gaster': 1726,
    'Toggenburg': 1727,
    'Wil': 1728,
}

url = 'https://www.sg.ch/ueber-den-kanton-st-gallen/statistik/covid-19/_jcr_content/Par/sgch_downloadlist/DownloadListPar/sgch_download.ocFile/KantonSG_C19-Faelle_download.csv'
d = sc.download(url, silent=True)

# strip the "header" / description lines
d = "\n".join(d.split("\n")[5:])

reader = csv.DictReader(StringIO(d), delimiter=';')
for row in reader:
    # Week is encoded as e.g. "W12" in the Kalenderwoche column.
    week = sc.find(r'W(\d+)', row['Kalenderwoche'])
    date = sc.date_from_text(row['Falldatum'])
    # Emit one DistrictData per district for this date; the per-district
    # columns are named "Wahlkreis <name>" / "Wahlkreis <name> (kumuliert)".
    for key, value in inhabitants.items():
        dd = sc.DistrictData(canton='SG', district=key)
        dd.url = url
        dd.week = week
        dd.year = date.year
        dd.date = date.isoformat()
        dd.district_id = district_ids[key]
        dd.new_cases = row['Wahlkreis ' + key]
        dd.total_cases = row['Wahlkreis ' + key + ' (kumuliert)']
        dd.population = value
        print(dd)
import scrape_common as sc
import scrape_vd_common as svc

# NOTE(review): `re` is used below but not imported in this fragment -- it is
# presumably imported above this view; confirm.
pdf_urls = svc.get_all_weekly_pdf_urls()
for pdf_url in pdf_urls:
    pdf = sc.pdfdownload(pdf_url, silent=True, page=1)
    # Strip thousands separators, both apostrophe and typographic variants
    # (1'234 / 1’234 -> 1234).
    pdf = re.sub(r'(\d+)\'(\d+)', r'\1\2', pdf)
    pdf = re.sub(r'(\d+)’(\d+)', r'\1\2', pdf)

    td = sc.TestData(canton='VD', url=pdf_url)

    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    # Preferred form: "Entre le <d month> et le <d month>," (no year in text).
    res = re.search(r'Entre\s+le\s+(\d+\s+\w+)\s+et\s+le\s+(\d+\s+\w+),', pdf)
    if res:
        start_date = sc.date_from_text(f'{res[1]} {year}')
        end_date = sc.date_from_text(f'{res[2]} {year}')
    else:
        # Fallback: start given as a bare day number -> borrow the end's month.
        res = re.search(r'Entre\s+le\s+(\d+)\s+et\s+le\s+(\d+\s+\w+),', pdf)
        if res:
            end_date = sc.date_from_text(f'{res[2]} {year}')
            start_date = sc.date_from_text(f'{res[1]}.{end_date.month}.{year}')
    # NOTE(review): if neither pattern matched, start_date/end_date may be
    # unbound here (NameError before the assert fires) unless set on an
    # earlier loop iteration -- confirm whether that is acceptable.
    assert start_date and end_date, f'failed to extract start and end dates from {pdf_url}'
    td.start_date = start_date
    td.end_date = end_date

    # Total tests = reported daily average of swabs * number of days in range.
    res = re.search(r'une\s+moyenne\s+de\s+(\d+)\s+frottis\s+SARS-CoV(-)?2', pdf)
    if res:
        days = (end_date - start_date).days
        td.total_tests = days * int(res[1])
<tr>
  <td valign="top">
    <p>Intensivpflege (aktuell):</p>
  </td>
  <td style="text-align: right; vertical-align: top;">4</td>
</tr>
"""

include_hosp = True
include_cases = True
# The page carries two separately dated sections; extract each "Stand" date.
case_date_str = sc.find(
    r'Fallzahlen\s*im\s*Kanton\s*Luzern.*\(Stand:\s*(.+?)\,', d)
hosp_date_str = sc.find(r'Hospitalisierungen.*\(Stand:\s*(.+?)\,', d)
case_date = sc.date_from_text(case_date_str)
hosp_date = sc.date_from_text(hosp_date_str)
# Keep only the section(s) with the most recent date; dd.datetime is set to
# the newer of the two date strings.
if case_date > hosp_date:
    include_hosp = False
    dd.datetime = case_date_str
elif hosp_date > case_date:
    include_cases = False
    dd.datetime = hosp_date_str
else:
    dd.datetime = case_date_str

soup = BeautifulSoup(d, 'html.parser')
# Collect all table rows under the "Informationen des Kantons" list item.
rows = []
for table in soup.find(string=re.compile(
        r'Informationen\s*des\s*Kantons')).find_parent('li').find_all('table'):
    rows += table.find_all('tr')
if td: print(td) # daily tests for t in soup.find( 'caption', string=re.compile( 'Corona-Erkrankungen im Kanton Bern')).find_parents('table'): headers = [ " ".join(cell.stripped_strings) for cell in t.find('tr').find_all('th') ] for row in [r for r in t.find_all('tr') if r.find_all('td')]: td = sc.TestData(canton='BE', url=html_url) for col_num, cell in enumerate(row.find_all(['td'])): value = " ".join(cell.stripped_strings) if value: value = re.sub(r'[^\d\.\ ]', '', value) if sc.find(r'^(Datum)', headers[col_num]) is not None: dateArr = re.search(r'(\d{2}).(\d{2}).(\d{2})', value) value = dateArr.group(0) date = sc.date_from_text(value).isoformat() td.start_date = date td.end_date = date elif sc.find(r'^(Durch-)', headers[col_num]): td.total_tests = int(value) if td: print(td)