def parse_weekly_pdf():
    """Extract isolation/quarantine numbers from the VD weekly PDF.

    Example sentence in the PDF (29.07.2020):
        Concernant le traçage des contacts de cas positifs, le 27 juillet,
        83 personnes étaient en isolement, 633 en quarantaine de contacts
        étroits et 901 en quarantaine de retour de voyage.

    Prints one DayData record for isolation / contact quarantine and a
    second one for travel-return quarantine. Returns early (with a note
    on stderr) when the expected sentence is missing from the PDF.

    Fix: all regex patterns are now raw strings; the previous plain
    strings relied on invalid escape sequences (\\d, \\s), which raise
    SyntaxWarning/DeprecationWarning on recent Python versions.
    """
    pdf_url = svc.get_weekly_pdf_url()
    pdf = sc.pdfdownload(pdf_url, silent=True)

    dd = sc.DayData(canton='VD', url=pdf_url)

    # The year is only printed in the "Situation au ..." header line.
    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    date = sc.find(
        r'Concernant le traçage des contacts de cas positifs, le (\d+.*),',
        pdf)
    if not date:
        print("isolated/quarantined numbers missing in weekly PDF of VD",
              file=sys.stderr)
        return
    dd.datetime = date + ' ' + year
    dd.isolated = sc.find(
        r'(\d+)\s(personnes|cas\spositifs)\sétaient\sen\sisolement', pdf)
    # Counts may contain a thousands separator, hence text_to_int().
    dd.quarantined = text_to_int(
        sc.find(r'(\d.\d+|\d+)\scontacts\sétroits\sen\squarantaine\.', pdf))
    print(dd)
    print('-' * 10)

    # Travel-return quarantine is reported for its own (possibly
    # different) date later in the same paragraph.
    dd = sc.DayData(canton='VD', url=pdf_url)
    date = sc.find(r'quarantaine. Le (\d+ .*),', pdf)
    dd.datetime = date + ' ' + year
    dd.quarantine_riskareatravel = text_to_int(
        sc.find(
            r', (\d.\d+|\d+)\spersonnes\sétaient\sen\squarantaines?\ssuite\sà\sun\sretour\sde\svoyage.',
            pdf))
    print(dd)
    print('-' * 10)
def parse_weekly_pdf():
    """Locate and scrape the weekly epidemiological PDF on infosan.vd.ch.

    Navigates: landing page -> "point epidemiologique" article -> PDF link,
    then extracts isolation / quarantine numbers.

    Example sentence in the PDF (29.07.2020):
        Concernant le traçage des contacts de cas positifs, le 27 juillet,
        83 personnes étaient en isolement, 633 en quarantaine de contacts
        étroits et 901 en quarantaine de retour de voyage.

    Fix: all regex patterns are now raw strings; the previous plain
    strings relied on invalid escape sequences (\\d, \\s, \\.), which
    raise SyntaxWarning/DeprecationWarning on recent Python versions.
    """
    sc.add_cert_to_bundle()
    base_url = 'https://www.infosan.vd.ch'
    d = sc.download(base_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    html_url = base_url + soup.find(
        href=re.compile(r"/publications/covid-19-point-epidemiologique")
    ).get('href')
    d = sc.download(html_url, silent=True)
    soup = BeautifulSoup(d, 'html.parser')
    pdf_url = base_url + soup.find(href=re.compile(r"\.pdf$")).get('href')
    pdf = sc.pdfdownload(pdf_url, silent=True)

    dd = sc.DayData(canton='VD', url=pdf_url)
    # The year is only printed in the "Situation au ..." header line.
    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    date = sc.find(
        r'Concernant le traçage des contacts de cas positifs, le (\d+.*),',
        pdf)
    dd.datetime = date + ' ' + year
    dd.isolated = sc.find(
        r'(\d+)\s(personnes|cas\spositifs)\sétaient\sen\sisolement', pdf)
    # Counts may contain a thousands separator, hence text_to_int().
    dd.quarantined = text_to_int(
        sc.find(r'(\d.\d+|\d+)\scontacts\sétroits\sen\squarantaine\.', pdf))
    print(dd)
    print('-' * 10)

    # Travel-return quarantine is reported for its own date.
    dd = sc.DayData(canton='VD', url=pdf_url)
    date = sc.find(r'quarantaine. Le (\d+ .*),', pdf)
    dd.datetime = date + ' ' + year
    dd.quarantine_riskareatravel = text_to_int(
        sc.find(
            r', (\d.\d+|\d+)\spersonnes\sétaient\sen\squarantaines\ssuite\sà\sun\sretour\sde\svoyage.',
            pdf))
    print(dd)
    print('-' * 10)
def parse_weekly_pdf():
    """Extract total cases, hospitalized and ICU counts from the VD weekly PDF.

    Prints a single DayData record. The "." wildcards in the patterns stand
    in for accented characters ("épidémiologique", "été testées", ...) that
    the PDF text extraction does not reproduce reliably.

    Fix: all regex patterns are now raw strings; the previous plain strings
    relied on invalid escape sequences (\\d, \\s), which raise
    SyntaxWarning/DeprecationWarning on recent Python versions.
    """
    pdf_url = svc.get_weekly_pdf_url()
    pdf = sc.pdfdownload(pdf_url, silent=True)

    dd = sc.DayData(canton='VD', url=pdf_url)
    dd.datetime = sc.find(r'Point .pid.miologique au (\d+\s+\w+\s+\d{4})',
                          pdf)
    # Counts may contain a thousands separator, hence text_to_int().
    dd.cases = text_to_int(
        sc.find(
            r'\s(\d+.\d+)\s+personnes ont .t. test.es positives au SARS-CoV-2.',
            pdf))
    dd.hospitalized = sc.find(
        r'(\d+)\s+patients\s+sont\s+actuellement\s+hospitalis.s', pdf)
    dd.icu = sc.find(r'dont\s+(\d+)\s+en\s+soins\s+intensifs', pdf)
    # NOTE(review): assert is stripped under "python -O"; kept for
    # behavioral compatibility, but an explicit check would be sturdier.
    assert dd
    print(dd)
    print('-' * 10)
#!/usr/bin/env python3 import scrape_common as sc import re # get latest from list with all press releases d = sc.download('https://www.regierung.li/coronavirus', silent=True) pdf_url = sc.find( r'<a.*?href="([^"]+\.pdf)[^"]*"[^>]*?>[^<]+?Situationsbericht[^<]+?<\/a>', d) assert pdf_url, "PDF URL not found" # download latest PDF d = sc.pdfdownload(pdf_url, raw=True, silent=True) # extract case numbers reported for previous days d = d.replace(u'\xa0', u' ') # data from the most recent press release dd = sc.DayData(canton='FL', url=pdf_url) dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', d) dd.cases = sc.find(r'insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle', d) m = re.search(r'Bisher\s+trat(en)?\s+(\S+)\s+(Todesfall|Todesfälle)', d, flags=re.I) if m: dd.deaths = sc.int_or_word(m[2]) if re.search( 'Alle\s+weiteren\s+Erkrankten\s+sind\s+in\s+der\s+Zwischenzeit\s+genesen',
#!/usr/bin/env python3
"""Scrape the daily AG (Aargau) COVID-19 bulletin PDF and print key figures."""
import scrape_common as sc
import re

print('AG')

# get latest from list with all bulletins
d = sc.download('https://www.ag.ch/de/themen_1/coronavirus_2/lagebulletins/lagebulletins_1.jsp')
url = sc.find(r'<a [^>]*href="([^"]+\.pdf)">.+Bulletin.+</a>', d)
# Fail loudly if the page layout changed; otherwise the concatenation
# below would die with an opaque TypeError on None.
assert url, "PDF URL not found"

# download latest PDF
d = sc.pdfdownload('https://www.ag.ch' + url, raw=True)
sc.timestamp()

print('Date and time:', sc.find(r'Aarau, (.+? Uhr)', d))
print('Confirmed cases:', sc.find(r'zurzeit\s+([0-9]+)\s+bestätigte\s+Fälle', d))
print('Recovered:', sc.find(r'([0-9]+)\s+Personen.*?als\s+geheilt', d))
print('Hospitalized:', sc.find(r'([0-9]+)\s+Person(en)?\s+sind\s+zurzeit\s+hospitalisiert', d))
print('ICU:', sc.find(r'([0-9]+)\s+Person(en)?.*?auf\s+Intensivstationen', d))
print('Vent:', sc.find(r'([0-9]+)\s+Person(en)?\s+künstlich\s+beatmet', d))
print('Deaths:', sc.find(r'([0-9]+)\s+Person(en)?\s+an\s+den\s+Folgen\s+des\s+Coronavirus\s+verstorben', d))
import datetime import re from bs4 import BeautifulSoup import scrape_common as sc # parse weekly data for isolated and quarantined numbers base_url = 'https://www.vs.ch' stat_url = base_url + '/de/web/coronavirus/statistiques' content = sc.download(stat_url, silent=True) soup = BeautifulSoup(content, 'html.parser') res = soup.find( string=re.compile(r'Synthese COVID19 VS Woche\d+')).find_previous('a') weekly_pdf_url = base_url + res.attrs['href'] weekly_pdf_url = weekly_pdf_url.replace(' ', '%20') content = sc.pdfdownload(weekly_pdf_url, silent=True) # add isolated/quarantined to the existing DayData item week_end_date = sc.find(r'vom (\d+)\. bis (\d+\.\d+\.20\d{2})', content, group=2) week_end_date = sc.date_from_text(week_end_date).isoformat() dd = sc.DayData(canton='VS', url=weekly_pdf_url) dd.datetime = week_end_date dd.isolated = sc.find( r'befanden\ssich\s(\d+)\spositive\sF.lle\snoch\simmer\sin\sIsolation', content) dd.quarantined = sc.find(r'Isolation\sund\s(\d+)\sKontakte\sin\sQuarant.ne', content) dd.quarantine_riskareatravel = sc.find(r'\s(\d+)\sReisende\sin\sQuarant.ne',
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape isolation/quarantine figures for TI (Ticino) from the cantonal PDF."""
from bs4 import BeautifulSoup
import re
import scrape_common as sc

# get pdf and xlsx URL from covid19 page of TI
main_url = 'https://www4.ti.ch/dss/dsp/covid19/home/'
d = sc.download(main_url, silent=True)
soup = BeautifulSoup(d, 'html.parser')

pdf_url = soup.find('a', string=re.compile(r'Dati stato.*')).get('href')
pdf_url = f'https://www4.ti.ch/{pdf_url}'
pdf_content = sc.pdfdownload(pdf_url, silent=True, raw=True)

dd = sc.DayData(canton='TI', url=pdf_url)
dd.datetime = sc.find(r'(?:Stato )?(\d+\.\d+\.20\d{2})', pdf_content)
dd.isolated = sc.find(r'(\d+)\sPersone\sin\sisolamento', pdf_content)
dd.quarantined = sc.find(r'(\d+)\sPersone\sin\squarantena', pdf_content)

is_first = True
if dd:
    print(dd)
    is_first = False

# Fix: the pattern was a plain string ("\.xlsx$"), i.e. an invalid escape
# sequence that warns on recent Python versions; use a raw string.
xls_url = soup.find(href=re.compile(r"\.xlsx$")).get('href')
assert xls_url, "URL is empty"
if not xls_url.startswith('http'):
    xls_url = f'https://www4.ti.ch/{xls_url}'
#!/usr/bin/env python3 # -*- coding: utf-8 -*- import re import sys import datetime from bs4 import BeautifulSoup import scrape_common as sc d = sc.download( 'https://www.sz.ch/behoerden/information-medien/medienmitteilungen/coronavirus.html/72-416-412-1379-6948', silent=True) soup = BeautifulSoup(d, 'html.parser') pdf_url = soup.find('a', string=re.compile(r'Medienmitteilung vom'))['href'] pdf_content = sc.pdfdownload(pdf_url, layout=True, silent=True) date = sc.find(r'Stand: (\d+\. .* 20\d{2})', pdf_content) res = re.search(r'.*\s+\d+\s+\d+\s+\d+\s+(\d+)\s+(\d+)\s+(\d+)\s+', pdf_content) is_first = True if res is not None: dd = sc.DayData(canton='SZ', url=pdf_url) dd.datetime = date dd.hospitalized = res[1] dd.quarantined = res[2] dd.quarantine_riskareatravel = res[3] print(dd) is_first = False try: xls_url = soup.find(
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape the number of PCR tests from the GE (Geneva) weekly PDF."""
import re
import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import scrape_common as sc
import scrape_ge_common as sgc

is_first = True

# parse tested from PDF
pdf_url = sgc.get_latest_ge_weekly_pdf_url()
pdf = sc.pdfdownload(pdf_url, silent=True)
week_number = sc.find(r'Situation semaine (\d+)', pdf)
if week_number:
    # Fix: the year was hard-coded as '2021', which silently mis-dates the
    # data every other year; derive it from the current date instead.
    # NOTE(review): around New Year the last ISO week of the previous year
    # could still be attributed to the wrong year — confirm whether runs
    # can happen in early January.
    year = datetime.date.today().year
    # %G-W%V-%u = ISO year / ISO week / ISO weekday; day 7 = Sunday,
    # i.e. the end of the reported week.
    week_end_date = datetime.datetime.strptime(
        str(year) + '-W' + week_number + '-7', '%G-W%V-%u').date()
    number_of_tests = sc.find(r"Au total, (\d+\'\d+) tests PCR ont", pdf)
    if number_of_tests is not None:
        # Strip the thousands separator ("5'432" -> "5432").
        number_of_tests = number_of_tests.replace('\'', '')
        dd_test = sc.DayData(canton='GE', url=pdf_url)
        dd_test.datetime = week_end_date.isoformat()
        dd_test.tested = number_of_tests
        print(dd_test)
        is_first = False
#!/usr/bin/env python3 import scrape_common as sc print('NE') d = sc.pdfdownload( 'https://www.ne.ch/autorites/DFS/SCSP/medecin-cantonal/maladies-vaccinations/Documents/Covid-19-Statistiques/COVID19_PublicationInternet.pdf', layout=True) sc.timestamp() # Heavily inspired by code by https://github.com/maekke97 # Magic column fix (don't know if this is stable). d = d.replace('avr\n il', 'avril') d = d.replace('avr\n il', 'avril') # Find the start of the table on page 5. d = d[d.find('1mars2020'):] # d # Example rows. """ 18mars2020 32 146 15 3 18 3 3 1 2 19mars2020 29 175 16 3 19 3 3 1 3 20mars2020 13 188 17 4 21 4 2 6 3 21mars2020 12 200 15 5 20 5 1 6 1 4 22mars2020 16 216 22 6 28 6 1 7 4 23mars2020 31 247 22 5 6 33 5 6 0 11 1 5 24mars2020 18 265 24 2 6 32 2 6 3 11 1 6 25mars2020 15 280 31 3 7 41 3 7 2 12 3 9 26mars2020 19 299 33 2 7 42 2 7 1 10 2 11 1avril2020 18 420 52 6 8 66 6 8 4 18 2 23
def strip_value(value): if value: return re.sub(r'[^0-9]', '', value) return None base_url = 'https://www.vs.ch' url = f'{base_url}/web/coronavirus/statistiques' content = sc.download(url, silent=True) soup = BeautifulSoup(content, 'html.parser') pdf_url = soup.find('a', string=re.compile(r'2020.*Sit Epid.*')).get('href') pdf_url = f'{base_url}{pdf_url}' content = sc.pdfdownload(pdf_url, silent=True, layout=True, page=1) dd = sc.DayData(canton='VS', url=pdf_url) dd.datetime = sc.find(r'(\d{2}/\d{2}/20\d{2})', content) dd.datetime = re.sub(r'/', '.', dd.datetime) dd.cases = strip_value( sc.find(r'.*Cumul cas positifs.*\s+(\d+.\d+)\s+', content)) dd.deaths = strip_value(sc.find(r'.*Cumul d.c.s.*\s+(\d+.\d+)\s+', content)) dd.hospitalized = strip_value( sc.find(r'.*Hospitalisations en cours de cas COVID-19.*\s+(\d+)\s+', content)) dd.icu = strip_value(sc.find(r'.*SI en cours.*\s+(\d+)\s+', content)) dd.vent = strip_value(sc.find(r'.*Intubation en cours.*\s+(\d+)\s+', content)) is_first = True if dd:
# Download list of PDFs with statistics updated daily d = sc.download('https://www.vs.ch/de/web/coronavirus/statistiques', silent=True) # 2020-04-02 (but also earlier) """ ... ... <ul> <li><a href="/documents/6756452/7008787/2020 04 02 Sit Epid - État Stand.pdf" target="_blank">2020 04 02 Sit Epid - État Stand.pdf</a></li> <li><a href="/documents/6756452/7008787/2020 04 01 Sit Epid - État Stand" target="_blank">2020 04 01 Sit Epid - État Stand</a></li> <li> """ # Note, these are PDFs, but not all of them have pdf "extension". url = sc.find(r'<li>\s*<a href="([^"]+)"[^>]*>[^<]*Stand(?:\.pdf)?<', d) assert url, "Can't find latest PDF URL" full_url = 'https://www.vs.ch' + urllib.parse.quote(url) dd.url = full_url d = sc.pdfdownload(full_url, raw=True, silent=True) # 2020-03-29 """ État au – Stand : 29.03.2020 15.00h Nombre de cas positifs COVID-19 - Anzahl positive COVID-19 Fälle Total de cas positifs Total positive Fälle ∆ J-1 Incidence cumulée pour 100'000 habitants Kumulierte Inzidenz pro 100'000 Einwohner 964 +62 278.1 ... Nombre de décès – Anzahl Todesfälle Total ∆ J-1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Collect TestData for VD (Vaud) from every available weekly PDF."""
import datetime
import re

import scrape_common as sc
import scrape_vd_common as svc

pdf_urls = svc.get_all_weekly_pdf_urls()
for pdf_url in pdf_urls:
    pdf = sc.pdfdownload(pdf_url, silent=True, page=1)

    # Normalize the extracted text before matching: drop thousands
    # separators (both apostrophe variants) and the French ordinal
    # suffix ("1er" -> "1").
    cleanups = (
        (r'(\d+)\'(\d+)', r'\1\2'),
        (r'(\d+)’(\d+)', r'\1\2'),
        (r'(\d)er', r'\1'),
    )
    for pattern, repl in cleanups:
        pdf = re.sub(pattern, repl, pdf)

    td = sc.TestData(canton='VD', url=pdf_url)

    # "." wildcards cover accented characters the PDF extraction mangles.
    year = sc.find(r'Situation au \d+.*(20\d{2})', pdf)
    date = sc.find(r'Point .pid.miologique au (\d+\s+\w+\s+20\d{2})', pdf)

    # The reporting period appears in several textual variants; try each.
    res = re.search(
        r'Entre\s+(?P<et>et\s+)?le\s+(?P<start>\d+\s+\w+)\s+et\s+le\s+(?P<end>\d+\s+\w+)(?P<year>\s+\d{4})?,',
        pdf,
        flags=re.I | re.UNICODE)
    res_with_year = re.search(
        r'Entre\s+le\s+(?P<start>\d+\s+\w+\s+\d{4})\s+et\s+le\s+(?P<end>\d+\s+\w+\s+\d{4}),',
        pdf,
        flags=re.I | re.UNICODE)
    res_no_month = re.search(
        r'Entre\s+le\s+(?P<start>\d+)\s+et\s+le\s+(?P<end>\d+\s+\w+),',
        pdf,
        flags=re.I | re.UNICODE)
# Download list of PDFs with statistics updated daily d = sc.download('https://www.vs.ch/de/web/coronavirus/statistiques') # 2020-04-02 (but also earlier) """ ... ... <ul> <li><a href="/documents/6756452/7008787/2020 04 02 Sit Epid - État Stand.pdf" target="_blank">2020 04 02 Sit Epid - État Stand.pdf</a></li> <li><a href="/documents/6756452/7008787/2020 04 01 Sit Epid - État Stand" target="_blank">2020 04 01 Sit Epid - État Stand</a></li> <li> """ # Note, these are PDFs, but not all of them have pdf "extension". url = sc.find(r'<li>\s*<a href="([^"]+)"[^>]*>[^<]*Stand(?:\.pdf)?<', d) assert url, "Can't find latest PDF URL" import urllib.parse full_url = 'https://www.vs.ch' + urllib.parse.quote(url) d = sc.pdfdownload(full_url, raw=True) # 2020-03-29 """ État au – Stand : 29.03.2020 15.00h Nombre de cas positifs COVID-19 - Anzahl positive COVID-19 Fälle Total de cas positifs Total positive Fälle ∆ J-1 Incidence cumulée pour 100'000 habitants Kumulierte Inzidenz pro 100'000 Einwohner 964 +62 278.1 ... Nombre de décès – Anzahl Todesfälle Total ∆ J-1
#!/usr/bin/env python3 import scrape_common as sc print('GE') d = sc.pdfdownload( 'https://www.ge.ch/document/point-coronavirus-maladie-covid-19/telecharger' ) sc.timestamp() #d = sc.filter(r'Dans le canton de Genève|Actuellement.*cas ont|décédées|hospitalisés', d) # + 1 line. # 2020-03-23 """ Cette fiche destinée à la population générale dresse un état des lieux de la situation au 23 mars 2020. Chiffres clés au 22 mars 2020 (OMS, OFSP et DGS pour la Suisse et Genève) Chine 81'498 cas 3'267 décès Europe 151'293 cas 7'426 décès
#!/usr/bin/env python3
"""Scrape case/death figures for FL (Liechtenstein) from the latest PDF."""
import scrape_common as sc
import re

# The press-release overview page links every "Situationsbericht" PDF;
# grab the first (most recent) one.
d = sc.download('https://www.regierung.li/coronavirus', silent=True)
pdf_url = sc.find(
    r'<a.*?href="([^"]+\.pdf)[^"]*"[^>]*?>[^<]+?Situationsbericht[^<]+?<\/a>',
    d)
assert pdf_url, "PDF URL not found"

# Fetch the PDF text itself.
d = sc.pdfdownload(pdf_url, raw=True, silent=True)

# Non-breaking spaces would break the \s-free patterns below.
d = d.replace('\xa0', ' ')

# Figures from the most recent press release.
dd = sc.DayData(canton='FL', url=pdf_url)
dd.cases = sc.find(r'insgesamt\s+([0-9]+)\s+laborbestätigte\s+Fälle', d)
dd.datetime = sc.find(r'Situationsbericht vom (.*? 20\d{2})', d)

# Deaths are sometimes written as a word ("drei Todesfälle").
m = re.search(r'Bisher\s+trat(en)?\s+(\S+)\s+(Todesfall|Todesfälle)',
              d, flags=re.I)
if m:
    dd.deaths = sc.int_or_word(m.group(2))

print(dd)