def getPage(url, parsed):
    """Fetch *url* via scrape.request_url and return its body text.

    When *parsed* is truthy, the body is returned as a BeautifulSoup
    document (html.parser backend) instead of a raw string.
    """
    response = scrape.request_url(url)
    body = response.text
    if not parsed:
        return body
    return BeautifulSoup(body, "html.parser")
from bs4 import BeautifulSoup
import requests
import datetime
import re
import locale

# German locale so date strings parse with German month/day names.
locale.setlocale(locale.LC_TIME, "de_DE.utf-8")

import scrape
import helper
from database_interface import *

# Scrape the confirmed COVID-19 case count for Kreis Kleve and store it.
main_url = "https://www.kreis-kleve.de/de/fachbereich5/coronavirus/"
req = scrape.request_url(main_url)
bs = BeautifulSoup(req.text, "html.parser")

# e.g. "insgesamt 42 bestätigte"; raw string for consistency with the
# other patterns below.
cases_pattern = r"insgesamt [0-9]+ bestätigte"
text = bs.getText()

# Timestamp span, e.g. "Stand 21.03.2020, 18:00 Uhr)".
# Raw string: "\)" in a plain string literal is an invalid escape sequence.
status_raw = re.findall(r"Stand .*?\)", text)[0]
status = helper.get_status(status_raw)

cases_raw = re.findall(cases_pattern, text)[0]
cases = int(re.findall(r"[0-9]+", cases_raw)[0])

add_to_database("05154", status, cases, "Kreis Kleve")
import logging
import locale

if scrape.SCRAPER_DEBUG:
    logging.basicConfig(level=logging.DEBUG)
import pprint

logger = logging.getLogger(__name__)

from database_interface import *

# Scrape COVID-19 case counts for district 05974 from its press-office RSS feed.
DISTRICT_UID = "05974"
main_url = "https://www.presse-service.de/rss.aspx?v=2&p=551"
req = scrape.request_url(
    main_url,
    headers=scrape.RANDOM_CLIENT_HEADERS,
    options={'debug': scrape.SCRAPER_DEBUG, 'forceEncoding': 'utf8'},
)
bs = BeautifulSoup(req.text, "html.parser")

# Hoisted loop invariant: compile the pattern once instead of rebuilding it
# on every feed item. Matches e.g. "123 bestätigte Corona-Fälle".
cases_re = re.compile(r"([0-9]+) .* Corona-Fälle")

news_list = bs.findAll("item")
for item in news_list:
    cases_raw = cases_re.search(item.title.text)
    # Skip feed items whose title carries no case count.
    # (identity test: "is None", not "== None")
    if cases_raw is None:
        continue
    cases = int(cases_raw.group(1))
    logger.info("\n")
    # Lazy %-args: the message is only formatted if DEBUG is enabled.
    logger.debug('%s', item.title.text)
    logger.debug('%s', item.guid.text)
    logger.debug('%s', item.pubdate.text)
from bs4 import BeautifulSoup
import requests
import datetime
import re
import locale

# German locale so date strings parse with German month/day names.
locale.setlocale(locale.LC_TIME, "de_DE.utf-8")

import scrape
import helper
from database_interface import *

# Scrape the confirmed COVID-19 case count for Kreis Reutlingen and store it.
main_url = "https://www.kreis-reutlingen.de/de/Aktuelles/Landkreis-aktuell/Landkreis-aktuell?view=publish&item=article&id=1923"
# The skipEntranceUrl cookie bypasses the site's landing/splash page.
req = scrape.request_url(main_url, options={'cookies': {'skipEntranceUrl': '1'}})
bs = BeautifulSoup(req.text, "html.parser")

cases_pattern = r"Gesamtzahl laborbestätigter Fälle: [0-9]+"
text = bs.getText()

# Timestamp, e.g. "Stand: 23.03.2020". Match any four-digit year — the
# original pattern was pinned to the literal "2020" and would silently
# stop matching in later years.
status_raw = re.findall(r"Stand: .*?[0-9]{4}", text)[0]
status = helper.get_status(status_raw)

cases_raw = re.findall(cases_pattern, text)[0]
cases = int(re.findall(r"[0-9]+", cases_raw)[0])

add_to_database("08415", status, cases, "Kreis Reutlingen")
from bs4 import BeautifulSoup
import requests
import datetime
import re

import scrape
from helper import *
from database_interface import *

# Scrape the confirmed COVID-19 case count for Kreis Segeberg and store it.
# Site shows e.g. "FÄLLE IM KREIS SEGEBERG: 57"
# followed by "(Labordiagnostisch bestätigt, Stand 21.03.2020)".
url = "https://www.segeberg.de/Quicknavigation/Startseite"
req = scrape.request_url(url)
bs = BeautifulSoup(req.text, "html.parser")

# Raw strings: "\d" and "\)" in plain string literals are invalid escape
# sequences (DeprecationWarning; SyntaxWarning on newer Pythons).
prefix = r"FÄLLE IM KREIS SEGEBERG: \d+"
prefix_date = r"Stand.*\)"

text = bs.getText()

# Strip the trailing ")" the pattern captures, e.g. "Stand 21.03.2020)".
status_raw = re.findall(prefix_date, text)[0].replace(")", "")
status = get_status(status_raw)

cases_raw = re.findall(prefix, text)[0]
cases = get_number_only(cases_raw)

add_to_database("01060", status, cases, name="Kreis Segeberg")