def collect_country():
    country_data = fetchhelper.Updater(url_countries, ext='country.json')
    country_data.check_fetch(rawfile=args.rawfile[0], compressed=True)

    if not country_data.rawdata.strip():
        if datetime.date.today().isoweekday() == 7:
            # They nowadays turn the servers off on Sundays or sth.
            pass
        else:
            print("Empty country.json")
        return

    jdat = json.loads(country_data.rawdata)

    parse = fetchhelper.ParseData(country_data, 'countries')
    parse.parsedtime = datatime
    with open(parse.parsedfile, 'w') as f:
        cw = csv.writer(f)
        header = ['Code', 'Country', 'Timestamp', 'Confirmed', 'Deaths']
        cw.writerow(header)
        for data in sorted(jdat['data'], key=(lambda d: d['areaCode'])):
            code = data['areaCode']
            name = data['areaName']
            confirmed = data['cumCasesByPublishDate']
            deaths = data['cumDeaths28DaysByPublishDate']
            cw.writerow([code, name, datatime, confirmed, deaths])
    parse.deploy_timestamp()
    return parse
def collect_utla():
    utla_data = fetchhelper.Updater(url_utlas, ext='utla.json')
    utla_data.check_fetch(rawfile=args.rawfile[1], compressed=True)

    if not utla_data.rawdata.strip():
        if datetime.date.today().isoweekday() == 7:
            pass
        else:
            print("Empty utla.json")
        return

    jdat = json.loads(utla_data.rawdata)

    parse = fetchhelper.ParseData(utla_data, 'utla')
    parse.parsedtime = datatime
    with open(parse.parsedfile, 'w') as f:
        cw = csv.writer(f)
        header = ['Code', 'UTLA', 'Region', 'Timestamp',
                  'Confirmed', 'Deaths', 'Backdated']
        cw.writerow(header)
        for data in sorted(jdat['data'], key=(lambda d: d['areaCode'])):
            code = data['areaCode']
            name = data['areaName']
            confirmed = data['cumCasesByPublishDate']
            fallback = ''
            if confirmed is None:
                confirmed = data['cumCasesBySpecimenDate']
                if confirmed is not None:
                    fallback += 'C'
            deaths = data['cumDeaths28DaysByPublishDate']
            if deaths is None:
                deaths = data['cumDeaths28DaysByDeathDate']
                if deaths is not None:
                    fallback += 'D'
            cw.writerow([code, name,
                         (regions[code][1] if code[0] == 'E' else None),
                         datatime, confirmed, deaths, fallback])
    parse.deploy_timestamp()
    return parse
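# Minimal driver sketch (an assumption, not the verbatim continuation of this
# file; the real script may sequence the collectors differently or post-process
# the returned ParseData objects): run both collectors and keep the parses
# that succeeded. The collectors above return None when the download was empty.
parses = []
for collect in (collect_country, collect_utla):
    p = collect()
    if p is not None:
        parses.append(p)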
import argparse
import fetchhelper

ap = argparse.ArgumentParser()
fetchhelper.add_arguments(ap)
args = ap.parse_args()
fetchhelper.check_oldfetch(args)

import subprocess, datetime, re, csv, os, sys
from bs4 import BeautifulSoup
import dateutil.tz

datatz = dateutil.tz.gettz('Europe/Berlin')

update = fetchhelper.Updater('https://www.landkreis-muenchen.de/themen/verbraucherschutz-gesundheit/gesundheit/coronavirus/fallzahlen/')
update.check_fetch(rawfile=args.rawfile)

html = BeautifulSoup(update.rawdata, 'html.parser')

parse = fetchhelper.ParseData(update, 'data')

datatime = None
timeguess = False
txt = str(html.find(text=re.compile('(?:Stand|Datenstand): ')))
for timere, timefmt in [
        (r'(?:Stand|Datenstand): (\d\d\.\d\d\.\d\d\d\d, \d\d:\d\d) ?Uhr', '%d.%m.%Y, %H:%M'),
        (r'(?:Stand|Datenstand): (\d\d\.\d\d\.\d\d\d\d, \d\d\.\d\d) ?Uhr', '%d.%m.%Y, %H.%M'),
        (r'(?:Stand|Datenstand): (\d\d\.\d\d\.\d\d\d\d, \d\d) ?Uhr', '%d.%m.%Y, %H'),
import argparse
import fetchhelper

ap = argparse.ArgumentParser()
fetchhelper.add_arguments(ap)
args = ap.parse_args()

import subprocess, csv
from datetime import datetime, timedelta
import dateutil.tz

datatz = dateutil.tz.gettz('Europe/Berlin')

update = fetchhelper.Updater(
    'https://sozialministerium.baden-wuerttemberg.de/fileadmin/redaktion/m-sm/intern/downloads/Downloads_Gesundheitsschutz/Tabelle_Coronavirus-Faelle-BW.xlsx',
    ext='xlsx')
update.check_fetch(args.rawfile, binary=True)

parse = fetchhelper.ParseData(update, 'timeline')

proc = subprocess.Popen(['xlsx2csv', update.rawfile],
                        stdout=subprocess.PIPE, encoding='utf-8')
cr = csv.reader(proc.stdout)
with open(parse.parsedfile, 'w') as pf:
    cpf = csv.writer(pf)
    start = False
    dates = None
import argparse
import fetchhelper

ap = argparse.ArgumentParser()
fetchhelper.add_arguments(ap)
args = ap.parse_args()

import subprocess, datetime, re, csv, os, sys
from bs4 import BeautifulSoup
import dateutil.tz
from dataclasses import dataclass

datatz = dateutil.tz.gettz('Europe/London')

# Public Health England
update = fetchhelper.Updater(
    'https://fingertips.phe.org.uk/documents/Historic%20COVID-19%20Dashboard%20Data.xlsx',
    ext='xlsx')
update.check_fetch(rawfile=args.rawfile, binary=True)

regions = {
    'E06000001': ('Hartlepool', 'North East'),
    'E06000002': ('Middlesbrough', 'North East'),
    'E06000003': ('Redcar and Cleveland', 'North East'),
    'E06000004': ('Stockton-on-Tees', 'North East'),
    'E06000005': ('Darlington', 'North East'),
    'E06000006': ('Halton', 'North West'),
    'E06000007': ('Warrington', 'North West'),
    'E06000008': ('Blackburn with Darwen', 'North West'),
    'E06000009': ('Blackpool', 'North West'),
    'E06000010': ('Kingston upon Hull, City of', 'Yorkshire and The Humber'),
    'E06000011': ('East Riding of Yorkshire', 'Yorkshire and The Humber'),
now = datetime.datetime.now()
if now.time() > datetime.time(12, 0):
    # We expect data for the current day
    target = now.date()
else:
    # We expect at least data for the previous day
    target = (now - datetime.timedelta(days=1)).date()
match = target.isoformat()
if glob.glob('data/*%s*.csv' % match):
    # Looks good.
    print("Data for %s already saved." % match)
    sys.exit(0)

datatz = dateutil.tz.gettz('Europe/Berlin')

update = fetchhelper.Updater('https://www.mags.nrw/coronavirus-fallzahlen-nrw')
update.check_fetch(rawfile=args.rawfile)

html = BeautifulSoup(update.rawdata, 'html.parser')

def clean_num(numstr):
    if numstr in ['', '-']:
        return 0
    return int(re.sub(r'[.:]', '', numstr).strip())

header = html.find(text="Bestätigte Fälle (IfSG)")

parse = fetchhelper.ParseData(update, 'data')
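# Illustrative sanity checks for clean_num (hypothetical inputs, not taken
# from the live page): German-style thousands separators and empty
# placeholders should normalize to plain integers.
assert clean_num('1.234') == 1234
assert clean_num('-') == 0
assert clean_num('') == 0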
ap = argparse.ArgumentParser()
fetchhelper.add_arguments(ap)
args = ap.parse_args()
fetchhelper.check_oldfetch(args)

import datetime, re, csv, json, os, sys, shutil
import dateutil.tz

url = 'https://www.landratsamt-dachau.de/bilder/zahlen.jpg'
url = 'https://atlas.jifo.co/api/connectors/c3a4b965-0e10-46db-aec6-cceffdb74857'

datatz = dateutil.tz.gettz('Europe/Berlin')

update = fetchhelper.Updater(url, ext='json')
update.check_fetch(rawfile=args.rawfile)

jdat = json.loads(update.rawdata)

header = jdat['data'][0][0]
i_kom = header.index("Gemeinde")
i_con = header.index("Fälle insgesamt")
i_rec = header.index("Genesen")

parse = fetchhelper.ParseData(update, 'data')
parse.parsedtime = datetime.datetime.fromtimestamp(jdat['refreshed'] / 1000, tz=datatz)
with open(parse.parsedfile, 'w') as f:
    cw = csv.writer(f)
    cw.writerow(['Kommune', 'Timestamp', 'Confirmed', 'Recovered'])
    for jrow in jdat['data'][0][1:]:
    stripped = re_strip_landkreis.sub('', rawlabel)
    if stripped in rawlabel_landkreis:
        return rawlabel_landkreis[stripped]
    else:
        return rawlabel

def get_regierungsbezirk(rawlabel):
    landkreis = clean_landkreis(rawlabel)
    return rawlabel_regierungsbezirk[landkreis]

datatz = dateutil.tz.gettz('Europe/Berlin')

update = fetchhelper.Updater(
    'https://www.lgl.bayern.de/gesundheit/infektionsschutz/infektionskrankheiten_a_z/coronavirus/karte_coronavirus/index.htm'
)
update.check_fetch(rawfile=args.rawfile)

# accidentally duplicated <tr> and other html errors
update.rawdata = re.sub(r'<tr>\s*<tr>', r'<tr>', update.rawdata)
update.rawdata = re.sub(r'(<th><span>[^<>]*</span>)</(td|div)>', r'\1</th>',
                        update.rawdata)

html = BeautifulSoup(update.rawdata, 'html.parser')

datenode = html.find('script', text=re.compile(r'var publikationsDatum = '))
if datenode is None:
    print("Cannot find publish date", file=sys.stderr)
    sys.exit(1)
datemo = re.search(r'"(\d\d.\d\d.\d\d\d\d)"', datenode.get_text())
if datemo is None:
if args.rawfile is not None:
    args.rawfile = args.rawfile.split(',')
else:
    args.rawfile = (None, None)

import datetime, re, csv, os
import json
import dateutil.tz

datatz = dateutil.tz.gettz('Europe/Berlin')

# Bundesländer
url_bl = 'https://services7.arcgis.com/mOBPykOjAyBO2ZKk/ArcGIS/rest/services/Coronaf%c3%a4lle_in_den_Bundesl%c3%a4ndern/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=LAN_ew_GEN%2CAktualisierung%2CFallzahl%2CDeath%2CLAN_ew_AGS&returnGeometry=false&returnCentroid=false&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson&token='
updatebl = fetchhelper.Updater(url_bl, ext='bl.json')
updatebl.check_fetch(rawfile=args.rawfile[0])
jdat = json.loads(updatebl.rawdata)

parsebl = fetchhelper.ParseData(updatebl, 'data')
parsebl.parsedtime = None
with open(parsebl.parsedfile, 'w') as outf:
    cout = csv.writer(outf)
    cout.writerow(['Bundesland', 'AGS', 'Timestamp', 'EConfirmed', 'EDeaths'])
    for jfeat in sorted(jdat['features'],
                        key=(lambda f: f['attributes']['LAN_ew_GEN'])):
        ts = datetime.datetime.fromtimestamp(
            jfeat['attributes']['Aktualisierung'] / 1000, tz=datatz)
        if parsebl.parsedtime is None or ts > parsebl.parsedtime:
            parsebl.parsedtime = ts
        cout.writerow([
            jfeat['attributes']['LAN_ew_GEN'],
import argparse
import fetchhelper

ap = argparse.ArgumentParser()
fetchhelper.add_arguments(ap)
args = ap.parse_args()
fetchhelper.check_oldfetch(args)

import subprocess, datetime, re, csv, os, sys, shutil
import urllib.parse
from bs4 import BeautifulSoup
import dateutil.tz

url = 'https://www.landratsamt-dachau.de/bilder/zahlen.jpg'

datatz = dateutil.tz.gettz('Europe/Berlin')

#txt = str(html.find(text=re.compile('Landkreis-Statistik ')))
#mo = re.search(r'Landkreis-Statistik(?: nach Gemeinden)? für den (\d\d.\d\d.\d\d\d\d)', txt)
#datatime = parse.parsedtime = update.contenttime = datetime.datetime.strptime(mo.group(1) + ' 21:30', '%d.%m.%Y %H:%M').replace(tzinfo=datatz)

update = fetchhelper.Updater(url, ext='png')
update.check_fetch(rawfile=args.rawfile, binary=True, remotetime=True)
datatime = datetime.datetime.fromtimestamp(os.stat(update.rawfile).st_mtime)

if not os.path.exists('collected'):
    os.mkdir('collected')
shutil.copy(update.rawfile,
            'collected/gemeinden_%s.png' % datatime.isoformat(timespec='minutes'))
import argparse
import fetchhelper

ap = argparse.ArgumentParser()
fetchhelper.add_arguments(ap)
args = ap.parse_args()

import subprocess, datetime, re, csv, os, sys
import dateutil.tz

datatz = dateutil.tz.gettz('Europe/Stockholm')

#https://services5.arcgis.com/fsYDFeRKu1hELJJs/arcgis/rest/services/FOHM_Covid_19_FME_1/FeatureServer/1/query?f=json&where=Statistikdatum%3E%3Dtimestamp%20%272020-03-26%2023%3A00%3A00%27%20AND%20Statistikdatum%3C%3Dtimestamp%20%272020-03-27%2022%3A59%3A59%27&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=Statistikdatum%20desc&outSR=102100&resultOffset=0&resultRecordCount=2000&cacheHint=true
update = fetchhelper.Updater('https://services5.arcgis.com/fsYDFeRKu1hELJJs/arcgis/rest/services/FOHM_Covid_19_FME_1/FeatureServer/1/query?f=json&where=1%3d1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=Statistikdatum%20desc&outSR=102100&resultOffset=0&resultRecordCount=2000&cacheHint=true', ext='json')
update.check_fetch(rawfile=args.rawfile)

import json
with open(update.rawfile) as f:
    jd = json.load(f)

#'Statistikdatum': 1581552000000,
areasum = {
    'Blekinge': 0,
    'Dalarna': 0,
    'Gotland': 0,
    'Gävleborg': 0,
    'Halland': 0,
    'Jämtland': 0,
    'Jönköping': 0,
if args.rawfile:
    args.rawfile = args.rawfile.split(',', 1)
else:
    args.rawfile = (None, None)

import subprocess, datetime, re, csv, os, sys, shutil
import urllib.parse
from bs4 import BeautifulSoup
import dateutil.tz

url = 'https://www.landratsamt-dachau.de/gesundheit-veterinaerwesen-sicherheitsrecht/gesundheit/coronavirus/corona-statistiken/'

datatz = dateutil.tz.gettz('Europe/Berlin')

update = fetchhelper.Updater(url)
update.check_fetch(rawfile=args.rawfile[0])

html = BeautifulSoup(update.rawdata, 'html.parser')

parse = fetchhelper.ParseData(update, 'data')

#txt = str(html.find(text=re.compile('Landkreis-Statistik ')))
#mo = re.search(r'Landkreis-Statistik(?: nach Gemeinden)? für den (\d\d.\d\d.\d\d\d\d)', txt)
#datatime = parse.parsedtime = update.contenttime = datetime.datetime.strptime(mo.group(1) + ' 21:30', '%d.%m.%Y %H:%M').replace(tzinfo=datatz)

iframe = html.find('iframe')
furl = urllib.parse.urljoin(url, iframe['src'])

update_f = fetchhelper.Updater(furl, ext='iframe.html')
update_f.check_fetch(rawfile=args.rawfile[1], remotetime=True)
args = ap.parse_args()

import subprocess, datetime, re, csv, os
from datetime import datetime
from bs4 import BeautifulSoup
import dateutil.tz

def cleannum(s):
    return int(s.replace('.', '').rstrip('+*^'))

datatz = dateutil.tz.gettz('Europe/Vienna')

update = fetchhelper.Updater(
    'https://www.sozialministerium.at/Informationen-zum-Coronavirus/Neuartiges-Coronavirus-(2019-nCov).html'
)
update.check_fetch(rawfile=args.rawfile)
html = BeautifulSoup(update.rawdata, 'html.parser')

class NotFound(Exception):
    pass

def parse_counts(parse, base, lead):
    infotext = base.find("div", class_="infobox").find(string=re.compile(lead))
    if infotext is None:
        infotext = base.find("main", id="content").find(string=re.compile(lead))
targetdate = None

@dataclasses.dataclass
class Cases:
    kreis: str
    confirmed: int = 0
    deaths: int = 0
    recovered: int = 0
    date: None = None

newest = []
for kreisid, name in sorted(kreise.items()):
    update = fetchhelper.Updater(
        f'https://www.lzg.nrw.de/covid19/daten/covid19_{kreisid}.csv',
        ext=f'{kreisid}.csv')
    k_rawfile = (None if args.rawfile is None
                 else glob.glob(f'{args.rawfile}.{kreisid}.csv')[0])
    update.check_fetch(rawfile=k_rawfile)

    with open(update.rawfile, 'r', encoding='utf-8-sig') as rf:
        cf = csv.reader(rf)
        header = next(cf)
        cols = coldefs.build(header)
        # The newest line is last, so iterate through the whole file.
        # The data contains several "kummuliert" columns, but those sometimes
        # seem to be rounded. Let's hope that's no longer the case.
        cases = Cases(kreisid)
        for line in cf:
            fields = cols.get(line)
            cases.confirmed = fields.confirmed
class CountryData:
    code: str
    name: str
    timestamp: datetime.datetime
    confirmed: int = None
    deaths: int = None

if args.rawfile is not None:
    args.rawfile = args.rawfile.split(',')
else:
    args.rawfile = [None, None]

countrydata = {}

country_data = fetchhelper.Updater(url_countries, ext='country.json')
country_data.check_fetch(rawfile=args.rawfile[1])
jdat = json.loads(country_data.rawdata)

datatime = datetime.datetime.fromisoformat(
    jdat['metadata']['lastUpdatedAt'].rstrip('Z')).astimezone(
        datetime.timezone.utc)

parses = []

parse = fetchhelper.ParseData(country_data, 'countries')
parse.parsedtime = datatime
with open(parse.parsedfile, 'w') as f:
    cw = csv.writer(f)
    header = ['Code', 'Country', 'Timestamp', 'Confirmed', 'Deaths']
    cw.writerow(header)
    for (code, data) in jdat.items():
        if code == 'metadata':
fetchhelper.check_oldfetch(args)

if args.rawfile is None:
    rawfiles = (None, None)
else:
    rawfiles = args.rawfile.split(',')

import datetime, re, csv, json, os, sys, shutil
import dateutil.tz

url_cases = 'https://atlas.jifo.co/api/connectors/41be7d71-7260-497f-a60b-adce5aa9445d'
url_recovered = 'https://atlas.jifo.co/api/connectors/2adaf217-e526-492a-bcad-5ed6ec6ad3ad'

datatz = dateutil.tz.gettz('Europe/Berlin')

update = fetchhelper.Updater(url_cases, ext='cases.json')
update.check_fetch(rawfile=rawfiles[0])

jdat = json.loads(update.rawdata)

header = jdat['data'][0][0]
i_kom = header.index("Ort")
i_con = header.index("Gesamtzahl seit Ausbruch")

parses = []

parse = fetchhelper.ParseData(update, 'data')
parse.parsedtime = datetime.datetime.fromtimestamp(jdat['refreshed'] / 1000, tz=datatz)
with open(parse.parsedfile, 'w') as f:
    cw = csv.writer(f)
    cw.writerow(['Kommune', 'Timestamp', 'Confirmed'])
fetchhelper.add_arguments(ap)
args = ap.parse_args()

import subprocess, datetime, re, csv, os
from datetime import datetime
from bs4 import BeautifulSoup
import dateutil.tz

def cleannum(s):
    return int(s.replace(' ', ''))

datatz = dateutil.tz.gettz('America/New_York')

update = fetchhelper.Updater(
    'https://covidtracking.com/api/v1/states/current.csv', ext='csv')
update.check_fetch(rawfile=args.rawfile)

parse = fetchhelper.ParseData(update, 'data')
parse.parsedtime = None
with open(update.rawfile) as inf:
    cr = csv.reader(inf)
    header = next(cr)
    selector_labels = ['state', 'lastUpdateEt', 'positive', 'death']
    selector = []
    for sl in selector_labels:
        for (i, h) in enumerate(header):
            if h == sl:
                selector.append(i)
                break
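    # Illustrative example (hypothetical header, not the live CSV): with
    # header == ['state', 'positive', 'lastUpdateEt', 'death'] the loop
    # yields selector == [0, 2, 1, 3], i.e. the column indices in
    # selector_labels order, so each row can then be reduced with
    # [row[i] for i in selector].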
def cleannum(s):
    return int(s.replace('.', '').replace(',', '').rstrip('+*^◊'))

datatz = dateutil.tz.gettz('Europe/Vienna')

# page:
url = 'https://www.sozialministerium.at/Informationen-zum-Coronavirus/Neuartiges-Coronavirus-(2019-nCov).html'
# iframe:
url = 'https://info.gesundheitsministerium.gv.at/?re=tabelle'
# csv:
url = 'https://info.gesundheitsministerium.gv.at/data/timeline-faelle-bundeslaender.csv'

update = fetchhelper.Updater(url, ext='csv')
update.check_fetch(rawfile=args.rawfile, checkdh=False)

coldefs = csvtools.CSVColumns(
    timestamp=['Datum'],
    area=['Name'],
    confirmed=['BestaetigteFaelleBundeslaender'],
    deaths=['Todesfaelle'],
    recovered=['Genesen'],
    hospital=["Hospitalisierung"],
    intensive=["Intensivstation"],
    tests=["Testungen"],
)
coldefs.set_type('timestamp', datetime.fromisoformat)
coldefs.set_type('confirmed', int)
coldefs.set_type('deaths', int)
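# Sketch of how the column map is presumably consumed further down, following
# the coldefs.build()/cols.get() pattern the LZG script uses; this is an
# assumption, not the verbatim continuation of this file:
#
# with open(update.rawfile) as inf:
#     cr = csv.reader(inf)
#     header = next(cr)
#     cols = coldefs.build(header)
#     for line in cr:
#         fields = cols.get(line)  # fields.timestamp, fields.confirmed, ...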
import argparse
import fetchhelper

ap = argparse.ArgumentParser()
fetchhelper.add_arguments(ap)
args = ap.parse_args()

import subprocess, datetime, re, csv, os, sys
from bs4 import BeautifulSoup
import dateutil.tz

datatz = dateutil.tz.gettz('Europe/Berlin')

update = fetchhelper.Updater(
    'https://www.landratsamt-dachau.de/gesundheit-veterinaerwesen-sicherheitsrecht/gesundheit/coronavirus/'
)
update.check_fetch(rawfile=args.rawfile)

html = BeautifulSoup(update.rawdata, 'html.parser')

parse = fetchhelper.ParseData(update, 'data')

txt = str(html.find(text=re.compile('Landkreis-Statistik ')))
mo = re.search(
    r'Landkreis-Statistik(?: nach Gemeinden)? für den (\d\d.\d\d.\d\d\d\d)',
    txt)
datatime = parse.parsedtime = update.contenttime = datetime.datetime.strptime(
    mo.group(1) + ' 21:30', '%d.%m.%Y %H:%M').replace(tzinfo=datatz)
args = ap.parse_args()

import subprocess, datetime, re, csv, os
from datetime import datetime
from bs4 import BeautifulSoup
import dateutil.tz

def cleannum(s):
    return int(s.replace(' ', ''))

datatz = dateutil.tz.gettz('Europe/Paris')

update = fetchhelper.Updater(
    'https://www.santepubliquefrance.fr/maladies-et-traumatismes/maladies-et-infections-respiratoires/infection-a-coronavirus/articles/infection-au-nouveau-coronavirus-sars-cov-2-covid-19-france-et-monde'
)
update.check_fetch(rawfile=args.rawfile)

if args.only_changed:
    if not update.raw_changed():
        print("downloaded raw data unchanged")
        exit(0)

html = BeautifulSoup(update.rawdata, 'html.parser')

tab = html.find(
    string=re.compile('R.*gion de notification')).find_parent('table')
datestr = tab.find_previous('h4').get_text()
mo = re.search(r'(\d\d/\d\d/\d\d\d\d) à (\d\d)h', datestr)
class CountryData:
    code: str
    name: str
    timestamp: datetime.datetime
    confirmed: int = None
    deaths: int = None

if args.rawfile is not None:
    args.rawfile = args.rawfile.split(',')
else:
    args.rawfile = [None, None]

countrydata = {}

update_buckets = fetchhelper.Updater(url_buckets, ext='buckets.xml')
update_buckets.check_fetch(rawfile=args.rawfile[0])
xdoc = lxml.etree.fromstring(update_buckets.rawdata)

newest_name = None
newest_time = None
for blob in xdoc.xpath('//Blob'):
    name = blob.xpath('./Name')[0].text
    if not re.match(r'data_.*\.json', name):
        continue
    modified = datetime.datetime.strptime(
        blob.xpath('./Properties/Last-Modified')[0].text,
        '%a, %d %b %Y %H:%M:%S %Z')
    if newest_time is None or modified > newest_time:
        newest_name = name
        newest_time = modified
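# Sketch (an assumption, not verbatim from this file): the newest blob name
# found above is presumably turned into a download URL next, e.g. by joining
# it onto the bucket listing URL, and fetched like the other data files:
#
# url_data = urllib.parse.urljoin(url_buckets, newest_name)
# update_data = fetchhelper.Updater(url_data, ext='data.json')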