def parse_v2(parses, html):
    """Parse the ministry's summary table (one metric per row, one column
    per Bundesland) and append one ParseData per metric to *parses*.

    parses -- list collecting the finished ParseData objects
    html   -- BeautifulSoup document of the fetched page

    Exits the process if a row's timestamp cannot be parsed.
    """
    table = fetchhelper.text_table(html.find('table'))
    ths = table[0]
    assert ('Bundesland' in ths[0])
    assert ('gesamt' in ths[-1])
    trs = table[1:]
    # Row order is fixed on the page; verify it before trusting the
    # positional label list below.
    assert ('tigte' in trs[0][0])
    assert ('Todesf' in trs[1][0])
    assert ('Genesen' in trs[2][0])
    assert ('Hospital' in trs[3][0])
    assert ('Intensiv' in trs[4][0])
    assert ('Testungen' in trs[5][0])
    # Output label for each table row, in the asserted row order.
    # (A dead pre-built list of ParseData objects was removed here: it was
    # immediately overwritten inside the loop and never used.)
    labels = ['confirmed', 'deceased', 'recovered', 'hospital',
              'intensivecare', 'tests']
    # Column-header abbreviations -> full Bundesland names.
    areas = {
        'Bgld.': 'Burgenland',
        'Kt.': 'Kärnten',
        'Ktn.': 'Kärnten',
        'NÖ': 'Niederösterreich',
        'OÖ': 'Oberösterreich',
        'Sbg.': 'Salzburg',
        'Stmk.': 'Steiermark',
        'T': 'Tirol',
        'Vbg.': 'Vorarlberg',
        'W': 'Wien',
    }
    for i, tds in enumerate(trs):
        assert (len(ths) == len(tds))
        # The first cell carries the metric name plus its timestamp, e.g.
        # "... Stand 10.09.2020, 14:00 Uhr".  The date-separator dots are
        # escaped; the original pattern used bare '.', which matches any
        # character.
        mo = re.search(r'Stand (\d\d\.\d\d\.\d\d\d\d), *(\d\d:\d\d) ?Uhr', tds[0])
        if mo is None:
            print("cannot parse date")
            sys.exit(1)
        parse = fetchhelper.ParseData(update, labels[i])
        datadate = parse.parsedtime = datetime.strptime(
            mo.group(1) + ' ' + mo.group(2),
            '%d.%m.%Y %H:%M').replace(tzinfo=datatz)
        with open(parse.parsedfile, 'w') as f:
            cw = csv.writer(f)
            cw.writerow(['Area', 'Date', 'Value'])
            # Skip the leading label column and the trailing 'gesamt' total.
            for col in range(1, len(tds) - 1):
                area = areas[strip_footnote(ths[col])]
                count = cleannum(tds[col])
                cw.writerow([area, datadate.isoformat(), count])
        parse.deploy_timestamp()
        parses.append(parse)
    # Fragment: the 'def' line of this numeric-cleaning helper lies above
    # this chunk.  Strips thousands separators and footnote asterisks.
    return int(s.replace('.', '').replace('*', ''))

# Short municipality names as printed on the page -> canonical names.
area_map = {
    'Garching': 'Garching b. München',
    'Kirchheim': 'Kirchheim b. München',
    'Pullach': 'Pullach im Isartal',
}

def canonical_area(area):
    """Strip footnote asterisks and expand abbreviated municipality names."""
    area = area.replace('*', '')
    return area_map.get(area, area)

# Locate the table following the 'Fallzahlen nach Gemeinden' heading.
title = html.find(text=re.compile('Fallzahlen nach Gemeinden')).find_parent('h2')
rows = fetchhelper.text_table(title.find_next_sibling('table'))
# Row width varies between two and three columns on this page.
assert(len(rows[0]) == 2 or len(rows[0]) == 3)
# Drop the header row if present.
if rows[0][0] == 'Kommune':
    rows = rows[1:]
with open(parse.parsedfile, 'w') as outf:
    cout = csv.writer(outf)
    header = ('Kommune', 'Timestamp', 'Confirmed')
    cout.writerow(header)
    for tds in rows:
        # Skip rows that are entirely empty.
        if not tds[0].strip() and not tds[1].strip():
            continue
        cout.writerow((canonical_area(tds[0]), datatime.isoformat(), clean_num(tds[1])))
# NOTE(review): original indentation was lost in this chunk;
# deploy_timestamp() is placed after the 'with' block, matching the pattern
# used elsewhere in these scripts — confirm against the original file.
parse.deploy_timestamp()
# Fragment: the matching 'if' branch for this 'else' lies above this chunk.
else:
    # The page alternates between '14.30' and '14:30' time separators;
    # try the dot form first, then fall back to the colon form.
    try:
        parse.parsedtime = datetime.datetime.strptime(
            mo.group(1), '%d. %m %Y, %H.%M').replace(tzinfo=datatz)
    except ValueError:
        parse.parsedtime = datetime.datetime.strptime(
            mo.group(1), '%d. %m %Y, %H:%M').replace(tzinfo=datatz)

# The data table is the nearest <table> ancestor of the located header.
tab = header.find_parent('table')
if tab is None:
    print("couldn't find table", file=sys.stderr)
    exit(1)

with open(parse.parsedfile, 'w') as outf:
    cout = csv.writer(outf)
    rows = fetchhelper.text_table(tab)
    ths = rows[0]
    # Sanity-check the expected column layout before trusting the data.
    assert ('Landkreis' in ths[0])
    # The final row is a statewide total, not a district row.
    assert ('Gesamt' in ''.join(rows[-1]) or 'Nordrhein-Westfalen' in ''.join(rows[-1]))
    rows = rows[1:-1]
    assert (len(ths) == 5)
    colnum = len(ths)
    assert ('Bestätigte' in ths[1])
    assert ('Todesfälle' in ths[2])
    assert ('Genesene' in ths[3])
    assert ('Inzidenz' in ths[4])
    cn_deaths = 2
    cn_recovered = 3
    cout.writerow(['Area', 'Date', 'EConfirmed', 'EDeaths', 'Recovered'])
    # (fragment: the per-row writer loop continues beyond this chunk)
# One ParseData covering the whole municipality table of this source.
parse = fetchhelper.ParseData(update, 'data')

# page claims updates are at 16:30 and shortly before midnight
# Map the fetch time onto the most recent claimed publication time.
if datatime.time() < datetime.time(hour=16):
    # Fetched before today's 16:30 update: attribute the data to last
    # night's pre-midnight update.
    # BUG FIX: timedelta takes 'days', not 'day' — the original
    # 'datetime.timedelta(day=1)' raised TypeError on this branch.
    parse.parsedtime = (datatime - datetime.timedelta(days=1)).replace(
        hour=23, minute=50)
elif datatime.time() < datetime.time(hour=23):
    parse.parsedtime = datatime.replace(hour=16, minute=30)
else:
    parse.parsedtime = datatime

# Bail out early if the embedded iframe doesn't contain the expected table.
txt = html.find(text=re.compile('Statistik nach Gemeinden'))
if not txt:
    print("iframe content doesn't look correct", file=sys.stderr)
    sys.exit(1)

rows = fetchhelper.text_table(html)

# The structure of the document is currently a mess. Let's wait if it improves in the future.
# Drop spurious leading empty cells from every row.
for row in rows:
    if row[0] == '':
        del row[0]

# Everything before the first municipality row ('Altomünster') is header
# matter; everything after the last one ('Weichs') is footer matter.
headers = []
while rows[0][0] != 'Altomünster':
    headers.append(rows[0])
    del rows[0]
footers = []
while rows[-1][0] != 'Weichs':
    footers.insert(0, rows[-1])
    del rows[-1]
# Timestamps published on the page are local Austrian time.
datatz = dateutil.tz.gettz('Europe/Vienna')

update = fetchhelper.Updater(
    'https://www.sozialministerium.at/Informationen-zum-Coronavirus/Neuartiges-Coronavirus-(2019-nCov).html'
)
update.check_fetch(rawfile=args.rawfile)
html = BeautifulSoup(update.rawdata, 'html.parser')

def strip_footnote(s):
    """Remove trailing footnote asterisks from a table cell."""
    return s.rstrip('*')

table = fetchhelper.text_table(html.find('table'))
# on 10.09.2020 there were additional empty cells
for tr in table:
    if tr[-1] == '':
        tr.pop()
ths = table[0]
# Sanity-check the expected layout: first column names the Bundesland,
# last column holds the countrywide total.
assert ('Bundesland' in ths[0])
assert ('gesamt' in ths[-1])
trs = table[1:]
# Row order is fixed; verify each metric row before positional access.
assert ('tigte' in trs[0][0])
assert ('Todesf' in trs[1][0])
assert ('Genesen' in trs[2][0])
assert ('Hospital' in trs[3][0])
assert ('Intensiv' in trs[4][0])
assert ('Testungen' in trs[5][0])
# (chunk ends mid-statement; the list literal continues beyond this view)
parse = [