def scrape_ag():
    """Scrape infection-source counts for canton Aargau and print one record per source/day."""
    url = 'https://www.ag.ch/media/kanton_aargau/themen_1/coronavirus_1/daten_excel/Covid-19-Daten_Kanton_Aargau.xlsx'
    workbook = xlrd.open_workbook(file_contents=sc.download_content(url))
    datemode = workbook.datemode
    sheet = workbook.sheet_by_name('3. Ansteckungsorte')

    # Source labels live in row 1, one per every second column; fall back to
    # the spreadsheet column name when the header cell is empty.
    source_columns = {}
    for col in range(1, sheet.ncols, 2):
        source_columns[col] = str(sheet.cell_value(1, col) or xlrd.formula.colname(col))

    # Data rows start at row 56; an empty date cell marks the end of the data.
    for row in range(56, sheet.nrows):
        raw_date = sheet.cell_value(row, 0)
        if raw_date == '':
            return
        day = xlrd.xldate_as_datetime(raw_date, datemode).date()
        for col, source in source_columns.items():
            # or should we use total count?
            value = sheet.cell_value(row, col)
            if value != '':
                value = int(value)
            isd = sc.InfectionSourceData('AG', url)
            isd.date = day.isoformat()
            isd.source = source
            isd.count = str(value)
            print(isd)
import csv
import re
from io import StringIO

import scrape_common as sc
import scrape_gl_common as sgc


def split_whitespace(text):
    """Collapse runs of whitespace in *text* and split it into tokens.

    Returns an empty list for ``None`` or empty input.
    """
    if not text:
        return []
    # BUGFIX: `re` was used here without being imported, so the first call
    # raised NameError; `import re` is now at the top of the file.
    text = re.sub(r'\s\s+', ' ', text)
    return text.split(' ')


# weekly pdf
pdf_url = sgc.get_gl_pdf_url()
pdf = sc.download_content(pdf_url, silent=True)
content = sc.pdftotext(pdf, page=1)

# The PDF states its publication date as "Stand: dd.mm.yyyy".
pdf_date = sc.find(r'Stand: (\d{2}\.\d{2}.\d{4})', content)
pdf_date = sc.date_from_text(pdf_date)

# Cumulative PCR test count, e.g. "12'345" -- strip the thousands separator.
number_of_tests = sc.find(r'PCR-Tests\sKanton Glarus\s(\d+\'?\d+)\s', content).replace('\'', '')
is_first = True
if number_of_tests:
    dd = sc.DayData(canton='GL', url=pdf_url)
    dd.datetime = pdf_date
    dd.tested = number_of_tests
    is_first = False
    print(dd)

# raw text of page 2; presumably parsed further below this chunk -- not visible here
content = sc.pdftotext(pdf, page=2, raw=True)