def test_parse_daily_areas_wales():
    """Check that parse_daily_areas yields well-formed rows for Wales snapshots.

    Every saved Wales page under ``data/raw`` whose date lies strictly
    between 2020-03-18 and 2020-04-08 is parsed. The result must contain the
    expected header row followed by at least one data row, and every data
    row must carry the snapshot date, the country name, a non-None area
    code, a non-empty area name and a non-negative integer case count.
    """
    for file in sorted(
            glob.glob(
                "data/raw/coronavirus-covid-19-number-of-cases-in-wales-*.html"
            )):
        m = re.match(r".+(\d{4}-\d{2}-\d{2})\.html", file)
        # Fail with a readable message instead of an AttributeError when a
        # file matches the glob but carries no date in its name.
        assert m is not None, "No date found in file name: {}".format(file)
        date = m.group(1)
        # ISO dates (YYYY-MM-DD) order correctly under plain string comparison.
        if date <= "2020-03-18":
            # older pages cannot be parsed with current parser
            continue
        if date >= "2020-04-08":
            # daily areas no longer published on the HTML page
            continue
        # Decode explicitly so the test does not depend on the host locale.
        with open(file, encoding="utf-8") as f:
            html = f.read()
        result = parse_daily_areas(date, "Wales", html)
        assert len(result) > 1
        assert result[0] == [
            'Date', 'Country', 'AreaCode', 'Area', 'TotalCases'
        ]
        for row in result[1:]:
            assert row[0] == date
            assert row[1] == "Wales"
            # Area code can be blank (e.g. 'To be confirmed')
            assert row[2] is not None
            assert len(row[3]) > 0
            assert int(row[4]) >= 0
def test_parse_daily_areas_scotland():
    """Check that parse_daily_areas yields well-formed rows for Scotland snapshots.

    Every saved Scotland page under ``data/raw`` dated after 2020-03-18 is
    parsed. The result must contain the expected header row followed by at
    least one data row. Each data row must carry the snapshot date, the
    country name and a non-empty area name. Rows for "Golden Jubilee
    National Hospital" are exempt from the area-code and case-count checks;
    all other rows need a non-empty area code and a case count that is
    either the string "NaN" or a non-negative integer.
    """
    for file in sorted(
            glob.glob(
                "data/raw/coronavirus-covid-19-number-of-cases-in-scotland-*.html"
            )):
        m = re.match(r".+(\d{4}-\d{2}-\d{2})\.html", file)
        # Fail with a readable message instead of an AttributeError when a
        # file matches the glob but carries no date in its name.
        assert m is not None, "No date found in file name: {}".format(file)
        date = m.group(1)
        # ISO dates (YYYY-MM-DD) order correctly under plain string comparison.
        if date <= "2020-03-18":
            # older pages cannot be parsed with current parser
            continue
        # Decode explicitly so the test does not depend on the host locale.
        with open(file, encoding="utf-8") as f:
            html = f.read()
        result = parse_daily_areas(date, "Scotland", html)
        assert len(result) > 1
        assert result[0] == [
            'Date', 'Country', 'AreaCode', 'Area', 'TotalCases'
        ]
        for row in result[1:]:
            assert row[0] == date
            assert row[1] == "Scotland"
            assert (row[3] == "Golden Jubilee National Hospital"
                    or len(row[2]) > 0)
            assert len(row[3]) > 0
            assert (row[3] == "Golden Jubilee National Hospital"
                    or row[4] == "NaN"
                    or int(row[4]) >= 0)
def crawl_html(date, country, check_only):
    """Load or download a country's daily HTML page and store its figures.

    The page is taken from the local snapshot in ``data/raw`` when one
    exists, otherwise it is downloaded from the URL given by
    ``get_html_url``. Totals and (when available) per-area figures are
    parsed from the HTML and saved to SQLite; a freshly downloaded page is
    written to ``data/raw`` only after those saves have run.

    Args:
        date: Date string the page is expected to be dated with; it is
            compared against ``results["Date"]`` and used in the file name.
        country: Country name passed to the URL, parsing and saving helpers.
        check_only: When true, nothing is saved; the function only reports
            the update status.

    Returns:
        With ``check_only`` set, a ``DatasetUpdate`` member:
        ``ALREADY_UPDATED`` if a local snapshot exists,
        ``UPDATE_NOT_AVAILABLE`` if the downloaded page is dated
        differently from ``date``, and ``UPDATE_AVAILABLE`` otherwise
        (including when no totals could be parsed). Without ``check_only``,
        returns None.

    Raises:
        SystemExit: With status 1 (only when ``check_only`` is false) if
            the totals cannot be parsed or the page date differs from
            ``date``.
    """
    html_url = get_html_url(date, country)
    local_html_file = "data/raw/coronavirus-covid-19-number-of-cases-in-{}-{}.html".format(
        format_country(country), date)
    save_html_file = False

    try:
        # Snapshots are read and written as UTF-8 explicitly so the cache
        # does not depend on the locale of the machine that created it.
        with open(local_html_file, encoding="utf-8") as f:
            html = f.read()
        if check_only:
            return DatasetUpdate.ALREADY_UPDATED
    except FileNotFoundError:
        # NOTE(review): no timeout is passed, so a stalled server blocks
        # this call indefinitely; the HTTP status is not checked either.
        r = requests.get(html_url)
        html = r.text
        save_html_file = True

    results = parse_totals(country, html)

    if results is None:
        if check_only:
            return DatasetUpdate.UPDATE_AVAILABLE
        sys.stderr.write(
            "Can't find numbers. Perhaps the page format has changed?\n")
        sys.exit(1)
    elif results["Date"] != date:
        if check_only:
            return DatasetUpdate.UPDATE_NOT_AVAILABLE
        sys.stderr.write("Page is dated {}, but want {}\n".format(
            results["Date"], date))
        sys.exit(1)

    if check_only:
        return DatasetUpdate.UPDATE_AVAILABLE

    daily_areas = parse_daily_areas(date, country, html)

    print_totals(results)
    save_indicators_to_sqlite(results)

    if daily_areas is not None:
        save_daily_areas_to_sqlite(date, country, daily_areas)

    # Persist the page last, so a snapshot only exists once the save calls
    # above have returned.
    if save_html_file:
        with open(local_html_file, "w", encoding="utf-8") as f:
            f.write(html)