def generate_csv():
    """Print a CSV of UK testing figures, one row per daily raw HTML page.

    Pages up to 2020-03-22 are skipped (unparseable with the current parser).
    Pages up to 2020-04-07 only expose a cumulative tests figure, so the
    per-indicator columns are left blank for those rows.
    """
    print("Date,Country,DailyTestsPerformed,TotalTestsPerformed,DailyPeopleTested,TotalPeopleTested")
    for file in sorted(glob.glob("data/raw/coronavirus-covid-19-number-of-cases-in-uk-*.html")):
        m = re.match(r".+(\d{4}-\d{2}-\d{2})\.html", file)
        if m is None:
            # Unexpected filename; skip instead of crashing on m.group(1).
            continue
        date = m.group(1)
        if date <= "2020-03-22":
            # older pages cannot be parsed with current parser
            continue
        # Only read the file once we know the date is usable.
        with open(file) as f:
            html = f.read()
        if date <= "2020-04-07":
            result = parse_totals("UK", html)
            print("{},UK,,,,{}".format(date, result["Tests"]))
            continue
        result = parse_tests("UK", html)
        output_row = [
            date,
            "UK",
            result["DailyTestsPerformed"],
            result["TotalTestsPerformed"],
            result["DailyPeopleTested"],
            result["TotalPeopleTested"],
        ]
        print(",".join([str(val) for val in output_row]))
def generate_csv():
    """Print a CSV of per-pillar UK testing indicators, one row per daily page.

    Column names are the cross product
    {Daily,Total} x {"",Pillar1,Pillar2,Pillar4} x {TestsPerformed,PeopleTested,Positive}
    plus four Pillar2 routes columns. Indicators missing from a page are
    emitted as empty cells.
    """
    indicator_tuples = list(itertools.product(
        ["", "Pillar1", "Pillar2", "Pillar4"],
        ["TestsPerformed", "PeopleTested", "Positive"],
        ["Daily", "Total"],
    ))
    indicators = ["{}{}{}".format(t[2], t[0], t[1]) for t in indicator_tuples] + [
        "DailyPillar2InPersonRoutes",
        "DailyPillar2DeliveryRoutes",
        "TotalPillar2InPersonRoutes",
        "TotalPillar2DeliveryRoutes",
    ]
    columns = ["Date", "Country"] + indicators
    print(",".join(columns))
    for file in sorted(glob.glob("data/raw/coronavirus-covid-19-number-of-cases-in-uk-*.html")):
        m = re.match(r".+(\d{4}-\d{2}-\d{2})\.html", file)
        if m is None:
            # Unexpected filename; skip instead of crashing on m.group(1).
            continue
        date = m.group(1)
        if date <= "2020-03-22":
            # older pages cannot be parsed with current parser
            continue
        # Only read the file once we know the date is usable.
        with open(file) as f:
            html = f.read()
        if date <= "2020-04-07":
            # Early pages only publish a cumulative tests total.
            totals_result = parse_totals("UK", html)
            result = {"TotalPeopleTested": totals_result["Tests"]}
        else:
            result = parse_tests("UK", html)
        indicator_values = [result.get(indicator, "") for indicator in indicators]
        output_row = [date, "UK"] + indicator_values
        print(",".join([str(val) for val in output_row]))
def test_parse_totals_uk():
    """parse_totals returns sane, non-negative UK figures for every parseable page."""
    date_pattern = re.compile(r".+(\d{4}-\d{2}-\d{2})\.html")
    paths = glob.glob("data/raw/coronavirus-covid-19-number-of-cases-in-uk-*.html")
    for path in sorted(paths):
        page_date = date_pattern.match(path).group(1)
        if page_date <= "2020-03-22":
            # older pages cannot be parsed with current parser
            continue
        with open(path) as handle:
            page_html = handle.read()
        totals = parse_totals("UK", page_html)
        assert totals["Country"] == "UK"
        assert totals["Date"] == page_date
        for count_key in ("Tests", "ConfirmedCases", "Deaths"):
            assert totals[count_key] >= 0
def crawl_html(date, country, check_only):
    """Fetch (or reuse a cached copy of) the daily HTML page, parse and persist it.

    Args:
        date: ISO date string the page is expected to report.
        country: country whose page/URL to use.
        check_only: when truthy, only report update status without saving.

    Returns:
        A DatasetUpdate status when check_only is truthy; otherwise None.

    Exits the process with status 1 when the page cannot be parsed or is
    dated differently than requested.
    """
    html_url = get_html_url(date, country)
    local_html_file = "data/raw/coronavirus-covid-19-number-of-cases-in-{}-{}.html".format(
        format_country(country), date)
    save_html_file = False
    try:
        with open(local_html_file) as f:
            html = f.read()
        if check_only:
            return DatasetUpdate.ALREADY_UPDATED
    except FileNotFoundError:
        r = requests.get(html_url)
        # Fail fast on HTTP errors so a 404/500 error page is never parsed
        # as data or cached to data/raw/ as if it were the real page.
        r.raise_for_status()
        html = r.text
        save_html_file = True
    results = parse_totals(country, html)
    if results is None:
        if check_only:
            return DatasetUpdate.UPDATE_AVAILABLE
        sys.stderr.write("Can't find numbers. Perhaps the page format has changed?\n")
        sys.exit(1)
    elif results["Date"] != date:
        if check_only:
            return DatasetUpdate.UPDATE_NOT_AVAILABLE
        sys.stderr.write("Page is dated {}, but want {}\n".format(results["Date"], date))
        sys.exit(1)
    if check_only:
        return DatasetUpdate.UPDATE_AVAILABLE
    daily_areas = parse_daily_areas(date, country, html)
    print_totals(results)
    save_indicators_to_sqlite(results)
    if daily_areas is not None:
        save_daily_areas_to_sqlite(date, country, daily_areas)
    # Cache the page only after it parsed successfully with the right date.
    if save_html_file:
        with open(local_html_file, "w") as f:
            f.write(html)