def save_indicators(results):
    date = results["Date"]
    country = results["Country"]
    # Write one single-row CSV per indicator, skipping missing (NaN) values.
    # write_indicator_file (defined below) builds the same file names used
    # here, e.g. covid-19-<date>-<country>-confirmed-cases.csv.
    for indicator in ["Tests", "ConfirmedCases", "Deaths"]:
        value = results[indicator]
        if not math.isnan(value):
            write_indicator_file(date, country, indicator, value)

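# format_country is used throughout this module for building file names but
# is not defined in this section. A minimal sketch, assuming it just
# lowercases the country name and hyphenates spaces, which matches file
# names such as covid-19-totals-northern-ireland.csv below:
def format_country(country):
    # Hypothetical reconstruction: "Northern Ireland" -> "northern-ireland"
    return country.lower().replace(" ", "-")
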
def save_daily_areas(date, country, rows):
    csv_file = "data/daily/covid-19-cases-{}-{}.csv".format(
        date, format_country(country)
    )
    # newline="" is required by the csv module to avoid blank rows on Windows
    with open(csv_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(rows)

def write_indicator_file(date, country, indicator, value):
    with open(
        "data/daily/indicators/covid-19-{}-{}-{}.csv".format(
            date, format_country(country), camel_to_hyphens(indicator)
        ),
        "w",
    ) as f:
        f.write("{},{},{},{}\n".format(date, country, indicator, value))

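# camel_to_hyphens is also referenced but not defined in this section. A
# sketch under the assumption that it converts CamelCase indicator names to
# the hyphenated form seen in the file names above
# ("ConfirmedCases" -> "confirmed-cases", "Tests" -> "tests"):
import re

def camel_to_hyphens(camel):
    # Hypothetical reconstruction: insert a hyphen before each interior
    # capital letter, then lowercase the whole string
    return re.sub(r"(?<!^)(?=[A-Z])", "-", camel).lower()
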
def crawl_json(date, country, check_only):
    if country == "UK":
        local_data_file = "data/raw/phe/coronavirus-covid-19-number-of-cases-in-{}-{}.json".format(
            format_country(country), date
        )
        if not os.path.exists(local_data_file):
            data_url = get_json_url(date)
            if data_url is None:
                if check_only:
                    return DatasetUpdate.UPDATE_NOT_AVAILABLE
                sys.stderr.write("No data available for {}\n".format(date))
                sys.exit(1)
            if check_only:
                return DatasetUpdate.UPDATE_AVAILABLE
            r = requests.get(data_url)
            with open(local_data_file, "w") as f:
                f.write(r.text)
        if check_only:
            return DatasetUpdate.ALREADY_UPDATED
        with open(local_data_file) as f:
            json_data = json.load(f)
        total_uk_cases = json_data["overview"]["K02000001"]["totalCases"]["value"]
        total_uk_deaths = json_data["overview"]["K02000001"]["deaths"]["value"]
        england_cases = json_data["countries"]["E92000001"]["totalCases"]["value"]
        england_deaths = json_data["countries"]["E92000001"]["deaths"]["value"]
        with sqlite3.connect("data/covid-19-uk.db") as conn:
            c = conn.cursor()
            # Use parameterized queries rather than f-string interpolation,
            # which is fragile and open to SQL injection
            c.executemany(
                "INSERT OR REPLACE INTO indicators VALUES (?, ?, ?, ?)",
                [
                    (date, "UK", "ConfirmedCases", total_uk_cases),
                    (date, "UK", "Deaths", total_uk_deaths),
                    (date, "England", "ConfirmedCases", england_cases),
                    (date, "England", "Deaths", england_deaths),
                ],
            )
        # Get area data for England
        daily_areas = parse_daily_areas_json(date, "England", json_data)
        if daily_areas is not None:
            # save_daily_areas(date, "England", daily_areas)
            save_daily_areas_to_sqlite(date, "England", daily_areas)

def get_json_url(date):
    # See https://github.com/PublicHealthEngland/coronavirus-dashboard
    blobs_url = "https://publicdashacc.blob.core.windows.net/publicdata?restype=container&comp=list"
    r = requests.get(blobs_url)
    blobs_dict = xmltodict.parse(r.text)
    blob_names = sorted(
        o["Name"]
        for o in blobs_dict["EnumerationResults"]["Blobs"]["Blob"]
        if o["Name"]
    )
    dt = dateparser.parse(date, date_formats=["%Y-%m-%d"], locales=["en-GB"])
    blob_names_for_date = [
        name
        for name in blob_names
        if name.startswith("data_{}".format(dt.strftime("%Y%m%d")))
    ]
    if len(blob_names_for_date) == 0:
        return None
    # Use the most recent blob published for the date
    return "https://c19pub.azureedge.net/{}".format(blob_names_for_date[-1])

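# The INSERT OR REPLACE statements in crawl_json imply a uniqueness
# constraint across (date, country, indicator); the actual table schema is
# not shown in this section. A plausible sketch of the setup, with column
# names inferred from the four-column inserts above:
def create_indicators_table(db_path="data/covid-19-uk.db"):
    # Hypothetical schema; the real table may differ
    with sqlite3.connect(db_path) as conn:
        conn.execute(
            """CREATE TABLE IF NOT EXISTS indicators (
                   date TEXT NOT NULL,
                   country TEXT NOT NULL,
                   indicator TEXT NOT NULL,
                   value INTEGER,
                   PRIMARY KEY (date, country, indicator)
               )"""
        )
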
def crawl_html(date, country, check_only):
    html_url = get_html_url(date, country)
    local_html_file = "data/raw/coronavirus-covid-19-number-of-cases-in-{}-{}.html".format(
        format_country(country), date
    )
    save_html_file = False
    try:
        with open(local_html_file) as f:
            html = f.read()
        if check_only:
            return DatasetUpdate.ALREADY_UPDATED
    except FileNotFoundError:
        r = requests.get(html_url)
        html = r.text
        save_html_file = True
    results = parse_totals(country, html)
    if results is None:
        if check_only:
            return DatasetUpdate.UPDATE_AVAILABLE
        sys.stderr.write("Can't find numbers. Perhaps the page format has changed?\n")
        sys.exit(1)
    elif results["Date"] != date:
        if check_only:
            return DatasetUpdate.UPDATE_NOT_AVAILABLE
        sys.stderr.write(
            "Page is dated {}, but want {}\n".format(results["Date"], date)
        )
        sys.exit(1)
    if check_only:
        return DatasetUpdate.UPDATE_AVAILABLE
    daily_areas = parse_daily_areas(date, country, html)
    print_totals(results)
    # save_indicators(results)
    save_indicators_to_sqlite(results)
    if daily_areas is not None:
        # save_daily_areas(date, country, daily_areas)
        save_daily_areas_to_sqlite(date, country, daily_areas)

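# DatasetUpdate is returned by both crawlers in check_only mode but is not
# defined in this section. A minimal sketch, assuming a plain Enum with the
# three members used above:
import enum

class DatasetUpdate(enum.Enum):
    ALREADY_UPDATED = enum.auto()        # a local copy for this date already exists
    UPDATE_AVAILABLE = enum.auto()       # data is published but not yet fetched
    UPDATE_NOT_AVAILABLE = enum.auto()   # nothing published for this date yet
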
def convert(indicators_csv_file):
    indicators = pd.read_csv(indicators_csv_file)
    for country in ["England", "Northern Ireland", "Scotland", "UK", "Wales"]:
        wide = indicators[indicators["Country"] == country]
        wide = wide.pivot(index="Date", columns="Indicator", values="Value")
        wide = wide.reindex(columns=["Tests", "ConfirmedCases", "Deaths"])
        # Don't use to_csv, since pandas can't write NA ints
        with open(
            "data/covid-19-totals-{}.csv".format(format_country(country)), "w"
        ) as f:
            f.write("Date,Tests,ConfirmedCases,Deaths\n")
            for i, d in wide.to_dict("index").items():
                f.write(
                    "{},{},{},{}\n".format(
                        i,
                        format_int(d["Tests"]),
                        format_int(d["ConfirmedCases"]),
                        format_int(d["Deaths"]),
                    )
                )

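# format_int handles the NA-int problem noted in the comment above: pandas
# stores missing integers as float NaN, so a plain cast would either raise
# or print "1234.0". A sketch, assuming missing values become empty CSV
# fields:
def format_int(value):
    # Hypothetical reconstruction: NaN -> "", 1234.0 -> "1234"
    if math.isnan(value):
        return ""
    return str(int(value))
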