def parse_daily_areas(date, country, html):
    """Parse per-area cumulative case counts from a daily HTML page.

    Args:
        date: ISO date string ("YYYY-MM-DD") the page refers to.
        country: "Scotland", "Wales", "Northern Ireland" or "UK".
        html: raw HTML of the daily report page.

    Returns:
        A list of rows, the first being the header
        ["Date", "Country", "AreaCode", "Area", "TotalCases"], or None when
        the country has no parseable area table (NI/UK, or Wales after the
        data moved to the dashboard).
    """
    if country in ("Northern Ireland", "UK"):
        return None
    soup = BeautifulSoup(html, features="html.parser")
    output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    if country == "Scotland":
        table = soup.find_all("table")[0]
        for table_row in table.findAll("tr"):
            columns = [
                normalize_whitespace(col.text) for col in table_row.findAll("td")
            ]
            if len(columns) == 0:
                continue
            if columns[0].lower() in ("", "health board"):
                continue
            # BUG FIX: the second replace used to re-read columns[0],
            # discarding the "Ayrshire & Arran" normalization. Chain both
            # substitutions instead.
            area = (
                columns[0]
                .replace("Ayrshire & Arran", "Ayrshire and Arran")
                .replace("Eileanan Siar (Western Isles)", "Western Isles")
            )
            area_code = lookup_health_board_code(area)
            cases = columns[1]
            if cases == "*":
                # means 5 or fewer cases
                cases = "NaN"
            else:
                cases = cases.replace("*", "").replace(",", "")
            output_row = [date, country, area_code, area, cases]
            output_rows.append(output_row)
        return output_rows
    elif country == "Wales":
        if date >= "2020-04-08":
            # daily areas no longer published on the HTML page
            # (now published on the dashboard)
            return None
        table = soup.find_all("table")[0]
        for table_row in table.findAll("tr"):
            columns = [
                normalize_whitespace(col.text) for col in table_row.findAll("td")
            ]
            if len(columns) == 0:
                continue
            if columns[0].lower() in ("", "health board", "wales", "total", "wales total"):
                continue
            if is_blank(columns[-1]):
                continue
            # Normalize council / health-board naming variants.
            area = (
                columns[0]
                .replace("City and County of Swansea", "Swansea")
                .replace("City of Cardiff", "Cardiff")
                .replace("Newport City", "Newport")
                .replace("County Borough Council", "")
                .replace("County Council", "")
                .replace("Council", "")
                .replace("Cardiff & Vale", "Cardiff and Vale")
                .replace("Cwm Taf Morgannwg", "Cwm Taf")
                .strip()
            )
            if is_blank(area):
                # Replacements stripped everything; fall back to the raw cell.
                area = columns[0]
            cases = columns[-1].replace("*", "").replace(",", "")
            output_row = [date, country, lookup_health_board_code(area), area, cases]
            output_rows.append(output_row)
        return output_rows
    return None
def parse_daily_areas(date, country, html):
    """Extract [Date, Country, AreaCode, Area, TotalCases] rows from a
    daily report page for Scotland or Wales.

    Returns None for countries with no parseable area table
    ("Northern Ireland", "UK", or anything else unrecognized).
    """
    if country in ("Northern Ireland", "UK"):
        return None

    soup = BeautifulSoup(html, features="html.parser")
    rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]

    if country == "Scotland":
        # The cases table is the last table on the page.
        cases_table = soup.find_all("table")[-1]
        for tr in cases_table.findAll("tr"):
            cells = [normalize_whitespace(td.text) for td in tr.findAll("td")]
            if not cells:
                continue
            if cells[0].lower() in ("", "health board"):
                continue
            area = cells[0].replace("Ayrshire & Arran", "Ayrshire and Arran")
            count = cells[1].replace("*", "")
            rows.append([date, country, lookup_health_board_code(area), area, count])
        return rows

    if country == "Wales":
        cases_table = soup.find_all("table")[0]
        for tr in cases_table.findAll("tr"):
            cells = [normalize_whitespace(td.text) for td in tr.findAll("td")]
            if not cells:
                continue
            if cells[0].lower() in ("", "health board", "wales", "total", "wales total"):
                continue
            if is_blank(cells[-1]):
                continue
            # Strip council naming boilerplate down to the plain area name.
            area = (
                cells[0]
                .replace("City and County of Swansea", "Swansea")
                .replace("City of Cardiff", "Cardiff")
                .replace("Newport City", "Newport")
                .replace("County Borough Council", "")
                .replace("County Council", "")
                .replace("Council", "")
                .strip()
            )
            if is_blank(area):
                area = cells[0]
            rows.append([date, country, lookup_health_board_code(area), area, cells[-1]])
        return rows

    return None
def convert_wales_la_to_hb(date, country, rows):
    """Aggregate Welsh local-authority case rows into health-board rows.

    `rows` is a sequence of [Date, Country, AreaCode, Area, TotalCases]
    entries keyed by local authority; the result uses the same row shape
    keyed by health board. Rows with an empty AreaCode (unknown / outside
    Wales) are carried through unchanged.
    """
    # Which local authorities make up each health board.
    health_board_las = {
        "Aneurin Bevan": ["Blaenau Gwent", "Caerphilly", "Monmouthshire", "Newport", "Torfaen"],
        "Betsi Cadwaladr": [
            "Conwy", "Denbighshire", "Flintshire", "Gwynedd", "Isle of Anglesey", "Wrexham"
        ],
        "Cardiff and Vale": ["Cardiff", "Vale of Glamorgan"],
        "Cwm Taf": ["Bridgend", "Merthyr Tydfil", "Rhondda Cynon Taf"],
        "Hywel Dda": ["Carmarthenshire", "Ceredigion", "Pembrokeshire"],
        "Powys": ["Powys"],
        "Swansea Bay": ["Neath Port Talbot", "Swansea"]
    }

    def la_cases(la):
        # First matching row wins; raises IndexError if the LA is absent.
        return [r[4] for r in rows if r[3] == la][0]

    result = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    for hb, las in health_board_las.items():
        total = sum(la_cases(la) for la in las)
        result.append([date, country, lookup_health_board_code(hb), hb, total])

    # append unknown/outside Wales etc
    result.extend(r for r in rows if r[2] == "")
    return result
def crawl_phw(use_local=False):
    """Ingest the Public Health Wales surveillance spreadsheet.

    Saves national Tests/ConfirmedCases/Deaths indicator series and
    per-health-board cumulative cases into the sqlite store.

    Args:
        use_local: read a previously downloaded copy instead of fetching
            the spreadsheet over HTTP.
    """
    if use_local:
        source = "data/raw/phw/Rapid COVID-19 surveillance data.xlsx"
    else:
        source = "http://www2.nphs.wales.nhs.uk:8080/CommunitySurveillanceDocs.nsf/3dc04669c9e1eaa880257062003b246b/77fdb9a33544aee88025855100300cab/$FILE/Rapid%20COVID-19%20surveillance%20data.xlsx"

    def iso_date(ts):
        # Normalize a timestamp cell to "YYYY-MM-DD".
        return ts.strftime('%Y-%m-%d')

    # --- National tests and confirmed cases, by specimen date ---
    tests_df = pd.read_excel(source, sheet_name="Tests by specimen date")
    tests_df["Date"] = tests_df["Specimen date"].apply(iso_date).astype(str)
    tests_df.rename(
        columns={"Cumulative testing episodes": "Tests",
                 "Cumulative cases": "ConfirmedCases"},
        inplace=True)
    save_indicators_df_to_sqlite(
        tests_df.groupby("Date", as_index=False)[["Tests"]].sum(),
        "Wales", "Tests")
    save_indicators_df_to_sqlite(
        tests_df.groupby("Date", as_index=False)[["ConfirmedCases"]].sum(),
        "Wales", "ConfirmedCases")

    # --- Per-health-board cumulative cases ---
    def lookup_hb(la):
        # Map a local authority to its health board; unknown LAs pass through.
        hb = la_to_hb(la)
        return la if hb is None else hb

    tests_df.rename(columns={"ConfirmedCases": "TotalCases"}, inplace=True)
    tests_df["Area"] = tests_df["Local Authority"].apply(lookup_hb)
    area_cases = tests_df.groupby(["Date", "Area"], as_index=False)[["TotalCases"]].sum()
    area_cases["AreaCode"] = area_cases["Area"].apply(lookup_health_board_code)
    area_cases["Country"] = "Wales"
    area_cases = area_cases[["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    save_cases_df_to_sqlite(area_cases, "Wales")

    # --- National deaths, by date of death ---
    deaths_df = pd.read_excel(source, sheet_name="Deaths by date")
    deaths_df["Date"] = deaths_df["Date of death"].apply(iso_date).astype(str)
    deaths_df.rename(columns={"Cumulative deaths": "Deaths"}, inplace=True)
    save_indicators_df_to_sqlite(deaths_df, "Wales", "Deaths")
def crawl_phs(use_local=False):
    """Ingest the Public Health Scotland spreadsheets.

    Saves national Tests/ConfirmedCases/Deaths indicator series and
    per-health-board cumulative cases into the sqlite store.

    Args:
        use_local: read previously downloaded copies instead of resolving
            and fetching the spreadsheet URLs.
    """
    # Only hit the network for URL discovery when we actually need it.
    urls = None if use_local else get_phs_xlsx_urls()

    def iso_date(ts):
        # Normalize a timestamp cell to "YYYY-MM-DD".
        return ts.strftime('%Y-%m-%d')

    if use_local:
        totals_source = "data/raw/phs/HSCA+-+SG+Website+-+Indicator+Trends+for+daily+data+publication.xlsx"
    else:
        totals_source = urls["totals"]

    # --- National tests and confirmed cases ---
    testing_df = pd.read_excel(totals_source, sheet_name="Table 5 - Testing", skiprows=3)
    testing_df.rename(columns={"Unnamed: 0": "Date"}, inplace=True)
    testing_df["Date"] = testing_df["Date"].apply(iso_date).astype(str)
    testing_df = testing_df[["Date", "Positive", "Total"]]
    testing_df.rename(
        columns={"Total": "Tests", "Positive": "ConfirmedCases"},
        inplace=True)
    save_indicators_df_to_sqlite(testing_df, "Scotland", "Tests")
    save_indicators_df_to_sqlite(testing_df, "Scotland", "ConfirmedCases")

    # --- National deaths ---
    deaths_df = pd.read_excel(totals_source, sheet_name="Table 8 - Deaths", skiprows=2)
    deaths_df.rename(
        columns={"Number of COVID-19 confirmed deaths registered to date": "Deaths"},
        inplace=True)
    deaths_df["Date"] = deaths_df["Date"].apply(iso_date).astype(str)
    save_indicators_df_to_sqlite(deaths_df, "Scotland", "Deaths")

    if use_local:
        areas_source = "data/raw/phs/Board-level+figures+-+FOR+ONLINE+PUBLICATION.xlsx"
    else:
        areas_source = urls["areas"]

    # --- Per-health-board cumulative cases (wide → long) ---
    boards_df = pd.read_excel(areas_source, sheet_name="Table 1 - Cumulative cases", skiprows=2)
    boards_df["Date"] = boards_df["Date"].apply(iso_date).astype(str)
    boards_df = boards_df.drop(columns=['Scotland'])
    boards_df = boards_df[[c for c in boards_df.columns if not c.startswith('Unnamed')]]
    area_cases = boards_df.melt(id_vars=["Date"], var_name="Area", value_name="TotalCases")
    # "*" marks suppressed small counts in the source spreadsheet.
    area_cases = area_cases.replace("*", "NaN")
    area_cases["Area"] = area_cases["Area"].apply(
        lambda hb: hb.replace("NHS", "").replace("&", "and").strip())
    area_cases["AreaCode"] = area_cases["Area"].apply(lookup_health_board_code)
    area_cases["Country"] = "Scotland"
    area_cases = area_cases[["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    save_cases_df_to_sqlite(area_cases, "Scotland")
def test_lookup_health_board_code():
    """A known health board maps to its code; an unknown one maps to ""."""
    known = lookup_health_board_code("Fife")
    unknown = lookup_health_board_code("Bogus")
    assert known == "S08000029"
    assert unknown == ""
from util import normalize_whitespace, lookup_health_board_code

# Usage: <script> <input.html> <output.csv>
html_file = sys.argv[1]
csv_file = sys.argv[2]

# Filename convention: <prefix>-<country>-<YYYY-MM-DD>.html
# Raw string fixes the invalid "\d" escape (SyntaxWarning on modern Python).
m = re.match(r".+-(.+)-(\d{4}-\d{2}-\d{2})\.html", html_file)
country = m.group(1).title()
date = m.group(2)

# Context manager so the file handle is closed instead of leaked.
with open(html_file) as f:
    html = f.read()

soup = BeautifulSoup(html, features="html.parser")
# The cases table is the last table on the page.
table = soup.find_all("table")[-1]

output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
for table_row in table.findAll("tr"):
    columns = [
        normalize_whitespace(col.text) for col in table_row.findAll("td")
    ]
    if len(columns) == 0:
        continue
    area = columns[0].replace("Ayrshire & Arran", "Ayrshire and Arran")
    area_code = lookup_health_board_code(area)
    cases = columns[1]
    output_row = [date, country, area_code, area, cases]
    output_rows.append(output_row)

# newline="" is required by the csv module to avoid blank rows on Windows.
with open(csv_file, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(output_rows)
# Emit one CSV row per Welsh area from the parsed cases table.
date = dateparser.parse(groups["date"]).strftime("%Y-%m-%d")

output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
for tr in table.findAll("tr"):
    cells = [normalize_whitespace(td.text) for td in tr.findAll("td")]
    if not cells:
        continue
    # Skip the header and the national total rows.
    if cells[0] in ("Health Board", "Wales", "TOTAL"):
        continue
    if is_blank(cells[2]):
        continue
    # Strip council naming boilerplate down to the plain area name.
    area = (
        cells[0]
        .replace("City and County of Swansea", "Swansea")
        .replace("City of Cardiff", "Cardiff")
        .replace("Newport City", "Newport")
        .replace("County Borough Council", "")
        .replace("County Council", "")
        .replace("Council", "")
        .strip()
    )
    if is_blank(area):
        area = cells[0]
    output_rows.append(
        [date, country, lookup_health_board_code(area), area, cells[2]])

with open(csv_file, "w") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(output_rows)