def parse_daily_areas_pdf(date, country, local_pdf_file):
    """Extract per-area case counts from a daily situation PDF.

    Supported countries:
      * "Northern Ireland" — looks for a table headed "Local Government
        District" and returns [header, *rows] with one row per district.
      * "Wales" — parses the per-local-authority table and converts it to
        health-board rows via convert_wales_la_to_hb.
    Returns None when the country is unsupported or no matching table is
    found on any page.

    Row format: [Date, Country, AreaCode, Area, TotalCases].
    """
    if country == "Northern Ireland":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table()
                # Identify the right table by its first header cell.
                if table[0][0] == "Local Government District":
                    output_rows = [[
                        "Date", "Country", "AreaCode", "Area", "TotalCases"
                    ]]
                    for table_row in table[1:]:
                        # Skip the grand-total row; only per-district rows wanted.
                        if table_row[0].lower() == "total":
                            continue
                        area = normalize_whitespace(titlecase(table_row[0]))
                        # Normalize to the official district name ordering.
                        area = area.replace("Ards and North Down",
                                            "North Down and Ards")
                        area_code = lookup_local_government_district_code(area)
                        cases = normalize_int(table_row[1])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    return output_rows
            except IndexError:
                pass  # no table on page
    elif country == "Wales":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table(
                    table_settings={
                        # use text alignment since the table doesn't have lines
                        "vertical_strategy": "text",
                        "horizontal_strategy": "text"
                    })
                found_start = False
                output_rows = [[
                    "Date", "Country", "AreaCode", "Area", "TotalCases"
                ]]
                for table_row in table:
                    # Data rows begin at the "Aneurin Bevan" health board row.
                    if table_row[0] is not None and table_row[0].startswith(
                            "Aneurin"):
                        found_start = True
                    if found_start:
                        # NOTE(review): the ligature .replace() calls below
                        # presumably map the PDF ligature glyphs (ﬀ/ﬁ) to the
                        # plain two-letter sequences; the collapsed rendering
                        # makes both arguments look identical — confirm the
                        # first argument of each is the ligature character.
                        area = (
                            normalize_whitespace(table_row[2]).replace(
                                "Anglesey", "Isle of Anglesey").replace(
                                    "ff", "ff")  # fix ligatures
                            .replace("fi", "fi"))
                        if area.startswith("Wales total"):
                            continue
                        area_code = lookup_local_authority_code(area)
                        cases = normalize_int(table_row[4])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    # Stop once the trailing non-Wales residents row is reached.
                    if table_row[2] is not None and normalize_whitespace(
                            table_row[2]) == 'Resident outside Wales':
                        break
                return convert_wales_la_to_hb(date, country, output_rows)
            except IndexError:
                pass  # no table on page
    return None
def parse_daily_areas(date, country, html):
    """Parse per-area case counts from a country's daily HTML page.

    Supported countries: "Scotland" and "Wales" (Northern Ireland and UK
    pages carry no per-area table and return None). Wales data after
    2020-04-07 moved to a dashboard, so those dates also return None.

    Returns [header, *rows] with rows of
    [Date, Country, AreaCode, Area, TotalCases], or None.
    """
    if country in ("Northern Ireland", "UK"):
        return None
    soup = BeautifulSoup(html, features="html.parser")
    output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    if country == "Scotland":
        table = soup.find_all("table")[0]
        for table_row in table.findAll("tr"):
            columns = [
                normalize_whitespace(col.text)
                for col in table_row.findAll("td")
            ]
            if len(columns) == 0:
                continue
            if columns[0].lower() in ("", "health board"):
                continue
            area = columns[0].replace("Ayrshire & Arran",
                                      "Ayrshire and Arran")
            # BUG FIX: chain off `area`, not `columns[0]` — the original
            # second assignment silently discarded the Ayrshire & Arran
            # normalization performed on the previous line.
            area = area.replace("Eileanan Siar (Western Isles)",
                                "Western Isles")
            area_code = lookup_health_board_code(area)
            cases = columns[1]
            if cases == "*":  # means 5 or fewer cases
                cases = "NaN"
            else:
                cases = cases.replace("*", "").replace(",", "")
            output_row = [date, country, area_code, area, cases]
            output_rows.append(output_row)
        return output_rows
    elif country == "Wales":
        if date >= "2020-04-08":
            # daily areas no longer published on the HTML page
            # (now published on the dashboard)
            return None
        table = soup.find_all("table")[0]
        for table_row in table.findAll("tr"):
            columns = [
                normalize_whitespace(col.text)
                for col in table_row.findAll("td")
            ]
            if len(columns) == 0:
                continue
            if columns[0].lower() in ("", "health board", "wales", "total",
                                      "wales total"):
                continue
            if is_blank(columns[-1]):
                continue
            # Strip council suffixes and normalize name variants so the
            # health-board code lookup succeeds.
            area = (
                columns[0]
                .replace("City and County of Swansea", "Swansea")
                .replace("City of Cardiff", "Cardiff")
                .replace("Newport City", "Newport")
                .replace("County Borough Council", "")
                .replace("County Council", "")
                .replace("Council", "")
                .replace("Cardiff & Vale", "Cardiff and Vale")
                .replace("Cwm Taf Morgannwg", "Cwm Taf")
                .strip()
            )
            if is_blank(area):
                # Everything was stripped; fall back to the raw cell text.
                area = columns[0]
            cases = columns[-1].replace("*", "").replace(",", "")
            output_row = [date, country, lookup_health_board_code(area),
                          area, cases]
            output_rows.append(output_row)
        return output_rows
    return None
def parse_daily_areas(date, country, html):
    """Parse per-area case counts from a daily HTML page.

    Handles "Scotland" and "Wales"; returns None for Northern Ireland, the
    UK page, or any other country. Rows are
    [Date, Country, AreaCode, Area, TotalCases] following a header row.
    """
    if country in ("Northern Ireland", "UK"):
        return None

    soup = BeautifulSoup(html, features="html.parser")
    rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]

    def cell_texts(tr):
        # Normalized text of each <td> in a table row.
        return [normalize_whitespace(td.text) for td in tr.findAll("td")]

    if country == "Scotland":
        # The per-health-board table is the last one on the page.
        board_table = soup.find_all("table")[-1]
        for tr in board_table.findAll("tr"):
            cells = cell_texts(tr)
            if not cells:
                continue
            if cells[0].lower() in ("", "health board"):
                continue
            board = cells[0].replace("Ayrshire & Arran", "Ayrshire and Arran")
            count = cells[1].replace("*", "")
            rows.append(
                [date, country, lookup_health_board_code(board), board, count])
        return rows

    if country == "Wales":
        la_table = soup.find_all("table")[0]
        for tr in la_table.findAll("tr"):
            cells = cell_texts(tr)
            if not cells:
                continue
            if cells[0].lower() in ("", "health board", "wales", "total",
                                    "wales total"):
                continue
            if is_blank(cells[-1]):
                continue
            # Drop council suffixes / city prefixes to match lookup names.
            cleaned = (cells[0]
                       .replace("City and County of Swansea", "Swansea")
                       .replace("City of Cardiff", "Cardiff")
                       .replace("Newport City", "Newport")
                       .replace("County Borough Council", "")
                       .replace("County Council", "")
                       .replace("Council", "")
                       .strip())
            area = cells[0] if is_blank(cleaned) else cleaned
            rows.append(
                [date, country, lookup_health_board_code(area), area,
                 cells[-1]])
        return rows

    return None
def crawl_arcgis(date, country, check_only):
    """Download and ingest ArcGIS-hosted data for a country.

    "UK": fetches the DailyIndicators spreadsheet and upserts national
    ConfirmedCases/Deaths indicator rows into the sqlite database.
    "England": fetches the per-area CSV and stores per-area totals.

    When check_only is truthy, returns the result of the download check
    without ingesting; otherwise returns None after ingesting.
    """
    def _plain(v):
        # pandas/numpy scalars are not directly bindable by sqlite3;
        # unwrap them to native Python values before parameter binding.
        return v.item() if hasattr(v, "item") else v

    if country == "UK":
        item_id = "bc8ee90225644ef7a6f4dd1b13ea1d67"
        local_data_file = "data/raw/DailyIndicators-{}.xslx".format(date)
        ret = download_arcgis_item(date, item_id, local_data_file, check_only)
        if check_only:
            return ret
        df = pd.read_excel(local_data_file)
        print(df)
        d = df.to_dict("records")[0]
        # Trust the date embedded in the spreadsheet over the argument.
        date = d["DateVal"].strftime("%Y-%m-%d")
        with sqlite3.connect('data/covid-19-uk.db') as conn:
            c = conn.cursor()
            # Parameterized statements instead of f-string-built SQL:
            # avoids SQL injection/breakage from unexpected cell values.
            # Scotland/Wales/NI indicator inserts were deliberately
            # disabled in the original source and stay omitted here.
            indicator_rows = [
                (date, "UK", "ConfirmedCases", _plain(d["TotalUKCases"])),
                (date, "UK", "Deaths", _plain(d["TotalUKDeaths"])),
                (date, "England", "ConfirmedCases", _plain(d["EnglandCases"])),
                (date, "England", "Deaths", _plain(d["EnglandDeaths"])),
            ]
            c.executemany(
                "INSERT OR REPLACE INTO indicators VALUES (?, ?, ?, ?)",
                indicator_rows,
            )
    elif country == "England":
        item_id = "b684319181f94875a6879bbc833ca3a6"
        local_data_file = "data/raw/CountyUAs_cases_table-{}.csv".format(date)
        ret = download_arcgis_item(date, item_id, local_data_file, check_only)
        if check_only:
            return ret
        df = pd.read_csv(local_data_file)
        df["Date"] = date
        df["Country"] = "England"
        df = df.rename(columns={"GSS_CD": "AreaCode", "GSS_NM": "Area"})
        df = df[["Date", "Country", "AreaCode", "Area", "TotalCases"]]
        daily_areas = df.to_dict("split")["data"]
        for row in daily_areas:
            row[4] = normalize_int(normalize_whitespace(row[4]))
        daily_areas = [["Date", "Country", "AreaCode", "Area", "TotalCases"]
                       ] + daily_areas
        #save_daily_areas(date, country, daily_areas)
        save_daily_areas_to_sqlite(date, country, daily_areas)
def parse_daily_areas_pdf(date, country, local_pdf_file):
    """Extract per-district case counts from a Northern Ireland daily PDF.

    Scans each page for a table whose first header cell is
    "Local Government District" and returns [header, *rows] with rows of
    [Date, Country, AreaCode, Area, TotalCases]. Returns None when the
    country is not Northern Ireland or no matching table exists.
    """
    if country != "Northern Ireland":
        return None
    pdf = pdfplumber.open(local_pdf_file)
    for page in pdf.pages:
        try:
            extracted = page.extract_table()
            if extracted[0][0] == "Local Government District":
                rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
                for record in extracted[1:]:
                    # Ignore the aggregate "Total" row.
                    if record[0].lower() == "total":
                        continue
                    district = normalize_whitespace(titlecase(record[0]))
                    rows.append([
                        date,
                        country,
                        lookup_local_government_district_code(district),
                        district,
                        normalize_int(record[1]),
                    ])
                return rows
        except IndexError:
            pass  # no table on page
    return None
def get_text_from_html(html):
    """Return the visible text of an HTML document, whitespace-normalized."""
    parsed = BeautifulSoup(html, features="html.parser")
    return normalize_whitespace(parsed.get_text(separator=" "))
def get_text_from_pdf(local_pdf_file):
    """Return whitespace-normalized text from the first page of a PDF."""
    # Only the first page is needed by callers.
    first_page = pdfplumber.open(local_pdf_file).pages[0]
    return normalize_whitespace(first_page.extract_text())
def test_normalize_whitespace():
    # Leading/trailing whitespace is stripped; interior spacing preserved/collapsed.
    result = normalize_whitespace(" a b ")
    assert result == "a b"
def parse_tests(country, html):
    """Parse testing statistics from a country's HTML testing page.

    Extracts the main daily/total testing table, optional "Pillar 1/2"
    breakdown tables, and an optional pillar-2 routes breakdown table.
    Returns a dict of indicator name -> value (built on top of
    parse_totals_general's result), or None when the page layout does not
    match expectations (each mismatch prints a diagnostic first).
    """
    def is_testing_table(table):
        # The main testing table is identified by a "Tests" column header.
        headers = [th.text for th in table.findAll("th")]
        return "Tests" in headers

    soup = BeautifulSoup(html, features="html.parser")
    tables = soup.find_all("table")
    testing_tables = [table for table in tables if is_testing_table(table)]
    if len(testing_tables) == 0:
        print("Testing table not found")
        return None
    elif len(testing_tables) > 1:
        print("More than one testing table found")
        return None
    testing_table = testing_tables[0]
    table_rows = testing_table.findAll("tr")
    # Expected shape: header row + daily row + total row.
    if len(table_rows) != 3:
        print("Expecting 3 table rows")
        return None
    daily_row = [td.text for td in table_rows[1].findAll("td")]
    total_row = [td.text for td in table_rows[2].findAll("td")]
    text = get_text_from_html(html)
    # Pull the reporting date out of the "As of <time> on <date>," sentence.
    pattern_dict = {
        "Date": (r"As of (?P<Time>\d+\s*(am|pm)?) (on )?(?P<Date>.+?),",
                 date_value_parser_fn)
    }
    result = parse_totals_general(pattern_dict, country, text)
    # Column 0 is the row label; data starts at column 1.
    result["DailyTestsPerformed"] = normalize_int(daily_row[1])
    result["DailyPeopleTested"] = normalize_int_with_unavailable(daily_row[2])
    result["DailyPositive"] = normalize_int(daily_row[3])
    result["TotalTestsPerformed"] = normalize_int(total_row[1])
    result["TotalPeopleTested"] = normalize_int_with_unavailable(total_row[2])
    result["TotalPositive"] = normalize_int(total_row[3])

    def is_pillar_table(table):
        # Pillar tables are identified by a "Pillar 1" column header.
        headers = [th.text for th in table.findAll("th")]
        return "Pillar 1" in headers

    pillar_tables = [table for table in tables if is_pillar_table(table)]
    if len(pillar_tables) == 0:
        # no pillar tables
        return result
    elif len(pillar_tables) != 2:
        print("Expecting two pillar tables (daily and cumulative)")
        return None
    for table_num, pillar_table in enumerate(pillar_tables):
        # First pillar table holds daily figures, second cumulative totals.
        daily_or_total = "Daily" if table_num == 0 else "Total"
        table_rows = pillar_table.findAll("tr")
        if len(table_rows) != 4:
            print("Expecting 4 table rows")
            return None
        # Walk the header cells; for each "Pillar N" column, read that
        # column's value out of every subsequent data row.
        for i, col in enumerate(table_rows[0].findAll(re.compile("th|td"))):
            if col.text.startswith("Pillar"):
                pillar = remove_whitespace(col.text)
                for row in table_rows[1:]:
                    test_stat = normalize_whitespace(row.findAll("td")[0].text)
                    if test_stat == "Tests":
                        test_stat = "TestsPerformed"
                    elif test_stat == "People tested":
                        test_stat = "PeopleTested"
                    # e.g. "DailyPillar1TestsPerformed"
                    indicator = "{}{}{}".format(daily_or_total, pillar,
                                                test_stat)
                    str_val = row.findAll("td")[i].text
                    # "-" marks a cell with no figure published.
                    val = "" if str_val == "-" else normalize_int_with_unavailable(
                        str_val)
                    result[indicator] = val

    def is_pillar2_breakdown_table(table):
        # Identified by a header starting with "In-person".
        headers = [th.text for th in table.findAll("th")]
        return any([header.startswith("In-person") for header in headers])

    pillar2_breakdown_tables = [
        table for table in tables if is_pillar2_breakdown_table(table)
    ]
    if len(pillar2_breakdown_tables) == 0:
        # no pillar 2 breakdown table
        return result
    elif len(pillar2_breakdown_tables) > 1:
        print("More than one pillar 2 breakdown table found")
        return None
    pillar2_breakdown_table = pillar2_breakdown_tables[0]
    table_rows = pillar2_breakdown_table.findAll("tr")
    if len(table_rows) not in (3, 4):
        print("Expecting 3 (or 4) table rows in pillar 2 breakdown table")
        return None
    daily_row = [td.text for td in table_rows[1].findAll("td")]
    total_row = [td.text for td in table_rows[2].findAll("td")]
    result["DailyPillar2InPersonRoutes"] = normalize_int(daily_row[1])
    result["DailyPillar2DeliveryRoutes"] = normalize_int(daily_row[2])
    result["TotalPillar2InPersonRoutes"] = normalize_int(total_row[1])
    result["TotalPillar2DeliveryRoutes"] = normalize_int(total_row[2])
    return result
from util import normalize_whitespace, lookup_health_board_code

# Script: convert a saved daily-areas HTML page into a CSV.
# Usage: <script> <html_file> <csv_file>
# The HTML filename must look like "...-<country>-<YYYY-MM-DD>.html".
html_file = sys.argv[1]
csv_file = sys.argv[2]

# Raw string: "\d" in a non-raw literal is an invalid escape sequence
# (SyntaxWarning on modern Python).
m = re.match(r".+-(.+)-(\d{4}-\d{2}-\d{2})\.html", html_file)
country = m.group(1).title()
date = m.group(2)

# Use a context manager so the file handle is closed promptly.
with open(html_file) as f:
    html = f.read()

soup = BeautifulSoup(html, features="html.parser")
# The per-area table is the last one on the page.
table = soup.find_all("table")[-1]
output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
for table_row in table.findAll("tr"):
    columns = [
        normalize_whitespace(col.text) for col in table_row.findAll("td")
    ]
    if len(columns) == 0:
        continue
    area = columns[0].replace("Ayrshire & Arran", "Ayrshire and Arran")
    area_code = lookup_health_board_code(area)
    cases = columns[1]
    output_row = [date, country, area_code, area, cases]
    output_rows.append(output_row)

# newline="" per the csv module docs (prevents blank rows on Windows).
with open(csv_file, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(output_rows)
html = open(html_file).read() soup = BeautifulSoup(html, features="html.parser") text = soup.get_text() text = text.replace(u"\xa0", u" ") # replace non-breaking spaces with regular spaces patterns = [uk_pattern, wales_pattern, scotland_pattern, ni_pattern] for pattern in patterns: m = re.search(pattern, text) if m is not None: groups = m.groupdict() date = dateparser.parse(groups["date"]).strftime("%Y-%m-%d") country = normalize_whitespace(groups.get("country")).replace( "Scottish", "Scotland") tests = normalize_int(groups.get("tests", float("nan"))) positive_tests = normalize_int(groups["positive_tests"]) negative_tests = normalize_int( groups.get("negative_tests", float("nan"))) deaths = normalize_int(groups.get("deaths", float("nan"))) if not math.isnan(tests): print("{},{},{},{}".format(date, country, "Tests", tests)) # with open( # "data/daily/indicators/covid-19-{}-{}-tests.csv".format( # date, format_country(country) # ), # "w", # ) as f: # f.write("{},{},{},{}\n".format(date, country, "Tests", tests)) if not math.isnan(positive_tests):