Example #1
0
def parse_daily_areas(date, country, html):
    """Parse per-area cumulative case counts from a daily HTML report page.

    Args:
        date: ISO date string ("YYYY-MM-DD") the page was published for.
        country: "Scotland", "Wales", "Northern Ireland" or "UK".
        html: raw HTML of the daily report page.

    Returns:
        A list of rows, header first, each row being
        [Date, Country, AreaCode, Area, TotalCases]; or None when no
        per-area data is available for this country/date.
    """
    if country in ("Northern Ireland", "UK"):
        return None
    soup = BeautifulSoup(html, features="html.parser")
    output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    if country == "Scotland":
        table = soup.find_all("table")[0]
        for table_row in table.findAll("tr"):
            columns = [
                normalize_whitespace(col.text) for col in table_row.findAll("td")
            ]
            if len(columns) == 0:
                continue
            if columns[0].lower() in ("", "health board"):
                continue
            # Normalize board names so lookup_health_board_code matches.
            # BUG FIX: the second replace() was previously applied to
            # columns[0], silently discarding the "Ayrshire & Arran"
            # normalization; chain both replacements instead.
            area = (
                columns[0]
                .replace("Ayrshire & Arran", "Ayrshire and Arran")
                .replace("Eileanan Siar (Western Isles)", "Western Isles")
            )
            area_code = lookup_health_board_code(area)
            cases = columns[1]
            if cases == "*":  # means 5 or fewer cases (count suppressed)
                cases = "NaN"
            else:
                cases = cases.replace("*", "").replace(",", "")
            output_row = [date, country, area_code, area, cases]
            output_rows.append(output_row)
        return output_rows
    elif country == "Wales":
        if date >= "2020-04-08":
            # daily areas no longer published on the HTML page (now published on the dashboard)
            return None
        table = soup.find_all("table")[0]
        for table_row in table.findAll("tr"):
            columns = [
                normalize_whitespace(col.text) for col in table_row.findAll("td")
            ]
            if len(columns) == 0:
                continue
            # Skip header and aggregate rows.
            if columns[0].lower() in ("", "health board", "wales", "total", "wales total"):
                continue
            if is_blank(columns[-1]):
                continue
            # Strip council boilerplate so names match the lookup table.
            area = (
                columns[0]
                .replace("City and County of Swansea", "Swansea")
                .replace("City of Cardiff", "Cardiff")
                .replace("Newport City", "Newport")
                .replace("County Borough Council", "")
                .replace("County Council", "")
                .replace("Council", "")
                .replace("Cardiff & Vale", "Cardiff and Vale")
                .replace("Cwm Taf Morgannwg", "Cwm Taf")
                .strip()
            )
            if is_blank(area):
                # Everything was stripped; fall back to the raw cell text.
                area = columns[0]
            cases = columns[-1].replace("*","").replace(",", "")
            output_row = [date, country, lookup_health_board_code(area), area, cases]
            output_rows.append(output_row)
        return output_rows
    return None
Example #2
0
def parse_daily_areas(date, country, html):
    """Extract per-area cumulative case rows from a daily HTML report.

    Returns a list of rows headed by
    ["Date", "Country", "AreaCode", "Area", "TotalCases"], or None for
    countries without a parseable per-area table.
    """
    if country in ("Northern Ireland", "UK"):
        return None

    soup = BeautifulSoup(html, features="html.parser")
    rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]

    def cell_texts(tr):
        # Normalized text of every <td> cell in a table row.
        return [normalize_whitespace(td.text) for td in tr.findAll("td")]

    if country == "Scotland":
        # The health-board table is the last one on the page.
        board_table = soup.find_all("table")[-1]
        for tr in board_table.findAll("tr"):
            cells = cell_texts(tr)
            if not cells:
                continue
            if cells[0].lower() in ("", "health board"):
                continue
            board = cells[0].replace("Ayrshire & Arran", "Ayrshire and Arran")
            count = cells[1].replace("*", "")
            rows.append([date, country, lookup_health_board_code(board), board, count])
        return rows

    if country == "Wales":
        area_table = soup.find_all("table")[0]
        skip_labels = ("", "health board", "wales", "total", "wales total")
        for tr in area_table.findAll("tr"):
            cells = cell_texts(tr)
            if not cells:
                continue
            # Skip header and aggregate rows.
            if cells[0].lower() in skip_labels:
                continue
            if is_blank(cells[-1]):
                continue
            # Strip council boilerplate so names match the lookup table.
            name = (
                cells[0]
                .replace("City and County of Swansea", "Swansea")
                .replace("City of Cardiff", "Cardiff")
                .replace("Newport City", "Newport")
                .replace("County Borough Council", "")
                .replace("County Council", "")
                .replace("Council", "")
                .strip()
            )
            if is_blank(name):
                # Everything was stripped; fall back to the raw cell text.
                name = cells[0]
            rows.append([date, country, lookup_health_board_code(name), name, cells[-1]])
        return rows

    return None
Example #3
0
def convert_wales_la_to_hb(date, country, rows):
    """Roll Welsh local-authority (LA) case rows up to health boards (HBs).

    `rows` are [Date, Country, AreaCode, Area, TotalCases] entries keyed by
    local authority; the result has the same shape keyed by health board,
    with a fresh header row first. Rows whose AreaCode is empty (unknown /
    outside Wales) are passed through unchanged at the end.
    """
    result = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]

    def la_cases(name):
        # Cumulative case count for a single LA (first matching row).
        return [r[4] for r in rows if r[3] == name][0]

    board_members = {
        "Aneurin Bevan": ["Blaenau Gwent", "Caerphilly", "Monmouthshire",
                          "Newport", "Torfaen"],
        "Betsi Cadwaladr": ["Conwy", "Denbighshire", "Flintshire", "Gwynedd",
                            "Isle of Anglesey", "Wrexham"],
        "Cardiff and Vale": ["Cardiff", "Vale of Glamorgan"],
        "Cwm Taf": ["Bridgend", "Merthyr Tydfil", "Rhondda Cynon Taf"],
        "Hywel Dda": ["Carmarthenshire", "Ceredigion", "Pembrokeshire"],
        "Powys": ["Powys"],
        "Swansea Bay": ["Neath Port Talbot", "Swansea"],
    }

    for board, members in board_members.items():
        total = sum(la_cases(member) for member in members)
        result.append(
            [date, country, lookup_health_board_code(board), board, total])

    # Append rows that could not be mapped to an area code
    # (unknown / outside Wales etc).
    result.extend(r for r in rows if r[2] == "")

    return result
def crawl_phw(use_local=False):
    """Crawl Public Health Wales 'Rapid COVID-19 surveillance data'.

    Saves Wales-level Tests / ConfirmedCases / Deaths indicator series and
    per-health-board cumulative case counts to sqlite.

    Args:
        use_local: read the previously downloaded workbook from data/raw
            instead of fetching it from the NPHS server.
    """
    if use_local:
        file = "data/raw/phw/Rapid COVID-19 surveillance data.xlsx"
    else:
        file = (
            "http://www2.nphs.wales.nhs.uk:8080/CommunitySurveillanceDocs.nsf/"
            "3dc04669c9e1eaa880257062003b246b/77fdb9a33544aee88025855100300cab/"
            "$FILE/Rapid%20COVID-19%20surveillance%20data.xlsx"
        )

    tests_df = pd.read_excel(file, sheet_name="Tests by specimen date")
    tests_df["Date"] = (
        tests_df["Specimen date"].apply(lambda x: x.strftime('%Y-%m-%d')).astype(str)
    )
    tests_df.rename(
        columns={"Cumulative testing episodes": "Tests",
                 "Cumulative cases": "ConfirmedCases"},
        inplace=True,
    )

    save_indicators_df_to_sqlite(
        tests_df.groupby("Date", as_index=False)[["Tests"]].sum(),
        "Wales", "Tests")
    save_indicators_df_to_sqlite(
        tests_df.groupby("Date", as_index=False)[["ConfirmedCases"]].sum(),
        "Wales", "ConfirmedCases")

    def lookup_hb(la):
        # Map an LA to its health board; fall back to the LA name itself.
        board = la_to_hb(la)
        return la if board is None else board

    tests_df.rename(columns={"ConfirmedCases": "TotalCases"}, inplace=True)
    tests_df["Area"] = tests_df["Local Authority"].apply(lookup_hb)
    area_cases = tests_df.groupby(["Date", "Area"], as_index=False)[["TotalCases"]].sum()
    area_cases["AreaCode"] = area_cases["Area"].apply(lookup_health_board_code)
    area_cases["Country"] = "Wales"
    area_cases = area_cases[["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    save_cases_df_to_sqlite(area_cases, "Wales")

    deaths_df = pd.read_excel(file, sheet_name="Deaths by date")
    deaths_df["Date"] = (
        deaths_df["Date of death"].apply(lambda x: x.strftime('%Y-%m-%d')).astype(str)
    )
    deaths_df.rename(columns={"Cumulative deaths": "Deaths"}, inplace=True)
    save_indicators_df_to_sqlite(deaths_df, "Wales", "Deaths")
Example #5
0
def crawl_phs(use_local=False):
    """Crawl Public Health Scotland daily publication workbooks.

    Saves Scotland-level Tests / ConfirmedCases / Deaths indicator series
    and per-health-board cumulative case counts to sqlite.

    Args:
        use_local: read previously downloaded workbooks from data/raw
            instead of resolving the current publication URLs.
    """
    # URLs are only needed (and only resolved) for the remote path.
    urls = None if use_local else get_phs_xlsx_urls()

    if use_local:
        totals_file = "data/raw/phs/HSCA+-+SG+Website+-+Indicator+Trends+for+daily+data+publication.xlsx"
    else:
        totals_file = urls["totals"]

    testing = pd.read_excel(totals_file, sheet_name="Table 5 - Testing", skiprows=3)
    testing.rename(columns={"Unnamed: 0": "Date"}, inplace=True)
    testing["Date"] = testing["Date"].apply(lambda x: x.strftime('%Y-%m-%d')).astype(str)
    testing = testing[["Date", "Positive", "Total"]]
    testing.rename(columns={"Total": "Tests", "Positive": "ConfirmedCases"},
                   inplace=True)
    save_indicators_df_to_sqlite(testing, "Scotland", "Tests")
    save_indicators_df_to_sqlite(testing, "Scotland", "ConfirmedCases")

    deaths = pd.read_excel(totals_file, sheet_name="Table 8 - Deaths", skiprows=2)
    deaths.rename(
        columns={"Number of COVID-19 confirmed deaths registered to date": "Deaths"},
        inplace=True)
    deaths["Date"] = deaths["Date"].apply(lambda x: x.strftime('%Y-%m-%d')).astype(str)
    save_indicators_df_to_sqlite(deaths, "Scotland", "Deaths")

    if use_local:
        areas_file = "data/raw/phs/Board-level+figures+-+FOR+ONLINE+PUBLICATION.xlsx"
    else:
        areas_file = urls["areas"]

    boards = pd.read_excel(areas_file,
                           sheet_name="Table 1 - Cumulative cases",
                           skiprows=2)
    boards["Date"] = boards["Date"].apply(lambda x: x.strftime('%Y-%m-%d')).astype(str)
    boards = boards.drop(columns=['Scotland'])  # drop the national total column
    boards = boards[[c for c in boards.columns if not c.startswith('Unnamed')]]
    area_cases = boards.melt(id_vars=["Date"],
                             var_name="Area",
                             value_name="TotalCases")
    # "*" marks suppressed small counts in the published workbook.
    area_cases = area_cases.replace("*", "NaN")
    area_cases["Area"] = area_cases["Area"].apply(
        lambda hb: hb.replace("NHS", "").replace("&", "and").strip())
    area_cases["AreaCode"] = area_cases["Area"].apply(lookup_health_board_code)
    area_cases["Country"] = "Scotland"
    area_cases = area_cases[["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    save_cases_df_to_sqlite(area_cases, "Scotland")
Example #6
0
def test_lookup_health_board_code():
    """Spot-check the health-board name -> code lookup."""
    # A known Scottish health board resolves to its S08 area code.
    assert lookup_health_board_code("Fife") == "S08000029"
    # Unknown names return an empty string rather than raising.
    assert lookup_health_board_code("Bogus") == ""
Example #7
0
from util import normalize_whitespace, lookup_health_board_code

# Usage: <script> <html_file> <csv_file>
# The HTML filename encodes country and date, e.g. "...-scotland-2020-03-21.html".
html_file = sys.argv[1]
csv_file = sys.argv[2]

# FIX: raw string — "\d" in a plain string literal is an invalid escape
# sequence (SyntaxWarning on Python 3.12+, error in future versions).
m = re.match(r".+-(.+)-(\d{4}-\d{2}-\d{2})\.html", html_file)
country = m.group(1).title()
date = m.group(2)

# FIX: close the input file deterministically instead of relying on GC.
with open(html_file) as f:
    html = f.read()
soup = BeautifulSoup(html, features="html.parser")
table = soup.find_all("table")[-1]

output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
for table_row in table.findAll("tr"):
    columns = [
        normalize_whitespace(col.text) for col in table_row.findAll("td")
    ]
    if len(columns) == 0:
        continue
    # Normalize the board name so lookup_health_board_code matches.
    area = columns[0].replace("Ayrshire & Arran", "Ayrshire and Arran")
    area_code = lookup_health_board_code(area)
    cases = columns[1]
    output_row = [date, country, area_code, area, cases]
    output_rows.append(output_row)

with open(csv_file, "w") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(output_rows)
# NOTE(review): `groups`, `table`, `country` and `csv_file` are defined
# earlier in the file, outside this excerpt.
date = dateparser.parse(groups["date"]).strftime("%Y-%m-%d")

output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
for table_row in table.findAll("tr"):
    # Normalized text of each <td> cell in this row.
    columns = [
        normalize_whitespace(col.text) for col in table_row.findAll("td")
    ]
    if len(columns) == 0:
        continue
    # Skip header and aggregate rows.
    if columns[0] == "Health Board" or columns[0] == "Wales" or columns[
            0] == "TOTAL":
        continue
    # Skip rows with no case count in the third column.
    if is_blank(columns[2]):
        continue
    # Strip council boilerplate so names match the lookup table.
    area = (columns[0].replace(
        "City and County of Swansea",
        "Swansea").replace("City of Cardiff", "Cardiff").replace(
            "Newport City",
            "Newport").replace("County Borough Council",
                               "").replace("County Council",
                                           "").replace("Council", "").strip())
    if is_blank(area):
        # Everything was stripped; fall back to the raw cell text.
        area = columns[0]
    cases = columns[2]
    output_row = [date, country, lookup_health_board_code(area), area, cases]
    output_rows.append(output_row)

with open(csv_file, "w") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(output_rows)