Example #1
def parse_daily_areas_pdf(date, country, local_pdf_file):
    if country == "Northern Ireland":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table()
                if table[0][0] == "Local Government District":
                    output_rows = [[
                        "Date", "Country", "AreaCode", "Area", "TotalCases"
                    ]]
                    for table_row in table[1:]:
                        if table_row[0].lower() == "total":
                            continue
                        area = normalize_whitespace(titlecase(table_row[0]))
                        area = area.replace("Ards and North Down",
                                            "North Down and Ards")
                        area_code = lookup_local_government_district_code(area)
                        cases = normalize_int(table_row[1])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    return output_rows
            except IndexError:
                pass  # no table on page
    elif country == "Wales":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table(
                    table_settings={
                        # use text alignment since the table doesn't have lines
                        "vertical_strategy": "text",
                        "horizontal_strategy": "text"
                    })
                found_start = False
                output_rows = [[
                    "Date", "Country", "AreaCode", "Area", "TotalCases"
                ]]
                for table_row in table:
                    if table_row[0] is not None and table_row[0].startswith(
                            "Aneurin"):
                        found_start = True
                    if found_start:
                        area = (
                            normalize_whitespace(table_row[2])
                            .replace("Anglesey", "Isle of Anglesey")
                            .replace("ﬀ", "ff")  # fix ligatures
                            .replace("ﬁ", "fi"))
                        if area.startswith("Wales total"):
                            continue
                        area_code = lookup_local_authority_code(area)
                        cases = normalize_int(table_row[4])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    if table_row[2] is not None and normalize_whitespace(
                            table_row[2]) == 'Resident outside Wales':
                        break
                return convert_wales_la_to_hb(date, country, output_rows)
            except IndexError:
                pass  # no table on page
    return None
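The function above returns the parsed rows with a header row first, or None when no page contains a recognisable table. A minimal usage sketch (the date and PDF path are placeholders, and the helper functions used above are assumed to be importable from the surrounding project):

import csv

rows = parse_daily_areas_pdf("2020-04-25", "Northern Ireland", "data/raw/ni-daily-areas.pdf")
if rows is not None:
    # rows[0] is the header: ["Date", "Country", "AreaCode", "Area", "TotalCases"]
    with open("ni-daily-areas.csv", "w", newline="") as f:
        csv.writer(f).writerows(rows)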
Example #2
def crawl_ni(use_local=False):
    headers = {"X-PowerBI-ResourceKey": "df16636e-99fe-4801-a5a1-20466a39f7bf"}

    request_json = read_json("data/raw/ni/request-cumulative-tests.json")
    if use_local:
        file = "data/raw/ni/response-cumulative-tests.json"
    else:
        file = "https://wabi-north-europe-api.analysis.windows.net/public/reports/querydata?synchronous=true"

    json_data = read_json_post(file, headers, request_json)
    tests = json_data["results"][0]["result"]["data"]["dsr"]["DS"][0]["PH"][0]["DM0"]
    tests = {datetime.datetime.fromtimestamp(elt["C"][0] / 1000).strftime('%Y-%m-%d'): elt["C"][1:] for elt in tests}
    df = pd.DataFrame.from_dict(tests, orient='index', columns=["Tests", "ConfirmedCases"])
    df["Date"] = df.index
    df = df.fillna(method="ffill") # fill missing values from previous
    save_indicators_df_to_sqlite(df, "Northern Ireland", "Tests")
    save_indicators_df_to_sqlite(df, "Northern Ireland", "ConfirmedCases")

    request_json = read_json("data/raw/ni/request-cumulative-deaths.json")
    if use_local:
        file = "data/raw/ni/response-cumulative-deaths.json"
    else:
        file = "https://wabi-north-europe-api.analysis.windows.net/public/reports/querydata?synchronous=true"

    json_data = read_json_post(file, headers, request_json)
    deaths = json_data["results"][0]["result"]["data"]["dsr"]["DS"][0]["PH"][0]["DM0"]
    deaths_dict = {}
    for idx, elt in enumerate(deaths):
        date = datetime.datetime.fromtimestamp(elt["C"][0] / 1000).strftime('%Y-%m-%d') 
        if len(elt["C"]) == 1 and elt.get("R", None) == 2: # R means repeat?
            # use previous
            value = [deaths[idx - 1]["C"][1]]
        else:
            value = [elt["C"][1]]
        deaths_dict[date] = value
    df = pd.DataFrame.from_dict(deaths_dict, orient='index', columns=["Deaths"])
    df["Date"] = df.index
    save_indicators_df_to_sqlite(df, "Northern Ireland", "Deaths")

    request_json = read_json("data/raw/ni/request-area-cases.json")
    if use_local:
        file = "data/raw/ni/response-area-cases.json"
    else:
        file = "https://wabi-north-europe-api.analysis.windows.net/public/reports/querydata?synchronous=true"

    json_data = read_json_post(file, headers, request_json)
    area_cases = json_data["results"][0]["result"]["data"]["dsr"]["DS"][0]["PH"][1]["DM1"]
    area_cases = {elt["C"][0]: [elt["C"][2]] for elt in area_cases}
    df = pd.DataFrame.from_dict(area_cases, orient='index', columns=["TotalCases"])
    df["Area"] = df.index
    df["AreaCode"] = df["Area"].apply(lambda lgd: lookup_local_government_district_code(lgd))
    df["Country"] = "Northern Ireland"
    df["Date"] = json_data["results"][0]["result"]["data"]["timestamp"].split("T")[0]
    df = df[["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    save_cases_df_to_sqlite(df, "Northern Ireland", delete_old=False)
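crawl_ni issues three Power BI queries (cumulative tests and confirmed cases, cumulative deaths, and cases by Local Government District) and writes the results to SQLite via the save_*_to_sqlite helpers. With use_local=True it replays the saved response files under data/raw/ni instead of POSTing to the live endpoint, which is useful for offline testing. A minimal invocation sketch, assuming those files are present:

if __name__ == "__main__":
    # Replay the saved Power BI responses rather than hitting the live endpoint.
    crawl_ni(use_local=True)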
Example #3
def parse_daily_areas_pdf(date, country, local_pdf_file):
    if country == "Northern Ireland":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table()
                if table[0][0] == "Local Government District":
                    output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
                    for table_row in table[1:]:
                        if table_row[0].lower() == "total":
                            continue
                        area = normalize_whitespace(titlecase(table_row[0]))
                        area_code = lookup_local_government_district_code(area)
                        cases = normalize_int(table_row[1])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    return output_rows
            except IndexError:
                pass # no table on page
    return None
Example #4
def test_lookup_local_government_district_code():
    assert lookup_local_government_district_code("Antrim and Newtownabbey") == "N09000001"
    assert lookup_local_government_district_code("Bogus") == ""