Example #1
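The snippets below appear to come from the same crawling script at different revisions (Examples #4 and #5 are later, fuller versions of Examples #2 and #1). They use the standard-library modules os, sys, json and sqlite3 plus the third-party packages requests, dateparser, pandas and xmltodict; helpers such as get_text_from_pdf, parse_totals_pdf_text, print_totals, parse_daily_areas_json, parse_daily_areas_pdf and the save_* functions are project-local and defined elsewhere in the script.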
def crawl_pdf(date, country):
    if country == "Northern Ireland":
        dt = dateparser.parse(date)
        ym = dt.strftime('%Y-%m')
        dmy = dt.strftime('%d.%m.%y')
        pdf_url = "https://www.publichealth.hscni.net/sites/default/files/{}/COVID-19 Surveillance Bulletin {}.pdf".format(
            ym, dmy)
        local_pdf_file = "data/raw/Daily_bulletin_DoH_{}.pdf".format(date)

        if not os.path.exists(local_pdf_file):
            r = requests.get(pdf_url)
            with open(local_pdf_file, "wb") as f:
                f.write(r.content)

        text = get_text_from_pdf(local_pdf_file)
        results = parse_totals_pdf_text(country, text)

        if results is None:
            sys.stderr.write(
                "Can't find numbers. Perhaps the page format has changed?\n")
            sys.exit(1)
        elif results["Date"] != date:
            sys.stderr.write("Page is dated {}, but want {}\n".format(
                results["Date"], date))
            sys.exit(1)

        print_totals(results)
        #save_indicators(results)
        save_indicators_to_sqlite(results)

        daily_areas = parse_daily_areas_pdf(date, country, local_pdf_file)
        if daily_areas is not None:
            save_daily_areas(date, country, daily_areas)
            save_daily_areas_to_sqlite(date, country, daily_areas)
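get_text_from_pdf is one of those project-local helpers and is not shown. A minimal sketch of what it could look like, using the pdfplumber package (an assumption; the project may well use a different PDF library):

import pdfplumber

def get_text_from_pdf(local_pdf_file):
    # Concatenate the text of all pages; extract_text() can return None
    # for pages with no extractable text, hence the "or ''".
    with pdfplumber.open(local_pdf_file) as pdf:
        return "\n".join(page.extract_text() or "" for page in pdf.pages)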
Example #2
def crawl_json(date, country, check_only):
    if country == "UK":
        local_data_file = "data/raw/phe/coronavirus-covid-19-number-of-cases-in-{}-{}.json".format(
            format_country(country), date)

        if not os.path.exists(local_data_file):
            data_url = get_json_url(date)

            if data_url is None:
                if check_only:
                    return DatasetUpdate.UPDATE_NOT_AVAILABLE
                sys.stderr.write("No data available for {}\n".format(date))
                sys.exit(1)

            if check_only:
                return DatasetUpdate.UPDATE_AVAILABLE

            r = requests.get(data_url)
            with open(local_data_file, "w") as f:
                f.write(r.text)

        if check_only:
            return DatasetUpdate.ALREADY_UPDATED

        with open(local_data_file) as f:
            json_data = json.load(f)

            totalUKCases = json_data["overview"]["K02000001"]["totalCases"]["value"]
            totalUKDeaths = json_data["overview"]["K02000001"]["deaths"]["value"]
            englandCases = json_data["countries"]["E92000001"]["totalCases"]["value"]
            englandDeaths = json_data["countries"]["E92000001"]["deaths"]["value"]

            with sqlite3.connect('data/covid-19-uk.db') as conn:
                c = conn.cursor()
                c.execute(
                    f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'UK', 'ConfirmedCases', {totalUKCases})"
                )
                c.execute(
                    f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'UK', 'Deaths', {totalUKDeaths})"
                )
                c.execute(
                    f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'England', 'ConfirmedCases', {englandCases})"
                )
                c.execute(
                    f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'England', 'Deaths', {englandDeaths})"
                )

            # get area data for England
            daily_areas = parse_daily_areas_json(date, "England", json_data)
            if daily_areas is not None:
                #save_daily_areas(date, "England", daily_areas)
                save_daily_areas_to_sqlite(date, "England", daily_areas)
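DatasetUpdate is returned by the check_only paths but never defined in these snippets. Judging from the three members used (ALREADY_UPDATED, UPDATE_AVAILABLE, UPDATE_NOT_AVAILABLE), it is presumably a plain enum along these lines:

from enum import Enum, auto

class DatasetUpdate(Enum):
    # Tri-state result so a scheduler can poll cheaply with check_only=True
    # and only run the full crawl when something new is actually available.
    ALREADY_UPDATED = auto()
    UPDATE_AVAILABLE = auto()
    UPDATE_NOT_AVAILABLE = auto()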
Example #3
def crawl_arcgis(date, country, check_only):
    if country == "UK":
        item_id = "bc8ee90225644ef7a6f4dd1b13ea1d67"
        local_data_file = "data/raw/DailyIndicators-{}.xslx".format(date)
        ret = download_arcgis_item(date, item_id, local_data_file, check_only)
        if check_only:
            return ret

        df = pd.read_excel(local_data_file)
        print(df)

        d = df.to_dict("records")[0]
        date = d["DateVal"].strftime("%Y-%m-%d")

        with sqlite3.connect('data/covid-19-uk.db') as conn:
            c = conn.cursor()
            c.execute(
                f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'UK', 'ConfirmedCases', {d['TotalUKCases']})"
            )
            c.execute(
                f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'UK', 'Deaths', {d['TotalUKDeaths']})"
            )
            c.execute(
                f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'England', 'ConfirmedCases', {d['EnglandCases']})"
            )
            c.execute(
                f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'England', 'Deaths', {d['EnglandDeaths']})"
            )
            # c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'Scotland', 'ConfirmedCases', {d['ScotlandCases']})")
            # c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'Scotland', 'Deaths', {d['ScotlandDeaths']})")
            # c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'Wales', 'ConfirmedCases', {d['WalesCases']})")
            # c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'Wales', 'Deaths', {d['WalesDeaths']})")
            # c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'Northern Ireland', 'ConfirmedCases', {d['NICases']})")
            # c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'Northern Ireland', 'Deaths', {d['NIDeaths']})")

    elif country == "England":
        item_id = "b684319181f94875a6879bbc833ca3a6"
        local_data_file = "data/raw/CountyUAs_cases_table-{}.csv".format(date)
        ret = download_arcgis_item(date, item_id, local_data_file, check_only)
        if check_only:
            return ret

        df = pd.read_csv(local_data_file)
        df["Date"] = date
        df["Country"] = "England"
        df = df.rename(columns={"GSS_CD": "AreaCode", "GSS_NM": "Area"})
        df = df[["Date", "Country", "AreaCode", "Area", "TotalCases"]]
        daily_areas = df.to_dict("split")["data"]
        for row in daily_areas:
            row[4] = normalize_int(normalize_whitespace(row[4]))
        daily_areas = [["Date", "Country", "AreaCode", "Area", "TotalCases"]
                       ] + daily_areas

        #save_daily_areas(date, country, daily_areas)
        save_daily_areas_to_sqlite(date, country, daily_areas)
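normalize_whitespace and normalize_int are likewise not shown. Given that they are applied to the TotalCases column read from a CSV, plausible (hypothetical) implementations would be:

import re

def normalize_whitespace(value):
    # Collapse internal runs of whitespace and trim; pass non-strings through.
    return re.sub(r"\s+", " ", value).strip() if isinstance(value, str) else value

def normalize_int(value):
    # Accept "1,234"-style strings as well as already-numeric cells.
    return int(str(value).replace(",", ""))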
Example #4
def crawl_json(date, country, check_only):
    if country == "UK":
        # See https://github.com/PublicHealthEngland/coronavirus-dashboard
        blobs_url = "https://publicdashacc.blob.core.windows.net/publicdata?restype=container&comp=list"
        local_data_file = "data/raw/phe/coronavirus-covid-19-number-of-cases-in-{}-{}.json".format(
            format_country(country), date
        )
        
        if not os.path.exists(local_data_file):
            r = requests.get(blobs_url)
            blobs_xml = r.text
            blobs_dict = xmltodict.parse(blobs_xml)
            blob_names = sorted([o["Name"] for o in blobs_dict["EnumerationResults"]["Blobs"]["Blob"] if o["Name"]])
            dt = dateparser.parse(date, date_formats=['%Y-%m-%d'], locales=["en-GB"])
            blob_names_for_date = [name for name in blob_names if name.startswith("data_{}".format(dt.strftime('%Y%m%d')))]

            if len(blob_names_for_date) == 0:
                if check_only:
                    return DatasetUpdate.UPDATE_NOT_AVAILABLE
                sys.stderr.write("No data available for {}\n".format(date))
                sys.exit(1)         

            if check_only:
                return DatasetUpdate.UPDATE_AVAILABLE       

            # Use most recent date
            data_url = "https://c19pub.azureedge.net/{}".format(blob_names_for_date[-1])
            r = requests.get(data_url)
            with open(local_data_file, "w") as f:
                f.write(r.text)

        if check_only:
            return DatasetUpdate.ALREADY_UPDATED

        with open(local_data_file) as f:
            json_data = json.load(f)

            totalUKCases = json_data["overview"]["K02000001"]["totalCases"]["value"]
            totalUKDeaths = json_data["overview"]["K02000001"]["deaths"]["value"]
            englandCases = json_data["countries"]["E92000001"]["totalCases"]["value"]
            englandDeaths = json_data["countries"]["E92000001"]["deaths"]["value"]

            with sqlite3.connect('data/covid-19-uk.db') as conn:
                c = conn.cursor()
                c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'UK', 'ConfirmedCases', {totalUKCases})")
                c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'UK', 'Deaths', {totalUKDeaths})")
                c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'England', 'ConfirmedCases', {englandCases})")
                c.execute(f"INSERT OR REPLACE INTO indicators VALUES ('{date}', 'England', 'Deaths', {englandDeaths})")

            # get area data for England
            daily_areas = parse_daily_areas_json(date, "England", json_data)
            if daily_areas is not None:
                #save_daily_areas(date, "England", daily_areas)
                save_daily_areas_to_sqlite(date, "England", daily_areas)
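The INSERT statements in Examples #2, #3 and #4 splice values into the SQL string with f-strings. That works here because the values are trusted, but sqlite3's own parameter binding sidesteps quoting and escaping issues entirely; an equivalent form of the first insert (with placeholder values for illustration):

import sqlite3

date, totalUKCases = "2020-04-01", 0  # placeholder values

with sqlite3.connect('data/covid-19-uk.db') as conn:
    c = conn.cursor()
    # "?" markers are filled in by the driver, which handles quoting.
    c.execute(
        "INSERT OR REPLACE INTO indicators VALUES (?, ?, ?, ?)",
        (date, "UK", "ConfirmedCases", totalUKCases),
    )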
Example #5

def crawl_pdf(date, country, check_only):
    if country == "Northern Ireland":

        dt = dateparser.parse(date, date_formats=['%Y-%m-%d'], locales=["en-GB"])
        ym = dt.strftime('%Y-%m')
        dmy = dt.strftime('%d.%m.%y')
        # the top-level page containing links to PDFs
        html_url = "https://www.publichealth.hscni.net/publications/covid-19-surveillance-reports"
        # the PDF itself
        pdf_url = "https://www.publichealth.hscni.net/sites/default/files/{}/COVID-19 Surveillance Bulletin {}.pdf".format(ym, dmy)
        local_pdf_file = "data/raw/Daily_bulletin_DoH_{}.pdf".format(date)

        if not os.path.exists(local_pdf_file):
            r = requests.get(html_url)
            if "{}.pdf".format(dmy) not in r.text:
                if check_only:
                    return DatasetUpdate.UPDATE_NOT_AVAILABLE
                sys.stderr.write("Page is dated ?, but want {}\n".format(date))
                sys.exit(1)

            if check_only:
                return DatasetUpdate.UPDATE_AVAILABLE

            r = requests.get(pdf_url)
            with open(local_pdf_file, "wb") as f:
                f.write(r.content)

        if check_only:
            return DatasetUpdate.ALREADY_UPDATED

        text = get_text_from_pdf(local_pdf_file)
        results = parse_totals_pdf_text(country, text)

        if results is None:
            sys.stderr.write("Can't find numbers. Perhaps the page format has changed?\n")
            sys.exit(1)
        elif results["Date"] != date:
            sys.stderr.write("Page is dated {}, but want {}\n".format(results["Date"], date))
            sys.exit(1)

        print_totals(results)
        #save_indicators(results)
        save_indicators_to_sqlite(results)

        daily_areas = parse_daily_areas_pdf(date, country, local_pdf_file)
        if daily_areas is not None:
            save_daily_areas(date, country, daily_areas)
            save_daily_areas_to_sqlite(date, country, daily_areas)
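The check_only flag turns each crawler into a cheap probe. A hypothetical driver (date and call pattern assumed, not taken from the project) could first ask whether the bulletin exists before committing to the download and parse:

status = crawl_pdf("2020-04-01", "Northern Ireland", check_only=True)
if status == DatasetUpdate.UPDATE_AVAILABLE:
    crawl_pdf("2020-04-01", "Northern Ireland", check_only=False)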
Example #6
def crawl_html(date, country, check_only):
    html_url = get_html_url(date, country)
    local_html_file = "data/raw/coronavirus-covid-19-number-of-cases-in-{}-{}.html".format(
        format_country(country), date)
    save_html_file = False

    try:
        with open(local_html_file) as f:
            html = f.read()
        if check_only:
            return DatasetUpdate.ALREADY_UPDATED
    except FileNotFoundError:
        r = requests.get(html_url)
        html = r.text
        save_html_file = True

    results = parse_totals(country, html)

    if results is None:
        if check_only:
            return DatasetUpdate.UPDATE_AVAILABLE
        sys.stderr.write(
            "Can't find numbers. Perhaps the page format has changed?\n")
        sys.exit(1)
    elif results["Date"] != date:
        if check_only:
            return DatasetUpdate.UPDATE_NOT_AVAILABLE
        sys.stderr.write("Page is dated {}, but want {}\n".format(
            results["Date"], date))
        sys.exit(1)

    if check_only:
        return DatasetUpdate.UPDATE_AVAILABLE

    daily_areas = parse_daily_areas(date, country, html)

    print_totals(results)
    #save_indicators(results)
    save_indicators_to_sqlite(results)

    if daily_areas is not None:
        #save_daily_areas(date, country, daily_areas)
        save_daily_areas_to_sqlite(date, country, daily_areas)

    if save_html_file:
        with open(local_html_file, "w") as f:
            f.write(html)
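format_country and get_html_url are again project-local. Judging by the file names built in Examples #2, #4 and #6, format_country presumably lowercases and hyphenates the country name; a hypothetical sketch:

def format_country(country):
    # Assumed behaviour, inferred from the local file names:
    # "Northern Ireland" -> "northern-ireland"
    return country.lower().replace(" ", "-")

Note also that crawl_html only writes the fetched HTML to disk after the totals have been parsed and the date verified, so pages that fail validation are never cached locally.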