def parse_daily_areas_pdf(date, country, local_pdf_file):
    """Extract per-area case counts from a daily situation PDF.

    Supported countries:
      * "Northern Ireland" — looks for a table headed "Local Government
        District" and returns [header, *rows] with one row per district.
      * "Wales" — parses the per-local-authority table and converts it to
        health-board rows via convert_wales_la_to_hb.
    Returns None when the country is unsupported or no matching table is
    found on any page.

    Row format: [Date, Country, AreaCode, Area, TotalCases].
    """
    if country == "Northern Ireland":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table()
                # Identify the right table by its first header cell.
                if table[0][0] == "Local Government District":
                    output_rows = [[
                        "Date", "Country", "AreaCode", "Area", "TotalCases"
                    ]]
                    for table_row in table[1:]:
                        # Skip the grand-total row; only per-district rows wanted.
                        if table_row[0].lower() == "total":
                            continue
                        area = normalize_whitespace(titlecase(table_row[0]))
                        # Normalize to the official district name ordering.
                        area = area.replace("Ards and North Down",
                                            "North Down and Ards")
                        area_code = lookup_local_government_district_code(area)
                        cases = normalize_int(table_row[1])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    return output_rows
            except IndexError:
                pass  # no table on page
    elif country == "Wales":
        pdf = pdfplumber.open(local_pdf_file)
        for page in pdf.pages:
            try:
                table = page.extract_table(
                    table_settings={
                        # use text alignment since the table doesn't have lines
                        "vertical_strategy": "text",
                        "horizontal_strategy": "text"
                    })
                found_start = False
                output_rows = [[
                    "Date", "Country", "AreaCode", "Area", "TotalCases"
                ]]
                for table_row in table:
                    # Data rows begin at the "Aneurin Bevan" health board row.
                    if table_row[0] is not None and table_row[0].startswith(
                            "Aneurin"):
                        found_start = True
                    if found_start:
                        # NOTE(review): the ligature .replace() calls below
                        # presumably map the PDF ligature glyphs (ﬀ/ﬁ) to the
                        # plain two-letter sequences; the collapsed rendering
                        # makes both arguments look identical — confirm the
                        # first argument of each is the ligature character.
                        area = (
                            normalize_whitespace(table_row[2]).replace(
                                "Anglesey", "Isle of Anglesey").replace(
                                    "ff", "ff")  # fix ligatures
                            .replace("fi", "fi"))
                        if area.startswith("Wales total"):
                            continue
                        area_code = lookup_local_authority_code(area)
                        cases = normalize_int(table_row[4])
                        output_row = [date, country, area_code, area, cases]
                        output_rows.append(output_row)
                    # Stop once the trailing non-Wales residents row is reached.
                    if table_row[2] is not None and normalize_whitespace(
                            table_row[2]) == 'Resident outside Wales':
                        break
                return convert_wales_la_to_hb(date, country, output_rows)
            except IndexError:
                pass  # no table on page
    return None
def parse_daily_areas(date, country, html):
    """Parse per-area case counts from a country's daily HTML page.

    Supported countries: "Scotland" and "Wales" (Northern Ireland and UK
    pages carry no per-area table and return None). Wales data after
    2020-04-07 moved to a dashboard, so those dates also return None.

    Returns [header, *rows] with rows of
    [Date, Country, AreaCode, Area, TotalCases], or None.
    """
    if country in ("Northern Ireland", "UK"):
        return None
    soup = BeautifulSoup(html, features="html.parser")
    output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
    if country == "Scotland":
        table = soup.find_all("table")[0]
        for table_row in table.findAll("tr"):
            columns = [
                normalize_whitespace(col.text)
                for col in table_row.findAll("td")
            ]
            if len(columns) == 0:
                continue
            if columns[0].lower() in ("", "health board"):
                continue
            area = columns[0].replace("Ayrshire & Arran",
                                      "Ayrshire and Arran")
            # BUG FIX: chain off `area`, not `columns[0]` — the original
            # second assignment silently discarded the Ayrshire & Arran
            # normalization performed on the previous line.
            area = area.replace("Eileanan Siar (Western Isles)",
                                "Western Isles")
            area_code = lookup_health_board_code(area)
            cases = columns[1]
            if cases == "*":  # means 5 or fewer cases
                cases = "NaN"
            else:
                cases = cases.replace("*", "").replace(",", "")
            output_row = [date, country, area_code, area, cases]
            output_rows.append(output_row)
        return output_rows
    elif country == "Wales":
        if date >= "2020-04-08":
            # daily areas no longer published on the HTML page
            # (now published on the dashboard)
            return None
        table = soup.find_all("table")[0]
        for table_row in table.findAll("tr"):
            columns = [
                normalize_whitespace(col.text)
                for col in table_row.findAll("td")
            ]
            if len(columns) == 0:
                continue
            if columns[0].lower() in ("", "health board", "wales", "total",
                                      "wales total"):
                continue
            if is_blank(columns[-1]):
                continue
            # Strip council suffixes and normalize name variants so the
            # health-board code lookup succeeds.
            area = (
                columns[0]
                .replace("City and County of Swansea", "Swansea")
                .replace("City of Cardiff", "Cardiff")
                .replace("Newport City", "Newport")
                .replace("County Borough Council", "")
                .replace("County Council", "")
                .replace("Council", "")
                .replace("Cardiff & Vale", "Cardiff and Vale")
                .replace("Cwm Taf Morgannwg", "Cwm Taf")
                .strip()
            )
            if is_blank(area):
                # Everything was stripped; fall back to the raw cell text.
                area = columns[0]
            cases = columns[-1].replace("*", "").replace(",", "")
            output_row = [date, country, lookup_health_board_code(area),
                          area, cases]
            output_rows.append(output_row)
        return output_rows
    return None
def parse_daily_areas(date, country, html):
    """Parse per-area case counts from a daily HTML page.

    Handles "Scotland" and "Wales"; returns None for Northern Ireland, the
    UK page, or any other country. Rows are
    [Date, Country, AreaCode, Area, TotalCases] following a header row.
    """
    if country in ("Northern Ireland", "UK"):
        return None

    soup = BeautifulSoup(html, features="html.parser")
    rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]

    def cell_texts(tr):
        # Normalized text of each <td> in a table row.
        return [normalize_whitespace(td.text) for td in tr.findAll("td")]

    if country == "Scotland":
        # The per-health-board table is the last one on the page.
        board_table = soup.find_all("table")[-1]
        for tr in board_table.findAll("tr"):
            cells = cell_texts(tr)
            if not cells:
                continue
            if cells[0].lower() in ("", "health board"):
                continue
            board = cells[0].replace("Ayrshire & Arran", "Ayrshire and Arran")
            count = cells[1].replace("*", "")
            rows.append(
                [date, country, lookup_health_board_code(board), board, count])
        return rows

    if country == "Wales":
        la_table = soup.find_all("table")[0]
        for tr in la_table.findAll("tr"):
            cells = cell_texts(tr)
            if not cells:
                continue
            if cells[0].lower() in ("", "health board", "wales", "total",
                                    "wales total"):
                continue
            if is_blank(cells[-1]):
                continue
            # Drop council suffixes / city prefixes to match lookup names.
            cleaned = (cells[0]
                       .replace("City and County of Swansea", "Swansea")
                       .replace("City of Cardiff", "Cardiff")
                       .replace("Newport City", "Newport")
                       .replace("County Borough Council", "")
                       .replace("County Council", "")
                       .replace("Council", "")
                       .strip())
            area = cells[0] if is_blank(cleaned) else cleaned
            rows.append(
                [date, country, lookup_health_board_code(area), area,
                 cells[-1]])
        return rows

    return None
def crawl_arcgis(date, country, check_only):
    """Download and ingest ArcGIS-hosted data for a country.

    "UK": fetches the DailyIndicators spreadsheet and upserts national
    ConfirmedCases/Deaths indicator rows into the sqlite database.
    "England": fetches the per-area CSV and stores per-area totals.

    When check_only is truthy, returns the result of the download check
    without ingesting; otherwise returns None after ingesting.
    """
    def _plain(v):
        # pandas/numpy scalars are not directly bindable by sqlite3;
        # unwrap them to native Python values before parameter binding.
        return v.item() if hasattr(v, "item") else v

    if country == "UK":
        item_id = "bc8ee90225644ef7a6f4dd1b13ea1d67"
        local_data_file = "data/raw/DailyIndicators-{}.xslx".format(date)
        ret = download_arcgis_item(date, item_id, local_data_file, check_only)
        if check_only:
            return ret
        df = pd.read_excel(local_data_file)
        print(df)
        d = df.to_dict("records")[0]
        # Trust the date embedded in the spreadsheet over the argument.
        date = d["DateVal"].strftime("%Y-%m-%d")
        with sqlite3.connect('data/covid-19-uk.db') as conn:
            c = conn.cursor()
            # Parameterized statements instead of f-string-built SQL:
            # avoids SQL injection/breakage from unexpected cell values.
            # Scotland/Wales/NI indicator inserts were deliberately
            # disabled in the original source and stay omitted here.
            indicator_rows = [
                (date, "UK", "ConfirmedCases", _plain(d["TotalUKCases"])),
                (date, "UK", "Deaths", _plain(d["TotalUKDeaths"])),
                (date, "England", "ConfirmedCases", _plain(d["EnglandCases"])),
                (date, "England", "Deaths", _plain(d["EnglandDeaths"])),
            ]
            c.executemany(
                "INSERT OR REPLACE INTO indicators VALUES (?, ?, ?, ?)",
                indicator_rows,
            )
    elif country == "England":
        item_id = "b684319181f94875a6879bbc833ca3a6"
        local_data_file = "data/raw/CountyUAs_cases_table-{}.csv".format(date)
        ret = download_arcgis_item(date, item_id, local_data_file, check_only)
        if check_only:
            return ret
        df = pd.read_csv(local_data_file)
        df["Date"] = date
        df["Country"] = "England"
        df = df.rename(columns={"GSS_CD": "AreaCode", "GSS_NM": "Area"})
        df = df[["Date", "Country", "AreaCode", "Area", "TotalCases"]]
        daily_areas = df.to_dict("split")["data"]
        for row in daily_areas:
            row[4] = normalize_int(normalize_whitespace(row[4]))
        daily_areas = [["Date", "Country", "AreaCode", "Area", "TotalCases"]
                       ] + daily_areas
        #save_daily_areas(date, country, daily_areas)
        save_daily_areas_to_sqlite(date, country, daily_areas)
def parse_daily_areas_pdf(date, country, local_pdf_file):
    """Extract per-district case counts from a Northern Ireland daily PDF.

    Scans each page for a table whose first header cell is
    "Local Government District" and returns [header, *rows] with rows of
    [Date, Country, AreaCode, Area, TotalCases]. Returns None when the
    country is not Northern Ireland or no matching table exists.
    """
    if country != "Northern Ireland":
        return None
    pdf = pdfplumber.open(local_pdf_file)
    for page in pdf.pages:
        try:
            extracted = page.extract_table()
            if extracted[0][0] == "Local Government District":
                rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
                for record in extracted[1:]:
                    # Ignore the aggregate "Total" row.
                    if record[0].lower() == "total":
                        continue
                    district = normalize_whitespace(titlecase(record[0]))
                    rows.append([
                        date,
                        country,
                        lookup_local_government_district_code(district),
                        district,
                        normalize_int(record[1]),
                    ])
                return rows
        except IndexError:
            pass  # no table on page
    return None
def get_text_from_html(html):
    """Return the visible text of an HTML document, whitespace-normalized."""
    parsed = BeautifulSoup(html, features="html.parser")
    return normalize_whitespace(parsed.get_text(separator=" "))
def get_text_from_pdf(local_pdf_file):
    """Return whitespace-normalized text from the first page of a PDF."""
    # Only the first page is needed by callers.
    first_page = pdfplumber.open(local_pdf_file).pages[0]
    return normalize_whitespace(first_page.extract_text())
def test_normalize_whitespace():
    # Leading/trailing whitespace is stripped; interior spacing preserved/collapsed.
    result = normalize_whitespace(" a b ")
    assert result == "a b"
def parse_tests(country, html):
    """Parse testing statistics from a country's HTML testing page.

    Extracts the main daily/total testing table, optional "Pillar 1/2"
    breakdown tables, and an optional pillar-2 routes breakdown table.
    Returns a dict of indicator name -> value (built on top of
    parse_totals_general's result), or None when the page layout does not
    match expectations (each mismatch prints a diagnostic first).
    """
    def is_testing_table(table):
        # The main testing table is identified by a "Tests" column header.
        headers = [th.text for th in table.findAll("th")]
        return "Tests" in headers

    soup = BeautifulSoup(html, features="html.parser")
    tables = soup.find_all("table")
    testing_tables = [table for table in tables if is_testing_table(table)]
    if len(testing_tables) == 0:
        print("Testing table not found")
        return None
    elif len(testing_tables) > 1:
        print("More than one testing table found")
        return None
    testing_table = testing_tables[0]
    table_rows = testing_table.findAll("tr")
    # Expected shape: header row + daily row + total row.
    if len(table_rows) != 3:
        print("Expecting 3 table rows")
        return None
    daily_row = [td.text for td in table_rows[1].findAll("td")]
    total_row = [td.text for td in table_rows[2].findAll("td")]
    text = get_text_from_html(html)
    # Pull the reporting date out of the "As of <time> on <date>," sentence.
    pattern_dict = {
        "Date": (r"As of (?P<Time>\d+\s*(am|pm)?) (on )?(?P<Date>.+?),",
                 date_value_parser_fn)
    }
    result = parse_totals_general(pattern_dict, country, text)
    # Column 0 is the row label; data starts at column 1.
    result["DailyTestsPerformed"] = normalize_int(daily_row[1])
    result["DailyPeopleTested"] = normalize_int_with_unavailable(daily_row[2])
    result["DailyPositive"] = normalize_int(daily_row[3])
    result["TotalTestsPerformed"] = normalize_int(total_row[1])
    result["TotalPeopleTested"] = normalize_int_with_unavailable(total_row[2])
    result["TotalPositive"] = normalize_int(total_row[3])

    def is_pillar_table(table):
        # Pillar tables are identified by a "Pillar 1" column header.
        headers = [th.text for th in table.findAll("th")]
        return "Pillar 1" in headers

    pillar_tables = [table for table in tables if is_pillar_table(table)]
    if len(pillar_tables) == 0:
        # no pillar tables
        return result
    elif len(pillar_tables) != 2:
        print("Expecting two pillar tables (daily and cumulative)")
        return None
    for table_num, pillar_table in enumerate(pillar_tables):
        # First pillar table holds daily figures, second cumulative totals.
        daily_or_total = "Daily" if table_num == 0 else "Total"
        table_rows = pillar_table.findAll("tr")
        if len(table_rows) != 4:
            print("Expecting 4 table rows")
            return None
        # Walk the header cells; for each "Pillar N" column, read that
        # column's value out of every subsequent data row.
        for i, col in enumerate(table_rows[0].findAll(re.compile("th|td"))):
            if col.text.startswith("Pillar"):
                pillar = remove_whitespace(col.text)
                for row in table_rows[1:]:
                    test_stat = normalize_whitespace(row.findAll("td")[0].text)
                    if test_stat == "Tests":
                        test_stat = "TestsPerformed"
                    elif test_stat == "People tested":
                        test_stat = "PeopleTested"
                    # e.g. "DailyPillar1TestsPerformed"
                    indicator = "{}{}{}".format(daily_or_total, pillar,
                                                test_stat)
                    str_val = row.findAll("td")[i].text
                    # "-" marks a cell with no figure published.
                    val = "" if str_val == "-" else normalize_int_with_unavailable(
                        str_val)
                    result[indicator] = val

    def is_pillar2_breakdown_table(table):
        # Identified by a header starting with "In-person".
        headers = [th.text for th in table.findAll("th")]
        return any([header.startswith("In-person") for header in headers])

    pillar2_breakdown_tables = [
        table for table in tables if is_pillar2_breakdown_table(table)
    ]
    if len(pillar2_breakdown_tables) == 0:
        # no pillar 2 breakdown table
        return result
    elif len(pillar2_breakdown_tables) > 1:
        print("More than one pillar 2 breakdown table found")
        return None
    pillar2_breakdown_table = pillar2_breakdown_tables[0]
    table_rows = pillar2_breakdown_table.findAll("tr")
    if len(table_rows) not in (3, 4):
        print("Expecting 3 (or 4) table rows in pillar 2 breakdown table")
        return None
    daily_row = [td.text for td in table_rows[1].findAll("td")]
    total_row = [td.text for td in table_rows[2].findAll("td")]
    result["DailyPillar2InPersonRoutes"] = normalize_int(daily_row[1])
    result["DailyPillar2DeliveryRoutes"] = normalize_int(daily_row[2])
    result["TotalPillar2InPersonRoutes"] = normalize_int(total_row[1])
    result["TotalPillar2DeliveryRoutes"] = normalize_int(total_row[2])
    return result
from util import normalize_whitespace, lookup_health_board_code

# Script: convert a saved daily-areas HTML page into a CSV.
# Usage: <script> <html_file> <csv_file>
# The HTML filename must look like "...-<country>-<YYYY-MM-DD>.html".
html_file = sys.argv[1]
csv_file = sys.argv[2]

# Raw string: "\d" in a non-raw literal is an invalid escape sequence
# (SyntaxWarning on modern Python).
m = re.match(r".+-(.+)-(\d{4}-\d{2}-\d{2})\.html", html_file)
country = m.group(1).title()
date = m.group(2)

# Use a context manager so the file handle is closed promptly.
with open(html_file) as f:
    html = f.read()

soup = BeautifulSoup(html, features="html.parser")
# The per-area table is the last one on the page.
table = soup.find_all("table")[-1]
output_rows = [["Date", "Country", "AreaCode", "Area", "TotalCases"]]
for table_row in table.findAll("tr"):
    columns = [
        normalize_whitespace(col.text) for col in table_row.findAll("td")
    ]
    if len(columns) == 0:
        continue
    area = columns[0].replace("Ayrshire & Arran", "Ayrshire and Arran")
    area_code = lookup_health_board_code(area)
    cases = columns[1]
    output_row = [date, country, area_code, area, cases]
    output_rows.append(output_row)

# newline="" per the csv module docs (prevents blank rows on Windows).
with open(csv_file, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(output_rows)
html = open(html_file).read() soup = BeautifulSoup(html, features="html.parser") text = soup.get_text() text = text.replace(u"\xa0", u" ") # replace non-breaking spaces with regular spaces patterns = [uk_pattern, wales_pattern, scotland_pattern, ni_pattern] for pattern in patterns: m = re.search(pattern, text) if m is not None: groups = m.groupdict() date = dateparser.parse(groups["date"]).strftime("%Y-%m-%d") country = normalize_whitespace(groups.get("country")).replace( "Scottish", "Scotland") tests = normalize_int(groups.get("tests", float("nan"))) positive_tests = normalize_int(groups["positive_tests"]) negative_tests = normalize_int( groups.get("negative_tests", float("nan"))) deaths = normalize_int(groups.get("deaths", float("nan"))) if not math.isnan(tests): print("{},{},{},{}".format(date, country, "Tests", tests)) # with open( # "data/daily/indicators/covid-19-{}-{}-tests.csv".format( # date, format_country(country) # ), # "w", # ) as f: # f.write("{},{},{},{}\n".format(date, country, "Tests", tests)) if not math.isnan(positive_tests):