Esempio n. 1
0
def parse_content(page):
    """
    Build a fatality report from the content of a detail page.

    The case number and the date are mandatory; every other field that
    cannot be extracted is recorded as a message in the returned error list
    instead of aborting the parsing.

    :param str page: the content of the fatality page
    :return: the report produced by ``twitter.to_report`` and the list of
        parsing errors collected along the way.
    :rtype: tuple
    :raises ValueError: if the case number or the date cannot be found.
    """
    # Work on a normalized (NFKD) copy of the page for all regex lookups.
    normalized = unicodedata.normalize("NFKD", page)
    fields = {}
    errors = []

    # Mandatory: the `Case` field.
    case_number = regex.match_case_field(normalized)
    fields[Fields.CASE] = case_number
    if not case_number:
        raise ValueError('a case number is mandatory')

    # Mandatory: the `Date` field.
    crash_date = regex.match_date_field(normalized)
    fields[Fields.DATE] = crash_date
    if not crash_date:
        raise ValueError('a date is mandatory')

    # Optional: the `Crashes` field.
    crash_number = regex.match_crash_field(normalized)
    if not crash_number:
        errors.append("could not retrieve the crash number")
    else:
        fields[Fields.CRASH] = crash_number

    # Optional: the `Time` field.
    parsed_time = date_utils.parse_time(regex.match_time_field(normalized))
    if not parsed_time:
        errors.append("could not retrieve the crash time")
    else:
        fields[Fields.TIME] = parsed_time

    # Optional: the location field, stored without surrounding whitespace.
    location = regex.match_location_field(normalized)
    if not location:
        errors.append("could not retrieve the location")
    else:
        fields[Fields.LOCATION] = location.strip()

    # Turn the collected fields into a report object.
    report, report_errors = twitter.to_report(fields)
    errors.extend(report_errors)

    # The deceased and notes sections are read from a BeautifulSoup tree.
    soup = to_soup(normalized.replace("<br>", "</br>"))

    # `Deceased` field: its errors are only kept when fatalities were found.
    fatalities, deceased_errors = parse_deceased_field(soup)
    if not fatalities:
        errors.append("could not retrieve the deceased information")
    else:
        report.fatalities = fatalities
        errors.extend(deceased_errors)
    report.compute_fatalities_age()

    # Notes are only looked up when the deceased information was found.
    if fatalities:
        notes = parse_notes_field(soup)
        if not notes:
            errors.append("could not retrieve the notes information")
        else:
            report.notes = notes

    return report, errors
Esempio n. 2
0
def test_parse_case_field_00(input_, expected):
    """Check that `regex.match_case_field` returns the expected value for `input_`."""
    assert regex.match_case_field(input_) == expected
Esempio n. 3
0
def parse_page_content(detail_page, notes_parsed=False):
    """
    Parse the detail page to extract fatality information.

    Only the case number is mandatory. Any other field that cannot be
    extracted is reported as a message in the returned error list.

    :param str detail_page: the content of the fatality page
    :param bool notes_parsed: True when the notes were already obtained
        elsewhere (e.g. from the twitter description). In that case the notes
        are not looked up in the page and no notes error is reported.
    :return: a dictionary representing a fatality and a list of errors.
    :rtype: dict, list
    :raises ValueError: if the case number cannot be found.
    """
    d = {}
    parsing_errors = []
    normalized_detail_page = unicodedata.normalize("NFKD", detail_page)
    soup = to_soup(normalized_detail_page.replace("<br>", "</br>"))

    # Parse the `Case` field.
    d[Fields.CASE] = regex.match_case_field(normalized_detail_page)
    if not d.get(Fields.CASE):
        raise ValueError('A case number is mandatory.')

    # Parse the `Crashes` field.
    crash_str = regex.match_crashes_field(normalized_detail_page)
    if crash_str:
        d[Fields.CRASHES] = crash_str
    else:
        parsing_errors.append("could not retrieve the crash number")

    # Parse the `Date` field.
    date_field = regex.match_date_field(normalized_detail_page)
    if date_field:
        d[Fields.DATE] = date_field
    else:
        parsing_errors.append("could not retrieve the crash date")

    # Parse the `Time` field.
    time_str = regex.match_time_field(normalized_detail_page)
    time = date_utils.parse_time(time_str)
    if time:
        d[Fields.TIME] = time
    else:
        parsing_errors.append("could not retrieve the crash time")

    # Parse the location field.
    location_str = regex.match_location_field(normalized_detail_page)
    if location_str:
        d[Fields.LOCATION] = location_str
    else:
        parsing_errors.append("could not retrieve the location")

    # Parse the `Deceased` field.
    deceased_field_list = parse_deceased_field(soup)
    if deceased_field_list:
        d[Fields.DECEASED] = deceased_field_list
    else:
        parsing_errors.append("could not retrieve the deceased information")

    # Fill in Notes from Details page if not in twitter description.
    # The error is only reported when a lookup was actually wanted: when the
    # caller already has the notes, their absence from `d` is not a failure.
    if not notes_parsed:
        notes = None
        if deceased_field_list:
            # The notes lookup is anchored on the last deceased entry.
            notes = parse_notes_field(soup, deceased_field_list[-1])
        if notes:
            d[Fields.NOTES] = notes
        else:
            parsing_errors.append("could not retrieve the notes information")

    return d, parsing_errors