def parse_content(page):
    """
    Parse the detail page to extract fatality information.

    :param str page: the content of the fatality page
    :return: a report representing the fatality and a list of parsing errors
    :rtype: tuple
    """
    d = {}
    parsing_errors = []

    # Normalize the page.
    normalized_detail_page = unicodedata.normalize("NFKD", page)

    # Parse the `Case` field.
    d[Fields.CASE] = regex.match_case_field(normalized_detail_page)
    if not d.get(Fields.CASE):
        raise ValueError('a case number is mandatory')

    # Parse the `Date` field.
    d[Fields.DATE] = regex.match_date_field(normalized_detail_page)
    if not d.get(Fields.DATE):
        raise ValueError('a date is mandatory')

    # Parse the `Crashes` field.
    crash_str = regex.match_crash_field(normalized_detail_page)
    if crash_str:
        d[Fields.CRASH] = crash_str
    else:
        parsing_errors.append("could not retrieve the crash number")

    # Parse the `Time` field.
    time_str = regex.match_time_field(normalized_detail_page)
    time = date_utils.parse_time(time_str)
    if time:
        d[Fields.TIME] = time
    else:
        parsing_errors.append("could not retrieve the crash time")

    # Parse the location field.
    location_str = regex.match_location_field(normalized_detail_page)
    if location_str:
        d[Fields.LOCATION] = location_str.strip()
    else:
        parsing_errors.append("could not retrieve the location")

    # Convert to a report object.
    report, err = twitter.to_report(d)
    parsing_errors.extend(err)

    # Convert the page to a BeautifulSoup object.
    soup = to_soup(normalized_detail_page.replace("<br>", "</br>"))

    # Parse the `Deceased` field.
    deceased_fields, err = parse_deceased_field(soup)
    if deceased_fields:
        report.fatalities = deceased_fields
        parsing_errors.extend(err)
    else:
        parsing_errors.append("could not retrieve the deceased information")
    report.compute_fatalities_age()

    # Fill in Notes from Details page.
    if deceased_fields:
        notes = parse_notes_field(soup)
        if notes:
            report.notes = notes
        else:
            parsing_errors.append("could not retrieve the notes information")

    return report, parsing_errors
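# Hypothetical usage sketch, not part of the original module: it shows one way the
# report/errors pair returned by parse_content might be consumed. The helper name and
# the idea that the page HTML was already fetched into `page_html` are assumptions.
def _example_parse_content_usage(page_html):
    """Sketch of consuming the report and parsing errors returned by parse_content."""
    # parse_content raises ValueError if the case number or the date cannot be found.
    report, errors = parse_content(page_html)

    # Fields that could not be parsed do not raise; they are reported as strings in `errors`.
    for err in errors:
        print(f"parsing issue: {err}")

    # The report object exposes the attributes set above (fatalities, notes, ...).
    print(report.fatalities)
    print(report.notes)
    return report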
def test_parse_location_field_00(input_, expected):
    """Ensure the location field gets parsed correctly."""
    actual = regex.match_location_field(input_)
    assert actual == expected
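# The test above expects `input_` and `expected` to be supplied by the harness,
# typically through `pytest.mark.parametrize`. Below is a hypothetical sketch of such
# a parametrization; the function name and the sample input/expected pair are invented
# for illustration and are not taken from the real test suite.
import pytest


@pytest.mark.parametrize(
    "input_,expected",
    [
        # Illustrative pair only: the exact text returned by match_location_field
        # depends on the real regular expression.
        ("Location:   4500 FM 2222 Rd", "4500 FM 2222 Rd"),
    ],
)
def test_parse_location_field_example(input_, expected):
    """Sketch of a parametrized variant of the test above."""
    actual = regex.match_location_field(input_)
    assert actual == expected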
def parse_page_content(detail_page, notes_parsed=False):
    """
    Parse the detail page to extract fatality information.

    :param str detail_page: the content of the fatality page
    :param bool notes_parsed: True if the notes were already parsed elsewhere (e.g. from the twitter description)
    :return: a dictionary representing a fatality and a list of errors.
    :rtype: dict, list
    """
    d = {}
    parsing_errors = []
    normalized_detail_page = unicodedata.normalize("NFKD", detail_page)
    soup = to_soup(normalized_detail_page.replace("<br>", "</br>"))

    # Parse the `Case` field.
    d[Fields.CASE] = regex.match_case_field(normalized_detail_page)
    if not d.get(Fields.CASE):
        raise ValueError('A case number is mandatory.')

    # Parse the `Crashes` field.
    crash_str = regex.match_crashes_field(normalized_detail_page)
    if crash_str:
        d[Fields.CRASHES] = crash_str
    else:
        parsing_errors.append("could not retrieve the crash number")

    # Parse the `Date` field.
    date_field = regex.match_date_field(normalized_detail_page)
    if date_field:
        d[Fields.DATE] = date_field
    else:
        parsing_errors.append("could not retrieve the crash date")

    # Parse the `Time` field.
    time_str = regex.match_time_field(normalized_detail_page)
    time = date_utils.parse_time(time_str)
    if time:
        d[Fields.TIME] = time
    else:
        parsing_errors.append("could not retrieve the crash time")

    # Parse the location field.
    location_str = regex.match_location_field(normalized_detail_page)
    if location_str:
        d[Fields.LOCATION] = location_str
    else:
        parsing_errors.append("could not retrieve the location")

    # Parse the `Deceased` field.
    deceased_field_list = parse_deceased_field(soup)
    if deceased_field_list:
        d[Fields.DECEASED] = deceased_field_list
    else:
        parsing_errors.append("could not retrieve the deceased information")

    # Fill in Notes from Details page if not in twitter description.
    if deceased_field_list and not notes_parsed:
        notes = parse_notes_field(soup, d[Fields.DECEASED][-1])
        if notes:
            d[Fields.NOTES] = notes
    if not d.get(Fields.NOTES):
        parsing_errors.append("could not retrieve the notes information")

    return d, parsing_errors
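# Hypothetical usage sketch, not part of the original module: it contrasts the
# dict-based return value of parse_page_content with the report object returned by
# parse_content above. The helper name and `page_html` variable are assumptions.
def _example_parse_page_content_usage(page_html):
    """Sketch of consuming the dict and parsing errors returned by parse_page_content."""
    # Here only a missing case number raises; a missing date is reported in `errors` instead.
    d, errors = parse_page_content(page_html, notes_parsed=True)  # notes already obtained elsewhere

    case_number = d[Fields.CASE]           # always present when no exception was raised
    crash_date = d.get(Fields.DATE)        # optional: None if it could not be parsed
    deceased = d.get(Fields.DECEASED, [])  # deceased entries, possibly empty
    return case_number, crash_date, deceased, errors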