def _get_header_type(self, line: str):
    """Classifies a header line as 'header', 'subheader' or 'subsubheader'.

    Only call this on lines you are already confident are headers: the
    method never rejects a line, it only picks between the three header
    levels.

    The Flatworld tags are split off the line first; the remaining text and
    the tags are then handed to the rule-based checks on
    `self.RuleLineTypeLabeler`. The `_is_header` check is tried first, then
    `_is_subsubheader`.

    Arguments:

        line: str. Line in a Hansard transcript.

    Returns:

        str. One of: ['header', 'subheader', 'subsubheader']. Falls back to
            'subheader' when neither the 'header' nor the 'subsubheader'
            check succeeds.
    """
    text, tags = extract_flatworld_tags(line)
    rules = self.RuleLineTypeLabeler
    if rules._is_header(text, tags):
        return 'header'
    if rules._is_subsubheader(text, tags):
        return 'subsubheader'
    return 'subheader'
def label_lines(self, lines: List[str]) -> List[str]:
    """Returns the label/class of each line in the Hansard transcript.

    Possible labels: header, subheader, subsubheader, speech, scene,
    garbage.

    Arguments:

        lines: List[str]. List of lines in a Hansard transcript.

    Returns:

        labels: List[str]. List of line labels, one per input line. An
            empty input yields an empty list.
    """
    # Nothing to label. Returning early also avoids an IndexError from the
    # `lines[0]` type check below. `len()` is used rather than truthiness so
    # that array-like inputs keep working.
    if len(lines) == 0:
        return []
    assert isinstance(lines[0], str), 'Each item in `lines` must be a string.'
    # KLUDGE: removes flatworld tag from text before making prediction.
    # TODO: this logic should happen in the tensorflow preprocessing.
    if RM_FLATWORLD_TAGS:
        line_texts = [extract_flatworld_tags(line)[0] for line in lines]
    else:
        line_texts = lines
    preds = self._get_predictions(line_texts)
    labels = [LINE_CODE2LABEL[code] for code in preds]
    # Refines the generic 'header' label into 'header', 'subheader', or
    # 'subsubheader'.
    # NOTE(review): when RM_FLATWORLD_TAGS is truthy the text passed to
    # `_get_header_type` has already had its Flatworld tags removed --
    # confirm that the header-type rules are meant to see tag-free text.
    for i, line_text in enumerate(line_texts):
        if labels[i] == 'header':
            labels[i] = self._get_header_type(line_text)
    return labels
def label_speaker_spans(self, lines: List[str], types: List[str] = None) -> List[str]:
    """Labels the speaker span in each line using BIO tagging.

    Arguments:

        lines: List[str]. List of lines to be assigned BIO tags.

        types: List[str] = None. List of line types, forwarded to
            `self._get_predictions`. Only used if LABEL_SPEECHES_ONLY==True.
            If LABEL_SPEECHES_ONLY==True, then the speaker span is tagged
            only for lines with type == 'speech'.

    Returns:

        preds: List[str]. BIO prediction for each line. e.g. ["BIIIIIIO",
            "OOOOOOOO", ...]. An empty input yields an empty list.
    """
    # Nothing to label. Returning early also avoids an IndexError from the
    # `lines[0]` type check below.
    if len(lines) == 0:
        return []
    assert isinstance(lines[0], str), 'Each item in `lines` must be a string.'
    if RM_FLATWORLD_TAGS:
        line_texts = [extract_flatworld_tags(line)[0] for line in lines]
    else:
        line_texts = lines
    pred_labels = self._get_predictions(line_texts, types=types)
    pred_labels_bio = []
    for i, pred in enumerate(pred_labels):
        line_text = line_texts[i]
        if pred == 1:
            pred_bio = self.RuleLineSpeakerSpanLabeler._get_prediction(
                line_text)
            # KLUDGE: if line contains an ASCII letter, no B or I tags are
            # found despite that the line is predicted to have a speaker,
            # and line is less than 200 characters in length,
            # then assign the whole line as a speaker.
            # The letter class is spelled `[A-Za-z]` on purpose: `[A-z]`
            # would also match the punctuation between 'Z' and 'a'
            # ([ \ ] ^ _ `).
            if (len(line_text) < 200
                    and re.search(r'[A-Za-z]', line_text)
                    and not re.search(r'[BI]', pred_bio)):
                if self.verbosity > 1:
                    logging.warning(
                        f'Line is predicted to have a speaker, '
                        f'but I failed to extract a speaker. I '
                        f'am over-riding by assigning the whole '
                        f'line as a speaker name. Line: {line_text}')
                pred_bio = ['B'] + ['I'] * (len(pred_bio) - 1)
            pred_bio = ''.join(pred_bio)
        else:
            # No speaker predicted: every character is outside a span.
            pred_bio = 'O' * len(line_text)
        pred_labels_bio.append(pred_bio)
    return pred_labels_bio
def test_extract_tag_missing_close(self):
    """tests that `extract_flatworld_tags` still extracts the tag when the
    line has an opening tag but no closing tag.
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('<header>MOTIONS', 'MOTIONS', ['header']),
        ('<NewSpeech>MR. GIKARIA: ', 'MR. GIKARIA: ', ['newspeech']),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_extract_scene_tag(self):
    """tests that `extract_flatworld_tags` pulls out <scene> tags and leaves
    the enclosed text untouched.
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('<scene>(Question proposed)</scene>', '(Question proposed)',
         ['scene']),
        ('<scene>(applause) </scene>', '(applause) ', ['scene']),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_dont_extract_other_tags(self):
    """tests that `extract_flatworld_tags` leaves non-flatworld tags
    (e.g. <i>text</i>) in place and reports no tags for them.
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('<i>text</i>', '<i>text</i>', []),
        ('<b>text</i>', '<b>text</i>', []),
        ('<div>text</div>', '<div>text</div>', []),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_extract_hyphen_tag(self):
    """tests that `extract_flatworld_tags` handles hyphenated tag names
    (e.g. "new-speech") and reports them without the hyphen.
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('<sub-header>bills</sub-header>', 'bills', ['subheader']),
        ('<new-speech>Mr. Gikaria:</new-speech>', 'Mr. Gikaria:',
         ['newspeech']),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_extract_nothing(self):
    """tests that `extract_flatworld_tags` returns the line unchanged, with
    an empty tag list, when the line contains no Flatworld tag.
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('ORAL ANSWERS TO QUESTIONS ', 'ORAL ANSWERS TO QUESTIONS ', []),
        ('bills', 'bills', []),
        ('Header 3 states that', 'Header 3 states that', []),
        ('speech by Mr. Gikaria was ', 'speech by Mr. Gikaria was ', []),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_extract_tag_wrong_closing(self):
    """tests that `extract_flatworld_tags` copes with a malformed closing
    tag (e.g. `<header />`, `<header>`).
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('<header>MOTIONS<header />', 'MOTIONS', ['header']),
        ('<header>MOTIONS<header/ >', 'MOTIONS', ['header']),
        ('<header>MOTIONS<header / >', 'MOTIONS', ['header']),
        ('<header>MOTIONS<header>', 'MOTIONS', ['header']),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_extract_misspelled_tag(self):
    """tests that `extract_flatworld_tags` still extracts commonly
    mis-spelled tags. The expected tag names keep the mis-spelling as
    written in the line.
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('<headr>bills</headr>', 'bills', ['headr']),
        ('<scen>(Question proposed)</scen>', '(Question proposed)',
         ['scen']),
        ('<newspech>Mr. Gikaria:</newspech>', 'Mr. Gikaria:',
         ['newspech']),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_dont_extract_erroneous_bracket(self):
    """tests that `extract_flatworld_tags` ignores stray angle brackets that
    are not part of a tag (e.g. `del<i`). These sometimes appear because of
    OCR errors.
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('del<i', 'del<i', []),
        ('del<i>', 'del<i>', []),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_extract_tag_is_nested(self):
    """tests that `extract_flatworld_tags` extracts a Flatworld tag that is
    nested inside another tag, leaving the outer tag in the text.
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('<i><newspeech>Mr. Speaker: Anyone here from the Ministry of</newspeech></i>',
         '<i>Mr. Speaker: Anyone here from the Ministry of</i>',
         ['newspeech']),
        ('<b><header>MOTIONS</header></b>', '<b>MOTIONS</b>', ['header']),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_extract_header_tag(self):
    """tests that `extract_flatworld_tags` pulls out <header> and
    <subheader> tags and leaves the enclosed text untouched.
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('<header>ORAL ANSWERS TO QUESTIONS </header>',
         'ORAL ANSWERS TO QUESTIONS ', ['header']),
        ('<header>bills</header>', 'bills', ['header']),
        ('<header>MOTIONS</header>', 'MOTIONS', ['header']),
        ('<subheader>Question no. 259</subheader>', 'Question no. 259',
         ['subheader']),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_extract_tag_missing_bracket(self):
    """tests that `extract_flatworld_tags` copes with tags that are missing
    an angle bracket (e.g. `<header`).
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('<header MOTIONS</header>', 'MOTIONS', ['header']),
        ('<header>MOTIONS /header>', 'MOTIONS', ['header']),
        ('header>MOTIONS /header>', 'MOTIONS', ['header']),
        ('header>MOTIONS </header', 'MOTIONS ', ['header']),
        ('<header>MOTIONS<header / >', 'MOTIONS', ['header']),
        ('<header>MOTIONS<header>', 'MOTIONS', ['header']),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_extract_speech_tag(self):
    """tests that `extract_flatworld_tags` pulls out <newspeech> and
    <speech> tags and leaves the enclosed text untouched.
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('<newspeech>Mr. Speaker: Anyone here from the Ministry of</newspeech>',
         'Mr. Speaker: Anyone here from the Ministry of', ['newspeech']),
        ('<newspeech>Mr. Gikaria:</newspeech>', 'Mr. Gikaria:',
         ['newspeech']),
        ('<newspeech>MR. GIKARIA: </newspeech>', 'MR. GIKARIA: ',
         ['newspeech']),
        ('<speech>MR. GIKARIA: </speech>', 'MR. GIKARIA: ', ['speech']),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_extract_tag_mismatched(self):
    """tests that `extract_flatworld_tags` copes with opening and closing
    tags that do not match; both tag names are expected in the result.
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('<header>MOTIONS</subheader>', 'MOTIONS',
         ['header', 'subheader']),
        ('<speech>MOTIONS</subheader>', 'MOTIONS',
         ['speech', 'subheader']),
        ('<NewSpeech>MR. GIKARIA: </header>', 'MR. GIKARIA: ',
         ['header', 'newspeech']),
        ('<NewSpeech>MR. GIKARIA: </speech>', 'MR. GIKARIA: ',
         ['newspeech', 'speech']),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def test_extract_tag_any_case(self):
    """tests that `extract_flatworld_tags` ignores the letter case of the
    tag and reports tag names in lower case.
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('<NEWSPEECH>Mr. Speaker: Anyone here from the Ministry of</NEWSPEECH>',
         'Mr. Speaker: Anyone here from the Ministry of', ['newspeech']),
        ('<NEWSPEECH>Mr. Speaker: Anyone here from the Ministry of</newspeech>',
         'Mr. Speaker: Anyone here from the Ministry of', ['newspeech']),
        ('<Newspeech>Mr. Gikaria:</Newspeech>', 'Mr. Gikaria:',
         ['newspeech']),
        ('<NewSpeech>MR. GIKARIA: </Newspeech>', 'MR. GIKARIA: ',
         ['newspeech']),
        ('<HEADER>MOTIONS</header>', 'MOTIONS', ['header']),
        ('<Header>MOTIONS</Header>', 'MOTIONS', ['header']),
        ('<Header>MOTIONS</HEADER>', 'MOTIONS', ['header']),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def _preprocess_text(self, text: str) -> List[str]:
    """Breaks text into lines, removes Flatworld tags, drops empty lines.

    Each line is stripped of surrounding whitespace, passed through
    `utils.extract_flatworld_tags`, and kept only if the resulting text has
    non-zero length. When `self.verbosity > 1` the line counts before and
    after are printed.

    Arguments:

        text: str. Raw transcript text.

    Returns:

        List[str]. The retained lines, in their original order.
    """
    # NOTE: splitting on runs of line breaks means line numbers no longer
    # match the original text file. This only matters if you want to make
    # direct comparisons to the original text file.
    raw_lines = re.split(r'[\n\r]+', text)
    kept_lines = []
    for raw_line in raw_lines:
        cleaned, _ = utils.extract_flatworld_tags(raw_line.strip())
        if len(cleaned) == 0:
            continue
        kept_lines.append(cleaned)
    if self.verbosity > 1:
        print(f'Number of lines before preprocessing: {len(raw_lines)}')
        print(f'Number of lines after preprocessing: {len(kept_lines)}')
    return kept_lines
def test_extract_tag_with_spacing(self):
    """tests that `extract_flatworld_tags` copes with whitespace between the
    angle brackets and the tag name, or inside the tag name
    (e.g. `< Header >`).
    """
    # Each case: (input line, expected text, expected tags).
    cases = [
        ('< HEADER >MOTIONS</ header >', 'MOTIONS', ['header']),
        ('< HEADER >MOTIONS</\nheader>', 'MOTIONS', ['header']),
        ('< Sub HEADER >Question No. 259</\n sub header>',
         'Question No. 259', ['subheader']),
        ('< NewSpeech >MR. GIKARIA: </ Newspeech >', 'MR. GIKARIA: ',
         ['newspeech']),
        ('< NewSpeech>MR. GIKARIA: < /Newspeech>', 'MR. GIKARIA: ',
         ['newspeech']),
        ('<NewSpeech >MR. GIKARIA: < /Newspeech >', 'MR. GIKARIA: ',
         ['newspeech']),
        ('<New Speech >MR. GIKARIA: < /New speech >', 'MR. GIKARIA: ',
         ['newspeech']),
    ]
    for raw, want_text, want_tags in cases:
        got_text, got_tags = utils.extract_flatworld_tags(raw)
        self.assertEqual(got_text, want_text)
        self.assertEqual(got_tags, want_tags)
def _label_one_line(self,
                    line: str,
                    check_if_page_header: bool = True) -> Optional[str]:
    """Returns the label of a single line, extracted using a rule-based
    parser.

    The checks run in this order: garbage, punctuation-only, then the
    header / subheader / subsubheader / speech / scene rules applied to the
    line with its Flatworld tags split off.

    Arguments:

        line: str. Line in a Hansard transcript.

        check_if_page_header: bool. If True, checks if line is a page
            number or page header (and returns "garbage" label if so).

    Returns:

        label: Optional[str]. Label of line: one of 'garbage', 'punct',
            'header', 'subheader', 'subsubheader', 'speech', 'scene'. If no
            label is found, returns None. If several rules still match
            after the precedence kludges below, the first match in the
            order header, subheader, subsubheader, speech, scene wins.

    Todos:

        TODO: lines with a speaker name in all caps are getting labeled as
            a header, but they shouldn't be. Examples::

                `MR. OMYAHCHA (CTD.):`
                `MR. BIDU (CTD):`

            One way to address this would be to try to extract a speaker
            name from the line. If a speaker name is extracted, then it is
            a speech.
    """
    if self._is_garbage(line, check_if_page_header):
        return 'garbage'
    if is_punct(line, True):
        return 'punct'
    line_text, flatworld_tags = extract_flatworld_tags(line)
    # Insertion order matters: it is the tie-break order used when more
    # than one rule matches.
    test_results = {
        'header': self._is_header(line_text, flatworld_tags),
        'subheader': self._is_subheader(line_text, flatworld_tags),
        'subsubheader': self._is_subsubheader(line_text, flatworld_tags),
        'speech': self._is_speech(line_text, flatworld_tags),
        'scene': self._is_scene(line_text, flatworld_tags)
    }
    if sum(test_results.values()) > 1:
        # KLUDGE: gives precedence to header over speech
        if test_results['speech'] and test_results['header']:
            test_results['speech'] = False
        # KLUDGE: gives precedence to scene over speech
        if test_results['speech'] and test_results['scene']:
            test_results['speech'] = False
        # KLUDGE: gives precedence to header over scene
        if test_results['header'] and test_results['scene']:
            test_results['scene'] = False
        if self.verbosity > 1 and sum(test_results.values()) > 1:
            # `logging.warning` replaces the deprecated `logging.warn`.
            logging.warning(
                f'Multiple labels found for line: {line};\nLabels found: {", ".join([k for k, v in test_results.items() if v])}'
            )
    # returns label string.
    for label, matched in test_results.items():
        if matched:
            return label
    if self.verbosity > 1:
        logging.warning(f'Did not find label for line: {line}')
    return None