コード例 #1
0
ファイル: Supervised.py プロジェクト: duncan09/hansardparser
    def _get_header_type(self, line: str):
        """Classifies a header line as header, subheader or subsubheader.

        Only call this on lines already believed to be headers. The checks
        are delegated to `self.RuleLineTypeLabeler` and tried in a fixed
        order: `_is_header` first, then `_is_subsubheader`. If neither
        matches, the line is treated as a subheader.

        Arguments:

            line: str. Line in a Hansard transcript.

        Returns:

            str. One of: ['header', 'subheader', 'subsubheader']. Falls back
                to 'subheader' when neither the 'header' nor the
                'subsubheader' check succeeds.
        """
        text, tags = extract_flatworld_tags(line)
        rule_labeler = self.RuleLineTypeLabeler
        if rule_labeler._is_header(text, tags):
            return 'header'
        if rule_labeler._is_subsubheader(text, tags):
            return 'subsubheader'
        # neither specific check matched: default header type.
        return 'subheader'
コード例 #2
0
ファイル: Supervised.py プロジェクト: duncan09/hansardparser
    def label_lines(self, lines: List[str]) -> List[str]:
        """Returns the label/class of each line in the Hansard transcript.

        Possible labels: header, subheader, subsubheader, speech, scene, garbage.

        Predictions come from `self._get_predictions` and are mapped to label
        strings through `LINE_CODE2LABEL`. Every line labeled 'header' is then
        refined with `self._get_header_type`.

        Arguments:

            lines: List[str]. List of lines in a Hansard transcript. May be
                empty, in which case an empty list is returned.

        Returns:

            labels: List[str]. List of line labels, one per input line.
        """
        # An empty transcript has no labels. Returning early also keeps the
        # `lines[0]` type check below from raising IndexError.
        if len(lines) == 0:
            return []
        assert isinstance(lines[0], str), 'Each item in `lines` must be a string.'
        # KLUDGE: removes flatworld tag from text before making prediction.
        # TODO: this logic should happen in the tensorflow preprocessing.
        if RM_FLATWORLD_TAGS:
            line_texts = [extract_flatworld_tags(line)[0] for line in lines]
        else:
            line_texts = lines
        preds = self._get_predictions(line_texts)
        labels = [LINE_CODE2LABEL[code] for code in preds]
        # retrieves header type ('header', 'subheader', or 'subsubheader').
        for i, line_text in enumerate(line_texts):
            if labels[i] == 'header':
                labels[i] = self._get_header_type(line_text)
        return labels
コード例 #3
0
    def label_speaker_spans(self,
                            lines: List[str],
                            types: List[str] = None) -> List[str]:
        """Labels the speaker span in each line using BIO tagging.

        `self._get_predictions` decides whether each line contains a speaker.
        For lines predicted to contain one (prediction == 1), the BIO string
        comes from `self.RuleLineSpeakerSpanLabeler._get_prediction`. All
        other lines are tagged entirely as 'O'.

        Arguments:

            lines: List[str]. List of lines to be assigned BIO tags. May be
                empty, in which case an empty list is returned.

            types: List[str] = None. List of line types. Only used if
                LABEL_SPEECHES_ONLY==True. If LABEL_SPEECHES_ONLY==True, then
                the speaker span is tagged only for lines with type == 'speech'.

        Returns:

            preds: List[str]. BIO prediction for each line. e.g. ["BIIIIIIO", "OOOOOOOO", ...]
        """
        # Nothing to tag. Returning early also keeps the `lines[0]` type
        # check below from raising IndexError.
        if len(lines) == 0:
            return []
        assert isinstance(lines[0],
                          str), 'Each item in `lines` must be a string.'
        if RM_FLATWORLD_TAGS:
            line_texts = [extract_flatworld_tags(line)[0] for line in lines]
        else:
            line_texts = lines
        pred_labels = self._get_predictions(line_texts, types=types)
        pred_labels_bio = []
        for i, pred in enumerate(pred_labels):
            line_text = line_texts[i]
            if pred == 1:
                pred_bio = self.RuleLineSpeakerSpanLabeler._get_prediction(
                    line_text)
                # KLUDGE: if the line contains an ASCII letter, no B or I
                # tags are found despite that the line is predicted to have
                # a speaker, and the line is less than 200 characters in
                # length, then assign the whole line as a speaker.
                # NOTE: the letter class is spelled `[A-Za-z]` because the
                # range `[A-z]` also matches the punctuation characters
                # that sit between 'Z' and 'a' in ASCII.
                if (len(line_text) < 200
                        and re.search(r'[A-Za-z]', line_text) is not None
                        and re.search(r'[BI]', pred_bio) is None):
                    if self.verbosity > 1:
                        logging.warning(
                            f'Line is predicted to have a speaker, '
                            f'but I failed to extract a speaker. I '
                            f'am over-riding by assigning the whole '
                            f'line as a speaker name. Line: {line_text}')
                    pred_bio = 'B' + 'I' * (len(pred_bio) - 1)
            else:
                pred_bio = 'O' * len(line_text)
            pred_labels_bio.append(pred_bio)
        return pred_labels_bio
コード例 #4
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_tag_missing_close(self):
     """Checks that `extract_flatworld_tags` still extracts the tag when the
     line has an opening tag but no closing tag.
     """
     cases = [
         ('<header>MOTIONS', ('MOTIONS', ['header'])),
         ('<NewSpeech>MR. GIKARIA: ', ('MR. GIKARIA: ', ['newspeech'])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #5
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_scene_tag(self):
     """Checks that `extract_flatworld_tags` pulls out <scene> tags and
     returns the enclosed text.
     """
     cases = [
         ('<scene>(Question proposed)</scene>', ('(Question proposed)',
                                                 ['scene'])),
         ('<scene>(applause) </scene>', ('(applause) ', ['scene'])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #6
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_dont_extract_other_tags(self):
     """Checks that `extract_flatworld_tags` leaves non-Flatworld tags
     (e.g. <i>text</i>) in the text and reports no tags for them.
     """
     cases = [
         ('<i>text</i>', ('<i>text</i>', [])),
         ('<b>text</i>', ('<b>text</i>', [])),
         ('<div>text</div>', ('<div>text</div>', [])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #7
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_hyphen_tag(self):
     """Checks that `extract_flatworld_tags` handles hyphenated tag names
     (e.g. "new-speech") and reports them without the hyphen.
     """
     cases = [
         ('<sub-header>bills</sub-header>', ('bills', ['subheader'])),
         ('<new-speech>Mr. Gikaria:</new-speech>', ('Mr. Gikaria:',
                                                    ['newspeech'])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #8
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_nothing(self):
     """Checks that `extract_flatworld_tags` returns the text unchanged and
     an empty tag list when the line has no Flatworld tag.
     """
     cases = [
         ('ORAL ANSWERS TO QUESTIONS ', ('ORAL ANSWERS TO QUESTIONS ', [])),
         ('bills', ('bills', [])),
         ('Header 3 states that', ('Header 3 states that', [])),
         ('speech by Mr. Gikaria was ', ('speech by Mr. Gikaria was ', [])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #9
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_tag_wrong_closing(self):
     """Checks that `extract_flatworld_tags` tolerates a closing tag with
     incorrect syntax (e.g. `<header />`, `<header>`).
     """
     cases = [
         ('<header>MOTIONS<header />', ('MOTIONS', ['header'])),
         ('<header>MOTIONS<header/ >', ('MOTIONS', ['header'])),
         ('<header>MOTIONS<header / >', ('MOTIONS', ['header'])),
         ('<header>MOTIONS<header>', ('MOTIONS', ['header'])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #10
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_misspelled_tag(self):
     """Checks that `extract_flatworld_tags` still extracts commonly
     mis-spelled tags; the tag is reported with its mis-spelling intact.
     """
     cases = [
         ('<headr>bills</headr>', ('bills', ['headr'])),
         ('<scen>(Question proposed)</scen>', ('(Question proposed)',
                                               ['scen'])),
         ('<newspech>Mr. Gikaria:</newspech>', ('Mr. Gikaria:',
                                                ['newspech'])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #11
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
    def test_dont_extract_erroneous_bracket(self):
        """Checks that `extract_flatworld_tags` ignores a stray angle bracket
        that is not part of a tag (e.g. `del<i`).

        Such brackets sometimes appear because of OCR errors.
        """
        cases = [
            ('del<i', ('del<i', [])),
            ('del<i>', ('del<i>', [])),
        ]
        for raw, (want_text, want_tags) in cases:
            got_text, got_tags = utils.extract_flatworld_tags(raw)
            self.assertEqual(got_text, want_text)
            self.assertEqual(got_tags, want_tags)
コード例 #12
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_tag_is_nested(self):
     """Checks that `extract_flatworld_tags` extracts a Flatworld tag nested
     inside another tag, leaving the outer non-Flatworld tag in the text.
     """
     cases = [
         ('<i><newspeech>Mr. Speaker: Anyone here from the Ministry of</newspeech></i>',
          ('<i>Mr. Speaker: Anyone here from the Ministry of</i>',
           ['newspeech'])),
         ('<b><header>MOTIONS</header></b>', ('<b>MOTIONS</b>', ['header'
                                                                 ])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #13
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_header_tag(self):
     """Checks that `extract_flatworld_tags` pulls out <header> and
     <subheader> tags and returns the enclosed text.
     """
     cases = [
         ('<header>ORAL ANSWERS TO QUESTIONS </header>',
          ('ORAL ANSWERS TO QUESTIONS ', ['header'])),
         ('<header>bills</header>', ('bills', ['header'])),
         ('<header>MOTIONS</header>', ('MOTIONS', ['header'])),
         ('<subheader>Question no. 259</subheader>', ('Question no. 259',
                                                      ['subheader'])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #14
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_tag_missing_bracket(self):
     """Checks that `extract_flatworld_tags` tolerates a tag that is missing
     an angle bracket (e.g. `<header`).
     """
     cases = [
         ('<header MOTIONS</header>', ('MOTIONS', ['header'])),
         ('<header>MOTIONS /header>', ('MOTIONS', ['header'])),
         ('header>MOTIONS /header>', ('MOTIONS', ['header'])),
         ('header>MOTIONS </header', ('MOTIONS ', ['header'])),
         ('<header>MOTIONS<header / >', ('MOTIONS', ['header'])),
         ('<header>MOTIONS<header>', ('MOTIONS', ['header'])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #15
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_speech_tag(self):
     """Checks that `extract_flatworld_tags` pulls out <newspeech> and
     <speech> tags and returns the enclosed text.
     """
     cases = [
         ('<newspeech>Mr. Speaker: Anyone here from the Ministry of</newspeech>',
          ('Mr. Speaker: Anyone here from the Ministry of', ['newspeech'])),
         ('<newspeech>Mr. Gikaria:</newspeech>', ('Mr. Gikaria:',
                                                  ['newspeech'])),
         ('<newspeech>MR. GIKARIA: </newspeech>', ('MR. GIKARIA: ',
                                                   ['newspeech'])),
         ('<speech>MR. GIKARIA: </speech>', ('MR. GIKARIA: ', ['speech'])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #16
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_tag_mismatched(self):
     """Checks that `extract_flatworld_tags` copes with open and close tags
     that do not match; both tag names are expected in the result.
     """
     cases = [
         ('<header>MOTIONS</subheader>', ('MOTIONS',
                                          ['header', 'subheader'])),
         ('<speech>MOTIONS</subheader>', ('MOTIONS',
                                          ['speech', 'subheader'])),
         ('<NewSpeech>MR. GIKARIA: </header>', ('MR. GIKARIA: ',
                                                ['header', 'newspeech'])),
         ('<NewSpeech>MR. GIKARIA: </speech>', ('MR. GIKARIA: ',
                                                ['newspeech', 'speech'])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #17
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_tag_any_case(self):
     """Checks that `extract_flatworld_tags` ignores the case of the tag
     name; tags are expected back in lower case.
     """
     cases = [
         ('<NEWSPEECH>Mr. Speaker: Anyone here from the Ministry of</NEWSPEECH>',
          ('Mr. Speaker: Anyone here from the Ministry of', ['newspeech'])),
         ('<NEWSPEECH>Mr. Speaker: Anyone here from the Ministry of</newspeech>',
          ('Mr. Speaker: Anyone here from the Ministry of', ['newspeech'])),
         ('<Newspeech>Mr. Gikaria:</Newspeech>', ('Mr. Gikaria:',
                                                  ['newspeech'])),
         ('<NewSpeech>MR. GIKARIA: </Newspeech>', ('MR. GIKARIA: ',
                                                   ['newspeech'])),
         ('<HEADER>MOTIONS</header>', ('MOTIONS', ['header'])),
         ('<Header>MOTIONS</Header>', ('MOTIONS', ['header'])),
         ('<Header>MOTIONS</HEADER>', ('MOTIONS', ['header'])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #18
0
ファイル: TxtParser.py プロジェクト: duncan09/hansardparser
 def _preprocess_text(self, text: str) -> List[str]:
     """Breaks `text` into lines, strips surrounding whitespace and
     Flatworld tags from each line, and drops lines that end up empty.

     When `self.verbosity > 1`, the line counts before and after
     preprocessing are printed.
     """
     # NOTE: splitting on runs of line-break characters collapses
     # consecutive breaks, so positions in the result do not match line
     # numbers in the original text file. This only matters when comparing
     # directly against that file.
     raw_lines = re.split(r'[\n\r]+', text)
     kept = []
     for raw in raw_lines:
         cleaned, _ = utils.extract_flatworld_tags(raw.strip())
         if cleaned:
             kept.append(cleaned)
     if self.verbosity > 1:
         print(f'Number of lines before preprocessing: {len(raw_lines)}')
         print(f'Number of lines after preprocessing: {len(kept)}')
     return kept
コード例 #19
0
ファイル: test_utils.py プロジェクト: duncan09/hansardparser
 def test_extract_tag_with_spacing(self):
     """Checks that `extract_flatworld_tags` tolerates whitespace between
     the angle brackets and the tag name (e.g. `< Header >`), as well as
     whitespace inside the tag name.
     """
     cases = [
         ('< HEADER >MOTIONS</ header >', ('MOTIONS', ['header'])),
         ('< HEADER >MOTIONS</\nheader>', ('MOTIONS', ['header'])),
         ('< Sub HEADER >Question No. 259</\n sub header>',
          ('Question No. 259', ['subheader'])),
         ('< NewSpeech >MR. GIKARIA: </ Newspeech >', ('MR. GIKARIA: ',
                                                       ['newspeech'])),
         ('<  NewSpeech>MR. GIKARIA: < /Newspeech>', ('MR. GIKARIA: ',
                                                      ['newspeech'])),
         ('<NewSpeech >MR. GIKARIA: <  /Newspeech  >', ('MR. GIKARIA: ',
                                                        ['newspeech'])),
         ('<New Speech >MR. GIKARIA: <  /New speech  >', ('MR. GIKARIA: ',
                                                          ['newspeech'])),
     ]
     for raw, (want_text, want_tags) in cases:
         got_text, got_tags = utils.extract_flatworld_tags(raw)
         self.assertEqual(got_text, want_text)
         self.assertEqual(got_tags, want_tags)
コード例 #20
0
ファイル: Rule.py プロジェクト: duncan09/hansardparser
    def _label_one_line(self,
                        line: str,
                        check_if_page_header: bool = True) -> Optional[str]:
        """Returns the label of a single line, extracted using a rule-based parser.

        The line is first checked for 'garbage' and 'punct'. Otherwise each
        of the header/subheader/subsubheader/speech/scene tests is run on
        the line with its Flatworld tags extracted. When several tests pass,
        'header' takes precedence over 'speech' and 'scene', and 'scene'
        takes precedence over 'speech'. The first remaining passing label,
        in the order header, subheader, subsubheader, speech, scene, is
        returned.

        Arguments:

            line: str. Line in a Hansard transcript.

            check_if_page_header: bool. If True, checks if line is a page number
                or page header (and returns "garbage" label if so).

        Returns:

            label: Optional[str]. Label of line. If no label is found, returns
                None.

        Todos:

            TODO: lines with a speaker name in all caps are getting labeled as a header,
                but they shouldn't be.
                Examples::

                    `MR. OMYAHCHA (CTD.):`
                    `MR. BIDU (CTD):`

                One way to address this would be to try to extract a speaker name
                from the line. If a speaker name is extracted, then it is a speech.
        """
        if self._is_garbage(line, check_if_page_header):
            return 'garbage'
        if is_punct(line, True):
            return 'punct'
        line_text, flatworld_tags = extract_flatworld_tags(line)
        # NOTE: insertion order matters; it decides which label wins when
        # more than one test still passes after the precedence rules below.
        test_results = {
            'header': self._is_header(line_text, flatworld_tags),
            'subheader': self._is_subheader(line_text, flatworld_tags),
            'subsubheader': self._is_subsubheader(line_text, flatworld_tags),
            'speech': self._is_speech(line_text, flatworld_tags),
            'scene': self._is_scene(line_text, flatworld_tags)
        }
        if sum(test_results.values()) > 1:
            # KLUDGE: gives precedence to header over speech
            if test_results['speech'] and test_results['header']:
                test_results['speech'] = False
            # KLUDGE: gives precedence to scene over speech
            if test_results['speech'] and test_results['scene']:
                test_results['speech'] = False
            # KLUDGE: gives precedence to header over scene
            if test_results['header'] and test_results['scene']:
                test_results['scene'] = False
        # labels whose test still passes, in dict (priority) order.
        matched = [label for label, passed in test_results.items() if passed]
        if self.verbosity > 1 and len(matched) > 1:
            # `logging.warning` replaces the deprecated `logging.warn` alias.
            logging.warning(
                f'Multiple labels found for line: {line};\n'
                f'Labels found: {", ".join(matched)}')
        # returns label string.
        if matched:
            return matched[0]
        if self.verbosity > 1:
            logging.warning(f'Did not find label for line: {line}')
        return None