Example #1
    def test_start_end_matches(self):
        """
            original string: (a) aaa (a) aaa aaaa (a).(a)
            after elimination: aaa  aaa aaaa .
        """
        matches = [(0, 3), (8, 11), (21, 24), (25, 28)]
        raw_tokens = [(1, 4), (6, 9), (10, 14), (15, 16)]
        expected_tokens = [(4, 7), (12, 15), (16, 20), (24, 25)]

        self.assertEqual(fix_raw_tokens_after_elimination(raw_tokens, matches), expected_tokens)
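
Note: the implementation of fix_raw_tokens_after_elimination is not shown in
these examples. The following is a minimal sketch consistent with the fixtures
(an assumption inferred from the tests, not the project's actual code): each
eliminated span collapses to a single point in the shortened string, and every
offset is shifted back by the total length of the spans collapsing before it
(at or before it for token starts, strictly before it for token ends).

def fix_raw_tokens_after_elimination(raw_tokens, matches):
    """Map token spans in the eliminated string back onto the original."""
    # Collapse point (in eliminated-string coordinates) and length of each
    # removed span.
    points = []
    removed = 0
    for m_start, m_end in sorted(matches):
        points.append((m_start - removed, m_end - m_start))
        removed += m_end - m_start

    fixed = []
    for start, end in raw_tokens:
        # A span collapsing exactly at a token's start pushes the start past
        # it; one collapsing exactly at a token's end leaves the end alone.
        new_start = start + sum(n for p, n in points if p <= start)
        new_end = end + sum(n for p, n in points if p < end)
        fixed.append((new_start, new_end))
    return fixed
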
Example #2
    def test_near_matches(self):
        """
            original string: aaa (aa) (aaa) (aaa.1) aaa aaaa (a) 1 aaaaaa
            after elimination: aaa    aaa aaaa  1 aaaaaa
        """
        matches = [(4, 8), (9, 14), (15, 22), (32, 35)]
        raw_tokens = [(0, 3), (7, 10), (11, 15), (17, 18), (19, 25)]
        expected_tokens = [(0, 3), (23, 26), (27, 31), (36, 37), (38, 44)]

        self.assertEqual(fix_raw_tokens_after_elimination(raw_tokens, matches), expected_tokens)
Example #3
    def test_separated_matches(self):
        """
            original string: aaa (aa) aaaaa (aaa.1) aaa aaaa (a) 1 aaaaaa
            after elimination: aaa  aaaaa  aaa aaaa  1 aaaaaa
        """
        matches = [(4, 8), (15, 22), (32, 35)]
        raw_tokens = [(0, 3), (5, 10), (12, 15), (16, 20), (22, 23), (24, 30)]
        expected_tokens = [(0, 3), (9, 14), (23, 26), (27, 31), (36, 37), (38, 44)]

        self.assertEqual(fix_raw_tokens_after_elimination(raw_tokens, matches), expected_tokens)
Example #4
    def test_inner_matches(self):
        """
            original string: peptide (Arg(289)↓Lys(290)).
            after elimination: peptide (Arg↓Lys).
        """
        matches = [(12, 17), (21, 26)]
        raw_tokens = [(0, 7), (8, 9), (9, 16), (16, 17), (17, 18)]
        expected_tokens = [(0, 7), (8, 9), (9, 21), (26, 27), (27, 28)]

        self.assertEqual(fix_raw_tokens_after_elimination(raw_tokens, matches), expected_tokens)
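
This fixture pins down the boundary rule: (289) collapses strictly inside the
Arg↓Lys token, so only that token's end offset absorbs its length, while (290)
collapses exactly at the token's end and instead shifts the start of the
following ")" token. With the sketch above (hypothetical):

print(fix_raw_tokens_after_elimination([(9, 16), (16, 17)],
                                       [(12, 17), (21, 26)]))
# [(9, 21), (26, 27)]
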
Example #5
    def segment(self, text):
        sentences = []
        tokens = []
        raw_tokens = []
        token_sent_start = 0

        if self.post:
            # Strip references/figures first; `eliminations` records the
            # removed (start, end) spans in the original text.
            text, eliminations = eliminate_references_and_figures(text)

        for raw_sent_start, raw_sent_end in self.sent_model.span_tokenize(text):
            # NLTK's module-private Treebank tokenizer instance yields spans
            # relative to the sentence (a version-dependent attribute).
            sent_raw_tokens = list(
                nltk.tokenize._treebank_word_tokenizer.span_tokenize(
                    text[raw_sent_start:raw_sent_end]))

            # Shift sentence-relative spans into whole-text coordinates.
            raw_tokens.extend((start + raw_sent_start, end + raw_sent_start)
                              for start, end in sent_raw_tokens)
            tokens.extend(text[start + raw_sent_start:end + raw_sent_start]
                          for start, end in sent_raw_tokens)
            sentences.append(
                Sentence(token_sent_start,
                         token_sent_start + len(sent_raw_tokens)))

            token_sent_start += len(sent_raw_tokens)

        if self.post:
            # Repair tokenization artifacts, then map the raw token offsets
            # back into the original (pre-elimination) text.
            tokens, sentences, raw_tokens = fix_joined_tokens(
                tokens, sentences, raw_tokens)
            tokens, sentences, raw_tokens = fix_joined_names(
                tokens, sentences, raw_tokens)
            tokens = expand_names(tokens)
            raw_tokens = fix_raw_tokens_after_elimination(
                raw_tokens, eliminations)
            sentences = fix_sentences_ends(tokens, sentences)

        return tokens, sentences, raw_tokens
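
Taken together, segment tokenizes the reference-free text but hands back
raw_tokens indexed into the original input. A hypothetical usage sketch (the
Segmenter wrapper class and its constructor are assumptions, not shown in the
source):

segmenter = Segmenter(post=True)  # hypothetical wrapper exposing segment()
text = "peptide (Arg(289)↓Lys(290))."
tokens, sentences, raw_tokens = segmenter.segment(text)
# Raw spans index the ORIGINAL string, so each token's source region is
# recoverable even though "(289)" and "(290)" were stripped before
# tokenization, e.g. text[9:21] == "Arg(289)↓Lys".
for (start, end), token in zip(raw_tokens, tokens):
    print(token, repr(text[start:end]))
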
Example #6
    def test_no_matches(self):
        matches = []
        raw_tokens = [(0, 3), (4, 6), (7, 10)]

        self.assertEqual(fix_raw_tokens_after_elimination(raw_tokens, matches), raw_tokens)