Exemple #1
0
    def test_alpha_beta(self):
        """A figure mention is removed while a Greek-letter parenthetical stays."""
        source = "Bacteria-gen(Αβ) (Fig.1)"

        cleaned, spans = eliminate_references_and_figures(source)

        # "(Αβ)" is kept; only "(Fig.1)" at offsets 17..24 is cut out.
        self.assertEqual(cleaned, "Bacteria-gen(Αβ) ")
        self.assertEqual(spans, [(17, 24)])
Exemple #2
0
    def test_figure_tables(self):
        """Figure, table, bare-number and percentage parentheticals are removed."""
        source = "Pelecypod-associated (Fig. 1) bacteria(vegetables 3) in(2) habitat(Table 3)(64%).(Figure 3)"

        cleaned, spans = eliminate_references_and_figures(source)

        # "(vegetables 3)" is ordinary text and must remain in the output.
        self.assertEqual(
            cleaned,
            "Pelecypod-associated  bacteria(vegetables 3) in habitat.")
        # Spans are (start, end) offsets into the original string, one per
        # removed parenthetical, in order of appearance.
        self.assertEqual(
            spans,
            [(21, 29), (55, 58), (66, 75), (75, 80), (81, 91)])
Exemple #3
0
    def test_no_references(self):
        """Text without any parentheticals comes back unchanged, with no spans."""
        source = "Pelecypod-associated bacteria in habitat."

        cleaned, spans = eliminate_references_and_figures(source)

        self.assertEqual(cleaned, source)
        self.assertEqual(spans, [])
Exemple #4
0
    def test_digits_lists(self):
        """Numeric lists/ranges and an author-year citation are removed."""
        source = "(1, 1-3) bacteria(3) (Li et al.2004) (a 230-231) (55, 56, 57, 10–16)"

        cleaned, spans = eliminate_references_and_figures(source)

        # "(a 230-231)" starts with a letter and is expected to survive.
        self.assertEqual(cleaned, " bacteria  (a 230-231) ")
        # Offsets of the four removed groups within the original string.
        self.assertEqual(spans, [(0, 8), (17, 20), (21, 36), (49, 68)])
Exemple #5
0
    def test_references(self):
        """Several spellings of "et al" citations are all removed."""
        source = (
            "Pelecypod-associated (yum et.al 2004) bacteria(greenwood et al,2003) "
            "in habitat(lee et.al. 2007).(wang et al. 80)"
        )

        cleaned, spans = eliminate_references_and_figures(source)

        self.assertEqual(cleaned, "Pelecypod-associated  bacteria in habitat.")
        # One (start, end) span per citation, as offsets into the original.
        self.assertEqual(spans, [(21, 37), (46, 68), (79, 96), (97, 113)])
Exemple #6
0
    def segment(self, text):
        """Split *text* into sentences and word tokens.

        Sentence boundaries come from ``self.sent_model.span_tokenize`` and
        word boundaries from NLTK's treebank word tokenizer, applied to each
        sentence separately.

        When ``self.post`` is truthy, references and figure mentions are
        stripped from the text before tokenizing, and a chain of fix-up
        helpers is applied to the result afterwards.

        Returns a 3-tuple ``(tokens, sentences, raw_tokens)``:

        * ``tokens`` -- the token strings, in document order;
        * ``sentences`` -- ``Sentence`` objects built from a start token
          index and an end token index (start + number of tokens);
        * ``raw_tokens`` -- ``(start, end)`` character offsets, one per token.
          NOTE(review): with ``self.post`` set these are passed through
          ``fix_raw_tokens_after_elimination``, presumably to map them back
          onto the un-stripped input -- confirm against that helper.
        """
        if self.post:
            text, eliminations = eliminate_references_and_figures(text)

        tokens, sentences, raw_tokens = [], [], []
        first_token = 0

        for sent_start, sent_end in self.sent_model.span_tokenize(text):
            sent_text = text[sent_start:sent_end]

            # The word tokenizer reports offsets relative to the sentence;
            # shift them so they are relative to the whole text.
            absolute_spans = [
                (begin + sent_start, finish + sent_start)
                for begin, finish
                in nltk._treebank_word_tokenizer.span_tokenize(sent_text)
            ]

            raw_tokens.extend(absolute_spans)
            tokens.extend(text[begin:finish]
                          for begin, finish in absolute_spans)

            next_first_token = first_token + len(absolute_spans)
            sentences.append(Sentence(first_token, next_first_token))
            first_token = next_first_token

        if self.post:
            # The order of these fix-ups is kept exactly as it was.
            tokens, sentences, raw_tokens = fix_joined_tokens(
                tokens, sentences, raw_tokens)
            tokens, sentences, raw_tokens = fix_joined_names(
                tokens, sentences, raw_tokens)
            tokens = expand_names(tokens)
            raw_tokens = fix_raw_tokens_after_elimination(
                raw_tokens, eliminations)
            sentences = fix_sentences_ends(tokens, sentences)

        return tokens, sentences, raw_tokens