Example #1
    def test_segmenter(self):
        def make_sentences(segmented_tokens):
            for sentence in segmented_tokens:
                yield "".join(str(token) for token in sentence).strip()

        self.maxDiff = None
        expected = "\n".join(make_sentences(SEGMENTED_TOKENS))
        received = "\n".join(make_sentences(segmenter.split(TOKENIZER.tokenize(TEXT))))
        assert expected == OSPL
        assert expected == received
        assert SEGMENTED_TOKENS == segmenter.split(TOKENIZER.tokenize(TEXT))
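For orientation, here is a minimal end-to-end sketch of the API these tests exercise, assuming the syntok package as used above: Tokenizer.split() turns text into a flat token list, and segmenter.split() groups those tokens into one token list per sentence.

from syntok import segmenter
from syntok.tokenizer import Tokenizer

text = "This is a sentence. And this is another one."
tokens = Tokenizer().split(text)           # flat list of Token objects
sentences = segmenter.split(iter(tokens))  # one token list per sentence

for sentence in sentences:
    # str(token) includes the token's leading spacing, so a plain join
    # reconstructs the sentence text; strip() drops the leading space
    print("".join(str(token) for token in sentence).strip())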
Example #2
 def test_brackets_before_the_terminal(self):
     tokens = Tokenizer().split(
         "Brackets before the terminal [2]. You know I told you so."
     )
     sep = 8
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Example #3
 def test_two_sentences_with_quotes_and_parenthesis_in_both(self):
     tokens = Tokenizer().split(
         '{"This is a sentence."} ["This is another sentence."]'
     )
     sep = 9
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Example #4
 def test_sentence_marker_after_abbreviation(self):
     tokens = Tokenizer().split(
         "Let's meet at 14.10 in N.Y.. This happened in the U.S. last week."
     )
     sep = 9
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Example #5
 def test_sentences_with_nasty_abbreviations(self):
     tokens = Tokenizer().split(
         "This is Capt. Motto here. And here is Sra. Smithers."
     )
     sep = 7
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Example #6
 def test_simple(self):
     tokens = [Token("", value, 0)
               for value in ["This", "is", "a", "sentence", "."]]
     # noinspection PyTypeChecker
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens], result)
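The tests above construct Token objects directly. Judging from those calls, the constructor takes (spacing, value, offset); a small round-trip sketch, assuming syntok.tokenizer.Token:

from syntok.tokenizer import Token

# Token(spacing, value, offset): str(token) yields spacing + value, so
# joining the tokens reproduces the original surface text
tokens = [Token("", "This", 0), Token(" ", "is", 5),
          Token(" ", "fine", 8), Token("", ".", 12)]
print("".join(str(t) for t in tokens))  # This is fine.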
Example #7
 def test_split_long_text_inside_parenthesis2(self):
     tokens = Tokenizer().split(
         "This is one (Here is another view of the same. And then there is a different case here.)"
     )
     sep1 = 3
     sep2 = 12
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep1], tokens[sep1:sep2], tokens[sep2:]], result)
Example #8
 def test_split_with_a_simple_parenthesis_structure(self):
     tokens = Tokenizer().split("And another sentence on the same line. "
                                "(How about a sentence in parenthesis?) "
                                'Or a sentence with "a quote!"')
     sep1 = 8
     sep2 = 17
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep1], tokens[sep1:sep2], tokens[sep2:]],
                      result)
Example #9
def text_to_chunks(text):
    chunksize = 1
    tokenized_sents = list(split(Tokenizer().tokenize(text)))
    sents = [
        ' '.join(str(token) for token in sent) for sent in tokenized_sents
    ]
    sents = [
        # group sentences into non-overlapping chunks of `chunksize`;
        # stepping by chunksize keeps the final sentences as well
        ' '.join(sents[i:i + chunksize])
        for i in range(0, len(sents), chunksize)
    ]
    sentences = []
    for sent in sents:
        sent = text_to_sentence(sent)
        if sent is not None:
            sentences.append(sent)
    return sentences
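text_to_chunks relies on two names that are not part of the excerpt: split (presumably syntok.segmenter.split) and a text_to_sentence cleanup helper. A hypothetical stand-in, just to make the snippet runnable:

from syntok.segmenter import split
from syntok.tokenizer import Tokenizer

def text_to_sentence(sent):
    # placeholder for the real helper, which presumably normalizes or
    # filters sentences; here we only drop empty strings
    sent = sent.strip()
    return sent if sent else None

chunks = text_to_chunks("First sentence. Second sentence.")
print(chunks)  # one chunk per sentence when chunksize is 1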
Example #10
 def test_two_sentences_with_quotes_in_first(self):
     tokens = Tokenizer().split('"This is a sentence." This is another sentence.')
     sep = 7
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Example #11
 def test_sentence_with_single_quotes(self):
     tokens = Tokenizer().split("This is a sentence. 'This is another sentence.'")
     sep = 5
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Example #12
 def test_two_sentences_with_parenthesis_in_second(self):
     tokens = Tokenizer().split("This is a sentence. (This is another sentence.)")
     sep = 5
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Example #13
 def test_two_questions(self):
     tokens = Tokenizer().split("Is this a sentence? Is this another sentence?")
     sep = 5
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Example #14
 def test_two_exclamations(self):
     tokens = Tokenizer().split("This is a sentence! This is another sentence!")
     sep = 5
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens[:sep], tokens[sep:]], result)
Example #15
 def test_two_tokens(self):
     tokens = [Token("", "I", 0), Token("", ".", 1)]
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens], result)
Example #16
 def test_one_token(self):
     tokens = [Token("", "I", 0)]
     result = segmenter.split(iter(tokens))
     self.assertEqual([tokens], result)
Example #17
 def test_empty(self):
     tokens = []
     result = segmenter.split(iter(tokens))
     self.assertEqual([], result)
Example #18
def syntok_tokenize(text):
    tokens = syntok_tokenizer.split(text)
    result = syntok_segmenter.split(iter(tokens))
    segments = list(make_sentences(result))
    return segments
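syntok_tokenize assumes module-level helpers that the excerpt does not show. A plausible setup, with make_sentences rebuilding each sentence string from its tokens as in Example #1:

from syntok import segmenter as syntok_segmenter
from syntok.tokenizer import Tokenizer

syntok_tokenizer = Tokenizer()

def make_sentences(segmented_tokens):
    for sentence in segmented_tokens:
        yield "".join(str(token) for token in sentence).strip()

print(syntok_tokenize("One sentence here. And a second one."))
# ['One sentence here.', 'And a second one.']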