# Assumed imports for this excerpt (syntok's public API); TEXT, OSPL,
# SEGMENTED_TOKENS, and TOKENIZER are fixtures defined elsewhere in the suite.
import syntok.segmenter as segmenter
from syntok.tokenizer import Token, Tokenizer


def test_segmenter(self):
    def make_sentences(segmented_tokens):
        for sentence in segmented_tokens:
            yield "".join(str(token) for token in sentence).strip()

    self.maxDiff = None
    expected = "\n".join(make_sentences(SEGMENTED_TOKENS))
    received = "\n".join(
        make_sentences(segmenter.split(TOKENIZER.tokenize(TEXT)))
    )
    assert expected == OSPL
    assert expected == received
    assert SEGMENTED_TOKENS == segmenter.split(TOKENIZER.tokenize(TEXT))

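# A minimal sketch of the pipeline the test above exercises, using only the
# public syntok API (Tokenizer.split and segmenter.split); the helper name is
# ours, not part of the test suite:
def _segment_text_example(text="One sentence. Another one."):
    tokens = Tokenizer().split(text)
    for sentence in segmenter.split(iter(tokens)):
        # str(token) re-attaches the token's leading spacing, so joining and
        # stripping reconstructs each sentence's surface form.
        yield "".join(str(token) for token in sentence).strip()

# list(_segment_text_example()) == ["One sentence.", "Another one."]
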
def test_brackets_before_the_terminal(self):
    tokens = Tokenizer().split(
        "Brackets before the terminal [2]. You know I told you so."
    )
    sep = 8
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens[:sep], tokens[sep:]], result)

def test_two_sentences_with_quotes_and_parenthesis_in_both(self):
    tokens = Tokenizer().split(
        '{"This is a sentence."} ["This is another sentence."]'
    )
    sep = 9
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens[:sep], tokens[sep:]], result)

def test_sentence_marker_after_abbreviation(self):
    # "N.Y.." ends with an abbreviation followed by its own sentence terminal.
    tokens = Tokenizer().split(
        "Let's meet at 14.10 in N.Y.. This happened in the U.S. last week."
    )
    sep = 9
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens[:sep], tokens[sep:]], result)

def test_sentences_with_nasty_abbreviations(self):
    tokens = Tokenizer().split(
        "This is Capt. Motto here. And here is Sra. Smithers."
    )
    sep = 7
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens[:sep], tokens[sep:]], result)

def test_simple(self):
    # Token takes (spacing, value, offset); the offsets are all 0 here, which
    # does not matter for this test.
    tokens = [Token("", word, 0) for word in ["This", "is", "a", "sentence", "."]]
    # noinspection PyTypeChecker
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens], result)

def test_split_long_text_inside_parenthesis2(self):
    tokens = Tokenizer().split(
        "This is one (Here is another view of the same."
        " And then there is a different case here.)"
    )
    sep1 = 3
    sep2 = 12
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens[:sep1], tokens[sep1:sep2], tokens[sep2:]], result)

def test_split_with_a_simple_parenthesis_structure(self):
    tokens = Tokenizer().split(
        "And another sentence on the same line. "
        "(How about a sentence in parenthesis?) "
        'Or a sentence with "a quote!"'
    )
    sep1 = 8
    sep2 = 17
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens[:sep1], tokens[sep1:sep2], tokens[sep2:]], result)

def text_to_chunks(text):
    # `split` (the segmenter) and `text_to_sentence` are assumed to be defined
    # or imported elsewhere in this module.
    chunksize = 1
    tokenized_sents = list(split(Tokenizer().tokenize(text)))
    sents = [" ".join(str(token) for token in sent) for sent in tokenized_sents]
    # Group consecutive sentences into chunks of `chunksize`; stepping the
    # range by `chunksize` also keeps the final chunk instead of dropping it.
    sents = [
        " ".join(sents[i:i + chunksize])
        for i in range(0, len(sents), chunksize)
    ]
    sentences = []
    for sent in sents:
        sent = text_to_sentence(sent)
        if sent is not None:
            sentences.append(sent)
    return sentences

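# A worked example of the grouping above (hypothetical values; text_to_chunks
# itself fixes chunksize to 1):
def _chunking_example():
    sents = ["S1.", "S2.", "S3.", "S4.", "S5."]
    chunksize = 2
    chunks = [
        " ".join(sents[i:i + chunksize])
        for i in range(0, len(sents), chunksize)
    ]
    # Non-overlapping groups, including the final short one.
    assert chunks == ["S1. S2.", "S3. S4.", "S5."]
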
def test_two_sentences_with_quotes_in_first(self):
    tokens = Tokenizer().split('"This is a sentence." This is another sentence.')
    sep = 7
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens[:sep], tokens[sep:]], result)

def test_sentence_with_single_quotes(self):
    tokens = Tokenizer().split("This is a sentence. 'This is another sentence.'")
    sep = 5
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens[:sep], tokens[sep:]], result)

def test_two_sentences_with_parenthesis_in_second(self):
    tokens = Tokenizer().split("This is a sentence. (This is another sentence.)")
    sep = 5
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens[:sep], tokens[sep:]], result)

def test_two_questions(self):
    tokens = Tokenizer().split("Is this a sentence? Is this another sentence?")
    sep = 5
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens[:sep], tokens[sep:]], result)

def test_two_exclamations(self):
    tokens = Tokenizer().split("This is a sentence! This is another sentence!")
    sep = 5
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens[:sep], tokens[sep:]], result)

def test_two_tokens(self):
    tokens = [Token("", "I", 0), Token("", ".", 1)]
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens], result)

def test_one_token(self):
    tokens = [Token("", "I", 0)]
    result = segmenter.split(iter(tokens))
    self.assertEqual([tokens], result)

def test_empty(self):
    tokens = []
    result = segmenter.split(iter(tokens))
    self.assertEqual([], result)

def syntok_tokenize(text):
    tokens = syntok_tokenizer.split(text)
    result = syntok_segmenter.split(iter(tokens))
    segments = list(make_sentences(result))
    return segments

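# `make_sentences` is not defined in this excerpt; a minimal module-level
# sketch, assuming it mirrors the helper nested inside test_segmenter above:
def make_sentences(segmented_tokens):
    for sentence in segmented_tokens:
        yield "".join(str(token) for token in sentence).strip()

# Example, assuming syntok_tokenizer = Tokenizer() and syntok_segmenter is the
# syntok.segmenter module:
#     syntok_tokenize("One sentence. Another one.")
#     -> ["One sentence.", "Another one."]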