def test_set_sentence_segmentation_with_max_number_of_sentences():
    tokenizer = Tokenizer(TokenizerConfiguration(max_nr_of_sentences=2))
    tokenized = tokenizer.tokenize_document([
        "This is a sentence. This is another sentence.",
        "One more sentence here.",
        "Last sentence here.",
    ])
    assert len(tokenized) == 2

def test_min_max_sentence_length():
    tokenizer = Tokenizer(
        TokenizerConfiguration(
            segment_sentences=True, min_sentence_length=10, max_sentence_length=15
        )
    )
    tokenized = tokenizer.tokenize_text("short. A very long sentence. This is fine")
    assert len(tokenized) == 1
    assert len(tokenized[0]) == 3

def test_document_cleaning():
    tokenizer = Tokenizer(
        TokenizerConfiguration(
            text_cleaning={"rules": ["html_to_text", "strip_spaces"]},
            segment_sentences=True,
        )
    )
    # `html_text` is an HTML sample defined elsewhere in the test module
    # (a heading plus two paragraphs, judging from the expected tokens below).
    tokenized = tokenizer.tokenize_document([html_text])
    assert len(tokenized) == 2
    assert len(tokenized[0]) == 7, "Expected [My, First, Heading, My, first, paragraph, .]"
    assert len(tokenized[1]) == 4, "Expected [My, second, paragraph, .]"

def test_text_cleaning_with_sentence_segmentation_and_max_sequence():
    tokenizer = Tokenizer(
        TokenizerConfiguration(
            max_sequence_length=8,
            text_cleaning={"rules": ["html_to_text", "strip_spaces"]},
            segment_sentences=True,
        )
    )
    tokenized = tokenizer.tokenize_text(html_text)
    assert len(tokenized) == 2
    assert len(tokenized[0]) == 2, "Expected [My, First]"
    assert len(tokenized[1]) == 2, "Expected [My, second]"

def build_tokenizer(self) -> Tokenizer:
    """Build the pipeline tokenizer"""
    if self.tokenizer_config.use_transformers:
        return TransformersTokenizer(self.tokenizer_config)
    return Tokenizer(self.tokenizer_config)

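# A minimal sketch of exercising the factory above. Assumptions not shown in
# this excerpt: `build_tokenizer` is reachable as a plain function here, and a
# default TokenizerConfiguration leaves `use_transformers` falsy, so the
# spaCy-backed Tokenizer branch is taken.
from types import SimpleNamespace


def test_build_tokenizer_returns_default_tokenizer_sketch():
    # Stand-in for the enclosing pipeline object: it only needs to expose the
    # `tokenizer_config` attribute that `build_tokenizer` reads.
    pipeline_stub = SimpleNamespace(tokenizer_config=TokenizerConfiguration())
    assert isinstance(build_tokenizer(pipeline_stub), Tokenizer)
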
def test_using_allennlp_tokens():
    tokenizer = Tokenizer(TokenizerConfiguration(use_spacy_tokens=False))
    tokenized = tokenizer.tokenize_text("This is a text")
    assert len(tokenized) == 1
    assert len(tokenized[0]) == 4
    assert all(isinstance(t, AllennlpToken) for t in tokenized[0])