def test_set_sentence_segmentation_with_max_number_of_sentences():
    tokenizer = Tokenizer(TokenizerConfiguration(max_nr_of_sentences=2))
    tokenized = tokenizer.tokenize_document([
        "This is a sentence. This is another sentence.",
        "One more sentence here.",
        "Last sentence here.",
    ])
    # max_nr_of_sentences=2 caps the tokenized output at two sentences.
    assert len(tokenized) == 2
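These excerpts omit their imports. A plausible preamble, assuming biome-text's package layout (the exact module paths below are an assumption, not shown in the excerpt), would be:

# Assumed import paths; biome-text's actual module layout may differ.
from biome.text.configuration import TokenizerConfiguration
from biome.text.tokenizer import Tokenizer, TransformersTokenizer
# AllenNLP's Token class, aliased as it appears in the last example
# (the alias and import path are assumptions).
from allennlp.data.tokenizers import Token as AllennlpToken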
Example 2
def test_min_max_sentence_length():
    tokenizer = Tokenizer(
        TokenizerConfiguration(segment_sentences=True,
                               min_sentence_length=10,
                               max_sentence_length=15))
    tokenized = tokenizer.tokenize_text(
        "short. A very long sentence. This is fine")

    # Only "This is fine" (12 characters) falls inside the 10-15 character
    # window, so a single sentence with 3 tokens remains.
    assert len(tokenized) == 1
    assert len(tokenized[0]) == 3
def test_document_cleaning():
    tokenizer = Tokenizer(
        TokenizerConfiguration(
            text_cleaning={"rules": ["html_to_text", "strip_spaces"]},
            segment_sentences=True,
        ))

    tokenized = tokenizer.tokenize_document([html_text])
    assert len(tokenized) == 2
    assert (len(tokenized[0]) == 7
            ), "Expected [My, First, Heading, My, first, paragraph, .]"
    assert len(tokenized[1]) == 4, "Expected [My, second, paragraph, .]"
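The html_text value used here and in the next example is defined elsewhere in the original test module. Judging from the assertion messages, a minimal stand-in could look like the following (the markup is a hypothetical reconstruction from the expected tokens, not the real fixture):

# Hypothetical stand-in for html_text, reconstructed from the expected
# tokens in the assertions above; the real fixture may differ.
html_text = (
    "<html><body>"
    "<h1>My First Heading</h1>"
    "<p>My first paragraph.</p>"
    "<p>My second paragraph.</p>"
    "</body></html>"
)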
def test_text_cleaning_with_sentence_segmentation_and_max_sequence():
    tokenizer = Tokenizer(
        TokenizerConfiguration(
            max_sequence_length=8,
            text_cleaning={"rules": ["html_to_text", "strip_spaces"]},
            segment_sentences=True,
        ))

    tokenized = tokenizer.tokenize_text(html_text)
    assert len(tokenized) == 2
    assert len(tokenized[0]) == 2, "Expected [My, First]"
    assert len(tokenized[1]) == 2, "Expected [My, second]"
Example 5
def build_tokenizer(self) -> Tokenizer:
    """Build the pipeline tokenizer"""
    if self.tokenizer_config.use_transformers:
        return TransformersTokenizer(self.tokenizer_config)
    return Tokenizer(self.tokenizer_config)
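This factory method, excerpted from the surrounding pipeline/configuration class, picks the concrete tokenizer from the same TokenizerConfiguration used in the tests above: configurations with use_transformers set get a TransformersTokenizer, everything else falls back to the default Tokenizer.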
def test_using_allennlp_tokens():
    tokenizer = Tokenizer(TokenizerConfiguration(use_spacy_tokens=False))
    tokenized = tokenizer.tokenize_text("This is a text")
    # With spaCy tokens disabled, the tokenizer should emit AllenNLP Token objects.
    assert len(tokenized) == 1
    assert len(tokenized[0]) == 4
    assert all(isinstance(t, AllennlpToken) for t in tokenized[0])