Example #1
def test_pipeline_without_word_features():
    tokenizer_config = TokenizerConfiguration()
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(char=char_features)
    encoder_spec = Seq2SeqEncoderConfiguration(type="gru",
                                               hidden_size=2,
                                               num_layers=1,
                                               bidirectional=True)

    head_spec = TaskHeadConfiguration(
        type="TextClassification",
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )

    pipeline_config = PipelineConfiguration(
        name="no_word_features",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )

    pl = Pipeline.from_config(pipeline_config)
    assert "word" not in pl.backbone.featurizer.indexer
    assert "char" in pl.backbone.featurizer.indexer
Example #2
def test_set_sentence_segmentation_with_max_number_of_sentences():
    tokenizer = Tokenizer(TokenizerConfiguration(max_nr_of_sentences=2))
    tokenized = tokenizer.tokenize_document([
        "This is a sentence. This is another sentence.",
        "One more sentence here.",
        "Last sentence here.",
    ])
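    # max_nr_of_sentences=2 caps the tokenized output at the first two sentences of the document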
    assert len(tokenized) == 2
Example #3
def test_pipeline_config(pipeline_yaml):
    tokenizer_config = TokenizerConfiguration(
        text_cleaning={"rules": ["strip_spaces"]}, use_spacy_tokens=True)

    word_features = WordFeatures(embedding_dim=2, lowercase_tokens=True)
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(word=word_features,
                                            char=char_features)

    encoder_spec = Seq2SeqEncoderConfiguration(type="gru",
                                               hidden_size=2,
                                               num_layers=1,
                                               bidirectional=True)

    head_spec = TaskHeadConfiguration(
        type=TextClassification,
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )

    pipeline_config = PipelineConfiguration(
        name="test_pipeline_config",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )

    pl = Pipeline.from_config(pipeline_config)

    pl_yaml = Pipeline.from_yaml(pipeline_yaml)

    assert pl.named_trainable_parameters == pl_yaml.named_trainable_parameters
    assert pl.num_trainable_parameters == pl_yaml.num_trainable_parameters
    assert pl.num_parameters == pl_yaml.num_parameters

    sample_text = "My simple text"
    for instance in [
            pl.backbone.featurizer(sample_text),
            pl_yaml.backbone.featurizer(sample_text),
    ]:
        for key, value in instance.items():
            assert key == "record"
            assert isinstance(value, ListField)
            assert len(value) == 1
            for text in value:
                assert isinstance(text, TextField)
                assert all(map(lambda t: isinstance(t, Token), text.tokens))
                assert sample_text == " ".join([t.text for t in text.tokens])
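
The pipeline_yaml fixture referenced above is not shown on this page. Below is a minimal sketch of what it could look like, assuming the YAML keys simply mirror the keyword arguments passed to PipelineConfiguration in this test; the file handling and the exact schema are assumptions, not part of the original test.

import pytest

@pytest.fixture
def pipeline_yaml(tmp_path):
    # Hypothetical YAML mirroring the PipelineConfiguration built above
    yaml_config = """
name: test_pipeline_config
tokenizer:
  text_cleaning:
    rules:
      - strip_spaces
  use_spacy_tokens: true
features:
  word:
    embedding_dim: 2
    lowercase_tokens: true
  char:
    embedding_dim: 2
    encoder:
      type: gru
      hidden_size: 2
      num_layers: 1
      bidirectional: true
    dropout: 0.1
encoder:
  type: gru
  hidden_size: 2
  num_layers: 1
  bidirectional: true
head:
  type: TextClassification
  labels:
    - duplicate
    - not_duplicate
  pooler:
    type: boe
"""
    path = tmp_path / "pipeline.yml"
    path.write_text(yaml_config)
    return str(path)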
Example #4
def test_min_max_sentence_length():
    tokenizer = Tokenizer(
        TokenizerConfiguration(segment_sentences=True,
                               min_sentence_length=10,
                               max_sentence_length=15))
    tokenized = tokenizer.tokenize_text(
        "short. A very long sentence. This is fine")

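    # The length bounds are presumably measured in characters: only "This is fine"
    # (12 characters) falls inside the 10-15 window, leaving one sentence of 3 tokens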
    assert len(tokenized) == 1
    assert len(tokenized[0]) == 3
Example #5
def test_pipeline_default_tokenizer(pipeline_dict):
    pipeline_dict["features"].update({"word": {"embedding_dim": 2}})
    pl = Pipeline.from_config(pipeline_dict)

    assert pl.config.tokenizer_config == TokenizerConfiguration()
    assert pl.config.features.transformers.mismatched is True
    assert (type(pl.backbone.featurizer.indexer["transformers"]) is
            PretrainedTransformerMismatchedIndexer)
    assert type(pl.backbone.tokenizer) is Tokenizer

    prediction = pl.predict("Test this!")
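
The pipeline_dict fixture is not shown either. Judging from the assertions, it has to declare a transformers feature (the test checks that the mismatched transformer indexer is picked by default) and a classification head. A hypothetical shape is sketched below; the model name is an arbitrary small placeholder and the labels are illustrative only.

import pytest

@pytest.fixture
def pipeline_dict():
    # Hypothetical fixture: the transformers feature and the head type are inferred
    # from the assertions in the test above; the model name is a placeholder.
    return {
        "name": "transformers_tokenizer",
        "features": {
            "transformers": {"model_name": "sshleifer/tiny-distilbert-base-cased"}
        },
        "head": {"type": "TextClassification", "labels": ["a", "b"]},
    }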
Example #6
def test_document_cleaning():
    tokenizer = Tokenizer(
        TokenizerConfiguration(
            text_cleaning={"rules": ["html_to_text", "strip_spaces"]},
            segment_sentences=True,
        ))

    tokenized = tokenizer.tokenize_document([html_text])
    assert len(tokenized) == 2
    assert (len(tokenized[0]) == 7
            ), "Expected [My, First, Heading, My, first, paragraph, .]"
    assert len(tokenized[1]) == 4, "Expected [My, second, paragraph, .]"
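
The module-level html_text used by this test and the next one is also not shown, and its real value is unknown. One hypothetical HTML snippet that would be consistent with the token counts asserted above is:

html_text = (
    "<html><body>"
    "<h1>My First Heading</h1>"
    "<p>My first paragraph.</p>"
    "<p>My second paragraph.</p>"
    "</body></html>"
)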
Example #7
def test_text_cleaning_with_sentence_segmentation_and_max_sequence():
    tokenizer = Tokenizer(
        TokenizerConfiguration(
            max_sequence_length=8,
            text_cleaning={"rules": ["html_to_text", "strip_spaces"]},
            segment_sentences=True,
        ))

    tokenized = tokenizer.tokenize_text(html_text)
    assert len(tokenized) == 2
    assert len(tokenized[0]) == 2, "Expected [My, First]"
    assert len(tokenized[1]) == 2, "Expected [My, second]"
Example #8
def test_using_allennlp_tokens():
    tokenizer = Tokenizer(TokenizerConfiguration(use_spacy_tokens=False))
    tokenized = tokenizer.tokenize_text("This is a text")
    assert len(tokenized) == 1
    assert len(tokenized[0]) == 4
    assert all(map(lambda t: isinstance(t, AllennlpToken), tokenized[0]))
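
As a counterpart (an assumption mirroring the test above, not part of the original page): with use_spacy_tokens=True, as used in Example #3, the tokenizer is expected to yield spaCy Token objects rather than AllenNLP ones.

from spacy.tokens import Token as SpacyToken

# Sketch only: assumes use_spacy_tokens=True keeps the underlying spaCy tokens
tokenizer = Tokenizer(TokenizerConfiguration(use_spacy_tokens=True))
tokenized = tokenizer.tokenize_text("This is a text")
assert all(map(lambda t: isinstance(t, SpacyToken), tokenized[0]))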