# NOTE: this section does not show the original import block. The paths below
# follow biome.text's public package layout and AllenNLP's data API; treat any
# that differ in your installed version as assumptions to adjust.
import pytest
from allennlp.data import Token as AllennlpToken
from allennlp.data.fields import ListField, TextField
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from spacy.tokens import Token

from biome.text import Pipeline
from biome.text.configuration import (
    FeaturesConfiguration,
    PipelineConfiguration,
    TokenizerConfiguration,
)
from biome.text.features import CharFeatures, WordFeatures
from biome.text.modules.configuration import Seq2SeqEncoderConfiguration
from biome.text.modules.heads import TaskHeadConfiguration, TextClassification
from biome.text.tokenizer import Tokenizer


def test_pipeline_without_word_features():
    tokenizer_config = TokenizerConfiguration()
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(char=char_features)
    encoder_spec = Seq2SeqEncoderConfiguration(
        type="gru", hidden_size=2, num_layers=1, bidirectional=True
    )
    head_spec = TaskHeadConfiguration(
        type="TextClassification",
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )
    pipeline_config = PipelineConfiguration(
        name="no_word_features",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )
    pl = Pipeline.from_config(pipeline_config)

    assert "word" not in pl.backbone.featurizer.indexer
    assert "char" in pl.backbone.featurizer.indexer
def test_set_sentence_segmentation_with_max_number_of_sentences():
    tokenizer = Tokenizer(TokenizerConfiguration(max_nr_of_sentences=2))
    tokenized = tokenizer.tokenize_document(
        [
            "This is a sentence. This is another sentence.",
            "One more sentence here.",
            "Last sentence here.",
        ]
    )

    assert len(tokenized) == 2
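# `pipeline_yaml` is a fixture consumed by test_pipeline_config below and is
# defined outside this section (typically in conftest.py). A minimal sketch of
# what it could look like, assuming it writes a YAML file mirroring the
# configuration built programmatically in that test; the file name and exact
# YAML layout are assumptions, not the original fixture:
@pytest.fixture
def pipeline_yaml(tmp_path):
    yaml_config = """
name: test_pipeline_config
tokenizer:
    text_cleaning:
        rules:
            - strip_spaces
    use_spacy_tokens: true
features:
    word:
        embedding_dim: 2
        lowercase_tokens: true
    char:
        embedding_dim: 2
        encoder:
            type: gru
            hidden_size: 2
            num_layers: 1
            bidirectional: true
        dropout: 0.1
encoder:
    type: gru
    hidden_size: 2
    num_layers: 1
    bidirectional: true
head:
    type: TextClassification
    labels:
        - duplicate
        - not_duplicate
    pooler:
        type: boe
"""
    yaml_file = tmp_path / "pipeline.yaml"
    yaml_file.write_text(yaml_config)
    return str(yaml_file)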
def test_pipeline_config(pipeline_yaml):
    tokenizer_config = TokenizerConfiguration(
        text_cleaning={"rules": ["strip_spaces"]}, use_spacy_tokens=True
    )
    word_features = WordFeatures(embedding_dim=2, lowercase_tokens=True)
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(word=word_features, char=char_features)
    encoder_spec = Seq2SeqEncoderConfiguration(
        type="gru", hidden_size=2, num_layers=1, bidirectional=True
    )
    head_spec = TaskHeadConfiguration(
        type=TextClassification,
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )
    pipeline_config = PipelineConfiguration(
        name="test_pipeline_config",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )

    pl = Pipeline.from_config(pipeline_config)
    pl_yaml = Pipeline.from_yaml(pipeline_yaml)

    assert pl.named_trainable_parameters == pl_yaml.named_trainable_parameters
    assert pl.num_trainable_parameters == pl_yaml.num_trainable_parameters
    assert pl.num_parameters == pl_yaml.num_parameters

    sample_text = "My simple text"
    for instance in [
        pl.backbone.featurizer(sample_text),
        pl_yaml.backbone.featurizer(sample_text),
    ]:
        for key, value in instance.items():
            assert key == "record"
            assert isinstance(value, ListField)
            assert len(value) == 1
            for text in value:
                assert isinstance(text, TextField)
                assert all(isinstance(t, Token) for t in text.tokens)
                assert sample_text == " ".join(t.text for t in text.tokens)
def test_min_max_sentence_length():
    tokenizer = Tokenizer(
        TokenizerConfiguration(
            segment_sentences=True, min_sentence_length=10, max_sentence_length=15
        )
    )
    tokenized = tokenizer.tokenize_text("short. A very long sentence. This is fine")

    # Only "This is fine" (12 chars) passes the 10-15 character length filter;
    # "short." is too short and "A very long sentence." is too long.
    assert len(tokenized) == 1
    assert len(tokenized[0]) == 3
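# `pipeline_dict` is a fixture consumed by test_pipeline_default_tokenizer
# below. A minimal sketch, assuming a transformers feature is configured so
# that the mismatched-indexer assertions make sense; the tiny model name and
# the labels are assumptions chosen to keep the test lightweight:
@pytest.fixture
def pipeline_dict():
    return {
        "name": "test_pipeline_default_tokenizer",
        "features": {
            "transformers": {"model_name": "sshleifer/tiny-distilbert-base-cased"}
        },
        "head": {
            "type": "TextClassification",
            "labels": ["duplicate", "not_duplicate"],
        },
    }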
def test_pipeline_default_tokenizer(pipeline_dict):
    pipeline_dict["features"].update({"word": {"embedding_dim": 2}})
    pl = Pipeline.from_config(pipeline_dict)

    assert pl.config.tokenizer_config == TokenizerConfiguration()
    assert pl.config.features.transformers.mismatched is True
    assert (
        type(pl.backbone.featurizer.indexer["transformers"])
        is PretrainedTransformerMismatchedIndexer
    )
    assert type(pl.backbone.tokenizer) is Tokenizer

    # Smoke test: prediction should run without raising.
    pl.predict("Test this!")
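# `html_text` feeds the two cleaning tests below. A sketch consistent with
# their assertions (the exact markup of the original document is an
# assumption): after the `html_to_text` rule, sentence segmentation should
# yield "My First Heading My first paragraph." and "My second paragraph."
html_text = """
<html>
    <body>
        <h1>My First Heading</h1>
        <p>My first paragraph.</p>
        <p>My second paragraph.</p>
    </body>
</html>
"""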
def test_document_cleaning():
    tokenizer = Tokenizer(
        TokenizerConfiguration(
            text_cleaning={"rules": ["html_to_text", "strip_spaces"]},
            segment_sentences=True,
        )
    )
    tokenized = tokenizer.tokenize_document([html_text])

    assert len(tokenized) == 2
    assert (
        len(tokenized[0]) == 7
    ), "Expected [My, First, Heading, My, first, paragraph, .]"
    assert len(tokenized[1]) == 4, "Expected [My, second, paragraph, .]"
def test_text_cleaning_with_sentence_segmentation_and_max_sequence():
    tokenizer = Tokenizer(
        TokenizerConfiguration(
            max_sequence_length=8,
            text_cleaning={"rules": ["html_to_text", "strip_spaces"]},
            segment_sentences=True,
        )
    )
    tokenized = tokenizer.tokenize_text(html_text)

    assert len(tokenized) == 2
    assert len(tokenized[0]) == 2, "Expected [My, First]"
    assert len(tokenized[1]) == 2, "Expected [My, second]"
def test_using_allennlp_tokens():
    tokenizer = Tokenizer(TokenizerConfiguration(use_spacy_tokens=False))
    tokenized = tokenizer.tokenize_text("This is a text")

    assert len(tokenized) == 1
    assert len(tokenized[0]) == 4
    assert all(isinstance(t, AllennlpToken) for t in tokenized[0])