def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
    """Compare text, intent and response features when the vocabulary is shared.

    A single message is tokenized and featurized by a featurizer configured
    with ``use_shared_vocab``; the first row of each attribute's sparse
    feature matrix must equal the corresponding expected vector.
    """
    featurizer = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "use_shared_vocab": True,
    })
    tokenizer = WhitespaceTokenizer()

    message = Message(sentence)
    # intent and response are needed for a valid training example
    message.set(INTENT, intent)
    message.set(RESPONSE, response)

    training_data = TrainingData([message])
    for component in (tokenizer, featurizer):
        component.train(training_data)

    expectations = (
        (TEXT, text_features),
        (INTENT, intent_features),
        (RESPONSE, response_features),
    )
    for attribute, expected_row in expectations:
        first_row = message.get(SPARSE_FEATURE_NAMES[attribute]).toarray()[0]
        assert np.all(first_row == expected_row)
def test_count_vector_featurizer_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    """Check intent and response features produced from a single example.

    For each of the two attributes: when expected features are given, the
    first row of the sparse feature matrix must match them; otherwise no
    features may be set on the message at all.
    """
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    for attribute, expected in ((INTENT, intent_features),
                                (RESPONSE, response_features)):
        features = train_message.get(SPARSE_FEATURE_NAMES[attribute])
        if expected:
            # np.all: asserting on a bare element-wise comparison raises
            # "truth value is ambiguous" once more than one element is compared
            assert np.all(features.toarray()[0] == expected)
        else:
            assert features is None
def test_count_vector_featurizer_response_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    """Check intent and response features when a second example has a response.

    For each of the two attributes: when expected features are given, the
    first row of the sparse feature matrix of the first message must match
    them; otherwise no features may be set for that attribute.
    """
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])
    tk.train(data)
    ftr.train(data)

    for attribute, expected in ((INTENT, intent_features),
                                (RESPONSE, response_features)):
        features = train_message.get(SPARSE_FEATURE_NAMES[attribute])
        if expected:
            # np.all: asserting on a bare element-wise comparison raises
            # "truth value is ambiguous" once more than one element is compared
            assert np.all(features.toarray()[0] == expected)
        else:
            assert features is None
def test_count_vector_featurizer_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    """Check intent and response features with ``return_sequence`` enabled.

    For each of the two attributes: when expected features are given, the
    first row of the sparse feature matrix must match them; otherwise no
    features may be set on the message.
    """
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "return_sequence": True,
    })

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", intent)
    train_message.set("response", response)

    data = TrainingData([train_message])
    ftr.train(data)

    for feature_name, expected in (
            ("intent_sparse_features", intent_features),
            ("response_sparse_features", response_features)):
        features = train_message.get(feature_name)
        if expected:
            # np.all: asserting on a bare element-wise comparison raises
            # "truth value is ambiguous" once more than one element is compared
            assert np.all(features.toarray()[0] == expected)
        else:
            assert features is None
def test_count_vector_featurizer_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    """Check sequence and sentence features of the intent and the response.

    The intent is expected to carry sequence features only, the response is
    expected to carry both sequence and sentence features. When no features
    are expected for an attribute, both must be ``None``.
    """
    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(
        INTENT, [])
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, [])

    if intent_features:
        # np.all: asserting on a bare element-wise comparison raises
        # "truth value is ambiguous" once more than one element is compared
        assert np.all(intent_seq_vecs.toarray()[0] == intent_features)
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
    """Compare all attribute features for a shared vocabulary with sequences.

    The featurizer is configured with ``use_shared_vocab`` and
    ``return_sequence``; the first row of the text, intent and response
    feature matrices must equal the expected vectors.
    """
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    featurizer = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "use_shared_vocab": True,
        "return_sequence": True,
    })

    message = Message(sentence)
    # intent and response are needed for a valid training example
    message.set("intent", intent)
    message.set("response", response)

    featurizer.train(TrainingData([message]))

    expectations = (
        ("text_sparse_features", text_features),
        ("intent_sparse_features", intent_features),
        ("response_sparse_features", response_features),
    )
    for feature_name, expected_row in expectations:
        first_row = message.get(feature_name).toarray()[0]
        assert np.all(first_row == expected_row)
def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
    """Compare sequence features per attribute for a shared vocabulary.

    Besides the first row of the sequence features, the test checks that text
    and response have sentence features while the intent has none.
    """
    featurizer = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "use_shared_vocab": True,
    })
    tokenizer = WhitespaceTokenizer()

    message = Message(sentence)
    # intent and response are needed for a valid training example
    message.set(INTENT, intent)
    message.set(RESPONSE, response)

    training_data = TrainingData([message])
    for component in (tokenizer, featurizer):
        component.train(training_data)

    # (attribute, expected first sequence row, sentence features expected?)
    checks = (
        (TEXT, text_features, True),
        (INTENT, intent_features, False),
        (RESPONSE, response_features, True),
    )
    for attribute, expected_row, has_sentence_features in checks:
        sequence, sentence_level = message.get_sparse_features(attribute, [])
        assert np.all(sequence.toarray()[0] == expected_row)
        if has_sentence_features:
            assert sentence_level is not None
        else:
            assert sentence_level is None
def test_convert_training_examples(
    spacy_nlp: Any,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
):
    """Featurize one message and inspect the per-attribute output.

    The message is tokenized and featurized with a count-vectors and a spaCy
    featurizer, then passed to ``featurize_training_examples``. The test
    checks which attributes appear in the output, how many feature entries
    each attribute has, and the reported sparse feature sizes.
    """
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = SpacyTokenizer()
    count_vectors_featurizer = CountVectorsFeaturizer()
    spacy_featurizer = SpacyFeaturizer()

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    for component in (tokenizer, count_vectors_featurizer, spacy_featurizer):
        component.train(training_data)

    id_to_tag = {0: "O", 1: "name", 2: "location"}
    tag_to_id = {tag: tag_id for tag_id, tag in id_to_tag.items()}
    entity_tag_spec = [
        EntityTagSpec("entity", id_to_tag, tag_to_id, len(id_to_tag))
    ]

    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message],
        attributes=attributes,
        entity_tag_specs=entity_tag_spec,
    )

    assert len(output) == 1
    featurized = output[0]

    for attribute in attributes:
        assert attribute in featurized
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in featurized

    # the text holds sparse sentence, sparse sequence, dense sentence, and
    # dense sequence features
    assert len(featurized[TEXT]) == 4
    if INTENT in attributes:
        # the intent only has sparse sentence features
        assert len(featurized[INTENT]) == 1
    if ENTITIES in attributes:
        # one entry per entity tag spec
        assert len(featurized[ENTITIES]) == len(entity_tag_spec)

    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes
def test_count_vector_featurizer_action_attribute_featurization(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    """Check features created for the action name and the action text.

    The action name is expected to carry sequence features only, the action
    text is expected to carry both sequence and sentence features. When no
    features are expected for an attribute, both must be ``None``.
    """
    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "additional_vocabulary_size": {
            "text": 0,
            "response": 0,
            "action_text": 0
        },
    })
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(ACTION_NAME, action_name)
    train_message.set(ACTION_TEXT, action_text)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(ACTION_TEXT, "hi")
    second_message.set(ACTION_NAME, "greet")

    data = TrainingData([train_message, second_message])
    tk.train(data)
    ftr.train(data)

    action_name_seq_vecs, action_name_sen_vecs = train_message.get_sparse_features(
        ACTION_NAME, [])
    # unwrap the feature containers into their underlying matrices
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        ACTION_TEXT, [])
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if action_name_features:
        # np.all: asserting on a bare element-wise comparison raises
        # "truth value is ambiguous" once more than one element is compared
        assert np.all(action_name_seq_vecs.toarray()[0] == action_name_features)
        assert action_name_sen_vecs is None
    else:
        assert action_name_seq_vecs is None
        assert action_name_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def inner(config: Optional[Dict[Text, Any]] = None) -> CountVectorsFeaturizer:
    """Create a featurizer from the default config overridden by ``config``."""
    overrides = config or {}
    merged_config = dict(CountVectorsFeaturizer.get_default_config())
    # explicit values take precedence over the defaults
    merged_config.update(overrides)
    resource = Resource("count_vectors_featurizer")
    return CountVectorsFeaturizer.create(
        merged_config,
        default_model_storage,
        resource,
        default_execution_context,
    )
def inner(
    config: Optional[Dict[Text, Any]] = None, is_finetuning: bool = False
) -> CountVectorsFeaturizer:
    """Load a featurizer using the default config overridden by ``config``.

    The execution context handed to ``load`` is a copy of the default one
    with ``is_finetuning`` replaced by the given flag.
    """
    overrides = config or {}
    merged_config = dict(CountVectorsFeaturizer.get_default_config())
    # explicit values take precedence over the defaults
    merged_config.update(overrides)
    resource = Resource("count_vectors_featurizer")
    execution_context = dataclasses.replace(
        default_execution_context, is_finetuning=is_finetuning
    )
    return CountVectorsFeaturizer.load(
        merged_config,
        default_model_storage,
        resource,
        execution_context,
    )
def featurizer_sparse(tmpdir):
    """Build a ``CountVectorsFeaturizer`` with default config for tests.

    The featurizer is backed by a local model storage rooted at ``tmpdir``
    and a resource named ``sparse_feat``.
    """
    storage = LocalModelStorage(pathlib.Path(tmpdir))
    resource = Resource("sparse_feat")
    execution_context = ExecutionContext(storage, resource)
    default_config = CountVectorsFeaturizer.get_default_config()
    return CountVectorsFeaturizer(
        config=default_config,
        resource=resource,
        model_storage=storage,
        execution_context=execution_context,
    )
def test_count_vector_featurizer_response_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    """Check intent and response features when a second example has a response.

    The intent is expected to carry sequence features only, the response is
    expected to carry both sequence and sentence features. When no features
    are expected for an attribute, both must be ``None``.
    """
    ftr = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {
            "text": 0,
            "response": 0
        }})
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])
    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(
        INTENT, [])
    # unwrap the feature containers into their underlying matrices
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features

    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, [])
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        # np.all: asserting on a bare element-wise comparison raises
        # "truth value is ambiguous" once more than one element is compared
        assert np.all(intent_seq_vecs.toarray()[0] == intent_features)
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def test_component_changes_features_cvf():
    """Stanza tokenization must yield fewer count-vector features than whitespace."""
    text = "i am running and giving many greetings"

    whitespace_tokenizer = WhitespaceTokenizer()
    stanza_tokenizer = StanzaTokenizer(
        component_config={"lang": "en", "cache_dir": "tests/data/stanza"}
    )

    whitespace_features = fetch_sparse_features(
        text, whitespace_tokenizer, CountVectorsFeaturizer()
    )
    stanza_features = fetch_sparse_features(
        text, stanza_tokenizer, CountVectorsFeaturizer()
    )

    # Because the lemma is being used internally we expect less features
    whitespace_width = whitespace_features.shape[1]
    assert stanza_features.shape[1] < whitespace_width
def test_count_vectors_featurizer_train():
    """Train on one message and verify text, response and intent features.

    Text and response share the same sentence, so both must produce a
    ``(5, 5)`` sequence matrix and a ``(1, 5)`` sentence matrix with the same
    contents; the intent only has a ``(1, 1)`` sequence matrix.
    """
    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([message]))
    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected_first_row = np.array([0, 1, 0, 0, 0])
    expected_sentence_row = np.array([1, 1, 1, 1, 1])

    for attribute in (TEXT, RESPONSE):
        sequence, sentence_level = message.get_sparse_features(attribute, [])
        assert (5, 5) == sequence.shape
        assert (1, 5) == sentence_level.shape
        assert np.all(sequence.toarray()[0] == expected_first_row)
        assert np.all(sentence_level.toarray()[-1] == expected_sentence_row)

    sequence, sentence_level = message.get_sparse_features(INTENT, [])
    assert sentence_level is None
    assert (1, 1) == sequence.shape
    assert np.all(sequence.toarray()[0] == np.array([1]))
def test_additional_vocab_size_deprecation():
    """Configuring ``additional_vocabulary_size`` must emit a FutureWarning."""
    deprecated_config = {"additional_vocabulary_size": {TEXT: 5, RESPONSE: 10}}

    with pytest.warns(FutureWarning) as recorded:
        CountVectorsFeaturizer.create(deprecated_config, RasaNLUModelConfig())

    first_message = recorded[0].message.args[0]
    assert "The parameter has been deprecated" in first_message
def test_count_vectors_featurizer_train():
    """Train on one message and verify the stored sparse feature matrices.

    Text and response share the same sentence, so both must produce a
    ``(6, 5)`` matrix with the same first and last rows; the intent matrix
    has shape ``(1, 1)``.
    """
    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([message]))
    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected_first_row = np.array([0, 1, 0, 0, 0])
    expected_last_row = np.array([1, 1, 1, 1, 1])

    for attribute in (TEXT, RESPONSE):
        matrix = message.get(SPARSE_FEATURE_NAMES[attribute])
        assert (6, 5) == matrix.shape
        assert np.all(matrix.toarray()[0] == expected_first_row)
        assert np.all(matrix.toarray()[-1] == expected_last_row)

    intent_matrix = message.get(SPARSE_FEATURE_NAMES[INTENT])
    assert (1, 1) == intent_matrix.shape
    assert np.all(intent_matrix.toarray()[0] == np.array([1]))
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    """Check action-name features created by ``process`` on an unseen message.

    The featurizer is trained on two messages (one with an action name, one
    with an action text) and then applied to a test message. The action name
    of the test message must have the expected sequence features and no
    sentence features.
    """
    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "additional_vocabulary_size": {
            "text": 0,
            "response": 0,
            "action_text": 0
        },
    })
    tk = WhitespaceTokenizer()

    # add a second example that has some response, so that the vocabulary for
    # response exists
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])
    tk.train(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    for module in [tk, ftr]:
        module.process(test_message)

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, [])
    # unwrap the feature containers into their underlying matrices
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    # np.all: asserting on a bare element-wise comparison raises
    # "truth value is ambiguous" once more than one element is compared
    assert np.all(action_name_seq_vecs.toarray()[0] == action_name_features)
    assert action_name_sen_vecs is None
def test_component_changes_features_cvf():
    """Thai-tokenized text must produce a non-empty numpy feature array."""
    thai_text = "ก็จะรู้ความชั่วร้ายที่ทำไว้ และคงจะไม่ยอมให้ทำนาบนหลังคน "

    features = fetch_sparse_features(
        txt=thai_text,
        tokenizer=ThaiTokenizer(),
        featurizer=CountVectorsFeaturizer(),
    )

    # at least one feature column, delivered as a dense numpy array
    assert features.shape[1] > 0
    assert isinstance(features, np.ndarray)
def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
    """Compare sequence features per attribute for a shared vocabulary.

    Besides the first row of the sequence features, the test checks that text
    and response have sentence features while the intent has none.
    """
    featurizer = CountVectorsFeaturizer({
        "use_shared_vocab": True,
        "additional_vocabulary_size": {
            "text": 0,
            "response": 0
        },
    })
    tokenizer = WhitespaceTokenizer()

    message = Message(data={TEXT: sentence})
    # intent and response are needed for a valid training example
    message.set(INTENT, intent)
    message.set(RESPONSE, response)

    training_data = TrainingData([message])
    for component in (tokenizer, featurizer):
        component.train(training_data)

    # (attribute, expected first sequence row, sentence features expected?)
    checks = (
        (TEXT, text_features, True),
        (INTENT, intent_features, False),
        (RESPONSE, response_features, True),
    )
    for attribute, expected_row, has_sentence_features in checks:
        sequence, sentence_level = message.get_sparse_features(attribute, [])
        # unwrap the feature containers into their underlying matrices
        sequence = sequence.features if sequence else sequence
        sentence_level = (
            sentence_level.features if sentence_level else sentence_level
        )

        assert np.all(sequence.toarray()[0] == expected_row)
        if has_sentence_features:
            assert sentence_level is not None
        else:
            assert sentence_level is None
def test_cvf_independent_train_vocabulary_expand(
    additional_size: Optional[int],
    text: Text,
    real_vocabulary_size: int,
    total_vocabulary_size: int,
):
    """Check vocabulary sizes after training with additional vocabulary slots.

    For text, response and action text the vocabulary must have the expected
    total size and its first empty index must equal the real vocabulary size;
    intent and action name vocabularies must hold exactly one entry.
    """
    tokenizer = WhitespaceTokenizer()

    padded_attributes = [TEXT, RESPONSE, ACTION_TEXT]
    featurizer = CountVectorsFeaturizer(
        {
            "additional_vocabulary_size": {
                attribute: additional_size for attribute in padded_attributes
            }
        },
        finetune_mode=False,
    )

    message = Message(
        data={
            TEXT: text,
            INTENT: "intent_1",
            RESPONSE: text,
            ACTION_TEXT: text,
            ACTION_NAME: "action_1",
        })
    training_data = TrainingData([message])

    tokenizer.train(training_data)
    featurizer.train(training_data)

    for attribute in padded_attributes:
        vocabulary = featurizer.vectorizers[attribute].vocabulary_
        assert len(vocabulary) == total_vocabulary_size
        first_empty_index = featurizer._get_starting_empty_index(vocabulary)
        assert first_empty_index == real_vocabulary_size

    for attribute in [INTENT, ACTION_NAME]:
        vocabulary = featurizer.vectorizers[attribute].vocabulary_
        assert len(vocabulary) == 1
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Check that features are built from the message's tokens, not its text.

    Both the trained message and the separately processed message carry the
    same tokens and an empty text, so both must end up with the expected
    sequence features and with sentence features.
    """
    ftr = CountVectorsFeaturizer()

    # using empty string instead of real text string to make sure
    # count vector only can come from `tokens` feature.
    # using `message.text` can not get correct result
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT], tokens_feature)
    ftr.process(test_message)

    # verify the processed message as well, otherwise `process` is exercised
    # but its result is never checked
    for message in (train_message, test_message):
        seq_vec, sen_vec = message.get_sparse_features(TEXT, [])
        assert np.all(seq_vec.toarray()[0] == expected)
        assert sen_vec is not None
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Check that features of a processed message come from its tokens.

    The messages have an empty text, so the count vector can only be derived
    from the tokens set on them.
    """
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    featurizer = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "return_sequence": True,
    })

    # an empty text guarantees the features can only stem from the tokens;
    # relying on `message.text` would not give the correct result
    token_objects = [Token(token, 0) for token in tokens]

    training_message = Message("")
    training_message.set("tokens", token_objects)
    # an intent is needed for a valid training example
    training_message.set("intent", "bla")
    featurizer.train(TrainingData([training_message]))

    processed_message = Message("")
    processed_message.set("tokens", token_objects)
    featurizer.process(processed_message)

    first_row = processed_message.get("text_sparse_features").toarray()[0]
    assert np.all(first_row == expected)
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Check that features of a processed message come from its tokens.

    The messages have an empty text, so the count vector can only be derived
    from the tokens set on them.
    """
    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # an empty text guarantees the features can only stem from the tokens;
    # relying on `message.text` would not give the correct result
    token_objects = [Token(token, 0) for token in tokens]
    tokens_key = TOKENS_NAMES[TEXT_ATTRIBUTE]

    training_message = Message("")
    training_message.set(tokens_key, token_objects)
    featurizer.train(TrainingData([training_message]))

    processed_message = Message("")
    processed_message.set(tokens_key, token_objects)
    featurizer.process(processed_message)

    matrix = processed_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
    assert np.all(matrix.toarray()[0] == expected)
def test_use_shared_vocab_exception(
    initial_train_text: Text,
    additional_train_text: Text,
    use_shared_vocab: bool,
    tmp_path: Path,
):
    """Tests if an exception is raised when `use_shared_vocab` is set to True
    during incremental training."""
    tokenizer = WhitespaceTokenizer()
    first_featurizer = CountVectorsFeaturizer(
        component_config={"use_shared_vocab": use_shared_vocab}
    )

    first_message = Message(data={"text": initial_train_text})
    first_data = TrainingData([first_message])
    tokenizer.train(first_data)
    first_featurizer.train(first_data)

    # persist the trained featurizer and reload it for fine-tuning
    persisted_files = first_featurizer.persist("ftr", tmp_path)
    metadata = first_featurizer.component_config.copy()
    metadata.update(persisted_files)
    reloaded_featurizer = CountVectorsFeaturizer.load(
        metadata, tmp_path, should_finetune=True
    )

    second_message = Message(data={"text": additional_train_text})
    combined_data = TrainingData([first_message, second_message])
    tokenizer.train(combined_data)

    if not use_shared_vocab:
        reloaded_featurizer.train(combined_data)
        return

    with pytest.raises(Exception) as exec_info:
        reloaded_featurizer.train(combined_data)
    assert (
        "Using a shared vocabulary in `CountVectorsFeaturizer` is not supported"
        in str(exec_info.value)
    )
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
):
    """Check features of a processed message for a given ``use_lemma`` setting.

    Both messages are tokenized with the spaCy tokenizer; the featurizer is
    trained on one and applied to the other, whose sequence and sentence
    features must be sparse COO matrices with the expected rows.
    """
    featurizer = CountVectorsFeaturizer({"use_lemma": use_lemma})

    training_message = Message(data={TEXT: sentence})
    training_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    processed_message = Message(data={TEXT: sentence})
    processed_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    for message in (training_message, processed_message):
        SpacyTokenizer().process(message)

    featurizer.train(TrainingData([training_message]))
    featurizer.process(processed_message)

    sequence, sentence_level = processed_message.get_sparse_features(TEXT, [])

    assert isinstance(sequence.features, scipy.sparse.coo_matrix)
    assert isinstance(sentence_level.features, scipy.sparse.coo_matrix)

    dense_sequence = sequence.features.toarray()
    dense_sentence = sentence_level.features.toarray()

    assert np.all(dense_sequence[0] == sequence_features)
    assert np.all(dense_sentence[-1] == sentence_features)
def test_count_vector_featurizer(sentence, expected, expected_cls):
    """Check sequence and sentence features of a processed message.

    The featurizer is trained on one whitespace-tokenized message and applied
    to a second one with the same text; the resulting features must be sparse
    COO matrices with the expected first sequence row and last sentence row.
    """
    featurizer = CountVectorsFeaturizer()

    training_message = Message(data={TEXT: sentence})
    processed_message = Message(data={TEXT: sentence})
    for message in (training_message, processed_message):
        WhitespaceTokenizer().process(message)

    featurizer.train(TrainingData([training_message]))
    featurizer.process(processed_message)

    sequence, sentence_level = processed_message.get_sparse_features(TEXT, [])
    # unwrap the feature containers into their underlying matrices
    sequence = sequence.features if sequence else sequence
    sentence_level = sentence_level.features if sentence_level else sentence_level

    assert isinstance(sequence, scipy.sparse.coo_matrix)
    assert isinstance(sentence_level, scipy.sparse.coo_matrix)

    dense_sequence = sequence.toarray()
    dense_sentence = sentence_level.toarray()

    assert np.all(dense_sequence[0] == expected)
    assert np.all(dense_sentence[-1] == expected_cls)
def test_cvf_incremental_train_vocabulary_overflow(tmp_path: Path, ):
    """Fine-tuning with more new words than spare slots must warn the user.

    The featurizer is trained with three additional vocabulary slots,
    persisted, reloaded for fine-tuning and trained again on data that
    includes a second sentence; a ``UserWarning`` is expected.
    """
    spare_slots = 3
    first_text = "hello my name is John."
    second_text = "I am also new."
    storage_dir = str(tmp_path)

    tokenizer = WhitespaceTokenizer()
    first_featurizer = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {
            "text": spare_slots
        }},
        finetune_mode=False,
    )

    first_message = Message(data={"text": first_text})
    first_data = TrainingData([first_message])
    tokenizer.train(first_data)
    first_featurizer.train(first_data)

    persisted_files = first_featurizer.persist("ftr", storage_dir)

    # reload the persisted featurizer for fine-tuning
    metadata = first_featurizer.component_config.copy()
    metadata.update(persisted_files)
    reloaded_featurizer = CountVectorsFeaturizer.load(
        metadata, storage_dir, should_finetune=True
    )

    second_message = Message(data={"text": second_text})
    combined_data = TrainingData([first_message, second_message])
    tokenizer.train(combined_data)

    with pytest.warns(UserWarning) as recorded:
        reloaded_featurizer.train(combined_data)

    first_warning_text = recorded[0].message.args[0]
    assert "New data contains vocabulary of size" in first_warning_text
def test_count_vector_featurizer_oov_words(sentence, expected):
    """Check the training message's features when OOV words are configured.

    The featurizer maps the configured ``OOV_words`` onto the ``OOV_token``;
    the first row of the sequence features must equal ``expected`` and
    sentence features must be present.
    """
    featurizer = CountVectorsFeaturizer({
        "OOV_token": "__oov__",
        "OOV_words": ["oov_word0", "OOV_word1"],
        "additional_vocabulary_size": {
            "text": 0
        },
    })

    training_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(training_message)
    featurizer.train(TrainingData([training_message]))

    processed_message = Message(data={TEXT: sentence})
    featurizer.process(processed_message)

    # NOTE(review): the assertions inspect the training message; the processed
    # message is never tokenized and its features are not checked - confirm
    # whether that is intended.
    sequence, sentence_level = training_message.get_sparse_features(TEXT, [])
    # unwrap the feature containers into their underlying matrices
    sequence = sequence.features if sequence else sequence
    sentence_level = sentence_level.features if sentence_level else sentence_level

    assert np.all(sequence.toarray()[0] == expected)
    assert sentence_level is not None
def test_count_vector_featurizer_char(sentence, expected):
    """Check character n-gram features on the trained and processed message.

    The featurizer uses the ``char`` analyzer with 1- to 2-grams. Both
    messages hold the same tokenized sentence, so both must end up with the
    expected first sequence row and with sentence features.
    """
    ftr = CountVectorsFeaturizer({
        "min_ngram": 1,
        "max_ngram": 2,
        "analyzer": "char",
        "additional_vocabulary_size": {
            "text": 0
        },
    })

    train_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(train_message)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(test_message)
    ftr.process(test_message)

    # verify the processed message as well, otherwise `process` is exercised
    # but its result is never checked
    for message in (train_message, test_message):
        seq_vec, sen_vec = message.get_sparse_features(TEXT, [])
        # unwrap the feature containers into their underlying matrices
        if seq_vec:
            seq_vec = seq_vec.features
        if sen_vec:
            sen_vec = sen_vec.features

        assert np.all(seq_vec.toarray()[0] == expected)
        assert sen_vec is not None