def test_cvf_independent_train_vocabulary_expand(
    additional_size: Optional[int],
    text: Text,
    real_vocabulary_size: int,
    total_vocabulary_size: int,
):
    """Check per-attribute vocabulary expansion of ``CountVectorsFeaturizer``.

    Trains the featurizer with independent (non-shared) vocabularies and an
    ``additional_vocabulary_size`` buffer for the dense-text attributes, then
    verifies each attribute's total vocabulary size and the index where the
    empty (buffer) slots begin.
    """
    tokenizer = WhitespaceTokenizer()
    config = {
        "additional_vocabulary_size": {
            TEXT: additional_size,
            RESPONSE: additional_size,
            ACTION_TEXT: additional_size,
        }
    }
    featurizer = CountVectorsFeaturizer(config, finetune_mode=False)

    message = Message(
        data={
            TEXT: text,
            INTENT: "intent_1",
            RESPONSE: text,
            ACTION_TEXT: text,
            ACTION_NAME: "action_1",
        }
    )
    training_data = TrainingData([message])
    tokenizer.train(training_data)
    featurizer.train(training_data)

    # Text-like attributes should carry the real vocabulary plus the
    # configured buffer of empty slots.
    for dense_attribute in (TEXT, RESPONSE, ACTION_TEXT):
        vocabulary = featurizer.vectorizers[dense_attribute].vocabulary_
        assert len(vocabulary) == total_vocabulary_size
        assert (
            featurizer._get_starting_empty_index(vocabulary)
            == real_vocabulary_size
        )

    # Label-like attributes saw exactly one token each, and no buffer
    # was configured for them.
    for label_attribute in (INTENT, ACTION_NAME):
        vocabulary = featurizer.vectorizers[label_attribute].vocabulary_
        assert len(vocabulary) == 1
def test_cvf_shared_train_vocabulary_expand(
    additional_size: Optional[int],
    text: Text,
    real_vocabulary_size: int,
    total_vocabulary_size: int,
):
    """Check shared-vocabulary expansion of ``CountVectorsFeaturizer``.

    Trains the featurizer with ``use_shared_vocab`` enabled and an
    ``additional_vocabulary_size`` buffer, then verifies the total size of
    the single shared vocabulary and the index where the empty (buffer)
    slots begin.
    """
    tokenizer = WhitespaceTokenizer()
    featurizer = CountVectorsFeaturizer(
        {
            # Use the shared attribute constants (as the independent-vocab
            # sibling test does) instead of raw string literals, so the
            # config keys stay in sync with the featurizer's attribute names.
            "additional_vocabulary_size": {
                TEXT: additional_size,
                RESPONSE: additional_size,
                ACTION_TEXT: additional_size,
            },
            "use_shared_vocab": True,
        },
        finetune_mode=False,
    )
    train_message = Message(
        data={
            TEXT: text,
            INTENT: "intent_1",
            RESPONSE: text,
            ACTION_TEXT: text,
            ACTION_NAME: "action_1",
        }
    )
    data = TrainingData([train_message])
    tokenizer.train(data)
    featurizer.train(data)

    # With a shared vocabulary, all attributes point at the same
    # vectorizer; checking the TEXT entry covers them all.
    shared_vocabulary = featurizer.vectorizers[TEXT].vocabulary_
    assert len(shared_vocabulary) == total_vocabulary_size
    assert (
        featurizer._get_starting_empty_index(shared_vocabulary)
        == real_vocabulary_size
    )