def inner(
    config: Optional[Dict[Text, Any]] = None,
    known_patterns: Optional[List[Dict[Text, Any]]] = None,
) -> RegexFeaturizer:
    """Build a ``RegexFeaturizer`` from the default config plus overrides.

    Args:
        config: Optional overrides merged on top of
            ``RegexFeaturizer.get_default_config()``; keys given here win
            over the defaults. ``None`` (or any falsy value) means
            "no overrides".
        known_patterns: Optional list of pattern dicts, passed through
            unchanged as the last positional constructor argument.

    Returns:
        The newly constructed ``RegexFeaturizer``.
    """
    # NOTE(review): `default_model_storage`, `resource` and
    # `default_execution_context` are not defined in this function; they are
    # presumably supplied by the enclosing fixture's scope -- confirm.
    overrides = config or {}
    return RegexFeaturizer(
        {**RegexFeaturizer.get_default_config(), **overrides},
        default_model_storage,
        resource,
        default_execution_context,
        known_patterns,
    )
def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Train a featurizer, reload it for finetuning, then train it again.

    The reloaded component must report the same ``known_patterns`` as the
    one that was trained, must have ``finetune_mode`` set, and must end up
    with four known patterns after a second training run that only supplies
    one lookup table.
    """
    initial_patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    trained = create_featurizer()

    text = "hey how are you today 19.12.2019 ?"
    example = Message(data={TEXT: text})
    for attribute, value in ((RESPONSE, text), (INTENT, "intent")):
        example.set(attribute, value)

    initial_data = TrainingData([example], regex_features=initial_patterns)
    whitespace_tokenizer.process_training_data(initial_data)
    trained.train(initial_data)

    # Load from the same storage/resource, but with an execution context
    # that has the finetuning flag switched on.
    default_config = RegexFeaturizer.get_default_config()
    finetuning_context = dataclasses.replace(
        default_execution_context, is_finetuning=True
    )
    reloaded = RegexFeaturizer.load(
        default_config,
        default_model_storage,
        resource,
        finetuning_context,
    )

    # The reloaded component carries the patterns of the trained one and
    # knows that it is being finetuned.
    assert reloaded.known_patterns == trained.known_patterns
    assert reloaded.finetune_mode

    # Second training run: no messages, just one lookup table.
    finetuning_data = TrainingData()
    finetuning_data.lookup_tables = [
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}
    ]
    reloaded.train(finetuning_data)

    # Three original patterns plus the one coming from the lookup table.
    assert len(reloaded.known_patterns) == 4
def test_vocabulary_expand_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    resource: Resource,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Check featurization before and after finetuning with extra patterns.

    A featurizer is trained on two patterns and a message is featurized.
    The component is then reloaded in finetuning mode, trained on the old
    patterns plus one new and one modified pattern, and a second message is
    featurized. Finally the test checks that the previously known patterns
    keep their positions and that the modified pattern replaced the old one.
    """

    def sparse_text_features(msg):
        # Fetch the sequence/sentence sparse features for TEXT and unwrap
        # whichever of the two is truthy down to its `.features` value.
        sequence, sentence = msg.get_sparse_features(TEXT, [])
        if sequence:
            sequence = sequence.features
        if sentence:
            sentence = sentence.features
        return sequence, sentence

    base_patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
    ]

    trained = create_featurizer()

    first_text = "hey hey 2020"
    first_message = Message(data={TEXT: first_text})
    for attribute, value in ((RESPONSE, first_text), (INTENT, "intent")):
        first_message.set(attribute, value)

    base_data = TrainingData([first_message], regex_features=base_patterns)
    whitespace_tokenizer.process_training_data(base_data)
    trained.train(base_data)
    trained.process_training_data(base_data)

    # Featurization of the first message: three tokens, two patterns.
    first_token_expected = np.array([1, 0])
    sentence_expected = np.array([1, 1])

    sequence_features, sentence_features = sparse_text_features(first_message)

    assert (3, 2) == sequence_features.shape
    assert (1, 2) == sentence_features.shape
    assert np.all(sequence_features.toarray()[0] == first_token_expected)
    assert np.all(sentence_features.toarray()[-1] == sentence_expected)

    default_config = RegexFeaturizer.get_default_config()
    finetuning_context = dataclasses.replace(
        default_execution_context, is_finetuning=True
    )
    reloaded = RegexFeaturizer.load(
        default_config,
        default_model_storage,
        resource,
        finetuning_context,
    )

    extra_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhey+", "name": "hello", "usage": "intent"},
    ]

    second_text = "hey today"
    second_message = Message(data={TEXT: second_text})
    for attribute, value in ((RESPONSE, second_text), (INTENT, "intent")):
        second_message.set(attribute, value)

    finetuning_data = TrainingData(
        [second_message], regex_features=base_patterns + extra_patterns
    )
    whitespace_tokenizer.process_training_data(finetuning_data)
    reloaded.train(finetuning_data)
    reloaded.process_training_data(finetuning_data)

    # Featurization of the second message, this time with the extra
    # pattern as a third feature column.
    first_token_expected = np.array([1, 0, 0])
    second_token_expected = np.array([0, 0, 1])
    sentence_expected = np.array([1, 0, 1])

    sequence_features, sentence_features = sparse_text_features(second_message)

    assert (2, 3) == sequence_features.shape
    assert (1, 3) == sentence_features.shape
    dense_sequence = sequence_features.toarray()
    assert np.all(dense_sequence[0] == first_token_expected)
    assert np.all(dense_sequence[1] == second_token_expected)
    assert np.all(sentence_features.toarray()[-1] == sentence_expected)

    # Patterns known before finetuning must keep their original positions.
    for position, old_pattern in enumerate(trained.known_patterns):
        assert old_pattern["name"] == reloaded.known_patterns[position]["name"]

    # The "hello" pattern was redefined; only the new definition may remain.
    hello_patterns = []
    for candidate in reloaded.known_patterns:
        if candidate["name"] == "hello":
            hello_patterns.append(candidate)
    assert hello_patterns == [extra_patterns[1]]