def test_convert_featurizer_number_of_sub_tokens(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]], ConveRTFeaturizer],
    text: Text,
    expected_number_of_sub_tokens: List[int],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Checks that ConveRT tokenization records the sub-token count per token."""
    # Skip the model-URL validation, which would require network access.
    monkeypatch.setattr(ConveRTFeaturizer, "_validate_model_url", lambda _: None)
    config = {FEATURIZER_CLASS_ALIAS: "alias", "model_url": RESTRICTED_ACCESS_URL}
    featurizer = create_or_load_convert_featurizer(config)

    msg = Message.build(text=text)
    whitespace_tokenizer.process_training_data(TrainingData([msg]))

    convert_tokens = featurizer.tokenize(msg, attribute=TEXT)
    sub_token_counts = [token.get(NUMBER_OF_SUB_TOKENS) for token in convert_tokens]
    assert sub_token_counts == expected_number_of_sub_tokens
def test_convert_featurizer_token_edge_cases(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]], ConveRTFeaturizer],
    text: Text,
    expected_tokens: List[Text],
    expected_indices: List[Tuple[int]],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Verifies ConveRT token texts and start/end offsets on tricky inputs."""
    # Avoid the network round-trip that validates the model URL.
    monkeypatch.setattr(ConveRTFeaturizer, "_validate_model_url", lambda _: None)
    config = {FEATURIZER_CLASS_ALIAS: "alias", "model_url": RESTRICTED_ACCESS_URL}
    featurizer = create_or_load_convert_featurizer(config)

    msg = Message.build(text=text)
    whitespace_tokenizer.process_training_data(TrainingData([msg]))
    tokens = featurizer.tokenize(msg, attribute=TEXT)

    assert [token.text for token in tokens] == expected_tokens
    assert [token.start for token in tokens] == [pair[0] for pair in expected_indices]
    assert [token.end for token in tokens] == [pair[1] for pair in expected_indices]
def test_convert_featurizer_train(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]], ConveRTFeaturizer],
    monkeypatch: MonkeyPatch,
    load: bool,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Featurizes a message and checks dense features for TEXT and RESPONSE.

    The featurizer is either freshly created or loaded from storage depending
    on the parametrized `load` flag.
    """
    # Skip the model-URL validation, which would require network access.
    monkeypatch.setattr(ConveRTFeaturizer, "_validate_model_url", lambda _: None)
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    # BUG FIX: `load` was previously hard-coded to `load=True`, so the
    # parametrized "create fresh" variant of this test was never exercised.
    featurizer = create_or_load_convert_featurizer(component_config, load=load)

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.process_training_data(TrainingData([message]))

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    # TEXT and RESPONSE hold the same sentence and tokens, so both attributes
    # must end up with identical dense features.
    for attribute in (TEXT, RESPONSE):
        seq_vecs, sent_vecs = message.get_dense_features(attribute, [])
        seq_vecs = seq_vecs.features
        sent_vecs = sent_vecs.features
        assert len(tokens) == len(seq_vecs)
        assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
        assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    # INTENT was never featurized, so no dense features may exist for it.
    seq_vecs, sent_vecs = message.get_dense_features(INTENT, [])
    assert seq_vecs is None
    assert sent_vecs is None
async def test_train_persist_load_with_composite_entities(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractor],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """A reloaded CRF extractor must process messages like the trained one."""
    importer = RasaFileImporter(
        training_data_paths=["data/test/demo-rasa-composite-entities.yml"]
    )
    training_data = importer.get_nlu_data()
    whitespace_tokenizer.process_training_data(training_data)

    trained_extractor = crf_entity_extractor({})
    trained_extractor.train(training_data)

    original_message = Message(data={TEXT: "I am looking for an italian restaurant"})
    whitespace_tokenizer.process([original_message])
    duplicate_message = copy.deepcopy(original_message)

    processed_by_trained = trained_extractor.process([original_message])[0]

    restored_extractor = CRFEntityExtractor.load(
        CRFEntityExtractor.get_default_config(),
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )
    processed_by_restored = restored_extractor.process([duplicate_message])[0]

    # Same fingerprint means the extracted entities (and all other message
    # content) are identical between the trained and the reloaded extractor.
    assert processed_by_restored.fingerprint() == processed_by_trained.fingerprint()
    assert list(restored_extractor.entity_taggers.keys()) == list(
        trained_extractor.entity_taggers.keys()
    )
def test_convert_featurizer_tokens_to_text(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]], ConveRTFeaturizer],
    sentence: Text,
    expected_text: Text,
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Round-trips a tokenized sentence back to text via `_tokens_to_text`."""
    # Avoid the network round-trip that validates the model URL.
    monkeypatch.setattr(ConveRTFeaturizer, "_validate_model_url", lambda _: None)
    config = {FEATURIZER_CLASS_ALIAS: "alias", "model_url": RESTRICTED_ACCESS_URL}
    featurizer = create_or_load_convert_featurizer(config)

    msg = Message.build(text=sentence)
    whitespace_tokenizer.process_training_data(TrainingData([msg]))
    tokens = featurizer.tokenize(msg, attribute=TEXT)

    reconstructed = ConveRTFeaturizer._tokens_to_text([tokens])[0]
    assert expected_text == reconstructed
def test_use_shared_vocab_exception(
    initial_train_text: Text,
    additional_train_text: Text,
    use_shared_vocab: bool,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    load_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Tests if an exception is raised when `use_shared_vocab` is set to True
    during incremental training."""
    config = {"use_shared_vocab": use_shared_vocab}

    first_featurizer = create_featurizer(config)
    base_message = Message(data={"text": initial_train_text})
    base_data = TrainingData([base_message])
    whitespace_tokenizer.process_training_data(base_data)
    first_featurizer.train(base_data)

    # Reload the featurizer in finetuning mode and extend the training data.
    finetune_featurizer = load_featurizer(config, is_finetuning=True)
    extra_message = Message(data={"text": additional_train_text})
    combined_data = TrainingData([base_message, extra_message])
    whitespace_tokenizer.process_training_data(combined_data)

    if use_shared_vocab:
        with pytest.raises(Exception) as exec_info:
            finetune_featurizer.train(combined_data)
        assert (
            "Using a shared vocabulary in `CountVectorsFeaturizer` is not supported"
            in str(exec_info.value)
        )
    else:
        finetune_featurizer.train(combined_data)
def test_count_vector_featurizer_action_attribute_featurization(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Checks sparse featurization of ACTION_NAME and ACTION_TEXT attributes."""
    ftr = create_featurizer({"token_pattern": r"(?u)\b\w+\b"})
    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(ACTION_NAME, action_name)
    train_message.set(ACTION_TEXT, action_text)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(ACTION_TEXT, "hi")
    second_message.set(ACTION_NAME, "greet")

    data = TrainingData([train_message, second_message])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    action_name_seq_vecs, action_name_sen_vecs = train_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        ACTION_TEXT, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if action_name_features:
        # BUG FIX: `ndarray == expected` is an element-wise comparison and
        # yields an array; asserting on a multi-element array raises
        # ValueError. Reduce with `np.all`, as the sibling tests do.
        assert np.all(action_name_seq_vecs.toarray()[0] == action_name_features)
        assert action_name_sen_vecs is None
    else:
        assert action_name_seq_vecs is None
        assert action_name_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def test_regex_featurizer_train(
    create_featurizer: Callable[..., RegexFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Trains the regex featurizer and checks pattern features per attribute."""
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    whitespace_tokenizer.process_training_data(TrainingData([message]))

    training_data = TrainingData([message], regex_features=patterns)
    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    def sparse_features(attribute):
        # Unwrap the Features objects (or pass through None).
        seq, sen = message.get_sparse_features(attribute, [])
        return (seq.features if seq else seq, sen.features if sen else sen)

    # TEXT and RESPONSE carry the same sentence, so their features match.
    for attribute in (TEXT, RESPONSE):
        seq_vecs, sen_vec = sparse_features(attribute)
        assert (6, 3) == seq_vecs.shape
        assert (1, 3) == sen_vec.shape
        assert np.all(seq_vecs.toarray()[0] == expected)
        assert np.all(sen_vec.toarray()[-1] == expected_cls)

    # INTENT is not featurized by the regex featurizer.
    seq_vecs, sen_vec = sparse_features(INTENT)
    assert seq_vecs is None
    assert sen_vec is None
def test_count_vector_featurizer_response_attribute_featurization(
    sentence: Text,
    intent: Text,
    response: Optional[Text],
    intent_features: List[List[int]],
    response_features: Optional[List[List[int]]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Checks sparse featurization of the INTENT and RESPONSE attributes."""
    ftr = create_featurizer()
    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features

    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        # BUG FIX: `ndarray == list` compares element-wise and yields an
        # array; asserting on a multi-element array raises ValueError.
        # Reduce with `np.all` so the assertion is well-defined.
        assert np.all(intent_seq_vecs.toarray()[0] == intent_features)
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def test_training_data_fingerprint_incorporates_tokens(
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Tokenizing training data must change the training-data fingerprint."""
    from rasa.shared.importers.utils import training_data_from_paths

    training_files = [
        "data/examples/rasa/demo-rasa.yml",
        "data/examples/rasa/demo-rasa-responses.yml",
    ]
    training_data = training_data_from_paths(training_files, language="en")
    fingerprint_before = training_data.fingerprint()

    whitespace_tokenizer.process_training_data(training_data)

    # training data fingerprint has changed
    assert fingerprint_before != training_data.fingerprint()
def test_check_correct_entity_annotations(
    text: Text, warnings: int, whitespace_tokenizer: WhitespaceTokenizer
):
    """Misaligned entity annotations must emit the expected UserWarnings."""
    reader = RasaYAMLReader()
    training_data = reader.reads(text)
    whitespace_tokenizer.process_training_data(training_data)

    with pytest.warns(UserWarning) as record:
        EntityExtractorMixin.check_correct_entity_annotations(training_data)

    assert len(record) == warnings
    # BUG FIX: the previous assertion wrapped the membership check in a
    # one-element list (`all([x in y] for ...)`), so `all` iterated over
    # always-truthy lists and could never fail. Check membership directly.
    assert all(
        excerpt in record[0].message.args[0]
        for excerpt in ["Misaligned entity annotation in sentence"]
    )
def test_count_vectors_featurizer_train(
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """End-to-end train/process check for the count-vectors featurizer."""
    featurizer = create_featurizer()

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    whitespace_tokenizer.process_training_data(TrainingData([message]))

    data = TrainingData([message])
    featurizer.train(data)
    featurizer.process_training_data(data)

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    def sparse_features(attribute):
        # Unwrap the Features objects (or pass through None).
        seq, sen = message.get_sparse_features(attribute, [])
        return (seq.features if seq else seq, sen.features if sen else sen)

    # TEXT and RESPONSE carry the same sentence, hence identical features.
    for attribute in (TEXT, RESPONSE):
        seq_vec, sen_vec = sparse_features(attribute)
        assert (5, 5) == seq_vec.shape
        assert (1, 5) == sen_vec.shape
        assert np.all(seq_vec.toarray()[0] == expected)
        assert np.all(sen_vec.toarray()[-1] == expected_cls)

    # The intent attribute only gets a single sequence feature.
    seq_vec, sen_vec = sparse_features(INTENT)
    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
def test_model_data_signature_with_entities(
    messages: List[Message],
    entity_expected: bool,
    create_diet: Callable[..., DIETClassifier],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """The model-data signature has an "entities" key iff entities exist."""
    classifier = create_diet({"BILOU_flag": False})
    training_data = TrainingData(messages)

    # create tokens for entity parsing inside DIET
    whitespace_tokenizer.process_training_data(training_data)

    model_data = classifier.preprocess_train_data(training_data)
    has_entities = "entities" in model_data.get_signature().keys()
    assert has_entities == entity_expected
def test_count_vector_featurizer_shared_vocab(
    sentence: Text,
    intent: Text,
    response: Text,
    text_features: List[List[int]],
    intent_features: List[List[int]],
    response_features: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """With a shared vocabulary, all attributes are featurized consistently."""
    ftr = create_featurizer({"use_shared_vocab": True})

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    def sparse_features(attribute):
        # Unwrap the Features objects (or pass through None).
        seq, sen = train_message.get_sparse_features(attribute, [])
        return (seq.features if seq else seq, sen.features if sen else sen)

    # Only the intent attribute lacks sentence-level features.
    checks = [
        (TEXT, text_features, True),
        (INTENT, intent_features, False),
        (RESPONSE, response_features, True),
    ]
    for attribute, expected, has_sentence_features in checks:
        seq_vec, sen_vec = sparse_features(attribute)
        assert np.all(seq_vec.toarray()[0] == expected)
        if has_sentence_features:
            assert sen_vec is not None
        else:
            assert sen_vec is None
def process_training_text(
    texts: List[Text],
    model_name: Text,
    model_weights: Text,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizer
    ],
    whitespace_tokenizer: WhitespaceTokenizer,
) -> List[Message]:
    """Creates a featurizer and process training data"""
    config = create_pretrained_transformers_config(model_name, model_weights)
    featurizer = create_language_model_featurizer(config)

    training_messages = [Message.build(text=text) for text in texts]
    training_data = TrainingData(training_messages)

    # Tokenize first, then featurize in place.
    whitespace_tokenizer.process_training_data(training_data)
    featurizer.process_training_data(training_data)
    return training_messages
def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """A regex featurizer reloaded for finetuning keeps and extends patterns."""
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    trained_featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    msg = Message(data={TEXT: sentence})
    msg.set(RESPONSE, sentence)
    msg.set(INTENT, "intent")

    initial_data = TrainingData([msg], regex_features=patterns)
    whitespace_tokenizer.process_training_data(initial_data)
    trained_featurizer.train(initial_data)

    reloaded_featurizer = RegexFeaturizer.load(
        RegexFeaturizer.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert reloaded_featurizer.known_patterns == trained_featurizer.known_patterns
    assert reloaded_featurizer.finetune_mode

    new_lookups = [
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}
    ]
    followup_data = TrainingData()
    followup_data.lookup_tables = new_lookups
    reloaded_featurizer.train(followup_data)

    # Test merging of a new pattern to an already trained component.
    assert len(reloaded_featurizer.known_patterns) == 4
def test_cvf_incremental_training(
    initial_train_text: Text,
    additional_train_text: Text,
    initial_vocabulary_size: int,
    final_vocabulary_size: int,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    load_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Finetuning extends the vocabulary without moving existing indices."""
    base_featurizer = create_featurizer()
    base_message = Message(data={"text": initial_train_text})
    base_data = TrainingData([base_message])
    whitespace_tokenizer.process_training_data(base_data)
    base_featurizer.train(base_data)

    # Check initial vocabulary size
    base_vocabulary = base_featurizer.vectorizers["text"].vocabulary_
    assert len(base_vocabulary) == initial_vocabulary_size

    # persist and load initial cvf
    finetuned_featurizer = load_featurizer(is_finetuning=True)

    # Check vocabulary size again
    assert (
        len(finetuned_featurizer.vectorizers["text"].vocabulary_)
        == initial_vocabulary_size
    )

    extra_message = Message(data={"text": additional_train_text})
    extended_data = TrainingData([base_message, extra_message])
    whitespace_tokenizer.process_training_data(extended_data)
    finetuned_featurizer.train(extended_data)
    extended_vocabulary = finetuned_featurizer.vectorizers["text"].vocabulary_

    # Check vocabulary size after finetuning
    assert len(extended_vocabulary) == final_vocabulary_size

    # Check indices of initial vocabulary haven't changed in the new vocabulary
    for token, index in base_vocabulary.items():
        assert token in extended_vocabulary
        assert extended_vocabulary.get(token) == index
def test_log_longer_sequence(
    sequence_length: int,
    model_name: Text,
    model_weights: Text,
    should_overflow: bool,
    caplog: LogCaptureFixture,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizer
    ],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Sequences exceeding the model limit should surface in debug logs."""
    featurizer = create_language_model_featurizer(
        {"model_name": model_name, "model_weights": model_weights}
    )

    long_text = " ".join(["hi"] * sequence_length)
    message = Message.build(text=long_text)
    whitespace_tokenizer.process_training_data(TrainingData([message]))

    caplog.set_level(logging.DEBUG)
    featurizer.process([message])

    if should_overflow:
        # The overflowing input text is echoed in the debug log.
        assert "hi hi hi" in caplog.text
    assert len(message.features) >= 2
def test_vocabulary_expand_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    resource: Resource,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Finetuning with extra regex patterns expands the feature vocabulary."""
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
    ]
    featurizer = create_featurizer()

    sentence = "hey hey 2020"
    msg = Message(data={TEXT: sentence})
    msg.set(RESPONSE, sentence)
    msg.set(INTENT, "intent")
    initial_data = TrainingData([msg], regex_features=patterns)
    whitespace_tokenizer.process_training_data(initial_data)
    featurizer.train(initial_data)
    featurizer.process_training_data(initial_data)

    def sparse_features(message):
        # Unwrap the Features objects (or pass through None).
        seq, sen = message.get_sparse_features(TEXT, [])
        return (seq.features if seq else seq, sen.features if sen else sen)

    # Test featurization of message
    seq_vecs, sen_vec = sparse_features(msg)
    assert (3, 2) == seq_vecs.shape
    assert (1, 2) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == np.array([1, 0]))
    assert np.all(sen_vec.toarray()[-1] == np.array([1, 1]))

    reloaded_featurizer = RegexFeaturizer.load(
        RegexFeaturizer.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )
    new_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhey+", "name": "hello", "usage": "intent"},
    ]

    new_sentence = "hey today"
    msg = Message(data={TEXT: new_sentence})
    msg.set(RESPONSE, new_sentence)
    msg.set(INTENT, "intent")
    expanded_data = TrainingData([msg], regex_features=patterns + new_patterns)
    whitespace_tokenizer.process_training_data(expanded_data)
    reloaded_featurizer.train(expanded_data)
    reloaded_featurizer.process_training_data(expanded_data)

    # Test featurization of message, this time for the extra pattern as well.
    seq_vecs, sen_vec = sparse_features(msg)
    assert (2, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == np.array([1, 0, 0]))
    assert np.all(seq_vecs.toarray()[1] == np.array([0, 0, 1]))
    assert np.all(sen_vec.toarray()[-1] == np.array([1, 0, 1]))

    # let's check if the order of patterns is preserved
    for old_index, pattern in enumerate(featurizer.known_patterns):
        assert pattern["name"] == reloaded_featurizer.known_patterns[old_index]["name"]

    # we also modified a pattern, check if that is correctly modified
    pattern_to_check = [
        pattern
        for pattern in reloaded_featurizer.known_patterns
        if pattern["name"] == "hello"
    ]
    assert pattern_to_check == [new_patterns[1]]
def test_apply_bilou_schema(whitespace_tokenizer: WhitespaceTokenizer):
    """BILOU entity tags are attached to messages after applying the schema."""

    def entity(start, end, value, entity_type):
        # Shorthand for an entity annotation dict.
        return {"start": start, "end": end, "value": value, "entity": entity_type}

    message_1 = Message.build(
        text="Germany is part of the European Union", intent="inform"
    )
    message_1.set(
        ENTITIES,
        [
            entity(0, 7, "Germany", "location"),
            entity(23, 37, "European Union", "organisation"),
        ],
    )

    message_2 = Message.build(text="Berlin is the capital of Germany", intent="inform")
    message_2.set(
        ENTITIES,
        [
            entity(0, 6, "Berlin", "location"),
            entity(25, 32, "Germany", "location"),
        ],
    )

    training_data = TrainingData([message_1, message_2])
    whitespace_tokenizer.process_training_data(training_data)

    bilou_utils.apply_bilou_schema(training_data)

    # Multi-token entity -> B-/L- tags; single-token entity -> U- tag.
    assert message_1.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "B-organisation",
        "L-organisation",
    ]
    assert message_2.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "U-location",
    ]