def fetch_info_from_message(interpreter, text_input):
    msg = Message({TEXT: text_input})
    blob = interpreter.interpreter.parse(text_input)
    nlu_dict = interpreter.featurize_message(msg).as_dict_nlu()
    tokens = [t.text for t in nlu_dict["text_tokens"]]
    return blob, nlu_dict, tokens
def test_container_keys():
    message_data_list = [{INTENT: "1"}, {INTENT: "2"}, {TEXT: "3", "other": 3}]
    container = MessageContainerForCoreFeaturization()
    container.add_all([Message(data=data) for data in message_data_list])
    assert set(container.keys(INTENT)) == {"1", "2"}
    assert set(container.keys(TEXT)) == {"3"}
def test_container_fingerprints_differ_for_different_containers():
    container1 = MessageContainerForCoreFeaturization()
    container1.add(Message(data={INTENT: "1"}))
    container2 = MessageContainerForCoreFeaturization()
    container2.add(Message(data={INTENT: "2"}))
    assert container2.fingerprint() != container1.fingerprint()
def test_count_vector_featurizer_action_attribute_featurization(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(ACTION_NAME, action_name)
    train_message.set(ACTION_TEXT, action_text)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(ACTION_TEXT, "hi")
    second_message.set(ACTION_NAME, "greet")

    data = TrainingData([train_message, second_message])
    tk.train(data)
    ftr.train(data)

    action_name_seq_vecs, action_name_sen_vecs = train_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        ACTION_TEXT, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if action_name_features:
        assert action_name_seq_vecs.toarray()[0] == action_name_features
        assert action_name_sen_vecs is None
    else:
        assert action_name_seq_vecs is None
        assert action_name_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])
    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features

    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
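# The "unwrap" pattern above (take `.features` only if `get_sparse_features`
# returned something truthy) repeats throughout these tests. A hypothetical
# helper that captures it, shown only for illustration:
def _unwrap_features(maybe_features):
    # `get_sparse_features` returns `Features` wrappers or None; the asserts
    # only need the underlying matrix.
    return maybe_features.features if maybe_features else None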
def test_regex_featurizer_train():
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    # the first token "hey" matches only the "hello" pattern
    expected = np.array([0, 1, 0])
    # at sentence level every pattern matched somewhere: "hey" for "hello",
    # the date token for the two digit patterns
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None
def unpack_regex_message(
    message: Message,
    domain: Optional[Domain] = None,
    entity_extractor_name: Optional[Text] = None,
) -> Message:
    """Unpacks the message if `TEXT` contains an encoding of attributes.

    Args:
        message: some message
        domain: the domain
        entity_extractor_name: An extractor name which should be added for the
            entities.

    Returns:
        The given message if that message does not need to be unpacked, and a new
        message with the extracted attributes otherwise.
    """
    user_text = message.get(TEXT).strip()

    # If the prefix doesn't match, we don't even need to try to match the pattern.
    if not user_text.startswith(INTENT_MESSAGE_PREFIX):
        return message

    # Try to match the pattern.
    match = YAMLStoryReader._regex_message_pattern().match(user_text)

    # If it doesn't match, then (potentially) something went wrong, because the
    # message text did start with the special prefix -- however, a user might
    # just have decided to start their text this way.
    if not match:
        logger.warning(f"Failed to parse intent and entities from '{user_text}'.")
        return message

    # Extract attributes from the match - and validate them via the domain.
    intent_name = YAMLStoryReader._intent_name_from_regex_match(match, domain)
    confidence = YAMLStoryReader._confidences_from_regex_match(match)
    entities = YAMLStoryReader._entities_from_regex_match(
        match, domain, entity_extractor_name
    )

    # The intent name is *not* optional, but during parsing we might find out
    # that the given intent is unknown (and warn). In this case, stop here.
    if intent_name is None:
        return message

    if match.group("rest"):
        rasa.shared.utils.io.raise_warning(
            f"Failed to parse arguments in line '{match.string}'. "
            f"Failed to interpret some parts. "
            f"Continuing without {match.group('rest')}. ",
            docs=DOCS_URL_STORIES,
        )

    # Add the results to the message.
    intent_data = {
        INTENT_NAME_KEY: intent_name,
        PREDICTED_CONFIDENCE_KEY: confidence,
    }
    intent_ranking = [
        {INTENT_NAME_KEY: intent_name, PREDICTED_CONFIDENCE_KEY: confidence}
    ]
    message_data = {
        TEXT: user_text,
        INTENT: intent_data,
        INTENT_RANKING_KEY: intent_ranking,
        ENTITIES: entities,
    }
    return Message(message_data, output_properties=set(message_data.keys()))
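# Hedged usage sketch for `unpack_regex_message`, assuming the surrounding
# module's imports. The exact shorthand syntax (confidence suffix, entity JSON)
# is defined by `YAMLStoryReader._regex_message_pattern` and is only
# paraphrased here, not restated authoritatively.
def _example_unpack_regex_message() -> None:
    # Text without INTENT_MESSAGE_PREFIX is returned unchanged (same object).
    plain = Message({TEXT: "hello there"})
    assert unpack_regex_message(plain) is plain

    # A shorthand message such as '/greet{"name": "Ada"}' should instead come
    # back as a *new* Message with INTENT, INTENT_RANKING_KEY and ENTITIES
    # filled in, provided "greet" is an intent of the supplied domain:
    #
    #     unpacked = unpack_regex_message(
    #         Message({TEXT: '/greet{"name": "Ada"}'}), domain=domain
    #     )
    #     unpacked.get(INTENT)[INTENT_NAME_KEY]  # -> "greet"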
def test_is_core_or_domain_message(
    message: Message, result: bool,
):
    assert result == message.is_core_or_domain_message()
def test_add_diagnostic_data_with_repeated_component_raises_warning():
    message = Message()
    message.add_diagnostic_data("a", {})
    with pytest.warns(UserWarning):
        message.add_diagnostic_data("a", {})
def test_fingerprint_is_same_when_loading_data_again():
    from rasa.shared.importers.utils import training_data_from_paths

    files = [
        "data/examples/rasa/demo-rasa.md",
        "data/examples/rasa/demo-rasa-responses.md",
    ]
    td1 = training_data_from_paths(files, language="en")
    td2 = training_data_from_paths(files, language="en")
    assert td1.fingerprint() == td2.fingerprint()


@pytest.mark.parametrize(
    "message",
    [
        Message({INTENT: "intent2"}),
        Message({ENTITIES: [{"entity": "entity2"}]}),
        Message({ENTITIES: [{"entity": "entity1", "group": "new_group"}]}),
        Message({ENTITIES: [{"entity": "entity1", "role": "new_role"}]}),
        Message({ACTION_NAME: "action_name2"}),
    ],
)
def test_label_fingerprints(message: Message):
    training_data1 = TrainingData(
        [
            Message({INTENT: "intent1"}),
            Message({ENTITIES: [{"entity": "entity1"}]}),
            Message({ACTION_NAME: "action_name1"}),
        ]
    )
    training_data2 = training_data1.merge(TrainingData([message]))
    features: Optional[List[Features]],
    attribute: Text,
    featurizers: List[Text],
    expected: bool,
):
    message = Message(data={TEXT: "This is a test sentence."}, features=features)
    actual = message.features_present(attribute, featurizers)
    assert actual == expected


@pytest.mark.parametrize(
    "message, result",
    [
        (Message({INTENT: "intent", TEXT: "text"}), False),
        (Message({RESPONSE: "response", TEXT: "text"}), False),
        (Message({INTENT: "intent"}), True),
        (Message({ACTION_TEXT: "action text"}), True),
        (Message({ACTION_NAME: "action name"}), True),
        (Message({TEXT: "text"}), True),
    ],
)
def test_is_core_or_domain_message(
    message: Message, result: bool,
):
    assert result == message.is_core_or_domain_message()


def test_add_diagnostic_data_with_repeated_component_raises_warning():
    message = Message()
def test_spacy_featurizer_train(spacy_nlp):
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    message.set(SPACY_DOCS[RESPONSE], spacy_nlp(sentence))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert 5 == len(seq_vecs)
    assert 1 == len(sen_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sen_vecs = message.get_dense_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert 5 == len(seq_vecs)
    assert 1 == len(sen_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sen_vecs = message.get_dense_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert seq_vecs is None
    assert sen_vecs is None
    for i, o in enumerate(output):
        assert isinstance(o, np.ndarray)
        assert o[0][i] == 1
        assert o.shape == (1, len(label_features))


@pytest.mark.parametrize(
    "messages, expected",
    [
        (
            [
                Message(
                    data={TEXT: "test a"},
                    features=[
                        Features(np.zeros(1), FEATURE_TYPE_SEQUENCE, TEXT, "test"),
                        Features(np.zeros(1), FEATURE_TYPE_SENTENCE, TEXT, "test"),
                    ],
                ),
                Message(
                    data={TEXT: "test b"},
                    features=[
                        Features(np.zeros(1), FEATURE_TYPE_SEQUENCE, TEXT, "test"),
                        Features(np.zeros(1), FEATURE_TYPE_SENTENCE, TEXT, "test"),
                    ],
                ),
            ],
            True,
        ),
        (
            [
def process(self, message: Message, **kwargs: Any) -> None:
    from tokenizer_tools.tagset.NER.BILUO import BILUOSequenceEncoderDecoder
    from tokenizer_tools.tagset.offset.sequence import Sequence

    decoder = BILUOSequenceEncoderDecoder()

    real_result_dir = os.path.join(self.model_dir, self.result_dir)
    logger.debug(real_result_dir)

    input_text = message.text
    input_feature = {
        "words": [list(input_text)],
        "words_len": [len(input_text)],
    }
    logger.debug(input_feature)

    predictions = self.predict_fn(input_feature)
    tags = predictions["tags"][0]

    # decode Unicode
    tags_seq = [i.decode() for i in tags]
    logger.debug(tags_seq)

    # BILUO to offset
    failed = False
    try:
        seq = decoder.to_offset(tags_seq, input_text)
    except Exception as e:
        # an invalid tag sequence raises an exception, so return an empty result
        logger.error("Decode error: {}".format(e))
        seq = Sequence(input_text)
        failed = True

    logger.debug("%s %s %s", seq, tags_seq, failed)

    entity_set = []
    seq.span_set.fill_text(input_text)
    for span in seq.span_set:
        ent = {
            "entity": span.entity,
            "value": span.value,
            "start": span.start,
            "confidence": None,
            "end": span.end,
        }
        entity_set.append(ent)

    extracted = self.add_extractor_name(entity_set)
    message.set(
        "entities", message.get("entities", []) + extracted, add_to_output=True
    )
def test_encode_state__with_lookup__looksup_or_creates_features(action_name: Text):
    """Tests that features from the lookup table are combined or created from scratch.

    If the given action name is ...
    - ACTION_LISTEN_NAME, then the user substate and the action name are encoded
    - some "other" action, then the user substate is not encoded but the action
      name is
    - None, then the action name is removed from the user substate and, as a
      result, there is no encoding for the action name or for the user substate
    """
    f = SingleStateFeaturizer()
    f._default_feature_states[INTENT] = {"greet": 0, "inform": 1}
    f._default_feature_states[ENTITIES] = {
        "city": 0,
        "name": 1,
        f"city{ENTITY_LABEL_SEPARATOR}to": 2,
        f"city{ENTITY_LABEL_SEPARATOR}from": 3,
    }
    f._default_feature_states[ACTION_NAME] = {
        "NOT_action_listen": 0,
        "utter_greet": 1,
        ACTION_LISTEN_NAME: 2,
    }
    # `_0` in slots represents the feature dimension
    f._default_feature_states[SLOTS] = {"slot_1_0": 0, "slot_2_0": 1, "slot_3_0": 2}
    f._default_feature_states[ACTIVE_LOOP] = {
        "active_loop_1": 0,
        "active_loop_2": 1,
        "active_loop_3": 2,
        "active_loop_4": 3,
    }

    # create state
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entity_name_list = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    action_text = "throw a ball"
    intent = "inform"
    state = {
        USER: {TEXT: text, INTENT: intent, ENTITIES: entity_name_list},
        PREVIOUS_ACTION: {ACTION_NAME: action_name, ACTION_TEXT: action_text},
        ACTIVE_LOOP: {"name": "active_loop_4"},
        SLOTS: {"slot_1": (1.0,)},
    }
    if action_name is None:
        del state[PREVIOUS_ACTION][ACTION_NAME]

    # Build a lookup table with all relevant information - and dummy features for
    # all dense featurizable attributes.
    # Note that we don't need to add the `ENTITIES` to the message including `TEXT`
    # here because `encode_state` won't featurize the entities using the lookup
    # table (only `encode_entities` does that).
    units = 300
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add_all(
        [
            Message(
                data={TEXT: text, TOKENS_NAMES[TEXT]: tokens},
                features=[
                    dummy_features(
                        fill_value=11,
                        units=units,
                        attribute=TEXT,
                        type=SENTENCE,
                        is_sparse=True,
                    ),
                    dummy_features(
                        fill_value=12,
                        units=units,
                        attribute=TEXT,
                        type=SEQUENCE,
                        is_sparse=False,
                    ),
                    # Note: sparse sequence feature is last here
                    dummy_features(
                        fill_value=13,
                        units=units,
                        attribute=TEXT,
                        type=SEQUENCE,
                        is_sparse=True,
                    ),
                ],
            ),
            Message(data={INTENT: intent}),
            Message(
                data={ACTION_TEXT: action_text},
                features=[
                    dummy_features(
                        fill_value=1,
                        units=units,
                        attribute=ACTION_TEXT,
                        type=SEQUENCE,
                        is_sparse=True,
                    )
                ],
            ),
        ]
    )
    if action_name is not None:
        precomputations.add(Message(data={ACTION_NAME: action_name}))

    # encode the state
    encoded = f.encode_state(state, precomputations=precomputations)

    # check that all features are encoded and that the *_text features are encoded
    # by a dense featurizer
    expected_attributes = [SLOTS, ACTIVE_LOOP, ACTION_TEXT]
    if action_name is not None:  # i.e. we did not remove it from the state
        expected_attributes += [ACTION_NAME]
    if action_name == ACTION_LISTEN_NAME:
        expected_attributes += [TEXT, ENTITIES, INTENT]
    assert set(encoded.keys()) == set(expected_attributes)

    # Remember, sparse sequence features come first (and `.features` denotes the
    # matrix, not a `Features` object)
    if action_name == ACTION_LISTEN_NAME:
        assert encoded[TEXT][0].features.shape[-1] == units
        assert encoded[TEXT][0].is_sparse()
        assert encoded[ENTITIES][0].features.shape[-1] == 4
        assert sparse_equals_dense(encoded[INTENT][0].features, np.array([[0, 1]]))
    assert encoded[ACTION_TEXT][0].features.shape[-1] == units
    assert encoded[ACTION_TEXT][0].is_sparse()
    if action_name is not None:
        if action_name == "NOT_action_listen":
            action_name_encoding = [1, 0, 0]
        else:  # action_listen
            action_name_encoding = [0, 0, 1]
        assert sparse_equals_dense(
            encoded[ACTION_NAME][0].features, np.array([action_name_encoding])
        )
    else:
        assert ACTION_NAME not in encoded
    assert sparse_equals_dense(encoded[SLOTS][0].features, np.array([[1, 0, 0]]))
    assert sparse_equals_dense(
        encoded[ACTIVE_LOOP][0].features, np.array([[0, 0, 0, 1]])
    )
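# `sparse_equals_dense` is used above but not defined in this excerpt. A
# minimal sketch of what such a helper presumably does (an assumption, not
# the original implementation):
import numpy as np
import scipy.sparse


def sparse_equals_dense(matrix: scipy.sparse.spmatrix, expected: np.ndarray) -> bool:
    # Densify the sparse matrix and compare element-wise.
    return bool(np.all(matrix.toarray() == expected))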
Message(
    data={
        TEXT: "some message",
        INTENT: {
            INTENT_NAME_KEY: "greet",
            PREDICTED_CONFIDENCE_KEY: 0.234891876578331,
        },
        INTENT_RANKING_KEY: [
            {
                INTENT_NAME_KEY: "greet",
                PREDICTED_CONFIDENCE_KEY: 0.234891876578331,
            },
            {INTENT_NAME_KEY: "stop", PREDICTED_CONFIDENCE_KEY: 0.5 - 0.0001},
            {INTENT_NAME_KEY: "affirm", PREDICTED_CONFIDENCE_KEY: 0},
            {INTENT_NAME_KEY: "inform", PREDICTED_CONFIDENCE_KEY: -100},
            {
                INTENT_NAME_KEY: "deny",
                PREDICTED_CONFIDENCE_KEY: 0.0879683718085289,
            },
        ],
    }
),
def test_encode_entities__with_bilou_entity_roles_and_groups():
    # Instantiate domain and configure the single state featurizer for this
    # domain. Note that there are 2 entity tags here.
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain, bilou_tagging=True)

    # (1) example with both entities
    # create a message that has been tokenized and where entities have been
    # extracted
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # encode!
    encoded = f.encode_entities(
        {TEXT: text, ENTITIES: entities},
        precomputations=precomputations,
        bilou_tagging=True,
    )
    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [4], [0], [8]]
    )

    # (2) example with only the "city" entity
    # create a message that has been tokenized and where entities have been
    # extracted
    text = "I am flying to Saint Petersburg"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: "Saint Petersburg",
            ENTITY_ATTRIBUTE_START: 15,
            ENTITY_ATTRIBUTE_END: 31,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # encode!
    encoded = f.encode_entities(
        {TEXT: text, ENTITIES: entities},
        precomputations=precomputations,
        bilou_tagging=True,
    )
    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
    assert np.all(encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [3]])
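# Hedged sketch of the tag-index scheme the expected values above rely on:
# index 0 is the "O" (no entity) tag, and each entity tag appears to occupy
# four consecutive indices for its B-, I-, L- and U- variants. This mapping is
# inferred from the asserts above, not taken from the featurizer's source.
def _bilou_index(tag_position: int, bilou_prefix: str) -> int:
    # tag_position: 0-based position of the entity tag in the tag list
    offsets = {"B": 1, "I": 2, "L": 3, "U": 4}
    return tag_position * 4 + offsets[bilou_prefix]


assert _bilou_index(0, "U") == 4  # "London" -> U-city
assert _bilou_index(1, "U") == 8  # "Paris" -> U-city#to
assert _bilou_index(0, "B") == 1  # "Saint" -> B-city
assert _bilou_index(0, "L") == 3  # "Petersburg" -> L-city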
def _features_for_patterns(
    self, message: Message, attribute: Text
) -> Tuple[Optional[scipy.sparse.coo_matrix], Optional[scipy.sparse.coo_matrix]]:
    """Checks which known patterns match the message.

    Given a sentence, returns a vector of {1, 0} values indicating which
    regexes did match. Furthermore, if the message is tokenized, the function
    will mark all tokens with a dict relating the name of the regex to whether
    it was matched.

    Args:
        message: Message to be featurized.
        attribute: Attribute of message to be featurized.

    Returns:
        Token and sentence level features of message attribute.
    """
    # Attribute not set (e.g. response not present)
    if not message.get(attribute):
        return None, None

    tokens = message.get(TOKENS_NAMES[attribute], [])
    if not tokens:
        # nothing to featurize
        return None, None

    flags = 0  # default flag
    if not self.case_sensitive:
        flags = re.IGNORECASE

    sequence_length = len(tokens)
    num_patterns = len(self.known_patterns)

    sequence_features = np.zeros([sequence_length, num_patterns])
    sentence_features = np.zeros([1, num_patterns])

    for pattern_index, pattern in enumerate(self.known_patterns):
        matches = re.finditer(pattern["pattern"], message.get(attribute), flags=flags)
        matches = list(matches)

        for token_index, t in enumerate(tokens):
            patterns = t.get("pattern", default={})
            patterns[pattern["name"]] = False

            for match in matches:
                if t.start < match.end() and t.end > match.start():
                    patterns[pattern["name"]] = True
                    sequence_features[token_index][pattern_index] = 1.0
                    if attribute in [RESPONSE, TEXT, ACTION_TEXT]:
                        # sentence vector should contain all patterns
                        sentence_features[0][pattern_index] = 1.0

            t.set("pattern", patterns)

    return (
        scipy.sparse.coo_matrix(sequence_features),
        scipy.sparse.coo_matrix(sentence_features),
    )
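# Standalone illustration of the span-overlap test used above (a hypothetical
# helper, not part of the featurizer): a token is marked as matching a pattern
# whenever the two character intervals intersect.
import re


def _spans_overlap(token_start: int, token_end: int, match: re.Match) -> bool:
    # Same condition as `t.start < match.end() and t.end > match.start()`.
    return token_start < match.end() and token_end > match.start()


_match = next(re.finditer(r"[0-9]+", "room 42 please"))
assert _spans_overlap(5, 7, _match)  # token "42" overlaps the match
assert not _spans_overlap(0, 4, _match)  # token "room" does not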
def get_doc(self, message: Message, attribute: Text) -> Optional["Doc"]:
    return message.get(SPACY_DOCS[attribute])
def test_convert_featurizer_train(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]], ConveRTFeaturizer],
    monkeypatch: MonkeyPatch,
    load: bool,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    monkeypatch.setattr(ConveRTFeaturizer, "_validate_model_url", lambda _: None)
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config, load=load)

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.process_training_data(TrainingData([message]))

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])
    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(RESPONSE, [])
    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(INTENT, [])

    assert seq_vecs is None
    assert sent_vecs is None
def test_count_vector_featurizer_persist_load(tmp_path: Path):
    # set non-default values in the config
    config = {
        "analyzer": "char",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = CountVectorsFeaturizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"
    train_message1 = Message(data={TEXT: sentence1})
    train_message2 = Message(data={TEXT: sentence2})
    WhitespaceTokenizer().process(train_message1)
    WhitespaceTokenizer().process(train_message2)
    data = TrainingData([train_message1, train_message2])
    train_ftr.train(data)

    # persist featurizer
    file_dict = train_ftr.persist("ftr", str(tmp_path))
    train_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in train_ftr.vectorizers.items()
    }

    # add trained vocabulary to vectorizer params
    for attribute, attribute_vect_params in train_vect_params.items():
        if hasattr(train_ftr.vectorizers[attribute], "vocabulary_"):
            train_vect_params[attribute].update(
                {"vocabulary": train_ftr.vectorizers[attribute].vocabulary_}
            )

    # load featurizer
    meta = train_ftr.component_config.copy()
    meta.update(file_dict)
    test_ftr = CountVectorsFeaturizer.load(meta, str(tmp_path), finetune_mode=False)
    test_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in test_ftr.vectorizers.items()
    }

    assert train_vect_params == test_vect_params

    # check if the vocabulary was loaded correctly
    assert hasattr(test_ftr.vectorizers[TEXT], "vocabulary_")

    test_message1 = Message(data={TEXT: sentence1})
    WhitespaceTokenizer().process(test_message1)
    test_ftr.process(test_message1)
    test_message2 = Message(data={TEXT: sentence2})
    WhitespaceTokenizer().process(test_message2)
    test_ftr.process(test_message2)

    test_seq_vec_1, test_sen_vec_1 = test_message1.get_sparse_features(TEXT, [])
    if test_seq_vec_1:
        test_seq_vec_1 = test_seq_vec_1.features
    if test_sen_vec_1:
        test_sen_vec_1 = test_sen_vec_1.features
    train_seq_vec_1, train_sen_vec_1 = train_message1.get_sparse_features(TEXT, [])
    if train_seq_vec_1:
        train_seq_vec_1 = train_seq_vec_1.features
    if train_sen_vec_1:
        train_sen_vec_1 = train_sen_vec_1.features
    test_seq_vec_2, test_sen_vec_2 = test_message2.get_sparse_features(TEXT, [])
    if test_seq_vec_2:
        test_seq_vec_2 = test_seq_vec_2.features
    if test_sen_vec_2:
        test_sen_vec_2 = test_sen_vec_2.features
    train_seq_vec_2, train_sen_vec_2 = train_message2.get_sparse_features(TEXT, [])
    if train_seq_vec_2:
        train_seq_vec_2 = train_seq_vec_2.features
    if train_sen_vec_2:
        train_sen_vec_2 = train_sen_vec_2.features

    # check that train features and test features after loading are the same
    assert np.all(test_seq_vec_1.toarray() == train_seq_vec_1.toarray())
    assert np.all(test_sen_vec_1.toarray() == train_sen_vec_1.toarray())
    assert np.all(test_seq_vec_2.toarray() == train_seq_vec_2.toarray())
    assert np.all(test_sen_vec_2.toarray() == train_sen_vec_2.toarray())
async def test_adjusting_layers_incremental_training(
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    load_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    train_and_preprocess: Callable[..., Tuple[TrainingData, List[GraphComponent]]],
    process_message: Callable[..., Message],
):
    """Tests adjusting sparse layers of `ResponseSelector` to increased sparse
    feature sizes during incremental training.

    Testing is done by checking the layer sizes. Checking if they were replaced
    correctly is also important and is done in
    `test_replace_dense_for_sparse_layers` in `test_rasa_layers.py`.
    """
    iter1_data_path = "data/test_incremental_training/iter1/"
    iter2_data_path = "data/test_incremental_training/"
    pipeline = [
        {"component": WhitespaceTokenizer},
        {"component": LexicalSyntacticFeaturizer},
        {"component": RegexFeaturizer},
        {"component": CountVectorsFeaturizer},
        {
            "component": CountVectorsFeaturizer,
            "analyzer": "char_wb",
            "min_ngram": 1,
            "max_ngram": 4,
        },
    ]
    training_data, loaded_pipeline = train_and_preprocess(pipeline, iter1_data_path)

    response_selector = create_response_selector({EPOCHS: 1})
    response_selector.train(training_data=training_data)

    old_data_signature = response_selector.model.data_signature
    old_predict_data_signature = response_selector.model.predict_data_signature

    message = Message(data={TEXT: "Rasa is great!"})
    message = process_message(loaded_pipeline, message)

    message2 = copy.deepcopy(message)

    classified_message = response_selector.process([message])[0]

    old_sparse_feature_sizes = classified_message.get_sparse_feature_sizes(
        attribute=TEXT
    )

    initial_rs_layers = response_selector.model._tf_layers[
        "sequence_layer.text"
    ]._tf_layers["feature_combining"]
    initial_rs_sequence_layer = initial_rs_layers._tf_layers[
        "sparse_dense.sequence"
    ]._tf_layers["sparse_to_dense"]
    initial_rs_sentence_layer = initial_rs_layers._tf_layers[
        "sparse_dense.sentence"
    ]._tf_layers["sparse_to_dense"]

    initial_rs_sequence_size = initial_rs_sequence_layer.get_kernel().shape[0]
    initial_rs_sentence_size = initial_rs_sentence_layer.get_kernel().shape[0]
    assert initial_rs_sequence_size == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE]
    )
    assert initial_rs_sentence_size == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SENTENCE]
    )

    loaded_selector = load_response_selector({EPOCHS: 1})

    classified_message2 = loaded_selector.process([message2])[0]

    assert classified_message2.fingerprint() == classified_message.fingerprint()

    training_data2, loaded_pipeline2 = train_and_preprocess(pipeline, iter2_data_path)

    response_selector.train(training_data=training_data2)

    new_message = Message.build(text="Rasa is great!")
    new_message = process_message(loaded_pipeline2, new_message)

    classified_new_message = response_selector.process([new_message])[0]
    new_sparse_feature_sizes = classified_new_message.get_sparse_feature_sizes(
        attribute=TEXT
    )

    final_rs_layers = response_selector.model._tf_layers[
        "sequence_layer.text"
    ]._tf_layers["feature_combining"]
    final_rs_sequence_layer = final_rs_layers._tf_layers[
        "sparse_dense.sequence"
    ]._tf_layers["sparse_to_dense"]
    final_rs_sentence_layer = final_rs_layers._tf_layers[
        "sparse_dense.sentence"
    ]._tf_layers["sparse_to_dense"]

    final_rs_sequence_size = final_rs_sequence_layer.get_kernel().shape[0]
    final_rs_sentence_size = final_rs_sentence_layer.get_kernel().shape[0]
    assert final_rs_sequence_size == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE]
    )
    assert final_rs_sentence_size == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SENTENCE]
    )

    # check if the data signatures were correctly updated
    new_data_signature = response_selector.model.data_signature
    new_predict_data_signature = response_selector.model.predict_data_signature
    iter2_data = load_data(iter2_data_path)
    expected_sequence_lengths = len(
        [
            message
            for message in iter2_data.training_examples
            if message.get(INTENT_RESPONSE_KEY)
        ]
    )

    def test_data_signatures(
        new_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
        old_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
    ):
        # Wherever an attribute / feature-type signature is not expected to
        # change, compare it directly to the old data signature; otherwise
        # compute its expected signature and compare.
        attributes_expected_to_change = [TEXT]
        feature_types_expected_to_change = [
            FEATURE_TYPE_SEQUENCE,
            FEATURE_TYPE_SENTENCE,
        ]

        for attribute, signatures in new_signature.items():
            for feature_type, feature_signatures in signatures.items():
                if feature_type == "sequence_lengths":
                    assert feature_signatures[0].units == expected_sequence_lengths
                elif feature_type not in feature_types_expected_to_change:
                    assert feature_signatures == old_signature.get(attribute).get(
                        feature_type
                    )
                else:
                    for index, feature_signature in enumerate(feature_signatures):
                        if (
                            feature_signature.is_sparse
                            and attribute in attributes_expected_to_change
                        ):
                            assert feature_signature.units == sum(
                                new_sparse_feature_sizes.get(feature_type)
                            )
                        else:
                            # dense signatures, and attributes that are not
                            # expected to change, can be compared directly
                            assert (
                                feature_signature.units
                                == old_signature.get(attribute)
                                .get(feature_type)[index]
                                .units
                            )

    test_data_signatures(new_data_signature, old_data_signature)
    test_data_signatures(new_predict_data_signature, old_predict_data_signature)
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    # add a second example that has some response, so that the vocabulary for
    # response exists
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])

    tk.train(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    for module in [tk, ftr]:
        module.process(test_message)

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    assert action_name_seq_vecs.toarray()[0] == action_name_features
    assert action_name_sen_vecs is None
async def test_sparse_feature_sizes_decreased_incremental_training(
    iter1_path: Text,
    iter2_path: Text,
    should_raise_exception: bool,
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    load_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    default_execution_context: ExecutionContext,
    train_and_preprocess: Callable[..., Tuple[TrainingData, List[GraphComponent]]],
    process_message: Callable[..., Message],
):
    pipeline = [
        {"component": WhitespaceTokenizer},
        {"component": LexicalSyntacticFeaturizer},
        {"component": RegexFeaturizer},
        {"component": CountVectorsFeaturizer},
        {
            "component": CountVectorsFeaturizer,
            "analyzer": "char_wb",
            "min_ngram": 1,
            "max_ngram": 4,
        },
    ]
    training_data, loaded_pipeline = train_and_preprocess(pipeline, iter1_path)

    response_selector = create_response_selector({EPOCHS: 1})
    response_selector.train(training_data=training_data)

    message = Message(data={TEXT: "Rasa is great!"})
    message = process_message(loaded_pipeline, message)

    message2 = copy.deepcopy(message)

    classified_message = response_selector.process([message])[0]

    default_execution_context.is_finetuning = True

    loaded_selector = load_response_selector({EPOCHS: 1})

    classified_message2 = loaded_selector.process([message2])[0]

    assert classified_message2.fingerprint() == classified_message.fingerprint()

    if should_raise_exception:
        with pytest.raises(Exception) as exec_info:
            training_data2, loaded_pipeline2 = train_and_preprocess(
                pipeline, iter2_path
            )
            loaded_selector.train(training_data=training_data2)
        assert "Sparse feature sizes have decreased" in str(exec_info.value)
    else:
        training_data2, loaded_pipeline2 = train_and_preprocess(pipeline, iter2_path)
        loaded_selector.train(training_data=training_data2)
        assert loaded_selector.model
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
):
    return CoreFeaturizationCollector.create(
        CoreFeaturizationCollector.get_default_config(),
        default_model_storage,
        Resource("CoreFeaturizationCollector"),
        default_execution_context,
    )


@pytest.mark.parametrize(
    "messages_with_unique_lookup_key",
    [
        [
            Message(data={TEXT: "A"}, features=[_dummy_features(1, TEXT)]),
            Message(data={ACTION_TEXT: "B"}),
        ],
        [],
    ],
)
def test_collection(
    collector: CoreFeaturizationCollector,
    messages_with_unique_lookup_key: List[Message],
):
    messages = messages_with_unique_lookup_key

    # pass as training data
    training_data = TrainingData(training_examples=messages)
    precomputations = collector.collect(training_data)
def test_convert_featurizer_train(monkeypatch: MonkeyPatch):
    tokenizer = WhitespaceTokenizer()

    monkeypatch.setattr(
        ConveRTFeaturizer,
        "_get_validated_model_url",
        lambda x: RESTRICTED_ACCESS_URL,
    )
    component_config = {
        "name": "ConveRTFeaturizer",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = ConveRTFeaturizer(component_config)

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    td = TrainingData([message])
    tokenizer.train(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(
        TrainingData([message]), RasaNLUModelConfig(), tf_hub_module=featurizer.module
    )

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])
    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(RESPONSE, [])
    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(INTENT, [])

    assert seq_vecs is None
    assert sent_vecs is None
def test_container_all_messages():
    message_data_list = [{INTENT: "1"}, {INTENT: "2", "other": 3}, {TEXT: "3"}]
    container = MessageContainerForCoreFeaturization()
    container.add_all([Message(data=data) for data in message_data_list])
    assert len(container.all_messages()) == 3
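# Minimal sketch of how the container behaves across the tests above (uses
# only calls already shown in this file; the expected values follow from the
# assertions in `test_container_keys` and `test_container_all_messages`):
def _example_container_usage() -> None:
    container = MessageContainerForCoreFeaturization()
    container.add(Message(data={INTENT: "greet"}))
    container.add(Message(data={TEXT: "hello"}))
    # `keys(attribute)` lists the distinct values seen for that attribute,
    # while `all_messages()` returns every stored message.
    assert set(container.keys(INTENT)) == {"greet"}
    assert len(container.all_messages()) == 2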
def test_whitespace_training(supervised_embeddings_config: RasaNLUModelConfig):
    examples = [
        Message(
            data={
                TEXT: "Any Mexican restaurant will do",
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            }
        ),
        Message(
            data={
                TEXT: "I want Tacos!",
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            }
        ),
        Message(data={TEXT: "action_restart", "action_name": "action_restart"}),
        Message(
            data={
                TEXT: "Where are you going?",
                ACTION_NAME: "Where are you going?",
                ACTION_TEXT: "Where are you going?",
            }
        ),
    ]

    component_config = {"case_sensitive": False, "intent_tokenization_flag": True}
    tk = WhitespaceTokenizer(component_config)

    tk.train(TrainingData(training_examples=examples), supervised_embeddings_config)

    assert examples[0].data.get(TOKENS_NAMES[TEXT])[0].text == "Any"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[1].text == "Mexican"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[2].text == "restaurant"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[3].text == "will"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[4].text == "do"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[0].text == "I"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[1].text == "want"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[2].text == "Tacos"
    assert examples[2].data.get(TOKENS_NAMES[ACTION_NAME])[0].text == "action"
    assert examples[2].data.get(TOKENS_NAMES[ACTION_NAME])[1].text == "restart"
    assert examples[2].data.get(TOKENS_NAMES[TEXT])[0].text == "action_restart"
    assert examples[2].data.get(TOKENS_NAMES[ACTION_TEXT]) is None
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[0].text == "Where"
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[1].text == "are"
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[2].text == "you"
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[3].text == "going"
def process(self, message: Message, **kwargs: Any) -> None:
    for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
        if message.get(attribute):
            message.set(
                SPACY_DOCS[attribute], self.doc_for_text(message.get(attribute))
            )