def test_encode_entities__with_entity_roles_and_groups():
    """Checks entity encoding (with roles) against a precomputed lookup table.

    Fixed: the tag-id check previously *assigned* `tags_to_ids[entity_tag] = idx + 1`
    instead of asserting it, silently mutating the tag spec and verifying nothing.
    """
    # create fake message that has been tokenized and entities have been extracted
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})
    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)
    # instantiate matching domain and single state featurizer
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain)
    # encode!
    encoded = f.encode_entities(
        entity_data={TEXT: text, ENTITIES: entities},
        precomputations=precomputations,
    )
    # check
    assert len(f.entity_tag_specs) == 1
    tags_to_ids = f.entity_tag_specs[0].tags_to_ids
    for idx, entity_tag in enumerate(entity_tags):
        # was an assignment before; now actually verifies the mapping:
        # city -> 1, city#to -> 2 (id 0 is reserved for "no entity")
        assert tags_to_ids[entity_tag] == idx + 1
    assert sorted(list(encoded.keys())) == [ENTITY_TAGS]
    # one tag id per token: "London" -> 1 ("city"), "Paris" -> 2 ("city#to")
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [0], [2]]
    )
def test_load_multi_file_training_data(domain: Domain):
    """Single-file and multi-file story data must featurize identically.

    Fixed: the multi-file states/labels were previously extracted with the
    single-file `featurizer` instead of `featurizer_mul`, and the inner turn
    loop clobbered `f1`/`f2` while iterating `range(len(f1))`.
    """
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers = training.load_data(
        "data/test_yaml_stories/stories.yml", domain, augmentation_factor=0
    )
    trackers = sorted(trackers, key=lambda t: t.sender_id)
    (tr_as_sts, tr_as_acts) = featurizer.training_states_and_labels(trackers, domain)
    hashed = []
    for sts, acts in zip(tr_as_sts, tr_as_acts):
        hashed.append(json.dumps(sts + acts, sort_keys=True))
    hashed = sorted(hashed, reverse=True)
    data, label_ids, _ = featurizer.featurize_trackers(
        trackers, domain, precomputations=None
    )

    featurizer_mul = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers_mul = training.load_data(
        "data/test_multifile_yaml_stories", domain, augmentation_factor=0
    )
    trackers_mul = sorted(trackers_mul, key=lambda t: t.sender_id)
    # use featurizer_mul here (was: featurizer, a copy-paste slip)
    (tr_as_sts_mul, tr_as_acts_mul) = featurizer_mul.training_states_and_labels(
        trackers_mul, domain
    )
    hashed_mul = []
    for sts_mul, acts_mul in zip(tr_as_sts_mul, tr_as_acts_mul):
        hashed_mul.append(json.dumps(sts_mul + acts_mul, sort_keys=True))
    hashed_mul = sorted(hashed_mul, reverse=True)
    data_mul, label_ids_mul, _ = featurizer_mul.featurize_trackers(
        trackers_mul, domain, precomputations=None
    )

    assert hashed == hashed_mul

    # we check for intents, action names and entities -- the features which
    # are included in the story files
    data = _surface_attributes(data)
    data_mul = _surface_attributes(data_mul)
    for attribute in [INTENT, ACTION_NAME, ENTITIES]:
        if attribute not in data or attribute not in data_mul:
            continue
        assert len(data.get(attribute)) == len(data_mul.get(attribute))
        for idx_tracker in range(len(data.get(attribute))):
            for idx_dialogue in range(len(data.get(attribute)[idx_tracker])):
                f1 = data.get(attribute)[idx_tracker][idx_dialogue]
                f2 = data_mul.get(attribute)[idx_tracker][idx_dialogue]
                if f1 is None or f2 is None:
                    assert f1 == f2
                    continue
                for idx_turn in range(len(f1)):
                    # fresh names: the original rebound f1/f2 inside the loop
                    turn_f1 = f1[idx_turn]
                    turn_f2 = f2[idx_turn]
                    assert np.all((turn_f1 == turn_f2).data)

    assert np.all(label_ids == label_ids_mul)
def test_encode_all_labels__encoded_all_action_names_and_texts():
    """Encoding all labels (i.e. actions) yields one name-based encoding each."""
    domain = Domain(
        intents=[],
        entities=[],
        slots=[],
        responses={},
        forms={},
        action_names=["a", "b", "c", "d"],
        data={},
    )
    featurizer = SingleStateFeaturizer()
    featurizer.prepare_for_training(domain)
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.derive_messages_from_domain_and_add(domain)

    encoded_actions = featurizer.encode_all_labels(
        domain, precomputations=precomputations
    )

    assert len(encoded_actions) == len(domain.action_names_or_texts)
    # every encoding is keyed by action name, never by action text
    for encoded_action in encoded_actions:
        assert ACTION_NAME in encoded_action
        assert ACTION_TEXT not in encoded_action
def test_encode_state__with_lookup__creates_features_for_intent_and_action_name(
    with_action_listen: bool,
):
    """Tests that features for intent and action name are created if needed.

    Especially tests that this is the case even though no features are present
    in the given lookup table for this intent and action_name. However, if no
    `action_listen` is in the given sub-state, then the user sub-state should
    not be featurized (hence, no features for intent should be created).
    """
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ACTION_NAME] = {
        "c": 0,
        "d": 1,
        ACTION_LISTEN_NAME: 2,
    }
    # build the state to encode
    previous_action = ACTION_LISTEN_NAME if with_action_listen else "c"
    state = {USER: {INTENT: "e"}, PREVIOUS_ACTION: {ACTION_NAME: previous_action}}
    # lookup table containing all relevant entries **but no Features**
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(Message(data={INTENT: state[USER][INTENT]}))
    precomputations.add(
        Message(data={ACTION_NAME: state[PREVIOUS_ACTION][ACTION_NAME]})
    )

    encoded = featurizer.encode_state(state, precomputations=precomputations)

    if not with_action_listen:
        assert set(encoded.keys()) == {ACTION_NAME}
    else:
        assert set(encoded.keys()) == {INTENT, ACTION_NAME}
        # intent "e" is unknown, so its encoding is an all-zero row
        zero_row = scipy.sparse.coo_matrix([[0, 0]])
        assert (encoded[INTENT][0].features != zero_row).nnz == 0
def test_single_state_featurizer_correctly_encodes_non_existing_value():
    """An unknown intent must be encoded as an all-zero sparse vector."""
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ACTION_NAME] = {"c": 0, "d": 1}

    encoded = featurizer.encode_state(
        {"user": {"intent": "e"}, "prev_action": {"action_name": "action_listen"}},
        interpreter=RegexInterpreter(),
    )

    assert list(encoded.keys()) == [INTENT, ACTION_NAME]
    # intent "e" is not in the feature states, so no dimension is set
    zero_row = scipy.sparse.coo_matrix([[0, 0]])
    assert (encoded[INTENT][0].features != zero_row).nnz == 0
def test_persist_and_load_tracker_featurizer(tmp_path: Text, moodbot_domain: Domain):
    """A persisted tracker featurizer can be restored from disk."""
    state_featurizer = SingleStateFeaturizer()
    state_featurizer.prepare_for_training(moodbot_domain, RegexInterpreter())
    original = MaxHistoryTrackerFeaturizer(state_featurizer)
    original.persist(tmp_path)

    restored = TrackerFeaturizer.load(tmp_path)

    assert restored is not None
    assert restored.state_featurizer is not None
def test_single_state_featurizer_with_interpreter_state_with_no_action_name(
    unpacked_trained_moodbot_path: Text,
):
    """Without an action name, action-name features are skipped.

    Checks that action name features are not added by the featurizer when not
    present in the state, and that user input is ignored when the action is
    not action_listen.
    """
    from rasa.core.agent import Agent

    interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ENTITIES] = {"c": 0}
    featurizer._default_feature_states[ACTION_NAME] = {
        "e": 0,
        "d": 1,
        "action_listen": 2,
    }
    featurizer._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2}
    featurizer._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3}
    state = {
        "user": {"text": "a ball", "intent": "b", "entities": ["c"]},
        "prev_action": {"action_text": "throw a ball"},
        "active_loop": {"name": "k"},
        "slots": {"e": (1.0,)},
    }

    encoded = featurizer.encode_state(state, interpreter=interpreter)

    assert list(encoded.keys()) == [ACTION_TEXT, ACTIVE_LOOP, SLOTS]
    assert encoded[ACTION_TEXT][0].features.shape[-1] == 300
    assert (
        encoded[SLOTS][0].features != scipy.sparse.coo_matrix([[1, 0, 0]])
    ).nnz == 0
    assert (
        encoded[ACTIVE_LOOP][0].features != scipy.sparse.coo_matrix([[0, 0, 0, 1]])
    ).nnz == 0
def test_single_state_featurizer_with_interpreter_state_with_action_listen(
    unpacked_trained_spacybot_path: Text,
):
    """With a trained interpreter and action_listen, all attributes get encoded."""
    interpreter = Agent.load(unpacked_trained_spacybot_path).interpreter
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"greet": 0, "inform": 1}
    featurizer._default_feature_states[ENTITIES] = {
        "city": 0,
        "name": 1,
        f"city{ENTITY_LABEL_SEPARATOR}to": 2,
        f"city{ENTITY_LABEL_SEPARATOR}from": 3,
    }
    featurizer._default_feature_states[ACTION_NAME] = {
        "utter_ask_where_to": 0,
        "utter_greet": 1,
        "action_listen": 2,
    }
    # `_0` in slots represent feature dimension
    featurizer._default_feature_states[SLOTS] = {
        "slot_1_0": 0,
        "slot_2_0": 1,
        "slot_3_0": 2,
    }
    featurizer._default_feature_states[ACTIVE_LOOP] = {
        "active_loop_1": 0,
        "active_loop_2": 1,
        "active_loop_3": 2,
        "active_loop_4": 3,
    }
    state = {
        "user": {
            "text": "I am flying from London to Paris",
            "intent": "inform",
            "entities": ["city", f"city{ENTITY_LABEL_SEPARATOR}to"],
        },
        "prev_action": {
            "action_name": "action_listen",
            "action_text": "throw a ball",
        },
        "active_loop": {"name": "active_loop_4"},
        "slots": {"slot_1": (1.0,)},
    }

    encoded = featurizer.encode_state(state, interpreter=interpreter)

    # check all the features are encoded and *_text features are encoded by a
    # dense featurizer
    assert sorted(encoded.keys()) == sorted(
        [TEXT, ENTITIES, ACTION_NAME, SLOTS, ACTIVE_LOOP, INTENT, ACTION_TEXT]
    )
    assert encoded[TEXT][0].features.shape[-1] == 300
    assert encoded[ACTION_TEXT][0].features.shape[-1] == 300
    assert (
        encoded[INTENT][0].features != scipy.sparse.coo_matrix([[0, 1]])
    ).nnz == 0
    assert (
        encoded[ACTION_NAME][0].features != scipy.sparse.coo_matrix([[0, 0, 1]])
    ).nnz == 0
    assert encoded[ENTITIES][0].features.shape[-1] == 4
    assert (
        encoded[SLOTS][0].features != scipy.sparse.coo_matrix([[1, 0, 0]])
    ).nnz == 0
    assert (
        encoded[ACTIVE_LOOP][0].features != scipy.sparse.coo_matrix([[0, 0, 0, 1]])
    ).nnz == 0
def _standard_featurizer(
    max_history: int = DEFAULT_MAX_HISTORY,
) -> MaxHistoryTrackerFeaturizer:
    """Builds the featurizer used by the Sklearn policy.

    Fixed: the `max_history` argument was previously ignored in favour of a
    hard-coded `max_history=5`; it is now passed through.
    """
    # Sklearn policy always uses MaxHistoryTrackerFeaturizer
    return MaxHistoryTrackerFeaturizer(
        state_featurizer=SingleStateFeaturizer(), max_history=max_history
    )
def test_single_state_featurizer_creates_encoded_all_actions():
    """`encode_all_actions` yields a name-based encoding for every action."""
    domain = Domain(
        intents=[],
        entities=[],
        slots=[],
        templates={},
        forms=[],
        action_names=["a", "b", "c", "d"],
    )
    featurizer = SingleStateFeaturizer()
    featurizer.prepare_from_domain(domain)

    encoded_actions = featurizer.encode_all_actions(domain, RegexInterpreter())

    assert len(encoded_actions) == len(domain.action_names)
    # every encoding is keyed by action name, never by action text
    for encoded_action in encoded_actions:
        assert ACTION_NAME in encoded_action
        assert ACTION_TEXT not in encoded_action
def test_single_state_featurizer_without_interpreter_state_not_with_action_listen():
    """Encoding a state without a trained interpreter.

    action_name is not action_listen, so INTENT, TEXT and ENTITIES should not
    be featurized.
    """
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ACTION_NAME] = {
        "c": 0,
        "d": 1,
        "action_listen": 2,
    }
    featurizer._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2}
    featurizer._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3}
    state = {
        "user": {"intent": "a", "text": "blah blah blah"},
        "prev_action": {"action_name": "d", "action_text": "boom"},
        "active_loop": {"name": "i"},
        "slots": {"g": (1.0,)},
    }

    encoded = featurizer.encode_state(state, interpreter=RegexInterpreter())

    # user input is ignored as prev action is not action_listen
    assert list(encoded.keys()) == [ACTION_NAME, ACTIVE_LOOP, SLOTS]
    expected_rows = {
        ACTION_NAME: [[0, 1, 0]],
        ACTIVE_LOOP: [[0, 1, 0, 0]],
        SLOTS: [[0, 0, 1]],
    }
    for attribute, one_hot in expected_rows.items():
        assert (
            encoded[attribute][0].features != scipy.sparse.coo_matrix(one_hot)
        ).nnz == 0
def test_single_state_featurizer_without_interpreter_state_no_intent_no_action_name():
    """Without interpreter, intent and action_name features require the state
    to actually contain an intent / action_name; here only ACTIVE_LOOP and
    SLOTS get featurized."""
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ACTION_NAME] = {
        "c": 0,
        "d": 1,
        "action_listen": 2,
    }
    featurizer._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2}
    featurizer._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3}
    # no intent / action_name features should be added when the interpreter
    # isn't there and intent / action_name are not part of the input
    state = {
        "user": {"text": "blah blah blah"},
        "prev_action": {"action_text": "boom"},
        "active_loop": {"name": "k"},
        "slots": {"e": (1.0,)},
    }

    encoded = featurizer.encode_state(state, interpreter=RegexInterpreter())

    assert list(encoded.keys()) == [ACTIVE_LOOP, SLOTS]
    expected_loop = scipy.sparse.coo_matrix([[0, 0, 0, 1]])
    expected_slots = scipy.sparse.coo_matrix([[1, 0, 0]])
    assert (encoded[ACTIVE_LOOP][0].features != expected_loop).nnz == 0
    assert (encoded[SLOTS][0].features != expected_slots).nnz == 0
def test_single_state_featurizer_without_interpreter_state_with_action_listen():
    """Encoding a state without a trained interpreter.

    action_name is action_listen, so INTENT and ENTITIES should be featurized
    while text shouldn't because we don't have an interpreter.
    """
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ACTION_NAME] = {
        "c": 0,
        "d": 1,
        "action_listen": 2,
    }
    featurizer._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2}
    featurizer._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3}
    state = {
        "user": {"intent": "a", "text": "blah blah blah"},
        "prev_action": {"action_name": "action_listen", "action_text": "boom"},
        "active_loop": {"name": "k"},
        "slots": {"e": (1.0,)},
    }

    encoded = featurizer.encode_state(state, interpreter=RegexInterpreter())

    # everything except the *_text attributes is featurized, since NLU
    # wasn't trained
    assert list(encoded.keys()) == [INTENT, ACTION_NAME, ACTIVE_LOOP, SLOTS]
    expected_rows = {
        INTENT: [[1, 0]],
        ACTION_NAME: [[0, 0, 1]],
        ACTIVE_LOOP: [[0, 0, 0, 1]],
        SLOTS: [[1, 0, 0]],
    }
    for attribute, one_hot in expected_rows.items():
        assert (
            encoded[attribute][0].features != scipy.sparse.coo_matrix(one_hot)
        ).nnz == 0
def test_create_features__dtype_float():
    """Features produced by `_create_features` must use float32."""
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ACTION_NAME] = {"e": 0, "d": 1}
    featurizer._default_feature_states[ENTITIES] = {"c": 0}

    created = featurizer._create_features({ACTION_NAME: "d"}, attribute=ACTION_NAME)

    # `_create_features` returns a list containing a single entry
    assert len(created) == 1
    assert created[0].features.dtype == np.float32
def test_encode_state__without_lookup(action_name: Text):
    """Tests that `encode_state` creates features for every attribute.

    In particular, that this is done even when there is no lookup table.
    If there is no action_listen in the state, then no features should be
    created for the user sub-state.
    """
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ACTION_NAME] = {
        "c": 0,
        "d": 1,
        "NOT_action_listen": 2,
        ACTION_LISTEN_NAME: 3,
    }
    featurizer._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2}
    featurizer._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3}
    state = {
        USER: {INTENT: "a", TEXT: "blah blah blah"},
        PREVIOUS_ACTION: {ACTION_TEXT: "boom"},
        ACTIVE_LOOP: {"name": "i"},
        SLOTS: {"g": (1.0,)},
    }
    if action_name is not None:
        state[PREVIOUS_ACTION][ACTION_NAME] = action_name

    encoded = featurizer.encode_state(state, precomputations=None)

    # which attributes appear depends on whether action_name is
    # ACTION_LISTEN_NAME, some other name, or absent
    expected = {ACTIVE_LOOP, SLOTS}
    if action_name == ACTION_LISTEN_NAME:
        expected.add(INTENT)
    if action_name is not None:
        expected.add(ACTION_NAME)
    assert set(encoded.keys()) == expected

    # the encoding of action_name of course depends on the sub-state
    if action_name is not None:
        if action_name == "NOT_action_listen":
            one_hot = [0, 0, 1, 0]
        else:
            one_hot = [0, 0, 0, 1]
        assert sparse_equals_dense(
            encoded[ACTION_NAME][0].features, np.array([one_hot])
        )
    # the intent / user substate is only featurized after an action_listen
    if action_name == ACTION_LISTEN_NAME:
        assert sparse_equals_dense(encoded[INTENT][0].features, np.array([[1, 0]]))
    # these encodings never vary
    assert sparse_equals_dense(
        encoded[ACTIVE_LOOP][0].features, np.array([[0, 1, 0, 0]])
    )
    assert sparse_equals_dense(encoded[SLOTS][0].features, np.array([[0, 0, 1]]))
def test_single_state_featurizer_uses_regex_interpreter(
    unpacked_trained_moodbot_path: Text,
):
    """A featurizer prepared with RegexInterpreter cannot featurize raw text."""
    from rasa.core.agent import Agent

    domain = Domain(
        intents=[], entities=[], slots=[], responses={}, forms=[], action_names=[]
    )
    featurizer = SingleStateFeaturizer()
    # simulate that core was trained separately by passing
    # RegexInterpreter to prepare_for_training
    featurizer.prepare_for_training(domain, RegexInterpreter())
    # simulate that nlu and core models were manually combined for prediction
    # by passing trained interpreter to encode_all_actions
    interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter

    features = featurizer._extract_state_features({TEXT: "some text"}, interpreter)

    # RegexInterpreter cannot create features for text, therefore since
    # featurizer was trained without nlu, features for text should be empty
    assert not features
def test_single_state_featurizer_with_entity_roles_and_groups(
    unpacked_trained_moodbot_path: Text,
):
    """Entities with roles are encoded with their role-specific tag ids."""
    from rasa.core.agent import Agent

    interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter
    # TODO roles and groups are not supported in e2e yet
    domain = Domain(
        intents=[],
        entities=["city", f"city{ENTITY_LABEL_SEPARATOR}to"],
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    featurizer = SingleStateFeaturizer()
    featurizer.prepare_for_training(domain, RegexInterpreter())
    entity_data = {
        TEXT: "I am flying from London to Paris",
        ENTITIES: [
            {
                ENTITY_ATTRIBUTE_TYPE: "city",
                ENTITY_ATTRIBUTE_VALUE: "London",
                ENTITY_ATTRIBUTE_START: 17,
                ENTITY_ATTRIBUTE_END: 23,
            },
            {
                ENTITY_ATTRIBUTE_TYPE: f"city{ENTITY_LABEL_SEPARATOR}to",
                ENTITY_ATTRIBUTE_VALUE: "Paris",
                ENTITY_ATTRIBUTE_START: 27,
                ENTITY_ATTRIBUTE_END: 32,
            },
        ],
    }

    encoded = featurizer.encode_entities(entity_data, interpreter=interpreter)

    assert sorted(encoded.keys()) == sorted([ENTITY_TAGS])
    # one tag id per token: "London" -> 1 ("city"), "Paris" -> 2 ("city#to")
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [0], [2]]
    )
def test_single_state_featurizer_with_interpreter_state_with_action_listen(
    unpacked_trained_moodbot_path: Text,
):
    """With a trained interpreter and action_listen, every attribute is encoded."""
    from rasa.core.agent import Agent

    interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ENTITIES] = {"c": 0}
    featurizer._default_feature_states[ACTION_NAME] = {
        "e": 0,
        "d": 1,
        "action_listen": 2,
    }
    featurizer._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2}
    featurizer._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3}
    state = {
        "user": {"text": "a ball", "intent": "b", "entities": ["c"]},
        "prev_action": {
            "action_name": "action_listen",
            "action_text": "throw a ball",
        },
        "active_loop": {"name": "k"},
        "slots": {"e": (1.0,)},
    }

    encoded = featurizer.encode_state(state, interpreter=interpreter)

    # check all the features are encoded and *_text features are encoded by a
    # dense featurizer
    assert sorted(encoded.keys()) == sorted(
        [TEXT, ENTITIES, ACTION_NAME, SLOTS, ACTIVE_LOOP, INTENT, ACTION_TEXT]
    )
    assert encoded[TEXT][0].features.shape[-1] == 300
    assert encoded[ACTION_TEXT][0].features.shape[-1] == 300
    assert (
        encoded[INTENT][0].features != scipy.sparse.coo_matrix([[0, 1]])
    ).nnz == 0
    assert (
        encoded[ACTION_NAME][0].features != scipy.sparse.coo_matrix([[0, 0, 1]])
    ).nnz == 0
    assert encoded[ENTITIES][0].features.shape[-1] == 1
    assert (
        encoded[SLOTS][0].features != scipy.sparse.coo_matrix([[1, 0, 0]])
    ).nnz == 0
    assert (
        encoded[ACTIVE_LOOP][0].features != scipy.sparse.coo_matrix([[0, 0, 0, 1]])
    ).nnz == 0
def test_single_state_featurizer_prepare_for_training():
    """`prepare_for_training` derives the default feature states from the domain."""
    domain = Domain(
        intents=["greet"],
        entities=["name"],
        slots=[Slot("name")],
        templates={},
        forms=[],
        action_names=["utter_greet", "action_check_weather"],
    )
    featurizer = SingleStateFeaturizer()
    featurizer.prepare_for_training(domain, RegexInterpreter())

    states = featurizer._default_feature_states
    assert len(states[INTENT]) > 1
    assert "greet" in states[INTENT]
    assert len(states[ENTITIES]) == 1
    assert states[ENTITIES]["name"] == 0
    assert len(states[SLOTS]) == 1
    assert states[SLOTS]["name_0"] == 0
    assert len(states[ACTION_NAME]) > 2
    assert "utter_greet" in states[ACTION_NAME]
    assert "action_check_weather" in states[ACTION_NAME]
    assert len(states[ACTIVE_LOOP]) == 0
def test_prepare_for_training():
    """`prepare_for_training` derives the default feature states from the domain."""
    domain = Domain(
        intents=["greet"],
        entities=["name"],
        slots=[TextSlot("name", mappings=[{}])],
        responses={},
        forms={},
        action_names=["utter_greet", "action_check_weather"],
        data={},
    )
    featurizer = SingleStateFeaturizer()
    featurizer.prepare_for_training(domain)

    states = featurizer._default_feature_states
    assert len(states[INTENT]) > 1
    assert "greet" in states[INTENT]
    assert len(states[ENTITIES]) == 1
    assert states[ENTITIES]["name"] == 0
    assert len(states[SLOTS]) == 1
    assert states[SLOTS]["name_0"] == 0
    assert len(states[ACTION_NAME]) > 2
    assert "utter_greet" in states[ACTION_NAME]
    assert "action_check_weather" in states[ACTION_NAME]
    assert len(states[ACTIVE_LOOP]) == 0
def test_featurize_trackers_with_max_history_tracker_featurizer(
    moodbot_domain: Domain,
):
    """Featurizing a moodbot dialogue yields matching feature/label counts."""
    tracker_featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer())
    tracker = tracker_from_dialogue_file(
        "data/test_dialogues/moodbot.json", moodbot_domain
    )

    state_features, labels, entity_tags = tracker_featurizer.featurize_trackers(
        [tracker], moodbot_domain, RegexInterpreter()
    )

    assert state_features is not None
    assert len(state_features) == 7
    assert labels is not None
    assert len(labels) == 7
    # moodbot doesn't contain e2e entities
    assert not any(any(turn_tags) for turn_tags in entity_tags)
def test_single_state_featurizer_with_entity_roles_and_groups(
    unpacked_trained_moodbot_path: Text,
):
    """Entities with roles/groups are multi-hot encoded in ENTITIES features."""
    from rasa.core.agent import Agent

    interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ENTITIES] = {
        "c": 0,
        "d": 1,
        f"d{ENTITY_LABEL_SEPARATOR}e": 2,
    }
    featurizer._default_feature_states[ACTION_NAME] = {
        "e": 0,
        "d": 1,
        "action_listen": 2,
    }
    featurizer._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2}
    featurizer._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3}
    state = {
        "user": {
            "text": "a ball",
            "intent": "b",
            "entities": ["c", f"d{ENTITY_LABEL_SEPARATOR}e"],
        },
        "prev_action": {
            "action_name": "action_listen",
            "action_text": "throw a ball",
        },
        "active_loop": {"name": "k"},
        "slots": {"e": (1.0,)},
    }

    encoded = featurizer.encode_state(state, interpreter=interpreter)

    # check all the features are encoded and *_text features are encoded by a
    # dense featurizer
    assert sorted(encoded.keys()) == sorted(
        [TEXT, ENTITIES, ACTION_NAME, SLOTS, ACTIVE_LOOP, INTENT, ACTION_TEXT]
    )
    # "c" and "d#e" are present, plain "d" is not
    assert np.all(encoded[ENTITIES][0].features.toarray() == [1, 0, 1])
def test_to_sparse_sentence_features():
    """A sequence feature collapses into a single sentence-level feature row."""
    seq_feature = Features(
        scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))),
        FEATURE_TYPE_SEQUENCE,
        TEXT,
        "some-featurizer",
    )

    sentence_features = SingleStateFeaturizer._to_sparse_sentence_features(
        [seq_feature]
    )

    assert len(sentence_features) == 1
    converted = sentence_features[0]
    assert converted.type == FEATURE_TYPE_SENTENCE
    # origin and attribute carry over from the sequence feature
    assert converted.origin == seq_feature.origin
    assert converted.attribute == seq_feature.attribute
    assert converted.features.shape == (1, 10)
def test_single_state_featurizer_uses_dtype_float():
    """Encoded state features must use float32."""
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ACTION_NAME] = {"e": 0, "d": 1}
    featurizer._default_feature_states[ENTITIES] = {"c": 0}
    state = {
        "user": {"intent": "a", "entities": ["c"]},
        "prev_action": {"action_name": "d"},
    }

    encoded = featurizer.encode_state(state, interpreter=RegexInterpreter())

    assert encoded[ACTION_NAME][0].features.dtype == np.float32
def test_generate_training_data_with_cycles(domain: Domain):
    """Story graphs with cycles still produce the expected label distribution."""
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=4)
    training_trackers = training.load_data(
        "data/test_yaml_stories/stories_with_cycle.yml",
        domain,
        augmentation_factor=0,
    )
    _, label_ids, _ = featurizer.featurize_trackers(
        training_trackers, domain, precomputations=None
    )

    # how many there are depends on the graph which is not created in a
    # deterministic way but should always be 3 or 4
    assert len(training_trackers) in (3, 4)
    # if we have 4 trackers, there is going to be one example more for label 10
    num_tens = len(training_trackers) - 1
    # if new default actions are added the keys of the actions will be changed
    flattened = [label for tracker_ids in label_ids for label in tracker_ids]
    assert Counter(flattened) == {0: 6, 15: 3, 14: num_tens, 1: 2, 16: 1}
async def test_generate_training_data_with_cycles(
    stories_file: Text, default_domain: Domain
):
    """Story graphs with cycles still produce the expected label distribution."""
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=4)
    training_trackers = await training.load_data(
        stories_file, default_domain, augmentation_factor=0
    )
    training_data, label_ids = featurizer.featurize_trackers(
        training_trackers, default_domain, interpreter=RegexInterpreter()
    )

    # how many there are depends on the graph which is not created in a
    # deterministic way but should always be 3 or 4
    assert len(training_trackers) in (3, 4)
    # if we have 4 trackers, there is going to be one example more for label 10
    num_tens = len(training_trackers) - 1
    # if new default actions are added the keys of the actions will be changed
    flattened = [label for tracker_ids in label_ids for label in tracker_ids]
    assert Counter(flattened) == {0: 6, 12: num_tens, 14: 1, 1: 2, 13: 3}
def _standard_featurizer() -> MaxHistoryTrackerFeaturizer:
    """Builds the default max-history tracker featurizer for these tests."""
    state_featurizer = SingleStateFeaturizer()
    return MaxHistoryTrackerFeaturizer(state_featurizer)
def test_state_features_for_attribute_raises_on_not_supported_attribute():
    """Unsupported attributes must trigger a ValueError."""
    featurizer = SingleStateFeaturizer()

    with pytest.raises(ValueError):
        featurizer._state_features_for_attribute({}, "not-supported-attribute")
def _standard_featurizer(max_history: Optional[int] = None) -> TrackerFeaturizer:
    """Builds a max-history tracker featurizer with an optional history cap."""
    state_featurizer = SingleStateFeaturizer()
    return MaxHistoryTrackerFeaturizer(state_featurizer, max_history=max_history)
async def test_load_multi_file_training_data(
    stories_resources: List, default_domain: Domain
):
    """Single-file and multi-file story data must featurize identically.

    Fixed: the multi-file states/actions were previously extracted with the
    single-file `featurizer` instead of `featurizer_mul`, and the inner turn
    loop clobbered `f1`/`f2` while iterating `range(len(f1))`.
    """
    # the stories file in `data/test_multifile_stories` is the same as in
    # `data/test_stories/stories.md`, but split across multiple files
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers = await training.load_data(
        stories_resources[0], default_domain, augmentation_factor=0
    )
    (tr_as_sts, tr_as_acts) = featurizer.training_states_and_actions(
        trackers, default_domain
    )
    hashed = []
    for sts, acts in zip(tr_as_sts, tr_as_acts):
        hashed.append(json.dumps(sts + acts, sort_keys=True))
    hashed = sorted(hashed, reverse=True)
    data, label_ids = featurizer.featurize_trackers(
        trackers, default_domain, interpreter=RegexInterpreter()
    )

    featurizer_mul = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers_mul = await training.load_data(
        stories_resources[1], default_domain, augmentation_factor=0
    )
    # use featurizer_mul here (was: featurizer, a copy-paste slip)
    (tr_as_sts_mul, tr_as_acts_mul) = featurizer_mul.training_states_and_actions(
        trackers_mul, default_domain
    )
    hashed_mul = []
    for sts_mul, acts_mul in zip(tr_as_sts_mul, tr_as_acts_mul):
        hashed_mul.append(json.dumps(sts_mul + acts_mul, sort_keys=True))
    hashed_mul = sorted(hashed_mul, reverse=True)
    data_mul, label_ids_mul = featurizer_mul.featurize_trackers(
        trackers_mul, default_domain, interpreter=RegexInterpreter()
    )

    assert hashed == hashed_mul

    # we check for intents, action names and entities -- the features which
    # are included in the story files
    data = surface_attributes(data)
    data_mul = surface_attributes(data_mul)
    for attribute in [INTENT, ACTION_NAME, ENTITIES]:
        if attribute not in data or attribute not in data_mul:
            continue
        assert len(data.get(attribute)) == len(data_mul.get(attribute))
        for idx_tracker in range(len(data.get(attribute))):
            for idx_dialogue in range(len(data.get(attribute)[idx_tracker])):
                f1 = data.get(attribute)[idx_tracker][idx_dialogue]
                f2 = data_mul.get(attribute)[idx_tracker][idx_dialogue]
                if f1 is None or f2 is None:
                    assert f1 == f2
                    continue
                for idx_turn in range(len(f1)):
                    # fresh names: the original rebound f1/f2 inside the loop
                    turn_f1 = f1[idx_turn]
                    turn_f2 = f2[idx_turn]
                    assert np.all((turn_f1 == turn_f2).data)

    assert np.all(label_ids == label_ids_mul)