def test_container_add_does_not_fail_if_message_feature_content_differs():
    """Re-adding messages under known keys with a second feature must not fail.

    Every sub-state is first added with one feature, then added again with a
    second feature object; the container size must stay the same.
    """
    placeholder = "this-could-be-anything"
    keyed_sub_states = (
        {INTENT: "greet"},
        {TEXT: "text", ENTITIES: placeholder},
        {ACTION_TEXT: "action_text"},
        {ACTION_NAME: "action_name"},
    )
    first_feature = _dummy_features(id=1, attribute="arbitrary")
    # NOTE(review): this feature is created with exactly the same arguments as
    # `first_feature`, so the two may not actually differ in content - confirm
    # whether a different `id` was intended here.
    second_feature = _dummy_features(id=1, attribute="arbitrary")

    container = MessageContainerForCoreFeaturization()

    def _add_all_sub_states_with(feature):
        # One message per sub-state, each carrying only the given feature.
        for data in keyed_sub_states:
            container.add(Message(data=data, features=[feature]))

    _add_all_sub_states_with(first_feature)
    size_after_first_pass = len(container)

    # Second pass: same keys, the other feature object.
    _add_all_sub_states_with(second_feature)
    assert len(container) == size_after_first_pass
def test_encode_state__with_lookup__creates_features_for_intent_and_action_name(
    with_action_listen: bool,
):
    """Tests that features for intent and action name are created if needed.

    The lookup table contains messages for the intent and the action name, but
    neither message carries any features, so the featurizer has to create them.

    If the previous action is not `action_listen`, the user sub-state is not
    featurized, hence no features for the intent are expected.
    """
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ACTION_NAME] = {
        "c": 0,
        "d": 1,
        ACTION_LISTEN_NAME: 2,
    }

    # Build the state to encode.
    if with_action_listen:
        previous_action_name = ACTION_LISTEN_NAME
    else:
        previous_action_name = "c"
    intent = "e"
    state = {
        USER: {INTENT: intent},
        PREVIOUS_ACTION: {ACTION_NAME: previous_action_name},
    }

    # Lookup table with all relevant entries **but no Features**.
    precomputations = MessageContainerForCoreFeaturization()
    for data in ({INTENT: intent}, {ACTION_NAME: previous_action_name}):
        precomputations.add(Message(data=data))

    encoded = featurizer.encode_state(state, precomputations=precomputations)

    if not with_action_listen:
        assert set(encoded.keys()) == {ACTION_NAME}
        return

    assert set(encoded.keys()) == {INTENT, ACTION_NAME}
    # "e" is not among the known intents, so no entry may differ from zero.
    mismatches = encoded[INTENT][0].features != scipy.sparse.coo_matrix([[0, 0]])
    assert mismatches.nnz == 0
def test_encode_entities__with_entity_roles_and_groups():
    """Tests that `encode_entities` produces one tag id per token.

    A tokenized message with two extracted entities is stored in the lookup
    table, the featurizer is prepared with a domain that lists both entity
    tags, and the resulting `ENTITY_TAGS` features are compared against the
    expected per-token tag ids.
    """
    # create fake message that has been tokenized and entities have been extracted
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # instantiate matching domain and single state featurizer
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain)

    # encode!
    encoded = f.encode_entities(
        entity_data={TEXT: text, ENTITIES: entities},
        precomputations=precomputations,
    )

    # check
    assert len(f.entity_tag_specs) == 1
    tags_to_ids = f.entity_tag_specs[0].tags_to_ids
    # The mapping must be verified, not overwritten: an assignment here would
    # mutate the featurizer's tag spec and could never fail.
    for idx, entity_tag in enumerate(entity_tags, start=1):
        assert tags_to_ids[entity_tag] == idx  # hence, city -> 1, city#to -> 2
    assert sorted(encoded.keys()) == [ENTITY_TAGS]
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [0], [2]]
    )
def test_container_add_message_copies():
    """Adding the very same messages repeatedly only counts ignored collisions.

    Each unique message is added `1 + num_copies` times. The container must
    end up with one entry per unique key, and every repeated addition must be
    reflected in `num_collisions_ignored`.
    """
    placeholder = "this-could-be-anything"
    keyed_sub_states = [
        {INTENT: "greet"},
        {TEXT: "text", ENTITIES: placeholder},
        {TEXT: "other-text"},
        {ACTION_TEXT: "action_text"},
        {ACTION_NAME: "action_name"},
    ]
    unique_messages = []
    for data in keyed_sub_states:
        unique_messages.append(Message(data))

    num_copies = 3
    container = MessageContainerForCoreFeaturization()
    # Original round plus `num_copies` further rounds over the same objects.
    for _ in range(1 + num_copies):
        for message in unique_messages:
            container.add(message)

    num_unique = len(keyed_sub_states)
    assert len(container) == num_unique
    assert set(container.all_messages()) == set(unique_messages)
    assert container.num_collisions_ignored == num_unique * num_copies
def test_container_fingerprint_differ_for_containers_with_different_insertion_order():
    """Containers filled with the same messages in a different order must differ.

    The container is used for training data and the order might affect the
    training of e.g. featurizers, so the fingerprints are expected to differ.
    """

    def _container_with_intents(*intents):
        # Adds one intent-only message per given intent, in the given order.
        container = MessageContainerForCoreFeaturization()
        for intent in intents:
            container.add(Message(data={INTENT: intent}))
        return container

    ascending = _container_with_intents("1", "2")
    descending = _container_with_intents("2", "1")
    assert descending.fingerprint() != ascending.fingerprint()
def test_container_add_fails_if_messages_are_different_but_have_same_key():
    """Adding a message under a known key with other data or features fails.

    After each sub-state has been added with a single feature, adding a message
    with the same key attribute but an extra data entry, an additional feature,
    or no features at all must raise a `ValueError`.
    """
    placeholder = "this-could-be-anything"
    keyed_sub_states = (
        {INTENT: "greet"},
        {TEXT: "text", ENTITIES: placeholder},
        {ACTION_TEXT: "action_text"},
        {ACTION_NAME: "action_name"},
    )
    constant_feature = _dummy_features(id=1, attribute="arbitrary")
    different_feature = _dummy_features(id=1, attribute="arbitrary")

    # adding the unique messages works fine of course,...
    lookup_table = MessageContainerForCoreFeaturization()
    for data in keyed_sub_states:
        lookup_table.add(Message(data=data, features=[constant_feature]))

    # ... but adding any substate with same key but different content doesn't
    new_key = "some-new-key"
    expected_error_message = "Expected added message to be consistent"

    def _assert_add_is_rejected(**message_kwargs):
        # The message is built inside the `raises` context, as part of the add.
        with pytest.raises(ValueError, match=expected_error_message):
            lookup_table.add(Message(**message_kwargs))

    for data in keyed_sub_states:
        # with extra attribute
        _assert_add_is_rejected(
            data={**data, new_key: "some-value-for-the-new-key"}
        )
        # with new feature
        _assert_add_is_rejected(
            data=data, features=[constant_feature, different_feature]
        )
        # without features
        _assert_add_is_rejected(data=data)
def test_container_fingerprints_differ_for_different_containers():
    """Containers holding different intent messages must not share a fingerprint."""
    containers = []
    for intent in ("1", "2"):
        container = MessageContainerForCoreFeaturization()
        container.add(Message(data={INTENT: intent}))
        containers.append(container)

    first, second = containers
    assert second.fingerprint() != first.fingerprint()
def test_encode_entities__with_bilou_entity_roles_and_groups():
    """Tests `encode_entities` with BILOU tagging on two example messages.

    The featurizer is prepared once for a domain with two entity tags. Each
    example is tokenized, stored in its own lookup table and then encoded; the
    resulting `ENTITY_TAGS` features are compared against the expected ids.
    """
    # Note that there are 2 entity tags here.
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain, bilou_tagging=True)

    def _encode_with_bilou(text, entities):
        # Build a tokenized message with extracted entities, put it into a
        # fresh lookup table and encode the entities via that table.
        tokens = []
        for match in re.finditer(r"\S+", text):
            tokens.append(Token(text=match.group(), start=match.start()))
        message = Message(
            {TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities}
        )
        precomputations = MessageContainerForCoreFeaturization()
        precomputations.add(message)
        return f.encode_entities(
            {TEXT: text, ENTITIES: entities},
            precomputations=precomputations,
            bilou_tagging=True,
        )

    # (1) example with both entities
    both_entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    encoded = _encode_with_bilou("I am flying from London to Paris", both_entities)
    assert sorted(encoded.keys()) == [ENTITY_TAGS]
    expected_tag_ids = [[0], [0], [0], [0], [4], [0], [8]]
    assert np.all(encoded[ENTITY_TAGS][0].features == expected_tag_ids)

    # (2) example with only the "city" entity
    city_only = [
        {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: "Saint Petersburg",
            ENTITY_ATTRIBUTE_START: 15,
            ENTITY_ATTRIBUTE_END: 31,
        },
    ]
    encoded = _encode_with_bilou("I am flying to Saint Petersburg", city_only)
    assert sorted(encoded.keys()) == [ENTITY_TAGS]
    expected_tag_ids = [[0], [0], [0], [0], [1], [3]]
    assert np.all(encoded[ENTITY_TAGS][0].features == expected_tag_ids)
def test_encode_state__with_lookup__looksup_or_creates_features(action_name: Text):
    """Tests that features are taken from the lookup table or created from scratch.

    Depending on the given action name, the test expects:
    - `ACTION_LISTEN_NAME`: text, entities and intent of the user sub-state are
      encoded in addition to the action name
    - `"NOT_action_listen"`: the action name is encoded, but text, entities and
      intent are not
    - `None`: the action name is removed from the previous-action sub-state, so
      neither the action name nor text, entities and intent are encoded

    Slots, active loop and action text are expected to be encoded in all cases.
    """
    featurizer = SingleStateFeaturizer()
    city_to = f"city{ENTITY_LABEL_SEPARATOR}to"
    city_from = f"city{ENTITY_LABEL_SEPARATOR}from"
    default_feature_states = {
        INTENT: {"greet": 0, "inform": 1},
        ENTITIES: {"city": 0, "name": 1, city_to: 2, city_from: 3},
        ACTION_NAME: {"NOT_action_listen": 0, "utter_greet": 1, ACTION_LISTEN_NAME: 2},
        # `_0` in slots represent feature dimension
        SLOTS: {"slot_1_0": 0, "slot_2_0": 1, "slot_3_0": 2},
        ACTIVE_LOOP: {
            "active_loop_1": 0,
            "active_loop_2": 1,
            "active_loop_3": 2,
            "active_loop_4": 3,
        },
    }
    for attribute, feature_states in default_feature_states.items():
        featurizer._default_feature_states[attribute] = feature_states

    # Create the state; the action name is only included if one was given.
    text = "I am flying from London to Paris"
    tokens = []
    for match in re.finditer(r"\S+", text):
        tokens.append(Token(text=match.group(), start=match.start()))
    action_text = "throw a ball"
    intent = "inform"
    has_action_name = action_name is not None

    previous_action = {}
    if has_action_name:
        previous_action[ACTION_NAME] = action_name
    previous_action[ACTION_TEXT] = action_text
    state = {
        USER: {TEXT: text, INTENT: intent, ENTITIES: ["city", city_to]},
        PREVIOUS_ACTION: previous_action,
        ACTIVE_LOOP: {"name": "active_loop_4"},
        SLOTS: {"slot_1": (1.0,)},
    }

    # Build lookup table with all relevant information - and dummy features for
    # the text and action text messages.
    # Note that we don't need to add the `ENTITIES` to the message including
    # `TEXT` here because `encode_state` won't featurize the entities using the
    # lookup table (only `encode_entities` does that).
    units = 300
    text_features = [
        dummy_features(
            fill_value=11,
            units=units,
            attribute=TEXT,
            type=SENTENCE,
            is_sparse=True,
        ),
        dummy_features(
            fill_value=12,
            units=units,
            attribute=TEXT,
            type=SEQUENCE,
            is_sparse=False,
        ),
        # Note: sparse sequence feature is last here
        dummy_features(
            fill_value=13,
            units=units,
            attribute=TEXT,
            type=SEQUENCE,
            is_sparse=True,
        ),
    ]
    action_text_features = [
        dummy_features(
            fill_value=1,
            units=units,
            attribute=ACTION_TEXT,
            type=SEQUENCE,
            is_sparse=True,
        )
    ]
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add_all(
        [
            Message(
                data={TEXT: text, TOKENS_NAMES[TEXT]: tokens},
                features=text_features,
            ),
            Message(data={INTENT: intent}),
            Message(data={ACTION_TEXT: action_text}, features=action_text_features),
        ]
    )
    if has_action_name:
        precomputations.add(Message(data={ACTION_NAME: action_name}))

    # encode the state
    encoded = featurizer.encode_state(state, precomputations=precomputations)

    # Check that exactly the expected attributes have been encoded.
    user_is_featurized = action_name == ACTION_LISTEN_NAME
    expected_attributes = {SLOTS, ACTIVE_LOOP, ACTION_TEXT}
    if has_action_name:  # i.e. it was not left out of the state
        expected_attributes.add(ACTION_NAME)
    if user_is_featurized:
        expected_attributes |= {TEXT, ENTITIES, INTENT}
    assert set(encoded.keys()) == expected_attributes

    # Remember, sparse sequence features come first (and `.features` denotes
    # the matrix, not a `Features` object).
    if user_is_featurized:
        first_text_feature = encoded[TEXT][0]
        assert first_text_feature.features.shape[-1] == units
        assert first_text_feature.is_sparse()
        assert encoded[ENTITIES][0].features.shape[-1] == 4
        assert sparse_equals_dense(encoded[INTENT][0].features, np.array([[0, 1]]))

    first_action_text_feature = encoded[ACTION_TEXT][0]
    assert first_action_text_feature.features.shape[-1] == units
    assert first_action_text_feature.is_sparse()

    if not has_action_name:
        assert ACTION_NAME not in encoded
    else:
        # Anything other than "NOT_action_listen" is treated as action_listen.
        action_name_encoding = (
            [1, 0, 0] if action_name == "NOT_action_listen" else [0, 0, 1]
        )
        assert sparse_equals_dense(
            encoded[ACTION_NAME][0].features, np.array([action_name_encoding])
        )

    assert sparse_equals_dense(encoded[SLOTS][0].features, np.array([[1, 0, 0]]))
    assert sparse_equals_dense(
        encoded[ACTIVE_LOOP][0].features, np.array([[0, 0, 0, 1]])
    )