def _set_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Attach dense sequence and sentence features to a single message.

    One word vector is created per text token (sequence features) and one
    vector is created for the complete text (sentence features). A message
    without tokens is left untouched.

    Args:
        message: Message whose text and text tokens are featurized.
        attribute: Attribute name stored on the created `Features`.
    """
    tokens = message.get(TEXT_TOKENS)
    if not tokens:
        # Without tokens there is nothing to featurize.
        return None

    # The text vector is reshaped to 2D (n_utterance, n_dim) so its shape is
    # equivalent to that of sparsely generated features; otherwise it would
    # be a 1D tensor.
    sentence_matrix = self._create_word_vector(document=message.get(TEXT)).reshape(
        1, -1
    )
    sequence_matrix = np.array(
        [self._create_word_vector(document=token.text) for token in tokens]
    )

    message.add_features(
        Features(
            sequence_matrix,
            FEATURE_TYPE_SEQUENCE,
            attribute,
            self._config[FEATURIZER_CLASS_ALIAS],
        )
    )
    message.add_features(
        Features(
            sentence_matrix,
            FEATURE_TYPE_SENTENCE,
            attribute,
            self._config[FEATURIZER_CLASS_ALIAS],
        )
    )
def moodbot_features(
    request: Request, moodbot_domain: Domain
) -> Dict[Text, Dict[Text, Features]]:
    """Build one-hot intent and action features for the moodbot domain.

    These features facilitate building expected state features in tests.

    Returns:
        A dict with the keys "intents" and "actions", each mapping a name to
        its sparse sentence-level `Features`.
    """
    # The origin can be overridden via the request's `param` attribute.
    origin = getattr(request, "param", "SingleStateFeaturizer")

    action_shape = (1, len(moodbot_domain.action_names_or_texts))
    actions = {
        action: Features(
            sparse.coo_matrix(([1.0], [[0], [position]]), shape=action_shape),
            FEATURE_TYPE_SENTENCE,
            ACTION_NAME,
            origin,
        )
        for position, action in enumerate(moodbot_domain.action_names_or_texts)
    }

    intent_shape = (1, len(moodbot_domain.intents))
    intents = {
        intent: Features(
            sparse.coo_matrix(([1.0], [[0], [position]]), shape=intent_shape),
            FEATURE_TYPE_SENTENCE,
            INTENT,
            origin,
        )
        for position, intent in enumerate(moodbot_domain.intents)
    }

    return {"intents": intents, "actions": actions}
def _set_attribute_features(
    self,
    attribute: Text,
    sequence_features: List[scipy.sparse.spmatrix],
    sentence_features: List[scipy.sparse.spmatrix],
    examples: List[Message],
) -> None:
    """Store the computed features of an attribute on the matching messages.

    Args:
        attribute: Attribute the features describe.
        sequence_features: Per-example sequence feature matrices; `None`
            entries are skipped.
        sentence_features: Per-example sentence feature matrices; `None`
            entries are skipped.
        examples: Messages that receive the features, aligned by index.
    """
    for position, example in enumerate(examples):
        if sequence_features[position] is not None:
            example.add_features(
                Features(
                    sequence_features[position],
                    FEATURE_TYPE_SEQUENCE,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )

        if sentence_features[position] is not None:
            example.add_features(
                Features(
                    sentence_features[position],
                    FEATURE_TYPE_SENTENCE,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )
def _set_semantic_map_features(self, message: Message, attribute: Text) -> None:
    """Add semantic map features for the given attribute to the message.

    Nothing is added if the message has no tokens for the attribute.

    Args:
        message: The message to modify.
        attribute: The name of the attribute to featurize.
    """
    token_key = TOKENS_NAMES[attribute]
    if not message.get(token_key, []):
        return

    sequence_matrix, sentence_matrix = self._featurize_tokens(
        message.get(token_key, [])
    )

    if sequence_matrix is not None:
        message.add_features(
            Features(
                sequence_matrix,
                FEATURE_TYPE_SEQUENCE,
                attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )

    if sentence_matrix is not None:
        message.add_features(
            Features(
                sentence_matrix,
                FEATURE_TYPE_SENTENCE,
                attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )
def test_reduce_raises_if_combining_different_origins_or_attributes(differ: Text):
    """`Features.reduce` must raise when origins or attributes do not match."""
    # Build two sentence-level features that differ only in the requested field.
    features_list = []
    for idx in range(2):
        if differ == "attribute":
            attribute = f"attr-{idx}"
        else:
            attribute = "fixed-attribute"
        if differ == "origin":
            origin = f"origin-{idx}"
        else:
            origin = "fixed-origin"
        features_list.append(
            Features(
                features=np.full(shape=(1, 1), fill_value=1),
                attribute=attribute,
                feature_type=FEATURE_TYPE_SENTENCE,
                origin=origin,
            )
        )

    # Pick the error message and expected origins matching the scenario.
    if differ == "attribute":
        message = "Expected all Features to describe the same attribute"
        expected_origin = ["origin"]
    else:
        message = "Expected 'origin-1' to be the origin of the 0-th"
        expected_origin = ["origin-1"]

    with pytest.raises(ValueError, match=message):
        Features.reduce(features_list, expected_origins=expected_origin)
def test_create_zero_features():
    """`create_zero_features` keeps the density and zeroes all values."""
    # Dense input
    dense_input = Features(
        features=np.random.rand(shape),
        attribute=INTENT,
        feature_type=SENTENCE,
        origin=[],
    )
    dense_result = model_data_utils.create_zero_features(
        [[None, None, [dense_input]]]
    )
    assert len(dense_result) == 1
    assert dense_result[0].is_dense()
    assert (dense_result[0].features == np.zeros(shape)).all()

    # Sparse input
    sparse_input = Features(
        features=scipy.sparse.coo_matrix(np.random.rand(shape)),
        attribute=INTENT,
        feature_type=SENTENCE,
        origin=[],
    )
    sparse_result = model_data_utils.create_zero_features(
        [[None, None, [sparse_input]]]
    )
    assert len(sparse_result) == 1
    assert sparse_result[0].is_sparse()
    # An all-empty difference matrix means both sparse matrices are equal.
    empty_matrix = scipy.sparse.coo_matrix((1, shape))
    assert (sparse_result[0].features != empty_matrix).nnz == 0
def set_gensim_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Attach gensim word-vector features for the attribute to the message.

    Sequence features hold one vector per token; the sentence features are
    the sum of all token vectors. A message without tokens for the attribute
    is left untouched.

    Args:
        message: Message to featurize.
        attribute: Attribute whose tokens are featurized.
    """
    tokens = message.get(TOKENS_NAMES[attribute])
    if not tokens:
        return None

    # Tokens missing from the key-vector store are featurized as zeros.
    sequence_matrix = np.array(
        [
            np.zeros(self.kv.vector_size)
            if token.text not in self.kv
            else self.kv[token.text]
            for token in tokens
        ]
    )

    # Sum all word vectors into one vector for the complete utterance.
    summed_vector = reduce(lambda left, right: left + right, sequence_matrix)
    sentence_matrix = summed_vector.reshape(1, -1)

    message.add_features(
        Features(
            sequence_matrix,
            FEATURE_TYPE_SEQUENCE,
            attribute,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
    )
    message.add_features(
        Features(
            sentence_matrix,
            FEATURE_TYPE_SENTENCE,
            attribute,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
    )
def _text_features_with_regex(self, message: Message, attribute: Text) -> None:
    """Extract pattern features and store them on the message.

    Nothing happens when no patterns are known.

    Args:
        message: Message to be featurized.
        attribute: Attribute of the message to be featurized.
    """
    if not self.known_patterns:
        return

    sequence_matrix, sentence_matrix = self._features_for_patterns(
        message, attribute
    )

    if sequence_matrix is not None:
        message.add_features(
            Features(
                sequence_matrix,
                FEATURE_TYPE_SEQUENCE,
                attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )

    if sentence_matrix is not None:
        message.add_features(
            Features(
                sentence_matrix,
                FEATURE_TYPE_SENTENCE,
                attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )
def test_create_fake_features():
    """`_create_fake_features` yields empty features of matching density."""
    # Dense input
    dense_input = Features(
        features=np.random.rand(shape),
        attribute=INTENT,
        feature_type=SENTENCE,
        origin=[],
    )
    dense_result = model_data_utils._create_fake_features(
        [[None, None, [dense_input]]]
    )
    assert len(dense_result) == 1
    assert dense_result[0].is_dense()
    assert dense_result[0].features.shape == (0, shape)

    # Sparse input
    sparse_input = Features(
        features=scipy.sparse.coo_matrix(np.random.rand(shape)),
        attribute=INTENT,
        feature_type=SENTENCE,
        origin=[],
    )
    sparse_result = model_data_utils._create_fake_features(
        [[None, None, [sparse_input]]]
    )
    assert len(sparse_result) == 1
    assert sparse_result[0].is_sparse()
    assert sparse_result[0].features.shape == (0, shape)
    assert sparse_result[0].features.nnz == 0
def test_combine(is_sparse: bool, type: Text, number: int):
    """`Features.combine` concatenates features and rejects mismatches."""
    features_list, modifications = _generate_feature_list_and_modifications(
        is_sparse=is_sparse, type=type, number=number
    )
    modified_features = [Features(**config) for config in modifications]
    first_dim = features_list[0].features.shape[0]

    origins = [f"origin-{idx}" for idx in range(len(features_list))]
    # With a single feature the origin stays the original str, not a list.
    expected_origin = origins[0] if number == 1 else origins

    # works as expected
    combination = Features.combine(features_list, expected_origins=expected_origin)
    assert combination.features.shape[1] == number * (number + 1) // 2
    assert combination.features.shape[0] == first_dim
    assert combination.origin == expected_origin
    assert combination.is_sparse() == is_sparse

    if is_sparse:
        matrix = combination.features.todense()
    else:
        matrix = combination.features
    for idx in range(number):
        # Checks that the `idx + 1` columns starting at `start` all hold
        # the value `idx + 1`.
        start = idx * (idx + 1) // 2
        stop = start + idx + 1
        assert np.all(matrix[:, start:stop] == idx + 1)

    # fails as expected in these cases
    if number > 1:
        for modified_feature in modified_features:
            broken_list = features_list[:-1] + [modified_feature]
            with pytest.raises(ValueError):
                Features.combine(broken_list, expected_origins=expected_origin)
def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Add the precomputed word vectors to the message's features.

    Nothing is added when no doc is available for the attribute.

    Args:
        message: Message that receives the features.
        attribute: Attribute whose doc is looked up.
    """
    doc = self._get_doc(message, attribute)
    if doc is None:
        return

    sequence_matrix = doc[SEQUENCE_FEATURES]
    sentence_matrix = doc[SENTENCE_FEATURES]

    message.add_features(
        Features(
            sequence_matrix,
            FEATURE_TYPE_SEQUENCE,
            attribute,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
    )
    message.add_features(
        Features(
            sentence_matrix,
            FEATURE_TYPE_SENTENCE,
            attribute,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
    )
def _set_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Attach transformer-based sequence and sentence features to a message.

    The transformer `self.tfm` is applied once to the full text (sentence
    features) and once to the list of token texts (sequence features).
    A message without tokens is left untouched.

    Args:
        message: Message whose text and text tokens are featurized.
        attribute: Attribute name stored on the created `Features`.
    """
    tokens = message.get(TEXT_TOKENS)
    if not tokens:
        # Without tokens there is nothing to featurize.
        return None

    # Sentence features come from the whole text, sequence features from
    # the individual tokens.
    sentence_matrix = self.tfm.transform([message.get(TEXT)])
    sequence_matrix = self.tfm.transform([token.text for token in tokens])

    message.add_features(
        Features(
            sequence_matrix,
            FEATURE_TYPE_SEQUENCE,
            attribute,
            self._config[FEATURIZER_CLASS_ALIAS],
        )
    )
    message.add_features(
        Features(
            sentence_matrix,
            FEATURE_TYPE_SENTENCE,
            attribute,
            self._config[FEATURIZER_CLASS_ALIAS],
        )
    )
def _set_spacy_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Add the spaCy word vectors to the message's features.

    Nothing is added when there is no doc for the attribute or when the
    doc's vocabulary carries no vectors.

    Args:
        message: Message that receives the features.
        attribute: Attribute whose doc is featurized.
    """
    doc = self.get_doc(message, attribute)
    if doc is None:
        return

    if doc.vocab.vectors_length == 0:
        # An empty spaCy model was used, so no vectors are present.
        logger.debug("No features present. You are using an empty spaCy model.")
        return

    sequence_matrix = self._features_for_doc(doc)
    sentence_matrix = self._calculate_sentence_features(
        sequence_matrix, self.pooling_operation
    )

    message.add_features(
        Features(
            sequence_matrix,
            FEATURE_TYPE_SEQUENCE,
            attribute,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
    )
    message.add_features(
        Features(
            sentence_matrix,
            FEATURE_TYPE_SENTENCE,
            attribute,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
    )
def test_groupby(
    num_features_per_attribute: Dict[Text, int],
    specified_attributes: Optional[List[Text]],
):
    """`Features.groupby_attribute` groups features by their attribute."""
    features_list = [
        Features(
            features=np.full(shape=(1, idx + 1), fill_value=idx + 1),
            attribute=attribute,
            feature_type=FEATURE_TYPE_SEQUENCE,  # doesn't matter
            origin=f"origin-{idx}",  # doesn't matter
        )
        for attribute, number in num_features_per_attribute.items()
        for idx in range(number)
    ]

    result = Features.groupby_attribute(features_list, attributes=specified_attributes)

    if specified_attributes is not None:
        # Every requested attribute gets an entry, even an empty one.
        assert set(result.keys()) == set(specified_attributes)
        for attribute in specified_attributes:
            assert attribute in result
            expected_count = num_features_per_attribute.get(attribute, 0)
            assert len(result[attribute]) == expected_count
        return

    # Without a specification only attributes with features show up.
    for attribute, number in num_features_per_attribute.items():
        if number > 0:
            assert attribute in result
            assert len(result[attribute]) == number
        else:
            assert attribute not in result
def set_bpemb_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Attach BPEmb sequence and sentence features for the attribute.

    Sequence features hold one word vector per token of the attribute; the
    sentence features are the word vector of the attribute's complete text.
    A message without tokens for the attribute is left untouched.

    Args:
        message: Message to featurize.
        attribute: Attribute whose tokens and text are featurized.
    """
    tokens = message.get(TOKENS_NAMES[attribute])
    if not tokens:
        return None

    # The sentence vector is built from the text of the SAME attribute the
    # tokens come from, so sequence and sentence features describe the same
    # content. For the default `attribute=TEXT` this reads `message.get(TEXT)`.
    # The vector is reshaped to 2D (n_utterance, n_dim) so that its shape is
    # equivalent to that of sparsely generated features.
    text_vector = self.create_word_vector(document=message.get(attribute)).reshape(
        1, -1
    )
    word_vectors = np.array(
        [self.create_word_vector(document=t.text) for t in tokens]
    )

    final_sequence_features = Features(
        word_vectors,
        FEATURE_TYPE_SEQUENCE,
        attribute,
        self.component_config[FEATURIZER_CLASS_ALIAS],
    )
    message.add_features(final_sequence_features)

    final_sentence_features = Features(
        text_vector,
        FEATURE_TYPE_SENTENCE,
        attribute,
        self.component_config[FEATURIZER_CLASS_ALIAS],
    )
    message.add_features(final_sentence_features)
def test_message_fingerprint_includes_data_and_features(
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """The message fingerprint must change whenever the message changes.

    Fingerprints are taken for the fresh message, after the tokenizer has
    processed it, and after each of two feature additions; all four must
    be distinct.
    """
    message = Message(data={TEXT: "This is a test sentence."})
    fp1 = message.fingerprint()

    whitespace_tokenizer.process([message])
    fp2 = message.fingerprint()
    assert fp1 != fp2

    message.add_features(
        Features(scipy.sparse.csr_matrix([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "c2")
    )
    fp3 = message.fingerprint()
    assert fp2 != fp3

    # `np.array` builds a matrix holding exactly these values. The low-level
    # `np.ndarray([1, 2, 2])` constructor would instead allocate an
    # uninitialized array of shape (1, 2, 2) with arbitrary content.
    message.add_features(
        Features(np.array([[1, 2, 2]]), FEATURE_TYPE_SEQUENCE, TEXT, "c1")
    )
    fp4 = message.fingerprint()
    assert fp3 != fp4

    assert len({fp1, fp2, fp3, fp4}) == 4
def test_reduce(
    shuffle_mode: Text, num_features_per_combination: Tuple[int, int, int, int]
):
    """`Features.reduce` merges per (type, density) and orders the result."""
    # all combinations - in the expected order
    # (i.e. all sparse before all dense and sequence before sentence)
    all_combinations = [
        (FEATURE_TYPE_SEQUENCE, True),
        (FEATURE_TYPE_SENTENCE, True),
        (FEATURE_TYPE_SEQUENCE, False),
        (FEATURE_TYPE_SENTENCE, False),
    ]

    # multiply accordingly and mess up the order
    chosen_combinations = []
    for spec, num in zip(all_combinations, num_features_per_combination):
        chosen_combinations.extend([spec] * num)
    if shuffle_mode == "reversed":
        messed_up_order = reversed(chosen_combinations)
    else:
        # Note: rng.permutation would mess up the types
        rng = np.random.default_rng(23452345)
        permutation = rng.permutation(len(chosen_combinations))
        messed_up_order = [chosen_combinations[idx] for idx in permutation]

    # create features accordingly
    features_list = []
    for feature_type, is_sparse in messed_up_order:
        first_dim = 1 if feature_type == FEATURE_TYPE_SEQUENCE else 3
        matrix = np.full(shape=(first_dim, 1), fill_value=1)
        if is_sparse:
            matrix = scipy.sparse.coo_matrix(matrix)
        features_list.append(
            Features(
                features=matrix,
                attribute="fixed-attribute",  # must be the same
                feature_type=feature_type,
                origin="origin-does-matter-here",  # must be the same
            )
        )

    # reduce!
    reduced_list = Features.reduce(features_list)
    assert len(reduced_list) == sum(num > 0 for num in num_features_per_combination)

    position = 0
    for num, (feature_type, is_sparse) in zip(
        num_features_per_combination, all_combinations
    ):
        if num == 0:
            # nothing to check here - the length was already checked above and
            # the types and shapes of all existing features are checked below
            continue
        feature = reduced_list[position]
        assert feature.is_sparse() == is_sparse
        assert feature.type == feature_type
        assert feature.features.shape[-1] == num
        position += 1
def test_combine_with_existing_dense_features_shape_mismatch():
    """Combining dense features with differing row counts must raise."""
    two_row_matrix = np.array([[1, 0, 2, 3], [2, 0, 0, 1]])
    one_row_matrix = np.array([[0, 1]])

    existing_features = Features(two_row_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "test")
    new_features = Features(one_row_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "origin")

    with pytest.raises(ValueError):
        existing_features.combine_with_features(new_features)
def test_combine_with_existing_dense_features():
    """Combining dense features appends the new columns in place."""
    base_matrix = np.array([[1, 0, 2, 3], [2, 0, 0, 1]])
    extra_matrix = np.array([[1, 0], [0, 1]])
    expected_features = np.array([[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]])

    existing_features = Features(base_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "test")
    new_features = Features(extra_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "origin")

    existing_features.combine_with_features(new_features)

    assert np.all(expected_features == existing_features.features)
def test_surface_attributes():
    """`surface_attributes` regroups per-turn features by attribute."""
    intent_features = {
        INTENT: [
            Features(
                features=np.random.rand(shape),
                attribute=INTENT,
                feature_type=SENTENCE,
                origin=[],
            )
        ]
    }
    action_matrix = scipy.sparse.coo_matrix(np.random.rand(shape))
    action_name_features = {
        ACTION_NAME: [
            Features(
                features=action_matrix,
                attribute=ACTION_NAME,
                feature_type=SENTENCE,
                origin=[],
            )
        ]
    }
    state_features = copy.deepcopy(intent_features)
    state_features.update(copy.deepcopy(action_name_features))

    # test on 2 dialogs -- one with dialog length 3 the other one with dialog length 2
    dialogs = [[state_features, intent_features, {}], [{}, action_name_features]]
    surfaced_features = model_data_utils.surface_attributes(dialogs)

    assert INTENT in surfaced_features
    assert ACTION_NAME in surfaced_features
    surfaced_intents = surfaced_features.get(INTENT)
    surfaced_actions = surfaced_features.get(ACTION_NAME)

    # check that number of lists corresponds to number of dialogs
    assert len(surfaced_intents) == 2
    assert len(surfaced_actions) == 2

    # length of each list corresponds to length of the dialog
    assert len(surfaced_intents[0]) == 3
    assert len(surfaced_intents[1]) == 2
    assert len(surfaced_actions[0]) == 3
    assert len(surfaced_actions[1]) == 2

    # check that features are correctly populated with `None`s
    assert surfaced_intents[0][2] is None
    assert surfaced_intents[1][0] is None
    assert surfaced_intents[1][1] is None
    assert surfaced_actions[0][1] is None
    assert surfaced_actions[0][2] is None
    assert surfaced_actions[1][0] is None

    # check that all features are the same as before
    original_intent_matrix = intent_features[INTENT][0].features
    for dialogue in surfaced_intents:
        for turn in dialogue:
            if turn is not None:
                assert (turn[0].features == original_intent_matrix).all()

    original_action_matrix = action_name_features[ACTION_NAME][0].features
    for dialogue in surfaced_actions:
        for turn in dialogue:
            if turn is not None:
                assert (turn[0].features != original_action_matrix).nnz == 0
def test_combine_with_existing_sparse_features_shape_mismatch():
    """Combining sparse features with differing row counts must raise."""
    two_row_matrix = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]])
    one_row_matrix = scipy.sparse.csr_matrix([[0, 1]])

    existing_features = Features(two_row_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "test")
    new_features = Features(one_row_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "origin")

    with pytest.raises(ValueError):
        existing_features.combine_with_features(new_features)
def test_combine_with_existing_sparse_features():
    """Combining sparse features appends the new columns in place."""
    base_matrix = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]])
    extra_matrix = scipy.sparse.csr_matrix([[1, 0], [0, 1]])
    expected_features = [[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]]

    existing_features = Features(base_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "test")
    new_features = Features(extra_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "origin")

    existing_features.combine_with_features(new_features)

    # Densify the sparse result for the element-wise comparison.
    actual_features = existing_features.features.toarray()
    assert np.all(expected_features == actual_features)
def test_extract_features():
    """`_extract_features` builds attribute masks and dense features."""
    # create zero features
    fake_features = np.zeros(shape)
    fake_features_as_features = Features(
        features=fake_features, attribute=INTENT, feature_type=SENTENCE, origin=[]
    )
    fake_features_list = [fake_features_as_features]

    # create tracker state features by setting a random index in the array to 1
    list_of_features = []
    for hot_index in np.random.randint(shape, size=6):
        one_hot = copy.deepcopy(fake_features_as_features)
        one_hot.features[hot_index] = 1
        list_of_features.append([one_hot])

    # organize the created features into lists ~ dialog history
    first, second, third, fourth, fifth, sixth = list_of_features
    tracker_features = [
        [first, None, second],
        [None, None, third],
        [fourth, fifth, sixth],
    ]

    extracted = model_data_utils._extract_features(
        tracker_features, fake_features_list, INTENT
    )
    attribute_masks, dense_features, sparse_features = extracted

    # The mask is 1 wherever a turn had features and 0 where it was `None`.
    expected_mask = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 1]])
    assert np.all(np.squeeze(np.array(attribute_masks), 2) == expected_mask)
    assert np.array(dense_features[SENTENCE]).shape[-1] == fake_features.shape[-1]
    assert sparse_features == {}
def add_features_to_message(
    self,
    sequence: FeatureType,
    sentence: Optional[FeatureType],
    attribute: Text,
    message: Message,
) -> None:
    """Add sequence and sentence features for the attribute to the message.

    Feature matrices that are `None` are skipped.

    Args:
        sequence: sequence feature matrix
        sentence: sentence feature matrix
        attribute: the attribute which both features describe
        message: the message to which we want to add those features
    """
    typed_matrices = (
        (FEATURE_TYPE_SEQUENCE, sequence),
        (FEATURE_TYPE_SENTENCE, sentence),
    )
    for feature_type, matrix in typed_matrices:
        if matrix is None:
            continue
        message.add_features(
            Features(matrix, feature_type, attribute, self._identifier)
        )
def collect_features(
    self, sub_state: SubState, attributes: Optional[Iterable[Text]] = None
) -> Dict[Text, List[Features]]:
    """Collects features for all attributes in the given substate.

    There might be multiple messages in the container that contain features
    relevant for the given substate, e.g. this is the case if `TEXT` and
    `INTENT` are present in the given substate. All of those messages will be
    collected and their features combined.

    Args:
        sub_state: substate for which we want to extract the relevant features
        attributes: if not `None`, this specifies the list of the attributes of
            the `Features` that we're interested in (i.e. all other `Features`
            contained in the relevant messages will be ignored)

    Returns:
        a dictionary that maps all the (requested) attributes to a list of
        `Features`

    Raises:
        `ValueError`: if some key pair (i.e. key attribute and corresponding
            value) from the given substate cannot be found in the lookup table
        `RuntimeError`: if features for the same attribute are found in two
            different messages that are associated with the given substate
    """
    # If we specify a list of attributes, then we want a dict with one entry
    # for each attribute back - even if the corresponding list of features is empty.
    features: Dict[Text, List[Features]] = (dict() if attributes is None else {
        attribute: []
        for attribute in attributes
    })
    # collect all relevant key attributes
    key_attributes = set(sub_state.keys()).intersection(
        self.KEY_ATTRIBUTES)
    for key_attribute in key_attributes:
        # Lookup keys are the string form of the substate's value.
        key_value = str(sub_state[key_attribute])
        message = self._table[key_attribute].get(key_value)
        if not message:
            raise ValueError(
                f"Unknown key ({key_attribute},{key_value}). Cannot retrieve "
                f"features for substate {sub_state}")
        features_from_message = Features.groupby_attribute(
            message.features, attributes=attributes)
        for feat_attribute, feat_value in features_from_message.items():
            existing_values = features.get(feat_attribute)
            # Note: the following if-s are needed because if we specify a list of
            # attributes then `features_from_message` will contain one entry per
            # attribute even if the corresponding feature list is empty.
            if feat_value and existing_values:
                raise RuntimeError(
                    f"Feature for attribute {feat_attribute} has already been "
                    f"extracted from a different message stored under a key "
                    f"in {key_attributes} "
                    f"that is different from {key_attribute}. This means there's a "
                    f"redundancy in the message container.")
            if feat_value:
                features[feat_attribute] = feat_value
    return features
def test_process_does_not_do_anything(
    regex_message_handler: RegexMessageHandler, text: Text
):
    """The handler must return the message unchanged for the given text."""
    sentence_features = Features(
        features=np.zeros((1, 1)),
        feature_type=FEATURE_TYPE_SENTENCE,
        attribute=TEXT,
        origin="nlu-pipeline",
    )
    message = Message(
        data={TEXT: text, INTENT: "bla"},
        features=[sentence_features],
    )

    # construct domain from expected intent/entities
    domain = Domain(
        intents=["intent"],
        entities=["entity"],
        slots=[],
        responses={},
        action_names=[],
        forms={},
        data={},
    )

    parsed_messages = regex_message_handler.process([message], domain)

    assert parsed_messages[0] == message
def get_tag_ids(
    example: Message, tag_spec: "EntityTagSpec", bilou_tagging: bool
) -> "Features":
    """Create features holding the entity tag ids of the given example.

    Args:
        example: the message
        tag_spec: entity tag spec
        bilou_tagging: indicates whether BILOU tagging should be used or not

    Returns:
        A `Features` object wrapping the tag ids as a column
        (seq_len x 1).
    """
    from rasa.nlu.test import determine_token_labels
    from rasa.nlu.utils.bilou_utils import bilou_tags_to_ids
    from rasa.shared.nlu.training_data.features import Features

    if bilou_tagging:
        tag_ids = bilou_tags_to_ids(example, tag_spec.tags_to_ids, tag_spec.tag_name)
    else:
        # Look up one tag id per text token.
        tag_ids = [
            tag_spec.tags_to_ids[
                determine_token_labels(
                    token, example.get(ENTITIES), attribute_key=tag_spec.tag_name
                )
            ]
            for token in example.get(TOKENS_NAMES[TEXT])
        ]

    # transpose to have seq_len x 1
    tag_column = np.array([tag_ids]).T
    return Features(tag_column, IDS, tag_spec.tag_name, TAG_ID_ORIGIN)
def _dummy_features(id: int, attribute: Text) -> Features:
    """Create a minimal `Features` object whose single value is `id`."""
    single_value = np.full(shape=(1), fill_value=id)
    return Features(
        single_value,
        attribute=attribute,
        feature_type="really-anything",
        origin="",
    )
def test_for_features_fingerprinting_collisions():
    """Tests that features fingerprints are unique."""
    m1 = np.asarray([[0.5, 3.1, 3.0], [1.1, 1.2, 1.3], [4.7, 0.3, 2.7]])
    m2 = np.asarray([[0, 0, 0], [1, 2, 3], [0, 0, 1]])

    # Each spec differs from the others in the matrix, the feature type,
    # the attribute or the origin.
    specs = [
        (m1, FEATURE_TYPE_SENTENCE, TEXT, "CountVectorsFeaturizer"),
        (m2, FEATURE_TYPE_SENTENCE, TEXT, "CountVectorsFeaturizer"),
        (m1, FEATURE_TYPE_SEQUENCE, TEXT, "CountVectorsFeaturizer"),
        (m1, FEATURE_TYPE_SEQUENCE, TEXT, "RegexFeaturizer"),
        (m1, FEATURE_TYPE_SENTENCE, INTENT, "CountVectorsFeaturizer"),
    ]

    dense_features = [
        Features(matrix, feature_type, attribute, origin)
        for matrix, feature_type, attribute, origin in specs
    ]
    dense_fingerprints = {f.fingerprint() for f in dense_features}
    assert len(dense_fingerprints) == len(dense_features)

    sparse_features = [
        Features(scipy.sparse.coo_matrix(matrix), feature_type, attribute, origin)
        for matrix, feature_type, attribute, origin in specs
    ]
    sparse_fingerprints = {f.fingerprint() for f in sparse_features}
    assert len(sparse_fingerprints) == len(sparse_features)
def _extract_state_features(
    self,
    sub_state: SubState,
    precomputations: Optional[MessageContainerForCoreFeaturization],
    sparse: bool = False,
) -> Dict[Text, List[Features]]:
    """Collect the features for every attribute of the given substate.

    Args:
        sub_state: substate whose attributes are featurized
        precomputations: container with precomputed features, or `None`
        sparse: passed on to `_create_features` when the name attribute has
            no precomputed features

    Returns:
        a dictionary mapping attributes to their lists of `Features`
    """
    # Remove entities from possible attributes
    attributes = {name for name in sub_state.keys() if name != ENTITIES}

    output = {}
    if precomputations is not None:
        # Collect features for all those attributes
        attributes_to_features = precomputations.collect_features(
            sub_state, attributes=attributes
        )

        # if features for INTENT or ACTION_NAME exist,
        # they are always sparse sequence features;
        # transform them to sentence sparse features
        for name in (INTENT, ACTION_NAME):
            if attributes_to_features.get(name):
                attributes_to_features[name] = self._to_sparse_sentence_features(
                    attributes_to_features[name]
                )

        # Combine and sort the features:
        # Per attribute, combine features of same type and level into one Feature,
        # and (if there are any such features) store the results in a list where
        # - all the sparse features are listed first and a
        # - sequence feature is always listed before the sentence feature of the
        #   same type (sparse/not sparse).
        for attribute, features_list in attributes_to_features.items():
            if len(features_list) > 0:  # otherwise, following will fail
                output[attribute] = Features.reduce(
                    features_list=features_list, expected_origins=None
                )

    # Check that the name attribute has features
    name_attribute = self._get_name_attribute(attributes)
    if name_attribute and name_attribute not in output:
        # nlu pipeline didn't create features for user or action
        # this might happen, for example, when we have action_name in the state
        # but it did not get featurized because only character level
        # CountVectorsFeaturizer was included in the config.
        output[name_attribute] = self._create_features(
            sub_state, name_attribute, sparse
        )
    return output