Exemple #1
0
    def _set_features(self, message: Message, attribute: Text = TEXT) -> None:
        """Adds dense sequence and sentence features to a single message.

        The sequence features hold one word vector per token, the sentence
        features hold the word vector of the complete text. A message without
        tokens is left unchanged.

        Args:
            message: The message that receives the features.
            attribute: The attribute the created `Features` are registered for.
        """
        # NOTE(review): tokens and text are read via TEXT_TOKENS / TEXT no matter
        # which `attribute` is passed - confirm that this is intended.
        tokens = message.get(TEXT_TOKENS)
        if not tokens:
            # Without tokens there is nothing to featurize.
            return

        # The sentence vector is reshaped to 2D (n_utterance, n_dim) so that its
        # shape is equivalent to that of sparsely generated features; without the
        # reshape it would be a 1D tensor.
        sentence_vector = self._create_word_vector(document=message.get(TEXT))
        sentence_vector = sentence_vector.reshape(1, -1)
        token_vectors = np.array(
            [self._create_word_vector(document=token.text) for token in tokens]
        )

        # Sequence features are registered before sentence features.
        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, token_vectors),
            (FEATURE_TYPE_SENTENCE, sentence_vector),
        ):
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self._config[FEATURIZER_CLASS_ALIAS],
                )
            )
Exemple #2
0
def moodbot_features(
        request: Request,
        moodbot_domain: Domain) -> Dict[Text, Dict[Text, Features]]:
    """Creates one-hot intent and action features for the moodbot domain.

    These features facilitate building expected state features.

    Returns:
      A dict containing dicts for mapping action and intent names to features.
    """
    # Falls back to the default origin when `request` has no `param` attribute.
    origin = getattr(request, "param", "SingleStateFeaturizer")

    def _one_hot_features(names, attribute: Text) -> Dict[Text, Features]:
        # Every name is mapped to a sparse (1, len(names)) row that holds a
        # single 1.0 at the position of the name.
        row_shape = (1, len(names))
        return {
            name: Features(
                sparse.coo_matrix(([1.0], [[0], [position]]), shape=row_shape),
                FEATURE_TYPE_SENTENCE,
                attribute,
                origin,
            )
            for position, name in enumerate(names)
        }

    actions = _one_hot_features(moodbot_domain.action_names_or_texts,
                                ACTION_NAME)
    intents = _one_hot_features(moodbot_domain.intents, INTENT)
    return {"intents": intents, "actions": actions}
Exemple #3
0
    def _set_attribute_features(
        self,
        attribute: Text,
        sequence_features: List[scipy.sparse.spmatrix],
        sentence_features: List[scipy.sparse.spmatrix],
        examples: List[Message],
    ) -> None:
        """Attaches the computed features of an attribute to their messages.

        The i-th entry of each feature list belongs to the i-th example.
        Entries that are `None` are skipped.

        Args:
            attribute: The attribute the features describe.
            sequence_features: Sequence features, indexed like `examples`.
            sentence_features: Sentence features, indexed like `examples`.
            examples: The messages that receive the features.
        """
        typed_sources = (
            (FEATURE_TYPE_SEQUENCE, sequence_features),
            (FEATURE_TYPE_SENTENCE, sentence_features),
        )
        for position, example in enumerate(examples):
            # Sequence features are added before sentence features.
            for feature_type, per_example in typed_sources:
                matrix = per_example[position]
                if matrix is None:
                    continue
                example.add_features(
                    Features(
                        matrix,
                        feature_type,
                        attribute,
                        self.component_config[FEATURIZER_CLASS_ALIAS],
                    )
                )
Exemple #4
0
    def _set_semantic_map_features(self, message: Message, attribute: Text) -> None:
        """Adds semantic map features for the given attribute to the message.

        Nothing happens if the message has no tokens for the attribute.

        Args:
            message: The message to modify.
            attribute: The name of the attribute that should be featurized.
        """
        token_key = TOKENS_NAMES[attribute]
        if not message.get(token_key, []):
            return

        per_token, per_sentence = self._featurize_tokens(
            message.get(token_key, [])
        )

        # Either result may be `None`; only existing features are added,
        # sequence features first.
        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, per_token),
            (FEATURE_TYPE_SENTENCE, per_sentence),
        ):
            if matrix is None:
                continue
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )
Exemple #5
0
def test_reduce_raises_if_combining_different_origins_or_attributes(
        differ: Text):
    """Checks that `Features.reduce` rejects mixed attributes or origins.

    Args:
        differ: "attribute" or "origin" - the property in which the two
            created features differ; the other property is kept fixed.
    """
    # Two sentence features that are identical except for the differing property.
    features_list = [
        Features(
            features=np.full(shape=(1, 1), fill_value=1),
            attribute=(f"attr-{idx}"
                       if differ == "attribute" else "fixed-attribute"),
            feature_type=FEATURE_TYPE_SENTENCE,
            origin=f"origin-{idx}" if differ == "origin" else "fixed-origin",
        )
        for idx in range(2)
    ]

    # Pick the error message and the expected origins that belong to the case.
    if differ == "attribute":
        message, expected_origin = (
            "Expected all Features to describe the same attribute",
            ["origin"],
        )
    else:
        message, expected_origin = (
            "Expected 'origin-1' to be the origin of the 0-th",
            ["origin-1"],
        )

    with pytest.raises(ValueError, match=message):
        Features.reduce(features_list, expected_origins=expected_origin)
Exemple #6
0
def test_create_zero_features():
    """Checks `create_zero_features` for dense and for sparse input features."""

    def _nested_intent_features(matrix):
        # Wraps the matrix into sentence-level intent `Features` and nests it
        # into the list structure handed to `create_zero_features`.
        wrapped = Features(
            features=matrix,
            attribute=INTENT,
            feature_type=SENTENCE,
            origin=[],
        )
        return [[None, None, [wrapped]]]

    # dense input -> one dense, all-zero result
    dense_input = _nested_intent_features(np.random.rand(shape))
    dense_zeros = model_data_utils.create_zero_features(dense_input)
    assert len(dense_zeros) == 1
    assert dense_zeros[0].is_dense()
    assert (dense_zeros[0].features == np.zeros(shape)).all()

    # sparse input -> one sparse result without non-zero entries
    sparse_input = _nested_intent_features(
        scipy.sparse.coo_matrix(np.random.rand(shape)))
    sparse_zeros = model_data_utils.create_zero_features(sparse_input)
    assert len(sparse_zeros) == 1
    assert sparse_zeros[0].is_sparse()
    empty_matrix = scipy.sparse.coo_matrix((1, shape))
    assert (sparse_zeros[0].features != empty_matrix).nnz == 0
    def set_gensim_features(self,
                            message: Message,
                            attribute: Text = TEXT) -> None:
        """Adds word vector features looked up in `self.kv` to the message.

        The sequence features hold one vector per token, the sentence features
        hold the sum of all token vectors. A message without tokens for the
        attribute is left unchanged.

        Args:
            message: The message that receives the features.
            attribute: The attribute whose tokens are featurized.
        """
        tokens = message.get(TOKENS_NAMES[attribute])
        if not tokens:
            return

        def _vector_for(word):
            # Words that are not available in `self.kv` become an array of zeros.
            if word in self.kv:
                return self.kv[word]
            return np.zeros(self.kv.vector_size)

        word_vectors = np.array([_vector_for(token.text) for token in tokens])

        # Adding up all word vectors yields one vector for the complete
        # utterance; it is reshaped into a single row.
        text_vector = reduce(lambda total, vector: total + vector,
                             word_vectors).reshape(1, -1)

        # Sequence features are registered before sentence features.
        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, word_vectors),
            (FEATURE_TYPE_SENTENCE, text_vector),
        ):
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )
Exemple #8
0
    def _text_features_with_regex(self, message: Message,
                                  attribute: Text) -> None:
        """Extracts pattern features and stores them in the message.

        Nothing happens if `self.known_patterns` is empty.

        Args:
            message: Message to be featurized.
            attribute: Attribute of message to be featurized.
        """
        if not self.known_patterns:
            return

        per_token, per_sentence = self._features_for_patterns(
            message, attribute)

        # Either result may be `None`; only existing features are added,
        # sequence features first.
        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, per_token),
            (FEATURE_TYPE_SENTENCE, per_sentence),
        ):
            if matrix is None:
                continue
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )
def test_create_fake_features():
    """Checks `_create_fake_features` for dense and for sparse input features."""

    def _nested_intent_features(matrix):
        # Wraps the matrix into sentence-level intent `Features` and nests it
        # into the list structure handed to `_create_fake_features`.
        wrapped = Features(
            features=matrix,
            attribute=INTENT,
            feature_type=SENTENCE,
            origin=[],
        )
        return [[None, None, [wrapped]]]

    # dense input -> one dense result of shape (0, shape)
    dense_input = _nested_intent_features(np.random.rand(shape))
    dense_fakes = model_data_utils._create_fake_features(dense_input)
    assert len(dense_fakes) == 1
    assert dense_fakes[0].is_dense()
    assert dense_fakes[0].features.shape == (0, shape)

    # sparse input -> one sparse result of shape (0, shape) without entries
    sparse_input = _nested_intent_features(
        scipy.sparse.coo_matrix(np.random.rand(shape)))
    sparse_fakes = model_data_utils._create_fake_features(sparse_input)
    assert len(sparse_fakes) == 1
    assert sparse_fakes[0].is_sparse()
    assert sparse_fakes[0].features.shape == (0, shape)
    assert sparse_fakes[0].features.nnz == 0
Exemple #10
0
def test_combine(is_sparse: bool, type: Text, number: int):
    """Checks `Features.combine` on generated features and on invalid input.

    Args:
        is_sparse: Whether the generated features are sparse.
        type: The feature type of the generated features.
        number: The number of features to generate and combine.
    """
    features_list, modifications = _generate_feature_list_and_modifications(
        is_sparse=is_sparse, type=type, number=number
    )
    modified_features = [Features(**config) for config in modifications]
    num_rows = features_list[0].features.shape[0]

    origins = [f"origin-{position}" for position in range(len(features_list))]
    # With a single feature the origin stays the same str as before instead
    # of becoming a list.
    expected_origin = origins[0] if number == 1 else origins

    # works as expected
    combination = Features.combine(features_list, expected_origins=expected_origin)
    assert combination.features.shape[1] == number * (number + 1) // 2
    assert combination.features.shape[0] == num_rows
    assert combination.origin == expected_origin
    assert combination.is_sparse() == is_sparse

    dense_view = (
        combination.features.todense() if is_sparse else combination.features
    )
    # The idx-th block of columns is `idx + 1` columns wide and is expected to
    # be filled with the value `idx + 1`.
    for idx in range(number):
        start = idx * (idx + 1) // 2
        stop = start + idx + 1
        assert np.all(dense_view[:, start:stop] == idx + 1)

    # fails as expected in these cases
    if number <= 1:
        return
    for modified_feature in modified_features:
        # Replace only the last feature by an incompatible one.
        broken_list = features_list.copy()
        broken_list[-1] = modified_feature
        with pytest.raises(ValueError):
            Features.combine(broken_list, expected_origins=expected_origin)
Exemple #11
0
    def _set_lm_features(self,
                         message: Message,
                         attribute: Text = TEXT) -> None:
        """Adds the precomputed word vectors to the features of the message.

        Nothing happens if `self._get_doc` returns `None` for the attribute.

        Args:
            message: The message that receives the features.
            attribute: The attribute whose precomputed features are added.
        """
        doc = self._get_doc(message, attribute)
        if doc is None:
            return

        per_token = doc[SEQUENCE_FEATURES]
        per_sentence = doc[SENTENCE_FEATURES]

        # Sequence features are registered before sentence features.
        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, per_token),
            (FEATURE_TYPE_SENTENCE, per_sentence),
        ):
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )
Exemple #12
0
    def _set_features(self, message: Message, attribute: Text = TEXT) -> None:
        """Adds sequence and sentence features created by `self.tfm` to a message.

        The sentence features come from transforming the complete text, the
        sequence features from transforming the text of every token. A message
        without tokens is left unchanged.

        Args:
            message: The message that receives the features.
            attribute: The attribute the created `Features` are registered for.
        """
        # NOTE(review): tokens and text are read via TEXT_TOKENS / TEXT no matter
        # which `attribute` is passed - confirm that this is intended.
        tokens = message.get(TEXT_TOKENS)
        if not tokens:
            # Without tokens there is nothing to featurize.
            return

        # One transform call for the whole text, one for all token texts.
        sentence_matrix = self.tfm.transform([message.get(TEXT)])
        token_matrix = self.tfm.transform([token.text for token in tokens])

        # Sequence features are registered before sentence features.
        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, token_matrix),
            (FEATURE_TYPE_SENTENCE, sentence_matrix),
        ):
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self._config[FEATURIZER_CLASS_ALIAS],
                )
            )
Exemple #13
0
    def _set_spacy_features(self,
                            message: Message,
                            attribute: Text = TEXT) -> None:
        """Adds the spaCy word vectors to the features of the message.

        Nothing is added if there is no doc for the attribute or if the vocab
        of the doc reports a vector length of zero.

        Args:
            message: The message that receives the features.
            attribute: The attribute whose doc is featurized.
        """
        doc = self.get_doc(message, attribute)
        if doc is None:
            return

        if doc.vocab.vectors_length == 0:
            # An empty spaCy model provides no vectors at all.
            logger.debug(
                "No features present. You are using an empty spaCy model.")
            return

        per_token = self._features_for_doc(doc)
        # The sentence features are derived from the sequence features using
        # the configured pooling operation.
        per_sentence = self._calculate_sentence_features(
            per_token, self.pooling_operation)

        # Sequence features are registered before sentence features.
        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, per_token),
            (FEATURE_TYPE_SENTENCE, per_sentence),
        ):
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )
Exemple #14
0
def test_groupby(
    num_features_per_attribute: Dict[Text, int],
    specified_attributes: Optional[List[Text]],
):
    """Checks that `Features.groupby_attribute` groups features per attribute.

    Args:
        num_features_per_attribute: Maps each attribute to the number of
            features that are created for it.
        specified_attributes: The attributes requested from the grouping, or
            `None` to request no specific attributes.
    """
    # Feature type and origin don't matter for the grouping.
    features_list = [
        Features(
            features=np.full(shape=(1, idx + 1), fill_value=idx + 1),
            attribute=attribute,
            feature_type=FEATURE_TYPE_SEQUENCE,
            origin=f"origin-{idx}",
        )
        for attribute, number in num_features_per_attribute.items()
        for idx in range(number)
    ]

    result = Features.groupby_attribute(features_list, attributes=specified_attributes)

    if specified_attributes is not None:
        # Exactly the requested attributes show up - even those without features.
        assert set(result.keys()) == set(specified_attributes)
        for attribute in specified_attributes:
            assert attribute in result
            expected_count = num_features_per_attribute.get(attribute, 0)
            assert len(result[attribute]) == expected_count
        return

    # Without requested attributes only attributes with features show up.
    for attribute, number in num_features_per_attribute.items():
        if not number > 0:
            assert attribute not in result
            continue
        assert attribute in result
        assert len(result[attribute]) == number
Exemple #15
0
    def set_bpemb_features(self,
                           message: Message,
                           attribute: Text = TEXT) -> None:
        """Adds dense sequence and sentence features for the attribute.

        The sequence features hold one word vector per token of the attribute,
        the sentence features hold the word vector of the attribute's complete
        text. A message without tokens for the attribute is left unchanged.

        Args:
            message: The message that receives the features.
            attribute: The attribute that is featurized.
        """
        tokens = message.get(TOKENS_NAMES[attribute])

        if not tokens:
            return None

        # Featurize the text of `attribute` itself (instead of always using
        # TEXT), so that sentence and sequence features describe the same
        # content. For the default attribute this is unchanged.
        # We need to reshape here such that the shape is equivalent to that of sparsely
        # generated features. Without it, it'd be a 1D tensor. We need 2D (n_utterance, n_dim).
        text_vector = self.create_word_vector(
            document=message.get(attribute)).reshape(1, -1)
        word_vectors = np.array(
            [self.create_word_vector(document=t.text) for t in tokens])

        final_sequence_features = Features(
            word_vectors,
            FEATURE_TYPE_SEQUENCE,
            attribute,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
        message.add_features(final_sequence_features)
        final_sentence_features = Features(
            text_vector,
            FEATURE_TYPE_SENTENCE,
            attribute,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
        message.add_features(final_sentence_features)
Exemple #16
0
def test_message_fingerprint_includes_data_and_features(
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Checks that the message fingerprint changes with its data and features.

    The fingerprint is taken for the fresh message, after tokenization, after
    adding sparse features and after adding dense features; all four
    fingerprints must differ.
    """
    message = Message(data={TEXT: "This is a test sentence."})
    fp1 = message.fingerprint()
    whitespace_tokenizer.process([message])
    fp2 = message.fingerprint()

    assert fp1 != fp2

    message.add_features(
        Features(scipy.sparse.csr_matrix([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "c2")
    )

    fp3 = message.fingerprint()
    assert fp2 != fp3

    # `np.array` builds a matrix with well-defined content. The low-level
    # `np.ndarray([1, 2, 2])` constructor would instead allocate an
    # uninitialized array of shape (1, 2, 2), i.e. nondeterministic test data.
    message.add_features(
        Features(np.array([[1, 2, 2]]), FEATURE_TYPE_SEQUENCE, TEXT, "c1")
    )

    fp4 = message.fingerprint()

    assert fp3 != fp4

    assert len({fp1, fp2, fp3, fp4}) == 4
Exemple #17
0
def test_reduce(shuffle_mode: Text,
                num_features_per_combination: Tuple[int, int, int, int]):
    """Checks that `Features.reduce` merges and orders features as expected.

    Args:
        shuffle_mode: "reversed" reverses the created features; any other
            value permutes them with a fixed seed.
        num_features_per_combination: The number of features to create for
            each (feature type, sparseness) combination.
    """
    # All combinations in the expected order, i.e. all sparse before all dense
    # and sequence before sentence.
    all_combinations = [
        (FEATURE_TYPE_SEQUENCE, True),
        (FEATURE_TYPE_SENTENCE, True),
        (FEATURE_TYPE_SEQUENCE, False),
        (FEATURE_TYPE_SENTENCE, False),
    ]

    # Repeat every combination as often as requested ...
    chosen_combinations = []
    for spec, count in zip(all_combinations, num_features_per_combination):
        chosen_combinations.extend([spec] * count)

    # ... and mess up the order.
    if shuffle_mode == "reversed":
        messed_up_order = chosen_combinations[::-1]
    else:
        # Only indices are permuted because `rng.permutation` applied to the
        # combinations themselves would mess up the types.
        rng = np.random.default_rng(23452345)
        messed_up_order = [
            chosen_combinations[position]
            for position in rng.permutation(len(chosen_combinations))
        ]

    # Create one single-column feature per chosen combination. Attribute and
    # origin must be the same for all of them.
    features_list = []
    for feature_type, is_sparse in messed_up_order:
        row_count = 1 if feature_type == FEATURE_TYPE_SEQUENCE else 3
        matrix = np.full(shape=(row_count, 1), fill_value=1)
        if is_sparse:
            matrix = scipy.sparse.coo_matrix(matrix)
        features_list.append(
            Features(
                features=matrix,
                attribute="fixed-attribute",
                feature_type=feature_type,
                origin="origin-does-matter-here",
            )
        )

    # reduce!
    reduced_list = Features.reduce(features_list)

    # Only combinations with at least one feature leave a trace in the result.
    expected_groups = [
        (count, spec)
        for count, spec in zip(num_features_per_combination, all_combinations)
        if count > 0
    ]
    assert len(reduced_list) == len(expected_groups)
    for feature, (count, (feature_type, is_sparse)) in zip(reduced_list,
                                                           expected_groups):
        assert feature.is_sparse() == is_sparse
        assert feature.type == feature_type
        assert feature.features.shape[-1] == count
Exemple #18
0
def test_combine_with_existing_dense_features_shape_mismatch():
    """Checks that combining dense features of mismatching shapes raises."""
    two_row_matrix = np.array([[1, 0, 2, 3], [2, 0, 0, 1]])
    one_row_matrix = np.array([[0, 1]])

    existing_features = Features(
        two_row_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "test"
    )
    new_features = Features(one_row_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "origin")

    with pytest.raises(ValueError):
        existing_features.combine_with_features(new_features)
Exemple #19
0
def test_combine_with_existing_dense_features():
    """Checks that dense features are combined in place, column-wise."""
    left_matrix = np.array([[1, 0, 2, 3], [2, 0, 0, 1]])
    right_matrix = np.array([[1, 0], [0, 1]])
    # The columns of the new features are expected after the existing ones.
    expected_features = np.array([[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]])

    existing_features = Features(left_matrix, FEATURE_TYPE_SEQUENCE, TEXT,
                                 "test")
    new_features = Features(right_matrix, FEATURE_TYPE_SEQUENCE, TEXT,
                            "origin")

    existing_features.combine_with_features(new_features)

    assert np.all(expected_features == existing_features.features)
Exemple #20
0
def test_surface_attributes():
    """Checks that `surface_attributes` regroups dialog features per attribute."""
    dense_intent = Features(
        features=np.random.rand(shape),
        attribute=INTENT,
        feature_type=SENTENCE,
        origin=[],
    )
    intent_features = {INTENT: [dense_intent]}

    sparse_action_name = Features(
        features=scipy.sparse.coo_matrix(np.random.rand(shape)),
        attribute=ACTION_NAME,
        feature_type=SENTENCE,
        origin=[],
    )
    action_name_features = {ACTION_NAME: [sparse_action_name]}

    # a state that carries both the intent and the action name features
    state_features = copy.deepcopy(intent_features)
    state_features.update(copy.deepcopy(action_name_features))

    # test on 2 dialogs -- one with 3 turns and one with 2 turns
    dialogs = [
        [state_features, intent_features, {}],
        [{}, action_name_features],
    ]
    surfaced_features = model_data_utils.surface_attributes(dialogs)
    assert INTENT in surfaced_features and ACTION_NAME in surfaced_features

    surfaced_intents = surfaced_features.get(INTENT)
    surfaced_actions = surfaced_features.get(ACTION_NAME)

    # the number of lists corresponds to the number of dialogs
    assert len(surfaced_intents) == 2
    assert len(surfaced_actions) == 2

    # the length of each list corresponds to the length of the dialog
    assert len(surfaced_intents[0]) == 3
    assert len(surfaced_intents[1]) == 2
    assert len(surfaced_actions[0]) == 3
    assert len(surfaced_actions[1]) == 2

    # turns without features of the attribute are populated with `None`
    assert surfaced_intents[0][2] is None
    assert surfaced_intents[1][0] is None
    assert surfaced_intents[1][1] is None
    assert surfaced_actions[0][1] is None
    assert surfaced_actions[0][2] is None
    assert surfaced_actions[1][0] is None

    # all surfaced features are the same as before
    for dialogue in surfaced_intents:
        for turn in dialogue:
            if turn is None:
                continue
            assert (turn[0].features == dense_intent.features).all()
    for dialogue in surfaced_actions:
        for turn in dialogue:
            if turn is None:
                continue
            assert (turn[0].features != sparse_action_name.features).nnz == 0
Exemple #21
0
def test_combine_with_existing_sparse_features_shape_mismatch():
    """Checks that combining sparse features of mismatching shapes raises."""
    two_row_matrix = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]])
    one_row_matrix = scipy.sparse.csr_matrix([[0, 1]])

    existing_features = Features(
        two_row_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "test"
    )
    new_features = Features(one_row_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "origin")

    with pytest.raises(ValueError):
        existing_features.combine_with_features(new_features)
Exemple #22
0
def test_combine_with_existing_sparse_features():
    """Checks that sparse features are combined in place, column-wise."""
    left_matrix = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]])
    right_matrix = scipy.sparse.csr_matrix([[1, 0], [0, 1]])
    # The columns of the new features are expected after the existing ones.
    expected_features = [[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]]

    existing_features = Features(
        left_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "test"
    )
    new_features = Features(right_matrix, FEATURE_TYPE_SEQUENCE, TEXT, "origin")

    existing_features.combine_with_features(new_features)

    # Densify the combined matrix for the element-wise comparison.
    actual_features = existing_features.features.toarray()
    assert np.all(expected_features == actual_features)
def test_extract_features():
    """Checks masks and dense features returned by `_extract_features`."""
    # all-zero features serve as the fake features
    fake_features = np.zeros(shape)
    fake_features_as_features = Features(
        features=fake_features, attribute=INTENT, feature_type=SENTENCE, origin=[]
    )
    fake_features_list = [fake_features_as_features]

    def _features_with_one_at(index):
        # Copies the zero features and sets the entry at `index` to 1.
        one_hot = copy.deepcopy(fake_features_as_features)
        one_hot.features[index] = 1
        return [one_hot]

    # create six tracker state features with a 1 at a random index each
    random_inds = np.random.randint(shape, size=6)
    first, second, third, fourth, fifth, sixth = [
        _features_with_one_at(index) for index in random_inds
    ]

    # organize the created features into lists ~ dialog history
    tracker_features = [
        [first, None, second],
        [None, None, third],
        [fourth, fifth, sixth],
    ]

    extracted = model_data_utils._extract_features(
        tracker_features, fake_features_list, INTENT
    )
    attribute_masks, dense_features, sparse_features = extracted

    # The mask is 1 exactly where a turn had features.
    expected_mask = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 1]])
    assert np.all(np.squeeze(np.array(attribute_masks), 2) == expected_mask)
    assert np.array(dense_features[SENTENCE]).shape[-1] == fake_features.shape[-1]
    assert sparse_features == {}
Exemple #24
0
    def add_features_to_message(
        self,
        sequence: FeatureType,
        sentence: Optional[FeatureType],
        attribute: Text,
        message: Message,
    ) -> None:
        """Wraps the given feature matrices and adds them to the message.

        A matrix that is `None` is skipped. The sequence features are added
        before the sentence features.

        Args:
          sequence: sequence feature matrix
          sentence: sentence feature matrix
          attribute: the attribute which both features describe
          message: the message to which we want to add those features
        """
        typed_matrices = (
            (FEATURE_TYPE_SEQUENCE, sequence),
            (FEATURE_TYPE_SENTENCE, sentence),
        )
        for feature_type, matrix in typed_matrices:
            if matrix is None:
                continue
            # `self._identifier` is recorded as the origin of the features.
            message.add_features(
                Features(matrix, feature_type, attribute, self._identifier)
            )
Exemple #25
0
    def collect_features(self,
                         sub_state: SubState,
                         attributes: Optional[Iterable[Text]] = None
                         ) -> Dict[Text, List[Features]]:
        """Collects features for all attributes in the given substate.

        There might be multiple messages in the container that contain features
        relevant for the given substate, e.g. this is the case if `TEXT` and
        `INTENT` are present in the given substate. All of those messages will be
        collected and their features combined.

        Args:
          sub_state: substate for which we want to extract the relevant features
          attributes: if not `None`, this specifies the list of the attributes of the
            `Features` that we're interested in (i.e. all other `Features` contained
            in the relevant messages will be ignored); one-shot iterables are
            supported

        Returns:
          a dictionary that maps all the (requested) attributes to a list of `Features`

        Raises:
          `ValueError`: if there exists some key pair (i.e. key attribute and
            corresponding value) from the given substate that cannot be found
          `RuntimeError`: if features for the same attribute are found in two
            different messages that are associated with the given substate
        """
        # `attributes` is iterated right below and is handed to
        # `groupby_attribute` again for every key attribute. Materializing it
        # once keeps one-shot iterables (e.g. generators) from being exhausted
        # after the first use.
        if attributes is not None:
            attributes = list(attributes)

        # If we specify a list of attributes, then we want a dict with one entry
        # for each attribute back - even if the corresponding list of features is empty.
        features: Dict[Text, List[Features]] = (
            dict() if attributes is None else
            {attribute: [] for attribute in attributes})

        # collect all relevant key attributes
        key_attributes = set(sub_state.keys()).intersection(
            self.KEY_ATTRIBUTES)
        for key_attribute in key_attributes:
            # the lookup table is keyed by the string form of the value
            key_value = str(sub_state[key_attribute])
            message = self._table[key_attribute].get(key_value)
            if not message:
                raise ValueError(
                    f"Unknown key ({key_attribute},{key_value}). Cannot retrieve "
                    f"features for substate {sub_state}")
            features_from_message = Features.groupby_attribute(
                message.features, attributes=attributes)
            for feat_attribute, feat_value in features_from_message.items():
                # Note: empty feature lists must be skipped because if we specify
                # a list of attributes then `features_from_message` will contain
                # one entry per attribute even if the corresponding feature list
                # is empty.
                if not feat_value:
                    continue
                if features.get(feat_attribute):
                    raise RuntimeError(
                        f"Feature for attribute {feat_attribute} has already been "
                        f"extracted from a different message stored under a key "
                        f"in {key_attributes} "
                        f"that is different from {key_attribute}. This means there's a "
                        f"redundancy in the message container.")
                features[feat_attribute] = feat_value
        return features
def test_process_does_not_do_anything(
        regex_message_handler: RegexMessageHandler, text: Text):
    """Checks that processing returns a message equal to the given one.

    Args:
        regex_message_handler: The handler whose `process` method is tested.
        text: The text of the message handed to the handler.
    """
    sentence_features = Features(
        features=np.zeros((1, 1)),
        feature_type=FEATURE_TYPE_SENTENCE,
        attribute=TEXT,
        origin="nlu-pipeline",
    )
    message = Message(
        data={TEXT: text, INTENT: "bla"},
        features=[sentence_features],
    )

    # A domain with a single intent and a single entity; everything else is
    # left empty.
    domain = Domain(
        intents=["intent"],
        entities=["entity"],
        slots=[],
        responses={},
        action_names=[],
        forms={},
        data={},
    )

    parsed_messages = regex_message_handler.process([message], domain)

    assert parsed_messages[0] == message
Exemple #27
0
def get_tag_ids(example: Message, tag_spec: "EntityTagSpec",
                bilou_tagging: bool) -> "Features":
    """Creates a feature array containing the entity tag ids of the given example.

    Args:
        example: the message
        tag_spec: entity tag spec
        bilou_tagging: indicates whether BILOU tagging should be used or not

    Returns:
        A single `Features` object that holds the tag ids as a column.
    """
    from rasa.nlu.test import determine_token_labels
    from rasa.nlu.utils.bilou_utils import bilou_tags_to_ids
    from rasa.shared.nlu.training_data.features import Features

    if bilou_tagging:
        tag_ids = bilou_tags_to_ids(example, tag_spec.tags_to_ids,
                                    tag_spec.tag_name)
    else:
        # Determine the label of every text token and map it to its id.
        tag_ids = [
            tag_spec.tags_to_ids[
                determine_token_labels(token,
                                       example.get(ENTITIES),
                                       attribute_key=tag_spec.tag_name)
            ]
            for token in example.get(TOKENS_NAMES[TEXT])
        ]

    # transpose the single row of ids to have seq_len x 1
    id_column = np.array([tag_ids]).T
    return Features(id_column, IDS, tag_spec.tag_name, TAG_ID_ORIGIN)
Exemple #28
0
def _dummy_features(id: int, attribute: Text) -> Features:
    """Creates placeholder features for `attribute` whose only value is `id`."""
    # A one-element, 1-D array filled with the given id.
    values = np.full(shape=1, fill_value=id)
    return Features(
        values,
        feature_type="really-anything",
        attribute=attribute,
        origin="",
    )
Exemple #29
0
def test_for_features_fingerprinting_collisions():
    """Tests that features differing in any field get distinct fingerprints."""
    m1 = np.asarray([[0.5, 3.1, 3.0], [1.1, 1.2, 1.3], [4.7, 0.3, 2.7]])

    m2 = np.asarray([[0, 0, 0], [1, 2, 3], [0, 0, 1]])

    # (matrix, feature type, attribute, origin) for every Features under test.
    variants = [
        (m1, FEATURE_TYPE_SENTENCE, TEXT, "CountVectorsFeaturizer"),
        (m2, FEATURE_TYPE_SENTENCE, TEXT, "CountVectorsFeaturizer"),
        (m1, FEATURE_TYPE_SEQUENCE, TEXT, "CountVectorsFeaturizer"),
        (m1, FEATURE_TYPE_SEQUENCE, TEXT, "RegexFeaturizer"),
        (m1, FEATURE_TYPE_SENTENCE, INTENT, "CountVectorsFeaturizer"),
    ]

    def _assert_unique_fingerprints(as_matrix):
        # Build one Features per variant, converting the matrix first, and
        # require that no two of them share a fingerprint.
        candidates = [
            Features(as_matrix(matrix), feature_type, attribute, origin)
            for matrix, feature_type, attribute, origin in variants
        ]
        fingerprints = {candidate.fingerprint() for candidate in candidates}
        assert len(fingerprints) == len(candidates)

    # Dense matrices are used as they are ...
    _assert_unique_fingerprints(lambda matrix: matrix)
    # ... and the same variants are then checked as sparse COO matrices.
    _assert_unique_fingerprints(scipy.sparse.coo_matrix)
Exemple #30
0
    def _extract_state_features(
        self,
        sub_state: SubState,
        precomputations: Optional[MessageContainerForCoreFeaturization],
        sparse: bool = False,
    ) -> Dict[Text, List[Features]]:
        """Maps the attributes of a sub-state to their features.

        Args:
            sub_state: the sub-state to featurize; every key except the
                entities key is treated as an attribute
            precomputations: container the features are collected from; if it
                is None, no features are collected
            sparse: passed on to `_create_features` when the name attribute
                has to be featurized as a fallback

        Returns:
            A dictionary from attribute to the features of that attribute.
        """
        # Every key of the sub-state except ENTITIES is a possible attribute.
        attributes = {key for key in sub_state if key != ENTITIES}

        output = {}
        if precomputations is not None:
            # Gather the features of all those attributes.
            collected = precomputations.collect_features(
                sub_state, attributes=attributes
            )

            # Features for INTENT or ACTION_NAME, if present, are always
            # sparse sequence features; turn them into sparse sentence
            # features.
            for name in (INTENT, ACTION_NAME):
                if collected.get(name):
                    collected[name] = self._to_sparse_sentence_features(
                        collected[name]
                    )

            # Combine and sort: per attribute, features of the same type and
            # level are merged into one Feature, and the result is a list in
            # which
            # - all sparse features come first, and
            # - a sequence feature always precedes the sentence feature of
            #   the same type (sparse/not sparse).
            for attribute, features_list in collected.items():
                # Attributes without any features are skipped, since the
                # reduction would fail on an empty list.
                if len(features_list) > 0:
                    output[attribute] = Features.reduce(
                        features_list=features_list, expected_origins=None
                    )

        # Make sure the name attribute ends up with features.
        name_attribute = self._get_name_attribute(attributes)
        if not name_attribute or name_attribute in output:
            return output

        # The nlu pipeline created no features for the user or action name.
        # This can happen, for example, when action_name is part of the state
        # but was not featurized because the config only contained a
        # character level CountVectorsFeaturizer.
        output[name_attribute] = self._create_features(
            sub_state, name_attribute, sparse
        )
        return output