Ejemplo n.º 1
0
def test_message_fingerprint_includes_data_and_features(
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Check that the message fingerprint changes as data and features are added.

    The fingerprint is sampled four times: on the freshly created message,
    after the tokenizer has processed it, after a sparse feature has been
    added, and after a dense feature has been added. All four fingerprints
    must differ from one another.
    """
    message = Message(data={TEXT: "This is a test sentence."})
    fp1 = message.fingerprint()
    whitespace_tokenizer.process([message])
    fp2 = message.fingerprint()

    assert fp1 != fp2

    message.add_features(
        Features(scipy.sparse.csr_matrix([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "c2")
    )

    fp3 = message.fingerprint()
    assert fp2 != fp3

    # Use `np.array` so the dense feature really holds the values 1, 2, 2.
    # `np.ndarray([1, 2, 2])` interprets the list as a *shape* and returns
    # uninitialised memory, which would make the fingerprinted data
    # non-deterministic between runs.
    message.add_features(
        Features(np.array([1, 2, 2]), FEATURE_TYPE_SEQUENCE, TEXT, "c1")
    )

    fp4 = message.fingerprint()

    assert fp3 != fp4

    # Pairwise-distinct check across every stage, not just neighbouring ones.
    assert len({fp1, fp2, fp3, fp4}) == 4
Ejemplo n.º 2
0
    def _set_semantic_map_features(self, message: Message, attribute: Text) -> None:
        """Attach semantic map features for one attribute of a message.

        Nothing happens when the message holds no tokens for the attribute.
        Otherwise the tokens are featurized and every resulting feature matrix
        that is not ``None`` is wrapped in a ``Features`` object and added to
        the message, sequence features first.

        Args:
            message: The message to modify.
            attribute: The name of the attribute that should be changed.
        """
        token_key = TOKENS_NAMES[attribute]
        if not message.get(token_key, []):
            return

        sequence_features, sentence_features = self._featurize_tokens(
            message.get(token_key, [])
        )

        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, sequence_features),
            (FEATURE_TYPE_SENTENCE, sentence_features),
        ):
            if matrix is None:
                continue
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )
Ejemplo n.º 3
0
    def _text_features_with_regex(self, message: Message,
                                  attribute: Text) -> None:
        """Extract pattern-based features and store them on the message.

        Does nothing when ``self.known_patterns`` is empty. Feature matrices
        that come back as ``None`` are skipped.

        Args:
            message: Message to be featurized.
            attribute: Attribute of message to be featurized.
        """
        if not self.known_patterns:
            return

        seq_matrix, sent_matrix = self._features_for_patterns(message, attribute)

        if seq_matrix is not None:
            message.add_features(
                Features(
                    seq_matrix,
                    FEATURE_TYPE_SEQUENCE,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )

        if sent_matrix is not None:
            message.add_features(
                Features(
                    sent_matrix,
                    FEATURE_TYPE_SENTENCE,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )
Ejemplo n.º 4
0
    def _set_spacy_features(self,
                            message: Message,
                            attribute: Text = TEXT) -> None:
        """Add spaCy word-vector features for ``attribute`` to the message.

        Returns without adding anything when no doc is available for the
        attribute or when the doc's vocabulary carries no vectors.

        Args:
            message: The message that receives the features.
            attribute: The attribute whose doc is featurized.
        """
        spacy_doc = self.get_doc(message, attribute)
        if spacy_doc is None:
            return

        # An empty spaCy model ships without vectors, so there is nothing to add.
        if spacy_doc.vocab.vectors_length == 0:
            logger.debug(
                "No features present. You are using an empty spaCy model.")
            return

        token_matrix = self._features_for_doc(spacy_doc)
        pooled_matrix = self._calculate_sentence_features(
            token_matrix, self.pooling_operation)

        message.add_features(
            Features(
                token_matrix,
                FEATURE_TYPE_SEQUENCE,
                attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )
        message.add_features(
            Features(
                pooled_matrix,
                FEATURE_TYPE_SENTENCE,
                attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )
Ejemplo n.º 5
0
    def _set_lm_features(self,
                         message: Message,
                         attribute: Text = TEXT) -> None:
        """Add the precomputed word vectors stored in the doc to the message.

        The doc is looked up via ``self._get_doc``; when it is ``None`` the
        message is left untouched. Otherwise the entries stored under
        ``SEQUENCE_FEATURES`` and ``SENTENCE_FEATURES`` are added, in that order.

        Args:
            message: The message that receives the features.
            attribute: The attribute the features belong to.
        """
        lm_doc = self._get_doc(message, attribute)
        if lm_doc is None:
            return

        # Read both entries up front, before anything is added to the message.
        per_token = lm_doc[SEQUENCE_FEATURES]
        per_sentence = lm_doc[SENTENCE_FEATURES]

        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, per_token),
            (FEATURE_TYPE_SENTENCE, per_sentence),
        ):
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )
Ejemplo n.º 6
0
    def _set_features(self, message: Message, attribute: Text = TEXT) -> None:
        """Featurize a single message with ``self.tfm`` and attach the result.

        The whole text yields the sentence features and the individual token
        texts yield the sequence features. Messages without tokens are skipped.

        NOTE(review): tokens and text are always read from ``TEXT_TOKENS`` /
        ``TEXT``; ``attribute`` is only used to label the resulting features.
        Confirm this is intended.

        Args:
            message: The message to featurize.
            attribute: The attribute the created features are registered under.
        """
        tokens = message.get(TEXT_TOKENS)

        # No tokens means there is nothing to build features from.
        if not tokens:
            return

        # One vector for the full text, one row per token.
        sentence_matrix = self.tfm.transform([message.get(TEXT)])
        token_matrix = self.tfm.transform([token.text for token in tokens])

        message.add_features(
            Features(
                token_matrix,
                FEATURE_TYPE_SEQUENCE,
                attribute,
                self._config[FEATURIZER_CLASS_ALIAS],
            )
        )
        message.add_features(
            Features(
                sentence_matrix,
                FEATURE_TYPE_SENTENCE,
                attribute,
                self._config[FEATURIZER_CLASS_ALIAS],
            )
        )
Ejemplo n.º 7
0
    def set_bpemb_features(self,
                           message: Message,
                           attribute: Text = TEXT) -> None:
        """Create word-vector features for a message and attach them.

        Sequence features hold one vector per token of ``attribute``; the
        sentence feature is the vector created for the message text. Messages
        without tokens for the attribute are skipped.

        NOTE(review): the sentence vector is always built from
        ``message.get(TEXT)``, regardless of ``attribute`` — confirm intended.

        Args:
            message: The message to featurize.
            attribute: The attribute whose tokens are featurized.
        """
        tokens = message.get(TOKENS_NAMES[attribute])
        if not tokens:
            return

        # Reshape to 2D (n_utterance, n_dim) so the shape is equivalent to that
        # of sparsely generated features; without it the result would be 1D.
        sentence_vector = self.create_word_vector(document=message.get(TEXT))
        sentence_vector = sentence_vector.reshape(1, -1)

        per_token_vectors = [
            self.create_word_vector(document=token.text) for token in tokens
        ]
        token_matrix = np.array(per_token_vectors)

        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, token_matrix),
            (FEATURE_TYPE_SENTENCE, sentence_vector),
        ):
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )
    def set_gensim_features(self,
                            message: Message,
                            attribute: Text = TEXT) -> None:
        """Create features from the ``self.kv`` lookup and attach them.

        Each token of ``attribute`` is mapped to its vector in ``self.kv``;
        tokens missing from ``self.kv`` get a zero vector of size
        ``self.kv.vector_size``. The sentence feature is the sum of all token
        vectors. Messages without tokens for the attribute are skipped.

        Args:
            message: The message to featurize.
            attribute: The attribute whose tokens are featurized.
        """
        tokens = message.get(TOKENS_NAMES[attribute])
        if not tokens:
            return

        def _lookup(word):
            # Unknown keys are featurized with an array of zeros.
            if word in self.kv:
                return self.kv[word]
            return np.zeros(self.kv.vector_size)

        token_matrix = np.array([_lookup(token.text) for token in tokens])

        # Add up the token vectors to get one vector for the complete
        # utterance, then reshape it into a single-row 2D array.
        summed = reduce(lambda a, b: a + b, token_matrix)
        sentence_vector = summed.reshape(1, -1)

        message.add_features(
            Features(
                token_matrix,
                FEATURE_TYPE_SEQUENCE,
                attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )
        message.add_features(
            Features(
                sentence_vector,
                FEATURE_TYPE_SENTENCE,
                attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )
Ejemplo n.º 9
0
    def add_features_to_message(
        self,
        sequence: FeatureType,
        sentence: Optional[FeatureType],
        attribute: Text,
        message: Message,
    ) -> None:
        """Adds sequence and sentence features for the attribute to the given message.

        Each feature matrix that is not ``None`` is wrapped in a ``Features``
        object tagged with ``self._identifier`` and added to the message. The
        sequence features are handled before the sentence features.

        Args:
          sequence: sequence feature matrix
          sentence: sentence feature matrix
          attribute: the attribute which both features describe
          message: the message to which we want to add those features
        """
        # The loop variable is named `feature_type` rather than `type` so that
        # the builtin `type` is not shadowed inside this method.
        for feature_type, features in [
            (FEATURE_TYPE_SEQUENCE, sequence),
            (FEATURE_TYPE_SENTENCE, sentence),
        ]:
            if features is not None:
                wrapped_feature = Features(
                    features,
                    feature_type,
                    attribute,
                    self._identifier,
                )
                message.add_features(wrapped_feature)
Ejemplo n.º 10
0
    def _set_features(self, message: Message, attribute: Text = TEXT) -> None:
        """Create word-vector features for a single message and attach them.

        Sequence features hold one vector per token; the sentence feature is
        the vector created for the whole text. Messages without tokens are
        skipped.

        NOTE(review): tokens and text are always read from ``TEXT_TOKENS`` /
        ``TEXT``; ``attribute`` is only used to label the resulting features.
        Confirm this is intended.

        Args:
            message: The message to featurize.
            attribute: The attribute the created features are registered under.
        """
        tokens = message.get(TEXT_TOKENS)

        # No tokens means there is nothing to build features from.
        if not tokens:
            return

        # Reshape to 2D (n_utterance, n_dim) so the shape is equivalent to that
        # of sparsely generated features; without it the result would be 1D.
        sentence_vector = self._create_word_vector(document=message.get(TEXT))
        sentence_vector = sentence_vector.reshape(1, -1)

        per_token_vectors = [
            self._create_word_vector(document=token.text) for token in tokens
        ]
        token_matrix = np.array(per_token_vectors)

        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, token_matrix),
            (FEATURE_TYPE_SENTENCE, sentence_vector),
        ):
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self._config[FEATURIZER_CLASS_ALIAS],
                )
            )
Ejemplo n.º 11
0
    def _set_features(
        self,
        message: Message,
        sequence_features: np.ndarray,
        sentence_features: np.ndarray,
        attribute: Text,
    ) -> None:
        """Wrap the given feature arrays and add them to the message.

        The sequence features are added first, then the sentence features.
        Both are tagged with ``self._config[FEATURIZER_CLASS_ALIAS]``.

        Args:
            message: The message that receives the features.
            sequence_features: The array stored as sequence features.
            sentence_features: The array stored as sentence features.
            attribute: The attribute the features belong to.
        """
        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, sequence_features),
            (FEATURE_TYPE_SENTENCE, sentence_features),
        ):
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self._config[FEATURIZER_CLASS_ALIAS],
                )
            )
Ejemplo n.º 12
0
    def _create_sparse_features(self, message: Message) -> None:
        """Turn the text tokens of a message into sparse sequence features.

        The tokens are converted to features, one-hot encoded, stored as a
        ``scipy.sparse.coo_matrix`` and added to the message as sequence
        features for ``TEXT``. Messages without text tokens are left untouched.
        """
        import scipy.sparse

        tokens = message.get(TOKENS_NAMES[TEXT])

        # Training data may contain examples without TEXT, e.g.
        # `Message("", {action_name: "action_listen"})`; skip those.
        if not tokens:
            return

        token_features = self._tokens_to_features(tokens)
        one_hot_rows = self._features_to_one_hot(token_features)

        message.add_features(
            Features(
                scipy.sparse.coo_matrix(one_hot_rows),
                FEATURE_TYPE_SEQUENCE,
                TEXT,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )
Ejemplo n.º 13
0
    def _text_features_with_regex(self, message: Message,
                                  attribute: Text) -> None:
        """Add pattern-based sequence and sentence features to the message.

        Does nothing when ``self.known_patterns`` is empty. Any feature matrix
        returned as ``None`` is skipped; sequence features are added before
        sentence features.

        Args:
            message: The message to featurize.
            attribute: The attribute of the message to featurize.
        """
        if not self.known_patterns:
            return

        extracted = self._features_for_patterns(message, attribute)
        sequence_features, sentence_features = extracted

        for feature_type, matrix in (
            (FEATURE_TYPE_SEQUENCE, sequence_features),
            (FEATURE_TYPE_SENTENCE, sentence_features),
        ):
            if matrix is None:
                continue
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )