def test_message_fingerprint_includes_data_and_features(
    whitespace_tokenizer: WhitespaceTokenizer,
) -> None:
    """Checks that a message's fingerprint changes when data or features change.

    Every mutation of the message (tokenization, adding sparse features,
    adding dense features) must yield a new, distinct fingerprint.
    """
    message = Message(data={TEXT: "This is a test sentence."})
    fp1 = message.fingerprint()

    # Tokenizing mutates the message data, so the fingerprint must change.
    whitespace_tokenizer.process([message])
    fp2 = message.fingerprint()
    assert fp1 != fp2

    message.add_features(
        Features(scipy.sparse.csr_matrix([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "c2")
    )
    fp3 = message.fingerprint()
    assert fp2 != fp3

    # BUG FIX: `np.ndarray([1, 2, 2])` called the low-level constructor and
    # allocated an *uninitialized* (1, 2, 2) array, making the test data
    # non-deterministic. `np.array` builds the intended 3-element vector.
    message.add_features(
        Features(np.array([1, 2, 2]), FEATURE_TYPE_SEQUENCE, TEXT, "c1")
    )
    fp4 = message.fingerprint()
    assert fp3 != fp4

    assert len({fp1, fp2, fp3, fp4}) == 4
def _set_semantic_map_features(self, message: Message, attribute: Text) -> None:
    """Adds semantic map features to the given attribute of the message.

    Args:
        message: The message to modify.
        attribute: The name of the attribute that should be changed.
    """
    tokens = message.get(TOKENS_NAMES[attribute], [])
    if not tokens:
        return

    sequence_features, sentence_features = self._featurize_tokens(tokens)

    # Wrap and attach whichever feature matrices were actually produced.
    for matrix, feature_type in (
        (sequence_features, FEATURE_TYPE_SEQUENCE),
        (sentence_features, FEATURE_TYPE_SENTENCE),
    ):
        if matrix is not None:
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )
def _text_features_with_regex(self, message: Message, attribute: Text) -> None:
    """Helper method to extract features and set them appropriately in the message.

    Args:
        message: Message to be featurized.
        attribute: Attribute of message to be featurized.
    """
    # Without known patterns there is nothing to match against.
    if not self.known_patterns:
        return

    sequence_features, sentence_features = self._features_for_patterns(
        message, attribute
    )

    if sequence_features is not None:
        message.add_features(
            Features(
                sequence_features,
                FEATURE_TYPE_SEQUENCE,
                attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )
    if sentence_features is not None:
        message.add_features(
            Features(
                sentence_features,
                FEATURE_TYPE_SENTENCE,
                attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )
def _set_spacy_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Adds the spacy word vectors to the messages features."""
    doc = self.get_doc(message, attribute)
    if doc is None:
        return

    # An empty spaCy model carries no word vectors, so there is nothing to add.
    if doc.vocab.vectors_length == 0:
        logger.debug("No features present. You are using an empty spaCy model.")
        return

    sequence_features = self._features_for_doc(doc)
    sentence_features = self._calculate_sentence_features(
        sequence_features, self.pooling_operation
    )

    # Attach both feature matrices under this featurizer's alias.
    for matrix, feature_type in (
        (sequence_features, FEATURE_TYPE_SEQUENCE),
        (sentence_features, FEATURE_TYPE_SENTENCE),
    ):
        message.add_features(
            Features(
                matrix,
                feature_type,
                attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )
def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Adds the precomputed word vectors to the messages features."""
    doc = self._get_doc(message, attribute)
    if doc is None:
        return

    # The doc dict holds both precomputed matrices; wrap and attach each one.
    for matrix, feature_type in (
        (doc[SEQUENCE_FEATURES], FEATURE_TYPE_SEQUENCE),
        (doc[SENTENCE_FEATURES], FEATURE_TYPE_SENTENCE),
    ):
        message.add_features(
            Features(
                matrix,
                feature_type,
                attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS],
            )
        )
def _set_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Sets the features on a single message. Utility method."""
    tokens = message.get(TEXT_TOKENS)
    # Without tokens there is nothing to featurize.
    if not tokens:
        return None

    # Sentence-level features come from the full text; sequence-level
    # features come from the individual tokens.
    # NOTE(review): the sentence vector always reads TEXT even when
    # `attribute` differs — confirm this is intentional.
    sentence_matrix = self.tfm.transform([message.get(TEXT)])
    sequence_matrix = self.tfm.transform([token.text for token in tokens])

    origin = self._config[FEATURIZER_CLASS_ALIAS]
    message.add_features(
        Features(sequence_matrix, FEATURE_TYPE_SEQUENCE, attribute, origin)
    )
    message.add_features(
        Features(sentence_matrix, FEATURE_TYPE_SENTENCE, attribute, origin)
    )
def set_bpemb_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Computes BytePair-embedding features and attaches them to the message."""
    tokens = message.get(TOKENS_NAMES[attribute])
    if not tokens:
        return None

    # Reshape to 2D (n_utterance, n_dim) so the shape is equivalent to that of
    # sparsely generated features; without it, this would be a 1D tensor.
    # NOTE(review): the sentence vector always reads TEXT even when
    # `attribute` differs — confirm this is intentional.
    sentence_vector = self.create_word_vector(document=message.get(TEXT)).reshape(
        1, -1
    )
    token_vectors = np.array(
        [self.create_word_vector(document=token.text) for token in tokens]
    )

    origin = self.component_config[FEATURIZER_CLASS_ALIAS]
    message.add_features(
        Features(token_vectors, FEATURE_TYPE_SEQUENCE, attribute, origin)
    )
    message.add_features(
        Features(sentence_vector, FEATURE_TYPE_SENTENCE, attribute, origin)
    )
def set_gensim_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Computes gensim word-vector features and attaches them to the message.

    Args:
        message: Message to featurize.
        attribute: Attribute of the message whose tokens are featurized.
    """
    tokens = message.get(TOKENS_NAMES[attribute])
    if not tokens:
        return None

    # Tokens missing from the keyed vectors are featurized with a zero vector.
    word_vectors = np.array(
        [
            self.kv[t.text] if t.text in self.kv else np.zeros(self.kv.vector_size)
            for t in tokens
        ]
    )

    # Sum all word vectors into one sentence-level vector, reshaped to 2D
    # (n_utterance, n_dim). IDIOM FIX: `ndarray.sum(axis=0)` replaces the
    # previous `functools.reduce` over rows — same result, C-speed, clearer.
    text_vector = word_vectors.sum(axis=0).reshape(1, -1)

    final_sequence_features = Features(
        word_vectors,
        FEATURE_TYPE_SEQUENCE,
        attribute,
        self.component_config[FEATURIZER_CLASS_ALIAS],
    )
    message.add_features(final_sequence_features)
    final_sentence_features = Features(
        text_vector,
        FEATURE_TYPE_SENTENCE,
        attribute,
        self.component_config[FEATURIZER_CLASS_ALIAS],
    )
    message.add_features(final_sentence_features)
def add_features_to_message(
    self,
    sequence: FeatureType,
    sentence: Optional[FeatureType],
    attribute: Text,
    message: Message,
) -> None:
    """Adds sequence and sentence features for the attribute to the given message.

    Args:
        sequence: sequence feature matrix
        sentence: sentence feature matrix
        attribute: the attribute which both features describe
        message: the message to which we want to add those features
    """
    # FIX: renamed loop variable from `type` to `feature_type` —
    # the old name shadowed the builtin `type`.
    for feature_type, features in [
        (FEATURE_TYPE_SEQUENCE, sequence),
        (FEATURE_TYPE_SENTENCE, sentence),
    ]:
        if features is not None:
            wrapped_feature = Features(
                features, feature_type, attribute, self._identifier,
            )
            message.add_features(wrapped_feature)
def _set_features(self, message: Message, attribute: Text = TEXT) -> None:
    """Sets the features on a single message. Utility method."""
    tokens = message.get(TEXT_TOKENS)
    # No tokens means there is nothing to featurize.
    if not tokens:
        return None

    # Reshape the sentence vector to 2D (n_utterance, n_dim) so its shape is
    # equivalent to that of sparsely generated features; otherwise it would
    # be a 1D tensor.
    # NOTE(review): the sentence vector always reads TEXT even when
    # `attribute` differs — confirm this is intentional.
    sentence_vector = self._create_word_vector(document=message.get(TEXT)).reshape(
        1, -1
    )
    token_vectors = np.array(
        [self._create_word_vector(document=token.text) for token in tokens]
    )

    origin = self._config[FEATURIZER_CLASS_ALIAS]
    message.add_features(
        Features(token_vectors, FEATURE_TYPE_SEQUENCE, attribute, origin)
    )
    message.add_features(
        Features(sentence_vector, FEATURE_TYPE_SENTENCE, attribute, origin)
    )
def _set_features(
    self,
    message: Message,
    sequence_features: np.ndarray,
    sentence_features: np.ndarray,
    attribute: Text,
) -> None:
    """Wraps both precomputed feature matrices and attaches them to the message."""
    origin = self._config[FEATURIZER_CLASS_ALIAS]
    for matrix, feature_type in (
        (sequence_features, FEATURE_TYPE_SEQUENCE),
        (sentence_features, FEATURE_TYPE_SENTENCE),
    ):
        message.add_features(Features(matrix, feature_type, attribute, origin))
def _create_sparse_features(self, message: Message) -> None:
    """Convert incoming messages into sparse features using the configured features."""
    import scipy.sparse

    tokens = message.get(TOKENS_NAMES[TEXT])
    # Guard: there might be training data examples without TEXT,
    # e.g., `Message("", {action_name: "action_listen"})`.
    if not tokens:
        return

    token_features = self._tokens_to_features(tokens)
    one_hot_matrix = self._features_to_one_hot(token_features)
    sparse_matrix = scipy.sparse.coo_matrix(one_hot_matrix)

    message.add_features(
        Features(
            sparse_matrix,
            FEATURE_TYPE_SEQUENCE,
            TEXT,
            self.component_config[FEATURIZER_CLASS_ALIAS],
        )
    )
def _text_features_with_regex(self, message: Message, attribute: Text) -> None:
    """Extracts regex-pattern features and sets them on the message attribute."""
    # Nothing to do when no patterns are known.
    if not self.known_patterns:
        return

    sequence_features, sentence_features = self._features_for_patterns(
        message, attribute
    )

    # Attach whichever feature matrices were actually produced.
    for matrix, feature_type in (
        (sequence_features, FEATURE_TYPE_SEQUENCE),
        (sentence_features, FEATURE_TYPE_SENTENCE),
    ):
        if matrix is not None:
            message.add_features(
                Features(
                    matrix,
                    feature_type,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
            )