def test_count_vector_featurizer(sentence, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    import MicroTokenizer

    text = message.get(attribute)
    tokenized = MicroTokenizer.cut(text)

    tokens = []
    offset = 0
    for word in tokenized:
        tokens.append(Token(word, offset))
        # the segments are assumed to cover the text contiguously (no
        # separators between them), so the offset is the cumulative length
        offset += len(word)

    return tokens
def test_do_not_overwrite_any_entities():
    message = Message("Max lives in Berlin.")
    message.set(ENTITIES, [{"entity": "person", "value": "Max", "start": 0, "end": 3}])

    training_data = TrainingData()
    training_data.training_examples = [
        Message("Hi Max!", data={"entities": [{"entity": "person", "value": "Max"}]}),
        Message(
            "I live in Berlin",
            data={"entities": [{"entity": "city", "value": "Berlin"}]},
        ),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == [
        {"entity": "person", "value": "Max", "start": 0, "end": 3},
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]
def process(self, message: Message, **kwargs: Any):
    """Process an incoming message.

    This is the component's chance to process an incoming message. The
    component can rely on any context attribute that gets created by a call
    to :meth:`components.Component.pipeline_init` of ANY component and on any
    context attributes created by a call to
    :meth:`components.Component.process` of components previous to this one.
    """
    # TODO: tokenization; if another tokenizer component is used,
    #  this needs further adjustment
    if not message.get("tokens", default=None):
        self.extract_tokens(message)
        # part-of-speech tagging
        self.extract_poses(message)
        # dependency parsing
        self.extract_parses(message)
        # entity extraction <sequence labeling + entity extraction>
        self.extract_entities(message)
        # pronoun extraction
        self.extract_pronouns(message)
    else:
        # tokens produced by a Rasa tokenizer
        tokens = message.get("tokens")
        message.set("tokenizers", tokens)
        # convert the list of Rasa tokens to this component's format
        tokens = [tokenizer_extract(token) for token in tokens]
        message.set("tokens", tokens)
        # part-of-speech tagging
        self.extract_poses(message)
        # dependency parsing
        self.extract_parses(message)
        # entity extraction <sequence labeling + entity extraction>
        # semantic segmentation -> self.entity_segment(message)
        # attribute analysis -> self.link_analyze(message)
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer()
    ftr.add_lookup_tables(lookups)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(sentence)
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def tokens_without_cls(
    message: Message, attribute: Text = TEXT
) -> Optional[List[Token]]:
    """Return tokens of given message without __CLS__ token.

    All tokenizers add a __CLS__ token to the end of the list of tokens for
    text and responses. The token captures the sentence features.

    Args:
        message: The message.
        attribute: Return tokens of provided attribute.

    Returns:
        Tokens without CLS token.
    """
    # return all tokens up to the __CLS__ token for text and responses
    if attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
        tokens = message.get(TOKENS_NAMES[attribute])
        if tokens is not None:
            return tokens[:POSITION_OF_CLS_TOKEN]
        return None

    # we don't add the __CLS__ token for intents, return all tokens
    return message.get(TOKENS_NAMES[attribute])
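# A minimal usage sketch for tokens_without_cls, assuming the Rasa 1.x constants
# used above and that WhitespaceTokenizer appends a __CLS__ token during
# process(); the sentence and the assertion are illustrative only.
def example_tokens_without_cls():
    message = Message("hello world")
    WhitespaceTokenizer().process(message)
    tokens = tokens_without_cls(message)  # drops the trailing __CLS__ token
    assert [t.text for t in tokens] == ["hello", "world"]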
def process(self, message: Message, **kwargs: Any) -> None:
    """Process an incoming message."""
    entities = list(message.get('entities'))

    # Get the file path of the lookup table in json format
    cur_path = os.path.dirname(__file__)
    if os.name == 'nt':
        partial_lookup_file_path = '..\\data\\lookup_table.json'
    else:
        partial_lookup_file_path = '../data/lookup_table.json'
    lookup_file_path = os.path.join(cur_path, partial_lookup_file_path)

    with open(lookup_file_path, 'r') as file:
        lookup_data = json.load(file)

    tokens = message.get('tokens')
    for token in tokens:
        similarity_score = self.get_fuzzy_similarity(
            token.text, lookup_data, self.threshold)
        if similarity_score is not None:
            print(
                f"'{token.text}' matches with {similarity_score[0]}"
                f"[{similarity_score[2]}] with a score of: {similarity_score[1]}"
            )
            for i, item in enumerate(entities):
                # if the entity already exists, update it
                # (because the DIET classifier is higher in the hierarchy)
                if item['entity'] == similarity_score[2]:
                    item.update({"value": similarity_score[0]})
            entities.append({
                "start": token.start,
                "end": token.end,
                "value": similarity_score[0],
                "confidence": similarity_score[1],
                "entity": similarity_score[2]
            })

    message.set("entities", entities, add_to_output=True)
def test_mitie_featurizer_train(mitie_feature_extractor):
    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    MitieTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        **{"mitie_feature_extractor": mitie_feature_extractor},
    )

    expected = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
    assert len(message.get(TOKENS_NAMES[TEXT])) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])
    assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT])
    assert vecs is None
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT_ATTRIBUTE, intent)
    train_message.set(RESPONSE_ATTRIBUTE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE_ATTRIBUTE, "hi")
    second_message.set(INTENT_ATTRIBUTE, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        assert (
            train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0]
            == intent_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None

    if response_features:
        assert (
            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0]
            == response_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) is None
def _from_text_to_crf(
    self, message: Message, entities: List[Text] = None
) -> List[CRFToken]:
    """Takes a sentence and switches it to crfsuite format."""
    crf_format = []
    if self.pos_features:
        tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE])
    else:
        tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])

    text_dense_features = self.__get_dense_features(message)

    for i, token in enumerate(tokens):
        pattern = self.__pattern_of_token(message, i)
        entity = entities[i] if entities else "N/A"
        tag = self.__tag_of_token(token) if self.pos_features else None
        dense_features = (
            text_dense_features[i] if text_dense_features is not None else []
        )

        crf_format.append(CRFToken(token.text, tag, entity, pattern, dense_features))

    return crf_format
def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
    if not mitie_feature_extractor:
        raise Exception(
            "Failed to train 'MitieFeaturizer'. "
            "Missing a proper MITIE feature extractor."
        )

    ents = self.extract_entities(
        message.text, self._tokens_without_cls(message), mitie_feature_extractor
    )
    extracted = self.add_extractor_name(ents)
    extracted = self.clean_up_entities(message, extracted)
    message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)
def _get_processed_message_tokens_by_attribute(
    self, message: Message, attribute: Text = TEXT
) -> List[Text]:
    """Get processed text of attribute of a message."""
    if message.get(attribute) is None:
        # return an empty list since the sklearn CountVectorizer does not like
        # None objects while training and predicting
        return []

    tokens = self._get_message_tokens_by_attribute(message, attribute)
    tokens = self._process_tokens(tokens, attribute)
    tokens = self._replace_with_oov_token(tokens, attribute)

    return tokens
def test_custom_intent_symbol(text, expected_tokens):
    component_config = {
        "intent_tokenization_flag": True,
        "intent_split_symbol": "+",
    }

    tk = MitieTokenizer(component_config)

    message = Message(text)
    message.set(INTENT, text)

    tk.train(TrainingData([message]))

    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
def test_spacy_ner_featurizer_config(spacy_nlp):
    from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer

    sentence = "hi there friend"
    doc = spacy_nlp(sentence)
    spacy_config = {"ner_feature_vectors": False}
    ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig())

    greet = {"intent": "greet", "text_features": [0.5]}
    message = Message(sentence, greet)
    message.set("spacy_doc", doc)

    ftr._set_spacy_features(message)
    ftr._set_spacy_ner_features(message)

    vecs = np.array(message.get("ner_features"))
    assert vecs.shape[0] == len(doc)
    assert vecs.shape[1] == 0
def _from_text_to_crf(
    self, message: Message, entities: List[Text] = None
) -> List[
    Tuple[
        Optional[Text],
        Optional[Text],
        Text,
        Dict[Text, Any],
        Optional[Dict[Text, Any]],
    ]
]:
    """Takes a sentence and switches it to crfsuite format."""
    crf_format = []
    if self.pos_features:
        tokens = message.get("spacy_doc")
    else:
        tokens = message.get("tokens")

    ner_features = (
        self.__additional_ner_features(message) if self.use_ner_features else None
    )

    for i, token in enumerate(tokens):
        pattern = self.__pattern_of_token(message, i)
        entity = entities[i] if entities else "N/A"
        tag = self.__tag_of_token(token) if self.pos_features else None
        custom_ner_features = ner_features[i] if self.use_ner_features else None

        crf_format.append((token.text, tag, entity, pattern, custom_ner_features))

    return crf_format
def test_spacy_ner_featurizer(sentence, expected, spacy_nlp):
    from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)
    token_vectors = [t.vector for t in doc]
    spacy_config = {"ner_feature_vectors": True}
    ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig())

    greet = {"intent": "greet", "text_features": [0.5]}
    message = Message(sentence, greet)
    message.set("spacy_doc", doc)

    ftr._set_spacy_features(message)
    ftr._set_spacy_ner_features(message)

    vecs = message.get("ner_features")[0][:5]
    assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4)
    assert np.allclose(vecs, expected, atol=1e-4)
def _set_spacy_ner_features(self, message: Message):
    """If we want to use spacy as an NER featurizer, set token vectors."""
    doc = message.get(MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE])
    if self.ner_feature_vectors:
        ner_features = np.array([t.vector for t in doc])
    else:
        ner_features = np.array([[] for t in doc])

    combined_features = self._combine_with_existing_features(
        message,
        ner_features,
        MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_ENTITIES_ATTRIBUTE],
    )
    message.set(
        MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_ENTITIES_ATTRIBUTE], combined_features
    )
def test_convert_featurizer_train(component_builder):
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    tokens = tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(
        TrainingData([message]), RasaNLUModelConfig(), tf_hub_module=tokenizer.module
    )

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])
    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT])
    assert vecs is None
def test_count_vector_featurizer_oov_token(sentence, expected):
    ftr = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "OOV_token": "__oov__"}
    )

    train_message = Message(sentence)
    WhitespaceTokenizer().process(train_message)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(
        test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected
    )
def _create_sparse_features(self, message: Message) -> None:
    """Convert incoming messages into sparse features using the configured
    features."""
    import scipy.sparse

    # [:-1] to remove the CLS token
    tokens = message.get(TOKENS_NAMES[TEXT])[:-1]

    sentence_features = self._tokens_to_features(tokens)
    one_hot_feature_vector = self._features_to_one_hot(sentence_features)

    sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector)

    sparse_features = self._combine_with_existing_sparse_features(
        message, sparse_features, feature_name=SPARSE_FEATURE_NAMES[TEXT]
    )
    message.set(SPARSE_FEATURE_NAMES[TEXT], sparse_features)
def test_lm_tokenizer_number_of_sub_tokens(text, expected_number_of_sub_tokens):
    transformers_config = {"model_name": "bert"}  # testing one model should be enough

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer()

    message = Message(text)

    td = TrainingData([message])
    transformers_nlp.train(td)
    lm_tokenizer.train(td)

    assert [
        t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])[:-1]
    ] == expected_number_of_sub_tokens
def _split_intent(self, message: Message, attribute: Text = INTENT) -> List[Token]:
    text = message.get(attribute)

    # for the INTENT_RESPONSE_KEY attribute,
    # first split by RESPONSE_IDENTIFIER_DELIMITER
    if attribute == INTENT_RESPONSE_KEY:
        intent, response_key = text.split(RESPONSE_IDENTIFIER_DELIMITER)
        words = self._tokenize_on_split_symbol(
            intent
        ) + self._tokenize_on_split_symbol(response_key)
    else:
        words = self._tokenize_on_split_symbol(text)

    return self._convert_words_to_tokens(words, text)
def set_fasttext_features(self, message: Message, attribute: Text = TEXT) -> None:
    tokens = message.get(TOKENS_NAMES[attribute])

    if not tokens:
        return None

    text_vector = self.model.get_word_vector(message.text)
    word_vectors = [
        self.model.get_word_vector(t.text)
        for t in train_utils.tokens_without_cls(message, attribute)
    ]
    # remember, we need one vector for the __CLS__ token
    X = np.array(word_vectors + [text_vector])

    features = self._combine_with_existing_dense_features(
        message, additional_features=X, feature_name=DENSE_FEATURE_NAMES[attribute]
    )
    message.set(DENSE_FEATURE_NAMES[attribute], features)
def process(self, message: Message, **kwargs: Any) -> None:
    import tensorflow as tf
    import numpy as np

    real_result_dir = os.path.join(self.model_dir, self.result_dir)

    if self.predict_fn is None:
        self.predict_fn = tf.keras.experimental.load_from_saved_model(real_result_dir)

    real_lookup_table_file = os.path.join(real_result_dir, self.lookup_table_file)

    if self.lookup_table is None:
        with open(real_lookup_table_file, 'rt') as fd:
            self.lookup_table = json.load(fd)

    text_feature = message.get("text_features")
    np_feature = np.array([text_feature])
    predict_np_int = self.predict_fn.predict(np_feature)

    intent_score = []
    for intent_id, score in enumerate(predict_np_int[0]):
        # convert np.float32 to a vanilla float; otherwise ujson's json_dumps
        # raises "OverflowError: Maximum recursion level reached",
        # see https://github.com/esnme/ultrajson/issues/221
        float_score = float(score)
        intent_score.append((float_score, intent_id))

    reversed_lookup_table = {
        index: value for value, index in self.lookup_table.items()
    }

    intent_str_score = [(k, reversed_lookup_table[v]) for k, v in intent_score]
    sorted_intent_str_score = sorted(
        intent_str_score, key=lambda x: x[0], reverse=True
    )

    self._set_intent_output(message, sorted_intent_str_score)
def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens):
    component_config = {
        "intent_tokenization_flag": True,
        "intent_split_symbol": "+",
    }
    transformers_config = {"model_name": "bert"}  # testing one model should be enough

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer(component_config)

    message = Message(text)
    message.set(INTENT, text)

    td = TrainingData([message])
    transformers_nlp.train(td)
    lm_tokenizer.train(td)

    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
def _create_sparse_features(self, message: Message) -> None:
    """Convert incoming messages into sparse features using the configured
    features."""
    import scipy.sparse

    # [:-1] to remove the CLS token
    tokens = message.get(TOKENS_NAMES[TEXT])[:-1]

    sentence_features = self._tokens_to_features(tokens)
    one_hot_feature_vector = self._features_to_one_hot(sentence_features)

    sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector)

    final_features = Features(
        sparse_features, TEXT, self.component_config[FEATURIZER_CLASS_ALIAS]
    )
    message.add_features(final_features)
def test_count_vector_featurizer(sentence, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"min_ngram": 1, "max_ngram": 2, "analyzer": "char"})

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
async def rebuild_original_text(example: Message) -> str:
    """Rebuilds original training text in Markdown form."""
    original_entities = example.get("entities")
    original_text = example.text
    if original_entities:
        original_text = list(original_text)
        # replace entity spans from right to left so earlier offsets stay valid
        for entity in sorted(
            original_entities, key=lambda x: x.get("start"), reverse=True
        ):
            start = entity["start"]
            end = entity["end"]
            value = entity["value"]
            name = entity["entity"]
            original_text[start:end] = f"[{value}]({name})"
        original_text = "".join(original_text)
    return original_text
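# A hedged usage sketch for rebuild_original_text; the sentence and entity
# offsets below are illustrative, and asyncio.run is only needed because the
# helper is declared as a coroutine.
def example_rebuild_original_text():
    import asyncio

    example = Message(
        "I live in Berlin",
        data={
            "entities": [
                {"entity": "city", "value": "Berlin", "start": 10, "end": 16}
            ]
        },
    )
    markdown = asyncio.run(rebuild_original_text(example))
    assert markdown == "I live in [Berlin](city)"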
def testing_tokenizer(text, cls, lang='en'):
    from rasa.nlu.training_data import TrainingData, Message

    defaults = {
        # Flag to check whether to split intents
        "intent_tokenization_flag": False,
        # Symbol on which intent should be split
        "intent_split_symbol": "_",
        # text will be tokenized case sensitive by default
        "case_sensitive": True,
        "lang": lang,
    }
    tok = cls(defaults)

    example = Message(text, {"intent": "wish", "entities": []})

    # tokenize the example
    tok.process(example, x='.')
    for token in example.get("tokens"):
        print(token.text, token.offset)
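# A hedged example of calling testing_tokenizer with Rasa's WhitespaceTokenizer;
# the import path matches Rasa 1.x, and any tokenizer class that accepts a
# config dict should work the same way.
def example_testing_tokenizer():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

    # prints each token text together with its character offset
    testing_tokenizer("hello rasa world", WhitespaceTokenizer)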
def test_spacy_featurizer_cls_vector(spacy_nlp):
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))

    featurizer._set_spacy_features(message)

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array(
        [-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]
    )

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)