Code Example #1
def test_count_vector_featurizer(sentence, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Code Example #2
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        import MicroTokenizer

        text = message.get(attribute)

        tokenized = MicroTokenizer.cut(text)

        tokens = []
        offset = 0
        for word in tokenized:
            tokens.append(Token(word, offset))
            offset += len(word)

        return tokens
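
A minimal usage sketch for the tokenize method above (assumptions: the method belongs to a custom Rasa 1.x tokenizer component, instantiated here as the hypothetical custom_tokenizer, MicroTokenizer is installed, and Message is imported from rasa.nlu.training_data as in the other examples):

from rasa.nlu.training_data import Message

# custom_tokenizer is a hypothetical instance of the component that defines
# the tokenize() method shown above
message = Message("今天天气不错")
tokens = custom_tokenizer.tokenize(message, attribute="text")
for token in tokens:
    # each Token carries the word and its character offset within the text
    print(token.text, token.offset)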
Code Example #3
def test_do_not_overwrite_any_entities():
    message = Message("Max lives in Berlin.")
    message.set(ENTITIES, [{
        "entity": "person",
        "value": "Max",
        "start": 0,
        "end": 3
    }])

    training_data = TrainingData()
    training_data.training_examples = [
        Message("Hi Max!",
                data={"entities": [{
                    "entity": "person",
                    "value": "Max"
                }]}),
        Message(
            "I live in Berlin",
            data={"entities": [{
                "entity": "city",
                "value": "Berlin"
            }]},
        ),
    ]
    training_data.lookup_tables = [{
        "name": "city",
        "elements": ["London", "Berlin", "Amsterdam"]
    }]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == [
        {
            "entity": "person",
            "value": "Max",
            "start": 0,
            "end": 3
        },
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]
Code Example #4
File: ltp.py Project: lsx0930/rasa_usage
    def process(self, message: Message, **kwargs: Any):
        """Process an incoming message.

        This is the components chance to process an incoming
        message. The component can rely on
        any context attribute to be present, that gets created
        by a call to :meth:`components.Component.pipeline_init`
        of ANY component and
        on any context attributes created by a call to
        :meth:`components.Component.process`
        of components previous to this one."""
        # TODO: tokenization; if another tokenizer component is used, this needs further adjustment
        if not message.get("tokens", default=None):
            self.extract_tokens(message)
            # part-of-speech tagging
            self.extract_poses(message)
            # dependency parsing
            self.extract_parses(message)
            # extract entities <sequence labeling + entity extraction>
            self.extract_entities(message)
            # extract pronouns
            self.extract_pronouns(message)
        else:
            # rasa tokenizers
            tokens = message.get("tokens")
            message.set("tokenizers", tokens)
            # List tokens
            tokens = [tokenizer_extract(token) for token in tokens]
            message.set("tokens", tokens)
            self.extract_poses(message)
            # dependency parsing
            self.extract_parses(message)
            # extract entities <sequence labeling + entity extraction>
            # semantic segmentation ->
            self.entity_segment(message)
            # attribute analysis ->
            self.link_analyze(message)
Code Example #5
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {
            "name": "plates",
            "elements": "data/test/lookup_tables/plates.txt"
        },
    ]
    ftr = RegexFeaturizer()
    ftr.add_lookup_tables(lookups)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(sentence)
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Code Example #6
File: train_utils.py Project: zylhub/rasa
def tokens_without_cls(
    message: Message, attribute: Text = TEXT
) -> Optional[List[Token]]:
    """Return tokens of given message without __CLS__ token.

    All tokenizers add a __CLS__ token to the end of the list of tokens for
    text and responses. The token captures the sentence features.

    Args:
        message: The message.
        attribute: Return tokens of provided attribute.

    Returns:
        Tokens without CLS token.
    """
    # return all tokens up to __CLS__ token for text and responses
    if attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
        tokens = message.get(TOKENS_NAMES[attribute])
        if tokens is not None:
            return tokens[:POSITION_OF_CLS_TOKEN]
        return None

    # we don't add the __CLS__ token for intents, return all tokens
    return message.get(TOKENS_NAMES[attribute])
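
A small sketch of how the helper above behaves (assumptions: Rasa 1.10-style constants and import paths, with tokens_without_cls living in rasa.utils.train_utils as the file header suggests, and POSITION_OF_CLS_TOKEN pointing at the trailing __CLS__ token):

from rasa.nlu.constants import TEXT, TOKENS_NAMES
from rasa.nlu.tokenizers.tokenizer import Token
from rasa.nlu.training_data import Message
from rasa.utils.train_utils import tokens_without_cls  # assumed import path

message = Message("hey there")
# tokens as a tokenizer would store them: the words plus the trailing __CLS__ token
message.set(TOKENS_NAMES[TEXT], [Token("hey", 0), Token("there", 4), Token("__CLS__", 10)])

# TEXT is a dense-featurizable attribute, so the trailing __CLS__ token is dropped
print([t.text for t in tokens_without_cls(message)])  # expected: ['hey', 'there']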
Code Example #7
    def process(self, message: Message, **kwargs: Any) -> None:
        """Process an incoming message"""
        entities = list(message.get('entities'))
        # Get file path of lookup table in json format
        cur_path = os.path.dirname(__file__)
        if os.name == 'nt':
            partial_lookup_file_path = '..\\data\\lookup_table.json'
        else:
            partial_lookup_file_path = '../data/lookup_table.json'
        lookup_file_path = os.path.join(cur_path, partial_lookup_file_path)

        with open(lookup_file_path, 'r') as file:
            lookup_data = json.load(file)
            tokens = message.get('tokens')
            for token in tokens:
                similarity_score = self.get_fuzzy_similarity(
                    token.text, lookup_data, self.threshold)
                if similarity_score is not None:
                    print("'" + token.text + "'" + " matches with " +
                          str(similarity_score[0]) + "[" +
                          similarity_score[2] + "]" + " with a score of: " +
                          str(similarity_score[1]))
                    for i, item in enumerate(entities):
                        # if the entity already exists, update it (the DIET classifier is higher in the hierarchy)
                        if item['entity'] == similarity_score[2]:
                            item.update({"value": similarity_score[0]})
                    entities.append({
                        "start": token.start,
                        "end": token.end,
                        "value": similarity_score[0],
                        "confidence": similarity_score[1],
                        "entity": similarity_score[2]
                    })

        message.set("entities", entities, add_to_output=True)
Code Example #8
File: test_mitie_featurizer.py Project: zylhub/rasa
def test_mitie_featurizer_train(mitie_feature_extractor):

    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    MitieTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        **{"mitie_feature_extractor": mitie_feature_extractor},
    )

    expected = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(message.get(TOKENS_NAMES[TEXT])) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])

    assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT])

    assert vecs is None
Code Example #9
def test_count_vector_featurizer_response_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT_ATTRIBUTE, intent)
    train_message.set(RESPONSE_ATTRIBUTE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE_ATTRIBUTE, "hi")
    second_message.set(INTENT_ATTRIBUTE, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        assert (train_message.get(
            SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0] ==
                intent_features)
    else:
        assert train_message.get(
            SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None

    if response_features:
        assert (train_message.get(
            SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0] ==
                response_features)
    else:
        assert train_message.get(
            SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) is None
Code Example #10
    def _from_text_to_crf(self,
                          message: Message,
                          entities: List[Text] = None) -> List[CRFToken]:
        """Takes a sentence and switches it to crfsuite format."""

        crf_format = []
        if self.pos_features:
            tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE])
        else:
            tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])

        text_dense_features = self.__get_dense_features(message)

        for i, token in enumerate(tokens):
            pattern = self.__pattern_of_token(message, i)
            entity = entities[i] if entities else "N/A"
            tag = self.__tag_of_token(token) if self.pos_features else None
            dense_features = (text_dense_features[i]
                              if text_dense_features is not None else [])

            crf_format.append(
                CRFToken(token.text, tag, entity, pattern, dense_features))

        return crf_format
Code Example #11
    def process(self, message: Message, **kwargs: Any) -> None:

        mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
        if not mitie_feature_extractor:
            raise Exception("Failed to train 'MitieFeaturizer'. "
                            "Missing a proper MITIE feature extractor.")

        ents = self.extract_entities(message.text,
                                     self._tokens_without_cls(message),
                                     mitie_feature_extractor)
        extracted = self.add_extractor_name(ents)
        extracted = self.clean_up_entities(message, extracted)
        message.set(ENTITIES,
                    message.get(ENTITIES, []) + extracted,
                    add_to_output=True)
Code Example #12
    def _get_processed_message_tokens_by_attribute(
        self, message: Message, attribute: Text = TEXT
    ) -> List[Text]:
        """Get processed text of attribute of a message"""

        if message.get(attribute) is None:
            # return empty list since sklearn countvectorizer does not like None
            # object while training and predicting
            return []

        tokens = self._get_message_tokens_by_attribute(message, attribute)
        tokens = self._process_tokens(tokens, attribute)
        tokens = self._replace_with_oov_token(tokens, attribute)

        return tokens
Code Example #13
def test_custom_intent_symbol(text, expected_tokens):
    component_config = {
        "intent_tokenization_flag": True,
        "intent_split_symbol": "+"
    }

    tk = MitieTokenizer(component_config)

    message = Message(text)
    message.set(INTENT, text)

    tk.train(TrainingData([message]))

    assert [t.text
            for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
Code Example #14
def test_spacy_ner_featurizer_config(spacy_nlp):
    from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer

    sentence = "hi there friend"
    doc = spacy_nlp(sentence)
    spacy_config = {"ner_feature_vectors": False}
    ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig())
    greet = {"intent": "greet", "text_features": [0.5]}
    message = Message(sentence, greet)
    message.set("spacy_doc", doc)
    ftr._set_spacy_features(message)
    ftr._set_spacy_ner_features(message)
    vecs = np.array(message.get("ner_features"))
    assert vecs.shape[0] == len(doc)
    assert vecs.shape[1] == 0
Code Example #15
    def _from_text_to_crf(
        self,
        message: Message,
        entities: List[Text] = None
    ) -> List[Tuple[Optional[Text], Optional[Text], Text, Dict[Text, Any],
                    Optional[Dict[Text, Any]]]]:
        """Takes a sentence and switches it to crfsuite format."""

        crf_format = []
        if self.pos_features:
            tokens = message.get("spacy_doc")
        else:
            tokens = message.get("tokens")
        ner_features = (self.__additional_ner_features(message)
                        if self.use_ner_features else None)
        for i, token in enumerate(tokens):
            pattern = self.__pattern_of_token(message, i)
            entity = entities[i] if entities else "N/A"
            tag = self.__tag_of_token(token) if self.pos_features else None
            custom_ner_features = ner_features[i] if self.use_ner_features else None
            crf_format.append(
                (token.text, tag, entity, pattern, custom_ner_features))
        return crf_format
Code Example #16
def test_spacy_ner_featurizer(sentence, expected, spacy_nlp):
    from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)
    token_vectors = [t.vector for t in doc]
    spacy_config = {"ner_feature_vectors": True}
    ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig())
    greet = {"intent": "greet", "text_features": [0.5]}
    message = Message(sentence, greet)
    message.set("spacy_doc", doc)
    ftr._set_spacy_features(message)
    ftr._set_spacy_ner_features(message)
    vecs = message.get("ner_features")[0][:5]
    assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4)
    assert np.allclose(vecs, expected, atol=1e-4)
Code Example #17
 def _set_spacy_ner_features(self, message: Message):
     """If we want to use spacy as an NER featurizer, set token vectors"""
     doc = message.get(MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE])
     if self.ner_feature_vectors:
         ner_features = np.array([t.vector for t in doc])
     else:
         ner_features = np.array([[] for t in doc])
     combined_features = self._combine_with_existing_features(
         message,
         ner_features,
         MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_ENTITIES_ATTRIBUTE],
     )
     message.set(
         MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_ENTITIES_ATTRIBUTE], combined_features
     )
Code Example #18
def test_convert_featurizer_train(component_builder):
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    tokens = tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(
        TrainingData([message]), RasaNLUModelConfig(), tf_hub_module=tokenizer.module
    )

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT])

    assert vecs is None
Code Example #19
def test_count_vector_featurizer_oov_token(sentence, expected):
    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "OOV_token": "__oov__"
    })
    train_message = Message(sentence)
    WhitespaceTokenizer().process(train_message)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(
        test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)
Code Example #20
    def _create_sparse_features(self, message: Message) -> None:
        """Convert incoming messages into sparse features using the configured
        features."""
        import scipy.sparse

        # [:-1] to remove CLS token
        tokens = message.get(TOKENS_NAMES[TEXT])[:-1]

        sentence_features = self._tokens_to_features(tokens)
        one_hot_feature_vector = self._features_to_one_hot(sentence_features)

        sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector)

        sparse_features = self._combine_with_existing_sparse_features(
            message, sparse_features, feature_name=SPARSE_FEATURE_NAMES[TEXT])
        message.set(SPARSE_FEATURE_NAMES[TEXT], sparse_features)
Code Example #21
File: test_lm_tokenizer.py Project: zylhub/rasa
def test_lm_tokenizer_number_of_sub_tokens(text, expected_number_of_sub_tokens):
    transformers_config = {"model_name": "bert"}  # Test for one should be enough

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer()

    message = Message(text)

    td = TrainingData([message])

    transformers_nlp.train(td)
    lm_tokenizer.train(td)

    assert [
        t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])[:-1]
    ] == expected_number_of_sub_tokens
Code Example #22
File: tokenizer.py Project: zuiwanting/rasa
    def _split_intent(self,
                      message: Message,
                      attribute: Text = INTENT) -> List[Token]:
        text = message.get(attribute)

        # for INTENT_RESPONSE_KEY attribute,
        # first split by RESPONSE_IDENTIFIER_DELIMITER
        if attribute == INTENT_RESPONSE_KEY:
            intent, response_key = text.split(RESPONSE_IDENTIFIER_DELIMITER)
            words = self._tokenize_on_split_symbol(
                intent) + self._tokenize_on_split_symbol(response_key)

        else:
            words = self._tokenize_on_split_symbol(text)

        return self._convert_words_to_tokens(words, text)
Code Example #23
    def set_fasttext_features(self, message: Message, attribute: Text = TEXT) -> None:
        tokens = message.get(TOKENS_NAMES[attribute])

        if not tokens:
            return None

        text_vector = self.model.get_word_vector(message.text)
        word_vectors = [
            self.model.get_word_vector(t.text)
            for t in train_utils.tokens_without_cls(message, attribute)
        ]
        X = np.array(word_vectors + [text_vector])  # remember, we need one for __CLS__

        features = self._combine_with_existing_dense_features(
            message, additional_features=X, feature_name=DENSE_FEATURE_NAMES[attribute]
        )
        message.set(DENSE_FEATURE_NAMES[attribute], features)
Code Example #24
    def process(self, message: Message, **kwargs: Any) -> None:
        import tensorflow as tf
        import numpy as np

        real_result_dir = os.path.join(self.model_dir, self.result_dir)
        # print(real_result_dir)

        if self.predict_fn is None:
            self.predict_fn = tf.keras.experimental.load_from_saved_model(
                real_result_dir)

        real_lookup_table_file = os.path.join(real_result_dir,
                                              self.lookup_table_file)
        # print(real_lookup_table_file)

        if self.lookup_table is None:
            with open(real_lookup_table_file, 'rt') as fd:
                self.lookup_table = json.load(fd)

        text_feature = message.get("text_features")
        np_feature = np.array([text_feature])

        predict_np_int = self.predict_fn.predict(np_feature)

        intent_score = []
        for intent_id, score in enumerate(predict_np_int[0]):
            # convert np.float32 to a vanilla float; otherwise ujson's json_dumps
            # raises "OverflowError: Maximum recursion level reached",
            # see https://github.com/esnme/ultrajson/issues/221
            float_score = float(score)
            intent_score.append((float_score, intent_id))

        reversed_lookup_table = {
            index: value
            for value, index in self.lookup_table.items()
        }
        intent_str_score = [(k, reversed_lookup_table[v])
                            for k, v in intent_score]

        sorted_intent_str_score = sorted(intent_str_score,
                                         key=lambda x: x[0],
                                         reverse=True)

        # print(sorted_intent_str_score)

        self._set_intent_output(message, sorted_intent_str_score)
Code Example #25
File: test_lm_tokenizer.py Project: zylhub/rasa
def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}

    transformers_config = {"model_name": "bert"}  # Test for one should be enough

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer(component_config)

    message = Message(text)
    message.set(INTENT, text)

    td = TrainingData([message])

    transformers_nlp.train(td)
    lm_tokenizer.train(td)

    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
Code Example #26
    def _create_sparse_features(self, message: Message) -> None:
        """Convert incoming messages into sparse features using the configured
        features."""
        import scipy.sparse

        # [:-1] to remove CLS token
        tokens = message.get(TOKENS_NAMES[TEXT])[:-1]

        sentence_features = self._tokens_to_features(tokens)
        one_hot_feature_vector = self._features_to_one_hot(sentence_features)

        sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector)

        final_features = Features(
            sparse_features, TEXT, self.component_config[FEATURIZER_CLASS_ALIAS]
        )
        message.add_features(final_features)
Code Example #27
File: test_featurizers.py Project: yungliu/rasa_nlu
def test_count_vector_featurizer(sentence, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"min_ngram": 1,
                                  "max_ngram": 2,
                                  "analyzer": 'char'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Code Example #28
 async def rebuild_original_text(example: Message) -> str:
     """
     Rebuilds original training text in Markdown form.
     """
     original_entities = example.get("entities")
     original_text = example.text
     if original_entities:
         original_text = list(original_text)
         for entity in sorted(original_entities,
                              key=lambda x: x.get("start"),
                              reverse=True):
             start = entity["start"]
             end = entity["end"]
             value = entity["value"]
             name = entity["entity"]
             original_text[start:end] = f"[{value}]({name})"
         original_text = "".join(original_text)
     return original_text
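
A worked example for the coroutine above (assumption: it is reachable as a module-level function; the entity offsets refer to positions in the plain text, as in the other examples):

import asyncio

from rasa.nlu.training_data import Message

example = Message(
    "Max lives in Berlin.",
    {
        "entities": [
            {"start": 0, "end": 3, "value": "Max", "entity": "person"},
            {"start": 13, "end": 19, "value": "Berlin", "entity": "city"},
        ]
    },
)

# entities are re-inserted from right to left so earlier offsets stay valid;
# expected output: "[Max](person) lives in [Berlin](city)."
print(asyncio.run(rebuild_original_text(example)))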
Code Example #29
File: saai_cli.py Project: samlet/saai
def testing_tokenizer(text, cls, lang='en'):
    from rasa.nlu.training_data import TrainingData, Message
    defaults = {
        # Flag to check whether to split intents
        "intent_tokenization_flag": False,
        # Symbol on which intent should be split
        "intent_split_symbol": "_",
        # text will be tokenized with case sensitive as default
        "case_sensitive": True,
        "lang": lang,
    }

    tok = cls(defaults)
    example = Message(text, {"intent": "wish", "entities": []})
    # tokenizer
    tok.process(example, x='.')
    for token in example.get("tokens"):
        print(token.text, token.offset)
Code Example #30
def test_spacy_featurizer_cls_vector(spacy_nlp):
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))

    featurizer._set_spacy_features(message)

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array(
        [-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)