def test_count_vector_featurizer_oov_words(sentence, expected):
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer, )

    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "OOV_token": "__oov__",
        "OOV_words": ["oov_word0", "OOV_word1"],
        "return_sequence": True,
    })
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(
        test_message.get("text_sparse_features").toarray()[0] == expected)
Example no. 2
    def process(self, message: Message, **kwargs: Any) -> None:

        mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
        if not mitie_feature_extractor:
            raise Exception("Failed to train 'MitieFeaturizer'. "
                            "Missing a proper MITIE feature extractor.")

        if self.clf:
            token_strs = self._tokens_of_message(message)
            intent, confidence = self.clf(token_strs, mitie_feature_extractor)
        else:
            # either the model didn't get trained or it wasn't
            # provided with any data
            intent = None
            confidence = 0.0

        message.set("intent", {
            "name": intent,
            "confidence": confidence
        },
                    add_to_output=True)
Example no. 3
def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)
    token_vectors = [t.vector for t in doc]

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    greet = {"intent": "greet", "text_features": [0.5]}

    message = Message(sentence, greet)
    message.set(SPACY_DOCS[TEXT], doc)

    ftr._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    vecs = seq_vecs[0][:5]

    assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4)
    assert np.allclose(vecs, expected, atol=1e-4)
    assert sen_vecs is not None
Example no. 4
def test_convert_featurizer_process():
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message)

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
Example no. 5
    def set_fasttext_features(self,
                              message: Message,
                              attribute: Text = TEXT) -> None:
        tokens = message.get(TOKENS_NAMES[attribute])

        if not tokens:
            return None

        text_vector = self.model.get_word_vector(message.text)
        word_vectors = [
            self.model.get_word_vector(t.text)
            for t in train_utils.tokens_without_cls(message, attribute)
        ]
        X = np.array(word_vectors +
                     [text_vector])  # remember, we need one for __CLS__

        features = self._combine_with_existing_dense_features(
            message,
            additional_features=X,
            feature_name=DENSE_FEATURE_NAMES[attribute])
        message.set(DENSE_FEATURE_NAMES[attribute], features)
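
The snippet above assumes `self.model` is an already-loaded fastText model. A minimal standalone sketch of how such a model could be obtained; the file name `cc.en.300.bin` is an assumption (any pretrained fastText binary would do):

import fasttext

# Assumed setup: load a pretrained fastText binary so that
# model.get_word_vector(...) is available, as used by the featurizer above.
model = fasttext.load_model("cc.en.300.bin")  # assumed path to pretrained vectors
vector = model.get_word_vector("hello")       # dense vector for a single token
print(vector.shape)                           # (300,) for the cc.en.300 vectors
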
Example no. 6
def test_count_vector_featurizer_char(sentence, expected):
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer, )

    ftr = CountVectorsFeaturizer({
        "min_ngram": 1,
        "max_ngram": 2,
        "analyzer": "char",
        "return_sequence": True
    })
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(
        test_message.get("text_sparse_features").toarray()[0] == expected)
Example no. 7
def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "use_shared_vocab": True
    })
    train_message = Message(sentence)

    # this is needed for a valid training example
    train_message.set("intent", intent)
    train_message.set("response", response)

    data = TrainingData([train_message])
    ftr.train(data)

    assert np.all(train_message.get("text_features") == text_features)
    assert np.all(train_message.get("intent_features") == intent_features)
    assert np.all(train_message.get("response_features") == response_features)
Example no. 8
    def process(self, message: Message, **kwargs: Any):

        spans = message.get("spans", [])
        pronouns = [span for span in spans if span['label'] == 'Pronoun']
        coreferences = []
        for pronoun in pronouns:
            ent = self.stag(pronoun, message)
            if ent:
                coreferences.append({
                    "pronoun": {
                        'start': pronoun['start'],
                        'end': pronoun['end']
                    },
                    "entity": {
                        'start': ent['start'],
                        'end': ent['end']
                    }
                })
        span_output_format(spans)
        message.set("coreferences", coreferences, add_to_output=True)
        logging.info("coref data: {}".format(message.data))
Example no. 9
def test_text_featurizer_using_pos(sentence, expected, spacy_nlp):
    featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]})

    train_message = Message(sentence)
    test_message = Message(sentence)

    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    SpacyTokenizer().process(train_message)
    SpacyTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))

    featurizer.process(test_message)

    actual = test_message.get_sparse_features(TEXT, [])

    assert isinstance(actual, scipy.sparse.coo_matrix)

    assert np.all(actual.toarray() == expected)
Example no. 10
def test_crf_use_dense_features(spacy_nlp):
    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._from_text_to_crf(message)
    features = crf_extractor._sentence_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    for i in range(0, len(message.data.get("text_dense_features")[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == message.data.get("text_dense_features")[0][i]
        )
Example no. 11
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # use an empty string instead of a real text string to make sure the
    # count vector can only come from the `tokens` feature;
    # using `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example no. 12
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(sentence, data={RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example no. 13
def test_elmo_featurizer_train():
    featurizer = ElmoFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE_ATTRIBUTE, sentence)
    tokens = WhitespaceTokenizer().tokenize(message, attribute=TEXT_ATTRIBUTE)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT_ATTRIBUTE)
    message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens)
    message.set(TOKENS_NAMES[RESPONSE_ATTRIBUTE], tokens)

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])

    assert vecs is None
Example no. 14
    def process(self, message: Message, **kwargs: Any) -> None:

        if self._url() is not None:
            reference_time = self._reference_time_from_message(message)
            matches = self._duckling_parse(message.text, reference_time)
            all_extracted = convert_duckling_format_to_rasa(matches)
            dimensions = self.component_config["dimensions"]
            extracted = DucklingHTTPExtractor.filter_irrelevant_entities(
                all_extracted, dimensions)
        else:
            extracted = []
            warnings.warn("Duckling HTTP component in pipeline, but no "
                          "`url` configuration in the config "
                          "file nor is `RASA_DUCKLING_HTTP_URL` "
                          "set as an environment variable.")

        extracted = self.add_extractor_name(extracted)
        message.set(
            ENTITIES_ATTRIBUTE,
            message.get(ENTITIES_ATTRIBUTE, []) + extracted,
            add_to_output=True,
        )
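
The extraction branch above only runs when a Duckling URL is configured. A minimal sketch of a component configuration that would satisfy the `self._url()` check; the URL, dimensions, and timezone values are placeholders, not settings from the original code:

# Hypothetical DucklingHTTPExtractor configuration; equivalent keys would
# normally be set on the component entry in the NLU pipeline configuration.
component_config = {
    "url": "http://localhost:8000",    # assumed address of a running Duckling server
    "dimensions": ["time", "number"],  # restrict extraction to these entity types
    "timezone": "Europe/Berlin",       # used when computing the reference time
}
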
Example no. 15
def test_mitie_featurizer_train(mitie_feature_extractor):

    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    MitieTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        **{"mitie_feature_extractor": mitie_feature_extractor},
    )

    expected = np.array([
        0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00,
        -8.26445103e00
    ])
    expected_cls = np.array(
        [0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    seq_vec, sen_vec = message.get_dense_features(TEXT, [])

    assert len(message.get(TOKENS_NAMES[TEXT])) == len(seq_vec)
    assert np.allclose(seq_vec[0][:5], expected, atol=1e-5)
    assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5)

    seq_vec, sen_vec = message.get_dense_features(RESPONSE, [])

    assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(seq_vec)
    assert np.allclose(seq_vec[0][:5], expected, atol=1e-5)
    assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5)

    seq_vec, sen_vec = message.get_dense_features(INTENT, [])

    assert seq_vec is None
    assert sen_vec is None
Example no. 16
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name":
            "drinks",
            "elements":
            ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {
            "name": "plates",
            "elements": "data/test/lookup_tables/plates.txt"
        },
    ]
    ftr = RegexFeaturizer()
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(sentence)
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example no. 17
    def process(self, message: Message, **kwargs: Any) -> None:
        if self.third_party_service_endpoint is not None:
            headers = {
                'Content-type': 'application/json',
                'Accept': 'application/json'
            }
            req = requests.post(self.third_party_service_endpoint,
                                data=json.dumps({"text": message.text}),
                                headers=headers)
            extracted = [
                self.transform_to_extracted(v) for v in req.json()
                if v["domainType"] != ""
            ]
        else:
            logger.warning(
                "Third party tokenizer component in pipeline, but no "
                "`third_party_service_endpoint` configuration in the config.")
            extracted = []
        extracted = self.add_extractor_name(extracted)
        message.set("entities",
                    message.get("entities", []) + extracted,
                    add_to_output=True)
Example no. 18
    def process(self, message: Message, **kwargs: Any) -> None:
        """Process incoming message and compute and set features"""

        if self.vectorizers is None:
            logger.error("There is no trained CountVectorizer: "
                         "component is either not trained or "
                         "didn't receive enough training data")
        else:
            message_text = self._get_message_text_by_attribute(
                message, attribute=MESSAGE_TEXT_ATTRIBUTE)

            bag = (self.vectorizers[MESSAGE_TEXT_ATTRIBUTE].transform(
                [message_text]).toarray().squeeze())
            message.set(
                MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
                self._combine_with_existing_features(
                    message,
                    bag,
                    feature_name=MESSAGE_VECTOR_FEATURE_NAMES[
                        MESSAGE_TEXT_ATTRIBUTE],
                ),
            )
Example no. 19
def test_convert_featurizer_process(component_builder):
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    tokens = tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message, tf_hub_module=tokenizer.module)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
Example no. 20
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer

    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example no. 21
    def process(self, message: Message, **kwargs: Any) -> None:
        from seq2label.server.paddle_inference import Inference

        real_result_dir = os.path.join(self.model_dir, self.result_dir)
        print(real_result_dir)

        # for cache
        if not self.predict_fn:
            self.predict_fn = Inference(real_result_dir)

        input_text = message.text

        best_result, candidate_ranking = self.predict_fn.infer(input_text)

        intent = {"name": best_result,
                  "confidence": candidate_ranking[0][1]}

        intent_ranking = [{"name": name,
                           "confidence": score}
                          for name, score in candidate_ranking]

        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", intent_ranking, add_to_output=True)
Example no. 22
def test_count_vector_featurizer_no_sequence(sentence, expected):
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer, )

    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "return_sequence": False
    })
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert isinstance(test_message.get("text_sparse_features"),
                      scipy.sparse.coo_matrix)

    actual = test_message.get("text_sparse_features").toarray()

    assert np.all(actual == expected)
Example no. 23
    def process(self, message: Message, **kwargs: Any) -> None:
        iu_list = message.get('iu_list')
        last_iu = iu_list[-1]
        iu_word, iu_type = last_iu
        if iu_type == "add":
            bag = self.CVF.vectorizers[MESSAGE_TEXT_ATTRIBUTE].transform(
                [iu_word]).toarray().squeeze()
            return message.set("text_features",
                               self._add_text_features(message, bag))
        elif iu_type == "revoke":
            return self._revoke(message, iu_word)
        else:
            logger.error("incompatible iu type, expected 'add' or 'revoke',"
                         " got '" + iu_type + "'")
Example no. 24
def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens):
    component_config = {
        "intent_tokenization_flag": True,
        "intent_split_symbol": "+"
    }

    transformers_config = {
        "model_name": "bert"
    }  # Test for one should be enough

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer(component_config)

    message = Message(text)
    message.set(INTENT, text)

    td = TrainingData([message])

    transformers_nlp.train(td)
    lm_tokenizer.train(td)

    assert [t.text
            for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
Example no. 25
    def process(self, message: Message, **kwargs: Any):

        self.request = self.http_session.open()
        # extracted = self.add_extractor_name(self.extract_entities(text, nlp))
        chains = self.extract_coref(message.text)
        self.request.close()

        if not chains:
            message.set("sentence", message.text)
        else:
            target_words = []
            flatten_chains = []
            for chain in chains:
                # work out the entity word (mention) for this chain
                words = [s['mention']
                         for s in chain]  # e.g. [pair('该', 'r'), pair('员工', 'n')]
                auxiliary_words = words[0]  # candidate entity word
                for i in range(1, len(words)):
                    flags = [s.flag for s in list(jposseg.cut(words[i]))]
                    if flags[0] != 'r' and len(
                            words[i]) > len(auxiliary_words):
                        auxiliary_words = words[i]
                target_words.append(auxiliary_words)

                # attach the replacement word to each span dict (sorted below)
                for s in chain:
                    s.update({'replace': auxiliary_words})
                flatten_chains.extend(chain)

            flatten_chains.sort(key=lambda x: x['start'], reverse=True)
            temp_sentence = list(message.text)

            for d in flatten_chains:
                temp_sentence[d['start']:d['end']] = d['replace']

            sentence = ''.join(temp_sentence)
            message.set("sentence", sentence)
Example no. 26
    def process(self, message: Message, **kwargs: Any):
        """Process an incoming message.

        This is the component's chance to process an incoming
        message. The component can rely on any context attribute
        created by a call to :meth:`components.Component.pipeline_init`
        of ANY component, and on any context attributes created by a
        call to :meth:`components.Component.process` of components
        previous to this one."""
        # TODO: tokenization; needs further adjustment if another
        # tokenizer component is used
        if not message.get("tokens", default=None):
            self.extract_tokens(message)
            # part-of-speech tagging
            self.extract_poses(message)
            # dependency parsing
            self.extract_parses(message)
            # extract entities (sequence labeling + entity extraction)
            self.extract_entities(message)
            # extract pronouns
            self.extract_pronouns(message)
        else:
            # rasa tokenizers
            tokens = message.get("tokens")
            message.set("tokenizers", tokens)
            # List tokens
            tokens = [tokenizer_extract(token) for token in tokens]
            message.set("tokens", tokens)
            self.extract_poses(message)
            # dependency parsing
            self.extract_parses(message)
            # extract entities (sequence labeling + entity extraction)
            # semantic segmentation ->
            self.entity_segment(message)
            # attribute analysis ->
            self.link_analyze(message)
Example no. 27
def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
    ftr = CountVectorsFeaturizer({"use_shared_vocab": True})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None
    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None
    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None
Example no. 28
    def process(self, message: Message, **kwargs: Any) -> None:
        """Process incoming message and compute and set features"""

        if self.vectorizers is None:
            logger.error("There is no trained CountVectorizer: "
                         "component is either not trained or "
                         "didn't receive enough training data")
            return

        attribute = TEXT_ATTRIBUTE
        message_tokens = self._get_processed_message_tokens_by_attribute(
            message, attribute)

        # features shape (1, seq, dim)
        features = self._create_sequence(attribute, [message_tokens])

        message.set(
            SPARSE_FEATURE_NAMES[attribute],
            self._combine_with_existing_sparse_features(
                message,
                features[0],  # 0 -> batch dimension
                feature_name=SPARSE_FEATURE_NAMES[attribute],
            ),
        )
Example no. 29
    def _set_spacy_features(self,
                            message: Message,
                            attribute: Text = TEXT) -> None:
        """Adds the spacy word vectors to the messages features."""
        doc = self.get_doc(message, attribute)

        if doc is None:
            return

        # in case an empty spaCy model was used, no vectors are present
        if doc.vocab.vectors_length == 0:
            logger.debug(
                "No features present. You are using an empty spaCy model.")
            return

        features = self._features_for_doc(doc)

        cls_token_vec = self._calculate_cls_vector(features,
                                                   self.pooling_operation)
        features = np.concatenate([features, cls_token_vec])

        features = self._combine_with_existing_dense_features(
            message, features, DENSE_FEATURE_NAMES[attribute])
        message.set(DENSE_FEATURE_NAMES[attribute], features)
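
The `__CLS__` vector appended above is a pooled summary of the per-token vectors. A minimal sketch of what a pooling helper such as `_calculate_cls_vector` might do, assuming mean and max pooling (the real component's supported operations may differ):

import numpy as np

def calculate_cls_vector(features: np.ndarray, pooling_operation: str) -> np.ndarray:
    # Pool over the token axis and keep a (1, dim) shape so the result can be
    # concatenated to the per-token feature matrix as the __CLS__ row.
    if pooling_operation == "max":
        return np.max(features, axis=0, keepdims=True)
    return np.mean(features, axis=0, keepdims=True)  # default: mean pooling
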
Example no. 30
def test_count_vector_featurizer_using_tokens(tokens, expected):

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # use an empty string instead of a real text string to make sure the
    # count vector can only come from the `tokens` feature;
    # using `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    ftr.process(test_message)

    assert np.all(
        test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected)