Ejemplo n.º 1
0
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Featurize from pre-set tokens only and compare against ``expected``.

    Both the training and the test message are created with an empty text,
    so the resulting count vector can only come from the ``tokens`` attribute
    and never from ``message.text``.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # one Token per input string; the second positional argument is 0 for all
    token_objects = [Token(word, 0) for word in tokens]

    # training example: empty text, explicit tokens, and an intent because
    # a valid training example needs one
    example = Message("")
    example.set("tokens", token_objects)
    example.set("intent", "bla")
    featurizer.train(TrainingData([example]))

    # the message under test carries the same tokens and, again, no text
    probe = Message("")
    probe.set("tokens", token_objects)
    featurizer.process(probe)

    actual = probe.get("text_features")
    assert np.all(actual == expected)
Ejemplo n.º 2
0
def test_count_vector_featurizer_persist_load(tmpdir):
    """Round-trip a trained featurizer through ``persist`` and ``load``.

    The loaded featurizer must expose the same vectorizer parameters
    (including the learned vocabulary) as the trained one, and must produce
    the same ``text_features`` for the training sentences.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    # deliberately non-default values for every option
    non_default_config = dict(
        analyzer="char",
        token_pattern=r"(?u)\b\w+\b",
        strip_accents="ascii",
        stop_words="stop",
        min_df=2,
        max_df=3,
        min_ngram=2,
        max_ngram=3,
        max_features=10,
        lowercase=False,
    )
    trained = CountVectorsFeaturizer(non_default_config)

    sentences = (
        "ababab 123 13xc лаомтгцу sfjv oö aà",
        "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà",
    )
    train_messages = [Message(text) for text in sentences]
    for msg in train_messages:
        # an intent is needed for a valid training example
        msg.set("intent", "bla")
    trained.train(TrainingData(train_messages))

    # persist the featurizer
    model_dir = tmpdir.strpath
    persisted = trained.persist("ftr", model_dir)

    # reference parameters: vectorizer settings plus the trained vocabulary
    expected_params = trained.vectorizer.get_params()
    expected_params["vocabulary"] = trained.vectorizer.vocabulary_

    # load the featurizer from the component config merged with the
    # dictionary returned by persist()
    load_meta = trained.component_config.copy()
    load_meta.update(persisted)
    restored = CountVectorsFeaturizer.load(load_meta, model_dir)

    assert expected_params == restored.vectorizer.get_params()

    test_messages = []
    for text in sentences:
        msg = Message(text)
        restored.process(msg)
        test_messages.append(msg)

    # features computed during training and after loading must be the same
    feature_matches = [
        before.get("text_features") == after.get("text_features")
        for before, after in zip(train_messages, test_messages)
    ]
    assert np.all(feature_matches)
Ejemplo n.º 3
0
def test_count_vector_featurizer(sentence, expected):
    """Train on a single sentence and check its ``text_features``.

    The featurizer is trained on ``sentence`` and then applied to a fresh
    message with the same text; the result must equal ``expected``
    element-wise.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    labelled = Message(sentence)
    # an intent is needed for a valid training example
    labelled.set("intent", "bla")
    featurizer.train(TrainingData([labelled]))

    unlabelled = Message(sentence)
    featurizer.process(unlabelled)

    features = unlabelled.get("text_features")
    assert np.all(features == expected)
Ejemplo n.º 4
0
def test_count_vector_featurizer_char(sentence, expected):
    """Check ``text_features`` for a char analyzer with 1- to 2-grams.

    The featurizer is trained on ``sentence`` and then applied to a fresh
    message with the same text; the result must equal ``expected``
    element-wise.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    char_config = {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    featurizer = CountVectorsFeaturizer(char_config)

    labelled = Message(sentence)
    # an intent is needed for a valid training example
    labelled.set("intent", "bla")
    featurizer.train(TrainingData([labelled]))

    unlabelled = Message(sentence)
    featurizer.process(unlabelled)

    features = unlabelled.get("text_features")
    assert np.all(features == expected)
Ejemplo n.º 5
0
def test_count_vector_featurizer_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    """Check the intent and response features set on a training message.

    Training is run on a single message built from ``sentence`` with the
    given ``intent`` and ``response``; afterwards the message's
    ``intent_features`` and ``response_features`` must match the expected
    values.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    train_message = Message(sentence)

    # this is needed for a valid training example
    train_message.set("intent", intent)
    train_message.set("response", response)

    data = TrainingData([train_message])
    ftr.train(data)

    # Reduce the comparison with np.all: asserting directly on an
    # element-wise array comparison raises ValueError ("truth value of an
    # array ... is ambiguous") as soon as a feature vector has more than one
    # element. np.all also accepts the plain bool produced when both sides
    # are scalars or None, so those cases keep working.
    assert np.all(train_message.get("intent_features") == intent_features)
    assert np.all(train_message.get("response_features") == response_features)
Ejemplo n.º 6
0
def test_count_vector_featurizer_char_intent_featurizer():
    """Training with a char analyzer must not set any ``intent_features``.

    Uses the demo data file and inspects every intent example afterwards.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    char_config = {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    featurizer = CountVectorsFeaturizer(char_config)

    demo_data = training_data.load_data("data/examples/rasa/demo-rasa.json")
    featurizer.train(demo_data, config=None)

    # one flag per intent example: True when intent features were set on it
    has_intent_features = np.array(
        [
            example.get("intent_features") is not None
            for example in demo_data.intent_examples
        ]
    )

    # no intent features should have been set
    assert not any(has_intent_features)
Ejemplo n.º 7
0
def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    """Check text, intent and response features with ``use_shared_vocab``.

    Training is run on a single message built from ``sentence``, ``intent``
    and ``response``; each of the three feature attributes on that message
    must then equal its expected value element-wise.
    """
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    shared_vocab_config = {
        "token_pattern": r"(?u)\b\w+\b",
        "use_shared_vocab": True,
    }
    featurizer = CountVectorsFeaturizer(shared_vocab_config)

    example = Message(sentence)
    # an intent is needed for a valid training example
    example.set("intent", intent)
    example.set("response", response)

    featurizer.train(TrainingData([example]))

    # attribute name -> expected value, checked in this order
    expectations = (
        ("text_features", text_features),
        ("intent_features", intent_features),
        ("response_features", response_features),
    )
    for attribute, wanted in expectations:
        assert np.all(example.get(attribute) == wanted)
Ejemplo n.º 8
0
class IncrementalCVF(IncrementalComponent):
    """Incremental wrapper around the non-incremental CountVectorsFeaturizer.

    Since this is a wrapper for the non-incremental CountVectorsFeaturizer
    to be used with our incremental EmbeddingIntentClassifier, we just need
    to take its provides, requires, and defaults.

    ``process`` looks at the last entry of the message's ``iu_list`` (a
    ``(word, type)`` pair) and adds the word's count vector to, or subtracts
    it from, the message's ``text_features``.
    """

    name = "IncrementalCVF"

    provides = CountVectorsFeaturizer.provides
    requires = CountVectorsFeaturizer.requires
    defaults = CountVectorsFeaturizer.defaults

    @classmethod
    def required_packages(cls) -> List[Text]:
        """Return the wrapped featurizer's required packages plus ``numpy``."""
        # Copy first so a list owned by CountVectorsFeaturizer is never
        # mutated by the append below.
        reqs = list(CountVectorsFeaturizer.required_packages())
        reqs.append("numpy")
        return reqs

    def __init__(self, component_config=None):
        """Initialise the component and create the wrapped featurizer."""
        super().__init__(component_config)

        # NOTE(review): the wrapped featurizer is built with its defaults;
        # `component_config` is not forwarded to it. Confirm this is intended.
        self.CVF = CountVectorsFeaturizer()

    # We don't have anything to clear since our features are stored
    # in the Message, which the IncrementalInterpreter clears.
    def new_utterance(self) -> None:
        """Do nothing; this component keeps no per-utterance state."""
        return

    def train(self,
              training_data: TrainingData,
              cfg: RasaNLUModelConfig = None,
              **kwargs: Any) -> None:
        """Delegate training to the wrapped CountVectorsFeaturizer."""
        return self.CVF.train(training_data, cfg, **kwargs)

    def _vectorize(self, word):
        """Return the squeezed dense count vector of a single word."""
        return self.CVF.vectorizers[MESSAGE_TEXT_ATTRIBUTE].transform(
            [word]).toarray().squeeze()

    # Similar to Featurizer's _combine_with_existing_text_features
    # Except we are doing a vector sum instead of array stack. This
    # is because we're adding the new features of that word in particular
    # rather than entire utterances side by side.
    def _add_text_features(self, message, additional_features):
        """Return the existing text features plus ``additional_features``.

        When the message has no text features yet, ``additional_features``
        is returned unchanged.
        """
        existing = message.get("text_features")
        if existing is not None:
            return np.add(existing, additional_features)
        return additional_features

    # On revoke, remove the word's features from the vector
    def _sub_text_features(self, message, to_sub):
        """Return the existing text features minus ``to_sub``.

        Returns ``None`` when the message has no text features, because
        there is nothing to subtract from.
        """
        existing = message.get("text_features")
        if existing is not None:
            return np.subtract(existing, to_sub)
        return None

    # assuming not using spacy_doc or tokens, so just setting message.text
    def process(self, message: Message, **kwargs: Any) -> None:
        """Apply the most recent incremental unit to ``text_features``.

        The last element of ``message.get('iu_list')`` is unpacked as
        ``(word, type)``. Type ``"add"`` adds the word's count vector to the
        message's text features, ``"revoke"`` subtracts it; any other type
        is logged as an error and the message is left untouched.
        """
        iu_list = message.get('iu_list')
        if not iu_list:
            # Nothing to process; indexing [-1] below would raise otherwise.
            logger.error("no incremental units found in message, "
                         "cannot update text features")
            return

        iu_word, iu_type = iu_list[-1]
        if iu_type == "add":
            bag = self._vectorize(iu_word)
            return message.set("text_features",
                               self._add_text_features(message, bag))
        elif iu_type == "revoke":
            return self._revoke(message, iu_word)
        else:
            # %s formatting also copes with a non-string iu_type, which the
            # previous string concatenation did not.
            logger.error("incompatible iu type, expected 'add' or 'revoke',"
                         " got '%s'", iu_type)

    # TODO: can we just subtract the vector instead of
    # storing previous features?
    def _revoke(self, message, word):
        """Subtract ``word``'s count vector from the message's text features.

        Revoking on a message without text features does nothing.
        """
        # The guard used to be inverted: it returned when features existed
        # and otherwise stored None as the message's text features.
        if message.get("text_features") is None:
            return

        bag = self._vectorize(word)
        return message.set("text_features",
                           self._sub_text_features(message, bag))

    def persist(self, file_name: Text,
                model_dir: Text) -> Optional[Dict[Text, Any]]:
        """Pickle this component to ``<model_dir>/<file_name>.pkl``.

        Returns a dict with the stored file name under the ``"file"`` key.
        """
        file_name = file_name + ".pkl"
        featurizer_file = os.path.join(model_dir, file_name)
        utils.json_pickle(featurizer_file, self)
        return {"file": file_name}

    @classmethod
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Text = None,
             model_metadata: Metadata = None,
             cached_component: Optional['IncrementalCVF'] = None,
             **kwargs: Any) -> 'IncrementalCVF':
        """Load a persisted component, or build a fresh one from ``meta``.

        When ``model_dir`` is given and ``meta`` names a file, the component
        is unpickled from that file. Otherwise a warning is logged and a new
        ``IncrementalCVF`` is created from ``meta``.
        """
        if model_dir and meta.get("file"):
            file_name = meta.get("file")
            featurizer_file = os.path.join(model_dir, file_name)
            # NOTE(review): unpickling can instantiate arbitrary objects;
            # only load model directories from trusted sources.
            return utils.json_unpickle(featurizer_file)
        else:
            # model_dir defaults to None, and os.path.abspath(None) raises
            # TypeError, so only resolve the path when there is one.
            shown_path = os.path.abspath(model_dir) if model_dir else model_dir
            logger.warning("Failed to load featurizer. Maybe path {} "
                           "doesn't exist".format(shown_path))
            return IncrementalCVF(meta)