def test_lookup_tables_without_use_word_boundaries(sentence, tokens, expected,
                                                   labeled_tokens):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
    from rasa.nlu.tokenizers.tokenizer import Token

    lookups = [
        {
            "name": "cites",
            "elements": ["北京", "上海", "广州", "深圳", "杭州"],
        },
        {
            "name": "dates",
            "elements": ["昨天", "今天", "明天", "后天"],
        },
    ]
    ftr = RegexFeaturizer({"use_word_boundaries": False})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(TOKENS_NAMES[TEXT],
                [Token(word, start) for (word, start) in tokens])

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
    def process(self, message: Message, **kwargs: Any) -> None:
        from seq2annotation.server.paddle_inference import Inference

        real_result_dir = os.path.join(self.model_dir, self.result_dir)
        print(real_result_dir)

        # load the inference model only once and cache it
        if not self.predict_fn:
            self.predict_fn = Inference(real_result_dir)

        input_text = message.text

        seq = self.predict_fn.infer(input_text)

        seq.span_set.fill_text(input_text)
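        # convert each predicted span into a Rasa-style entity dict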
        entity_set = []
        for span in seq.span_set:
            ent = {
                "entity": span.entity,
                "value": span.value,
                "start": span.start,
                "confidence": None,
                "end": span.end
            }
            entity_set.append(ent)

        extracted = self.add_extractor_name(entity_set)

        message.set("entities",
                    message.get("entities", []) + extracted,
                    add_to_output=True)
Example #3
    def process(self, message: Message, **kwargs: Any) -> None:
        from seq2label.input import to_fixed_len

        input_text = message.text

        input_feature = {
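            # bring the character list to a fixed length of 20, padding with '<pad>'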
            'words': [to_fixed_len([i for i in input_text], 20, '<pad>')],
        }

        print(input_feature)

        predictions = self.predict_fn(input_feature)
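        # the model output is returned as bytes, so decode the predicted label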
        label = predictions['label'][0].decode()

        intent = {"name": label, "confidence": 1}

        ranking = zip([i.decode() for i in predictions['label_mapping']],
                      [float(i) for i in predictions['label_prob'][0]])
        intent_ranking = [{
            "name": name,
            "confidence": score
        } for name, score in ranking]

        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", intent_ranking, add_to_output=True)
Example #4
    def process(self, message: Message, **kwargs: Any) -> None:
        extracted_entities = self._extract_entities(message)
        extracted_entities = self.add_extractor_name(extracted_entities)

        message.set(
            ENTITIES, message.get(ENTITIES, []) + extracted_entities, add_to_output=True
        )
def test_regex_featurizer_case_sensitive(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    case_sensitive: bool,
    spacy_nlp: Any,
):

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(
        {"case_sensitive": case_sensitive, "number_additional_patterns": 0},
        known_patterns=patterns,
    )

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray()[0], expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray()[-1], expected_sentence_features, atol=1e-10
    )
Example #6
    def process(self, message: Message, **kwargs: Any) -> None:
        urls: Set[Any] = set()
        last_pos = 0
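        # locate each URL in the raw text, starting each search at last_pos
        # so that repeated URLs get distinct character offsets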
        for url in self.extractor.gen_urls(message.data.get("text")):
            start = message.data.get("text").find(url, last_pos)
            end = start + len(url)
            last_pos = end
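            # store the entity as a tuple of dict items so the set can
            # deduplicate identical matches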
            urls.add(
                tuple(
                    {
                        "start": start,
                        "end": end,
                        "value": url,
                        "entity": "URL",
                        "extractor": self.name,
                        "confidence": 1.0,
                    }.items()
                )
            )
        entities = message.get("entities", []) + list(
            sorted(map(dict, urls), key=lambda x: x.get("start"))  # type: ignore
        )

        message.set(
            "entities",
            sorted(entities, key=lambda x: x.get("confidence", 0), reverse=True),
            add_to_output=True,
        )
Example #7
    def process(self, message: Message, **kwargs: Any) -> None:

        if self._url() is not None:
            # mod >
            params = kwargs
            timezone = self._timezone_from_config_or_request(
                self.component_config, params.get("timezone", None))
            reference_time = self._reference_time_from_message_or_request(
                message, params.get("reference_time", None))
            matches = self._duckling_parse(message.text, reference_time,
                                           timezone)
            # </ mod
            all_extracted = convert_duckling_format_to_rasa(matches)
            dimensions = self.component_config["dimensions"]
            extracted = DucklingEntityExtractor.filter_irrelevant_entities(
                all_extracted, dimensions)
        else:
            extracted = []
            raise_warning(
                "Duckling HTTP component in pipeline, but no "
                "`url` configuration in the config "
                "file nor is `RASA_DUCKLING_HTTP_URL` "
                "set as an environment variable. No entities will be extracted!",
                docs=DOCS_URL_COMPONENTS + "#ducklinghttpextractor",
            )

        extracted = self.add_extractor_name(extracted)
        message.set(
            ENTITIES,
            message.get(ENTITIES, []) + extracted,
            add_to_output=True,
        )
Example #8
def test_count_vector_featurizer_using_tokens(tokens, expected):
    ftr = CountVectorsFeaturizer()

    # use an empty string instead of real text to make sure the count
    # vector can only come from the `tokens` feature; relying on
    # `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message(data={TEXT: ""})
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message(data={TEXT: ""})
    test_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
Example #9
    def process(self, message: Message, **kwargs: Any) -> None:

        entities = message.get("entities", [])
        new_entities = []

        for entity in entities:
            config = self._find_entity(entity, self.entities)
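            # keep the entity untouched if it has no gazette config
            # or its value is not a string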
            if config is None or not isinstance(entity["value"], str):
                new_entities.append(entity)
                continue

            matches = process.extract(
                entity["value"],
                self.gazette.get(entity["entity"], []),
                limit=self.limit,
                scorer=config["mode"],
            )
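            # keep only the best fuzzy match, if any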
            primary, score = matches[0] if len(matches) else (None, None)

            if primary is not None and score > config["min_score"]:
                entity["value"] = primary
                entity["gazette_matches"] = [{
                    "value": value,
                    "score": num
                } for value, num in matches]
                new_entities.append(entity)

        message.set("entities", new_entities)
Example #10
def test_regex_featurizer_no_sequence(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    spacy_nlp: Any,
    create_featurizer: Callable[..., RegexFeaturizer],
    spacy_tokenizer: SpacyTokenizer,
):

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = create_featurizer(known_patterns=patterns)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray()[0], expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray()[-1], expected_sentence_features, atol=1e-10
    )
def test_text_featurizer_using_pos(sentence, expected, spacy_nlp):
    featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]})

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    SpacyTokenizer().process(train_message)
    SpacyTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))

    featurizer.process(test_message)

    seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert isinstance(seq_vec, scipy.sparse.coo_matrix)
    assert sen_vec is None

    assert np.all(seq_vec.toarray() == expected)
Example #12
    def process(self, message: Message, **kwargs: Any) -> None:
        """Return the most likely intent and its probability for a message."""

        if not self.clf:
            # component is either not trained or didn't
            # receive enough training data
            intent = None
            intent_ranking = []
        else:
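            # the classifier's predict expects a 2D array, so reshape the
            # sentence features to a single row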
            X = self._get_sentence_features(message).reshape(1, -1)

            intent_ids, probabilities = self.predict(X)
            intents = self.transform_labels_num2str(np.ravel(intent_ids))
            # `predict` returns a matrix because it supports multiple
            # examples at once, so we flatten the probabilities
            probabilities = probabilities.flatten()

            if intents.size > 0 and probabilities.size > 0:
                ranking = list(zip(list(intents),
                                   list(probabilities)))[:LABEL_RANKING_LENGTH]

                intent = {"name": intents[0], "confidence": probabilities[0]}

                intent_ranking = [{
                    "name": intent_name,
                    "confidence": score
                } for intent_name, score in ranking]
            else:
                intent = {"name": None, "confidence": 0.0}
                intent_ranking = []

        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", intent_ranking, add_to_output=True)
def test_process_does_not_overwrite_any_entities(
    create_or_load_extractor: Callable[..., RegexEntityExtractorGraphComponent],
):

    pre_existing_entity = {
        ENTITY_ATTRIBUTE_TYPE: "person",
        ENTITY_ATTRIBUTE_VALUE: "Max",
        ENTITY_ATTRIBUTE_START: 0,
        ENTITY_ATTRIBUTE_END: 3,
        EXTRACTOR: "other extractor",
    }
    message = Message(data={TEXT: "Max lives in Berlin.", INTENT: "infrom"})
    message.set(ENTITIES, [copy.deepcopy(pre_existing_entity)])

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [
                    {ENTITY_ATTRIBUTE_TYPE: "person", ENTITY_ATTRIBUTE_VALUE: "Max"}
                ],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [
                    {ENTITY_ATTRIBUTE_TYPE: "city", ENTITY_ATTRIBUTE_VALUE: "Berlin"}
                ],
            }
        ),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = create_or_load_extractor(config={})
    entity_extractor.train(training_data)
    entity_extractor.process([message])

    entities = message.get(ENTITIES)
    assert entities == [
        pre_existing_entity,
        {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: "Berlin",
            ENTITY_ATTRIBUTE_START: 13,
            ENTITY_ATTRIBUTE_END: 19,
            EXTRACTOR: RegexEntityExtractorGraphComponent.__name__,
        },
    ]
def test_convert_featurizer_process(component_builder, monkeypatch: MonkeyPatch):

    monkeypatch.setattr(
        ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )

    component_config = {"name": "ConveRTTokenizer", "model_url": RESTRICTED_ACCESS_URL}
    tokenizer = ConveRTTokenizer(component_config)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message, tf_hub_module=tokenizer.module)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)
Example #15
    def process(self, message: Message, **kwargs: Any) -> None:
        """Process an incoming message.

        This is the component's chance to process an incoming
        message. The component can rely on any context attribute
        created by a call to
        :meth:`components.Component.pipeline_init` of ANY component,
        and on any context attributes created by a call to
        :meth:`components.Component.process` of components earlier
        in the pipeline."""
        res = self.client.concepts.extract(message.get(TEXT),
                                           lang=self.lang,
                                           properties=self.properties,
                                           split=self.split,
                                           precision=self.precision)
        concepts = []
        for concept in res.concepts:
            for label in concept.labels:
                for mention in label.mentions:
                    concepts.append({
                        "value": label.text,
                        "start": mention.start,
                        "end": mention.end,
                        "entity": concept.id,
                        "properties": concept.properties,
                        "confidence": concept.weight
                    })
        all_extracted = self.add_extractor_name(concepts)
        dimensions = self.component_config.get("dimensions")
        extracted = self.filter_irrelevant_entities(all_extracted, dimensions)
        extracted = self.add_extractor_name(extracted)
        message.set("concepts",
                    message.get("concepts", []) + extracted,
                    add_to_output=True)
Example #16
    def process(self, message: Message, **kwargs: Any) -> None:
        """Process an incoming message.

        This is the component's chance to process an incoming
        message. The component can rely on any context attribute
        created by a call to
        :meth:`components.Component.pipeline_init` of ANY component,
        and on any context attributes created by a call to
        :meth:`components.Component.process` of components earlier
        in the pipeline."""
        res = self.client.entities.extract(message.get(TEXT))
        entities = [{
            "entity": ent.ent_type,
            "value": ent.text,
            "start": ent.start,
            "confidence": None,
            "end": ent.end,
        } for ent in res.entities]
        all_extracted = self.add_extractor_name(entities)
        dimensions = self.component_config.get("dimensions")
        extracted = self.filter_irrelevant_entities(all_extracted, dimensions)
        extracted = self.add_extractor_name(extracted)
        message.set(ENTITIES,
                    message.get(ENTITIES, []) + extracted,
                    add_to_output=True)
Example #17
def test_regex_featurizer_no_sequence(sentence, expected, expected_cls,
                                      spacy_nlp):

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray()[0], expected, atol=1e-10)
    assert np.allclose(sentence_features.toarray()[-1],
                       expected_cls,
                       atol=1e-10)
Example #18
def test_convert_featurizer_process(component_builder):
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(
        ConveRTFeaturizer)

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message, tf_hub_module=tokenizer.module)

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)
Example #19
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        """Tokenize the text using the ConveRT model.

        ConveRT adds a special char in front of (some) words and splits words into
        sub-words. To ensure the entity start and end values match the token values,
        reuse the tokens that are already assigned to the message. If individual tokens
        are split up into multiple sub-tokens, add this information to the
        respective tokens.
        """
        tokens_in = message.get(TOKENS_NAMES[attribute])

        tokens_out = []

        for token in tokens_in:
            # use ConveRT model to tokenize the text
            split_token_strings = self._tokenize(token.text)[0]

            # clean tokens (remove special chars and empty tokens)
            split_token_strings = self._clean_tokens(split_token_strings)

            token.set(NUMBER_OF_SUB_TOKENS, len(split_token_strings))

            tokens_out.append(token)

        message.set(TOKENS_NAMES[attribute], tokens_out)
        return tokens_out
Example #20
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
):
    ftr = CountVectorsFeaturizer({"use_lemma": use_lemma})

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    SpacyTokenizer().process(train_message)
    SpacyTokenizer().process(test_message)

    ftr.train(TrainingData([train_message]))

    ftr.process(test_message)

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_vecs.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs.features, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.features.toarray()
    actual_sen_vecs = sen_vecs.features.toarray()

    assert np.all(actual_seq_vecs[0] == sequence_features)
    assert np.all(actual_sen_vecs[-1] == sentence_features)
Example #21
def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)
    token_vectors = [t.vector for t in doc]

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    greet = {TEXT: sentence, "intent": "greet", "text_features": [0.5]}

    message = Message(data=greet)
    message.set(SPACY_DOCS[TEXT], doc)

    ftr._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    vecs = seq_vecs[0][:5]

    assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4)
    assert np.allclose(vecs, expected, atol=1e-4)
    assert sen_vecs is not None
def test_convert_training_examples(
    spacy_nlp: Any,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
):
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = SpacyTokenizer()
    count_vectors_featurizer = CountVectorsFeaturizer()
    spacy_featurizer = SpacyFeaturizer()

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    tokenizer.train(training_data)
    count_vectors_featurizer.train(training_data)
    spacy_featurizer.train(training_data)

    entity_tag_spec = [
        EntityTagSpec(
            "entity",
            {0: "O", 1: "name", 2: "location"},
            {"O": 0, "name": 1, "location": 2},
            3,
        )
    ]
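    # featurize the message for the requested attributes; entity labels
    # are encoded according to the tag spec above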
    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message],
        attributes=attributes,
        entity_tag_specs=entity_tag_spec,
    )

    assert len(output) == 1
    for attribute in attributes:
        assert attribute in output[0]
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in output[0]
    # we have sparse sentence, sparse sequence, dense sentence, and dense sequence
    # features in the list
    assert len(output[0][TEXT]) == 4
    if INTENT in attributes:
        # we will just have sparse sentence features
        assert len(output[0][INTENT]) == 1
    if ENTITIES in attributes:
        # we will just have sparse sentence features
        assert len(output[0][ENTITIES]) == len(entity_tag_spec)
    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes
Example #23
def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)

    featurizer.train(training_data)

    loaded_featurizer = RegexFeaturizerGraphComponent.load(
        RegexFeaturizerGraphComponent.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # The component should load in finetune mode with the same
    # patterns as before
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode

    new_lookups = [{
        "name": "plates",
        "elements": "data/test/lookup_tables/plates.txt"
    }]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern into an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
Example #24
    def process(self, message: Message, **kwargs: Any) -> None:
        intent_name = self._map_keyword_to_intent(message.get(TEXT))

        confidence = 0.0 if intent_name is None else 1.0
        intent = {"name": intent_name, "confidence": confidence}

        if message.get(INTENT) is None or intent is not None:
            message.set(INTENT, intent, add_to_output=True)
Example #25
def test_lookup_with_and_without_boundaries(
    sentence: Text,
    expected_sequence_features: List[List[float]],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    use_word_boundaries: bool,
    spacy_nlp: Any,
):
    ftr = RegexFeaturizer({
        "use_word_boundaries": use_word_boundaries,
        "number_additional_patterns": 0
    })
    training_data = TrainingData()

    # we use lookups because the "use_word_boundaries" flag is only used when
    # producing patterns from lookup tables
    lookups = [{"name": "how", "elements": ["how"]}]
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    (sequence_features,
     sentence_features) = ftr._features_for_patterns(message, TEXT)

    sequence_features = sequence_features.toarray()
    sentence_features = sentence_features.toarray()
    num_of_patterns = sum([len(lookup["elements"]) for lookup in lookups])
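    # the featurizer creates one feature column per lookup-table element
    # and one sequence row per token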
    assert sequence_features.shape == (
        len(message.get(TOKENS_NAMES[TEXT])),
        num_of_patterns,
    )
    num_of_lookup_tables = len(lookups)
    assert sentence_features.shape == (num_of_lookup_tables, num_of_patterns)

    # sequence_features should be {0,1} for each token: 1 if match, 0 if not
    assert np.allclose(sequence_features,
                       expected_sequence_features,
                       atol=1e-10)
    # sentence_features should be {0,1} for each lookup table: 1 if sentence
    # contains match from that table, 0 if not
    assert np.allclose(sentence_features,
                       expected_sentence_features,
                       atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        # labeled_tokens should list the token(s) which match a pattern
        assert num_matches == labeled_tokens.count(i)
def test_persist_load_for_finetuning(tmp_path: Path):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    persist_value = featurizer.persist("ftr", str(tmp_path))

    # Test all artifacts stored as part of persist
    assert persist_value["file"] == "ftr"
    assert (tmp_path / "ftr.patterns.pkl").exists()
    assert (tmp_path / "ftr.vocabulary_stats.pkl").exists()
    assert featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 3,
    }

    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"],},
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    # The component should load in finetune mode with the same
    # patterns and vocabulary statistics as before
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode
    assert loaded_featurizer.pattern_vocabulary_stats == featurizer.vocabulary_stats

    new_lookups = [{"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern into an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
    assert loaded_featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 4,
    }
def test_do_not_overwrite_any_entities():
    message = Message(data={TEXT: "Max lives in Berlin.", INTENT: "infrom"})
    message.set(ENTITIES, [{
        "entity": "person",
        "value": "Max",
        "start": 0,
        "end": 3
    }])

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{
                    "entity": "person",
                    "value": "Max"
                }],
            }),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "city",
                    "value": "Berlin"
                }],
            }),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == [
        {
            "entity": "person",
            "value": "Max",
            "start": 0,
            "end": 3
        },
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]
Example #28
def test_regex_featurizer_train(
    create_featurizer: Callable[..., RegexFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = create_featurizer()
    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    whitespace_tokenizer.process_training_data(TrainingData([message]))
    training_data = TrainingData([message], regex_features=patterns)

    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None
def test_regex_featurizer_train():

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 0}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None
Example #30
def test_regex_featurizer(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[int],
    additional_vocabulary_size: int,
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(
        {"number_additional_patterns": additional_vocabulary_size},
        known_patterns=patterns,
    )

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(data={TEXT: sentence, RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray(),
                       expected_sequence_features,
                       atol=1e-10)
    assert np.allclose(sentence_features.toarray(),
                       expected_sentence_features,
                       atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)