Example #1
def fetch_info_from_message(interpreter, text_input):
    msg = Message({TEXT: text_input})
    blob = interpreter.interpreter.parse(text_input)
    nlu_dict = interpreter.featurize_message(msg).as_dict_nlu()
    tokens = [t.text for t in nlu_dict["text_tokens"]]
    return blob, nlu_dict, tokens
Example #2
def test_container_keys():
    message_data_list = [{INTENT: "1"}, {INTENT: "2"}, {TEXT: "3", "other": 3}]
    container = MessageContainerForCoreFeaturization()
    container.add_all([Message(data=data) for data in message_data_list])
    assert set(container.keys(INTENT)) == {"1", "2"}
    assert set(container.keys(TEXT)) == {"3"}
Example #3
def test_container_fingerprints_differ_for_different_containers():
    container1 = MessageContainerForCoreFeaturization()
    container1.add(Message(data={INTENT: "1"}))
    container2 = MessageContainerForCoreFeaturization()
    container2.add(Message(data={INTENT: "2"}))
    assert container2.fingerprint() != container1.fingerprint()
Example #4
def test_count_vector_featurizer_action_attribute_featurization(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b",})
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(ACTION_NAME, action_name)
    train_message.set(ACTION_TEXT, action_text)

    # add a second example that has an action text, so that the vocabulary for
    # action text exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(ACTION_TEXT, "hi")
    second_message.set(ACTION_NAME, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    action_name_seq_vecs, action_name_sen_vecs = train_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        ACTION_TEXT, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if action_name_features:
        assert np.all(action_name_seq_vecs.toarray()[0] == action_name_features)
        assert action_name_sen_vecs is None
    else:
        assert action_name_seq_vecs is None
        assert action_name_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Example #5
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert np.all(intent_seq_vecs.toarray()[0] == intent_features)
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Example #6
def test_regex_featurizer_train():

    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]

    featurizer = RegexFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message], regex_features=patterns),
                     RasaNLUModelConfig())

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None
Example #7
    def unpack_regex_message(
        message: Message,
        domain: Optional[Domain] = None,
        entity_extractor_name: Optional[Text] = None,
    ) -> Message:
        """Unpacks the message if `TEXT` contains an encoding of attributes.

        Args:
            message: some message
            domain: the domain
            entity_extractor_name: An extractor name which should be added for the
                entities.

        Returns:
            the given message if that message does not need to be unpacked, and a new
            message with the extracted attributes otherwise
        """
        user_text = message.get(TEXT).strip()

        # If the prefix doesn't match, we don't even need to try to match the pattern.
        if not user_text.startswith(INTENT_MESSAGE_PREFIX):
            return message

        # Try to match the pattern.
        match = YAMLStoryReader._regex_message_pattern().match(user_text)

        # If it doesn't match, then (potentially) something went wrong, because the
        # message text did start with the special prefix -- however, a user might
        # just have decided to start their text this way.
        if not match:
            logger.warning(
                f"Failed to parse intent and entities from '{user_text}'.")
            return message

        # Extract attributes from the match - and validate it via the domain.
        intent_name = YAMLStoryReader._intent_name_from_regex_match(
            match, domain)
        confidence = YAMLStoryReader._confidences_from_regex_match(match)
        entities = YAMLStoryReader._entities_from_regex_match(
            match, domain, entity_extractor_name)

        # The intent name is *not* optional, but during parsing we might find out
        # that the given intent is unknown (and warn). In this case, stop here.
        if intent_name is None:
            return message

        if match.group("rest"):
            rasa.shared.utils.io.raise_warning(
                f"Failed to parse arguments in line '{match.string}'. "
                f"Failed to interpret some parts. "
                f"Continuing without {match.group('rest')}. ",
                docs=DOCS_URL_STORIES,
            )

        # Add the results to the message.
        intent_data = {
            INTENT_NAME_KEY: intent_name,
            PREDICTED_CONFIDENCE_KEY: confidence,
        }
        intent_ranking = [{
            INTENT_NAME_KEY: intent_name,
            PREDICTED_CONFIDENCE_KEY: confidence,
        }]
        message_data = {}
        message_data[TEXT] = user_text
        message_data[INTENT] = intent_data
        message_data[INTENT_RANKING_KEY] = intent_ranking
        message_data[ENTITIES] = entities
        return Message(message_data,
                       output_properties=set(message_data.keys()))
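The method above consumes Rasa's shorthand notation for user messages, where the text starts with the `/` prefix and directly encodes the intent, an optional confidence, and entities, e.g. `/greet{"name": "Bob"}`. Below is a minimal, hypothetical usage sketch; the toy domain and message are illustrative and not taken from the original source.

# Hypothetical usage sketch (not from the original source): assumes a Rasa 3.x
# environment where `unpack_regex_message` is exposed as a static method of
# YAMLStoryReader, and a toy domain declaring the referenced intent and entity.
from rasa.shared.core.domain import Domain
from rasa.shared.core.training_data.story_reader.yaml_story_reader import (
    YAMLStoryReader,
)
from rasa.shared.nlu.constants import ENTITIES, INTENT, TEXT
from rasa.shared.nlu.training_data.message import Message

domain = Domain.from_yaml(
    """
    version: "3.1"
    intents:
      - greet
    entities:
      - name
    """
)

# '/greet{"name": "Bob"}' is the shorthand encoding the method unpacks.
message = Message({TEXT: '/greet{"name": "Bob"}'})
unpacked = YAMLStoryReader.unpack_regex_message(message, domain=domain)

print(unpacked.get(INTENT))    # e.g. {"name": "greet", "confidence": 1.0}
print(unpacked.get(ENTITIES))  # e.g. [{"entity": "name", "value": "Bob", ...}]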
Example #8
def test_is_core_or_domain_message(
    message: Message, result: bool,
):
    assert result == message.is_core_or_domain_message()
Example #9
def test_add_diagnostic_data_with_repeated_component_raises_warning():
    message = Message()
    message.add_diagnostic_data("a", {})
    with pytest.warns(UserWarning):
        message.add_diagnostic_data("a", {})
Example #10
def test_fingerprint_is_same_when_loading_data_again():
    from rasa.shared.importers.utils import training_data_from_paths

    files = [
        "data/examples/rasa/demo-rasa.md",
        "data/examples/rasa/demo-rasa-responses.md",
    ]
    td1 = training_data_from_paths(files, language="en")
    td2 = training_data_from_paths(files, language="en")
    assert td1.fingerprint() == td2.fingerprint()


@pytest.mark.parametrize(
    "message",
    [
        Message({INTENT: "intent2"}),
        Message({ENTITIES: [{"entity": "entity2"}]}),
        Message({ENTITIES: [{"entity": "entity1", "group": "new_group"}]}),
        Message({ENTITIES: [{"entity": "entity1", "role": "new_role"}]}),
        Message({ACTION_NAME: "action_name2"}),
    ],
)
def test_label_fingerprints(message: Message):
    training_data1 = TrainingData(
        [
            Message({INTENT: "intent1"}),
            Message({ENTITIES: [{"entity": "entity1"}]}),
            Message({ACTION_NAME: "action_name1"}),
        ]
    )
    training_data2 = training_data1.merge(TrainingData([message]))
Example #11
    features: Optional[List[Features]],
    attribute: Text,
    featurizers: List[Text],
    expected: bool,
):
    message = Message(data={TEXT: "This is a test sentence."}, features=features)

    actual = message.features_present(attribute, featurizers)

    assert actual == expected


@pytest.mark.parametrize(
    "message, result",
    [
        (Message({INTENT: "intent", TEXT: "text"}), False),
        (Message({RESPONSE: "response", TEXT: "text"}), False),
        (Message({INTENT: "intent"}), True),
        (Message({ACTION_TEXT: "action text"}), True),
        (Message({ACTION_NAME: "action name"}), True),
        (Message({TEXT: "text"}), True),
    ],
)
def test_is_core_or_domain_message(
    message: Message, result: bool,
):
    assert result == message.is_core_or_domain_message()


def test_add_diagnostic_data_with_repeated_component_raises_warning():
    message = Message()
Example #12
def test_spacy_featurizer_train(spacy_nlp):

    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    message.set(SPACY_DOCS[RESPONSE], spacy_nlp(sentence))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert 5 == len(seq_vecs)
    assert 1 == len(sen_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sen_vecs = message.get_dense_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert 5 == len(seq_vecs)
    assert 1 == len(sen_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sen_vecs = message.get_dense_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert seq_vecs is None
    assert sen_vecs is None
    def process(self, message: Message, **kwargs: Any) -> None:
        from tokenizer_tools.tagset.NER.BILUO import BILUOSequenceEncoderDecoder
        from tokenizer_tools.tagset.offset.sequence import Sequence

        decoder = BILUOSequenceEncoderDecoder()

        real_result_dir = os.path.join(self.model_dir, self.result_dir)
        print(real_result_dir)

        input_text = message.text

        input_feature = {
            'words': [[i for i in input_text]],
            'words_len': [len(input_text)],
        }

        print(input_feature)

        predictions = self.predict_fn(input_feature)
        tags = predictions['tags'][0]
        # print(predictions['tags'])

        # decode Unicode
        tags_seq = [i.decode() for i in tags]

        print(tags_seq)

        # BILUO to offset
        failed = False
        try:
            seq = decoder.to_offset(tags_seq, input_text)
        except Exception as e:
            # an invalid tag sequence raises an exception,
            # so return an empty result
            logger.error("Decode error: {}".format(e))
            seq = Sequence(input_text)
            failed = True
        # print(seq)

        print(seq, tags_seq, failed)

        entity_set = []

        seq.span_set.fill_text(input_text)

        for span in seq.span_set:
            ent = {
                "entity": span.entity,
                "value": span.value,
                "start": span.start,
                "confidence": None,
                "end": span.end
            }

            entity_set.append(ent)

        extracted = self.add_extractor_name(entity_set)

        message.set("entities",
                    message.get("entities", []) + extracted,
                    add_to_output=True)
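
The `decoder.to_offset(...)` call above turns a per-character BILUO tag sequence into entity spans with character offsets. As a rough, dependency-free illustration of that decoding step (a hypothetical helper, not the `tokenizer_tools` API):

# Standalone sketch of BILUO-to-span decoding; `biluo_to_spans` is a hypothetical
# helper that mirrors what decoder.to_offset() is used for above.
from typing import List, Tuple


def biluo_to_spans(tags: List[str], text: str) -> List[Tuple[int, int, str, str]]:
    spans, start = [], None
    for i, tag in enumerate(tags):
        prefix, _, label = tag.partition("-")
        if prefix == "U":           # unit-length entity
            spans.append((i, i + 1, label))
        elif prefix == "B":         # beginning of a multi-character entity
            start = i
        elif prefix == "L" and start is not None:  # last character closes the span
            spans.append((start, i + 1, label))
            start = None
        elif prefix in ("O", ""):   # outside any entity
            start = None
    return [(s, e, label, text[s:e]) for s, e, label in spans]


print(biluo_to_spans(["O", "B-city", "L-city", "O", "O"], "去上海玩吧"))
# -> [(1, 3, 'city', '上海')]
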
def test_encode_state__with_lookup__looksup_or_creates_features(action_name: Text):
    """Tests that features from the lookup table are combined or created from scratch.

    If the given action name is ...
    - ACTION_LISTEN_NAME, then the user substate and the action name are encoded
    - some "other" action, then the user substate is not encoded but the action name is
    - set to `None`, then the action name is removed from the previous action substate
      and, as a result, there is no encoding for the action name or the user substate
    """
    f = SingleStateFeaturizer()
    f._default_feature_states[INTENT] = {"greet": 0, "inform": 1}
    f._default_feature_states[ENTITIES] = {
        "city": 0,
        "name": 1,
        f"city{ENTITY_LABEL_SEPARATOR}to": 2,
        f"city{ENTITY_LABEL_SEPARATOR}from": 3,
    }
    f._default_feature_states[ACTION_NAME] = {
        "NOT_action_listen": 0,
        "utter_greet": 1,
        ACTION_LISTEN_NAME: 2,
    }
    # `_0` in slots represent feature dimension
    f._default_feature_states[SLOTS] = {"slot_1_0": 0, "slot_2_0": 1, "slot_3_0": 2}
    f._default_feature_states[ACTIVE_LOOP] = {
        "active_loop_1": 0,
        "active_loop_2": 1,
        "active_loop_3": 2,
        "active_loop_4": 3,
    }

    # create state
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entity_name_list = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    action_text = "throw a ball"
    intent = "inform"
    state = {
        USER: {TEXT: text, INTENT: intent, ENTITIES: entity_name_list,},
        PREVIOUS_ACTION: {ACTION_NAME: action_name, ACTION_TEXT: action_text,},
        ACTIVE_LOOP: {"name": "active_loop_4"},
        SLOTS: {"slot_1": (1.0,)},
    }
    if action_name is None:
        del state[PREVIOUS_ACTION][ACTION_NAME]

    # Build lookup table with all relevant information - and dummy features for all
    # dense featurizable attributes.
    # Note that we don't need to add the `ENTITIES` to the message including `TEXT`
    # here because `encode_state` won't featurize the entities using the lookup table
    # (only `encode_entities` does that).
    units = 300
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add_all(
        [
            Message(
                data={TEXT: text, TOKENS_NAMES[TEXT]: tokens},
                features=[
                    dummy_features(
                        fill_value=11,
                        units=units,
                        attribute=TEXT,
                        type=SENTENCE,
                        is_sparse=True,
                    ),
                    dummy_features(
                        fill_value=12,
                        units=units,
                        attribute=TEXT,
                        type=SEQUENCE,
                        is_sparse=False,
                    ),
                    # Note: sparse sequence feature is last here
                    dummy_features(
                        fill_value=13,
                        units=units,
                        attribute=TEXT,
                        type=SEQUENCE,
                        is_sparse=True,
                    ),
                ],
            ),
            Message(data={INTENT: intent}),
            Message(
                data={ACTION_TEXT: action_text},
                features=[
                    dummy_features(
                        fill_value=1,
                        units=units,
                        attribute=ACTION_TEXT,
                        type=SEQUENCE,
                        is_sparse=True,
                    )
                ],
            ),
        ]
    )
    if action_name is not None:
        precomputations.add(Message(data={ACTION_NAME: action_name}))

    # encode the state
    encoded = f.encode_state(state, precomputations=precomputations,)

    # check all the features are encoded and *_text features are encoded by a
    # dense featurizer
    expected_attributes = [SLOTS, ACTIVE_LOOP, ACTION_TEXT]
    if action_name is not None:  # i.e. we did not remove it from the state
        expected_attributes += [ACTION_NAME]
    if action_name == ACTION_LISTEN_NAME:
        expected_attributes += [TEXT, ENTITIES, INTENT]
    assert set(encoded.keys()) == set(expected_attributes)

    # Remember, sparse sequence features come first (and `.features` denotes the matrix
    # not a `Features` object)
    if action_name == ACTION_LISTEN_NAME:
        assert encoded[TEXT][0].features.shape[-1] == units
        assert encoded[TEXT][0].is_sparse()
        assert encoded[ENTITIES][0].features.shape[-1] == 4
        assert sparse_equals_dense(encoded[INTENT][0].features, np.array([[0, 1]]))
    assert encoded[ACTION_TEXT][0].features.shape[-1] == units
    assert encoded[ACTION_TEXT][0].is_sparse()
    if action_name is not None:
        if action_name == "NOT_action_listen":
            action_name_encoding = [1, 0, 0]
        else:  # action_listen
            action_name_encoding = [0, 0, 1]
        assert sparse_equals_dense(
            encoded[ACTION_NAME][0].features, np.array([action_name_encoding])
        )
    else:
        assert ACTION_NAME not in encoded
    assert sparse_equals_dense(encoded[SLOTS][0].features, np.array([[1, 0, 0]]))
    assert sparse_equals_dense(
        encoded[ACTIVE_LOOP][0].features, np.array([[0, 0, 0, 1]])
    )
 Message(
     data={
         TEXT:
         "some message",
         INTENT: {
             INTENT_NAME_KEY: "greet",
             PREDICTED_CONFIDENCE_KEY: 0.234891876578331,
         },
         INTENT_RANKING_KEY: [
             {
                 INTENT_NAME_KEY: "greet",
                 PREDICTED_CONFIDENCE_KEY: 0.234891876578331,
             },
             {
                 INTENT_NAME_KEY: "stop",
                 PREDICTED_CONFIDENCE_KEY: 0.5 - 0.0001,
             },
             {
                 INTENT_NAME_KEY: "affirm",
                 PREDICTED_CONFIDENCE_KEY: 0
             },
             {
                 INTENT_NAME_KEY: "inform",
                 PREDICTED_CONFIDENCE_KEY: -100
             },
             {
                 INTENT_NAME_KEY: "deny",
                 PREDICTED_CONFIDENCE_KEY: 0.0879683718085289,
             },
         ],
     }),
def test_encode_entities__with_bilou_entity_roles_and_groups():

    # Instantiate domain and configure the single state featurizer for this domain.
    # Note that there are 2 entity tags here.
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain, bilou_tagging=True)

    # (1) example with both entities

    # create message that has been tokenized and where entities have been extracted
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # encode!
    encoded = f.encode_entities(
        {TEXT: text, ENTITIES: entities,},
        precomputations=precomputations,
        bilou_tagging=True,
    )
    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [4], [0], [8]]
    )

    # (2) example with only the "city" entity

    # create message that has been tokenized and where entities have been extracted
    text = "I am flying to Saint Petersburg"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: "Saint Petersburg",
            ENTITY_ATTRIBUTE_START: 15,
            ENTITY_ATTRIBUTE_END: 31,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # encode!
    encoded = f.encode_entities(
        {TEXT: text, ENTITIES: entities,},
        precomputations=precomputations,
        bilou_tagging=True,
    )
    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
    assert np.all(encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [3]])
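The expected values above follow from the tag-to-index layout the featurizer builds for BILOU tagging: index 0 means "no entity", and each entity tag gets four consecutive indices for its BILOU variants. A small sketch of that assumed layout (the prefix order is inferred from the expected values in this test, not quoted from the Rasa source):

# Assumed tag-index layout, reconstructed from the expected values in the test:
# 0 is reserved for "no entity", and each entity tag gets four consecutive ids
# for its B/I/L/U variants.
BILOU_PREFIXES = ["B-", "I-", "L-", "U-"]  # order assumed to match [1], [3], [4], [8]
entity_tags = ["city", "city#to"]  # "city#to" stands in for f"city{ENTITY_LABEL_SEPARATOR}to"

tag_ids = {"O": 0}
for tag_idx, tag in enumerate(entity_tags):
    for prefix_idx, prefix in enumerate(BILOU_PREFIXES):
        tag_ids[prefix + tag] = tag_idx * len(BILOU_PREFIXES) + prefix_idx + 1

print(tag_ids["U-city"])     # 4  -> "London"
print(tag_ids["U-city#to"])  # 8  -> "Paris"
print(tag_ids["B-city"], tag_ids["L-city"])  # 1 3 -> "Saint Petersburg"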
Example #18
    def _features_for_patterns(
        self, message: Message, attribute: Text
    ) -> Tuple[Optional[scipy.sparse.coo_matrix],
               Optional[scipy.sparse.coo_matrix]]:
        """Checks which known patterns match the message.

        Given a sentence, returns a vector of {1,0} values indicating which
        regexes did match. Furthermore, if the
        message is tokenized, the function will mark all tokens with a dict
        relating the name of the regex to whether it was matched.

        Args:
            message: Message to be featurized.
            attribute: Attribute of message to be featurized.

        Returns:
           Token and sentence level features of message attribute.
        """
        # Attribute not set (e.g. response not present)
        if not message.get(attribute):
            return None, None

        tokens = message.get(TOKENS_NAMES[attribute], [])

        if not tokens:
            # nothing to featurize
            return None, None

        flags = 0  # default flag
        if not self.case_sensitive:
            flags = re.IGNORECASE

        sequence_length = len(tokens)

        num_patterns = len(self.known_patterns)

        sequence_features = np.zeros([sequence_length, num_patterns])
        sentence_features = np.zeros([1, num_patterns])

        for pattern_index, pattern in enumerate(self.known_patterns):
            matches = re.finditer(pattern["pattern"],
                                  message.get(attribute),
                                  flags=flags)
            matches = list(matches)

            for token_index, t in enumerate(tokens):
                patterns = t.get("pattern", default={})
                patterns[pattern["name"]] = False

                for match in matches:
                    if t.start < match.end() and t.end > match.start():
                        patterns[pattern["name"]] = True
                        sequence_features[token_index][pattern_index] = 1.0
                        if attribute in [RESPONSE, TEXT, ACTION_TEXT]:
                            # sentence vector should contain all patterns
                            sentence_features[0][pattern_index] = 1.0

                t.set("pattern", patterns)

        return (
            scipy.sparse.coo_matrix(sequence_features),
            scipy.sparse.coo_matrix(sentence_features),
        )
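As a rough standalone illustration of what `_features_for_patterns` computes, the sketch below reproduces the {1,0} match matrices with plain `re`/`numpy` and whitespace token spans instead of `Message`/`Token` objects (illustrative only):

# Minimal sketch of the pattern-matching loop above, using plain regexes and
# whitespace token spans instead of Message/Token objects.
import re

import numpy as np
import scipy.sparse

known_patterns = [
    {"name": "number", "pattern": "[0-9]+"},
    {"name": "hello", "pattern": r"\bhey*"},
]
text = "hey there 42"
token_spans = [match.span() for match in re.finditer(r"\S+", text)]

sequence_features = np.zeros([len(token_spans), len(known_patterns)])
sentence_features = np.zeros([1, len(known_patterns)])

for pattern_index, pattern in enumerate(known_patterns):
    for match in re.finditer(pattern["pattern"], text):
        for token_index, (start, end) in enumerate(token_spans):
            # a token is marked if it overlaps the regex match at all
            if start < match.end() and end > match.start():
                sequence_features[token_index][pattern_index] = 1.0
                sentence_features[0][pattern_index] = 1.0

print(scipy.sparse.coo_matrix(sequence_features).toarray())
# [[0. 1.]   "hey"   matches the "hello" pattern
#  [0. 0.]   "there"
#  [1. 0.]]  "42"    matches the "number" pattern
print(scipy.sparse.coo_matrix(sentence_features).toarray())  # [[1. 1.]]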
Example #19
 def get_doc(self, message: Message, attribute: Text) -> Optional["Doc"]:
     return message.get(SPACY_DOCS[attribute])
Example #20
def test_convert_featurizer_train(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]],
                                                ConveRTFeaturizer],
    monkeypatch: MonkeyPatch,
    load: bool,
    whitespace_tokenizer: WhitespaceTokenizer,
):

    monkeypatch.setattr(
        ConveRTFeaturizer,
        "_validate_model_url",
        lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config, load=True)

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)

    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.process_training_data(TrainingData([message]))

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(RESPONSE, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(INTENT, [])

    assert seq_vecs is None
    assert sent_vecs is None
Example #21
def test_count_vector_featurizer_persist_load(tmp_path: Path):
    # set non default values to config
    config = {
        "analyzer": "char",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = CountVectorsFeaturizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"

    train_message1 = Message(data={TEXT: sentence1})
    train_message2 = Message(data={TEXT: sentence2})
    WhitespaceTokenizer().process(train_message1)
    WhitespaceTokenizer().process(train_message2)

    data = TrainingData([train_message1, train_message2])
    train_ftr.train(data)

    # persist featurizer
    file_dict = train_ftr.persist("ftr", str(tmp_path))
    train_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in train_ftr.vectorizers.items()
    }

    # add trained vocabulary to vectorizer params
    for attribute, attribute_vect_params in train_vect_params.items():
        if hasattr(train_ftr.vectorizers[attribute], "vocabulary_"):
            train_vect_params[attribute].update(
                {"vocabulary": train_ftr.vectorizers[attribute].vocabulary_}
            )

    # load featurizer
    meta = train_ftr.component_config.copy()
    meta.update(file_dict)
    test_ftr = CountVectorsFeaturizer.load(meta, str(tmp_path), finetune_mode=False)
    test_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in test_ftr.vectorizers.items()
    }

    assert train_vect_params == test_vect_params

    # check if the vocabulary was loaded correctly
    assert hasattr(test_ftr.vectorizers[TEXT], "vocabulary_")

    test_message1 = Message(data={TEXT: sentence1})
    WhitespaceTokenizer().process(test_message1)
    test_ftr.process(test_message1)
    test_message2 = Message(data={TEXT: sentence2})
    WhitespaceTokenizer().process(test_message2)
    test_ftr.process(test_message2)

    test_seq_vec_1, test_sen_vec_1 = test_message1.get_sparse_features(TEXT, [])
    if test_seq_vec_1:
        test_seq_vec_1 = test_seq_vec_1.features
    if test_sen_vec_1:
        test_sen_vec_1 = test_sen_vec_1.features
    train_seq_vec_1, train_sen_vec_1 = train_message1.get_sparse_features(TEXT, [])
    if train_seq_vec_1:
        train_seq_vec_1 = train_seq_vec_1.features
    if train_sen_vec_1:
        train_sen_vec_1 = train_sen_vec_1.features
    test_seq_vec_2, test_sen_vec_2 = test_message2.get_sparse_features(TEXT, [])
    if test_seq_vec_2:
        test_seq_vec_2 = test_seq_vec_2.features
    if test_sen_vec_2:
        test_sen_vec_2 = test_sen_vec_2.features
    train_seq_vec_2, train_sen_vec_2 = train_message2.get_sparse_features(TEXT, [])
    if train_seq_vec_2:
        train_seq_vec_2 = train_seq_vec_2.features
    if train_sen_vec_2:
        train_sen_vec_2 = train_sen_vec_2.features

    # check that train features and test features after loading are the same
    assert np.all(test_seq_vec_1.toarray() == train_seq_vec_1.toarray())
    assert np.all(test_sen_vec_1.toarray() == train_sen_vec_1.toarray())
    assert np.all(test_seq_vec_2.toarray() == train_seq_vec_2.toarray())
    assert np.all(test_sen_vec_2.toarray() == train_sen_vec_2.toarray())
Example #22
async def test_adjusting_layers_incremental_training(
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    load_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    train_and_preprocess: Callable[..., Tuple[TrainingData,
                                              List[GraphComponent]]],
    process_message: Callable[..., Message],
):
    """Tests adjusting sparse layers of `ResponseSelector` to increased sparse
    feature sizes during incremental training.

    Testing is done by checking the layer sizes.
    Checking if they were replaced correctly is also important
    and is done in `test_replace_dense_for_sparse_layers`
    in `test_rasa_layers.py`.
    """
    iter1_data_path = "data/test_incremental_training/iter1/"
    iter2_data_path = "data/test_incremental_training/"
    pipeline = [
        {
            "component": WhitespaceTokenizer
        },
        {
            "component": LexicalSyntacticFeaturizer
        },
        {
            "component": RegexFeaturizer
        },
        {
            "component": CountVectorsFeaturizer
        },
        {
            "component": CountVectorsFeaturizer,
            "analyzer": "char_wb",
            "min_ngram": 1,
            "max_ngram": 4,
        },
    ]
    training_data, loaded_pipeline = train_and_preprocess(
        pipeline, iter1_data_path)
    response_selector = create_response_selector({EPOCHS: 1})
    response_selector.train(training_data=training_data)

    old_data_signature = response_selector.model.data_signature
    old_predict_data_signature = response_selector.model.predict_data_signature

    message = Message(data={TEXT: "Rasa is great!"})
    message = process_message(loaded_pipeline, message)

    message2 = copy.deepcopy(message)

    classified_message = response_selector.process([message])[0]

    old_sparse_feature_sizes = classified_message.get_sparse_feature_sizes(
        attribute=TEXT)

    initial_rs_layers = response_selector.model._tf_layers[
        "sequence_layer.text"]._tf_layers["feature_combining"]
    initial_rs_sequence_layer = initial_rs_layers._tf_layers[
        "sparse_dense.sequence"]._tf_layers["sparse_to_dense"]
    initial_rs_sentence_layer = initial_rs_layers._tf_layers[
        "sparse_dense.sentence"]._tf_layers["sparse_to_dense"]

    initial_rs_sequence_size = initial_rs_sequence_layer.get_kernel().shape[0]
    initial_rs_sentence_size = initial_rs_sentence_layer.get_kernel().shape[0]
    assert initial_rs_sequence_size == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE])
    assert initial_rs_sentence_size == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SENTENCE])

    loaded_selector = load_response_selector({EPOCHS: 1})

    classified_message2 = loaded_selector.process([message2])[0]

    assert classified_message2.fingerprint() == classified_message.fingerprint(
    )

    training_data2, loaded_pipeline2 = train_and_preprocess(
        pipeline, iter2_data_path)

    response_selector.train(training_data=training_data2)

    new_message = Message.build(text="Rasa is great!")
    new_message = process_message(loaded_pipeline2, new_message)

    classified_new_message = response_selector.process([new_message])[0]
    new_sparse_feature_sizes = classified_new_message.get_sparse_feature_sizes(
        attribute=TEXT)

    final_rs_layers = response_selector.model._tf_layers[
        "sequence_layer.text"]._tf_layers["feature_combining"]
    final_rs_sequence_layer = final_rs_layers._tf_layers[
        "sparse_dense.sequence"]._tf_layers["sparse_to_dense"]
    final_rs_sentence_layer = final_rs_layers._tf_layers[
        "sparse_dense.sentence"]._tf_layers["sparse_to_dense"]

    final_rs_sequence_size = final_rs_sequence_layer.get_kernel().shape[0]
    final_rs_sentence_size = final_rs_sentence_layer.get_kernel().shape[0]
    assert final_rs_sequence_size == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE])
    assert final_rs_sentence_size == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SENTENCE])
    # check if the data signatures were correctly updated
    new_data_signature = response_selector.model.data_signature
    new_predict_data_signature = response_selector.model.predict_data_signature
    iter2_data = load_data(iter2_data_path)
    expected_sequence_lengths = len([
        message for message in iter2_data.training_examples
        if message.get(INTENT_RESPONSE_KEY)
    ])

    def test_data_signatures(
        new_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
        old_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
    ):
        # Wherever attribute / feature_type signature is not
        # expected to change, directly compare it to old data signature.
        # Else compute its expected signature and compare
        attributes_expected_to_change = [TEXT]
        feature_types_expected_to_change = [
            FEATURE_TYPE_SEQUENCE,
            FEATURE_TYPE_SENTENCE,
        ]

        for attribute, signatures in new_signature.items():

            for feature_type, feature_signatures in signatures.items():

                if feature_type == "sequence_lengths":
                    assert feature_signatures[
                        0].units == expected_sequence_lengths

                elif feature_type not in feature_types_expected_to_change:
                    assert feature_signatures == old_signature.get(
                        attribute).get(feature_type)
                else:
                    for index, feature_signature in enumerate(
                            feature_signatures):
                        if (feature_signature.is_sparse and attribute
                                in attributes_expected_to_change):
                            assert feature_signature.units == sum(
                                new_sparse_feature_sizes.get(feature_type))
                        else:
                            # dense signature or attributes that are not
                            # expected to change can be compared directly
                            assert (
                                feature_signature.units == old_signature.get(
                                    attribute).get(feature_type)[index].units)

    test_data_signatures(new_data_signature, old_data_signature)
    test_data_signatures(new_predict_data_signature,
                         old_predict_data_signature)
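The docstring of this test refers to adjusting sparse layers to increased sparse feature sizes. As a rough, framework-free sketch of that idea (pure numpy; `grow_kernel` is a hypothetical helper, not the actual `rasa.utils.tensorflow` implementation):

import numpy as np

def grow_kernel(old_kernel: np.ndarray, new_input_size: int) -> np.ndarray:
    """Pads an (input_size, units) kernel with freshly initialised rows so it
    accepts a larger sparse feature size; existing weights are kept."""
    old_input_size, units = old_kernel.shape
    if new_input_size < old_input_size:
        raise ValueError("Sparse feature sizes have decreased, cannot shrink kernel.")
    extra = np.random.normal(scale=0.01, size=(new_input_size - old_input_size, units))
    return np.concatenate([old_kernel, extra], axis=0)

old = np.ones((10, 4))      # e.g. 10 sparse features before the new training data
new = grow_kernel(old, 13)  # the vocabulary grew by 3 features
assert new.shape == (13, 4) and np.all(new[:10] == old)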
Example #23
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b",})
    tk = WhitespaceTokenizer()

    # create two training examples so that vocabularies for both action name
    # and action text exist
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])

    tk.train(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    for module in [tk, ftr]:
        module.process(test_message)

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    assert np.all(action_name_seq_vecs.toarray()[0] == action_name_features)
    assert action_name_sen_vecs is None
Example #24
async def test_sparse_feature_sizes_decreased_incremental_training(
    iter1_path: Text,
    iter2_path: Text,
    should_raise_exception: bool,
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    load_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    default_execution_context: ExecutionContext,
    train_and_preprocess: Callable[..., Tuple[TrainingData,
                                              List[GraphComponent]]],
    process_message: Callable[..., Message],
):
    pipeline = [
        {
            "component": WhitespaceTokenizer
        },
        {
            "component": LexicalSyntacticFeaturizer
        },
        {
            "component": RegexFeaturizer
        },
        {
            "component": CountVectorsFeaturizer
        },
        {
            "component": CountVectorsFeaturizer,
            "analyzer": "char_wb",
            "min_ngram": 1,
            "max_ngram": 4,
        },
    ]
    training_data, loaded_pipeline = train_and_preprocess(pipeline, iter1_path)

    response_selector = create_response_selector({EPOCHS: 1})
    response_selector.train(training_data=training_data)

    message = Message(data={TEXT: "Rasa is great!"})
    message = process_message(loaded_pipeline, message)

    message2 = copy.deepcopy(message)

    classified_message = response_selector.process([message])[0]

    default_execution_context.is_finetuning = True

    loaded_selector = load_response_selector({EPOCHS: 1})

    classified_message2 = loaded_selector.process([message2])[0]

    assert classified_message2.fingerprint() == classified_message.fingerprint(
    )

    if should_raise_exception:
        with pytest.raises(Exception) as exec_info:
            training_data2, loaded_pipeline2 = train_and_preprocess(
                pipeline, iter2_path)
            loaded_selector.train(training_data=training_data2)
        assert "Sparse feature sizes have decreased" in str(exec_info.value)
    else:
        training_data2, loaded_pipeline2 = train_and_preprocess(
            pipeline, iter2_path)
        loaded_selector.train(training_data=training_data2)
        assert loaded_selector.model
Example #25
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
):
    return CoreFeaturizationCollector.create(
        CoreFeaturizationCollector.get_default_config(),
        default_model_storage,
        Resource("CoreFeaturizationCollector"),
        default_execution_context,
    )


@pytest.mark.parametrize(
    "messages_with_unique_lookup_key",
    [
        [
            Message(data={TEXT: "A"}, features=[_dummy_features(1, TEXT)]),
            Message(data={ACTION_TEXT: "B"}),
        ],
        [],
    ],
)
def test_collection(
    collector: CoreFeaturizationCollector,
    messages_with_unique_lookup_key: List[Message],
):

    messages = messages_with_unique_lookup_key

    # pass as training data
    training_data = TrainingData(training_examples=messages)
    precomputations = collector.collect(training_data)
Example #26
def test_convert_featurizer_train(monkeypatch: MonkeyPatch):
    tokenizer = WhitespaceTokenizer()

    monkeypatch.setattr(ConveRTFeaturizer, "_get_validated_model_url",
                        lambda x: RESTRICTED_ACCESS_URL)
    component_config = {
        "name": "ConveRTFeaturizer",
        "model_url": RESTRICTED_ACCESS_URL
    }
    featurizer = ConveRTFeaturizer(component_config)

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)

    td = TrainingData([message])
    tokenizer.train(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(TrainingData([message]),
                     RasaNLUModelConfig(),
                     tf_hub_module=featurizer.module)

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(RESPONSE, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(INTENT, [])

    assert seq_vecs is None
    assert sent_vecs is None
Example #27
def test_container_all_messages():
    message_data_list = [{INTENT: "1"}, {INTENT: "2", "other": 3}, {TEXT: "3"}]
    container = MessageContainerForCoreFeaturization()
    container.add_all([Message(data=data) for data in message_data_list])
    assert len(container.all_messages()) == 3
Example #28
def test_whitespace_training(supervised_embeddings_config: RasaNLUModelConfig):
    examples = [
        Message(
            data={
                TEXT:
                "Any Mexican restaurant will do",
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 4,
                    "end": 11,
                    "value": "Mexican",
                    "entity": "cuisine"
                }],
            }),
        Message(
            data={
                TEXT:
                "I want Tacos!",
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 7,
                    "end": 12,
                    "value": "Mexican",
                    "entity": "cuisine"
                }],
            }),
        Message(data={
            TEXT: "action_restart",
            "action_name": "action_restart"
        }),
        Message(
            data={
                TEXT: "Where are you going?",
                ACTION_NAME: "Where are you going?",
                ACTION_TEXT: "Where are you going?",
            }),
    ]

    component_config = {
        "case_sensitive": False,
        "intent_tokenization_flag": True
    }
    tk = WhitespaceTokenizer(component_config)

    tk.train(TrainingData(training_examples=examples),
             supervised_embeddings_config)

    assert examples[0].data.get(TOKENS_NAMES[TEXT])[0].text == "Any"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[1].text == "Mexican"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[2].text == "restaurant"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[3].text == "will"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[4].text == "do"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[0].text == "I"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[1].text == "want"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[2].text == "Tacos"
    assert examples[2].data.get(TOKENS_NAMES[ACTION_NAME])[0].text == "action"
    assert examples[2].data.get(TOKENS_NAMES[ACTION_NAME])[1].text == "restart"
    assert examples[2].data.get(TOKENS_NAMES[TEXT])[0].text == "action_restart"
    assert examples[2].data.get(TOKENS_NAMES[ACTION_TEXT]) is None
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[0].text == "Where"
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[1].text == "are"
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[2].text == "you"
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[3].text == "going"
Example #29
 def process(self, message: Message, **kwargs: Any) -> None:
     for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
         if message.get(attribute):
             message.set(SPACY_DOCS[attribute],
                         self.doc_for_text(message.get(attribute)))
Example #30
    for i, o in enumerate(output):
        assert isinstance(o, np.ndarray)
        assert o[0][i] == 1
        assert o.shape == (1, len(label_features))


@pytest.mark.parametrize(
    "messages, expected",
    [
        (
            [
                Message(
                    data={TEXT: "test a"},
                    features=[
                        Features(np.zeros(1), FEATURE_TYPE_SEQUENCE, TEXT, "test"),
                        Features(np.zeros(1), FEATURE_TYPE_SENTENCE, TEXT, "test"),
                    ],
                ),
                Message(
                    data={TEXT: "test b"},
                    features=[
                        Features(np.zeros(1), FEATURE_TYPE_SEQUENCE, TEXT, "test"),
                        Features(np.zeros(1), FEATURE_TYPE_SENTENCE, TEXT, "test"),
                    ],
                ),
            ],
            True,
        ),
        (
            [