Example #1
def load_data(resource_name: Text,
              language: Optional[Text] = "en") -> "TrainingData":
    """Load training data from disk.

    If multiple files are found, their contents are merged."""
    if not os.path.exists(resource_name):
        raise ValueError(f"File '{resource_name}' does not exist.")

    if os.path.isfile(resource_name):
        files = [resource_name]
    else:
        files = rasa.shared.utils.io.list_files(resource_name)

    data_sets = [_load(f, language) for f in files]
    data_sets = [ds for ds in data_sets if ds]
    if len(data_sets) == 0:
        training_data = TrainingData()
    elif len(data_sets) == 1:
        training_data = data_sets[0]
    else:
        training_data = data_sets[0].merge(*data_sets[1:])

    return training_data
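
A brief usage sketch (not part of the original example): load_data accepts either a single file or a directory and returns the merged training data. The directory path below is a made-up placeholder; is_empty and training_examples are used exactly as in the other examples on this page.

# Hedged usage sketch; "data/nlu" is a hypothetical path.
training_data = load_data("data/nlu", language="en")
if not training_data.is_empty():
    print(f"Loaded {len(training_data.training_examples)} training examples.")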
Example #2
def test_features_for_messages_with_missing_part_of_speech_tags(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizer
    ],
):
    # build the message and do NOT add part of speech information
    sentence = "hello goodbye hello"
    message_data = {
        TOKENS_NAMES[TEXT]: [
            Token(text=match[0], start=match.start())
            for match in re.finditer(r"\w+", sentence)
        ]
    }
    message = Message(data=message_data)

    # train and process
    featurizer = create_lexical_syntactic_featurizer(
        {"alias": "lsf", "features": [["BOS", "pos"]]}
    )
    featurizer.train(TrainingData([message]))
    featurizer.process([message])
    feature = message.features[0]
    assert feature.features.shape[1] == 3  # BOS = True/False, pos = None
Example #3
def test_cvf_shared_train_vocabulary_expand(
    additional_size: Optional[int],
    text: Text,
    real_vocabulary_size: int,
    total_vocabulary_size: int,
):

    tokenizer = WhitespaceTokenizer()
    featurizer = CountVectorsFeaturizer(
        {
            "additional_vocabulary_size": {
                "text": additional_size,
                "response": additional_size,
                "action_text": additional_size,
            },
            "use_shared_vocab": True,
        },
        finetune_mode=False,
    )

    train_message = Message(
        data={
            TEXT: text,
            INTENT: "intent_1",
            RESPONSE: text,
            ACTION_TEXT: text,
            ACTION_NAME: "action_1",
        })
    data = TrainingData([train_message])

    tokenizer.train(data)
    featurizer.train(data)

    shared_vocabulary = featurizer.vectorizers["text"].vocabulary_
    assert len(shared_vocabulary) == total_vocabulary_size
    assert (featurizer._get_starting_empty_index(shared_vocabulary) ==
            real_vocabulary_size)
Example #4
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
):
    ftr = CountVectorsFeaturizer({
        "use_lemma": use_lemma,
        "additional_vocabulary_size": {
            "text": 0
        }
    })

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    SpacyTokenizer().process(train_message)
    SpacyTokenizer().process(test_message)

    ftr.train(TrainingData([train_message]))

    ftr.process(test_message)

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_vecs.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs.features, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.features.toarray()
    actual_sen_vecs = sen_vecs.features.toarray()

    assert np.all(actual_seq_vecs[0] == sequence_features)
    assert np.all(actual_sen_vecs[-1] == sentence_features)
Example #5
def test_convert_featurizer_process(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]],
                                                ConveRTFeaturizer],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):

    monkeypatch.setattr(ConveRTFeaturizer, "_validate_model_url",
                        lambda _: RESTRICTED_ACCESS_URL)
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)
    sentence = "Hey how are you today ?"
    message = Message.build(text=sentence)

    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    featurizer.process([message])

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)
Example #6
def test_convert_featurizer_tokens_to_text(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]],
                                                ConveRTFeaturizer],
    sentence: Text,
    expected_text: Text,
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):

    monkeypatch.setattr(ConveRTFeaturizer, "_validate_model_url",
                        lambda _: None)
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)
    message = Message.build(text=sentence)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0]

    assert expected_text == actual_text
Example #7
    def create_zip_file(nlu: TrainingData, domain: Domain, stories: StoryGraph,
                        config: Dict, bot: Text):
        """
        Adds the training files to a zip archive.

        :param nlu: nlu data
        :param domain: domain data
        :param stories: stories data
        :param config: config data
        :param bot: bot id
        :return: path to the created zip archive
        """
        directory = Utility.save_files(
            nlu.nlu_as_markdown().encode(),
            domain.as_yaml().encode(),
            stories.as_story_string().encode(),
            yaml.dump(config).encode(),
        )
        zip_path = os.path.join(tempfile.gettempdir(), bot)
        zip_file = shutil.make_archive(zip_path,
                                       format="zip",
                                       root_dir=directory)
        shutil.rmtree(directory)
        return zip_file
Example #8
    def convert_for_training(
        self,
        domain: Domain,
        story_graph: StoryGraph,
    ) -> TrainingData:
        """Creates de-duplicated training data.

        Each possible user text and intent and each action name and action text
        that can be found in the given domain and story graph appears exactly once
        in the resulting training data. Moreover, each item is contained in a separate
        message.

        Args:
           domain: the domain
           story_graph: a story graph
        Returns:
           training data
        """
        container = MessageContainerForCoreFeaturization()

        # collect all action and user (intent-only) substates known from domain
        container.derive_messages_from_domain_and_add(domain=domain)

        # collect all substates we see in the given data
        all_events = (
            event for step in story_graph.story_steps for event in step.events
            if isinstance(event, UserUttered)
            # because all action names and texts are known to the domain
        )
        container.derive_messages_from_events_and_add(events=all_events)

        # Reminder: in case of complex recipes that train CountVectorizers, we'll have
        # to make sure that there is at least one user substate with a TEXT to ensure
        # `CountVectorizer` is trained...

        return TrainingData(training_examples=container.all_messages())
Example #9
def test_only_featurizes_text_attribute(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizerGraphComponent
    ],
):
    # build a message with tokens for lots of attributes
    sentence = "hello goodbye hello"
    tokens = [
        Token(text=match[0], start=match.start())
        for match in re.finditer(r"\w+", sentence)
    ]
    message_data = {}
    for attribute in MESSAGE_ATTRIBUTES + DENSE_FEATURIZABLE_ATTRIBUTES:
        message_data[attribute] = sentence
        message_data[TOKENS_NAMES[attribute]] = tokens
    message = Message(data=message_data)

    # train and process
    featurizer = create_lexical_syntactic_featurizer({
        "alias": "lsf",
        "features": [["BOS"]]
    })
    featurizer.train(TrainingData([message]))
    featurizer.process([message])
    assert len(message.features) == 1
    assert message.features[0].attribute == TEXT
Example #10
def test_count_vector_featurizer_char(sentence, expected):
    ftr = CountVectorsFeaturizer({
        "min_ngram": 1,
        "max_ngram": 2,
        "analyzer": "char"
    })

    train_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(train_message)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(test_message)
    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
Example #11
def test_log_longer_sequence(
    sequence_length: int,
    model_name: Text,
    model_weights: Text,
    should_overflow: bool,
    caplog: LogCaptureFixture,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizerGraphComponent
    ],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    config = {"model_name": model_name, "model_weights": model_weights}

    featurizer = create_language_model_featurizer(config)

    text = " ".join(["hi"] * sequence_length)
    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    caplog.set_level(logging.DEBUG)
    featurizer.process([message])
    if should_overflow:
        assert "hi hi hi" in caplog.text
    assert len(message.features) >= 2
Example #12
def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens):
    component_config = {
        "intent_tokenization_flag": True,
        "intent_split_symbol": "+"
    }

    transformers_config = {
        "model_name": "bert",
        "model_weights": "bert-base-uncased",
    }  # Test for one should be enough

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer(component_config)

    message = Message.build(text=text)
    message.set(INTENT, text)

    td = TrainingData([message])

    transformers_nlp.train(td)
    lm_tokenizer.train(td)

    assert [t.text
            for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
Example #13
def test_text_featurizer_window_size(sentence, expected):
    featurizer = LexicalSyntacticFeaturizer(
        {"features": [["upper"], ["digit"], ["low"], ["digit"]]})

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))

    featurizer.process(test_message)

    seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert isinstance(seq_vec, scipy.sparse.coo_matrix)
    assert sen_vec is None

    assert np.all(seq_vec.toarray()[0] == expected)
Example #14
def test_count_vector_featurizer_oov_words(
    sentence: Text,
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer({
        "OOV_token": "__oov__",
        "OOV_words": ["oov_word0", "OOV_word1"],
    })
    train_message = Message(data={TEXT: sentence})
    whitespace_tokenizer.process([train_message])

    data = TrainingData([train_message])
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
Example #15
def test_warn_if_part_of_speech_features_cannot_be_computed(
    create_lexical_syntactic_featurizer: Callable[[Dict[Text, Any]],
                                                  LexicalSyntacticFeaturizer],
    sentence: Text,
    feature_config: Dict[Text, Any],
    expected_features: np.ndarray,
):

    featurizer = create_lexical_syntactic_featurizer({
        "alias": "lsf",
        "features": feature_config
    })

    # build the message - with tokens but *no* part-of-speech tags
    tokens = [
        Token(text=match[0], start=match.start())
        for match in re.finditer(r"\w+", sentence)
    ]
    message = Message(data={TOKENS_NAMES[TEXT]: tokens})

    # train
    with pytest.warns(
        UserWarning,
        match="Expected training data to include tokens with part-of-speech tags",
    ):
        featurizer.train(TrainingData([message]))
    assert not message.features

    # process
    with pytest.warns(None) as records:
        featurizer.process([message])
    assert len(records) == 0
    assert len(message.features) == 1
    feature = message.features[0]
    assert np.all(feature.features.todense() == expected_features)
Example #16
def test_count_vector_featurizer_shared_vocab(
    sentence, intent, response, text_features, intent_features, response_features
):
    ftr = CountVectorsFeaturizer({"use_shared_vocab": True,})
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None
    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None
    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None
Example #17
    def _write_nlu_lookup_table_yaml(cls, lookup_table: Dict[Text, Any],
                                     output_dir_path: Path) -> None:
        """Converts and writes lookup tables examples from `txt` to `YAML` format.

        Args:
            lookup_table: Lookup tables items.
            output_dir_path: Path to the target output directory.
        """
        lookup_table_file = lookup_table.get("elements")
        if not lookup_table_file or not isinstance(lookup_table_file, str):
            return

        examples_from_file = read_lookup_table_file(lookup_table_file)
        target_filename = cls.generate_path_for_converted_training_data_file(
            Path(lookup_table_file), output_dir_path)
        entity_name = Path(lookup_table_file).stem

        RasaYAMLWriter().dump(
            target_filename,
            TrainingData(lookup_tables=[{
                "name": entity_name,
                "elements": examples_from_file
            }]),
        )
Example #18
    def _additional_training_data_from_stories(self) -> TrainingData:
        stories = self.get_stories()

        utterances, actions = _unique_events_from_stories(stories)

        # Sort events to guarantee deterministic behavior and to avoid that the NLU
        # model has to be retrained due to changes in the event order within
        # the stories.
        sorted_utterances = sorted(
            utterances, key=lambda user: user.intent_name or user.text or ""
        )
        sorted_actions = sorted(
            actions, key=lambda action: action.action_name or action.action_text or ""
        )

        additional_messages_from_stories = [
            _messages_from_action(action) for action in sorted_actions
        ] + [_messages_from_user_utterance(user) for user in sorted_utterances]

        logger.debug(
            f"Added {len(additional_messages_from_stories)} training data examples "
            f"from the story training data."
        )
        return TrainingData(additional_messages_from_stories)
Example #19
def test_convert_training_examples(
    spacy_nlp: Language,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
):
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = SpacyTokenizer.create(
        SpacyTokenizer.get_default_config(),
        default_model_storage,
        Resource("tokenizer"),
        default_execution_context,
    )
    count_vectors_featurizer = CountVectorsFeaturizer.create(
        CountVectorsFeaturizer.get_default_config(),
        default_model_storage,
        Resource("count_featurizer"),
        default_execution_context,
    )
    spacy_featurizer = SpacyFeaturizer.create(
        SpacyFeaturizer.get_default_config(),
        default_model_storage,
        Resource("spacy_featurizer"),
        default_execution_context,
    )

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    tokenizer.process_training_data(training_data)
    count_vectors_featurizer.train(training_data)
    count_vectors_featurizer.process_training_data(training_data)
    spacy_featurizer.process_training_data(training_data)

    entity_tag_spec = [
        EntityTagSpec(
            "entity",
            {
                0: "O",
                1: "name",
                2: "location"
            },
            {
                "O": 0,
                "name": 1,
                "location": 2
            },
            3,
        )
    ]
    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message], attributes=attributes, entity_tag_specs=entity_tag_spec)

    assert len(output) == 1
    for attribute in attributes:
        assert attribute in output[0]
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in output[0]
    # we have sparse sentence, sparse sequence, dense sentence, and dense sequence
    # features in the list
    assert len(output[0][TEXT]) == 4
    if INTENT in attributes:
        # we will just have sparse sentence features
        assert len(output[0][INTENT]) == 1
    if ENTITIES in attributes:
        # we will just have sparse sentence features
        assert len(output[0][ENTITIES]) == len(entity_tag_spec)
    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes
Example #20
def test_is_empty():
    assert TrainingData().is_empty()
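
As a hedged complement to the test above, the sketch below builds two small TrainingData objects from invented messages and merges them, mirroring the merge call from Example #1. Assumption: Message.build also accepts an intent keyword, which is not shown elsewhere on this page.

# Sketch only; texts and intents are invented for illustration.
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData

greeting = Message.build(text="hello there", intent="greet")
farewell = Message.build(text="see you later", intent="goodbye")

merged = TrainingData([greeting]).merge(TrainingData([farewell]))

assert not merged.is_empty()
assert len(merged.training_examples) == 2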
Example #21
def test_train_extract_load(
    create_or_load_mitie_extractor: Callable[[Dict[Text, Any]],
                                             MitieEntityExtractor],
    mitie_model: MitieModel,
    with_trainable_examples: bool,
):

    # some texts where last token is a city
    texts_ending_with_city = [
        "Bert lives in Berlin", "Ernie asks where is Bielefeld"
    ]

    # create some messages with entities
    messages_with_entities = []
    for text in texts_ending_with_city:
        tokens = [
            Token(text=match.group(), start=match.start(), end=match.end())
            for match in re.finditer(r"\w+", text)
        ]
        entities = [{
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: tokens[-1].text,
            ENTITY_ATTRIBUTE_START: tokens[-1].start,
            ENTITY_ATTRIBUTE_END: tokens[-1].end,
            EXTRACTOR: None,  # must be None or mitie_entity_extractor.name
        }]

        message = Message(text=text)
        message.data[TOKENS_NAMES[TEXT]] = tokens
        message.data[ENTITIES] = entities
        if with_trainable_examples:
            message.data[INTENT] = "must have intent otherwise not an NLU example"
        else:
            pass  # not adding an intent is sufficient to make this a "core example"
        messages_with_entities.append(message)

    # turn them into training data
    training_data = TrainingData(messages_with_entities)

    # train the extractor
    mitie_entity_extractor = create_or_load_mitie_extractor(config={},
                                                            load=False)
    mitie_entity_extractor.train(training_data, model=mitie_model)

    # create some messages "without entities" - for processing
    messages_without_entities = [
        Message(
            data={
                TEXT: message.data[TEXT],
                TOKENS_NAMES[TEXT]: message.data[TOKENS_NAMES[TEXT]],
            }) for message in messages_with_entities
    ]

    # process!
    mitie_entity_extractor.process(messages=messages_without_entities,
                                   model=mitie_model)

    # check that extractor added the expected entities to the messages
    # (that initially were) "with no entities"
    if with_trainable_examples:
        for processed_message, labeled_message in zip(
                messages_without_entities,
                messages_with_entities):  # i.e. "without (before process)"
            assert ENTITIES in processed_message.data
            computed_entities = processed_message.data[ENTITIES]
            assert len(computed_entities) == 1
            computed_entity = copy.copy(
                computed_entities[0])  # we need it later
            # check confidence
            assert computed_entity.pop(ENTITY_ATTRIBUTE_CONFIDENCE,
                                       "surprise") is None
            # check extractor
            assert computed_entity.pop(EXTRACTOR,
                                       None) == mitie_entity_extractor.name
            # compare the rest
            expected_entity = labeled_message.data[ENTITIES][0]
            expected_entity.pop(EXTRACTOR)
            assert computed_entity == expected_entity

    else:
        for processed_message in messages_without_entities:
            assert ENTITIES not in processed_message.data

    # load the same extractor again
    loaded_extractor = create_or_load_mitie_extractor(config={}, load=True)

    # check results are the same
    same_messages_without_entities = [
        Message(
            data={
                TEXT: message.data[TEXT],
                TOKENS_NAMES[TEXT]: message.data[TOKENS_NAMES[TEXT]],
            }) for message in messages_with_entities
    ]
    loaded_extractor.process(messages=same_messages_without_entities,
                             model=mitie_model)
    assert (
        same_messages_without_entities[0].data
        == messages_without_entities[0].data
    )
Example #22
def test_load_lookup_table(
    source_lookup_table: Dict[Text, Any], expected_lookup_table: Dict[Text, Any]
):
    assert TrainingData._load_lookup_table(source_lookup_table) == expected_lookup_table
Example #23
 async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
     return TrainingData()
Example #24
 def run() -> TrainingData:
     return TrainingData()
Example #25
 def run(param0: TrainingData, param1: TrainingData,
         param2: TrainingData) -> TrainingData:
     return TrainingData()
Example #26
 def run(
     self, some_param: TrainingData = TrainingData()) -> TrainingData:
     pass
Example #27
def test_incremental_train_featurization(tmp_path: Path):
    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]

    featurizer = RegexFeaturizer.create({"number_additional_patterns": 5},
                                        RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message], regex_features=patterns),
                     RasaNLUModelConfig())

    # Test featurization of message
    expected = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 0, 0, 0, 0, 0])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 8) == seq_vecs.shape
    assert (1, 8) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    persist_value = featurizer.persist("ftr", str(tmp_path))
    loaded_featurizer = RegexFeaturizer.load(
        meta={
            "number_additional_patterns": 5,
            "file": persist_value["file"],
        },
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    new_patterns = [
        {
            "pattern": "\\btoday*",
            "name": "day",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey+",
            "name": "hello",
            "usage": "intent"
        },
    ]

    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    loaded_featurizer.train(
        TrainingData([message], regex_features=patterns + new_patterns),
        RasaNLUModelConfig(),
    )

    # Test featurization of message, this time for the extra pattern as well.
    expected_token_1 = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    expected_token_2 = np.array([0, 0, 0, 1, 0, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 0, 0, 0, 0])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 8) == seq_vecs.shape
    assert (1, 8) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected_token_1)
    assert np.all(seq_vecs.toarray()[-2] == expected_token_2)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    # we also modified a pattern, check if that is correctly modified
    pattern_to_check = [
        pattern for pattern in loaded_featurizer.known_patterns
        if pattern["name"] == "hello"
    ]
    assert pattern_to_check == [new_patterns[1]]
Example #28
def test_persist_load_for_finetuning(tmp_path: Path):
    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]

    featurizer = RegexFeaturizer.create({"number_additional_patterns": 5},
                                        RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message], regex_features=patterns),
                     RasaNLUModelConfig())

    persist_value = featurizer.persist("ftr", str(tmp_path))

    # Test all artifacts stored as part of persist
    assert persist_value["file"] == "ftr"
    assert (tmp_path / "ftr.patterns.pkl").exists()
    assert (tmp_path / "ftr.vocabulary_stats.pkl").exists()
    assert featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 3,
    }

    loaded_featurizer = RegexFeaturizer.load(
        meta={
            "number_additional_patterns": 5,
            "file": persist_value["file"],
        },
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode
    assert loaded_featurizer.pattern_vocabulary_stats == featurizer.vocabulary_stats

    new_lookups = [{
        "name": "plates",
        "elements": "data/test/lookup_tables/plates.txt"
    }]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
    assert loaded_featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 4,
    }
Example #29
def test_regex_featurizer_train():

    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]

    featurizer = RegexFeaturizer.create({"number_additional_patterns": 0},
                                        RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message], regex_features=patterns),
                     RasaNLUModelConfig())

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None
Example #30
def validate_required_components_from_data(
    pipeline: List["Component"], data: TrainingData
) -> None:
    """Validates that all components are present in the pipeline based on data.

    Args:
        pipeline: The list of the :class:`rasa.nlu.components.Component`s.
        data: The :class:`rasa.shared.nlu.training_data.training_data.TrainingData`.
    """

    if data.response_examples and not any_components_in_pipeline(
        ["ResponseSelector"], pipeline
    ):
        rasa.shared.utils.io.raise_warning(
            "You have defined training data with examples for training a response "
            "selector, but your NLU pipeline does not include a response selector "
            "component. To train a model on your response selector data, add a "
            "'ResponseSelector' to your pipeline."
        )

    if data.entity_examples and not any_components_in_pipeline(
        TRAINABLE_EXTRACTORS, pipeline
    ):
        rasa.shared.utils.io.raise_warning(
            "You have defined training data consisting of entity examples, but "
            "your NLU pipeline does not include an entity extractor trained on "
            "your training data. To extract non-pretrained entities, add one of "
            f"{TRAINABLE_EXTRACTORS} to your pipeline."
        )

    if data.entity_examples and not any_components_in_pipeline(
        {"DIETClassifier", "CRFEntityExtractor"}, pipeline
    ):
        if data.entity_roles_groups_used():
            rasa.shared.utils.io.raise_warning(
                "You have defined training data with entities that have roles/groups, "
                "but your NLU pipeline does not include a 'DIETClassifier' or a "
                "'CRFEntityExtractor'. To train entities that have roles/groups, "
                "add either 'DIETClassifier' or 'CRFEntityExtractor' to your "
                "pipeline."
            )

    if data.regex_features and not any_components_in_pipeline(
        ["RegexFeaturizer", "RegexEntityExtractor"], pipeline
    ):
        rasa.shared.utils.io.raise_warning(
            "You have defined training data with regexes, but "
            "your NLU pipeline does not include a 'RegexFeaturizer' or a "
            "'RegexEntityExtractor'. To use regexes, include either a "
            "'RegexFeaturizer' or a 'RegexEntityExtractor' in your pipeline."
        )

    if data.lookup_tables and not any_components_in_pipeline(
        ["RegexFeaturizer", "RegexEntityExtractor"], pipeline
    ):
        rasa.shared.utils.io.raise_warning(
            "You have defined training data consisting of lookup tables, but "
            "your NLU pipeline does not include a 'RegexFeaturizer' or a "
            "'RegexEntityExtractor'. To use lookup tables, include either a "
            "'RegexFeaturizer' or a 'RegexEntityExtractor' in your pipeline."
        )

    if data.lookup_tables:
        if not any_components_in_pipeline(
            ["CRFEntityExtractor", "DIETClassifier"], pipeline
        ):
            rasa.shared.utils.io.raise_warning(
                "You have defined training data consisting of lookup tables, but "
                "your NLU pipeline does not include any components that use these "
                "features. To make use of lookup tables, add a 'DIETClassifier' or a "
                "'CRFEntityExtractor' with the 'pattern' feature to your pipeline."
            )
        elif any_components_in_pipeline(["CRFEntityExtractor"], pipeline):
            crf_components = [c for c in pipeline if c.name == "CRFEntityExtractor"]
            # check to see if any of the possible CRFEntityExtractors will
            # featurize `pattern`
            has_pattern_feature = False
            for crf in crf_components:
                crf_features = crf.component_config.get("features") or []
                # iterate through [[before],[word],[after]] features
                if "pattern" in itertools.chain(*crf_features):
                    has_pattern_feature = True
                    break

            if not has_pattern_feature:
                rasa.shared.utils.io.raise_warning(
                    "You have defined training data consisting of lookup tables, but "
                    "your NLU pipeline's 'CRFEntityExtractor' does not include the "
                    "'pattern' feature. To featurize lookup tables, add the 'pattern' "
                    "feature to the 'CRFEntityExtractor' in your pipeline."
                )

    if data.entity_synonyms and not any_components_in_pipeline(
        ["EntitySynonymMapper"], pipeline
    ):
        rasa.shared.utils.io.raise_warning(
            "You have defined synonyms in your training data, but "
            "your NLU pipeline does not include an 'EntitySynonymMapper'. "
            "To map synonyms, add an 'EntitySynonymMapper' to your pipeline."
        )