Example #1
def test_vocabulary_overflow_log():
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = RegexFeaturizer(
        {"number_additional_patterns": 1},
        known_patterns=patterns,
        finetune_mode=True,
        pattern_vocabulary_stats={"max_number_patterns": 4, "pattern_slots_filled": 3},
    )

    # Two new patterns exceed the single remaining reserved slot (3 of 4 filled),
    # so training should warn about vocabulary overflow.
    additional_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhello+", "name": "greet", "usage": "intent"},
    ]

    with pytest.warns(UserWarning) as warning:
        featurizer.train(TrainingData([], regex_features=additional_patterns))
    assert (
        "The originally trained model was configured to handle "
        "a maximum number of 4 patterns" in warning[0].message.args[0]
    )
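These snippets are excerpted from a larger test module, so the shared imports are omitted. A minimal preamble that would make them self-contained is sketched below; the module paths follow the Rasa 2.x layout and are an assumption, they may differ in other versions.

# Shared imports assumed by the snippets in this section (Rasa 2.x paths; adjust for your version).
import numpy as np
import pytest

from rasa.nlu.constants import SPACY_DOCS, TOKENS_NAMES
from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa.shared.nlu.constants import TEXT
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData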
Example #2
def test_lookup_tables_without_use_word_boundaries(sentence, tokens, expected,
                                                   labeled_tokens):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
    from rasa.nlu.tokenizers.tokenizer import Token

    lookups = [
        {
            "name": "cites",
            "elements": ["北京", "上海", "广州", "深圳", "杭州"],
        },
        {
            "name": "dates",
            "elements": ["昨天", "今天", "明天", "后天"],
        },
    ]
    ftr = RegexFeaturizer({"use_word_boundaries": False})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(TOKENS_NAMES[TEXT],
                [Token(word, start) for (word, start) in tokens])

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
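The sentence, tokens, expected, and labeled_tokens arguments are supplied by a pytest.mark.parametrize decorator that is not shown in this excerpt. A hypothetical case is sketched below; the sentence, token offsets, and feature values are illustrative only and assume one feature column per lookup table with no extra padding columns.

# Hypothetical parametrization for the test above (illustrative values only).
# Feature columns: [cites, dates]; the last row of `expected` is the sentence-level vector.
@pytest.mark.parametrize(
    "sentence, tokens, expected, labeled_tokens",
    [
        (
            "上海和北京",  # "Shanghai and Beijing"
            [("上海", 0), ("和", 2), ("北京", 3)],
            [[1.0, 0.0], [0.0, 0.0], [1.0, 0.0], [1.0, 0.0]],
            [0, 2],  # tokens 0 and 2 each overlap one "cites" match
        )
    ],
)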
Example #3
def test_lookup_with_and_without_boundaries(
    sentence: Text,
    expected_sequence_features: List[List[float]],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    use_word_boundaries: bool,
    spacy_nlp: Any,
):
    ftr = RegexFeaturizer({
        "use_word_boundaries": use_word_boundaries,
        "number_additional_patterns": 0
    })
    training_data = TrainingData()

    # we use lookups because the "use_word_boundaries" flag is only used when
    # producing patterns from lookup tables
    lookups = [{"name": "how", "elements": ["how"]}]
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    (sequence_features,
     sentence_features) = ftr._features_for_patterns(message, TEXT)

    sequence_features = sequence_features.toarray()
    sentence_features = sentence_features.toarray()
    num_of_patterns = sum([len(lookup["elements"]) for lookup in lookups])
    assert sequence_features.shape == (
        len(message.get(TOKENS_NAMES[TEXT])),
        num_of_patterns,
    )
    num_of_lookup_tables = len(lookups)
    assert sentence_features.shape == (num_of_lookup_tables, num_of_patterns)

    # sequence_features should be {0,1} for each token: 1 if match, 0 if not
    assert np.allclose(sequence_features,
                       expected_sequence_features,
                       atol=1e-10)
    # sentence_features should be {0,1} for each lookup table: 1 if sentence
    # contains match from that table, 0 if not
    assert np.allclose(sentence_features,
                       expected_sentence_features,
                       atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        # labeled_tokens should list the token(s) which match a pattern
        assert num_matches == labeled_tokens.count(i)
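As in the previous example, the fixture values come from parametrization. A plausible, purely illustrative pair of cases for the single-element "how" lookup table: with word boundaries enabled, the substring "how" inside "somehow" is not matched; with boundaries disabled, it is.

# Hypothetical parametrization (illustrative values only).
@pytest.mark.parametrize(
    "sentence, expected_sequence_features, expected_sentence_features, "
    "labeled_tokens, use_word_boundaries",
    [
        ("somehow", [[0.0]], [0.0], [], True),    # \bhow\b finds no boundary inside "somehow"
        ("somehow", [[1.0]], [1.0], [0], False),  # bare "how" matches the single token
    ],
)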
Example #4
def test_lookup_tables(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {
            "name": "plates",
            "elements": "data/test/lookup_tables/plates.txt"
        },
    ]
    ftr = RegexFeaturizer({"number_additional_patterns": 0})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(data={TEXT: sentence})
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray(),
                       expected_sequence_features,
                       atol=1e-10)
    assert np.allclose(sentence_features.toarray(),
                       expected_sentence_features,
                       atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
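The second lookup table is loaded from a plain-text file instead of an inline list. Rasa lookup-table files hold one element per line, so an equivalent inline definition might look like the sketch below; the entries shown are made up, not the actual contents of plates.txt.

# Hypothetical inline equivalent of the file-based lookup table (entries are illustrative).
inline_plates_lookup = {
    "name": "plates",
    "elements": ["tacos", "burrito", "lettuce wrap"],
}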