Example #1
def test_lookup_tables(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer({"number_additional_patterns": 0})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(data={TEXT: sentence})
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
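
The snippets on this page omit their imports, fixtures, and @pytest.mark.parametrize decorators. A minimal sketch of the shared setup they rely on might look like the following; the module paths assume a Rasa 2.x layout (older releases expose these names elsewhere), and the spacy_nlp fixture and model name are assumptions rather than part of the original tests.

from typing import Any, List, Text, Tuple

import numpy as np
import pytest
import spacy

# Assumed Rasa 2.x module paths; adjust for the release you are on.
from rasa.nlu.constants import SPACY_DOCS, TOKENS_NAMES
from rasa.shared.nlu.constants import TEXT, RESPONSE
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData
from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer


@pytest.fixture
def spacy_nlp() -> Any:
    # Any English spaCy pipeline works for these examples; the model name is an assumption.
    return spacy.load("en_core_web_md")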
Example #2
def test_regex_featurizer_case_sensitive(
    sentence: Text,
    sequence_vector: List[float],
    sentence_vector: List[float],
    case_sensitive: bool,
    spacy_nlp: Any,
):

    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]
    ftr = RegexFeaturizer({"case_sensitive": case_sensitive},
                          known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray()[0],
                       sequence_vector,
                       atol=1e-10)
    assert np.allclose(sentence_features.toarray()[-1],
                       sentence_vector,
                       atol=1e-10)
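
The case_sensitive flag only changes how the known patterns are applied to the text. A hypothetical parametrization (values invented for illustration, assuming the feature columns follow the order of known_patterns: number, hello, binary):

@pytest.mark.parametrize(
    "sentence, sequence_vector, sentence_vector, case_sensitive",
    [
        # "Hey" only matches "\bhey*" when matching is case-insensitive.
        ("Hey 456", [0.0, 1.0, 0.0], [1.0, 1.0, 0.0], False),
        ("Hey 456", [0.0, 0.0, 0.0], [1.0, 0.0, 0.0], True),
    ],
)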
Example #3
def test_regex_featurizer_no_sequence(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    spacy_nlp: Any,
):

    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]
    ftr = RegexFeaturizer({"number_additional_patterns": 0},
                          known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray()[0],
                       expected_sequence_features,
                       atol=1e-10)
    assert np.allclose(sentence_features.toarray()[-1],
                       expected_sentence_features,
                       atol=1e-10)
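
_features_for_patterns returns a pair of scipy sparse matrices: one row per token for the sequence features and a single row for the sentence-level features, with one column per known pattern (plus any additional pattern slots, zero in this example). A hypothetical check that could be appended to the test body under those assumptions:

    # Hypothetical shape check (not part of the original test), assuming one
    # column per known pattern and one sequence row per token:
    num_tokens = len(message.get(TOKENS_NAMES[TEXT]))
    assert sequence_features.toarray().shape == (num_tokens, len(patterns))
    assert sentence_features.toarray().shape == (1, len(patterns))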
Example #4
def test_regex_featurizer(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[int],
    additional_vocabulary_size: int,
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(
        {"number_additional_patterns": additional_vocabulary_size},
        known_patterns=patterns,
    )

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(data={TEXT: sentence, RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
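
The final loop relies on the per-token "pattern" dict that RegexFeaturizer stores on each Token while featurizing: one boolean entry per known pattern, set to True when that pattern matched the token. A hypothetical illustration (invented values, not the project's parametrization) for the sentence "hey 456":

# Hypothetical per-token annotations after featurizing "hey 456":
expected_annotations = [
    {"number": False, "hello": True, "binary": False},  # token 0: "hey"
    {"number": True, "hello": False, "binary": False},  # token 1: "456"
]
# Each True entry counts as one regex match on that token, so the matching
# labeled_tokens value would be [0, 1]: token indices 0 and 1 each appear
# once because each token accounts for exactly one match.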
Example #5
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(sentence, data={RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(result.toarray(), expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example #6
def test_lookup_tables_without_use_word_boundaries(
    sentence: Text,
    tokens: List[Tuple[Text, float]],
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
    from rasa.nlu.tokenizers.tokenizer import Token

    lookups = [
        {"name": "cites", "elements": ["北京", "上海", "广州", "深圳", "杭州"],},
        {"name": "dates", "elements": ["昨天", "今天", "明天", "后天"],},
    ]
    ftr = RegexFeaturizer(
        {"use_word_boundaries": False, "number_additional_patterns": 0}
    )
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(TOKENS_NAMES[TEXT], [Token(word, start) for (word, start) in tokens])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
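
This variant skips the tokenizer and builds Token objects directly from (word, start offset) pairs, which suits text that spaCy's English pipeline would not tokenize sensibly. A hypothetical parametrization (illustrative values only, assuming the feature columns follow the lookup-table order: cites, dates):

@pytest.mark.parametrize(
    "sentence, tokens, expected_sequence_features, "
    "expected_sentence_features, labeled_tokens",
    [
        (
            "明天上海",  # "tomorrow Shanghai"
            [("明天", 0), ("上海", 2)],  # (word, start offset) pairs
            [[0.0, 1.0], [1.0, 0.0]],  # one row per token, columns: [cites, dates]
            [1.0, 1.0],  # sentence-level union of matches
            [0, 1],  # each token index appears once: one match per token
        ),
    ],
)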