Example #1
def test_crf_json_from_BILOU(spacy_nlp, ner_crf_pos_feature_config):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    sentence = "I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    r = ext._from_crf_to_json(
        Message(sentence, doc),
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"B-what": 1.0},
            {"L-what": 1.0},
            {"B-where": 1.0},
            {"I-where": 1.0},
            {"L-where": 1.0},
        ],
    )
    assert len(r) == 2, "There should be two entities"

    assert r[0]["confidence"]  # confidence should exist
    del r[0]["confidence"]
    assert r[0] == {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"}

    assert r[1]["confidence"]  # confidence should exist
    del r[1]["confidence"]
    assert r[1] == {"start": 23, "end": 31, "value": "close-by", "entity": "where"}
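Note: the BILOU tags above (B = beginning, I = inside, L = last, O = outside, U = unit) are what let "home cleaning" come back as one entity. A minimal sketch of how such a tag sequence collapses into spans - illustrative only, not Rasa's internal implementation:

def bilou_tags_to_spans(tags):
    """Collapse a BILOU tag sequence into (start_token, end_token, entity) spans."""
    spans, start = [], None
    for i, tag in enumerate(tags):
        if tag.startswith("B-"):
            start = i
        elif tag.startswith("U-"):
            spans.append((i, i, tag[2:]))
        elif tag.startswith("L-") and start is not None:
            spans.append((start, i, tag[2:]))
            start = None
    return spans

# tokens: I | need | a | home | cleaning | close | - | by
tags = ["O", "O", "O", "B-what", "L-what", "B-where", "I-where", "L-where"]
assert bilou_tags_to_spans(tags) == [(3, 4, "what"), (5, 7, "where")]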
Example #2
def test_crf_json_from_non_BILOU(spacy_nlp, ner_crf_pos_feature_config):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ner_crf_pos_feature_config.update({"BILOU_flag": False})
    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    sentence = "I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    rs = ext._from_crf_to_json(
        Message(sentence, doc),
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"what": 1.0},
            {"what": 1.0},
            {"where": 1.0},
            {"where": 1.0},
            {"where": 1.0},
        ],
    )

    # non BILOU will split multi-word entities - hence 5
    assert len(rs) == 5, "There should be five entities"

    for r in rs:
        assert r["confidence"]  # confidence should exist
        del r["confidence"]

    assert rs[0] == {"start": 9, "end": 13, "value": "home", "entity": "what"}
    assert rs[1] == {"start": 14, "end": 22, "value": "cleaning", "entity": "what"}
    assert rs[2] == {"start": 23, "end": 28, "value": "close", "entity": "where"}
    assert rs[3] == {"start": 28, "end": 29, "value": "-", "entity": "where"}
    assert rs[4] == {"start": 29, "end": 31, "value": "by", "entity": "where"}
Example #3
def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    ner_crf_pos_feature_config["features"][1].append("text_dense_features")
    crf_extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    spacy_featurizer = SpacyFeaturizer()
    white_space_tokenizer = WhitespaceTokenizer({"use_cls_token": False})

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set("spacy_doc", spacy_nlp(text))

    white_space_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._from_text_to_crf(message)
    features = crf_extractor._sentence_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    for i in range(0, len(message.data.get("text_dense_features")[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == message.data.get("text_dense_features")[0][i]
        )
Example #4
# Snippet from a custom component; the enclosing class and its imports are
# elided in the source. The base class shown here is an assumption.
class IncrementalCRFEntityExtractor(EntityExtractor):
    def __init__(
        self,
        component_config: Optional[Dict[Text, Any]] = None,
        ent_tagger: Optional[Dict[Text, Any]] = None,
    ) -> None:
        super().__init__(component_config)

        # Delegate the actual tagging to a wrapped CRFEntityExtractor
        self.CRFEE = CRFEntityExtractor(component_config, ent_tagger)
        self.prev_ents = []
Example #5
def load_entity_extractor(data_file, config_file):
    training_data = load_data(data_file)
    configuration = config.load(config_file)
    comp_builder = components.ComponentBuilder()
    # component = comp_builder.create_component("ner_crf", configuration)
    # ee = EntityExtractor(components.Component(configuration))
    crf = CRFEntityExtractor()
    crf.train(training_data, configuration)
    model_directory = crf.persist('./models/default/')
    return model_directory
Example #6
def test_most_likely_entity(
    entity_predictions: List[Dict[Text, float]],
    expected_label: Text,
    expected_confidence: float,
):
    crf_extractor = CRFEntityExtractor({"BILOU_flag": True})

    actual_label, actual_confidence = crf_extractor._most_likely_tag(entity_predictions)

    assert actual_label == expected_label
    assert actual_confidence == expected_confidence
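Note: the parametrize decorator feeding entity_predictions, expected_label and expected_confidence is not part of this snippet. Conceptually, _most_likely_tag reduces to an arg-max over a token's tag-confidence dict; a simplified stand-in (not Rasa's actual code):

def most_likely_tag(prediction):
    """Return (tag, confidence) for the highest-scoring tag of a single token."""
    tag = max(prediction, key=prediction.get)
    return tag, prediction[tag]

assert most_likely_tag({"O": 0.1, "B-what": 0.7, "L-what": 0.2}) == ("B-what", 0.7)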
Example #7
def test_crf_json_from_BILOU(spacy_nlp):
    ext = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "bias",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    sentence = "I need a home cleaning close-by"

    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})

    tokenizer = SpacyTokenizer()
    tokenizer.process(message)

    r = ext._from_crf_to_json(
        message,
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"B-what": 1.0},
            {"L-what": 1.0},
            {"B-where": 1.0},
            {"I-where": 1.0},
            {"L-where": 1.0},
        ],
    )
    assert len(r) == 2, "There should be two entities"

    assert r[0]["confidence"]  # confidence should exist
    del r[0]["confidence"]
    assert r[0] == {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"}

    assert r[1]["confidence"]  # confidence should exist
    del r[1]["confidence"]
    assert r[1] == {"start": 23, "end": 31, "value": "close-by", "entity": "where"}
Example #8
def test_crf_json_from_non_BILOU(spacy_nlp, ner_crf_pos_feature_config):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ner_crf_pos_feature_config.update({"BILOU_flag": False})
    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    sentence = "I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    rs = ext._from_crf_to_json(
        Message(sentence, doc),
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"what": 1.0},
            {"what": 1.0},
            {"where": 1.0},
            {"where": 1.0},
            {"where": 1.0},
        ],
    )

    # non BILOU will split multi-word entities - hence 5
    assert len(rs) == 5, "There should be five entities"

    for r in rs:
        assert r["confidence"]  # confidence should exist
        del r["confidence"]

    assert rs[0] == {"start": 9, "end": 13, "value": "home", "entity": "what"}
    assert rs[1] == {"start": 14, "end": 22, "value": "cleaning", "entity": "what"}
    assert rs[2] == {"start": 23, "end": 28, "value": "close", "entity": "where"}
    assert rs[3] == {"start": 28, "end": 29, "value": "-", "entity": "where"}
    assert rs[4] == {"start": 29, "end": 31, "value": "by", "entity": "where"}
Example #9
def test_crf_json_from_non_BILOU(spacy_nlp):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ext = CRFEntityExtractor(
        component_config={
            "BILOU_flag": False,
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
                ["low", "title", "upper", "pos", "pos2"],
            ],
        }
    )
    sentence = "I need a home cleaning close-by"

    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})

    tokenizer = SpacyTokenizer()
    tokenizer.process(message)

    rs = ext._from_crf_to_json(
        message,
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"what": 1.0},
            {"what": 1.0},
            {"where": 1.0},
            {"where": 1.0},
            {"where": 1.0},
        ],
    )

    # non BILOU will split multi-word entities - hence 5
    assert len(rs) == 5, "There should be five entities"

    for r in rs:
        assert r["confidence"]  # confidence should exist
        del r["confidence"]

    assert rs[0] == {"start": 9, "end": 13, "value": "home", "entity": "what"}
    assert rs[1] == {"start": 14, "end": 22, "value": "cleaning", "entity": "what"}
    assert rs[2] == {"start": 23, "end": 28, "value": "close", "entity": "where"}
    assert rs[3] == {"start": 28, "end": 29, "value": "-", "entity": "where"}
    assert rs[4] == {"start": 29, "end": 31, "value": "by", "entity": "where"}
Example #10
def test_crf_use_dense_features(spacy_nlp: Any):
    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(data={TEXT: text})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._convert_to_crf_tokens(message)
    features = crf_extractor._crf_tokens_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    dense_features, _ = message.get_dense_features(TEXT, [])
    if dense_features:
        dense_features = dense_features.features

    for i in range(0, len(dense_features[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == dense_features[0][i]
        )
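Note: the loop above pins down the shape handed to sklearn-crfsuite: each token's dense vector becomes a nested dict keyed by stringified indices. A sketch of that conversion, inferred from the assertions rather than copied from Rasa's source:

import numpy as np

def dense_vector_to_crf_feature(vector):
    """Expose a dense vector as a {"text_dense_features": {str(i): value}} dict."""
    return {"text_dense_features": {str(i): float(v) for i, v in enumerate(vector)}}

feature = dense_vector_to_crf_feature(np.array([0.1, 0.2, 0.3]))
assert feature["text_dense_features"]["2"] == 0.3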
Example #11
def test_crf_json_from_BILOU(spacy_nlp, ner_crf_pos_feature_config):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    sentence = "I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    r = ext._from_crf_to_json(
        Message(sentence, doc),
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"B-what": 1.0},
            {"L-what": 1.0},
            {"B-where": 1.0},
            {"I-where": 1.0},
            {"L-where": 1.0},
        ],
    )
    assert len(r) == 2, "There should be two entities"

    assert r[0]["confidence"]  # confidence should exist
    del r[0]["confidence"]
    assert r[0] == {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"}

    assert r[1]["confidence"]  # confidence should exist
    del r[1]["confidence"]
    assert r[1] == {"start": 23, "end": 31, "value": "close-by", "entity": "where"}
Example #12
def test_crf_use_dense_features(spacy_nlp):
    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._from_text_to_crf(message)
    features = crf_extractor._sentence_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    for i in range(0, len(message.data.get("text_dense_features")[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == message.data.get("text_dense_features")[0][i]
        )
Example #13
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    examples = [
        Message(
            "anywhere in the west",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 16, "end": 20, "value": "west", "entity": "location"}
                ],
                "spacy_doc": spacy_nlp("anywhere in the west"),
            },
        ),
        Message(
            "central indian restaurant",
            {
                "intent": "restaurant_search",
                "entities": [
                    {
                        "start": 0,
                        "end": 7,
                        "value": "central",
                        "entity": "location",
                        "extractor": "random_extractor",
                    },
                    {
                        "start": 8,
                        "end": 14,
                        "value": "indian",
                        "entity": "cuisine",
                        "extractor": "CRFEntityExtractor",
                    },
                ],
                "spacy_doc": spacy_nlp("central indian restaurant"),
            },
        ),
    ]

    # uses BILOU and the default features
    ext.train(TrainingData(training_examples=examples), RasaNLUModelConfig())
    sentence = "anywhere in the west"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    crf_format = ext._from_text_to_crf(Message(sentence, doc))
    assert [word[0] for word in crf_format] == ["anywhere", "in", "the", "west"]
    feats = ext._sentence_to_features(crf_format)
    assert "BOS" in feats[0]
    assert "EOS" in feats[-1]
    assert feats[1]["0:low"] == "in"
    sentence = "anywhere in the west"
    ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
    filtered = ext.filter_trainable_entities(examples)
    assert filtered[0].get("entities") == [
        {"start": 16, "end": 20, "value": "west", "entity": "location"}
    ], "Entity without extractor remains"
    assert filtered[1].get("entities") == [
        {
            "start": 8,
            "end": 14,
            "value": "indian",
            "entity": "cuisine",
            "extractor": "CRFEntityExtractor",
        }
    ], "Only CRFEntityExtractor entity annotation remains"
    assert examples[1].get("entities")[0] == {
        "start": 0,
        "end": 7,
        "value": "central",
        "entity": "location",
        "extractor": "random_extractor",
    }, "Original examples are not mutated"
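Note: the last three assertions capture the contract of filter_trainable_entities: annotations attributed to other extractors are dropped from the training copies, human annotations (no "extractor" key) are kept, and the original messages are left untouched. The filtering rule reduces to roughly this (illustrative sketch, not the actual implementation):

def keep_trainable(entities, extractor_name="CRFEntityExtractor"):
    """Keep human annotations and this extractor's own annotations."""
    return [e for e in entities if e.get("extractor") in (None, extractor_name)]

entities = [
    {"value": "central", "extractor": "random_extractor"},
    {"value": "indian", "extractor": "CRFEntityExtractor"},
    {"value": "west"},  # no extractor key -> human annotation, kept
]
assert [e["value"] for e in keep_trainable(entities)] == ["indian", "west"]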
Example #14
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_histogram.png")
    )
    assert not os.path.exists(
        os.path.join(report_folder, "response_selection_errors.json")
    )
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_successes.json")
    )


@pytest.mark.parametrize(
    "components, expected_extractors",
    [
        ([DIETClassifier({ENTITY_RECOGNITION: False})], set()),
        ([DIETClassifier({ENTITY_RECOGNITION: True})], {"DIETClassifier"}),
        ([CRFEntityExtractor()], {"CRFEntityExtractor"}),
        (
            [SpacyEntityExtractor(), CRFEntityExtractor()],
            {"SpacyEntityExtractor", "CRFEntityExtractor"},
        ),
        ([ResponseSelector()], set()),
    ],
)
def test_get_entity_extractors(
    components: List[Component], expected_extractors: Set[Text]
):
    mock_interpreter = Interpreter(components, None)
    extractors = get_entity_extractors(mock_interpreter)

    assert extractors == expected_extractors
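Note: the parametrized cases encode the detection rule: a component only counts if it will actually emit entities, which is why DIETClassifier with entity recognition disabled maps to an empty set. A reduced, dependency-free sketch of that rule (hypothetical helper, not the rasa.nlu.test implementation; the stand-in classes exist only for this sketch):

class SpacyEntityExtractor: ...
class CRFEntityExtractor: ...
class ResponseSelector: ...

def extractor_names(pipeline):
    """Collect class names of pipeline components that extract entities."""
    # A real version would also include DIETClassifier when its
    # entity_recognition flag is on (assumption based on the cases above).
    return {
        c.__class__.__name__
        for c in pipeline
        if c.__class__.__name__.endswith("EntityExtractor")
    }

assert extractor_names([SpacyEntityExtractor(), CRFEntityExtractor()]) == {
    "SpacyEntityExtractor",
    "CRFEntityExtractor",
}
assert extractor_names([ResponseSelector()]) == set()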
Example #15
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_confusion_matrix.png"))
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_histogram.png"))
    assert not os.path.exists(
        os.path.join(report_folder, "response_selection_errors.json"))
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_successes.json"))


@pytest.mark.parametrize(
    "components, expected_extractors",
    [
        ([DIETClassifier({ENTITY_RECOGNITION: False})], set()),
        ([DIETClassifier({ENTITY_RECOGNITION: True})], {"DIETClassifier"}),
        ([CRFEntityExtractor()], {"CRFEntityExtractor"}),
        (
            [SpacyEntityExtractor(), CRFEntityExtractor()],
            {"SpacyEntityExtractor", "CRFEntityExtractor"},
        ),
        ([ResponseSelector()], set()),
    ],
)
def test_get_entity_extractors(components, expected_extractors):
    mock_interpreter = Interpreter(components, None)
    extractors = get_entity_extractors(mock_interpreter)

    assert extractors == expected_extractors

Example #17
def test_crf_extractor(spacy_nlp):
    examples = [
        Message(
            "anywhere in the west",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 16, "end": 20, "value": "west", "entity": "location"}
                ],
                SPACY_DOCS[TEXT]: spacy_nlp("anywhere in the west"),
            },
        ),
        Message(
            "central indian restaurant",
            {
                "intent": "restaurant_search",
                "entities": [
                    {
                        "start": 0,
                        "end": 7,
                        "value": "central",
                        "entity": "location",
                        "extractor": "random_extractor",
                    },
                    {
                        "start": 8,
                        "end": 14,
                        "value": "indian",
                        "entity": "cuisine",
                        "extractor": "CRFEntityExtractor",
                    },
                ],
                SPACY_DOCS[TEXT]: spacy_nlp("central indian restaurant"),
            },
        ),
    ]

    extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )
    tokenizer = SpacyTokenizer()

    training_data = TrainingData(training_examples=examples)
    tokenizer.train(training_data)
    extractor.train(training_data)

    sentence = "italian restaurant"
    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})

    tokenizer.process(message)
    extractor.process(message)

    detected_entities = message.get(ENTITIES)

    assert len(detected_entities) == 1
    assert detected_entities[0]["entity"] == "cuisine"
    assert detected_entities[0]["value"] == "italian"
Example #18
def test_crf_create_entity_dict(spacy_nlp):
    crf_extractor = CRFEntityExtractor()
    spacy_tokenizer = SpacyTokenizer()
    white_space_tokenizer = WhitespaceTokenizer()

    examples = [
        {
            "message": Message(
                "where is St. Michael's Hospital?",
                {
                    "intent": "search_location",
                    "entities": [
                        {
                            "start": 9,
                            "end": 31,
                            "value": "St. Michael's Hospital",
                            "entity": "hospital",
                            "SpacyTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 5,
                            },
                            "WhitespaceTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 5,
                            },
                        }
                    ],
                    SPACY_DOCS[TEXT]: spacy_nlp("where is St. Michael's Hospital?"),
                },
            )
        },
        {
            "message": Message(
                "where is Children's Hospital?",
                {
                    "intent": "search_location",
                    "entities": [
                        {
                            "start": 9,
                            "end": 28,
                            "value": "Children's Hospital",
                            "entity": "hospital",
                            "SpacyTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 4,
                            },
                            "WhitespaceTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 4,
                            },
                        }
                    ],
                    SPACY_DOCS[TEXT]: spacy_nlp("where is Children's Hospital?"),
                },
            )
        },
    ]
    for ex in examples:
        # the Spacy tokenizer receives a Doc as input, while the whitespace tokenizer receives raw text
        spacy_tokens = spacy_tokenizer.tokenize(ex["message"], TEXT)
        white_space_tokens = white_space_tokenizer.tokenize(ex["message"], TEXT)
        for tokenizer, tokens in [
            ("SpacyTokenizer", spacy_tokens),
            ("WhitespaceTokenizer", white_space_tokens),
        ]:
            for entity in ex["message"].get("entities"):
                parsed_entities = crf_extractor._create_entity_dict(
                    ex["message"],
                    tokens,
                    entity[tokenizer]["entity_start_token_idx"],
                    entity[tokenizer]["entity_end_token_idx"],
                    entity["entity"],
                    0.8,
                )
                assert parsed_entities == {
                    "start": entity["start"],
                    "end": entity["end"],
                    "value": entity["value"],
                    "entity": entity["entity"],
                    "confidence": 0.8,
                }
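Note: _create_entity_dict maps inclusive token boundaries back to character offsets on the raw text. A self-contained sketch, assuming tokens carry .start/.end character offsets as Rasa tokens do:

from collections import namedtuple

Token = namedtuple("Token", ["text", "start", "end"])

def entity_dict(text, tokens, start_idx, end_idx, entity, confidence):
    """Build an entity dict from inclusive token indices (illustrative only)."""
    start, end = tokens[start_idx].start, tokens[end_idx].end
    return {
        "start": start,
        "end": end,
        "value": text[start:end],
        "entity": entity,
        "confidence": confidence,
    }

text = "where is St. Michael's Hospital?"
words = ["where", "is", "St.", "Michael", "'s", "Hospital"]
tokens = [Token(w, text.index(w), text.index(w) + len(w)) for w in words]
assert entity_dict(text, tokens, 2, 5, "hospital", 0.8) == {
    "start": 9,
    "end": 31,
    "value": "St. Michael's Hospital",
    "entity": "hospital",
    "confidence": 0.8,
}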