Example #1
def test_crf_json_from_non_BILOU(spacy_nlp, ner_crf_pos_feature_config):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ner_crf_pos_feature_config.update({"BILOU_flag": False})
    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    sentence = "I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    rs = ext._from_crf_to_json(
        Message(sentence, doc),
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"what": 1.0},
            {"what": 1.0},
            {"where": 1.0},
            {"where": 1.0},
            {"where": 1.0},
        ],
    )

    # non BILOU will split multi-word entities - hence 5
    assert len(rs) == 5, "There should be five entities"

    for r in rs:
        assert r["confidence"]  # confidence should exist
        del r["confidence"]

    assert rs[0] == {"start": 9, "end": 13, "value": "home", "entity": "what"}
    assert rs[1] == {"start": 14, "end": 22, "value": "cleaning", "entity": "what"}
    assert rs[2] == {"start": 23, "end": 28, "value": "close", "entity": "where"}
    assert rs[3] == {"start": 28, "end": 29, "value": "-", "entity": "where"}
    assert rs[4] == {"start": 29, "end": 31, "value": "by", "entity": "where"}
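With BILOU_flag set to False, each token carries a bare entity label, so adjacent tokens of the same entity are not merged back into one span; Example #2 below runs the same sentence with BILOU tags (B-/I-/L- prefixes) and recovers the two multi-token entities instead.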
Example #2
def test_crf_json_from_BILOU(spacy_nlp, ner_crf_pos_feature_config):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    sentence = "I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    r = ext._from_crf_to_json(
        Message(sentence, doc),
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"B-what": 1.0},
            {"L-what": 1.0},
            {"B-where": 1.0},
            {"I-where": 1.0},
            {"L-where": 1.0},
        ],
    )
    assert len(r) == 2, "There should be two entities"

    assert r[0]["confidence"]  # confidence should exist
    del r[0]["confidence"]
    assert r[0] == {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"}

    assert r[1]["confidence"]  # confidence should exist
    del r[1]["confidence"]
    assert r[1] == {"start": 23, "end": 31, "value": "close-by", "entity": "where"}
Example #3
def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    ner_crf_pos_feature_config["features"][1].append("text_dense_features")
    crf_extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    spacy_featurizer = SpacyFeaturizer()
    white_space_tokenizer = WhitespaceTokenizer({"use_cls_token": False})

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set("spacy_doc", spacy_nlp(text))

    white_space_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._from_text_to_crf(message)
    features = crf_extractor._sentence_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    for i in range(0, len(message.data.get("text_dense_features")[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == message.data.get("text_dense_features")[0][i]
        )
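The "0:" prefix on the feature key encodes the token-window position: the three feature lists in the configuration describe the previous, current, and next token, and the generated feature keys are prefixed "-1:", "0:", and "1:" accordingly.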
Example #4
async def test_train_persist_load_with_composite_entities(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractor],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    importer = RasaFileImporter(
        training_data_paths=["data/test/demo-rasa-composite-entities.yml"])
    training_data = importer.get_nlu_data()

    whitespace_tokenizer.process_training_data(training_data)

    crf_extractor = crf_entity_extractor({})
    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})

    whitespace_tokenizer.process([message])
    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractor.load(
        CRFEntityExtractor.get_default_config(),
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()
    assert list(loaded_extractor.entity_taggers.keys()) == list(
        crf_extractor.entity_taggers.keys())
Example #6
def load_entity_extractor(data_file, config_file):
    training_data = load_data(data_file)
    configuration = config.load(config_file)
    comp_builder = components.ComponentBuilder()
    #component = comp_builder.create_component("ner_crf",configuration)
    #ee = EntityExtractor(components.Component(configuration))
    crf = CRFEntityExtractor()
    crf.train(training_data, configuration)
    model_directory = crf.persist('./models/default/')
    return model_directory
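Note that, despite the variable name, persist here returns the metadata needed to reload the tagger rather than a directory path (compare the persist docstring in the final example).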
Example #7
def inner(config: Dict[Text, Any]) -> CRFEntityExtractor:
    return CRFEntityExtractor.create(
        {
            **CRFEntityExtractor.get_default_config(),
            **config
        },
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )
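Examples #4 and #10 consume this factory as the crf_entity_extractor fixture; a typical call merges a test-specific config over the component defaults, for example:

crf_extractor = crf_entity_extractor({"BILOU_flag": False})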
Example #8
def test_most_likely_entity(
    entity_predictions: List[Dict[Text, float]],
    expected_label: Text,
    expected_confidence: float,
):
    crf_extractor = CRFEntityExtractor({"BILOU_flag": True})

    actual_label, actual_confidence = crf_extractor._most_likely_tag(entity_predictions)

    assert actual_label == expected_label
    assert actual_confidence == expected_confidence
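The pytest.mark.parametrize decorator that feeds entity_predictions, expected_label, and expected_confidence is not part of this snippet. A hypothetical parametrization matching the signature (the values are illustrative, not taken from the Rasa test suite) could look like:

@pytest.mark.parametrize(
    "entity_predictions, expected_label, expected_confidence",
    [
        # illustrative only: a single token whose most likely tag is "O"
        ([{"O": 0.8, "B-location": 0.2}], "O", 0.8),
    ],
)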
Example #9
def test_crf_json_from_BILOU(spacy_nlp):
    ext = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "bias",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    sentence = "I need a home cleaning close-by"

    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})

    tokenizer = SpacyTokenizer()
    tokenizer.process(message)

    r = ext._from_crf_to_json(
        message,
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"B-what": 1.0},
            {"L-what": 1.0},
            {"B-where": 1.0},
            {"I-where": 1.0},
            {"L-where": 1.0},
        ],
    )
    assert len(r) == 2, "There should be two entities"

    assert r[0]["confidence"]  # confidence should exist
    del r[0]["confidence"]
    assert r[0] == {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"}

    assert r[1]["confidence"]  # confidence should exist
    del r[1]["confidence"]
    assert r[1] == {"start": 23, "end": 31, "value": "close-by", "entity": "where"}
Example #10
async def test_train_persist_with_different_configurations(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractor],
    config_params: Dict[Text, Any],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    spacy_tokenizer: SpacyTokenizer,
    spacy_featurizer: SpacyFeaturizer,
    spacy_nlp_component: SpacyNLP,
    spacy_model: SpacyModel,
):

    crf_extractor = crf_entity_extractor(config_params)

    importer = RasaFileImporter(training_data_paths=["data/examples/rasa"])
    training_data = importer.get_nlu_data()

    training_data = spacy_nlp_component.process_training_data(
        training_data, spacy_model)
    training_data = spacy_tokenizer.process_training_data(training_data)
    training_data = spacy_featurizer.process_training_data(training_data)
    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})
    messages = spacy_nlp_component.process([message], spacy_model)
    messages = spacy_tokenizer.process(messages)
    message = spacy_featurizer.process(messages)[0]
    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractor.load(
        {
            **CRFEntityExtractor.get_default_config(),
            **config_params
        },
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()

    detected_entities = processed_message2.get(ENTITIES)

    assert len(detected_entities) == 1
    assert detected_entities[0]["entity"] == "cuisine"
    assert detected_entities[0]["value"] == "italian"
Example #11
def test_crf_json_from_non_BILOU(spacy_nlp, ner_crf_pos_feature_config):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ner_crf_pos_feature_config.update({"BILOU_flag": False})
    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    sentence = u"I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    rs = ext._from_crf_to_json(Message(sentence, doc), [{
        'O': 1.0
    }, {
        'O': 1.0
    }, {
        'O': 1.0
    }, {
        'what': 1.0
    }, {
        'what': 1.0
    }, {
        'where': 1.0
    }, {
        'where': 1.0
    }, {
        'where': 1.0
    }])

    # non BILOU will split multi-word entities - hence 5
    assert len(rs) == 5, "There should be five entities"

    for r in rs:
        assert r['confidence']  # confidence should exist
        del r['confidence']

    assert rs[0] == {'start': 9, 'end': 13, 'value': 'home', 'entity': 'what'}
    assert rs[1] == {
        'start': 14,
        'end': 22,
        'value': 'cleaning',
        'entity': 'what'
    }
    assert rs[2] == {
        'start': 23,
        'end': 28,
        'value': 'close',
        'entity': 'where'
    }
    assert rs[3] == {'start': 28, 'end': 29, 'value': '-', 'entity': 'where'}
    assert rs[4] == {'start': 29, 'end': 31, 'value': 'by', 'entity': 'where'}
Example #12
def test_crf_json_from_non_BILOU(spacy_nlp):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ext = CRFEntityExtractor(
        component_config={
            "BILOU_flag": False,
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
                ["low", "title", "upper", "pos", "pos2"],
            ],
        }
    )
    sentence = "I need a home cleaning close-by"

    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})

    tokenizer = SpacyTokenizer()
    tokenizer.process(message)

    rs = ext._from_crf_to_json(
        message,
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"what": 1.0},
            {"what": 1.0},
            {"where": 1.0},
            {"where": 1.0},
            {"where": 1.0},
        ],
    )

    # non BILOU will split multi-word entities - hence 5
    assert len(rs) == 5, "There should be five entities"

    for r in rs:
        assert r["confidence"]  # confidence should exist
        del r["confidence"]

    assert rs[0] == {"start": 9, "end": 13, "value": "home", "entity": "what"}
    assert rs[1] == {"start": 14, "end": 22, "value": "cleaning", "entity": "what"}
    assert rs[2] == {"start": 23, "end": 28, "value": "close", "entity": "where"}
    assert rs[3] == {"start": 28, "end": 29, "value": "-", "entity": "where"}
    assert rs[4] == {"start": 29, "end": 31, "value": "by", "entity": "where"}
Example #13
def test_crf_use_dense_features(spacy_nlp: Any):
    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(data={TEXT: text})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._convert_to_crf_tokens(message)
    features = crf_extractor._crf_tokens_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    dense_features, _ = message.get_dense_features(TEXT, [])
    if dense_features:
        dense_features = dense_features.features

    for i in range(0, len(dense_features[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == dense_features[0][i]
        )
Example #14
def test_crf_json_from_BILOU(spacy_nlp, ner_crf_pos_feature_config):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    sentence = u"I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    r = ext._from_crf_to_json(Message(sentence, doc), [{
        'O': 1.0
    }, {
        'O': 1.0
    }, {
        'O': 1.0
    }, {
        'B-what': 1.0
    }, {
        'L-what': 1.0
    }, {
        'B-where': 1.0
    }, {
        'I-where': 1.0
    }, {
        'L-where': 1.0
    }])
    assert len(r) == 2, "There should be two entities"

    assert r[0]["confidence"]  # confidence should exist
    del r[0]["confidence"]
    assert r[0] == {
        'start': 9,
        'end': 22,
        'value': 'home cleaning',
        'entity': 'what'
    }

    assert r[1]["confidence"]  # confidence should exist
    del r[1]["confidence"]
    assert r[1] == {
        'start': 23,
        'end': 31,
        'value': 'close-by',
        'entity': 'where'
    }
Example #15
def test_crf_use_dense_features(spacy_nlp):
    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._from_text_to_crf(message)
    features = crf_extractor._sentence_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    for i in range(0, len(message.data.get("text_dense_features")[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == message.data.get("text_dense_features")[0][i]
        )
Example #16
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_confusion_matrix.png"))
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_histogram.png"))
    assert not os.path.exists(
        os.path.join(report_folder, "response_selection_errors.json"))
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_successes.json"))


@pytest.mark.parametrize(
    "components, expected_extractors",
    [
        ([DIETClassifier({ENTITY_RECOGNITION: False})], set()),
        ([DIETClassifier({ENTITY_RECOGNITION: True})], {"DIETClassifier"}),
        ([CRFEntityExtractor()], {"CRFEntityExtractor"}),
        (
            [SpacyEntityExtractor(),
             CRFEntityExtractor()],
            {"SpacyEntityExtractor", "CRFEntityExtractor"},
        ),
        ([ResponseSelector()], set()),
    ],
)
def test_get_entity_extractors(components, expected_extractors):
    mock_interpreter = Interpreter(components, None)
    extractors = get_entity_extractors(mock_interpreter)

    assert extractors == expected_extractors

Example #17
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_histogram.png")
    )
    assert not os.path.exists(
        os.path.join(report_folder, "response_selection_errors.json")
    )
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_successes.json")
    )


@pytest.mark.parametrize(
    "components, expected_extractors",
    [
        ([DIETClassifier({ENTITY_RECOGNITION: False})], set()),
        ([DIETClassifier({ENTITY_RECOGNITION: True})], {"DIETClassifier"}),
        ([CRFEntityExtractor()], {"CRFEntityExtractor"}),
        (
            [SpacyEntityExtractor(), CRFEntityExtractor()],
            {"SpacyEntityExtractor", "CRFEntityExtractor"},
        ),
        ([ResponseSelector()], set()),
    ],
)
def test_get_entity_extractors(
    components: List[Component], expected_extractors: Set[Text]
):
    mock_interpreter = Interpreter(components, None)
    extractors = get_entity_extractors(mock_interpreter)

    assert extractors == expected_extractors
Example #18
def test_crf_extractor(spacy_nlp):
    examples = [
        Message(
            "anywhere in the west",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 16, "end": 20, "value": "west", "entity": "location"}
                ],
                SPACY_DOCS[TEXT]: spacy_nlp("anywhere in the west"),
            },
        ),
        Message(
            "central indian restaurant",
            {
                "intent": "restaurant_search",
                "entities": [
                    {
                        "start": 0,
                        "end": 7,
                        "value": "central",
                        "entity": "location",
                        "extractor": "random_extractor",
                    },
                    {
                        "start": 8,
                        "end": 14,
                        "value": "indian",
                        "entity": "cuisine",
                        "extractor": "CRFEntityExtractor",
                    },
                ],
                SPACY_DOCS[TEXT]: spacy_nlp("central indian restaurant"),
            },
        ),
    ]

    extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )
    tokenizer = SpacyTokenizer()

    training_data = TrainingData(training_examples=examples)
    tokenizer.train(training_data)
    extractor.train(training_data)

    sentence = "italian restaurant"
    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})

    tokenizer.process(message)
    extractor.process(message)

    detected_entities = message.get(ENTITIES)

    assert len(detected_entities) == 1
    assert detected_entities[0]["entity"] == "cuisine"
    assert detected_entities[0]["value"] == "italian"
Example #20
def test_crf_create_entity_dict(spacy_nlp):
    crf_extractor = CRFEntityExtractor()
    spacy_tokenizer = SpacyTokenizer()
    white_space_tokenizer = WhitespaceTokenizer()

    examples = [
        {
            "message": Message(
                "where is St. Michael's Hospital?",
                {
                    "intent": "search_location",
                    "entities": [
                        {
                            "start": 9,
                            "end": 31,
                            "value": "St. Michael's Hospital",
                            "entity": "hospital",
                            "SpacyTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 5,
                            },
                            "WhitespaceTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 5,
                            },
                        }
                    ],
                    SPACY_DOCS[TEXT]: spacy_nlp("where is St. Michael's Hospital?"),
                },
            )
        },
        {
            "message": Message(
                "where is Children's Hospital?",
                {
                    "intent": "search_location",
                    "entities": [
                        {
                            "start": 9,
                            "end": 28,
                            "value": "Children's Hospital",
                            "entity": "hospital",
                            "SpacyTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 4,
                            },
                            "WhitespaceTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 4,
                            },
                        }
                    ],
                    SPACY_DOCS[TEXT]: spacy_nlp("where is Children's Hospital?"),
                },
            )
        },
    ]
    for ex in examples:
        # the spaCy tokenizer receives a Doc as input and the whitespace tokenizer receives a text
        spacy_tokens = spacy_tokenizer.tokenize(ex["message"], TEXT)
        white_space_tokens = white_space_tokenizer.tokenize(ex["message"], TEXT)
        for tokenizer, tokens in [
            ("SpacyTokenizer", spacy_tokens),
            ("WhitespaceTokenizer", white_space_tokens),
        ]:
            for entity in ex["message"].get("entities"):
                parsed_entities = crf_extractor._create_entity_dict(
                    ex["message"],
                    tokens,
                    entity[tokenizer]["entity_start_token_idx"],
                    entity[tokenizer]["entity_end_token_idx"],
                    entity["entity"],
                    0.8,
                )
                assert parsed_entities == {
                    "start": entity["start"],
                    "end": entity["end"],
                    "value": entity["value"],
                    "entity": entity["entity"],
                    "confidence": 0.8,
                }
Example #21
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    examples = [
        Message(
            "anywhere in the west",
            {
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 16,
                    "end": 20,
                    "value": "west",
                    "entity": "location"
                }],
                "spacy_doc":
                spacy_nlp("anywhere in the west"),
            },
        ),
        Message(
            "central indian restaurant",
            {
                "intent":
                "restaurant_search",
                "entities": [
                    {
                        "start": 0,
                        "end": 7,
                        "value": "central",
                        "entity": "location",
                        "extractor": "random_extractor",
                    },
                    {
                        "start": 8,
                        "end": 14,
                        "value": "indian",
                        "entity": "cuisine",
                        "extractor": "CRFEntityExtractor",
                    },
                ],
                "spacy_doc":
                spacy_nlp("central indian restaurant"),
            },
        ),
    ]

    # uses BILOU and the default features
    ext.train(TrainingData(training_examples=examples), RasaNLUModelConfig())
    sentence = "anywhere in the west"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    crf_format = ext._from_text_to_crf(Message(sentence, doc))
    assert [word[0]
            for word in crf_format] == ["anywhere", "in", "the", "west"]
    feats = ext._sentence_to_features(crf_format)
    assert "BOS" in feats[0]
    assert "EOS" in feats[-1]
    assert feats[1]["0:low"] == "in"
    sentence = "anywhere in the west"
    ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
    filtered = ext.filter_trainable_entities(examples)
    assert filtered[0].get("entities") == [{
        "start": 16,
        "end": 20,
        "value": "west",
        "entity": "location"
    }], "Entity without extractor remains"
    assert filtered[1].get("entities") == [{
        "start": 8,
        "end": 14,
        "value": "indian",
        "entity": "cuisine",
        "extractor": "CRFEntityExtractor",
    }], "Only CRFEntityExtractor entity annotation remains"
    assert examples[1].get("entities")[0] == {
        "start": 0,
        "end": 7,
        "value": "central",
        "entity": "location",
        "extractor": "random_extractor",
    }, "Original examples are not mutated"
Example #22
class IncrementalCRFEntityExtractor(EntityExtractor, IncrementalComponent):

    provides = ["entities"]

    requires = ["tokens"]

    def __init__(self,
                 component_config: Optional[Dict[Text, Any]]=None,
                 ent_tagger: Optional[Dict[Text, Any]]=None) -> None:

        super(IncrementalCRFEntityExtractor, self).__init__(component_config)

        self.CRFEE = CRFEntityExtractor(component_config, ent_tagger)
        self.prev_ents = []

    def new_utterance(self):
        self.prev_ents = []

    @classmethod
    def required_packages(cls):
        return ["sklearn_crfsuite", "sklearn"]

    def train(self,
              training_data: TrainingData,
              config: RasaNLUModelConfig,
              **kwargs: Any) -> None:

        self.CRFEE.train(training_data, config, **kwargs)

    def process(self, message: Message, **kwargs: Any) -> None:
        iu_list = message.get('iu_list')
        last_iu = iu_list[-1]
        iu_word, iu_type = last_iu
        # TODO: inefficient right now, we are always storing
        # previous state, even if a new entity hasn't been
        # added

        # This will not work with multiple extractors
        if iu_type == "add":
            extracted = self.add_extractor_name(
                self.CRFEE.extract_entities(message))
            message.set("entities", extracted, add_to_output=True)
            self.prev_ents.append(message.get("entities"))
        elif iu_type == "revoke":
            if len(self.prev_ents) > 0:
                prev_ent = self.prev_ents.pop()
                message.set("entities", prev_ent,
                            add_to_output=True)

    @classmethod
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Optional[Text] = None,
             model_metadata: Metadata = None,
             cached_component: Optional['IncrementalCRFEntityExtractor'] = None,
             **kwargs: Any
             ) -> 'IncrementalCRFEntityExtractor':
        from sklearn.externals import joblib

        file_name = meta.get("file")
        model_file = os.path.join(model_dir, file_name)

        if os.path.exists(model_file):
            ent_tagger = joblib.load(model_file)
            return cls(meta, ent_tagger)
        else:
            return cls(meta)

    def persist(self,
                file_name: Text,
                model_dir: Text) -> Optional[Dict[Text, Any]]:
        """Persist this model into the passed directory.

        Returns the metadata necessary to load the model again."""

        return self.CRFEE.persist(file_name + "_incr", model_dir)
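A minimal sketch of driving the incremental interface above. It assumes a trained (or loaded) extractor; the iu_list payload follows the ("word", "add"/"revoke") convention that process expects, and the concrete text and values are hypothetical:

extractor = IncrementalCRFEntityExtractor()  # in practice: train() or load() first
extractor.new_utterance()                    # reset the stored entity states
msg = Message("table in Berlin", {"iu_list": [("Berlin", "add")]})
extractor.process(msg)                       # "add" IU: extract entities, remember the state
msg.data["iu_list"].append(("Berlin", "revoke"))
extractor.process(msg)                       # "revoke" IU: roll back to the previous entities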