def test_crf_json_from_non_BILOU(spacy_nlp):
    """Check ``_from_crf_to_json`` output when ``BILOU_flag`` is switched off.

    The sentence has eight tokens and eight per-token tag dictionaries; the
    five tokens that are not tagged ``'O'`` are each expected back as a
    separate entity.

    ``spacy_nlp`` is called on the sentence to build the ``spacy_doc`` entry
    (presumably a pytest fixture providing a loaded spaCy pipeline).
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor()
    extractor.BILOU_flag = False

    text = u"I need a home cleaning close-by"
    parsed = {"spacy_doc": spacy_nlp(text)}
    entities = extractor._from_crf_to_json(
        Message(text, parsed),
        [
            {'O': 1.0},
            {'O': 1.0},
            {'O': 1.0},
            {'what': 1.0},
            {'what': 1.0},
            {'where': 1.0},
            {'where': 1.0},
            {'where': 1.0},
        ],
    )

    # non BILOU will split multi-word entities - hence 5
    assert len(entities) == 5, "There should be five entities"

    # The confidence value itself is not pinned; it only has to be present
    # (and truthy), after which it is dropped so the dicts compare exactly.
    for entity in entities:
        assert entity['confidence']  # confidence should exist
        del entity['confidence']

    expected = [
        {'start': 9, 'end': 13, 'value': 'home', 'entity': 'what'},
        {'start': 14, 'end': 22, 'value': 'cleaning', 'entity': 'what'},
        {'start': 23, 'end': 28, 'value': 'close', 'entity': 'where'},
        {'start': 28, 'end': 29, 'value': '-', 'entity': 'where'},
        {'start': 29, 'end': 31, 'value': 'by', 'entity': 'where'},
    ]
    for position, wanted in enumerate(expected):
        assert entities[position] == wanted
def test_crf_json_from_BILOU(spacy_nlp):
    """Check ``_from_crf_to_json`` output when ``BILOU_flag`` is switched on.

    The ``B-``/``I-``/``L-`` prefixed tags below are expected to be merged
    into two multi-token entities: ``'home cleaning'`` and ``'close-by'``.

    ``spacy_nlp`` is called on the sentence to build the ``spacy_doc`` entry
    (presumably a pytest fixture providing a loaded spaCy pipeline).
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor()
    extractor.BILOU_flag = True

    text = u"I need a home cleaning close-by"
    parsed = {"spacy_doc": spacy_nlp(text)}
    entities = extractor._from_crf_to_json(
        Message(text, parsed),
        [
            {'O': 1.0},
            {'O': 1.0},
            {'O': 1.0},
            {'B-what': 1.0},
            {'L-what': 1.0},
            {'B-where': 1.0},
            {'I-where': 1.0},
            {'L-where': 1.0},
        ],
    )
    assert len(entities) == 2, "There should be two entities"

    expected = [
        {'start': 9, 'end': 22, 'value': 'home cleaning', 'entity': 'what'},
        {'start': 23, 'end': 31, 'value': 'close-by', 'entity': 'where'},
    ]
    for position, wanted in enumerate(expected):
        found = entities[position]
        # The confidence value is not pinned; it must exist, then it is
        # removed so the remaining keys can be compared exactly.
        assert found["confidence"]  # confidence should exist
        del found["confidence"]
        assert found == wanted
def test_crf_json_from_BILOU(spacy_nlp):
    """Verify BILOU-tagged tokens are merged into multi-token entities.

    With ``BILOU_flag`` set to ``True``, ``_from_crf_to_json`` is expected to
    return exactly two entities for the tagged sentence, each carrying a
    ``confidence`` entry alongside start/end/value/entity.

    ``spacy_nlp`` is called on the sentence to build the ``spacy_doc`` entry
    (presumably a pytest fixture providing a loaded spaCy pipeline).
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor()
    extractor.BILOU_flag = True

    text = u"I need a home cleaning close-by"
    parsed = {"spacy_doc": spacy_nlp(text)}
    entities = extractor._from_crf_to_json(
        Message(text, parsed),
        [
            {'O': 1.0},
            {'O': 1.0},
            {'O': 1.0},
            {'B-what': 1.0},
            {'L-what': 1.0},
            {'B-where': 1.0},
            {'I-where': 1.0},
            {'L-where': 1.0},
        ],
    )
    assert len(entities) == 2, "There should be two entities"

    expected = [
        {'start': 9, 'end': 22, 'value': 'home cleaning', 'entity': 'what'},
        {'start': 23, 'end': 31, 'value': 'close-by', 'entity': 'where'},
    ]
    for position, wanted in enumerate(expected):
        found = entities[position]
        # Presence of a truthy confidence is checked first; it is then
        # deleted so the exact-dict comparison is not affected by its value.
        assert found["confidence"]  # confidence should exist
        del found["confidence"]
        assert found == wanted
def test_crf_json_from_non_BILOU(spacy_nlp):
    """Check ``_from_crf_to_json`` with ``BILOU_flag`` disabled via config.

    The extractor is built with ``component_config={"BILOU_flag": False}``.
    Each of the five tokens not tagged ``'O'`` is expected back as its own
    entity.

    ``spacy_nlp`` is called on the sentence to build the ``spacy_doc`` entry
    (presumably a pytest fixture providing a loaded spaCy pipeline).
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor(component_config={"BILOU_flag": False})

    text = u"I need a home cleaning close-by"
    parsed = {"spacy_doc": spacy_nlp(text)}
    entities = extractor._from_crf_to_json(
        Message(text, parsed),
        [
            {'O': 1.0},
            {'O': 1.0},
            {'O': 1.0},
            {'what': 1.0},
            {'what': 1.0},
            {'where': 1.0},
            {'where': 1.0},
            {'where': 1.0},
        ],
    )

    # non BILOU will split multi-word entities - hence 5
    assert len(entities) == 5, "There should be five entities"

    # Confidence only has to be present (and truthy); it is removed so the
    # remaining keys can be compared exactly.
    for entity in entities:
        assert entity['confidence']  # confidence should exist
        del entity['confidence']

    expected = [
        {'start': 9, 'end': 13, 'value': 'home', 'entity': 'what'},
        {'start': 14, 'end': 22, 'value': 'cleaning', 'entity': 'what'},
        {'start': 23, 'end': 28, 'value': 'close', 'entity': 'where'},
        {'start': 28, 'end': 29, 'value': '-', 'entity': 'where'},
        {'start': 29, 'end': 31, 'value': 'by', 'entity': 'where'},
    ]
    for position, wanted in enumerate(expected):
        assert entities[position] == wanted
def test_crf_json_from_BILOU(spacy_nlp):
    """Check ``_from_crf_to_json`` with BILOU tags given as plain strings.

    In this variant the parsed spaCy document is handed to
    ``_from_crf_to_json`` directly, and the tags are bare strings. Two merged
    entities are expected.

    ``spacy_nlp`` is called on the sentence to parse it (presumably a pytest
    fixture providing a loaded spaCy pipeline).
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor()
    extractor.BILOU_flag = True

    text = u"I need a home cleaning close-by"
    tags = ['O', 'O', 'O', 'B-what', 'L-what', 'B-where', 'I-where', 'L-where']
    entities = extractor._from_crf_to_json(spacy_nlp(text), tags)
    assert len(entities) == 2, "There should be two entities"

    expected = [
        {u'start': 9, u'end': 22, u'value': u'home cleaning', u'entity': u'what'},
        {u'start': 23, u'end': 31, u'value': u'close-by', u'entity': u'where'},
    ]
    for position, wanted in enumerate(expected):
        assert entities[position] == wanted
def test_crf_json_from_BILOU(spacy_nlp):
    """Check ``_from_crf_to_json`` on a ``Message`` with plain-string BILOU tags.

    The sentence is wrapped in a ``Message`` whose data carries the parsed
    ``spacy_doc``; the tags are bare strings. Two merged entities are
    expected.

    ``spacy_nlp`` is called on the sentence to parse it (presumably a pytest
    fixture providing a loaded spaCy pipeline).
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor()
    extractor.BILOU_flag = True

    text = u"I need a home cleaning close-by"
    message = Message(text, {"spacy_doc": spacy_nlp(text)})
    tags = ['O', 'O', 'O', 'B-what', 'L-what', 'B-where', 'I-where', 'L-where']
    entities = extractor._from_crf_to_json(message, tags)
    assert len(entities) == 2, "There should be two entities"

    expected = [
        {u'start': 9, u'end': 22, u'value': u'home cleaning', u'entity': u'what'},
        {u'start': 23, u'end': 31, u'value': u'close-by', u'entity': u'where'},
    ]
    for position, wanted in enumerate(expected):
        assert entities[position] == wanted
def test_crf_json_from_non_BILOU(spacy_nlp):
    """Check ``_from_crf_to_json`` without BILOU, tags given as plain strings.

    In this variant the parsed spaCy document is handed to
    ``_from_crf_to_json`` directly. Each of the five tokens not tagged
    ``'O'`` is expected back as its own entity.

    ``spacy_nlp`` is called on the sentence to parse it (presumably a pytest
    fixture providing a loaded spaCy pipeline).
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor()
    extractor.BILOU_flag = False

    text = u"I need a home cleaning close-by"
    tags = ['O', 'O', 'O', 'what', 'what', 'where', 'where', 'where']
    entities = extractor._from_crf_to_json(spacy_nlp(text), tags)

    # non BILOU will split multi-word entities - hence 5
    assert len(entities) == 5, "There should be five entities"

    expected = [
        {u'start': 9, u'end': 13, u'value': u'home', u'entity': u'what'},
        {u'start': 14, u'end': 22, u'value': u'cleaning', u'entity': u'what'},
        {u'start': 23, u'end': 28, u'value': u'close', u'entity': u'where'},
        {u'start': 28, u'end': 29, u'value': u'-', u'entity': u'where'},
        {u'start': 29, u'end': 31, u'value': u'by', u'entity': u'where'},
    ]
    for position, wanted in enumerate(expected):
        assert entities[position] == wanted
def test_crf_json_from_non_BILOU(spacy_nlp):
    """Check ``_from_crf_to_json`` on a ``Message`` without BILOU tagging.

    The sentence is wrapped in a ``Message`` whose data carries the parsed
    ``spacy_doc``; tags are bare strings. Each of the five tokens not tagged
    ``'O'`` is expected back as its own entity.

    ``spacy_nlp`` is called on the sentence to parse it (presumably a pytest
    fixture providing a loaded spaCy pipeline).
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor()
    extractor.BILOU_flag = False

    text = u"I need a home cleaning close-by"
    message = Message(text, {"spacy_doc": spacy_nlp(text)})
    tags = ['O', 'O', 'O', 'what', 'what', 'where', 'where', 'where']
    entities = extractor._from_crf_to_json(message, tags)

    # non BILOU will split multi-word entities - hence 5
    assert len(entities) == 5, "There should be five entities"

    expected = [
        {u'start': 9, u'end': 13, u'value': u'home', u'entity': u'what'},
        {u'start': 14, u'end': 22, u'value': u'cleaning', u'entity': u'what'},
        {u'start': 23, u'end': 28, u'value': u'close', u'entity': u'where'},
        {u'start': 28, u'end': 29, u'value': u'-', u'entity': u'where'},
        {u'start': 29, u'end': 31, u'value': u'by', u'entity': u'where'},
    ]
    for position, wanted in enumerate(expected):
        assert entities[position] == wanted
def test_crf_extractor(spacy_nlp):
    """Train the CRF extractor on two examples and check its conversions.

    After training with BILOU enabled and the extractor's own
    ``crf_features``, the test checks the token texts produced by
    ``_from_text_to_crf``, a few keys/values produced by
    ``_sentence_to_features``, and finally calls ``extract_entities`` (its
    return value is not inspected).

    ``spacy_nlp`` is called on each text to parse it (presumably a pytest
    fixture providing a loaded spaCy pipeline).
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    def parsed_message(text):
        # A message carrying nothing but the spaCy parse of its text.
        return Message(text, {"spacy_doc": spacy_nlp(text)})

    extractor = CRFEntityExtractor()

    west_example = Message("anywhere in the west", {
        "intent": "restaurant_search",
        "entities": [{"start": 16, "end": 20, "value": "west", "entity": "location"}],
        "spacy_doc": spacy_nlp("anywhere in the west"),
    })
    central_example = Message("central indian restaurant", {
        "intent": "restaurant_search",
        "entities": [{"start": 0, "end": 7, "value": "central", "entity": "location"}],
        "spacy_doc": spacy_nlp("central indian restaurant"),
    })
    examples = [west_example, central_example]

    config = {"ner_crf": {"BILOU_flag": True, "features": extractor.crf_features}}
    extractor.train(TrainingData(training_examples=examples), config)

    sentence = 'anywhere in the west'
    crf_tokens = extractor._from_text_to_crf(parsed_message(sentence))
    # The first element of every CRF token is compared against the word text.
    token_texts = [token[0] for token in crf_tokens]
    assert token_texts == ['anywhere', 'in', 'the', 'west']

    features = extractor._sentence_to_features(crf_tokens)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert features[1]['0:low'] == "in"

    # Only checks that extraction runs on a fresh message without raising.
    extractor.extract_entities(parsed_message(sentence))
def test_crf_extractor(spacy_nlp):
    """Exercise training, text-to-CRF conversion and feature building.

    The extractor is trained on two ``Message`` examples using a config dict
    that enables BILOU and passes the extractor's own ``crf_features``. The
    test then checks token texts from ``_from_text_to_crf``, selected entries
    from ``_sentence_to_features``, and calls ``extract_entities`` without
    inspecting its return value.

    ``spacy_nlp`` is called on each text to parse it (presumably a pytest
    fixture providing a loaded spaCy pipeline).
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    def parsed_message(text):
        # A message carrying nothing but the spaCy parse of its text.
        return Message(text, {"spacy_doc": spacy_nlp(text)})

    extractor = CRFEntityExtractor()

    west_example = Message("anywhere in the west", {
        "intent": "restaurant_search",
        "entities": [{"start": 16, "end": 20, "value": "west", "entity": "location"}],
        "spacy_doc": spacy_nlp("anywhere in the west"),
    })
    central_example = Message("central indian restaurant", {
        "intent": "restaurant_search",
        "entities": [{"start": 0, "end": 7, "value": "central", "entity": "location"}],
        "spacy_doc": spacy_nlp("central indian restaurant"),
    })
    examples = [west_example, central_example]

    config = {"ner_crf": {"BILOU_flag": True, "features": extractor.crf_features}}
    extractor.train(TrainingData(training_examples=examples), config)

    sentence = 'anywhere in the west'
    crf_tokens = extractor._from_text_to_crf(parsed_message(sentence))
    # The first element of every CRF token is compared against the word text.
    token_texts = [token[0] for token in crf_tokens]
    assert token_texts == ['anywhere', 'in', 'the', 'west']

    features = extractor._sentence_to_features(crf_tokens)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert features[1]['0:low'] == "in"

    # Only checks that extraction runs on a fresh message without raising.
    extractor.extract_entities(parsed_message(sentence))
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    """Train the CRF extractor and check conversion plus entity filtering.

    Covers, in order: training on two ``Message`` examples, the token texts
    from ``_from_text_to_crf``, selected entries from
    ``_sentence_to_features``, a call to ``extract_entities`` (return value
    not inspected), and ``filter_trainable_entities``. The filter is expected
    to keep entities with no ``extractor`` key and entities whose extractor
    is ``"ner_crf"``, to drop the ``"random_extractor"`` one, and to leave
    the original examples untouched.

    ``spacy_nlp`` is called on each text to parse it and
    ``ner_crf_pos_feature_config`` is passed as the component config (both
    presumably pytest fixtures defined elsewhere).
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    def parsed_message(text):
        # A message carrying nothing but the spaCy parse of its text.
        return Message(text, {"spacy_doc": spacy_nlp(text)})

    extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    west_example = Message("anywhere in the west", {
        "intent": "restaurant_search",
        "entities": [{"start": 16, "end": 20, "value": "west", "entity": "location"}],
        "spacy_doc": spacy_nlp("anywhere in the west"),
    })
    central_example = Message("central indian restaurant", {
        "intent": "restaurant_search",
        "entities": [
            {"start": 0, "end": 7, "value": "central", "entity": "location",
             "extractor": "random_extractor"},
            {"start": 8, "end": 14, "value": "indian", "entity": "cuisine",
             "extractor": "ner_crf"},
        ],
        "spacy_doc": spacy_nlp("central indian restaurant"),
    })
    examples = [west_example, central_example]

    # uses BILOU and the default features
    extractor.train(TrainingData(training_examples=examples), RasaNLUModelConfig())

    sentence = 'anywhere in the west'
    crf_tokens = extractor._from_text_to_crf(parsed_message(sentence))
    # The first element of every CRF token is compared against the word text.
    token_texts = [token[0] for token in crf_tokens]
    assert token_texts == ['anywhere', 'in', 'the', 'west']

    features = extractor._sentence_to_features(crf_tokens)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert features[1]['0:low'] == "in"

    # Only checks that extraction runs on a fresh message without raising.
    extractor.extract_entities(parsed_message(sentence))

    # Expected values below are fresh literals on purpose: they must not
    # share objects with the training examples being checked for mutation.
    filtered = extractor.filter_trainable_entities(examples)
    assert filtered[0].get('entities') == [
        {"start": 16, "end": 20, "value": "west", "entity": "location"}
    ], 'Entity without extractor remains'
    assert filtered[1].get('entities') == [
        {"start": 8, "end": 14, "value": "indian", "entity": "cuisine",
         "extractor": "ner_crf"}
    ], 'Only ner_crf entity annotation remains'
    assert examples[1].get('entities')[0] == {
        "start": 0, "end": 7, "value": "central", "entity": "location",
        "extractor": "random_extractor"
    }, 'Original examples are not mutated'
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    """Train the CRF extractor and check conversion plus entity filtering.

    Covers, in order: training on two ``Message`` examples, the token texts
    from ``_from_text_to_crf``, selected entries from
    ``_sentence_to_features``, a call to ``extract_entities`` (return value
    not inspected), and ``filter_trainable_entities``. The filter is expected
    to keep entities with no ``extractor`` key and entities whose extractor
    is ``"CRFEntityExtractor"``, to drop the ``"random_extractor"`` one, and
    to leave the original examples untouched.

    ``spacy_nlp`` is called on each text to parse it and
    ``ner_crf_pos_feature_config`` is passed as the component config (both
    presumably pytest fixtures defined elsewhere).
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    def parsed_message(text):
        # A message carrying nothing but the spaCy parse of its text.
        return Message(text, {"spacy_doc": spacy_nlp(text)})

    extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    west_example = Message("anywhere in the west", {
        "intent": "restaurant_search",
        "entities": [{"start": 16, "end": 20, "value": "west", "entity": "location"}],
        "spacy_doc": spacy_nlp("anywhere in the west"),
    })
    central_example = Message("central indian restaurant", {
        "intent": "restaurant_search",
        "entities": [
            {"start": 0, "end": 7, "value": "central", "entity": "location",
             "extractor": "random_extractor"},
            {"start": 8, "end": 14, "value": "indian", "entity": "cuisine",
             "extractor": "CRFEntityExtractor"},
        ],
        "spacy_doc": spacy_nlp("central indian restaurant"),
    })
    examples = [west_example, central_example]

    # uses BILOU and the default features
    extractor.train(TrainingData(training_examples=examples), RasaNLUModelConfig())

    sentence = 'anywhere in the west'
    crf_tokens = extractor._from_text_to_crf(parsed_message(sentence))
    # The first element of every CRF token is compared against the word text.
    token_texts = [token[0] for token in crf_tokens]
    assert token_texts == ['anywhere', 'in', 'the', 'west']

    features = extractor._sentence_to_features(crf_tokens)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert features[1]['0:low'] == "in"

    # Only checks that extraction runs on a fresh message without raising.
    extractor.extract_entities(parsed_message(sentence))

    # Expected values below are fresh literals on purpose: they must not
    # share objects with the training examples being checked for mutation.
    filtered = extractor.filter_trainable_entities(examples)
    assert filtered[0].get('entities') == [
        {"start": 16, "end": 20, "value": "west", "entity": "location"}
    ], 'Entity without extractor remains'
    assert filtered[1].get('entities') == [
        {"start": 8, "end": 14, "value": "indian", "entity": "cuisine",
         "extractor": "CRFEntityExtractor"}
    ], 'Only CRFEntityExtractor entity annotation remains'
    assert examples[1].get('entities')[0] == {
        "start": 0, "end": 7, "value": "central", "entity": "location",
        "extractor": "random_extractor"
    }, 'Original examples are not mutated'
def test_crf_extractor(spacy_nlp):
    """Train the CRF extractor on dict examples and check its conversions.

    In this variant the examples are plain dictionaries passed to
    ``TrainingData`` via ``entity_examples_only``, and ``spacy_nlp`` is
    handed to ``train``, ``_from_text_to_crf`` and ``extract_entities``
    directly. Features are checked by membership (``'0:low:in'``), and the
    return value of ``extract_entities`` is not inspected.
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor()

    west_example = {
        "text": "anywhere in the west",
        "intent": "restaurant_search",
        "entities": [
            {"start": 16, "end": 20, "value": "west", "entity": "location"},
        ],
    }
    central_example = {
        "text": "central indian restaurant",
        "intent": "restaurant_search",
        "entities": [
            {"start": 0, "end": 7, "value": "central", "entity": "location"},
        ],
    }
    examples = [west_example, central_example]

    # Positional arguments after the data: the spaCy pipeline, ``True``
    # (presumably the BILOU flag - confirm against ``train``), and the
    # extractor's own feature definition.
    extractor.train(TrainingData(entity_examples_only=examples),
                    spacy_nlp, True, extractor.crf_features)

    crf_tokens = extractor._from_text_to_crf('anywhere in the west', spacy_nlp)
    # The first element of every CRF token is compared against the word text.
    token_texts = [token[0] for token in crf_tokens]
    assert token_texts == ['anywhere', 'in', 'the', 'west']

    features = extractor._sentence_to_features(crf_tokens)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert '0:low:in' in features[1]

    # Only checks that extraction runs without raising.
    extractor.extract_entities('anywhere in the west', spacy_nlp)
def test_crf_extractor(spacy_nlp):
    """Train the CRF extractor on dict examples and check its conversions.

    In this variant the examples are plain dictionaries passed to
    ``TrainingData`` via ``training_examples``, and ``spacy_nlp`` is handed
    to ``train``, ``_from_text_to_crf`` and ``extract_entities`` directly.
    Features are checked by membership (``'0:low:in'``), and the return
    value of ``extract_entities`` is not inspected.
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor()

    west_example = {
        "text": "anywhere in the west",
        "intent": "restaurant_search",
        "entities": [
            {"start": 16, "end": 20, "value": "west", "entity": "location"},
        ],
    }
    central_example = {
        "text": "central indian restaurant",
        "intent": "restaurant_search",
        "entities": [
            {"start": 0, "end": 7, "value": "central", "entity": "location"},
        ],
    }
    examples = [west_example, central_example]

    # Positional arguments after the data: the spaCy pipeline, ``True``
    # (presumably the BILOU flag - confirm against ``train``), and the
    # extractor's own feature definition.
    extractor.train(TrainingData(training_examples=examples),
                    spacy_nlp, True, extractor.crf_features)

    crf_tokens = extractor._from_text_to_crf('anywhere in the west', spacy_nlp)
    # The first element of every CRF token is compared against the word text.
    token_texts = [token[0] for token in crf_tokens]
    assert token_texts == ['anywhere', 'in', 'the', 'west']

    features = extractor._sentence_to_features(crf_tokens)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert '0:low:in' in features[1]

    # Only checks that extraction runs without raising.
    extractor.extract_entities('anywhere in the west', spacy_nlp)
# Exploratory script: load the demo training data, run the spaCy components
# over it, then print the entity offsets the CRF extractor derives for each
# example it considers trainable.
from rasa_nlu.train import load_data
from rasa_nlu.config import RasaNLUModelConfig
from rasa_nlu.utils.spacy_utils import SpacyNLP
from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
import spacy

config = RasaNLUModelConfig()
training_data = load_data("data/examples/rasa/demo-rasa.json")

# Both components are trained on the same data before the CRF extractor is
# used (presumably they attach the spaCy doc and tokens to each example -
# confirm against the component implementations).
nlp = spacy.load("en")
spacy_component = SpacyNLP(nlp=nlp)
spacy_component.train(training_data, config)
SpacyTokenizer().train(training_data, config)

# Dump one example to inspect what the messages look like at this point.
print(training_data.training_examples[25].as_dict())

crf = CRFEntityExtractor()
filtered_data = crf.filter_trainable_entities(training_data.training_examples)

# Create Dataset
# dataset = crf._create_dataset(filtered_data)

## Convert Examples
dataset = []

## Convert JSON TO CRF
for training_example in filtered_data:
    entity_offsets = crf._convert_example(training_example)
    print("Entity Offset", entity_offsets)
    # b = crf._from_json_to_crf(training_example, entity_offsets)
    # print("JSON to CRF", b)