Example #1
def test_issue3345():
    """Test case where preset entity crosses sentence boundary."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    doc[4].is_sent_start = True
    ruler = EntityRuler(nlp,
                        patterns=[{
                            "label": "GPE",
                            "pattern": "New York"
                        }])
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(doc.vocab, model, **config)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Get into the state just before "New"
    state = ner.moves.init_batch([doc])[0]
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")
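The `add_action(5, "")` call above registers the OUT move: in spaCy's BILUO-style transition system the action indices run Missing, Begin, In, Last, Unit, Out. As a hedged sketch (reusing the `n_moves`/`get_class_name` API that also appears in Example #4), the registered moves can be listed like this:

# Hedged sketch: list the transition actions registered on the ner above.
for i in range(ner.moves.n_moves):
    print(i, ner.moves.get_class_name(i))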
Example #2
def train(nlp, data, ents, num_iterations=20):
    """

    :param nlp: nlp instance
    :param data: training data (see the required format below)
    :param ents: list of entity labels
    :param num_iterations: number of iterations to train
    :return: trained NER tagger
    """

    # Example :
    # train_data = [
    #     (
    #         'Who is Shaka Khan?',
    #         [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
    #     ), ...
    # ]

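    # Pre-populate the vocab: looking up each token's orth id creates its lexeme.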
    for sent, _ in data:
        doc = nlp.make_doc(sent)
        for word in doc:
            _ = nlp.vocab[word.orth]

    result_NER = EntityRecognizer(nlp.vocab, entity_types=ents)
    for _ in range(num_iterations):
        random.shuffle(data)
        for sent, entity_offsets in data:
            doc = nlp.make_doc(sent)
            gold = GoldParse(doc, entities=entity_offsets)
            result_NER.update(doc, gold)
    return result_NER
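A minimal usage sketch for the function above (hypothetical `nlp` instance; the data follows the format from the docstring):

# Hedged sketch; assumes an nlp object built with the same v1-era API.
train_data = [
    (
        'Who is Shaka Khan?',
        [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
    ),
]
ner = train(nlp, train_data, ents=['PERSON'], num_iterations=20)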
Example #3
def test_ents_reset(en_vocab):
    text = ["This", "is", "a", "lion"]
    doc = get_doc(en_vocab, text)
    ner = EntityRecognizer(en_vocab)
    ner.begin_training([])
    ner(doc)
    assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
    doc.ents = list(doc.ents)
    assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
Example #4
def test_add_label_deserializes_correctly():
    ner1 = EntityRecognizer(Vocab())
    ner1.add_label("C")
    ner1.add_label("B")
    ner1.add_label("A")
    ner1.begin_training([])
    ner2 = EntityRecognizer(Vocab()).from_bytes(ner1.to_bytes())
    assert ner1.moves.n_moves == ner2.moves.n_moves
    for i in range(ner1.moves.n_moves):
        assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i)
Example #5
def train_ner(nlp, train_data, entity_types):
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    ner.model.end_training()
    return ner
Example #7
def train_NER(filepath, vocab, iterations=20):
    print("Training {} iterations".format(iterations))
    docs, postags, entities = read_connl(filepath, vocab)
    ner = EntityRecognizer(vocab, entity_types=LABELS)
    for i in range(iterations):
        if i % 5 == 0:
            print("Iteration {}...".format(i))
        for doc, entity_list in zip(docs, entities):
            ner.update(doc, GoldParse(doc, entities=entity_list))
    print("Done training.")
    return docs, ner
Example #8
def test_doc_add_entities_set_ents_iob(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    ner = EntityRecognizer(en_vocab)
    ner.begin_training([])
    ner(doc)
    assert len(list(doc.ents)) == 0
    assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
Example #9
def test_ents_reset(en_vocab):
    """Ensure that resetting doc.ents does not change anything"""
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    orig_iobs = [t.ent_iob_ for t in doc]
    doc.ents = list(doc.ents)
    assert [t.ent_iob_ for t in doc] == orig_iobs
Example #10
    def train(self, nlp, entity_examples):
        train_data = self.convert_examples(entity_examples)
        ent_types = [[ent["entity"] for ent in ex["entities"]] for ex in entity_examples]
        entity_types = list(set(sum(ent_types, [])))

        self.ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
        for itn in range(5):
            random.shuffle(train_data)
            for raw_text, entity_offsets in train_data:
                doc = nlp.make_doc(raw_text)
                gold = GoldParse(doc, entities=entity_offsets)
                self.ner.update(doc, gold)
        self.ner.model.end_training()
Example #11
def test_doc_add_entities_set_ents_iob(en_vocab):
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)

    doc.ents = [("ANIMAL", 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]

    doc.ents = [("WORD", 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
Example #12
def runspacymodel(sentences, tagger, model):
    # model = 'en_core_web_sm'
    nlp = spacy.load(model)
    ner = EntityRecognizer(nlp.vocab)
    ner.from_disk(tagger)

    result = []
    for sentence in sentences:
        doc = spacy.tokens.doc.Doc(nlp.vocab, words=sentence)

        # run ner against every sentence
        processed = ner(doc)
        for token in processed:
            result.append([token.text, token.ent_type_])
    return result
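A hedged usage sketch for the function above (the tagger path is hypothetical; sentences are passed pre-tokenized, as lists of words):

rows = runspacymodel([["I", "live", "in", "New", "York"]],
                     tagger="path/to/ner", model="en_core_web_sm")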
Example #13
def train_ner(nlp, train_data, entity_types):
    # Add new words to vocab.
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]

    # Train NER.
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    return ner
Example #15
def train_query(queryObj):
    global nlp

    # Our query string
    story = queryObj.story
    querystring = queryObj.querystring
    parsed_ner = queryObj.parsed_ner

    # Where our model is located
    model_path = os.path.normpath(
        os.path.join(settings.SPACYMODEL_DIR, str(story.name)))

    ENTITY_OFFSETS = []
    ENTITY_LIST = []

    for txt in parsed_ner:
        cur_entity = parsed_ner[txt]
        cur_index = querystring.find(txt)
        # If string is found in querystring
        if cur_index != -1:
            ENTITY_OFFSETS.append(
                (cur_index, cur_index + len(txt), cur_entity))

            # Add the entity to the entity list if it's not already there
            if cur_entity not in ENTITY_LIST:
                ENTITY_LIST.append(cur_entity)

    # Our training data
    TRAIN_DATA = [
        (querystring, ENTITY_OFFSETS),
    ]

    # Initialize the entity recognizer
    ner = EntityRecognizer(nlp.vocab, entity_types=ENTITY_LIST)

    # Train for 25 iterations
    for itn in range(25):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    ner.model.end_training()

    # Save model
    ner.model.dump(model_path)
Example #16
def test_ents_reset(en_vocab):
    """Ensure that resetting doc.ents does not change anything"""
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model, **config)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    orig_iobs = [t.ent_iob_ for t in doc]
    doc.ents = list(doc.ents)
    assert [t.ent_iob_ for t in doc] == orig_iobs
Example #17
def load_model(model_dir):
    model_dir = pathlib.Path(model_dir)
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
    ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
    return (nlp, ner)
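A hedged usage sketch (hypothetical directory containing the vocab/strings.json and vocab/lexemes.bin files the function expects):

nlp, ner = load_model('path/to/model_dir')
doc = nlp.make_doc('I live in New York')
ner(doc)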
Example #19
def load_model(model_dir):
    model_dir = pathlib.Path(model_dir)
    nlp = en_core_web_sm.load()
    with (model_dir / 'vocab' / 'strings.json').open('r',
                                                     encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
    ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
    return nlp, ner
Example #21
def test_issue3345():
    """Test case where preset entity crosses sentence boundary."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    doc[4].is_sent_start = True
    ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
    ner = EntityRecognizer(doc.vocab)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Get into the state just before "New"
    state = ner.moves.init_batch([doc])[0]
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")
Example #23
class SpacyEntityExtractor(object):
    def __init__(self, nlp=None, extractor_file=None):
        if extractor_file:
            self.ner = EntityRecognizer.load(pathlib.Path(extractor_file), nlp.vocab)
        else:
            self.ner = None

    def convert_examples(self, entity_examples):
        def convert_entity(ent):
            return ent["start"], ent["end"], ent["entity"]

        def convert_example(ex):
            return ex["text"], [convert_entity(ent) for ent in ex["entities"]]

        return [convert_example(ex) for ex in entity_examples]

    def train(self, nlp, entity_examples):
        train_data = self.convert_examples(entity_examples)
        ent_types = [[ent["entity"] for ent in ex["entities"]] for ex in entity_examples]
        entity_types = list(set(sum(ent_types, [])))

        self.ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
        for itn in range(5):
            random.shuffle(train_data)
            for raw_text, entity_offsets in train_data:
                doc = nlp.make_doc(raw_text)
                gold = GoldParse(doc, entities=entity_offsets)
                self.ner.update(doc, gold)
        self.ner.model.end_training()

    def extract_entities(self, nlp, sentence):
        doc = nlp.make_doc(sentence)
        nlp.tagger(doc)
        self.ner(doc)

        entities = [
          {
            "entity": ent.label_,
            "value": ent.text,
            "start": ent.start_char,
            "end": ent.end_char
          }
          for ent in doc.ents]
        return entities
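A hedged usage sketch for the class above (hypothetical training example; start/end are character offsets into the text):

extractor = SpacyEntityExtractor()
extractor.train(nlp, [
    {"text": "I live in New York",
     "entities": [{"start": 10, "end": 18, "entity": "GPE"}]},
])
print(extractor.extract_entities(nlp, "I live in New York"))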
Example #24
    def load(cls, model_dir, entity_extractor_spacy, fine_tune_spacy_ner, spacy_nlp):
        # type: (Text, Text, bool, Language) -> SpacyEntityExtractor
        from spacy.pipeline import EntityRecognizer

        if model_dir and entity_extractor_spacy:
            ner_dir = os.path.join(model_dir, entity_extractor_spacy)
            ner = EntityRecognizer.load(pathlib.Path(ner_dir), spacy_nlp.vocab)
            return SpacyEntityExtractor(fine_tune_spacy_ner, ner)
        else:
            return SpacyEntityExtractor(fine_tune_spacy_ner)
Example #25
def test_doc_add_entities_set_ents_iob(en_vocab):
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model, **config)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)

    doc.ents = [("ANIMAL", 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]

    doc.ents = [("WORD", 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
Example #26
    def __init__(self,
                 nlp=None,
                 extractor_file=None,
                 should_fine_tune_spacy_ner=False):
        self.nlp = nlp
        if extractor_file:
            self.ner = EntityRecognizer.load(pathlib.Path(extractor_file),
                                             nlp.vocab)
        else:
            self.ner = None

        self.should_fine_tune_spacy_ner = should_fine_tune_spacy_ner
Example #27
def test_issue3345(entity_ruler_factory):
    """Test case where preset entity crosses sentence boundary."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    doc[4].is_sent_start = True
    ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
    ruler.add_patterns([{"label": "GPE", "pattern": "New York"}])
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(doc.vocab, model)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Get into the state just before "New"
    state = ner.moves.init_batch([doc])[0]
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")
Example #28
def test_overwrite_token():
    nlp = English()
    ner1 = nlp.create_pipe("ner")
    nlp.add_pipe(ner1, name="ner")
    nlp.begin_training()

    # The untrained NER will predict O for each token
    doc = nlp("I live in New York")
    assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
    assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]

    # Check that a new ner can overwrite O
    ner2 = EntityRecognizer(doc.vocab)
    ner2.moves.add_action(5, "")
    ner2.add_label("GPE")
    state = ner2.moves.init_batch([doc])[0]
    assert ner2.moves.is_valid(state, "B-GPE")
    assert ner2.moves.is_valid(state, "U-GPE")
    ner2.moves.apply_transition(state, "B-GPE")
    assert ner2.moves.is_valid(state, "I-GPE")
    assert ner2.moves.is_valid(state, "L-GPE")
Example #29
def test_issue4313():
    """ This should not crash or exit with some strange error code """
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    ner = EntityRecognizer(nlp.vocab)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [apple_ent]

    # ensure the beam_parse still works with the new label
    docs = [doc]
    beams = nlp.entity.beam_parse(docs,
                                  beam_width=beam_width,
                                  beam_density=beam_density)

    for doc, beam in zip(docs, beams):
        entity_scores = defaultdict(float)
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score
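For each doc, the inner loop leaves entity_scores mapping (start, end, label) triples to their accumulated score across the beam. A hedged follow-up, placed inside the same per-doc loop, would rank the candidates:

        # Hedged sketch: print candidate entities, best-scoring first.
        for (start, end, label), score in sorted(entity_scores.items(),
                                                 key=lambda kv: -kv[1]):
            print(start, end, label, score)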
Example #30
def get_model(model_name):
    if model_name not in _models:
        model = spacy.load(model_name)
        if model.tagger is None:
            model.tagger = Tagger(model.vocab, features=Tagger.feature_templates)
        if model.entity is None:
            model.entity = EntityRecognizer(model.vocab, entity_types=['PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE',
                                                                       'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART',
                                                                       'LANGUAGE', 'DATE', 'TIME', 'PERCENT',
                                                                       'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'])
        model.pipeline = [model.tagger, model.entity, model.parser]
        _models[model_name] = model
    return _models[model_name]
Example #31
def predictEnt(query):
    nlp = spacy.load('en', parser=False)
    doc = nlp.make_doc(query)
    vocab_dir = pathlib.Path('ner/vocab')
    with (vocab_dir / 'strings.json').open('r', encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(vocab_dir / 'lexemes.bin')
    ner = EntityRecognizer.load(pathlib.Path("ner"), nlp.vocab, require=True)
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        if word.ent_type_ == 'PRODUCT':
            return word.text
Example #32
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second of the two bugs underlying issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        nlp2 = English(vocab)
        ner2 = EntityRecognizer(vocab)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
Example #33
def test_accept_blocked_token():
    """Test succesful blocking of tokens to be in an entity."""
    # 1. test normal behaviour
    nlp1 = English()
    doc1 = nlp1("I live in New York")
    ner1 = EntityRecognizer(doc1.vocab)
    assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
    assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]

    # Add the OUT action
    ner1.moves.add_action(5, "")
    ner1.add_label("GPE")
    # Get into the state just before "New"
    state1 = ner1.moves.init_batch([doc1])[0]
    ner1.moves.apply_transition(state1, "O")
    ner1.moves.apply_transition(state1, "O")
    ner1.moves.apply_transition(state1, "O")
    # Check that B-GPE is valid.
    assert ner1.moves.is_valid(state1, "B-GPE")

    # 2. test blocking behaviour
    nlp2 = English()
    doc2 = nlp2("I live in New York")
    ner2 = EntityRecognizer(doc2.vocab)

    # set "New York" to a blocked entity
    doc2.ents = [(0, 3, 5)]
    assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
    assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]

    # Check that B-GPE is now invalid.
    ner2.moves.add_action(4, "")
    ner2.moves.add_action(5, "")
    ner2.add_label("GPE")
    state2 = ner2.moves.init_batch([doc2])[0]
    ner2.moves.apply_transition(state2, "O")
    ner2.moves.apply_transition(state2, "O")
    ner2.moves.apply_transition(state2, "O")
    # we can only use U- for "New"
    assert not ner2.moves.is_valid(state2, "B-GPE")
    assert ner2.moves.is_valid(state2, "U-")
    ner2.moves.apply_transition(state2, "U-")
    # we can only use U- for "York"
    assert not ner2.moves.is_valid(state2, "B-GPE")
    assert ner2.moves.is_valid(state2, "U-")
Example #34
def predict(query):
    # Load NER
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    vocab_dir = pathlib.Path('ner/vocab')
    with (vocab_dir / 'strings.json').open('r', encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(vocab_dir / 'lexemes.bin')


    ner = EntityRecognizer.load(pathlib.Path("ner"), nlp.vocab, require=False)
    doc = nlp.make_doc(query)
    #nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.orth, word.lower, word.ent_type_)

    for word in doc:
        if word.ent_type_:
            print('word -> {} and entity -> {}'.format(word.text, word.ent_type_))
Example #35
def get_query(queryObj):
    global nlp

    # Our query string
    story = queryObj.story
    querystring = queryObj.querystring

    # Where our model is located
    model_path = os.path.normpath(
        os.path.join(settings.SPACYMODEL_DIR, str(story.name)))

    ENTITY_LIST = []
    for attribute in story.storyattribute_set.all():
        ENTITY_LIST.append(str(attribute.attribute))

    # Initialize Spacy modules
    ner = EntityRecognizer(nlp.vocab, entity_types=ENTITY_LIST)

    # Only tag entities if a trained model already exists
    if os.path.isfile(model_path):
        ner.model.load(model_path)

        # Create a doc from the query string
        doc = nlp.make_doc(querystring)

        nlp.tagger(doc)
        ner(doc)

        # Formatted dict (JSON-style) mapping token text to entity type
        ner_dict = {}

        for word in doc:
            if word.ent_type_:
                ner_dict[word.text] = word.ent_type_

        # Save dict as our parsed ner
        queryObj.parsed_ner = ner_dict

    # Returns empty dict
    else:
        queryObj.parsed_ner = {}
Example #36
def load_ner_model(vocab, path):
    return EntityRecognizer.load(path, vocab)
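A hedged usage sketch (hypothetical path; follows the same v1-era loading pattern as Examples #17 and #19):

import pathlib
ner = load_ner_model(nlp.vocab, pathlib.Path('path/to/ner'))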