def train(nlp, data, ents, num_iterations=20):
    """Train a spaCy v1 EntityRecognizer on offset-annotated sentences.

    :param nlp: spaCy Language instance (provides ``vocab`` and ``make_doc``)
    :param data: training data as (text, offsets) pairs, e.g.::

        train_data = [
            (
                'Who is Shaka Khan?',
                [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
            ), ...
        ]

    :param ents: list of entity type labels to train
    :param num_iterations: number of iterations to train
    :return: trained NER tagger (EntityRecognizer)
    """
    # Touch every token of the training texts so their lexemes are
    # registered in the vocab before training starts.
    for sent, _ in data:
        doc = nlp.make_doc(sent)
        for word in doc:
            _ = nlp.vocab[word.orth]
    ner = EntityRecognizer(nlp.vocab, entity_types=ents)
    for _ in range(num_iterations):
        # NOTE: shuffles the caller's list in place.
        random.shuffle(data)
        for sent, entity_offsets in data:
            doc = nlp.make_doc(sent)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    # Finalize the averaged model weights, as the sibling trainers
    # in this file do; without this the returned model uses the raw
    # (non-averaged) weights.
    ner.model.end_training()
    return ner
def train_ner(nlp, train_data, entity_types):
    """Fit a fresh EntityRecognizer on (text, offsets) pairs over 5 epochs."""
    recognizer = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    epochs = 5
    for _ in range(epochs):
        # In-place shuffle of the caller's list between epochs.
        random.shuffle(train_data)
        for text, offsets in train_data:
            doc = nlp.make_doc(text)
            recognizer.update(doc, GoldParse(doc, entities=offsets))
    # Finalize (average) the model weights.
    recognizer.model.end_training()
    return recognizer
def train_NER(filepath, vocab, iterations=20):
    """Train an EntityRecognizer from a CoNLL file; return (docs, ner)."""
    print("Training {} iterations".format(iterations))
    # read_connl and LABELS are defined elsewhere in this module.
    docs, postags, entities = read_connl(filepath, vocab)
    ner = EntityRecognizer(vocab, entity_types=LABELS)
    for iteration in range(iterations):
        # Progress marker every fifth pass.
        if iteration % 5 == 0:
            print("Iteration {}...".format(iteration))
        for doc, gold_entities in zip(docs, entities):
            gold = GoldParse(doc, entities=gold_entities)
            ner.update(doc, gold)
    print("Done training.")
    return docs, ner
def train_ner(nlp, train_data, entity_types):
    """Train a spaCy v1 EntityRecognizer.

    :param nlp: spaCy Language instance
    :param train_data: list of (raw_text, entity_offsets) pairs
    :param entity_types: entity labels to recognize
    :return: trained EntityRecognizer
    """
    # Add new words to vocab so the training texts contain no unknown lexemes.
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]
    # Train NER.
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)  # in-place; mutates the caller's list
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    # Finalize averaged weights before returning — matches the other
    # train_ner variant in this file, which was otherwise identical.
    ner.model.end_training()
    return ner
def train_query(queryObj):
    """Train a per-story NER model from one annotated query and save it."""
    global nlp
    # Unpack the query object.
    story = queryObj.story
    querystring = queryObj.querystring
    parsed_ner = queryObj.parsed_ner
    # Filesystem location of this story's model.
    model_path = os.path.normpath(
        os.path.join(settings.SPACYMODEL_DIR, str(story.name)))
    offsets = []
    labels = []
    for txt in parsed_ner:
        label = parsed_ner[txt]
        start = querystring.find(txt)
        # Only annotate entities actually present in the query string.
        # NOTE(review): find() locates the first occurrence only.
        if start != -1:
            offsets.append((start, start + len(txt), label))
            # Track each distinct entity label once.
            if label not in labels:
                labels.append(label)
    # Single-example training set.
    train_data = [
        (querystring, offsets),
    ]
    # NOTE(review): despite the original comment about loading an existing
    # model, a fresh recognizer is always created here.
    ner = EntityRecognizer(nlp.vocab, entity_types=labels)
    for itn in range(25):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    ner.model.end_training()
    # Persist the trained model.
    ner.model.dump(model_path)
class SpacyEntityExtractor(object):
    """Entity extractor backed by a spaCy v1 EntityRecognizer."""

    def __init__(self, nlp=None, extractor_file=None):
        # Load a previously dumped recognizer if a path is given;
        # otherwise start untrained (call train() before extracting).
        if extractor_file:
            self.ner = EntityRecognizer.load(pathlib.Path(extractor_file),
                                             nlp.vocab)
        else:
            self.ner = None

    def convert_examples(self, entity_examples):
        """Convert dict-style examples to (text, [(start, end, label)]) pairs."""
        def convert_entity(ent):
            return ent["start"], ent["end"], ent["entity"]

        def convert_example(ex):
            return ex["text"], [convert_entity(ent) for ent in ex["entities"]]

        return [convert_example(ex) for ex in entity_examples]

    def train(self, nlp, entity_examples):
        """Train a fresh recognizer on the given examples (5 epochs)."""
        train_data = self.convert_examples(entity_examples)
        # Distinct entity labels across all examples. A set comprehension
        # replaces the original quadratic sum(list_of_lists, []) flatten;
        # ordering was already nondeterministic via set().
        entity_types = list({ent["entity"]
                             for ex in entity_examples
                             for ent in ex["entities"]})
        self.ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
        for itn in range(5):
            random.shuffle(train_data)  # in-place shuffle between epochs
            for raw_text, entity_offsets in train_data:
                doc = nlp.make_doc(raw_text)
                gold = GoldParse(doc, entities=entity_offsets)
                self.ner.update(doc, gold)
        # Finalize averaged weights.
        self.ner.model.end_training()

    def extract_entities(self, nlp, sentence):
        """Tag *sentence* and run NER; return a list of entity dicts."""
        doc = nlp.make_doc(sentence)
        nlp.tagger(doc)  # POS tags are required before the recognizer runs
        self.ner(doc)
        entities = [
            {
                "entity": ent.label_,
                "value": ent.text,
                "start": ent.start_char,
                "end": ent.end_char
            } for ent in doc.ents]
        return entities
# Python 2 script fragment: relies on `ner`, `nlp`, `train_data` and
# `entity_types` being defined earlier in the file (outside this view).

# Register every token of the training texts in the vocab first.
for raw_text, _ in train_data:
    doc = nlp.make_doc(raw_text)
    for word in doc:
        _ = nlp.vocab[word.orth]
# Ten training passes; train_data is shuffled in place each pass.
for itn in range(10):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = nlp.make_doc(raw_text)
        #print(doc)
        gold = GoldParse(doc, entities=entity_offsets)
        #nlp.tagger(doc)
        ner.update(doc, gold)
print 'ner training done..'
# Interactive loop: tag user input and show recognized entity tokens
# until the user types 'exit'.
while True:
    userinput = raw_input("Enter your sentence: ")
    if userinput == 'exit':
        break
    doc2 = nlp.make_doc(userinput.decode('utf-8'))
    nlp.tagger(doc2)
    ner(doc2)
    for word in doc2:
        if word.ent_type_ in entity_types:
            print(word.text, word.ent_type_)
    # Blank line between sentences (Python 2 bare print).
    print