Exemple #1
0
def train(nlp, data, ents, num_iterations=20):
    """Train a spaCy v1.x named-entity recogniser from scratch.

    :param nlp: loaded spaCy ``Language`` instance
    :param data: training data — list of ``(text, entity_offsets)`` pairs,
        where ``entity_offsets`` is a list of ``(start, end, label)`` tuples
        (see the example below). NOTE: the list is shuffled in place.
    :param ents: list of entity type labels to recognise
    :param num_iterations: number of passes over the training data
    :return: the trained ``EntityRecognizer``
    """

    # Example :
    # train_data = [
    #     (
    #         'Who is Shaka Khan?',
    #         [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
    #     ), ...
    # ]

    # Looking a lexeme up by its orth id creates a vocab entry, so every
    # training token is known to the vocab before training starts.
    for sent, _ in data:
        doc = nlp.make_doc(sent)
        for word in doc:
            _ = nlp.vocab[word.orth]

    result_NER = EntityRecognizer(nlp.vocab, entity_types=ents)
    for _ in range(num_iterations):
        # In-place shuffle: presents the examples in a new order each pass.
        random.shuffle(data)
        for sent, entity_offsets in data:
            doc = nlp.make_doc(sent)
            gold = GoldParse(doc, entities=entity_offsets)
            result_NER.update(doc, gold)
    # Average the accumulated perceptron weights. The other training helpers
    # in this file do this too; spaCy v1 models predict better after it.
    result_NER.model.end_training()
    return result_NER
Exemple #2
0
def train_ner(nlp, train_data, entity_types):
    """Fit a fresh spaCy v1 EntityRecognizer on (text, offsets) pairs."""
    recognizer = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    n_epochs = 5
    for _ in range(n_epochs):
        # Present the examples in a new order on every pass.
        random.shuffle(train_data)
        for text, offsets in train_data:
            doc = nlp.make_doc(text)
            recognizer.update(doc, GoldParse(doc, entities=offsets))
    # Finalise (average) the model weights.
    recognizer.model.end_training()
    return recognizer
Exemple #3
0
def train_ner(nlp, train_data, entity_types):
    """Train an entity recogniser for five shuffled epochs and return it."""
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    epoch = 0
    while epoch < 5:
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            parsed = nlp.make_doc(raw_text)
            annotations = GoldParse(parsed, entities=entity_offsets)
            ner.update(parsed, annotations)
        epoch += 1
    # Average the accumulated weights before handing the model back.
    ner.model.end_training()
    return ner
Exemple #4
0
def train_NER(filepath, vocab, iterations=20):
    """Train an EntityRecognizer on a CoNLL-format file.

    Returns the parsed docs together with the trained recogniser.
    """
    print("Training {} iterations".format(iterations))
    docs, postags, entities = read_connl(filepath, vocab)
    ner = EntityRecognizer(vocab, entity_types=LABELS)
    for i in range(iterations):
        # Progress marker every fifth iteration.
        if i % 5 == 0:
            print("Iteration {}...".format(i))
        for doc, entity_list in zip(docs, entities):
            gold = GoldParse(doc, entities=entity_list)
            ner.update(doc, gold)
    print("Done training.")
    return docs, ner
def train_ner(nlp, train_data, entity_types):
    """Register the training vocabulary, then fit a new EntityRecognizer."""
    # Touching vocab[orth] creates a lexeme entry for each training token.
    for text, _ in train_data:
        for token in nlp.make_doc(text):
            _ = nlp.vocab[token.orth]

    # Train NER for five shuffled passes over the data.
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for _ in range(5):
        random.shuffle(train_data)
        for text, offsets in train_data:
            doc = nlp.make_doc(text)
            ner.update(doc, GoldParse(doc, entities=offsets))
    return ner
Exemple #6
0
def train_ner(nlp, train_data, entity_types):
    """Build vocab entries for the training texts, then train a recogniser."""

    def _register_vocab(text):
        # Looking a lexeme up by its orth id adds it to the vocab.
        for word in nlp.make_doc(text):
            _ = nlp.vocab[word.orth]

    for raw_text, _ in train_data:
        _register_vocab(raw_text)

    # Five shuffled epochs over the training pairs.
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    epochs = 5
    for _ in range(epochs):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            document = nlp.make_doc(raw_text)
            ner.update(document, GoldParse(document, entities=entity_offsets))
    return ner
def train_query(queryObj):
    """Train and persist a per-story NER model from a single query object.

    Reads ``story``, ``querystring`` and ``parsed_ner`` (a mapping of
    surface text -> entity label) from *queryObj*, builds a one-example
    training set and dumps the trained model under SPACYMODEL_DIR.
    """
    global nlp

    story = queryObj.story
    querystring = queryObj.querystring
    parsed_ner = queryObj.parsed_ner

    # The model for this story lives at SPACYMODEL_DIR/<story name>.
    model_path = os.path.normpath(
        os.path.join(settings.SPACYMODEL_DIR, str(story.name)))

    ENTITY_OFFSETS = []
    ENTITY_LIST = []

    for txt, cur_entity in parsed_ner.items():
        cur_index = querystring.find(txt)
        if cur_index == -1:
            # Surface form not present in the query string; skip it.
            continue
        ENTITY_OFFSETS.append((cur_index, cur_index + len(txt), cur_entity))
        # Track each entity label once.
        if cur_entity not in ENTITY_LIST:
            ENTITY_LIST.append(cur_entity)

    # A single (text, offsets) training example.
    TRAIN_DATA = [
        (querystring, ENTITY_OFFSETS),
    ]

    ner = EntityRecognizer(nlp.vocab, entity_types=ENTITY_LIST)
    for _ in range(25):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            doc = nlp.make_doc(raw_text)
            ner.update(doc, GoldParse(doc, entities=entity_offsets))
    ner.model.end_training()

    # Persist the trained weights for later loading.
    ner.model.dump(model_path)
Exemple #8
0
class SpacyEntityExtractor(object):
    """Wraps a spaCy v1 EntityRecognizer trained from dict-style examples."""

    def __init__(self, nlp=None, extractor_file=None):
        # Load a previously dumped recogniser if a path was given; otherwise
        # the extractor must be train()-ed before use.
        if extractor_file:
            self.ner = EntityRecognizer.load(pathlib.Path(extractor_file), nlp.vocab)
        else:
            self.ner = None

    def convert_examples(self, entity_examples):
        """Convert ``[{"text": ..., "entities": [{"start", "end", "entity"}]}]``
        into spaCy's ``(text, [(start, end, label), ...])`` training pairs."""
        def convert_entity(ent):
            return ent["start"], ent["end"], ent["entity"]

        def convert_example(ex):
            return ex["text"], [convert_entity(ent) for ent in ex["entities"]]

        return [convert_example(ex) for ex in entity_examples]

    def train(self, nlp, entity_examples):
        """Train a fresh EntityRecognizer on the given examples."""
        train_data = self.convert_examples(entity_examples)
        # Collect the distinct entity labels in one pass; the previous
        # sum(list_of_lists, []) flatten was quadratic in the number of
        # examples.
        entity_types = list({ent["entity"]
                             for ex in entity_examples
                             for ent in ex["entities"]})

        self.ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
        for itn in range(5):
            random.shuffle(train_data)
            for raw_text, entity_offsets in train_data:
                doc = nlp.make_doc(raw_text)
                gold = GoldParse(doc, entities=entity_offsets)
                self.ner.update(doc, gold)
        self.ner.model.end_training()

    def extract_entities(self, nlp, sentence):
        """Run the trained recogniser over *sentence*; return entity dicts."""
        doc = nlp.make_doc(sentence)
        nlp.tagger(doc)
        self.ner(doc)

        entities = [
          {
            "entity": ent.label_,
            "value": ent.text,
            "start": ent.start_char,
            "end": ent.end_char
          }
          for ent in doc.ents]
        return entities
Exemple #9
0
# Ensure every token in the training texts has a vocab entry before training
# (looking a lexeme up by its orth id creates the entry).
for raw_text, _ in train_data:
    doc = nlp.make_doc(raw_text)
    for word in doc:
        _ = nlp.vocab[word.orth]


# Train the (already constructed) `ner` recogniser for ten shuffled epochs.
for itn in range(10):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = nlp.make_doc(raw_text)
        #print(doc)
        gold = GoldParse(doc, entities=entity_offsets)

        #nlp.tagger(doc)
        ner.update(doc, gold)


# NOTE(review): Python 2 syntax below (print statement, raw_input, .decode on
# a byte string) — this script will not run under Python 3 as-is.
print 'ner training done..'
# Interactive demo loop: tag user-typed sentences until 'exit' is entered.
while True:
	userinput = raw_input("Enter your sentence: ")
	if userinput == 'exit':
		break
	doc2 = nlp.make_doc(userinput.decode('utf-8'))
	nlp.tagger(doc2)
	ner(doc2)
	for word in doc2:
		# Print only tokens tagged with one of the trained entity types.
		if word.ent_type_ in entity_types:
			print(word.text, word.ent_type_)
	print