def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False, use_orig_arc_eager=False):
    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    templates = Tagger.default_templates()
    nlp = Language(data_dir=model_dir, tagger=False)
    nlp.tagger = Tagger.blank(nlp.vocab, templates)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                words = annot_tuples[1]
                gold_tags = annot_tuples[2]
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    tokens = nlp.tokenizer(raw_text)
                loss += nlp.tagger.train(tokens, gold_tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
              scorer.ents_f, scorer.tags_acc, scorer.token_acc))
    nlp.end_training(model_dir)

def main(output_dir=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        ensure_dir(output_dir)
        ensure_dir(output_dir / "pos")
        ensure_dir(output_dir / "vocab")
    vocab = Vocab(tag_map=TAG_MAP)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger(vocab)
    for i in range(25):
        for words, tags in DATA:
            doc = Doc(vocab, words=words)
            gold = GoldParse(doc, tags=tags)
            tagger.update(doc, gold)
        random.shuffle(DATA)
    tagger.model.end_training()
    doc = Doc(vocab, orths_and_spaces=zip(["I", "like", "blue", "eggs"], [True] * 4))
    tagger(doc)
    for word in doc:
        print(word.text, word.tag_, word.pos_)
    if output_dir is not None:
        tagger.model.dump(str(output_dir / 'pos' / 'model'))
        with (output_dir / 'vocab' / 'strings.json').open('w') as file_:
            tagger.vocab.strings.dump(file_)

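# The snippet above assumes TAG_MAP, DATA and ensure_dir defined elsewhere
# in the file. A minimal sketch of what those could look like, modelled on
# spaCy 1.x's train_tagger example (the tag names and sentences here are
# illustrative assumptions, not the original data):
from pathlib import Path

TAG_MAP = {
    'N': {'pos': 'NOUN'},
    'V': {'pos': 'VERB'},
    'J': {'pos': 'ADJ'},
}

DATA = [
    (["I", "like", "green", "eggs"], ["N", "V", "J", "N"]),
    (["Eat", "blue", "ham"], ["V", "J", "N"]),
]

def ensure_dir(path):
    # Create the directory if it does not already exist.
    if not path.exists():
        path.mkdir()
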
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples))

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, _ in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, None, annot_tuples, verbose=False)
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples)
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
              scorer.tags_acc, scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')

def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
    if tag_map_loc:
        with open(tag_map_loc) as file_:
            tag_map = json.loads(file_.read())
    else:
        tag_map = DEFAULT_TAG_MAP
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True,
                 'labels': actions,
                 'features': features}).encode('utf8'))
    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if tag_map:
                for tag in tags:
                    assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(15):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))

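# The CoNLL-X reader is not shown. Judging from how train_sents is consumed,
# read_conllx yields (raw_text, sentences) pairs, where each sentence is
# ((ids, words, tags, heads, deps, ner), brackets). A minimal sketch under
# that assumption (the original reader may differ in details such as
# multiword-token handling):
def read_conllx(loc):
    with open(loc, encoding='utf8') as file_:
        sent_lines = []
        for line in file_:
            line = line.strip()
            if line.startswith('#'):
                continue
            if line:
                cols = line.split('\t')
                if '-' not in cols[0]:  # skip multiword-token ranges like "1-2"
                    sent_lines.append(cols)
                continue
            if sent_lines:
                ids, words, tags, heads, deps = [], [], [], [], []
                for cols in sent_lines:
                    # CoNLL-X columns: ID, FORM, LEMMA, CPOSTAG, POSTAG,
                    # FEATS, HEAD, DEPREL, ...
                    ids.append(int(cols[0]) - 1)
                    words.append(cols[1])
                    tags.append(cols[4])
                    # spaCy expects the root token to be its own head.
                    heads.append(int(cols[6]) - 1 if cols[6] != '0'
                                 else int(cols[0]) - 1)
                    deps.append(cols[7])
                ner = ['-'] * len(words)
                yield None, [((ids, words, tags, heads, deps, ner), [])]
                sent_lines = []
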
def main(train_loc, dev_loc, output_dir=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        ensure_dir(output_dir)
        ensure_dir(output_dir / "pos")
        ensure_dir(output_dir / "vocab")

    train_data = read_ud_data(train_loc)
    vocab = Vocab(tag_map=TAG_MAP, lex_attr_getters=LEX_ATTR_GETTERS)
    # Populate vocab
    for words, _ in train_data:
        for word in words:
            _ = vocab[word]

    model = spacy.tagger.TaggerModel(spacy.tagger.Tagger.feature_templates)
    tagger = Tagger(vocab, model)
    print(tagger.tag_names)
    for i in range(30):
        print("training model (iteration " + str(i) + ")...")
        score = 0.
        num_samples = 0.
        for words, tags in train_data:
            doc = Doc(vocab, words=words)
            gold = GoldParse(doc, tags=tags)
            cost = tagger.update(doc, gold)
            for i, word in enumerate(doc):
                num_samples += 1
                if word.tag_ == tags[i]:
                    score += 1
        print('Train acc', score / num_samples)
        random.shuffle(train_data)
    tagger.model.end_training()

    score = 0.0
    test_data = read_ud_data(dev_loc)
    num_samples = 0
    for words, tags in test_data:
        doc = Doc(vocab, words)
        tagger(doc)
        for i, word in enumerate(doc):
            num_samples += 1
            if word.tag_ == tags[i]:
                score += 1
    print("score: " + str(score / num_samples * 100.0))

    if output_dir is not None:
        tagger.model.dump(str(output_dir / 'pos' / 'model'))
        with (output_dir / 'vocab' / 'strings.json').open('w') as file_:
            tagger.vocab.strings.dump(file_)

def main(model_dir=None):
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    train_data = json.load(open('train_ner.json'))
    ner = train_ner(nlp, train_data, ['Event_Time'])

    doc = nlp.make_doc('how about coffee tomorrow at 5pm?')
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
    if model_dir is not None:
        save_model(ner, model_dir)

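# train_ner.json is not shown. From the sibling examples in this collection,
# train_ner expects a list of (text, [(start, end, label), ...]) pairs, so
# the JSON file presumably holds nested arrays along these lines (the
# contents below are an illustrative assumption):
#
# [
#     ["how about coffee tomorrow at 5pm?",
#      [[17, 32, "Event_Time"]]],
#     ["let's meet next Friday morning",
#      [[11, 30, "Event_Time"]]]
# ]
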
def main(model_dir=None):
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    train_data = [
        ('Who is Shaka Khan?',
         [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]),
        ('I like London and Berlin.',
         [(len('I like '), len('I like London'), 'LOC'),
          (len('I like London and '), len('I like London and Berlin'), 'LOC')]),
    ]
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])

    doc = nlp.make_doc('Who is Shaka Khan?')
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
    if model_dir is not None:
        save_model(ner, model_dir)

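# Several snippets in this collection call a train_ner helper that is not
# shown. A sketch of what it plausibly does, modelled on spaCy 1.x's
# train_ner example (the epoch count and the vocab-population step are
# assumptions):
import random

from spacy.gold import GoldParse
from spacy.pipeline import EntityRecognizer

def train_ner(nlp, train_data, entity_types):
    # Add new words to the vocab so the model has lexical features for them.
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]
    # Train and return the entity model.
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    return ner
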
def main(model_dir=None):
    nlp = spacy.get_lang_class('pt')(path=None)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('Setting tagger')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    if len(sys.argv) > 4:
        filetrain = sys.argv[1]
        model_dir = sys.argv[2]
        level = sys.argv[3]
        n_iter = sys.argv[4]
    else:
        print("Usage: python " + sys.argv[0] +
              " <input filename train> <model_dir> <level> <n_iterations>\n")
        sys.exit()

    train_data = get_training_data(filetrain)
    nlp = create_vocab(nlp, train_data)
    categories = ['Pessoa', 'Organizacao', 'Localizacao', 'Curso', 'Data',
                  'Hora', 'Evento', 'UnidadeOrganica']
    ner = train_ner(nlp, train_data, categories, int(n_iter))
    if model_dir is not None:
        save_model(ner, model_dir + '/' + level)

def main(tagged_output, traindata, testdata, traindataformat="",
         testdataformat="", model_dir=None):
    nlp = spacy.load('de', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        # Note: this loads the German pipeline, so the German data is needed.
        print('please run: `python -m spacy.de.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    if traindataformat == 'germeval':
        Cv = convert_germaeval2spacy(traindata)
        train_data = Cv.convert()[0]
        ner = train_ner(nlp, train_data,
                        ['B-OTH', 'I-OTH', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC',
                         'B-PER', 'I-PER', 'O'])
    elif traindataformat == 'conll':
        Cv = convert_conll2spacy(traindata)
        train_data = Cv.convert()[0]
        ner = train_ner(nlp, train_data,
                        ['B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC',
                         'B-PER', 'I-PER', 'O'])

    if testdataformat == 'germeval':
        Cv_test = convert_germaeval2spacy(testdata)
        test_data = Cv_test.convert()[2]
    elif testdataformat == 'conll':
        Cv_test = convert_conll2spacy(testdata)
        test_data = Cv_test.convert()[2]

    #doc = nlp.make_doc(test_data)
    doc = Doc(nlp.vocab, words=test_data)
    ner(doc)
    nlp.tagger(doc)

    i = 0
    # Use a context manager so the output file is flushed and closed.
    with open(tagged_output, "w") as testfilespacygermeval:
        for word in doc:
            print(word.text, word.orth, word.lower, word.tag_,
                  word.ent_type_, word.ent_iob)
            line = word.text + "\t" + word.ent_type_ + "\n"
            testfilespacygermeval.write(line)
            i += 1
            print(i)

    if model_dir is not None:
        save_model(ner, model_dir)

def from_dir(cls, tag_map, model_dir):
    vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)

    cfg = Config.read(path.join(model_dir, 'deps'), 'config')
    parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
    return cls(vocab, tokenizer, tagger, parser)

def main(output_dir):
    ensure_dir(output_dir)
    ensure_dir(output_dir, "pos")
    ensure_dir(output_dir, "vocab")

    vocab = Vocab(tag_map=TAG_MAP)
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger.blank(vocab, Tagger.default_templates())

    for i in range(5):
        for words, tags in DATA:
            tokens = tokenizer.tokens_from_list(words)
            tagger.train(tokens, tags)
        random.shuffle(DATA)

    tagger.model.end_training(path.join(output_dir, 'pos', 'model'))
    vocab.strings.dump(path.join(output_dir, 'vocab', 'strings.txt'))

def main(train_loc, dev_loc, model_dir, tag_map_loc):
    with open(tag_map_loc) as file_:
        tag_map = json.loads(file_.read())
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    with (model_dir / 'deps' / 'config.json').open('w') as file_:
        json.dump({'pseudoprojective': True, 'labels': actions,
                   'features': features}, file_)

    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            for tag in tags:
                assert tag in tag_map, repr(tag)

    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features)

    for itn in range(15):
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                parser.update(doc, gold)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))

def get_model(model_name):
    if model_name not in _models:
        model = spacy.load(model_name)
        if model.tagger is None:
            model.tagger = Tagger(model.vocab, features=Tagger.feature_templates)
        if model.entity is None:
            model.entity = EntityRecognizer(model.vocab, entity_types=[
                'PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT',
                'EVENT', 'WORK_OF_ART', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT',
                'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'])
        model.pipeline = [model.tagger, model.entity, model.parser]
        _models[model_name] = model
    return _models[model_name]

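# get_model memoizes loaded pipelines in a module-level _models dict, so
# repeated calls return the same object. A minimal usage sketch under that
# assumption (the cache is presumed to live next to the function):
_models = {}

nlp = get_model('en')
doc = nlp.make_doc(u'Apple opened an office in London.')
for proc in nlp.pipeline:
    proc(doc)
for ent in doc.ents:
    print(ent.text, ent.label_)
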
def main(model_dir=None):
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    train_data = [
        ('I want to buy a Boxer',
         [(len('I want to buy a '), len('I want to buy a Boxer'), 'PRODUCT')]),
        ('Do you have a Blanket',
         # The original measured this end offset against 'Do you have Blanket',
         # which is one word short of the text above; fixed here.
         [(len('Do you have a '), len('Do you have a Blanket'), 'PRODUCT')]),
        ('Can you show me some Pants',
         [(len('Can you show me some '), len('Can you show me some Pants'), 'PRODUCT')]),
        ('Show me some tops',
         [(len('Show me some '), len('Show me some tops'), 'PRODUCT')]),
    ]
    ner = train_ner(nlp, train_data, ['PRODUCT'])

    # doc = nlp.make_doc('I want a Blanket')
    # nlp.tagger(doc)
    # ner(doc)
    # for word in doc:
    #     print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)

    # train_data = [
    #     (
    #         'Radha',
    #         [(0, len('Radha'), 'PRODUCT')]
    #     )
    # ]
    # ner = train_ner(nlp, train_data, ['PRODUCT'])
    # doc = nlp.make_doc('where is London?')
    # nlp.tagger(doc)
    # ner(doc)
    # for word in doc:
    #     print(word.text, word.ent_type_)

    if model_dir is not None:
        save_model(ner, model_dir)

def main(data_dir, model_dir=None, exclude_normalize_tags=None, keys={}):
    '''
    data_dir -> path to brat annotation data. searches recursively
    model_dir -> path to save spacy training model
    exclude_normalize_tags -> list of tags to exclude from normalization. If
        NONE, no normalization is performed.
    keys -> dict translating brat tags to training tags. keys not in dict will
        be preserved
    '''
    r = RepoModel(data_dir, recursive=True, cached=False)
    nlp = spacy.load('en')

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    normalized_train_data = []
    excludes = exclude_normalize_tags  # we have manually tagged all instances of these
    for key, data in r.documents.items():
        if exclude_normalize_tags:
            normalized_train_data.extend(
                normalize_tags(nlp, get_annotated_sents(data, keys), excludes))
        else:
            normalized_train_data.extend(get_annotated_sents(data, keys))

    nlp = train_ner(nlp, normalized_train_data, keys.values())

    doc = nlp(u"Hi Adam,\nSounds great to me. I'll send through the QA department. "
              u"In the invite you through Skype, and we can discuss if Applause is "
              u"right for you.\nI look forward to it!\nRegards,\nAndrew")
    for word in doc:
        print(word.text, word.tag_, word.ent_type_)
    if model_dir is not None:
        save_model(nlp, model_dir)

def main(model_dir=None):
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        assert model_dir.is_dir()

    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    train_data = [
        (
            'Who is Shaka Khan?',
            [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
        ),
        (
            'I like London and Berlin.',
            [(len('I like '), len('I like London'), 'LOC'),
             (len('I like London and '), len('I like London and Berlin'), 'LOC')]
        )
    ]
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])

    doc = nlp.make_doc('Who is Shaka Khan?')
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.tag_, word.ent_type_, word.ent_iob)

    if model_dir is not None:
        with (model_dir / 'config.json').open('w') as file_:
            json.dump(ner.cfg, file_)
        ner.model.dump(str(model_dir / 'model'))

def main(model_dir=None):
    train_data = make_train_data()
    entity_types = load_entyty_types()

    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    if nlp.tagger is None:
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    ner = train(nlp, train_data, entity_types, 20)

    # small test
    doc = nlp.make_doc(u'is there a delta flight from denver to san francisco')
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.ent_type_)
    # test_model(nlp, ner)

    if model_dir is not None:
        save_model(ner, model_dir)

def main(model_dir=None):
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        assert model_dir.is_dir()

    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    train_data = getData()
    ner = train_ner(nlp, train_data, ['ACC', 'EMAIL'])

    doc = nlp.make_doc('update email of account peps to [email protected]')
    #nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)

    if model_dir is not None:
        # json.dump writes str, so the config file must be opened in text
        # mode ('w'), not binary ('wb') as in the original.
        with (model_dir / 'config.json').open('w') as file_:
            json.dump(ner.cfg, file_)
        ner.model.dump(str(model_dir / 'model'))
        if not (model_dir / 'vocab').exists():
            (model_dir / 'vocab').mkdir()
        ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
        with (model_dir / 'vocab' / 'strings.json').open('w', encoding='utf8') as file_:
            ner.vocab.strings.dump(file_)

def test_load(self):
    data_dir = English.default_data_dir()
    vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
    tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab)

def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False, use_orig_arc_eager=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, raw_text, annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples)
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
              scorer.ents_f, scorer.tags_acc, scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')

def test_create(self):
    vocab = Vocab()
    templates = ((1,),)
    model = Model(vocab.morphology.n_tags, templates, model_loc=None)
    tagger = Tagger(vocab, model)

def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True,
                 'labels': actions,
                 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    continue
                lex = vocab[word]
                lex.cluster = int(cluster[::-1], 2)

    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    assert tag in vocab.morphology.tag_map, repr(tag)

    tagger = Tagger(vocab)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(30):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))

import os
import random

import spacy
from django.conf import settings
from spacy.gold import GoldParse
from spacy.pipeline import EntityRecognizer
from spacy.tagger import Tagger

# Load up our Data dir
# NLP Module
nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

# Quick and easy if you don't have the data installed
if nlp.tagger is None:
    nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)


# Trains our query object
def train_query(queryObj):
    global nlp
    # Our query string
    story = queryObj.story
    querystring = queryObj.querystring
    parsed_ner = queryObj.parsed_ner
    # Where our model is located
    model_path = os.path.normpath(
        os.path.join(settings.SPACYMODEL_DIR, str(story.name)))

def eval(tagger):
    # The snippet began mid-function; this loop head is a reconstruction of
    # the presumed evaluation over the test set (only the tail, from the
    # g == p comparison onwards, appeared in the original).
    right = 0
    wrong = 0
    for (ws, ts) in testset:
        doc = Doc(vocab, words=ws)
        tagger(doc)
        for w, g in zip(doc, ts):
            p = w.tag_
            if g == p:
                right += 1
            else:
                wrong += 1
    acc = 100 * right / (right + wrong)
    print(f"Accuracy: {acc:.2f}")


tag_map = {t: {'pos': 'X'} for t in alltags}
#tag_map.update(TAG_MAP)
vocab = Vocab(tag_map=tag_map)

# Add all train words to vocab!
for (ws, _) in trainset + testset:
    for w in ws:
        _ = vocab[w]

tagger = Tagger(vocab)
for i in range(50):
    print(f"Epoch {i}:")
    for (ws, ts) in trainset:
        doc = Doc(vocab, words=ws)
        gold = GoldParse(doc, tags=ts)
        tagger.update(doc, gold)
    eval(tagger)
    tagger.model.end_training()
    eval(tagger)
    tagger.model.resume_training()
    shuffle(trainset)

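# Why the loop above evaluates twice per epoch: the 1.x Tagger is an averaged
# perceptron, so end_training() swaps in the averaged weights (which usually
# score better on held-out data), and resume_training() switches back so that
# training can continue from the raw weights. The first eval measures the raw
# model, the second measures the averaged one.
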
def test_load(self):
    data_dir = English.default_data_dir()
    if path.exists(path.join(data_dir, 'vocab')):
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
        tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab)

def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False, use_orig_arc_eager=False,
          pseudoprojective=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)

    if pseudoprojective:
        # preprocess training data here before ArcEager.get_labels() is called
        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width, projectivize=pseudoprojective)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string, pos: set([string])
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, raw_text, annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples)
                if not gold.is_projective:
                    raise Exception("Non-projective sentence in training: %s"
                                    % annot_tuples[1])
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
              scorer.ents_f, scorer.tags_acc, scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')

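# Note on the pseudoprojective branch above: PseudoProjectivity rewrites
# non-projective gold parses into projective ones by lifting arcs and encoding
# the original attachment in the dependency labels, so a projective transition
# system like ArcEager can still derive them; the transformation is inverted
# at parse time. That is why preprocessing must run before
# ArcEager.get_labels(), which collects the (augmented) label set.
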
def main(model_dir=None):
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    train_data = [
        ("Masheesh Ikram\nLEAD SOFTWARE ENGINEER\nSupply Chain | Research & Development\nIFS R&D International, \nNo 501, Galle Road, Colombo 06, SRI LANKA\nTel +94 (0) 11 2364 400. Fax +94 (0) 11 2364401. Mobile +94 (0) 779050954\[email protected] | www.IFSWORLD.com \nIFS World Operations AB is a limited liability company registered in Sweden. \nCorporate identity number: 556040-6042. \nRegistered office: Teknikringen 5, Box 1545, SE-581 15 Linköping.",
         [(len('Masheesh Ikram\n'),
           len('Masheesh Ikram\nLEAD SOFTWARE ENGINEER'), 'POS')]),
        ("Asanka Gallege\nSecretary | IFS Welfare\n501, Galle Road, Colombo 06, SRI LANKA\nTel +94 11 236 4400 (ext. 1722). Fax +94 11 236 4401. Mobile +94 71 563 9556\[email protected] | www.IFSWORLD.com \nIFS World Operations AB is a limited liability company registered in Sweden. \nCorporate identity number: 556040-6042. \nRegistered office: Teknikringen 5, Box 1545, SE-581 15 Linköping.",
         [(len('Asanka Gallege\n'),
           len('Asanka Gallege\nSecretary'), 'POS')]),
        ("David Anderson\nEmail: [email protected]\nChief Executive Officer\nOffice 800-555-5555 \nBroadlook Technologies \nCell : 414-555-5555 \n21140 Capitol Drive\nFax : 262-754-8081\nPewaukee WI 53072\nBlog www.idanato.com\nhttp://www.broadlook.com",
         [(len('David Anderson\nEmail: [email protected]\n'),
           len('David Anderson\nEmail: [email protected]\nChief Executive Officer'), 'POS')]),
        ("Valerie Richardson \nAccountant\n2906 N. Glenwood Terrace, Atlanta, GA 30310\n(404) 555-0789\[email protected]\n501, Galle Road, , Colombo 06, SRI LANKA\nTel +94 11 236 44 00. Fax +94 11 236 44 01\[email protected] | www.IFSWORLD.com ",
         [(len('Valerie Richardson \n'),
           len('Valerie Richardson \nAccountant'), 'POS')]),
        ('Kandasamy Yogendirakumar (Yogi)\nMSc, MBCS, MIET | DIRECTOR IFS ACADEMY \n501, Galle Road, Colombo 06, SRI LANKA\nTel +94 (0)112 364 440. Fax +94 (0)112 364 441. Mobile +94 (0)714 039 089 \[email protected]|www.IFSWORLD.com \nIFS World Operations AB is a limited liability company registered in Sweden. \nCorporate identity number: 556040-6042. \nRegistered office: Teknikringen 5, Box 1545, SE-581 15 Linköping.',
         [(len('Kandasamy Yogendirakumar (Yogi)\nMSc, MBCS, MIET | '),
           len('Kandasamy Yogendirakumar (Yogi)\nMSc, MBCS, MIET | DIRECTOR'), 'POS')]),
        ('He was a Software Engineer.',
         # The original measured this end offset against 'He was a Lead
         # Software Engineer', which runs past the text above; fixed here.
         [(len('He was a '), len('He was a Software Engineer'), 'POS')]),
        ('I am an Engineer',
         [(len('I am an '), len('I am an Engineer'), 'POS')]),
        ('I am an Lead Engineer as well as Software Engineer in IFS.',
         [(len('I am an '), len('I am an Lead Engineer'), 'POS'),
          (len('I am an Lead Engineer as well as '),
           len('I am an Lead Engineer as well as Software Engineer'), 'POS')]),
        ('Secretary',
         [(0, len('Secretary'), 'POS')]),
        ('Chief Executive Officer',
         [(0, len('Chief Executive Officer'), 'POS')]),
        ('David Anderson Secretary',
         [(len('David Anderson'), len('David Anderson Secretary'), 'POS')]),
        ('David Anderson\nSecretary',
         [(len('David Anderson\n'), len('David Anderson Secretary'), 'POS')]),
        ('Asanka Gallege\nSecretary | IFS Welfare\n501, Galle Road, Colombo 06, SRI LANKA\nTel +94 11 236 4400 (ext. 1722). Fax +94 11 236 4401. Mobile +94 71 563 9556',
         [(len('Asanka Gallege\n'), len('Asanka Gallege\nSecretary'), 'POS')]),
        ('Fredrik Vom\nGROUP SENIOR VICE PRESIDENT\nBusiness Development\nGullbergs Strandgata 15, SE-411 04 Goteborg,SWEDEN\nTel +46 31 726 3046. Fax +46 31726 3001. Mobile +46 733 453046\[email protected] | www.IFSWORLD.com\nIFS World Operations AB is a limited liability company registered in Sweden.',
         [(len('Fredrik Vom\n'),
           len('Fredrik Vom\nGROUP SENIOR VICE PRESIDENT'), 'POS')]),
        ('Fredrik Vom\nGROUP SENIOR VICE PRESIDENT\nBusiness Development\nGullbergs Strandgata 15, SE-411 04 Goteborg,SWEDEN\nTel +46 31 726 3046. Fax +46 31726 3001. Mobile +46 733 453046\[email protected] | www.IFSWORLD.com\nIFS World Operations AB is a limited liability company registered in Sweden.',
         [(len('Fredrik Vom\n'),
           len('Fredrik Vom\nGROUP SENIOR VICE PRESIDENT'), 'POS')]),
        ('Dr. Ashok Padhye\nGeneral Physician\nA-205, Natasha Apartments\n2, Inner Ring Road\nDomlur\nBANGALORE - 560071\nKarnataka',
         [(len('Dr. Ashok Padhye\n'),
           len('Dr. Ashok Padhye\nGeneral Physician'), 'POS')]),
        ('Dr. Ashok Padhye\nGeneral Physician\nA-205, Natasha Apartments\n2, Inner Ring Road\nDomlur\nBANGALORE - 560071\nKarnataka',
         [(len('Dr. Ashok Padhye\n'),
           len('Dr. Ashok Padhye\nGeneral Physician'), 'POS')]),
    ]
    # file_name = 'D:\PYTHON\Input\input.txt'
    # train_data = open(file_name, "r")
    ner = train_ner(nlp, train_data, ['POS'])

    doc = nlp.make_doc("""
    I am an Lead Engineer as well as Software Engineer in IFS.
    """)
    doc1 = unicode(doc)
    nlp.tagger(doc)
    ner(doc)

    position = []
    for word in doc:
        if word.ent_type_ == 'POS':
            position.append(word.text)
        # print(word.text, word.ent_type_, word.ent_iob)
    # print(position)
    # position = [word for word in doc if word.ent_type == 'POS']
    # print(position)

    i = 0
    pos = []
    new_pos = []
    pos = position
    for x in pos:
        # NB: `word` here is still the last token from the loop above, so
        # this condition does not vary per item in the original code.
        if word.ent_iob == 3 and i != 0:
            new_pos.append(pos[:i])
            pos = position[i:]
        elif i == len(position) - 1:
            new_pos.append(pos)
        i += 1
    for y in new_pos:
        string = " ".join(str(x) for x in y)
        print(string)
    # print(doc1)

    if model_dir is not None:
        save_model(ner, model_dir)

def main(model_dir=None):
    nlp = spacy.get_lang_class('pt')(path=None)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('Setting tagger')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    if len(sys.argv) > 3:
        filetrain = sys.argv[1]
        model_dir = sys.argv[2]
        level = sys.argv[3]
    else:
        print("Usage: python " + sys.argv[0] +
              " <input filename train> <model_dir> <level>\n")
        sys.exit()

    train_data = get_training_data(filetrain)
    nlp = create_vocab(nlp, train_data)

    cat = ['ABSTRACCAO', 'OUTRO', 'LOCAL', 'ACONTECIMENTO', 'TEMPO', 'PESSOA',
           'OBRA', 'ORGANIZACAO', 'VALOR', 'COISA']
    types = ['ABSTRACCAO_IDEIA', 'LOCAL_HUMANO', 'ACONTECIMENTO_EVENTO',
             'ACONTECIMENTO_EFEMERIDE', 'TEMPO_TEMPO_CALEND', 'PESSOA_POVO',
             'PESSOA_INDIVIDUAL', 'OBRA_REPRODUZIDA', 'ABSTRACCAO_DISCIPLINA',
             'PESSOA_GRUPOMEMBRO', 'ORGANIZACAO_INSTITUICAO', 'PESSOA_CARGO',
             'OBRA_PLANO', 'ORGANIZACAO_ADMINISTRACAO', 'TEMPO_GENERICO',
             'ABSTRACCAO_NOME', 'TEMPO_FREQUENCIA', 'LOCAL_FISICO',
             'VALOR_QUANTIDADE', 'COISA_SUBSTANCIA', 'LOCAL_VIRTUAL',
             'COISA_OBJECTO', 'PESSOA_GRUPOIND', 'ORGANIZACAO_EMPRESA',
             'PESSOA_MEMBRO', 'COISA_CLASSE', 'ACONTECIMENTO_ORGANIZADO',
             'TEMPO_DURACAO', 'VALOR_MOEDA', 'VALOR_CLASSIFICACAO',
             'OBRA_ARTE', 'PESSOA_GRUPOCARGO', 'COISA_MEMBROCLASSE',
             'ABSTRACCAO_ESTADO', 'ABSTRACCAO_', 'ORGANIZACAO_', 'OUTRO_',
             'ACONTECIMENTO_', 'LOCAL_OUTRO', 'COISA_OUTRO']
    subtypes = ['LOCAL_HUMANO_DIVISAO', 'TEMPO_TEMPO_CALEND_DATA',
                'LOCAL_HUMANO_PAIS', 'OBRA_REPRODUZIDA_LIVRO', 'PESSOA_POVO_',
                'LOCAL_HUMANO_REGIAO', 'TEMPO_TEMPO_CALEND_INTERVALO',
                'LOCAL_FISICO_AGUACURSO', 'LOCAL_FISICO_AGUAMASSA',
                'TEMPO_TEMPO_CALEND_HORA', 'LOCAL_FISICO_PLANETA',
                'LOCAL_HUMANO_RUA', 'LOCAL_HUMANO_CONSTRUCAO',
                'LOCAL_FISICO_OUTRO', 'LOCAL_VIRTUAL_SITIO',
                'OBRA_REPRODUZIDA_PROGRAMA', 'ORGANIZACAO_INSTITUICAO_',
                'LOCAL_HUMANO_OUTRO', 'OBRA_REPRODUZIDA_MUSICA',
                'OBRA_REPRODUZIDA_OUTRO', 'ORGANIZACAO_INSTITUICAO_SUB',
                'ORGANIZACAO_ADMINISTRACAO_', 'LOCAL_FISICO_REGIAO',
                'ABSTRACCAO_IDEIA_', 'OBRA_ARTE_CONSTRUCAO', 'OBRA_ARTE_OUTRO',
                'LOCAL_FISICO_RELEVO', 'ORGANIZACAO_ADMINISTRACAO_SUB',
                'LOCAL_VIRTUAL_COMSOCIAL', 'ACONTECIMENTO_EFEMERIDE_',
                'ACONTECIMENTO_EVENTO_', 'COISA_OBJECTO_', 'LOCAL_FISICO_ILHA',
                'OBRA_PLANO_', 'OBRA_REPRODUZIDA_FILME', 'ORGANIZACAO_EMPRESA_',
                'LOCAL_VIRTUAL_OBRA', 'ORGANIZACAO_EMPRESA_SUB',
                'ACONTECIMENTO_ORGANIZADO_', 'OBRA_REPRODUZIDA_',
                'LOCAL_VIRTUAL_OUTRO', 'OBRA_ARTE_', 'ABSTRACCAO_NOME_',
                'TEMPO_DURACAO_', 'OBRA_REPRODUZIDA_TEATRO',
                'OBRA_ARTE_PINTURA', 'OBRA_ARTE_EDIFICIO']
    filtered = ['LOCAL', 'ACONTECIMENTO', 'TEMPO', 'PESSOA', 'ORGANIZACAO',
                'VALOR']

    if level == 'cat':
        categories = cat
    elif level == 'types':
        categories = types
    elif level == 'subtypes':
        categories = subtypes
    else:
        categories = filtered

    ner = train_ner(nlp, train_data, categories)
    if model_dir is not None:
        save_model(ner, model_dir + '/' + level)