def train_types(self):
    """
    Train one model for each entity subtype in self.types, in addition to the general model.
    """
    self.basemodel.load_data(self.corpus, feature_extractors.keys(), subtype="all")
    for t in self.types:
        typepath = self.basepath + "_" + t
        model = StanfordNERModel(typepath, subtypes=self.basemodel.subtypes)
        model.copy_data(self.basemodel, t)
        logging.info("training subtype %s" % t)
        model.train()
        self.models[t] = model
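# The functions below assume module-level imports of itertools, logging, and
# random, plus project names (Corpus, ResultsNER, StanfordNERModel,
# CrfSuiteModel, feature_extractors, get_gold_ann_set, get_results, config)
# defined elsewhere in this codebase.
# run_crossvalidation() also relies on a chunks() helper defined elsewhere;
# a minimal sketch, assuming it only splits a list into consecutive sublists
# of at most `size` elements (behavior inferred, not confirmed by this file):
def chunks(l, size):
    return [l[i:i + size] for i in range(0, len(l), size)]
# Note: with size = len(doclist) / cv, any remainder yields one extra short
# sublist, which always ends up in the training folds below.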
def run_crossvalidation(goldstd_list, corpus, model, cv, crf="stanford", entity_type="all", cvlog="cv.log"):
    logfile = open(cvlog, 'w')  # log file handle (opened but not otherwise used here)
    # shuffle the documents and split them into cv folds
    doclist = corpus.documents.keys()
    random.shuffle(doclist)
    size = int(len(doclist) / cv)
    sublists = chunks(doclist, size)
    logging.debug("Chunks:")
    logging.debug(sublists)
    p, r = [], []
    all_results = ResultsNER(model)
    all_results.path = model + "_results"
    for nlist in range(cv):
        # fold nlist is the test set; all other folds form the training set
        testids = sublists[nlist]
        trainids = list(itertools.chain.from_iterable(sublists[:nlist]))
        trainids += list(itertools.chain.from_iterable(sublists[nlist + 1:]))
        print 'CV{} - test set: {}; train set: {}'.format(nlist, len(testids), len(trainids))
        train_corpus = Corpus(corpus.path + "_train",
                              documents={did: corpus.documents[did] for did in trainids})
        test_corpus = Corpus(corpus.path + "_test",
                             documents={did: corpus.documents[did] for did in testids})
        basemodel = model + "_cv{}".format(nlist)
        logging.debug('CV{} - test set: {}; train set: {}'.format(
            nlist, len(test_corpus.documents), len(train_corpus.documents)))
        # train
        logging.info('CV{} - TRAIN'.format(nlist))
        if crf == "stanford":
            train_model = StanfordNERModel(basemodel, entity_type)
        elif crf == "crfsuite":
            train_model = CrfSuiteModel(basemodel, entity_type)
        train_model.load_data(train_corpus, feature_extractors.keys())
        train_model.train()
        # test
        logging.info('CV{} - TEST'.format(nlist))
        if crf == "stanford":
            test_model = StanfordNERModel(basemodel, entity_type)
        elif crf == "crfsuite":
            test_model = CrfSuiteModel(basemodel, entity_type)
        test_model.load_tagger(port=9191 + nlist)
        test_model.load_data(test_corpus, feature_extractors.keys(), mode="test")
        final_results = test_model.test(test_corpus, port=9191 + nlist)
        if crf == "stanford":
            test_model.kill_process()
        final_results.basepath = basemodel + "_results"
        final_results.path = basemodel
        all_results.entities.update(final_results.entities)
        all_results.corpus.documents.update(final_results.corpus.documents)
        # validate (optional ChEBI mapping step, currently disabled)
        # if config.use_chebi:
        #     logging.info('CV{} - VALIDATE'.format(nlist))
        #     final_results = add_chebi_mappings(final_results, basemodel)
        #     final_results = add_ssm_score(final_results, basemodel)
        #     final_results.combine_results(basemodel, basemodel)
        # evaluate this fold against the gold annotations of its test documents
        logging.info('CV{} - EVALUATE'.format(nlist))
        test_goldset = set()
        for gs in goldstd_list:
            goldset = get_gold_ann_set(config.paths[gs]["format"], config.paths[gs]["annotations"],
                                       entity_type, "pairtype", config.paths[gs]["text"])
            for g in goldset[0]:
                if g[0] in testids:
                    test_goldset.add(g)
        precision, recall = get_results(final_results, basemodel, test_goldset, {}, [])
        p.append(precision)
        r.append(recall)
    # report per-fold and average precision/recall, then evaluate the merged results
    pavg = sum(p) / cv
    ravg = sum(r) / cv
    print "precision: average={} all={}".format(str(pavg), '|'.join([str(pp) for pp in p]))
    print "recall: average={} all={}".format(str(ravg), '|'.join([str(rr) for rr in r]))
    all_goldset = set()
    for gs in goldstd_list:
        goldset = get_gold_ann_set(config.paths[gs]["format"], config.paths[gs]["annotations"],
                                   entity_type, "", config.paths[gs]["text"])
        for g in goldset[0]:
            all_goldset.add(g)
    get_results(all_results, model, all_goldset, {}, [])
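# A hedged usage sketch of run_crossvalidation: all names below are hypothetical
# placeholders, and the pickled-corpus loading step is an assumption about how
# corpora are produced elsewhere in this codebase.
if __name__ == "__main__":
    import pickle
    with open("corpora/sample_corpus.pickle", "rb") as f:  # placeholder path
        corpus = pickle.load(f)
    # 5-fold cross-validation with CRFsuite over all entity types
    run_crossvalidation(["sample_goldstd"], corpus, "models/sample_model", 5,
                        crf="crfsuite", entity_type="all")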