class TaggerCollection(object):
    """
    Collection of tagger classifiers used to train and test specific subtype models.

    One StanfordNERModel is kept per entity subtype in ``self.models`` (keyed by
    subtype name), next to a base model (``self.basemodel``) whose loaded data
    is copied into each subtype model.
    """
    # Subtype labels, grouped by the corpus family they are used with.
    CHEMDNER_TYPES = [
        "IDENTIFIER", "MULTIPLE", "FAMILY", "FORMULA", "SYSTEMATIC",
        "ABBREVIATION", "TRIVIAL"
    ]
    GPRO_TYPES = ["NESTED", "IDENTIFIER", "FULL_NAME", "ABBREVIATION"]
    DDI_TYPES = ["drug", "group", "brand", "drug_n"]

    def __init__(self, basepath, baseport=9191, **kwargs):
        """
        Choose the subtype list from the model name and create the base model.

        :param basepath: path prefix of the models; its last "/"-separated
                         component selects which subtypes are used
        :param baseport: first port number; subtype i is served on baseport + i
        :param kwargs: may carry "corpus", the corpus used by train_types
        """
        self.models = {}
        self.basepath = basepath
        self.corpus = kwargs.get("corpus")
        self.baseport = baseport
        self.types = []
        model_name = basepath.split("/")[-1]
        # "chemdner+ddi" must be checked before "chemdner", which is a prefix of it
        if model_name.startswith("chemdner+ddi"):
            self.types = self.DDI_TYPES + self.CHEMDNER_TYPES + [
                "chemdner", "ddi"
            ]
        elif model_name.startswith("ddi"):
            self.types = self.DDI_TYPES + ["all"]
        elif model_name.startswith(("chemdner", "cemp")):
            self.types = ["all"] + self.CHEMDNER_TYPES
        elif model_name.startswith("gpro"):
            self.types = self.GPRO_TYPES + ["all"]
        # any other model name leaves self.types empty
        self.basemodel = StanfordNERModel(self.basepath, "all")

    def train_types(self):
        """
        Train one model for each subtype listed in self.types.

        The data of self.corpus is loaded once into the base model and then
        copied into every subtype model before training. Trained models are
        stored in self.models under their subtype name.
        """
        self.basemodel.load_data(self.corpus, feature_extractors.keys(),
                                 subtype="all")
        for t in self.types:
            typepath = self.basepath + "_" + t
            model = StanfordNERModel(typepath,
                                     subtypes=self.basemodel.subtypes)
            model.copy_data(self.basemodel, t)
            logging.info("training subtype %s", t)
            model.train()
            self.models[t] = model

    def load_models(self):
        """
        Create one model per subtype and load its tagger.

        The i-th subtype of self.types uses port self.baseport + i. Loaded
        models are stored in self.models under their subtype name.
        """
        for i, t in enumerate(self.types):
            model = StanfordNERModel(self.basepath + "_" + t, t,
                                     subtypes=self.basemodel.subtypes)
            model.load_tagger(self.baseport + i)
            self.models[t] = model

    def process_type(self, modelst, t, corpus, basemodel, basepath, port):
        """
        Test one subtype model on a corpus.

        :param modelst: model of the subtype to test
        :param t: subtype name, used only in log messages
        :param corpus: corpus to classify
        :param basemodel: model whose already loaded data is copied into modelst
        :param basepath: not used by this method; kept so callers need not change
        :param port: port passed to modelst.test
        :return: whatever modelst.test returns for this corpus
        """
        # load data only for one model since this takes at least 5 minutes each time
        logging.debug("{}: copying data...".format(t))
        modelst.copy_data(basemodel)
        logging.debug("{}: testing...".format(t))
        res = modelst.test(corpus, port)
        logging.info("{}:done...".format(t))
        return res

    def test_types(self, corpus):
        """
        Classify the corpus with multiple classifiers from different subtypes

        :param corpus: corpus to classify
        :return ResultSetNER object with the results obtained for the models
        """
        # TODO: parallelize
        results = ResultSetNER(corpus, self.basepath)
        self.basemodel.load_data(corpus, feature_extractors.keys())
        # Build every task first so a missing model fails before any testing starts.
        tasks = [(self.models[t], t, corpus, self.basemodel, self.basepath,
                  self.baseport + i) for i, t in enumerate(self.types)]
        all_results = [self.process_type(*task) for task in tasks]
        logging.info("adding results...")
        self._merge_results(results, all_results)
        return results

    @staticmethod
    def _merge_results(results, partial_results):
        """Add every per-model result (not its list index) to the result set."""
        for res in partial_results:
            results.add_results(res)
def run_crossvalidation(goldstd_list, corpus, model, cv, crf="stanford", entity_type="all", cvlog="cv.log"):
    """
    Run a cv-fold cross-validation of a CRF tagger over a corpus.

    The document ids are shuffled and split into chunks of len(documents) // cv
    ids. For each of the first cv chunks, a model is trained on all other
    documents and tested on that chunk; precision and recall of each fold are
    collected and printed. Finally the results accumulated over all folds are
    evaluated against the whole gold standard.

    :param goldstd_list: keys of config.paths naming the gold standards to use
    :param corpus: corpus object with a path and a documents dictionary
    :param model: base path of the models; fold n uses model + "_cv<n>"
    :param cv: number of folds
    :param crf: CRF implementation to use, "stanford" or "crfsuite"
    :param entity_type: entity type given to the models and the gold set loader
    :param cvlog: path of a file that is created/truncated; nothing is written to it
    :raises ValueError: if crf is not one of the supported implementations
    """
    # NOTE(review): another definition of run_crossvalidation appears later in
    # this file; being defined last, that one is the one callers get.
    if crf not in ("stanford", "crfsuite"):
        # previously this surfaced later as an AttributeError on None
        raise ValueError("unknown crf implementation: {}".format(crf))
    # The log file was opened but never used or closed; keep the
    # create/truncate side effect without leaking the handle.
    open(cvlog, 'w').close()
    # list() so the ids can be shuffled in place on Python 3 as well
    doclist = list(corpus.documents.keys())
    random.shuffle(doclist)
    size = len(doclist) // cv
    # NOTE(review): if len(doclist) is not a multiple of cv, chunks() presumably
    # returns an extra shorter chunk that is only ever used for training - confirm.
    sublists = chunks(doclist, size)
    logging.debug("Chunks:")
    logging.debug(sublists)
    p, r = [], []
    all_results = ResultsNER(model)
    all_results.path = model + "_results"
    for nlist in range(cv):
        port = 9191 + nlist
        testids = sublists[nlist]
        trainids = list(itertools.chain.from_iterable(sublists[:nlist]))
        trainids += list(itertools.chain.from_iterable(sublists[nlist + 1:]))
        print('CV{} - test set: {}; train set: {}'.format(
            nlist, len(testids), len(trainids)))
        train_corpus = Corpus(
            corpus.path + "_train",
            documents={did: corpus.documents[did] for did in trainids})
        test_corpus = Corpus(
            corpus.path + "_test",
            documents={did: corpus.documents[did] for did in testids})
        basemodel = model + "_cv{}".format(nlist)
        logging.debug('CV{} - test set: {}; train set: {}'.format(
            nlist, len(test_corpus.documents), len(train_corpus.documents)))

        # train
        logging.info('CV{} - TRAIN'.format(nlist))
        if crf == "stanford":
            train_model = StanfordNERModel(basemodel, entity_type)
        else:
            train_model = CrfSuiteModel(basemodel, entity_type)
        train_model.load_data(train_corpus, feature_extractors.keys())
        train_model.train()

        # test
        logging.info('CV{} - TEST'.format(nlist))
        if crf == "stanford":
            test_model = StanfordNERModel(basemodel, entity_type)
        else:
            test_model = CrfSuiteModel(basemodel, entity_type)
        test_model.load_tagger(port=port)
        try:
            test_model.load_data(test_corpus, feature_extractors.keys(), mode="test")
            final_results = test_model.test(test_corpus, port=port)
        finally:
            # stop the tagger process even when testing fails, so a failed
            # fold does not leave it running
            if crf == "stanford":
                test_model.kill_process()
        final_results.basepath = basemodel + "_results"
        final_results.path = basemodel
        all_results.entities.update(final_results.entities)
        all_results.corpus.documents.update(final_results.corpus.documents)

        # validate: a ChEBI mapping step guarded by config.use_chebi used to
        # run here; it is disabled.

        # evaluate
        logging.info('CV{} - EVALUATE'.format(nlist))
        test_docs = set(testids)  # constant-time membership checks
        test_goldset = set()
        for gs in goldstd_list:
            goldset = get_gold_ann_set(config.paths[gs]["format"],
                                       config.paths[gs]["annotations"],
                                       entity_type, "pairtype",
                                       config.paths[gs]["text"])
            for g in goldset[0]:
                # g[0] is compared with the document ids of this fold
                if g[0] in test_docs:
                    test_goldset.add(g)
        precision, recall = get_results(final_results, basemodel, test_goldset, {}, [])
        p.append(precision)
        r.append(recall)

    pavg = sum(p) / cv
    ravg = sum(r) / cv
    print("precision: average={} all={}".format(
        str(pavg), '|'.join([str(pp) for pp in p])))
    print("recall: average={} all={}".format(
        str(ravg), '|'.join([str(rr) for rr in r])))

    all_goldset = set()
    for gs in goldstd_list:
        # Same argument layout as the per-fold call above: the text path is the
        # fifth argument and the annotations are the first element of the
        # returned value. NOTE(review): "" is used for the pair type here, as
        # in the other definition of this function - confirm.
        goldset = get_gold_ann_set(config.paths[gs]["format"],
                                   config.paths[gs]["annotations"],
                                   entity_type, "",
                                   config.paths[gs]["text"])
        all_goldset.update(goldset[0])
    get_results(all_results, model, all_goldset, {}, [])
def run_crossvalidation(goldstd_list, corpus, model, cv, crf="stanford", entity_type="all", cvlog="cv.log"):
    """
    Run a cv-fold cross-validation of a CRF tagger over a corpus.

    The document ids are shuffled and split into chunks of len(documents) // cv
    ids. For each of the first cv chunks, a model is trained on all other
    documents and tested on that chunk; precision and recall of each fold are
    collected and printed. Finally the results accumulated over all folds are
    evaluated against the whole gold standard.

    :param goldstd_list: keys of config.corpus_paths.paths naming the gold standards to use
    :param corpus: corpus object with a path and a documents dictionary
    :param model: base path of the models; fold n uses model + "_cv<n>"
    :param cv: number of folds
    :param crf: CRF implementation to use, "stanford" or "crfsuite"
    :param entity_type: entity type given to the models and the gold set loader
    :param cvlog: path of a file that is created/truncated; nothing is written to it
    :raises ValueError: if crf is not one of the supported implementations
    """
    # NOTE(review): an earlier definition of run_crossvalidation exists in this
    # file; this later one replaces it when the module is loaded.
    if crf not in ("stanford", "crfsuite"):
        # previously this surfaced later as an AttributeError on None
        raise ValueError("unknown crf implementation: {}".format(crf))
    # The log file was opened but never used or closed; keep the
    # create/truncate side effect without leaking the handle.
    open(cvlog, 'w').close()
    # list() so the ids can be shuffled in place on Python 3 as well
    doclist = list(corpus.documents.keys())
    random.shuffle(doclist)
    size = len(doclist) // cv
    # NOTE(review): if len(doclist) is not a multiple of cv, chunks() presumably
    # returns an extra shorter chunk that is only ever used for training - confirm.
    sublists = chunks(doclist, size)
    logging.debug("Chunks:")
    logging.debug(sublists)
    p, r = [], []
    all_results = ResultsNER(model)
    all_results.path = model + "_results"
    for nlist in range(cv):
        port = 9191 + nlist
        testids = sublists[nlist]
        trainids = list(itertools.chain.from_iterable(sublists[:nlist]))
        trainids += list(itertools.chain.from_iterable(sublists[nlist+1:]))
        print('CV{} - test set: {}; train set: {}'.format(nlist, len(testids), len(trainids)))
        train_corpus = Corpus(corpus.path + "_train",
                              documents={did: corpus.documents[did] for did in trainids})
        test_corpus = Corpus(corpus.path + "_test",
                             documents={did: corpus.documents[did] for did in testids})
        basemodel = model + "_cv{}".format(nlist)
        logging.debug('CV{} - test set: {}; train set: {}'.format(
            nlist, len(test_corpus.documents), len(train_corpus.documents)))

        # train
        logging.info('CV{} - TRAIN'.format(nlist))
        if crf == "stanford":
            train_model = StanfordNERModel(basemodel, entity_type)
        else:
            train_model = CrfSuiteModel(basemodel, entity_type)
        train_model.load_data(train_corpus, feature_extractors.keys())
        train_model.train()

        # test
        logging.info('CV{} - TEST'.format(nlist))
        if crf == "stanford":
            test_model = StanfordNERModel(basemodel, entity_type)
        else:
            test_model = CrfSuiteModel(basemodel, entity_type)
        test_model.load_tagger(port=port)
        try:
            test_model.load_data(test_corpus, feature_extractors.keys(), mode="test")
            final_results = test_model.test(test_corpus, port=port)
        finally:
            # stop the tagger process even when testing fails, so a failed
            # fold does not leave it running
            if crf == "stanford":
                test_model.kill_process()
        final_results.basepath = basemodel + "_results"
        final_results.path = basemodel
        all_results.entities.update(final_results.entities)
        all_results.corpus.documents.update(final_results.corpus.documents)

        # validate: a ChEBI mapping step guarded by config.use_chebi used to
        # run here; it is disabled.

        # evaluate
        logging.info('CV{} - EVALUATE'.format(nlist))
        test_docs = set(testids)  # constant-time membership checks
        test_goldset = set()
        for gs in goldstd_list:
            gs_paths = config.corpus_paths.paths[gs]
            goldset = get_gold_ann_set(gs_paths["format"], gs_paths["annotations"],
                                       entity_type, "pairtype", gs_paths["text"])
            for g in goldset[0]:
                # g[0] is compared with the document ids of this fold
                if g[0] in test_docs:
                    test_goldset.add(g)
        precision, recall = get_results(final_results, basemodel, test_goldset, {}, [])
        p.append(precision)
        r.append(recall)

    pavg = sum(p)/cv
    ravg = sum(r)/cv
    print("precision: average={} all={}".format(str(pavg), '|'.join([str(pp) for pp in p])))
    print("recall: average={} all={}".format(str(ravg), '|'.join([str(rr) for rr in r])))

    # evaluate the results of all folds together against the whole gold standard
    all_goldset = set()
    for gs in goldstd_list:
        gs_paths = config.corpus_paths.paths[gs]
        goldset = get_gold_ann_set(gs_paths["format"], gs_paths["annotations"],
                                   entity_type, "", gs_paths["text"])
        all_goldset.update(goldset[0])
    get_results(all_results, model, all_goldset, {}, [])