def __init__(self, basemodel, ensemble_model, submodels):
    """Set up the CoreNLP client, the per-type taggers and the ensemble model.

    :param basemodel: base path of the tagger models; ``basemodel + "_combined"``
        is the name handed to the ensemble
    :param ensemble_model: first argument passed to ``EnsembleNER``
    :param submodels: entity sub types, stored as ``self.subtypes`` and passed
        to the ensemble as ``types``
    """
    # Keep the constructor arguments around under the attribute names used
    # by the rest of the class.
    self.corenlp = StanfordCoreNLP(config.corenlp_dir)
    self.basemodel = basemodel
    self.ensemble_model = ensemble_model
    self.subtypes = submodels

    # One tagger collection rooted at the base model path, loaded eagerly.
    taggers = TaggerCollection(basepath=basemodel)
    self.models = taggers
    taggers.load_models()

    # The ensemble works on the combined output of the taggers above.
    combined_name = basemodel + "_combined"
    ensemble = EnsembleNER(
        ensemble_model,
        None,
        combined_name,
        types=submodels,
        features=[],
    )
    self.ensemble = ensemble
    ensemble.load()
def main():
    """Command-line entry point for corpus loading, NER/RE training and testing.

    Parses the command line, configures logging to ``debug.log`` and runs the
    single action named by the positional ``actions`` argument.
    ``load_corpus`` and ``annotate`` operate on exactly one gold standard;
    every other action first merges the pickled corpora of all ``--goldstd``
    entries into a single corpus and then dispatches on the action name.

    Raises:
        ValueError: if ``--log`` is not a valid logging level name, or if
            ``--kernel`` names a relation kernel that the requested relation
            action does not support.
    """
    start_time = time.time()

    def _load_pickle(path):
        # Read a pickled object and close the file deterministically
        # (the previous pickle.load(open(...)) leaked the handle).
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify", help="Actions to be performed.",
                        choices=["load_corpus", "annotate", "classify", "write_results",
                                 "write_goldstandard", "train", "test", "train_multiple",
                                 "test_multiple", "train_matcher", "test_matcher",
                                 "crossvalidation", "train_relations", "test_relations"])
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=config.paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers")
    parser.add_argument("-i", "--input", dest="input", action="store",
                        default='''Administration of a higher dose of indinavir should be \\ considered when coadministering with megestrol acetate.''',
                        help="Text to classify.")
    parser.add_argument("--corpus", dest="corpus", nargs=2,
                        default=["chemdner",
                                 "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"],
                        help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the text.")
    parser.add_argument("--models", dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--entitytype", dest="etype",
                        help="type of entities to be considered", default="all")
    parser.add_argument("--pairtype", dest="ptype",
                        help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype",
                        help="type of document to be considered", default="all")
    parser.add_argument("--annotated", action="store_true", default=False, dest="annotated",
                        help="True if the input has <entity> tags.")
    parser.add_argument("-o", "--output", "--format", dest="output", nargs=2,
                        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford",
                        choices=["stanford", "crfsuite"])
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING",
                        help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    # Drop handlers installed by imported modules so basicConfig takes effect.
    while logging.root.handlers:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format, filename="debug.log")
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = config.paths[options.goldstd]["format"]
        corpus_path = config.paths[options.goldstd]["text"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        corpus.save(config.paths[options.goldstd]["corpus"])
        if corpus_ann:  # add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(config.paths[options.goldstd]["corpus"])

    elif options.actions == "annotate":  # re-add annotation to corpus
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = config.paths[options.goldstd]["corpus"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = _load_pickle(corpus_path)
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        corpus.save(config.paths[options.goldstd]["corpus"])

    else:
        # Every remaining action works on the union of the selected corpora.
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            corpus_path = config.paths[g]["corpus"]
            logging.info("loading corpus %s" % corpus_path)
            this_corpus = _load_pickle(corpus_path)
            corpus.documents.update(this_corpus.documents)

        if options.actions == "write_goldstandard":
            model = BiasModel(options.output[1])
            model.load_data(corpus, [])
            results = model.test()
            results.save(options.output[1] + ".pickle")

        # training
        elif options.actions == "train":
            if options.crf == "stanford":
                model = StanfordNERModel(options.models, options.etype)
            elif options.crf == "crfsuite":
                model = CrfSuiteModel(options.models, options.etype)
            model.load_data(corpus, feature_extractors.keys(), options.etype)
            model.train()
        elif options.actions == "train_matcher":
            # Train a simple classifier based on string matching
            model = MatcherModel(options.models)
            model.train(corpus)
            # TODO: term list option
        elif options.actions == "train_multiple":
            # Train one classifier for each type of entity in this corpus
            models = TaggerCollection(basepath=options.models, corpus=corpus,
                                      subtypes=corpus.subtypes)
            models.train_types()
        elif options.actions == "train_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "multir":
                model = MultiR(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype)
            else:
                # Previously fell through to an UnboundLocalError on `model`.
                raise ValueError('Unsupported kernel for train_relations: %s' % options.kernel)
            model.train()

        # testing
        elif options.actions == "test":
            base_port = 9191
            if len(options.submodels) > 1:
                allresults = ResultSetNER(corpus, options.output[1])
                for i, submodel in enumerate(options.submodels):
                    model = StanfordNERModel(options.models + "_" + submodel)
                    # each submodel gets its own tagger port
                    model.load_tagger(base_port + i)
                    # load data into the model format
                    model.load_data(corpus, feature_extractors.keys(), mode="test")
                    # run the classifier on the data
                    results = model.test(corpus, port=base_port + i)
                    allresults.add_results(results)
                    model.kill_process()
                final_results = allresults.combine_results()
            else:
                if options.crf == "stanford":
                    model = StanfordNERModel(options.models, options.etype)
                elif options.crf == "crfsuite":
                    model = CrfSuiteModel(options.models, options.etype)
                model.load_tagger()
                model.load_data(corpus, feature_extractors.keys(), mode="test")
                final_results = model.test(corpus)
            # save the results to an object that can be read again
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_matcher":
            if "mirna" in options.models:
                model = MirnaMatcher(options.models)
            else:
                model = MatcherModel(options.models)
            results = ResultsNER(options.models)
            results.corpus, results.entities = model.test(corpus)
            # distinct entity strings, one per line
            allentities = {results.entities[e].text for e in results.entities}
            with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
                outfile.write('\n'.join(allentities))
            results.save(options.output[1] + ".pickle")
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(
                ' '.join(options.submodels)))
            allresults = ResultSetNER(corpus, options.output[1])
            if len(options.submodels) < 2:
                models = TaggerCollection(basepath=options.models)
                models.load_models()
                results = models.test_types(corpus)
                final_results = results.combine_results()
            else:
                base_port = 9191
                for submodel in options.submodels:
                    models = TaggerCollection(basepath=options.models + "_" + submodel,
                                              baseport=base_port)
                    models.load_models()
                    results = models.test_types(corpus)
                    logging.info("combining results...")
                    submodel_results = results.combine_results()
                    allresults.add_results(submodel_results)
                    # the next collection starts after the ports used by this one
                    base_port += len(models.models)
                final_results = allresults.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, train=False)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype)
            elif options.kernel == "rules":
                model = RuleClassifier(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype, test=True)
            else:
                # Previously fell through to an UnboundLocalError on `model`.
                raise ValueError('Unsupported kernel for test_relations: %s' % options.kernel)
            model.load_classifier()
            model.test()
            results = model.get_predictions(corpus)
            results.save(options.output[1] + ".pickle")

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
def main():
    """Command-line entry point for the SeeDev corpus pipeline.

    Parses the command line, configures logging and runs the action named by
    the positional ``actions`` argument. ``load_corpus`` and ``annotate``
    operate on exactly one gold standard; every other action loads the pickled
    corpus of the first ``--goldstd`` entry into a ``SeeDevCorpus`` and then
    dispatches on the action name.

    Raises:
        ValueError: if ``--log`` is not a valid logging level name, or if
            ``--kernel`` names a kernel that ``test_relations`` does not
            support.
    """
    start_time = time.time()

    def _load_pickle(path):
        # Read a pickled object and close the file deterministically
        # (the previous pickle.load(open(...)) leaked the handle).
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _prf(tps, fps, fns):
        # Precision, recall and F-score from true/false positive and false
        # negative counts.
        if tps == 0 and fns == 0:
            # Convention kept from the original code for "nothing to find".
            return 0, 1, 1
        # Both ratios use the true positives as numerator; the previous code
        # used fns for recall and fps/fns for the pooled scores.
        precision = 1.0 * tps / (tps + fps) if (tps + fps) else 0.0
        recall = 1.0 * tps / (tps + fns)
        if precision + recall == 0:
            return precision, recall, 0.0
        return precision, recall, 2.0 * precision * recall / (recall + precision)

    def _report_sentence_scores(classify):
        # Run `classify` for every pair type, printing the per-type scores and
        # finally the scores over the pooled counts.
        totals = [0, 0, 0]
        for p in pair_types:
            print(p)
            tps, fps, fns = classify(p)
            print("%s %s %s" % _prf(tps, fps, fns))
            totals[0] += tps
            totals[1] += fps
            totals[2] += fns
        print("%s %s %s" % _prf(*totals))

    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify", help="Actions to be performed.")
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers")
    parser.add_argument("--models", dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--pairtype", dest="ptype",
                        help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype",
                        help="type of document to be considered", default="all")
    parser.add_argument("-o", "--output", "--format", dest="output", nargs=2,
                        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING",
                        help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    # Drop handlers installed by imported modules so basicConfig takes effect.
    while logging.root.handlers:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]
        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = SeeDevCorpus(corpus_path)
        corpus.load_corpus(corenlp_client)
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann:  # add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, "all")
            corpus.save(paths[options.goldstd]["corpus"])

    elif options.actions == "annotate":  # re-add annotation to corpus
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = _load_pickle(corpus_path)
        logging.debug("loading annotations...")
        corpus.load_annotations(corpus_ann, "all", options.ptype)
        corpus.save(paths[options.goldstd]["corpus"])

    else:
        # Only the first gold standard is used; its documents are re-wrapped
        # in a SeeDevCorpus.
        corpus_path = paths[options.goldstd[0]]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        basecorpus = _load_pickle(corpus_path)
        corpus = SeeDevCorpus(corpus_path)
        corpus.documents = basecorpus.documents

        if options.actions == "add_sentences":
            corpus.add_more_sentences(options.models)
        elif options.actions == "add_goldstandard":
            corpus.convert_entities_to_goldstandard()
            corpus.find_ds_relations()
        elif options.actions == "train_multiple":
            # Train one classifier for each type of entity in this corpus
            models = TaggerCollection(basepath=options.models, corpus=corpus,
                                      subtypes=all_entity_types)
            models.train_types()
        elif options.actions == "train_relations":
            if options.ptype == "all":
                ptypes = pair_types.keys()
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print(p)
                # NOTE(review): the model is only constructed here; the
                # explicit model.train() call was already disabled upstream.
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=True)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "multir":
                    model = MultiR(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p)

        # testing
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(
                ' '.join(options.submodels)))
            models = TaggerCollection(basepath=options.models, subtypes=all_entity_types)
            models.load_models()
            results = models.test_types(corpus)
            final_results = results.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            gold_paths = paths[options.goldstd[0]]
            if options.ptype == "all":
                ptypes = pair_types.keys()
                # accumulator for the pairs predicted for every pair type
                all_results = ResultsRE(options.output[1])
                all_results.corpus = corpus
                all_results.path = options.output[1]
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print(p)
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=False)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "rules":
                    model = RuleClassifier(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p, test=True)
                else:
                    # Previously fell through to an UnboundLocalError on `model`.
                    raise ValueError('Unsupported kernel for test_relations: %s'
                                     % options.kernel)
                model.load_classifier()
                model.test()
                results = model.get_predictions(corpus)
                results.path = options.output[1] + "_" + p.lower()
                goldset = get_gold_ann_set(gold_paths["format"], gold_paths["annotations"],
                                           "all", p, gold_paths["text"])
                get_relations_results(results, options.models, goldset[1], [], [])
                if options.ptype == "all":
                    # merge this pair type's predictions into the accumulator
                    for did in results.document_pairs:
                        if did not in all_results.document_pairs:
                            all_results.document_pairs[did] = Pairs(did=did)
                        all_results.document_pairs[did].pairs += results.document_pairs[did].pairs
            if options.ptype == "all":
                goldset = get_gold_ann_set(gold_paths["format"], gold_paths["annotations"],
                                           "all", "all", gold_paths["text"])
                get_relations_results(all_results, options.models, goldset[1], [], [])
                write_seedev_results(all_results, options.output[1])
        elif options.actions == "train_sentences":  # and evaluate
            if options.ptype == "all":
                _report_sentence_scores(corpus.train_sentence_classifier)
            else:
                res = corpus.train_sentence_classifier(options.ptype)
                print(res)
            corpus.save(paths[options.goldstd[0]]["corpus"])
        elif options.actions == "test_sentences":  # and evaluate
            if options.ptype == "all":
                _report_sentence_scores(corpus.test_sentence_classifier)
        elif options.actions == "evaluate_ner":
            # NOTE(review): when the results pickle is missing, `results` is
            # never bound and the loop below fails on first use - confirm
            # whether a missing file should be reported explicitly.
            if os.path.exists(options.output[1] + ".pickle"):
                results = _load_pickle(options.output[1] + ".pickle")
                results.load_corpus(options.goldstd[0])
                results.path = options.output[1]
            logging.info("loading gold standard %s" % paths[options.goldstd[0]]["annotations"])
            for t in all_entity_types:
                print(t)
                results.path = options.output[1] + "_" + t
                goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"],
                                           paths[options.goldstd[0]]["annotations"],
                                           t, options.ptype,
                                           paths[options.goldstd[0]]["text"])
                get_results(results, options.models + "_" + t, goldset[0], {}, {})

        # NOTE(review): assumed to run after every action of this branch so
        # that in-memory changes (e.g. add_sentences) are persisted - confirm
        # against the original indentation.
        corpus.save(paths[options.goldstd[0]]["corpus"])

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
def main():
    """Command-line entry point for the SeeDev corpus pipeline.

    Parses the command line, configures logging and runs the action named by
    the positional ``actions`` argument. ``load_corpus`` and ``annotate``
    operate on exactly one gold standard; every other action loads the pickled
    corpus of the first ``--goldstd`` entry into a ``SeeDevCorpus`` and then
    dispatches on the action name.

    Raises:
        ValueError: if ``--log`` is not a valid logging level name, or if
            ``--kernel`` names a kernel that ``test_relations`` does not
            support.
    """
    start_time = time.time()

    def _load_pickle(path):
        # Read a pickled object and close the file deterministically
        # (the previous pickle.load(open(...)) leaked the handle).
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _prf(tps, fps, fns):
        # Precision, recall and F-score from true/false positive and false
        # negative counts.
        if tps == 0 and fns == 0:
            # Convention kept from the original code for "nothing to find".
            return 0, 1, 1
        # Both ratios use the true positives as numerator; the previous code
        # used fns for recall and fps/fns for the pooled scores.
        precision = 1.0 * tps / (tps + fps) if (tps + fps) else 0.0
        recall = 1.0 * tps / (tps + fns)
        if precision + recall == 0:
            return precision, recall, 0.0
        return precision, recall, 2.0 * precision * recall / (recall + precision)

    def _report_sentence_scores(classify):
        # Run `classify` for every pair type, printing the per-type scores and
        # finally the scores over the pooled counts.
        totals = [0, 0, 0]
        for p in pair_types:
            print(p)
            tps, fps, fns = classify(p)
            print("%s %s %s" % _prf(tps, fps, fns))
            totals[0] += tps
            totals[1] += fps
            totals[2] += fns
        print("%s %s %s" % _prf(*totals))

    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify", help="Actions to be performed.")
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers")
    parser.add_argument("--models", dest="models",
                        help="model destination path, without extension")
    parser.add_argument("--pairtype", dest="ptype",
                        help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype",
                        help="type of document to be considered", default="all")
    parser.add_argument("-o", "--output", "--format", dest="output", nargs=2,
                        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING",
                        help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    # Drop handlers installed by imported modules so basicConfig takes effect.
    while logging.root.handlers:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]
        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = SeeDevCorpus(corpus_path)
        corpus.load_corpus(corenlp_client)
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann:  # add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, "all")
            corpus.save(paths[options.goldstd]["corpus"])

    elif options.actions == "annotate":  # re-add annotation to corpus
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = _load_pickle(corpus_path)
        logging.debug("loading annotations...")
        corpus.load_annotations(corpus_ann, "all", options.ptype)
        corpus.save(paths[options.goldstd]["corpus"])

    else:
        # Only the first gold standard is used; its documents are re-wrapped
        # in a SeeDevCorpus.
        corpus_path = paths[options.goldstd[0]]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        basecorpus = _load_pickle(corpus_path)
        corpus = SeeDevCorpus(corpus_path)
        corpus.documents = basecorpus.documents

        if options.actions == "add_sentences":
            corpus.add_more_sentences(options.models)
        elif options.actions == "add_goldstandard":
            corpus.convert_entities_to_goldstandard()
            corpus.find_ds_relations()
        elif options.actions == "train_multiple":
            # Train one classifier for each type of entity in this corpus
            models = TaggerCollection(basepath=options.models, corpus=corpus,
                                      subtypes=all_entity_types)
            models.train_types()
        elif options.actions == "train_relations":
            if options.ptype == "all":
                ptypes = pair_types.keys()
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print(p)
                # NOTE(review): the model is only constructed here; the
                # explicit model.train() call was already disabled upstream.
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=True)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "multir":
                    model = MultiR(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p)

        # testing
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(
                ' '.join(options.submodels)))
            models = TaggerCollection(basepath=options.models, subtypes=all_entity_types)
            models.load_models()
            results = models.test_types(corpus)
            final_results = results.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            gold_paths = paths[options.goldstd[0]]
            if options.ptype == "all":
                ptypes = pair_types.keys()
                # accumulator for the pairs predicted for every pair type
                all_results = ResultsRE(options.output[1])
                all_results.corpus = corpus
                all_results.path = options.output[1]
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print(p)
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=False)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "rules":
                    model = RuleClassifier(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p, test=True)
                else:
                    # Previously fell through to an UnboundLocalError on `model`.
                    raise ValueError('Unsupported kernel for test_relations: %s'
                                     % options.kernel)
                model.load_classifier()
                model.test()
                results = model.get_predictions(corpus)
                results.path = options.output[1] + "_" + p.lower()
                goldset = get_gold_ann_set(gold_paths["format"], gold_paths["annotations"],
                                           "all", p, gold_paths["text"])
                get_relations_results(results, options.models, goldset[1], [], [])
                if options.ptype == "all":
                    # merge this pair type's predictions into the accumulator
                    for did in results.document_pairs:
                        if did not in all_results.document_pairs:
                            all_results.document_pairs[did] = Pairs(did=did)
                        all_results.document_pairs[did].pairs += results.document_pairs[did].pairs
            if options.ptype == "all":
                goldset = get_gold_ann_set(gold_paths["format"], gold_paths["annotations"],
                                           "all", "all", gold_paths["text"])
                get_relations_results(all_results, options.models, goldset[1], [], [])
                write_seedev_results(all_results, options.output[1])
        elif options.actions == "train_sentences":  # and evaluate
            if options.ptype == "all":
                _report_sentence_scores(corpus.train_sentence_classifier)
            else:
                res = corpus.train_sentence_classifier(options.ptype)
                print(res)
            corpus.save(paths[options.goldstd[0]]["corpus"])
        elif options.actions == "test_sentences":  # and evaluate
            if options.ptype == "all":
                _report_sentence_scores(corpus.test_sentence_classifier)
        elif options.actions == "evaluate_ner":
            # NOTE(review): when the results pickle is missing, `results` is
            # never bound and the loop below fails on first use - confirm
            # whether a missing file should be reported explicitly.
            if os.path.exists(options.output[1] + ".pickle"):
                results = _load_pickle(options.output[1] + ".pickle")
                results.load_corpus(options.goldstd[0])
                results.path = options.output[1]
            logging.info("loading gold standard %s" % paths[options.goldstd[0]]["annotations"])
            for t in all_entity_types:
                print(t)
                results.path = options.output[1] + "_" + t
                goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"],
                                           paths[options.goldstd[0]]["annotations"],
                                           t, options.ptype,
                                           paths[options.goldstd[0]]["text"])
                get_results(results, options.models + "_" + t, goldset[0], {}, {})

        # NOTE(review): assumed to run after every action of this branch so
        # that in-memory changes (e.g. add_sentences) are persisted - confirm
        # against the original indentation.
        corpus.save(paths[options.goldstd[0]]["corpus"])

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
class IICEServer(object):
    """Request handlers that run the entity taggers on submitted text.

    The constructor loads a ``TaggerCollection`` and an ``EnsembleNER`` model
    once; the handler methods read their input from ``bottle.request`` or from
    their arguments and return the serialized annotations.
    """

    def __init__(self, basemodel, ensemble_model, submodels):
        """Load the CoreNLP client, the tagger collection and the ensemble.

        :param basemodel: base path of the tagger models; results are stored
            under the name ``basemodel + "_combined"``
        :param ensemble_model: first argument passed to ``EnsembleNER``
        :param submodels: entity sub types handed to the ensemble as ``types``
        """
        self.corenlp = StanfordCoreNLP(config.corenlp_dir)
        self.basemodel = basemodel
        self.ensemble_model = ensemble_model
        self.subtypes = submodels
        self.models = TaggerCollection(basepath=self.basemodel)
        self.models.load_models()
        self.ensemble = EnsembleNER(self.ensemble_model, None, self.basemodel + "_combined",
                                    types=self.subtypes, features=[])
        self.ensemble.load()

    def hello(self):
        """Return a fixed greeting."""
        return "Hello World!"

    def process_pubmed(self, pmid):
        """Fetch title and abstract for ``pmid``.

        NOTE(review): the fetched values are not used or returned yet.
        """
        title, text = pubmed.get_pubmed_abs(pmid)

    def id_generator(self, size=6, chars=string.ascii_uppercase + string.digits):
        """Return a random identifier of ``size`` characters drawn from ``chars``."""
        return ''.join(random.choice(chars) for _ in range(size))

    def process_multiple(self):
        """Tag the text of the current JSON request with every loaded model.

        Reads ``text`` and ``format`` from the request body, runs all taggers,
        pickles the annotated corpus to ``temp/<results_id>.pickle`` and
        returns the output serialized by :meth:`get_output`.
        """
        bottle.response.content_type = "application/json"
        data = bottle.request.json
        text = data["text"]
        output_format = data["format"]
        test_corpus = self.generate_corpus(text)
        multiple_results = self.models.test_types(test_corpus)
        final_results = multiple_results.combine_results()
        final_results = add_chebi_mappings(final_results, self.basemodel)
        final_results = add_ssm_score(final_results, self.basemodel)
        final_results.combine_results(self.basemodel, self.basemodel + "_combined")
        results_id = self.id_generator()
        output = self.get_output(final_results, self.basemodel + "_combined",
                                 format=output_format, results_id=results_id)
        self.clean_up()
        # Save the corpus so a later get_relations request can reload it by ID.
        # Pickle data is binary: write in 'wb' mode and close the file.
        with open("temp/{}.pickle".format(results_id), 'wb') as corpus_file:
            pickle.dump(final_results.corpus, corpus_file)
        return output

    def clean_up(self):
        """Reset every loaded tagger and the collection's base model."""
        for m in self.models.models:
            self.models.models[m].reset()
        self.models.basemodel.reset()

    def process(self, text="", modeltype="all"):
        """Tag ``text`` with the single chemdner model selected by ``modeltype``.

        :return: the annotations serialized with the default output format
        """
        model_path = "models/chemdner_train_f13_lbfgs_" + modeltype
        test_corpus = self.generate_corpus(text)
        model = SimpleTaggerModel(model_path)
        model.load_tagger()
        # load data into the model format
        model.load_data(test_corpus, feature_extractors.keys())
        # run the classifier on the data
        results = model.test(stats=False)
        return self.get_output(results, model_path)

    def get_relations(self):
        """Build entity pairs for each sentence and run relation testing.

        Reloads the corpus pickled by :meth:`process_multiple` using the
        ``corpusfile`` ID of the current JSON request.

        :return: the request data
        """
        data = bottle.request.json
        if "corpusfile" in data:
            # NOTE(review): the file name comes from the request and unpickling
            # can execute arbitrary code - validate the ID against the
            # id_generator alphabet before trusting it.
            with open("temp/{}.pickle".format(data["corpusfile"]), 'rb') as corpus_file:
                corpus = pickle.load(corpus_file)
            logging.info("loaded corpus {}".format(data["corpusfile"]))
        else:
            # NOTE(review): building a corpus from the request is not
            # implemented; the loop below fails on None in this case.
            corpus = None
        did = "d0"
        for sentence in corpus.documents["d0"].sentences[1:]:
            sentence_pairs = []
            sentence_entities = sentence.entities.elist[self.basemodel + "_combined"]
            sid = sentence.sid
            # pair every entity with each entity that follows it in the list
            for i1, e1 in enumerate(sentence_entities):
                logging.info("sentence entities:" + str(e1))
                for e2 in sentence_entities[i1 + 1:]:
                    logging.info("sentence entities:" + str(e2))
                    pid = sentence.sid + ".p{}".format(len(sentence_pairs))
                    newpair = Pair(entities=[e1, e2], sid=sid, pid=pid, did=did)
                    sentence_pairs.append(newpair)
                    sentence.pairs.pairs[pid] = newpair
            logging.info(str(sentence_pairs))
            if len(sentence_pairs) > 0:
                corpus.documents[did].get_sentence(sid).test_relations(
                    sentence_pairs, self.basemodel + "_combined")
        return data

    def generate_corpus(self, text):
        """Create a corpus holding one processed document ``d0`` built from ``text``.

        :param text: raw text of the document
        :return: the new corpus
        """
        test_corpus = Corpus("")
        newdoc = Document(text, process=False, did="d0", title="Test document")
        newdoc.sentence_tokenize("biomedical")
        newdoc.process_document(self.corenlp, "biomedical")
        test_corpus.documents["d0"] = newdoc
        return test_corpus

    def get_output(self, results, model_name, format="bioc", results_id=None):
        """Serialize the annotations of document ``d0`` in the requested format.

        :param results: results object whose ``corpus`` holds document ``d0``
        :param model_name: name of the model whose annotations are written
        :param format: ``"bioc"`` (pretty-printed XML), ``"chemdner"`` (fields
            and lines joined by single spaces); anything else yields JSON
        :param results_id: stored as ``corpusfile`` in the JSON output
        :return: the serialized output as a string
        """
        if format == "bioc":
            a = ET.Element('collection')
            # write_bioc_results fills the element tree rooted at `a`
            results.corpus.documents["d0"].write_bioc_results(a, model_name)
            rough_string = ET.tostring(a, 'utf-8')
            reparsed = minidom.parseString(rough_string)
            output = reparsed.toprettyxml(indent="\t")
        elif format == "chemdner":
            # only the returned lines are needed; the file output is discarded
            with codecs.open("/dev/null", 'w', 'utf-8') as outfile:
                lines = results.corpus.write_chemdner_results(model_name, outfile)
            output = "".join(' '.join(l) + " " for l in lines)
        else:  # default should be json
            results_dic = results.corpus.documents["d0"].get_dic(model_name)
            results_dic["corpusfile"] = results_id
            output = json.dumps(results_dic)
        return output
def _load_pickled_corpus(corpus_path):
    """Log, unpickle and return the corpus stored at ``corpus_path``, closing the file."""
    logging.info("loading corpus %s" % corpus_path)
    with open(corpus_path, 'rb') as corpus_file:
        return pickle.load(corpus_file)


def _single_goldstd(parser, options, enforce_one=True):
    """Return the gold standard name a pre-processing action works on.

    Exits with a usage error when --goldstd was not given and, if
    ``enforce_one`` is set, with status 1 when more than one was given.
    """
    if not options.goldstd:
        parser.error("--goldstd is required for action '%s'" % options.actions)
    if enforce_one and len(options.goldstd) > 1:
        print("load only one corpus each time")
        sys.exit(1)
    return options.goldstd[0]


def _load_transmir_relations(relations_path="corpora/transmir/transmir_relations.txt"):
    """Return the set of tab-separated relation tuples read from ``relations_path``."""
    relations = set()
    with open(relations_path) as rfile:
        for line in rfile:
            relations.add(tuple(line.strip().split('\t')))
    return relations


def main():
    """Command line entry point: parse the arguments and run the requested action.

    The pre-processing actions (load_corpus, load_genia, load_biomodel,
    annotate) work on a single gold standard and save the corpus back to its
    configured path. For every other action the corpora named by --goldstd are
    unpickled and merged into one corpus before the action runs.

    Exits through ``parser.error`` when a required option is missing or when
    the chosen --crf/--kernel value is not handled by the requested action.
    """
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify", help="Actions to be performed.",
                        choices=["load_corpus", "annotate", "classify", "write_results", "write_goldstandard",
                                 "train", "test", "train_multiple", "test_multiple", "train_matcher",
                                 "test_matcher", "crossvalidation", "train_relations", "test_relations",
                                 "load_genia", "load_biomodel", "merge_corpus"])
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers")
    parser.add_argument("-i", "--input", dest="input", action="store",
                        default='''Administration of a higher dose of indinavir should be \\ considered when coadministering with megestrol acetate.''',
                        help="Text to classify.")
    parser.add_argument("--corpus", dest="corpus", nargs=2,
                        default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"],
                        help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the experiment")
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("--annotated", action="store_true", default=False, dest="annotated",
                        help="True if the input has <entity> tags.")
    parser.add_argument("-o", "--output", "--format", dest="output", nargs=2,
                        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford",
                        choices=["stanford", "crfsuite", "banner"])
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)

    # drop handlers installed earlier so that basicConfig takes effect
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # These actions read options.output[1]; fail before any expensive work
    # instead of crashing on a missing -o once the results are ready.
    needs_output = ("write_goldstandard", "merge_corpus", "test", "test_matcher",
                    "test_multiple", "test_relations")
    if options.actions in needs_output and not options.output:
        parser.error("-o/--output FORMAT PATH is required for action '%s'" % options.actions)

    # set configuration variables based on the goldstd option if the corpus has a gold standard,
    # or on corpus and annotation options
    # pre-processing options
    if options.actions == "load_corpus":
        options.goldstd = _single_goldstd(parser, options)
        corpus_format = paths[options.goldstd]["format"]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]
        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        #corpus.load_genia() #TODO optional genia
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann:  # add annotation if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_genia":
        # only the first gold standard is used, extra ones are ignored
        options.goldstd = _single_goldstd(parser, options, enforce_one=False)
        corpus = _load_pickled_corpus(paths[options.goldstd]["corpus"])
        corpus.load_genia()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_biomodel":
        # only the first gold standard is used, extra ones are ignored
        options.goldstd = _single_goldstd(parser, options, enforce_one=False)
        corpus = _load_pickled_corpus(paths[options.goldstd]["corpus"])
        corpus.load_biomodel()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "annotate":  # re-add annotations to corpus
        options.goldstd = _single_goldstd(parser, options)
        corpus_ann = paths[options.goldstd]["annotations"]
        corpus = _load_pickled_corpus(paths[options.goldstd]["corpus"])
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        # corpus.get_invalid_sentences()
        corpus.save(paths[options.goldstd]["corpus"])
    else:
        # merge the documents of every requested gold standard into one corpus
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            this_corpus = _load_pickled_corpus(paths[g]["corpus"])
            corpus.documents.update(this_corpus.documents)

        if options.actions == "write_goldstandard":
            model = BiasModel(options.output[1])
            model.load_data(corpus, [])
            results = model.test()
            results.save(options.output[1] + ".pickle")
        elif options.actions == "merge_corpus":
            corpus.save(paths[options.output[1]]["corpus"])

        # training
        elif options.actions == "train":
            if options.crf == "stanford":
                model = StanfordNERModel(options.models, options.etype)
            elif options.crf == "crfsuite":
                model = CrfSuiteModel(options.models, options.etype)
            else:
                parser.error("--crf %s is not supported by action 'train'" % options.crf)
            model.load_data(corpus, feature_extractors.keys(), options.etype)
            model.train()
        elif options.actions == "train_matcher":
            # Train a simple classifier based on string matching
            model = MatcherModel(options.models)
            model.train(corpus)
            # TODO: term list option
        elif options.actions == "train_multiple":
            # Train one classifier for each type of entity in this corpus
            models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=corpus.subtypes)
            models.train_types()
        elif options.actions == "train_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, modelname=options.tag)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype, modelname=options.tag)
            elif options.kernel == "mil":
                relations = _load_transmir_relations()
                model = MILClassifier(corpus, options.ptype, relations, ner=options.models)
            else:
                parser.error("--kernel %s is not supported by action 'train_relations'" % options.kernel)
            model.train()

        # testing
        elif options.actions == "test":
            base_port = 9191
            if len(options.submodels) > 1:
                allresults = ResultSetNER(corpus, options.output[1])
                for i, submodel in enumerate(options.submodels):
                    model = StanfordNERModel(options.models + "_" + submodel)
                    model.load_tagger(base_port + i)
                    # load data into the model format
                    model.load_data(corpus, feature_extractors.keys(), mode="test")
                    # run the classifier on the data
                    results = model.test(corpus, port=base_port + i)
                    allresults.add_results(results)
                    model.kill_process()
                # save the results to an object that can be read again, and log files to debug
                final_results = allresults.combine_results()
            else:
                if options.crf == "stanford":
                    model = StanfordNERModel(options.models, options.etype)
                elif options.crf == "crfsuite":
                    model = CrfSuiteModel(options.models, options.etype)
                elif options.crf == "banner":
                    model = BANNERModel(options.models, options.etype)
                model.load_tagger()
                model.load_data(corpus, feature_extractors.keys(), mode="test")
                final_results = model.test(corpus)
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_matcher":
            if "mirna" in options.models:
                model = MirnaMatcher(options.models)
            else:
                model = MatcherModel(options.models)
            results = ResultsNER(options.models)
            results.corpus, results.entities = model.test(corpus)
            allentities = set()
            for e in results.entities:
                allentities.add(results.entities[e].text)
            with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
                outfile.write('\n'.join(allentities))
            results.save(options.output[1] + ".pickle")
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(' '.join(options.submodels)))
            allresults = ResultSetNER(corpus, options.output[1])
            if len(options.submodels) < 2:
                models = TaggerCollection(basepath=options.models)
                models.load_models()
                results = models.test_types(corpus)
                final_results = results.combine_results()
            else:
                base_port = 9191
                for submodel in options.submodels:
                    models = TaggerCollection(basepath=options.models + "_" + submodel, baseport=base_port)
                    models.load_models()
                    results = models.test_types(corpus)
                    logging.info("combining results...")
                    submodel_results = results.combine_results()
                    allresults.add_results(submodel_results)
                    # each collection gets its own range of ports
                    base_port += len(models.models)
                final_results = allresults.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, train=False, modelname=options.tag, ner=options.models)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype, modelname=options.tag, ner=options.models)
            elif options.kernel == "rules":
                model = RuleClassifier(corpus, options.ptype, ner=options.models)
            elif options.kernel == "mirtex_rules":
                model = MirtexClassifier(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype, test=True)
            elif options.kernel == "mil":
                relations = _load_transmir_relations()
                model = MILClassifier(corpus, options.ptype, relations, test=True, ner=options.models)
            else:
                parser.error("--kernel %s is not supported by action 'test_relations'" % options.kernel)
            model.load_classifier()
            model.test()
            results = model.get_predictions(corpus)
            results.save(options.output[1] + ".pickle")

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)