def get_predictions(self, corpus):
    # Build a ResultsRE object from the binary predictions in self.pred, adding
    # one relation to the corpus for each pair classified as positive.
    results = ResultsRE(self.modelname)
    for i in range(len(self.pred)):
        did = ".".join(self.pairs[i][0].sid.split(".")[:-1])
        pid = did + ".p" + str(i)
        if self.pred[i]:
            if did not in results.document_pairs:
                results.document_pairs[did] = Pairs()
            pair = corpus.documents[did].add_relation(self.pairs[i][0], self.pairs[i][1],
                                                      self.pairtype, relation=True)
            results.document_pairs[did].add_pair(pair, "scikit")
            results.pairs[pid] = pair
            results.pairs[pid].recognized_by["scikit"] = 1
    results.corpus = corpus
    return results
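# Usage sketch (not part of the original module): how a trained classifier's
# get_predictions is driven, mirroring the "test_relations" flow in main() below.
# ScikitRE is assumed to be the scikit-based classifier this method belongs to,
# and `corpus` an already-loaded SeeDevCorpus with a trained model on disk.
def _example_scikit_predictions(corpus, pairtype):
    model = ScikitRE(corpus, pairtype)
    model.load_classifier()
    model.test()
    return model.get_predictions(corpus)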
def get_predictions(self, corpus, resultfile="jsre_results.txt"):
    # Read one SVM score per test pair from the svm-light-tk output file; pairs
    # with a non-negative score are added to the corpus as relations.
    results = ResultsRE(resultfile)
    with open(self.temp_dir + "svm_test_output.txt", 'r') as out:
        lines = out.readlines()
    for ip, pid in enumerate(self.pids):
        score = float(lines[ip])
        if score < 0:
            logging.info(score)
        else:
            did = ".".join(pid.split(".")[:-1])
            pair = corpus.documents[did].add_relation(self.pids[pid][0], self.pids[pid][1],
                                                      "pair", relation=True)
            results.pairs[pid] = pair
            pair.recognized_by["svmtk"] = 1
            logging.info("{0.eid}:{0.text} => {1.eid}:{1.text}".format(pair.entities[0], pair.entities[1]))
    results.corpus = corpus
    return results
def get_predictions(self, corpus, resultfile="jsre_results.txt"):
    results = ResultsRE(resultfile)
    with open(self.temp_dir + "svm_test_output.txt", 'r') as out:
        lines = out.readlines()
    for ip, pid in enumerate(self.pids):
        score = float(lines[ip])
        if score < 0:
            logging.debug(score)
        else:
            did = ".".join(pid.split(".")[:-1])
            pair = corpus.documents[did].add_relation(self.pids[pid][0], self.pids[pid][1],
                                                      self.pair_type, relation=True)
            results.pairs[pid] = pair
            pair.recognized_by["svmtk"] = 1
            logging.info("{0.eid}:{0.text} => {1.eid}:{1.text}".format(pair.entities[0], pair.entities[1]))
    results.corpus = corpus
    return results
def get_predictions(self, corpus):
    # Walk the per-token CRF label sequences; every token labelled "B-TARGET"
    # pairs the source entity with each candidate target entity at that position.
    results = ResultsRE(self.modelname)
    for i in range(len(self.entities)):
        did = self.entities[i][0].did
        if did not in results.document_pairs:
            results.document_pairs[did] = Pairs()
        for it, label in enumerate(self.predicted[i]):
            if label.endswith("B-TARGET"):
                for target in self.entities[i][1][it]:
                    # use a running counter so each pair gets a unique id
                    pid = did + ".p" + str(len(results.pairs))
                    pair = corpus.documents[did].add_relation(self.entities[i][0], target,
                                                              self.pairtype, relation=True)
                    results.document_pairs[did].add_pair(pair, "crf")
                    results.pairs[pid] = pair
                    results.pairs[pid].recognized_by["crf"] = 1
    results.corpus = corpus
    return results
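# Toy illustration (hypothetical data, standalone sketch) of the B-TARGET pairing
# rule used above: there is one CRF label per token, and a "B-TARGET" label pairs
# the source entity with every candidate target stored at that token position.
def _example_crf_pairing():
    labels = ["O", "O", "B-TARGET", "O"]
    targets_per_token = [[], [], ["entity_7", "entity_9"], []]
    source = "entity_1"
    pairs = [(source, t) for it, label in enumerate(labels)
             if label.endswith("B-TARGET") for t in targets_per_token[it]]
    print pairs  # [('entity_1', 'entity_7'), ('entity_1', 'entity_9')]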
def get_predictions(self, corpus):
    # Align the jSRE predictions file with the examples file line by line and
    # add a relation for every example predicted as positive.
    with open(self.resultsfile, 'r') as resfile:
        pred = resfile.readlines()
    with codecs.open(self.examplesfile, 'r', 'utf-8') as trainfile:
        original = trainfile.readlines()
    if len(pred) != len(original):
        print "different number of predictions!"
        sys.exit()
    results = ResultsRE(self.resultsfile)
    for i in range(len(pred)):
        original_tsv = original[i].split('\t')
        pid = '.'.join(original_tsv[1].split('.')[:-1])
        p = float(pred[i].strip())
        if p == 0:
            p = -1
        if p == 2:  # second positive class; treat it as positive
            p = 1
        if p == 1:
            did = '.'.join(pid.split(".")[:-1])
            if did not in results.document_pairs:
                results.document_pairs[did] = Pairs()
            pair = corpus.documents[did].add_relation(self.pairs[pid][0], self.pairs[pid][1],
                                                      self.pairtype, relation=True)
            results.pairs[pid] = pair
            results.document_pairs[did].add_pair(pair, "jsre")
            results.pairs[pid].recognized_by["jsre"] = p
    results.corpus = corpus
    return results
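# File alignment assumed by get_predictions above (identifiers are hypothetical):
# the jSRE results file holds one numeric label per line (0 = negative, 1 or 2 =
# positive) and the examples file is tab-separated with the example id in the
# second column; the pair id drops the id's last dot-separated component, and the
# document id drops one more:
#
#   predictions line:  "1"
#   examples line:     "0\tDOC1.s0.p5.x\t..."
#   -> pid = "DOC1.s0.p5", did = "DOC1.s0"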
def get_predictions(self, corpus): results = ResultsRE("") # print len(self.pids) for p, pid in enumerate(self.pids): did = self.pids[pid][0].did if did not in results.document_pairs: results.document_pairs[did] = Pairs() pair = corpus.documents[did].add_relation(self.pids[pid][0], self.pids[pid][1], self.ptype, relation=True) # print pair, pair[0], pair[1] # pair = self.get_pair(pid, corpus) results.document_pairs[did].add_pair(pair, "mirtex_rules") results.pairs[pid] = pair pair.recognized_by["mirtex_rules"] = 1 logging.info("{0.eid}:{0.text} => {1.eid}:{1.text}".format(pair.entities[0], pair.entities[1])) # logging.info("{} - {} SST: {}".format(pair.entities[0], pair.entities[0], score)) results.corpus = corpus return results
def get_predictions(self, corpus):
    results = ResultsRE(self.resultsfile)
    for i, pred in enumerate(self.predicted):
        if pred >= 0:
            # squash the raw bag prediction into a confidence score with a
            # sigmoid; non-negative predictions map to scores in [0.5, 1)
            score = 1.0 / (1.0 + math.exp(-pred))
            bag = self.bag_pairs[i]
            for pair in self.pairs[bag]:
                did = pair[0].did
                if did not in results.document_pairs:
                    results.document_pairs[did] = Pairs()
                new_pair = corpus.documents[did].add_relation(pair[0], pair[1],
                                                              self.pairtype, relation=True)
                results.document_pairs[did].add_pair(new_pair, "mil")
                pid = did + ".p" + str(len(results.pairs))
                results.pairs[pid] = new_pair
                results.pairs[pid].recognized_by["mil"] = score
    results.corpus = corpus
    return results
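# Quick check of the logistic mapping used above (standalone sketch): only bags
# with a non-negative raw prediction are kept, so all scores land in [0.5, 1).
def _example_mil_scores():
    import math
    for pred in (0.0, 1.0, 2.0):
        print pred, 1.0 / (1.0 + math.exp(-pred))  # 0.5, ~0.731, ~0.881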
def get_predictions(self, corpus): results = ResultsRE("") # print len(self.pids) for p, pid in enumerate(self.pids): did = self.pids[pid][0].did if did not in results.document_pairs: results.document_pairs[did] = Pairs() pair = corpus.documents[did].add_relation(self.pids[pid][0], self.pids[pid][1], self.ptype, relation=True) # print pair, pair[0], pair[1] #pair = self.get_pair(pid, corpus) results.document_pairs[did].add_pair(pair, "mirtex_rules") results.pairs[pid] = pair pair.recognized_by["mirtex_rules"] = 1 logging.info("{0.eid}:{0.text} => {1.eid}:{1.text}".format( pair.entities[0], pair.entities[1])) #logging.info("{} - {} SST: {}".format(pair.entities[0], pair.entities[0], score)) results.corpus = corpus return results
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify", help="Actions to be performed.")
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=config.paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers")
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("-o", "--output", "--format", dest="output", nargs=2,
                        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set up the logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a
    # gold standard, or on the corpus and annotation options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus at a time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = config.paths[options.goldstd]["format"]
        corpus_path = config.paths[options.goldstd]["text"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = SeeDevCorpus(corpus_path)
        corpus.load_corpus(corenlp_client)
        corpus.save(config.paths[options.goldstd]["corpus"])
        if corpus_ann:  # add annotations if this is not a test set
            corpus.load_annotations(corpus_ann, "all")
            corpus.save(config.paths[options.goldstd]["corpus"])
    elif options.actions == "annotate":  # re-add annotations to the corpus
        if len(options.goldstd) > 1:
            print "load only one corpus at a time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = config.paths[options.goldstd]["corpus"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        corpus.load_annotations(corpus_ann, "all", options.ptype)
        corpus.save(config.paths[options.goldstd]["corpus"])
    else:
        corpus_path = config.paths[options.goldstd[0]]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        if options.actions == "add_sentences":
            corpus.add_more_sentences(options.models)
        elif options.actions == "train_relations":
            if options.ptype == "all":
                ptypes = config.pair_types.keys()
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=True)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "multir":
                    model = MultiR(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p)
                model.train()
        elif options.actions == "test_relations":
            if options.ptype == "all":
                ptypes = config.pair_types.keys()
                all_results = ResultsRE(options.output[1])
                all_results.corpus = corpus
                all_results.path = options.output[1]
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=False)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "rules":
                    model = RuleClassifier(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p, test=True)
                model.load_classifier()
                model.test()
                results = model.get_predictions(corpus)
                results.path = options.output[1] + "_" + p.lower()
                goldset = get_gold_ann_set(config.paths[options.goldstd[0]]["format"],
                                           config.paths[options.goldstd[0]]["annotations"],
                                           "all", p, config.paths[options.goldstd[0]]["text"])
                get_relations_results(results, options.models, goldset[1], [], [])
                if options.ptype == "all":
                    for did in results.document_pairs:
                        if did not in all_results.document_pairs:
                            all_results.document_pairs[did] = Pairs(did=did)
                        all_results.document_pairs[did].pairs += results.document_pairs[did].pairs
            if options.ptype == "all":
                goldset = get_gold_ann_set(config.paths[options.goldstd[0]]["format"],
                                           config.paths[options.goldstd[0]]["annotations"],
                                           "all", "all", config.paths[options.goldstd[0]]["text"])
                get_relations_results(all_results, options.models, goldset[1], [], [])
                write_seedev_results(all_results, options.output[1])
        elif options.actions == "train_sentences":  # train and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in config.pair_types:
                    print p
                    tps, fps, fns = corpus.train_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (tps + fps)
                        recall = 1.0 * tps / (tps + fns)
                        fscore = 2.0 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                # micro-averaged scores over all pair types
                precision = 1.0 * avg[0] / (avg[0] + avg[1])
                recall = 1.0 * avg[0] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (precision + recall)
                print precision, recall, fscore
            else:
                res = corpus.train_sentence_classifier(options.ptype)
                print res
            corpus.save(config.paths[options.goldstd[0]]["corpus"])
        elif options.actions == "test_sentences":  # test and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in config.pair_types:
                    print p
                    tps, fps, fns = corpus.test_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (tps + fps)
                        recall = 1.0 * tps / (tps + fns)
                        fscore = 2.0 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                # micro-averaged scores over all pair types
                precision = 1.0 * avg[0] / (avg[0] + avg[1])
                recall = 1.0 * avg[0] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (precision + recall)
                print precision, recall, fscore
            else:
                res = corpus.test_sentence_classifier(options.ptype)
                print res
            corpus.save(config.paths[options.goldstd[0]]["corpus"])
    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
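# The precision/recall/f-score arithmetic in main() is repeated for every action;
# a small helper (a sketch, not part of the original module) keeps the standard
# formulas in one place and guards the empty denominators:
def _prf(tps, fps, fns):
    precision = 1.0 * tps / (tps + fps) if tps + fps > 0 else 0.0
    recall = 1.0 * tps / (tps + fns) if tps + fns > 0 else 0.0
    if precision + recall == 0:
        return precision, recall, 0.0
    return precision, recall, 2.0 * precision * recall / (precision + recall)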
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify", help="Actions to be performed.")
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers")
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("-o", "--output", "--format", dest="output", nargs=2,
                        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set up the logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus has a
    # gold standard, or on the corpus and annotation options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus at a time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = paths[options.goldstd]["format"]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]
        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = SeeDevCorpus(corpus_path)
        corpus.load_corpus(corenlp_client)
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann:  # add annotations if this is not a test set
            corpus.load_annotations(corpus_ann, "all")
            corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "annotate":  # re-add annotations to the corpus
        if len(options.goldstd) > 1:
            print "load only one corpus at a time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        corpus.load_annotations(corpus_ann, "all", options.ptype)
        corpus.save(paths[options.goldstd]["corpus"])
    else:
        corpus_path = paths[options.goldstd[0]]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        # wrap the pickled base corpus in a SeeDevCorpus
        basecorpus = pickle.load(open(corpus_path, 'rb'))
        corpus = SeeDevCorpus(corpus_path)
        corpus.documents = basecorpus.documents
        if options.actions == "add_sentences":
            corpus.add_more_sentences(options.models)
        elif options.actions == "add_goldstandard":
            corpus.convert_entities_to_goldstandard()
            corpus.find_ds_relations()
        elif options.actions == "train_multiple":
            # train one classifier for each type of entity in this corpus
            models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=all_entity_types)
            models.train_types()
        elif options.actions == "train_relations":
            if options.ptype == "all":
                ptypes = pair_types.keys()
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=True)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "multir":
                    model = MultiR(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p)
                model.train()
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(' '.join(options.submodels)))
            models = TaggerCollection(basepath=options.models, subtypes=all_entity_types)
            models.load_models()
            results = models.test_types(corpus)
            final_results = results.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.ptype == "all":
                ptypes = pair_types.keys()
                all_results = ResultsRE(options.output[1])
                all_results.corpus = corpus
                all_results.path = options.output[1]
            else:
                ptypes = [options.ptype]
            for p in ptypes:
                print p
                if options.kernel == "jsre":
                    model = JSREKernel(corpus, p, train=False)
                elif options.kernel == "svmtk":
                    model = SVMTKernel(corpus, p)
                elif options.kernel == "rules":
                    model = RuleClassifier(corpus, p)
                elif options.kernel == "stanfordre":
                    model = StanfordRE(corpus, p)
                elif options.kernel == "scikit":
                    model = ScikitRE(corpus, p)
                elif options.kernel == "crf":
                    model = CrfSuiteRE(corpus, p, test=True)
                model.load_classifier()
                model.test()
                results = model.get_predictions(corpus)
                results.path = options.output[1] + "_" + p.lower()
                goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"],
                                           paths[options.goldstd[0]]["annotations"],
                                           "all", p, paths[options.goldstd[0]]["text"])
                get_relations_results(results, options.models, goldset[1], [], [])
                if options.ptype == "all":
                    for did in results.document_pairs:
                        if did not in all_results.document_pairs:
                            all_results.document_pairs[did] = Pairs(did=did)
                        all_results.document_pairs[did].pairs += results.document_pairs[did].pairs
            if options.ptype == "all":
                goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"],
                                           paths[options.goldstd[0]]["annotations"],
                                           "all", "all", paths[options.goldstd[0]]["text"])
                get_relations_results(all_results, options.models, goldset[1], [], [])
                write_seedev_results(all_results, options.output[1])
        elif options.actions == "train_sentences":  # train and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in pair_types:
                    print p
                    tps, fps, fns = corpus.train_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (tps + fps)
                        recall = 1.0 * tps / (tps + fns)
                        fscore = 2.0 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                # micro-averaged scores over all pair types
                precision = 1.0 * avg[0] / (avg[0] + avg[1])
                recall = 1.0 * avg[0] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (precision + recall)
                print precision, recall, fscore
            else:
                res = corpus.train_sentence_classifier(options.ptype)
                print res
            corpus.save(paths[options.goldstd[0]]["corpus"])
        elif options.actions == "test_sentences":  # test and evaluate
            if options.ptype == "all":
                avg = [0, 0, 0]
                for p in pair_types:
                    print p
                    tps, fps, fns = corpus.test_sentence_classifier(p)
                    if tps == 0 and fns == 0:
                        precision, recall, fscore = 0, 1, 1
                    else:
                        precision = 1.0 * tps / (tps + fps)
                        recall = 1.0 * tps / (tps + fns)
                        fscore = 2.0 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
                    print precision, recall, fscore
                    avg[0] += tps
                    avg[1] += fps
                    avg[2] += fns
                # micro-averaged scores over all pair types
                precision = 1.0 * avg[0] / (avg[0] + avg[1])
                recall = 1.0 * avg[0] / (avg[0] + avg[2])
                fscore = 2.0 * precision * recall / (precision + recall)
                print precision, recall, fscore
            # else:
            #     res = corpus.test_sentence_classifier(options.ptype)
            #     print res
        elif options.actions == "evaluate_ner":
            if os.path.exists(options.output[1] + ".pickle"):
                results = pickle.load(open(options.output[1] + ".pickle", 'rb'))
                results.load_corpus(options.goldstd[0])
                results.path = options.output[1]
                logging.info("loading gold standard %s" % paths[options.goldstd[0]]["annotations"])
                for t in all_entity_types:
                    print t
                    results.path = options.output[1] + "_" + t
                    goldset = get_gold_ann_set(paths[options.goldstd[0]]["format"],
                                               paths[options.goldstd[0]]["annotations"],
                                               t, options.ptype, paths[options.goldstd[0]]["text"])
                    get_results(results, options.models + "_" + t, goldset[0], {}, {})
            corpus.save(paths[options.goldstd[0]]["corpus"])
    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
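# Example invocations (the script name, gold-standard keys and paths are
# hypothetical; the flags match the argparse definitions above):
#
#   python main.py load_corpus --goldstd seedev_train --log INFO
#   python main.py train_relations --goldstd seedev_train --models models/seedev --kernel jsre --pairtype all
#   python main.py test_relations --goldstd seedev_dev --models models/seedev --kernel jsre -o tsv results/seedev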