# NOTE: these snippets come from a Python 2 codebase. They depend on
# project-local modules (Corpus, Document, the NER/RE model classes,
# feature_extractors and the paths/config.paths configuration) whose import
# paths are not shown here; the imports below cover the standard library and
# the pycorenlp client that the snippets themselves use. train_ensemble also
# needs joblib (sklearn.externals.joblib in older scikit-learn releases).
import argparse
import codecs
import io
import itertools
import logging
import os
import pickle
import random
import sys
import time

from pycorenlp import StanfordCoreNLP


def process_documents():
    corpus = Corpus("corpora/Thaliana/pubmed")
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    lcount = 0
    starts = set()
    with codecs.open("corpora/Thaliana/documents.txt", 'r', 'utf-8') as docfile:
        for l in docfile:
            print lcount
            # skip documents whose first 20 characters were already seen
            if l[:20] in starts:
                continue
            lcount += 1
            starts.add(l[:20])
            newdoc = Document(l.strip())
            newdoc.process_document(corenlp_client)
            for sentence in newdoc.sentences:
                print [t.text for t in sentence.tokens]
            corpus.documents["d" + str(lcount)] = newdoc
            # checkpoint the corpus every 1000 documents
            if lcount % 1000 == 0:
                corpus.save("corpora/Thaliana/thaliana-documents_{}.pickle".format(str(lcount / 1000)))
def process_documents(corpus_path):
    corpus = Corpus(corpus_path)
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    lcount = 0
    starts = set()
    with codecs.open(corpus_path, 'r', 'utf-8') as docfile:
        for l in docfile:
            print lcount
            # skip abstracts whose first 10 characters were already seen
            if l[:10] in starts:
                print "repeated abstract:", l[:10]
                continue
            lcount += 1
            starts.add(l[:10])
            # each line is "<pmid>\t<abstract text>"
            values = l.strip().split("\t")
            pmid = values[0]
            abs_text = " ".join(values[1:])
            newdoc = Document(abs_text, did="PMID" + pmid)
            newdoc.process_document(corenlp_client)
            corpus.documents["PMID" + pmid] = newdoc
            # checkpoint every 1000 abstracts and start a fresh corpus batch
            if lcount % 1000 == 0:
                corpus.save("{}_{}.pickle".format(corpus_path, str(lcount / 1000)))
                corpus = Corpus(corpus_path)
    # save whatever is left in the last batch
    corpus.save("{}_{}.pickle".format(corpus_path, str(lcount / 1000)))
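# Hedged usage sketch for the loader above: the abstracts path is hypothetical
# and a Stanford CoreNLP server must already be listening on localhost:9000.
if __name__ == "__main__":
    process_documents("corpora/Thaliana/abstracts.txt")
    # writes corpora/Thaliana/abstracts.txt_1.pickle, _2.pickle, ...
    # one pickle per batch of 1000 abstracts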
def generate_corpus(self, text):
    """
    Create a corpus object from the input text.
    :param text: raw input text to be tokenized and processed
    :return: Corpus containing a single document with ID "d0"
    """
    test_corpus = Corpus("")
    newdoc = Document(text, process=False, did="d0", title="Test document")
    newdoc.sentence_tokenize("biomedical")
    newdoc.process_document(self.corenlp, "biomedical")
    test_corpus.documents["d0"] = newdoc
    return test_corpus
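# Hedged usage sketch for generate_corpus: the surrounding class is not shown
# in these snippets, so Annotator is a hypothetical owner that exposes a
# pycorenlp client as self.corenlp.
annotator = Annotator()  # hypothetical class holding generate_corpus
annotator.corenlp = StanfordCoreNLP('http://localhost:9000')
test_corpus = annotator.generate_corpus("Indinavir inhibits the HIV protease.")
for sentence in test_corpus.documents["d0"].sentences:
    print [t.text for t in sentence.tokens]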
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers")
    parser.add_argument("--corpus", dest="corpus", nargs=2,
                        default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"],
                        help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the text.")
    parser.add_argument("--cv", dest="cv", default=5, help="Number of folds.", type=int)
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("-o", "--output", "--format", dest="output", nargs=2,
                        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford",
                        choices=["stanford", "crfsuite"])
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk",
                        help="Kernel for relation extraction")
    parser.add_argument("--pairtype1", action="store", dest="pairtype1")
    parser.add_argument("--pairtype2", action="store", dest="pairtype2")
    options = parser.parse_args()

    # set up the logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.info("Crossvalidation on {0}".format(options.goldstd))

    # merge the documents of every selected gold standard into one corpus
    corpus_name = "&".join(options.goldstd)
    corpus = Corpus("corpus/" + corpus_name)
    for g in options.goldstd:
        corpus_path = paths[g]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        this_corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.documents.update(this_corpus.documents)

    run_crossvalidation(options.goldstd, corpus, options.models, options.cv,
                        options.crf, options.etype)

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
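# Hypothetical invocation of this cross-validation entry point (shell); the
# script name, goldstd key and model path are assumptions:
#   python crossvalidation.py --goldstd chemdner_sample --models models/cv_chemdner \
#       --cv 5 --crf stanford --entitytype chemical --log INFO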
class ResultsNER(object):
    """Store a set of entities related to a corpus or input text."""

    def __init__(self, name):
        self.entities = {}
        self.name = name
        self.corpus = Corpus(self.name)
        self.basedir = "models/ensemble/"

    def get_ensemble_results(self, ensemble, corpus, model):
        """
        Go through every entity in the corpus: if the ensemble predicted it as
        true, save it to self.entities, otherwise drop it.
        """
        for did in corpus.documents:
            for sentence in corpus.documents[did].sentences:
                new_entities = []
                for entity in sentence.entities.elist[model]:
                    # "T" for title sentences, "A" for abstract sentences
                    sentence_type = "A"
                    if sentence.sid.endswith("s0"):
                        sentence_type = "T"
                    entity_id = (did, "{0}:{1}:{2}".format(sentence_type, entity.dstart, entity.dend), "1")
                    if entity_id not in ensemble.ids:
                        logging.debug("this is new! {0}".format(entity))
                        continue
                    predicted_index = ensemble.ids.index(entity_id)
                    if ensemble.predicted[predicted_index][1] > 0.5:
                        self.entities[entity.eid] = entity
                        new_entities.append(entity)
                sentence.entities.elist[self.name] = new_entities
        self.corpus = corpus

    def save(self, path):
        # no need to save the whole corpus: only the entities of each sentence
        # are necessary, since the full corpus is already saved in a different
        # pickle
        logging.info("Saving results to {}".format(path))
        reduced_corpus = {}
        for did in self.corpus.documents:
            reduced_corpus[did] = {}
            for sentence in self.corpus.documents[did].sentences:
                reduced_corpus[did][sentence.sid] = sentence.entities
        self.corpus = reduced_corpus
        pickle.dump(self, open(path, "wb"))

    def save_chemdner(self):
        pass

    def load_corpus(self, goldstd):
        logging.info("loading corpus %s" % paths[goldstd]["corpus"])
        corpus = pickle.load(open(paths[goldstd]["corpus"]))
        # re-attach the saved entities to the freshly loaded corpus
        for did in corpus.documents:
            if did not in self.corpus:
                logging.info("no results for {}".format(did))
                continue
            for sentence in corpus.documents[did].sentences:
                sentence.entities = self.corpus[did][sentence.sid]
        self.corpus = corpus

    def combine_results(self, basemodel, name):
        # add another set of annotations to each sentence, ending in "combined";
        # each entity from this dataset should have a unique ID and a
        # recognized_by attribute
        scores = 0
        total = 0
        for did in self.corpus.documents:
            for sentence in self.corpus.documents[did].sentences:
                sentence.entities.combine_entities(basemodel, name)
        if total > 0:
            logging.info("{0} entities average confidence of {1}".format(total, scores / total))

    def add_results(self, results):
        # merge the results of this set with another set
        all_models = set()
        dids = set(self.corpus.documents.keys()).union(set(results.corpus.documents.keys()))
        for did in dids:
            # one result set may contain more or fewer documents than this one;
            # in that case, simply add the document to the other result set
            if did not in self.corpus.documents:
                self.corpus.documents[did] = results.corpus.documents[did]
            elif did not in results.corpus.documents:
                results.corpus.documents[did] = self.corpus.documents[did]
            else:
                # merge entities
                for sentence in results.corpus.documents[did].sentences:
                    base_sentence = self.corpus.documents[did].get_sentence(sentence.sid)
                    # add every new model in the new result set to this one
                    for model in sentence.entities.elist:
                        if model != "goldstandard" and model not in base_sentence.entities.elist:
                            base_sentence.entities.elist[model] = sentence.entities.elist[model]
                    all_models = all_models.union(set(base_sentence.entities.elist.keys()))

    def train_ensemble(self, pipeline, modelname, etype):
        train_data, labels, offsets = self.generate_data(etype)
        print "training ensemble classifier..."
        pipeline = pipeline.fit(train_data, labels)
        if not os.path.exists(self.basedir + modelname):
            os.makedirs(self.basedir + modelname)
        logging.info("Training complete, saving to {}/{}/{}.pkl".format(self.basedir, modelname, modelname))
        joblib.dump(pipeline, "{}/{}/{}.pkl".format(self.basedir, modelname, modelname))

    def test_ensemble(self, pipeline, modelname, etype):
        train_data, labels, offsets = self.generate_data(etype, mode="test")
        pred = pipeline.predict(train_data)
        print pred
        for i, p in enumerate(pred):
            if p == True:
                sentence = self.corpus.get_sentence(offsets.keys()[i][0])
                sentence.tag_entity(offsets.keys()[i][1], offsets.keys()[i][2], etype, source=modelname)

    def generate_data(self, etype, mode="train"):
        """
        Build the data for a scikit-learn pipeline that classifies entities as
        correct or incorrect; the features are the scores of the classifiers
        that identified each entity.
        :param etype: entity type to consider
        :param mode: "train" collects gold-standard labels, "test" does not
        :return: feature vectors, labels and the corresponding offsets
        """
        offsets = {}
        features = set()
        gs_labels = {}
        # collect offsets from every model (except the gold standard) and add
        # the classifier score
        for did in self.corpus.documents:
            for sentence in self.corpus.documents[did].sentences:
                for s in sentence.entities.elist:
                    # use everything except what is already combined and the
                    # gold standard
                    if not s.startswith("goldstandard") and s.endswith(etype):
                        features.add(s)
                        for e in sentence.entities.elist[s]:
                            # check for perfect overlaps only
                            offset = (sentence.sid, e.start, e.end)
                            if offset not in offsets:
                                offsets[offset] = {}
                            offsets[offset][s] = e.score
                    elif mode == "train" and s == "goldstandard_" + etype:
                        for e in sentence.entities.elist[s]:
                            offset = (sentence.sid, e.start, e.end)
                            gs_labels[offset] = True
        train_data = []
        train_labels = []
        features = sorted(list(features))
        for o in offsets:
            of = []
            for f in features:
                if f in offsets[o]:
                    of.append(offsets[o][f])
                else:
                    of.append(0)
            train_data.append(of)
            if mode == "train" and gs_labels.get(o) == True:
                train_labels.append(True)
            else:
                train_labels.append(False)
        return train_data, train_labels, offsets

    def convert_to(self, output_format, output_path, eset):
        if output_format == "brat":
            self.convert_to_brat(output_path, eset)

    def convert_to_brat(self, output_path, eset):
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        for did in self.corpus.documents:
            with io.open("{}/{}.ann".format(output_path, did), "w", encoding='utf-8') as output_file:
                ecount = 0
                for sentence in self.corpus.documents[did].sentences:
                    if eset in sentence.entities.elist:
                        print "writing...", eset
                        for entity in sentence.entities.elist[eset]:
                            output_file.write(u"T{0}\t{1.type} {1.dstart} {1.dend}\t{1.text}\n".format(ecount, entity))
                            ecount += 1

    def import_chemdner(self, filepath):
        with io.open(filepath, encoding="utf-8") as inputfile:
            next(inputfile)  # skip the header line
            for l in inputfile:
                values = l.split("\t")
                did = values[0]
                sectionid = values[1]
                start, end, text = int(values[2]), int(values[3]), values[5]
                confidence = values[4]
                if did in self.corpus.documents:
                    entity = self.corpus.documents[did].tag_chemdner_entity(
                        start, end, "unknown", source=self.model, text=text,
                        confidence=confidence, doct=sectionid, score=1)
                    if entity:
                        self.entities[entity.eid] = entity
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify",
                        help="Actions to be performed.",
                        choices=["load_corpus", "annotate", "classify", "write_results",
                                 "write_goldstandard", "train", "test", "train_multiple",
                                 "test_multiple", "train_matcher", "test_matcher",
                                 "crossvalidation", "train_relations", "test_relations"])
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=config.paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers")
    parser.add_argument("-i", "--input", dest="input", action="store",
                        default='''Administration of a higher dose of indinavir should be \\
considered when coadministering with megestrol acetate.''',
                        help="Text to classify.")
    parser.add_argument("--corpus", dest="corpus", nargs=2,
                        default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"],
                        help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the text.")
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("--annotated", action="store_true", default=False, dest="annotated",
                        help="True if the input has <entity> tags.")
    parser.add_argument("-o", "--output", "--format", dest="output", nargs=2,
                        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford",
                        choices=["stanford", "crfsuite"])
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set up the logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format, filename="debug.log")
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus
    # has a gold standard, or on the corpus and annotation options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = config.paths[options.goldstd]["format"]
        corpus_path = config.paths[options.goldstd]["text"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        corpus.save(config.paths[options.goldstd]["corpus"])
        if corpus_ann:  # add annotations if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(config.paths[options.goldstd]["corpus"])
    elif options.actions == "annotate":  # re-add annotations to the corpus
        if len(options.goldstd) > 1:
            print("load only one corpus each time")
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = config.paths[options.goldstd]["corpus"]
        corpus_ann = config.paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        corpus.save(config.paths[options.goldstd]["corpus"])
    else:
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            corpus_path = config.paths[g]["corpus"]
            logging.info("loading corpus %s" % corpus_path)
            this_corpus = pickle.load(open(corpus_path, 'rb'))
            corpus.documents.update(this_corpus.documents)
        if options.actions == "write_goldstandard":
            model = BiasModel(options.output[1])
            model.load_data(corpus, [])
            results = model.test()
            results.save(options.output[1] + ".pickle")
        # training
        elif options.actions == "train":
            if options.crf == "stanford":
                model = StanfordNERModel(options.models, options.etype)
            elif options.crf == "crfsuite":
                model = CrfSuiteModel(options.models, options.etype)
            model.load_data(corpus, feature_extractors.keys(), options.etype)
            model.train()
        elif options.actions == "train_matcher":
            # train a simple classifier based on string matching
            model = MatcherModel(options.models)
            model.train(corpus)
        elif options.actions == "train_multiple":
            # train one classifier for each type of entity in this corpus
            models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=corpus.subtypes)
            models.train_types()
        elif options.actions == "train_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "multir":
                model = MultiR(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype)
            model.train()
        # testing
        elif options.actions == "test":
            base_port = 9191
            if len(options.submodels) > 1:
                allresults = ResultSetNER(corpus, options.output[1])
                for i, submodel in enumerate(options.submodels):
                    model = StanfordNERModel(options.models + "_" + submodel)
                    model.load_tagger(base_port + i)
                    # load data into the model format
                    model.load_data(corpus, feature_extractors.keys(), mode="test")
                    # run the classifier on the data
                    results = model.test(corpus, port=base_port + i)
                    allresults.add_results(results)
                    model.kill_process()
                # save the results to an object that can be read again,
                # plus log files to debug
                final_results = allresults.combine_results()
            else:
                if options.crf == "stanford":
                    model = StanfordNERModel(options.models, options.etype)
                elif options.crf == "crfsuite":
                    model = CrfSuiteModel(options.models, options.etype)
                model.load_tagger()
                model.load_data(corpus, feature_extractors.keys(), mode="test")
                final_results = model.test(corpus)
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_matcher":
            if "mirna" in options.models:
                model = MirnaMatcher(options.models)
            else:
                model = MatcherModel(options.models)
            results = ResultsNER(options.models)
            results.corpus, results.entities = model.test(corpus)
            allentities = set()
            for e in results.entities:
                allentities.add(results.entities[e].text)
            with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
                outfile.write('\n'.join(allentities))
            results.save(options.output[1] + ".pickle")
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(' '.join(options.submodels)))
            allresults = ResultSetNER(corpus, options.output[1])
            if len(options.submodels) < 2:
                models = TaggerCollection(basepath=options.models)
                models.load_models()
                results = models.test_types(corpus)
                final_results = results.combine_results()
            else:
                base_port = 9191
                for submodel in options.submodels:
                    models = TaggerCollection(basepath=options.models + "_" + submodel, baseport=base_port)
                    models.load_models()
                    results = models.test_types(corpus)
                    logging.info("combining results...")
                    submodel_results = results.combine_results()
                    allresults.add_results(submodel_results)
                    base_port += len(models.models)
                final_results = allresults.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, train=False)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype)
            elif options.kernel == "rules":
                model = RuleClassifier(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype, test=True)
            model.load_classifier()
            model.test()
            results = model.get_predictions(corpus)
            results.save(options.output[1] + ".pickle")

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)
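# Hypothetical invocations of this entry point (shell); the script name,
# goldstd keys and model paths are assumptions:
#   python main.py load_corpus --goldstd chemdner_sample --log DEBUG
#   python main.py train --goldstd chemdner_sample --models models/chemdner --entitytype chemical
#   python main.py test --goldstd chemdner_sample --models models/chemdner -o pickle results/chemdner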
def run_crossvalidation(goldstd_list, corpus, model, cv, crf="stanford", entity_type="all", cvlog="cv.log"):
    logfile = open(cvlog, 'w')
    doclist = corpus.documents.keys()
    random.shuffle(doclist)
    size = int(len(doclist) / cv)
    sublists = chunks(doclist, size)
    logging.debug("Chunks:")
    logging.debug(sublists)
    p, r = [], []
    all_results = ResultsNER(model)
    all_results.path = model + "_results"
    for nlist in range(cv):
        # fold nlist is the test set; the remaining folds form the train set
        testids = sublists[nlist]
        trainids = list(itertools.chain.from_iterable(sublists[:nlist]))
        trainids += list(itertools.chain.from_iterable(sublists[nlist + 1:]))
        print 'CV{} - test set: {}; train set: {}'.format(nlist, len(testids), len(trainids))
        train_corpus = Corpus(corpus.path + "_train",
                              documents={did: corpus.documents[did] for did in trainids})
        test_corpus = Corpus(corpus.path + "_test",
                             documents={did: corpus.documents[did] for did in testids})
        basemodel = model + "_cv{}".format(nlist)
        logging.debug('CV{} - test set: {}; train set: {}'.format(
            nlist, len(test_corpus.documents), len(train_corpus.documents)))
        # train
        logging.info('CV{} - TRAIN'.format(nlist))
        if crf == "stanford":
            train_model = StanfordNERModel(basemodel, entity_type)
        elif crf == "crfsuite":
            train_model = CrfSuiteModel(basemodel, entity_type)
        train_model.load_data(train_corpus, feature_extractors.keys())
        train_model.train()
        # test
        logging.info('CV{} - TEST'.format(nlist))
        if crf == "stanford":
            test_model = StanfordNERModel(basemodel, entity_type)
        elif crf == "crfsuite":
            test_model = CrfSuiteModel(basemodel, entity_type)
        test_model.load_tagger(port=9191 + nlist)
        test_model.load_data(test_corpus, feature_extractors.keys(), mode="test")
        final_results = test_model.test(test_corpus, port=9191 + nlist)
        if crf == "stanford":
            test_model.kill_process()
        final_results.basepath = basemodel + "_results"
        final_results.path = basemodel
        all_results.entities.update(final_results.entities)
        all_results.corpus.documents.update(final_results.corpus.documents)
        # evaluate this fold against the gold annotations of its test documents
        logging.info('CV{} - EVALUATE'.format(nlist))
        test_goldset = set()
        for gs in goldstd_list:
            goldset = get_gold_ann_set(config.paths[gs]["format"], config.paths[gs]["annotations"],
                                       entity_type, "pairtype", config.paths[gs]["text"])
            for g in goldset[0]:
                if g[0] in testids:
                    test_goldset.add(g)
        precision, recall = get_results(final_results, basemodel, test_goldset, {}, [])
        p.append(precision)
        r.append(recall)

    pavg = sum(p) / cv
    ravg = sum(r) / cv
    print "precision: average={} all={}".format(str(pavg), '|'.join([str(pp) for pp in p]))
    print "recall: average={} all={}".format(str(ravg), '|'.join([str(rr) for rr in r]))
    # evaluate the merged results of all folds against the full gold standard
    all_goldset = set()
    for gs in goldstd_list:
        goldset = get_gold_ann_set(config.paths[gs]["format"], config.paths[gs]["annotations"],
                                   entity_type, "pairtype", config.paths[gs]["text"])
        for g in goldset[0]:
            all_goldset.add(g)
    get_results(all_results, model, all_goldset, {}, [])
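# run_crossvalidation depends on a chunks helper that is not included in these
# snippets; a minimal sketch of the assumed behavior (split a list into
# consecutive sublists of n items):
def chunks(l, n):
    return [l[i:i + n] for i in xrange(0, len(l), n)]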
def __init__(self, name):
    self.entities = {}
    self.name = name
    self.corpus = Corpus(self.name)
def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("actions", default="classify",
                        help="Actions to be performed.",
                        choices=["load_corpus", "annotate", "classify", "write_results",
                                 "write_goldstandard", "train", "test", "train_multiple",
                                 "test_multiple", "train_matcher", "test_matcher",
                                 "crossvalidation", "train_relations", "test_relations",
                                 "load_genia", "load_biomodel", "merge_corpus"])
    parser.add_argument("--goldstd", default="", dest="goldstd", nargs="+",
                        help="Gold standard to be used. Will override corpus, annotations",
                        choices=paths.keys())
    parser.add_argument("--submodels", default="", nargs='+', help="sub types of classifiers")
    parser.add_argument("-i", "--input", dest="input", action="store",
                        default='''Administration of a higher dose of indinavir should be \\
considered when coadministering with megestrol acetate.''',
                        help="Text to classify.")
    parser.add_argument("--corpus", dest="corpus", nargs=2,
                        default=["chemdner", "CHEMDNER/CHEMDNER_SAMPLE_JUNE25/chemdner_sample_abstracts.txt"],
                        help="format path")
    parser.add_argument("--annotations", dest="annotations")
    parser.add_argument("--tag", dest="tag", default="0", help="Tag to identify the experiment")
    parser.add_argument("--models", dest="models", help="model destination path, without extension")
    parser.add_argument("--entitytype", dest="etype", help="type of entities to be considered", default="all")
    parser.add_argument("--pairtype", dest="ptype", help="type of pairs to be considered", default="all")
    parser.add_argument("--doctype", dest="doctype", help="type of document to be considered", default="all")
    parser.add_argument("--annotated", action="store_true", default=False, dest="annotated",
                        help="True if the input has <entity> tags.")
    parser.add_argument("-o", "--output", "--format", dest="output", nargs=2,
                        help="format path; output formats: xml, html, tsv, text, chemdner.")
    parser.add_argument("--crf", dest="crf", help="CRF implementation", default="stanford",
                        choices=["stanford", "crfsuite", "banner"])
    parser.add_argument("--log", action="store", dest="loglevel", default="WARNING", help="Log level")
    parser.add_argument("--kernel", action="store", dest="kernel", default="svmtk",
                        help="Kernel for relation extraction")
    options = parser.parse_args()

    # set up the logger
    numeric_level = getattr(logging, options.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % options.loglevel)
    while len(logging.root.handlers) > 0:
        logging.root.removeHandler(logging.root.handlers[-1])
    logging_format = '%(asctime)s %(levelname)s %(filename)s:%(lineno)s:%(funcName)s %(message)s'
    logging.basicConfig(level=numeric_level, format=logging_format)
    logging.getLogger().setLevel(numeric_level)
    logging.getLogger("requests.packages").setLevel(30)
    logging.info("Processing action {0} on {1}".format(options.actions, options.goldstd))

    # set configuration variables based on the goldstd option if the corpus
    # has a gold standard, or on the corpus and annotation options
    if options.actions == "load_corpus":
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_format = paths[options.goldstd]["format"]
        corpus_path = paths[options.goldstd]["text"]
        corpus_ann = paths[options.goldstd]["annotations"]
        corenlp_client = StanfordCoreNLP('http://localhost:9000')
        corpus = load_corpus(options.goldstd, corpus_path, corpus_format, corenlp_client)
        #corpus.load_genia()  # TODO: optional genia
        corpus.save(paths[options.goldstd]["corpus"])
        if corpus_ann:  # add annotations if it is not a test set
            corpus.load_annotations(corpus_ann, options.etype, options.ptype)
            corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_genia":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.load_genia()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "load_biomodel":
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        corpus.load_biomodel()
        corpus.save(paths[options.goldstd]["corpus"])
    elif options.actions == "annotate":  # re-add annotations to the corpus
        if len(options.goldstd) > 1:
            print "load only one corpus each time"
            sys.exit()
        options.goldstd = options.goldstd[0]
        corpus_path = paths[options.goldstd]["corpus"]
        corpus_ann = paths[options.goldstd]["annotations"]
        logging.info("loading corpus %s" % corpus_path)
        corpus = pickle.load(open(corpus_path, 'rb'))
        logging.debug("loading annotations...")
        corpus.clear_annotations(options.etype)
        corpus.load_annotations(corpus_ann, options.etype, options.ptype)
        corpus.save(paths[options.goldstd]["corpus"])
    else:
        corpus = Corpus("corpus/" + "&".join(options.goldstd))
        for g in options.goldstd:
            corpus_path = paths[g]["corpus"]
            logging.info("loading corpus %s" % corpus_path)
            this_corpus = pickle.load(open(corpus_path, 'rb'))
            corpus.documents.update(this_corpus.documents)
        if options.actions == "write_goldstandard":
            model = BiasModel(options.output[1])
            model.load_data(corpus, [])
            results = model.test()
            results.save(options.output[1] + ".pickle")
        elif options.actions == "merge_corpus":
            corpus.save(paths[options.output[1]]["corpus"])
        # training
        elif options.actions == "train":
            if options.crf == "stanford":
                model = StanfordNERModel(options.models, options.etype)
            elif options.crf == "crfsuite":
                model = CrfSuiteModel(options.models, options.etype)
            model.load_data(corpus, feature_extractors.keys(), options.etype)
            model.train()
        elif options.actions == "train_matcher":
            # train a simple classifier based on string matching
            model = MatcherModel(options.models)
            model.train(corpus)
        elif options.actions == "train_multiple":
            # train one classifier for each type of entity in this corpus
            models = TaggerCollection(basepath=options.models, corpus=corpus, subtypes=corpus.subtypes)
            models.train_types()
        elif options.actions == "train_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, modelname=options.tag)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype, modelname=options.tag)
            elif options.kernel == "mil":
                relations = set()
                with open("corpora/transmir/transmir_relations.txt") as rfile:
                    for l in rfile:
                        relations.add(tuple(l.strip().split('\t')))
                model = MILClassifier(corpus, options.ptype, relations, ner=options.models)
            model.train()
        # testing
        elif options.actions == "test":
            base_port = 9191
            if len(options.submodels) > 1:
                allresults = ResultSetNER(corpus, options.output[1])
                for i, submodel in enumerate(options.submodels):
                    model = StanfordNERModel(options.models + "_" + submodel)
                    model.load_tagger(base_port + i)
                    # load data into the model format
                    model.load_data(corpus, feature_extractors.keys(), mode="test")
                    # run the classifier on the data
                    results = model.test(corpus, port=base_port + i)
                    allresults.add_results(results)
                    model.kill_process()
                # save the results to an object that can be read again,
                # plus log files to debug
                final_results = allresults.combine_results()
            else:
                if options.crf == "stanford":
                    model = StanfordNERModel(options.models, options.etype)
                elif options.crf == "crfsuite":
                    model = CrfSuiteModel(options.models, options.etype)
                elif options.crf == "banner":
                    model = BANNERModel(options.models, options.etype)
                model.load_tagger()
                model.load_data(corpus, feature_extractors.keys(), mode="test")
                final_results = model.test(corpus)
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_matcher":
            if "mirna" in options.models:
                model = MirnaMatcher(options.models)
            else:
                model = MatcherModel(options.models)
            results = ResultsNER(options.models)
            results.corpus, results.entities = model.test(corpus)
            allentities = set()
            for e in results.entities:
                allentities.add(results.entities[e].text)
            with codecs.open(options.output[1] + ".txt", 'w', 'utf-8') as outfile:
                outfile.write('\n'.join(allentities))
            results.save(options.output[1] + ".pickle")
        elif options.actions == "test_multiple":
            logging.info("testing with multiple classifiers... {}".format(' '.join(options.submodels)))
            allresults = ResultSetNER(corpus, options.output[1])
            if len(options.submodels) < 2:
                models = TaggerCollection(basepath=options.models)
                models.load_models()
                results = models.test_types(corpus)
                final_results = results.combine_results()
            else:
                base_port = 9191
                for submodel in options.submodels:
                    models = TaggerCollection(basepath=options.models + "_" + submodel, baseport=base_port)
                    models.load_models()
                    results = models.test_types(corpus)
                    logging.info("combining results...")
                    submodel_results = results.combine_results()
                    allresults.add_results(submodel_results)
                    base_port += len(models.models)
                final_results = allresults.combine_results()
            logging.info("saving results...")
            final_results.save(options.output[1] + ".pickle")
        elif options.actions == "test_relations":
            if options.kernel == "jsre":
                model = JSREKernel(corpus, options.ptype, train=False, modelname=options.tag, ner=options.models)
            elif options.kernel == "svmtk":
                model = SVMTKernel(corpus, options.ptype, modelname=options.tag, ner=options.models)
            elif options.kernel == "rules":
                model = RuleClassifier(corpus, options.ptype, ner=options.models)
            elif options.kernel == "mirtex_rules":
                model = MirtexClassifier(corpus, options.ptype)
            elif options.kernel == "stanfordre":
                model = StanfordRE(corpus, options.ptype)
            elif options.kernel == "scikit":
                model = ScikitRE(corpus, options.ptype)
            elif options.kernel == "crf":
                model = CrfSuiteRE(corpus, options.ptype, test=True)
            elif options.kernel == "mil":
                relations = set()
                with open("corpora/transmir/transmir_relations.txt") as rfile:
                    for l in rfile:
                        relations.add(tuple(l.strip().split('\t')))
                model = MILClassifier(corpus, options.ptype, relations, test=True, ner=options.models)
            model.load_classifier()
            model.test()
            results = model.get_predictions(corpus)
            results.save(options.output[1] + ".pickle")

    total_time = time.time() - start_time
    logging.info("Total time: %ss" % total_time)