def preproc_document(doc_id, inp_dir, interm_dir, out_dir, abbreviations, taggers):
    """
    Returns: language, number of sentences, number of tokens
    """
    lang, no_sentences, no_tokens = np.nan, np.nan, np.nan
    try:
        intermediate_out_file = "%s%s" % (interm_dir, doc_id)
        iob_out_file = "%s%s" % (out_dir, doc_id)
        text = codecs.open("%s%s" % (inp_dir, doc_id), 'r', 'utf-8').read()
        # split the text into sentences and recover known segmentation errors
        intermediate_text = sentencebreaks_to_newlines(text)
        recovered_text = recover_segmentation_errors(intermediate_text, abbreviations, verbose=False)
        codecs.open(intermediate_out_file, 'w', 'utf-8').write(recovered_text)
        logger.info("Written intermediate output to %s" % intermediate_out_file)
        lang = detect_language(text)
        logger.info("Language detected=\"%s\"" % lang)
        sentences = recovered_text.split('\n')
        logger.info("Document \"%s\" has %i sentences" % (doc_id, len(sentences)))
        # PoS-tag the sentences and keep only (token, PoS tag) for each tagged token
        tagged_sentences = taggers[lang].tag_sents(sentences)
        tokenised_text = [[token[:2] for token in line] for line in tagged_sentences]
        IO.write_iob_file(tokenised_text, iob_out_file)
        logger.info("Written IOB output to %s" % iob_out_file)
        no_sentences = len(recovered_text.split('\n'))
        no_tokens = IO.count_tokens(tokenised_text)
    except Exception, e:
        logger.error("The pre-processing of document %s (lang='%s') failed with error \"%s\"" % (doc_id, lang, e))
    return lang, no_sentences, no_tokens
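# Usage sketch (not part of the original module): a hypothetical driver that runs
# `preproc_document` over every file in an input directory and collects the
# (language, #sentences, #tokens) tuples it returns. The function name and the way
# `abbreviations` and `taggers` are obtained are assumptions made for illustration.
import os

def preprocess_corpus(inp_dir, interm_dir, out_dir, abbreviations, taggers):
    # note: the directories are expected to end with a trailing slash,
    # since paths are built by plain concatenation ("%s%s")
    stats = {}
    for doc_id in os.listdir(inp_dir):
        # NaN values in the returned tuple signal that the document failed
        stats[doc_id] = preproc_document(doc_id, inp_dir, interm_dir, out_dir,
                                         abbreviations, taggers)
    return stats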
def extract_citations(extractor, outputdir, filename, iob_sentences, outfilename=None):
    """Run the citation extractor over PoS-tagged IOB sentences and write the result as an IOB file."""
    # this is the important bit which performs the citation extraction
    import sys
    import os
    from citation_extractor.eval import IO
    result, out_fname = None, ""
    if outfilename is None:
        path, name = os.path.split(filename)
        out_fname = '%s%s' % (outputdir, name)
    else:
        out_fname = outfilename
    try:
        # keep the PoS tags and the tokens of the non-empty instances
        postags = [[("z_POS", token[1]) for token in instance] for instance in iob_sentences if len(instance) > 0]
        instances = [[token[0] for token in instance] for instance in iob_sentences if len(instance) > 0]
        result = extractor.extract(instances, postags)
        # re-assemble (token, PoS tag, label) triples for the IOB output
        output = []
        for i, res in enumerate(result):
            temp = []
            for n, d_res in enumerate(res):
                temp.append((res[n]["token"], postags[i][n][1], res[n]["label"]))
            output.append(temp)
        try:
            IO.write_iob_file(output, out_fname)
            print >> sys.stderr, "Output successfully written to file \"%s\"" % out_fname
            return result, out_fname
        except Exception, e:
            raise e
    except Exception, e:
        raise e
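# Usage sketch (not part of the original module): a hypothetical helper that reads a
# PoS-tagged IOB file (as produced by the pre-processing step) and feeds it to
# `extract_citations`. The helper's name and paths are assumptions; `IO.file_to_instances`
# is the reader used elsewhere in this listing.
def extract_from_file(extractor, iob_file, output_dir):
    from citation_extractor.Utils import IO
    iob_sentences = IO.file_to_instances(iob_file)
    # returns the raw extractor output and the path of the IOB file that was written
    return extract_citations(extractor, output_dir, iob_file, iob_sentences)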
def test_improvement(pre_settings, post_settings):
    """
    TODO: what this function should do:
    1. run without selected candidates in the train set and evaluate
    2. run with selected candidates in the train set and evaluate
    3. return: stats for the 1st run, stats for the 2nd run and improvement obtained
    """
    from citation_extractor.core import citation_extractor
    from citation_extractor.eval import SimpleEvaluator
    from citation_extractor.Utils import aph_corpus
    from citation_extractor.Utils import IO
    # extractor without selected candidates in the train set
    pre_extractor = citation_extractor(pre_settings)
    # extractor with selected candidates in the train set
    post_extractor = citation_extractor(post_settings)
    # initialise the evaluator and evaluate both extractors against the test set
    se = SimpleEvaluator([pre_extractor, post_extractor], post_settings.TEST_DIR)
    results = se.eval()
    print "***data***"
    print "pre-active learning TRAIN-SET: %s" % str(pre_settings.DATA_DIRS)
    train_details = aph_corpus.get_collection_details(pre_settings.TRAIN_COLLECTIONS)
    print "pre-active learning TRAIN-SET: # tokens = %i; # NEs = %i" % (
        train_details['total_token_count'], train_details['ne_token_count'])
    train_details = aph_corpus.get_collection_details(post_settings.TRAIN_COLLECTIONS)
    print "post-active learning TRAIN-SET: %s" % str(post_settings.DATA_DIRS)
    print "post-active learning TRAIN-SET: # tokens = %i; # NEs = %i" % (
        train_details['total_token_count'], train_details['ne_token_count'])
    test_details = aph_corpus.get_collection_details(post_settings.TEST_COLLECTIONS)
    print "TEST-SET: %s" % str(post_settings.TEST_DIR)
    print "TEST-SET details: # tokens = %i; # NEs = %i\n" % (
        test_details['total_token_count'], test_details['ne_token_count'])
    print "*** pre-active learning ***"
    pre_al_results = results[str(pre_extractor)][0]
    print "fscore: %f \nprecision: %f\nrecall: %f\n" % (
        pre_al_results["f-score"] * 100, pre_al_results["precision"] * 100, pre_al_results["recall"] * 100)
    print "*** post-active learning ***"
    post_al_results = results[str(post_extractor)][0]
    print "fscore: %f \nprecision: %f\nrecall: %f\n" % (
        post_al_results["f-score"] * 100, post_al_results["precision"] * 100, post_al_results["recall"] * 100)
    print "*** post-active learning gain (%) ***"
    print "fscore: %f \nprecision: %f\nrecall: %f\n" % (
        post_al_results["f-score"] * 100 - pre_al_results["f-score"] * 100,
        post_al_results["precision"] * 100 - pre_al_results["precision"] * 100,
        post_al_results["recall"] * 100 - pre_al_results["recall"] * 100)
    IO.write_iob_file(se.output[str(pre_extractor)], "%spre_out.data" % post_settings.OUT_DIR)
    IO.write_iob_file(se.output[str(post_extractor)], "%spost_out.data" % post_settings.OUT_DIR)
def preproc_document(doc_id, inp_dir, interm_dir, out_dir, abbreviations, taggers, split_sentences=True):
    """
    :param doc_id: the input filename
    :param inp_dir: the input directory
    :param interm_dir: the directory where to store intermediate outputs
    :param out_dir: the directory where to store the PoS-tagged and tokenised text
    :param abbreviations:
    :param taggers: the dictionary returned by `get_taggers`
    :param split_sentences: (boolean) whether to split the text into sentences or not.
        If `False`, the text is split on newline characters `\n`.

    Returns: language, number of sentences, number of tokens
    """
    lang, no_sentences, no_tokens = np.nan, np.nan, np.nan
    try:
        intermediate_out_file = "%s%s" % (interm_dir, doc_id)
        iob_out_file = "%s%s" % (out_dir, doc_id)
        text = codecs.open("%s%s" % (inp_dir, doc_id), 'r', 'utf-8').read()
        if split_sentences:
            intermediate_text = sentencebreaks_to_newlines(text)
            text = recover_segmentation_errors(intermediate_text, abbreviations, verbose=False)
        else:
            logger.info("Document %s: skipping sentence splitting" % doc_id)
        sentences = text.split('\n')
        logger.info("Document \"%s\" has %i sentences" % (doc_id, len(sentences)))
        codecs.open(intermediate_out_file, 'w', 'utf-8').write(text)
        logger.info("Written intermediate output to %s" % intermediate_out_file)
        lang = detect_language(text)
        logger.info("Language detected=\"%s\"" % lang)
        tagged_sentences = taggers[lang].tag_sents(sentences)
        tokenised_text = [[token for token in line] for line in tagged_sentences]
        IO.write_iob_file(tokenised_text, iob_out_file)
        logger.info("Written IOB output to %s" % iob_out_file)
        no_sentences = len(text.split('\n'))
        no_tokens = IO.count_tokens(tokenised_text)
    except Exception, e:
        logger.error("The pre-processing of document %s (lang='%s') failed with error \"%s\"" % (doc_id, lang, e))
    return lang, no_sentences, no_tokens
def main():
    import argparse
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("input", type=str, help="IOB input file")
    parser.add_argument("--standoff-dir", help="Stand-off directory", type=str, required=True)
    parser.add_argument("--output-dir", help="IOB output file", type=str, required=True)
    args = parser.parse_args()
    print >> sys.stderr, "IOB Input:", args.input
    print >> sys.stderr, "Stand-off input folder:", args.standoff_dir
    print >> sys.stderr, "IOB output dir:", args.output_dir
    fname = os.path.split(args.input)[1].split(".")[0]
    # read the corresponding .ann file with stand-off annotation
    so_entities, so_relations, so_annotations = read_ann_file("%s.txt" % fname, args.standoff_dir)
    # extract the start and end offsets of each token
    sentences = process(args.input)
    token_start_end = get_start_end(sentences)
    # read IOB from file
    iob_data = IO.file_to_instances(args.input)
    # make sure that the data is consistent
    assert [len(sentence) for sentence in iob_data] == [len(sentence) for sentence in token_start_end]
    so_entities = [(so_entities[ent][1], so_entities[ent][0], int(so_entities[ent][2]), int(so_entities[ent][3]))
                   for ent in so_entities.keys()]
    updated_iob_instances = update(token_start_end, iob_data, so_entities)
    destination = "%s%s.txt" % (args.output_dir, fname)
    try:
        IO.write_iob_file(updated_iob_instances, destination)
        print >> sys.stderr, "IOB output written to '%s'" % destination
    except Exception, e:
        print >> sys.stderr, "Writing output to '%s' failed with error '%s'" % (destination, e)
def do_ner(doc_id, inp_dir, interm_dir, out_dir, extractor, so2iob_script):
    # TODO:
    # wrap with a try/except/finally
    # return doc_id and a boolean
    from citation_extractor.Utils import IO
    try:
        data = IO.file_to_instances("%s%s" % (inp_dir, doc_id))
        # keep the PoS tags and the tokens of the non-empty instances
        postags = [[("z_POS", token[1]) for token in instance] for instance in data if len(instance) > 0]
        instances = [[token[0] for token in instance] for instance in data if len(instance) > 0]
        result = extractor.extract(instances, postags)
        # re-assemble (token, PoS tag, label) triples for the IOB output
        output = [[(res[n]["token"].decode('utf-8'), postags[i][n][1], res[n]["label"])
                   for n, d_res in enumerate(res)] for i, res in enumerate(result)]
        out_fname = "%s%s" % (interm_dir, doc_id)
        IO.write_iob_file(output, out_fname)
        logger.info("Output successfully written to file \"%s\"" % out_fname)
        # convert the IOB output to stand-off annotation
        tostandoff(out_fname, out_dir, so2iob_script)
        return (doc_id, True)
    except Exception, e:
        logger.error("The NER of document %s failed with error \"%s\"" % (doc_id, e))
        return (doc_id, False)
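# Usage sketch (not part of the original module): a hypothetical loop that applies
# `do_ner` to every pre-processed document in a directory and returns the identifiers
# of the documents that failed. The helper's name, the directory layout and the
# `so2iob_script` path are assumptions made for illustration.
import os

def ner_corpus(inp_dir, interm_dir, out_dir, extractor, so2iob_script):
    outcomes = [do_ner(doc_id, inp_dir, interm_dir, out_dir, extractor, so2iob_script)
                for doc_id in os.listdir(inp_dir)]
    # `do_ner` returns (doc_id, True) on success and (doc_id, False) on failure
    failed = [doc_id for doc_id, ok in outcomes if not ok]
    return failed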
def run(self):
    """
    TODO
    """
    iterations = []
    results = {}
    results_by_entity = {}
    # first let's create the test and train set for each iteration
    for x, iter in enumerate(self.dataSets_iterator):
        self.logger.info("Iteration %i" % (x + 1))
        train_set = []
        test_set = []
        for y, set in enumerate(iter):
            for n, group in enumerate(set):
                if y == 0:
                    train_set += group
                else:
                    test_set += group
        iterations.append((train_set, test_set))
    # let's go through all the iterations
    for i, iter in enumerate(iterations):
        results["iter-%i" % (i + 1)] = {}
        results_by_entity["iter-%i" % (i + 1)] = {}
        train_file = "%sfold_%i.train" % (self.evaluation_dir, i + 1)
        test_file = "%sfold_%i.test" % (self.evaluation_dir, i + 1)
        IO.write_iob_file(iter[0], train_file)
        IO.write_iob_file(iter[1], test_file)
        # the following lines are a bit of a workaround:
        # to avoid recomputing the features when training each new classifier,
        # I take them from the file created to train the CRF model
        # (which should always be the first extractor to be evaluated).
        filename = "%sfold_%i.train.train" % (self.extractors[0][1].TEMP_DIR, (i + 1))
        f = codecs.open(filename, 'r', 'utf-8')
        data = f.read()
        f.close()
        feature_sets = [[[token.split('\t')[:len(token.split('\t')) - 1],
                          token.split('\t')[len(token.split('\t')) - 1:]]
                         for token in instance.split('\n')]
                        for instance in data.split('\n\n')]
        order = FeatureExtractor().get_feature_order()
        labelled_feature_sets = []
        for instance in feature_sets:
            for token in instance:
                temp = [{order[n]: feature for n, feature in enumerate(token[0])}, token[1][0]]
                labelled_feature_sets.append(temp)
        self.logger.info("read %i labelled instances" % len(feature_sets))
        for n, extractor in enumerate(self.extractors):
            extractor_settings = extractor[1]
            extractor_name = extractor[0]
            results["iter-%i" % (i + 1)][extractor_name] = {}
            self.logger.info("Running iteration #%i with extractor %s" % (i + 1, extractor_name))
            self.logger.info(train_file)
            self.logger.info(test_file)
            self.logger.info(extractor_settings)
            extractor_settings.DATA_FILE = train_file
            if extractor_settings.CLASSIFIER is not None:
                extractor = citation_extractor(extractor_settings, extractor_settings.CLASSIFIER, labelled_feature_sets)
            else:
                extractor = citation_extractor(extractor_settings)
            self.logger.info(extractor.classifier)
            se = SimpleEvaluator([(extractor_name, extractor), ], iob_file=test_file)
            results["iter-%i" % (i + 1)][extractor_name] = se.eval()[extractor_name][0]
            results_by_entity["iter-%i" % (i + 1)][extractor_name] = SimpleEvaluator.calc_stats_by_entity(se.eval()[extractor_name][1])
            #self.logger.info(results_by_entity["iter-%i"%(i+1)][extractor_name])
    return results, results_by_entity
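# Usage sketch (not part of the original module): averaging the per-fold scores returned
# by `run`. This assumes each per-fold entry is a dict exposing an "f-score" key, as in
# `test_improvement` above; the extractor name passed in is a hypothetical placeholder.
def average_fscore(results, extractor_name):
    # `results` maps "iter-N" keys to {extractor_name: stats} dicts
    scores = [fold_results[extractor_name]["f-score"] for fold_results in results.values()]
    return sum(scores) / len(scores)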