def main(): filepath = "F:/Programming Documnets/AI/Telehealth/CliNER-master/data/examples/ex_doc.txt" train_txt_files = glob.glob(filepath) filecon = "F:/Programming Documnets/AI/Telehealth/CliNER-master/data/examples/ex_doc.con" train_con_files = glob.glob(filecon) # Collect training data file paths train_txt_files_map = tools.map_files(train_txt_files) train_con_files_map = tools.map_files(train_con_files) training_list = [] for k in train_txt_files_map: if k in train_con_files_map: training_list.append( (train_txt_files_map[k], train_con_files_map[k])) # Train the model default_log = os.path.join(CLINER_DIR, 'models', 'train.log') default_use_lstm = False val_list = [] test_list = [] filemodel = "F:/Programming Documnets/AI/Telehealth/CliNER-master/models/foo.model" model = os.fspath(filemodel) train(training_list, model, 'i2b2', default_use_lstm, default_log, val=val_list, test=test_list)
def train_model(txt_path, con_path, model_path):
    """Train a CliNER model from globs of text and concept files.

    Exits with an error message if the directory that should hold the
    model does not exist.
    """
    modeldir = os.path.dirname(model_path)
    if modeldir != '' and not os.path.exists(modeldir):
        sys.stderr.write('\n\tError: Model dir does not exist: %s\n' % modeldir)
        sys.stderr.write('\n')
        exit(1)

    # Pair up text files with their concept annotations by base name.
    txt_map = tools.map_files(glob.glob(txt_path))
    con_map = tools.map_files(glob.glob(con_path))
    training_list = [(txt_map[k], con_map[k]) for k in txt_map if k in con_map]

    # No validation or test split; train an i2b2-format (non-LSTM) model.
    train(training_list, model_path, "i2b2", False, logfile=None, val=[], test=[])
def main():
    """Evaluate predicted concept files against a gold standard.

    Copies matched prediction/gold files into a temporary directory and
    invokes the i2b2 evaluation jar on them.
    """
    import tempfile

    # Parse command line arguments
    parser = argparse.ArgumentParser(prog='cliner evaluate')
    parser.add_argument("--predictions", dest="pred",
                        help="Directory where predictions are stored.")
    parser.add_argument("--gold", dest="gold",
                        help="Directory where gold standard is stored.")
    parser.add_argument("--format", dest="format", help="Data format ( con ) ")
    args = parser.parse_args()

    if not args.pred:
        sys.stderr.write('\n\tERROR: must provide --pred argument\n\n')
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        sys.exit()

    if not args.gold:
        sys.stderr.write('\n\tERROR: must provide --gold argument\n\n')
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        sys.exit()

    if args.format:
        format = args.format
    else:
        sys.stderr.write('\n\tERROR: must provide --format argument\n\n')
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        sys.exit()

    # Must specify output format
    if format not in ['i2b2']:
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats: i2b2\n')
        sys.stderr.write('\n')
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        sys.exit()

    # Pair up gold and prediction files by their shared base name.
    ref_files = [os.path.join(args.gold, f) for f in os.listdir(args.gold)]
    pred_files = [os.path.join(args.pred, f) for f in os.listdir(args.pred)]
    ref_files_map = tools.map_files(ref_files)
    pred_files_map = tools.map_files(pred_files)

    files = []
    for k in ref_files_map:
        if k in pred_files_map:
            files.append((pred_files_map[k], ref_files_map[k]))
    # BUG FIX: tuples are (prediction, reference); the old unpacking named
    # them the other way around, which swapped the jar's system/reference
    # directories below.
    pred_list, gold_list = zip(*files)

    # Create a unique temporary directory for the copied files.
    # tempfile.mkdtemp is collision-free, unlike the old
    # 'cliner_eval_%d' % randint(0, 256) scheme, and os.makedirs ensures
    # the parent exists (bare os.mkdir failed if data/tmp was missing).
    cliner_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    tmp_dir = os.path.join(cliner_dir, 'data', 'tmp')
    os.makedirs(tmp_dir, exist_ok=True)
    tempdir_name = tempfile.mkdtemp(prefix='cliner_eval_', dir=tmp_dir)

    pred_dir = os.path.join(tempdir_name, 'pred/')
    gold_dir = os.path.join(tempdir_name, 'gold/')
    os.mkdir(pred_dir)
    os.mkdir(gold_dir)

    try:
        # copy files
        for pred_file in pred_list:
            shutil.copy(pred_file, pred_dir)
        for gold_file in gold_list:
            shutil.copy(gold_file, gold_dir)

        # Run the i2b2 evaluation jar over the two directories.
        eval_dir = os.path.join(cliner_dir, 'tools')
        eval_jar = os.path.join(eval_dir, 'i2b2va-eval.jar')
        cmd = 'java -jar %s -rcp %s -scp %s -ft con -ex all' % (eval_jar, gold_dir, pred_dir)
        subprocess.call(cmd, shell=True, stdout=sys.stdout)
    finally:
        # cleanup after yourself, even if copying or the jar fails
        shutil.rmtree(tempdir_name)
def main():
    """Compare predicted concept labels to a gold standard.

    Prints a confusion matrix plus per-label precision/recall/F1, either
    chunk-level (IOB tags) or concept-level (--concept).

    Converted from Python 2 `print >>` syntax, which is a syntax error in
    Python 3 (the rest of this file uses Python 3-only features).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        help="Text files that were used to generate predictions",
                        dest="txt")
    parser.add_argument("-c",
                        help="The directory that contains predicted concept files organized into subdirectories for svm, lin, srf",
                        dest="con")
    parser.add_argument("-r",
                        help="The directory that contains reference gold standard concept files",
                        dest="ref")
    parser.add_argument("-f", dest="format",
                        help="Data format (i2b2 or xml).", default='i2b2')
    parser.add_argument("--concept", dest="do_concept",
                        help="A flag indicating whether to evaluate chunk-level or concept-level",
                        action="store_true", default=False)
    parser.add_argument("-o",
                        help="Write the evaluation to a file rather than STDOUT",
                        dest="output", default=None)

    # Parse command line arguments
    args = parser.parse_args()

    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout

    # Which format to read?
    if args.format == 'i2b2':
        wildcard = '*.con'
    elif args.format == 'xml':
        wildcard = '*.xml'
    else:
        print('\n\tError: Must specify output format (i2b2 or xml)', file=sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # List of medical text
    txt_files = glob.glob(args.txt)
    txt_files_map = tools.map_files(txt_files)

    # List of gold data
    ref_files = glob.glob(os.path.join(args.ref, wildcard))
    ref_files_map = tools.map_files(ref_files)

    # List of predictions
    pred_files = glob.glob(os.path.join(args.con, wildcard))
    pred_files_map = tools.map_files(pred_files)

    # Grouping of text, predictions, gold
    files = []
    for k in txt_files_map:
        if k in pred_files_map and k in ref_files_map:
            files.append((txt_files_map[k], pred_files_map[k], ref_files_map[k]))

    if args.do_concept:
        tag2id = {'problem': 0, 'test': 1, 'treatment': 2, 'none': 3}
    else:
        from documents import labels as tag2id

    # Compute the confusion matrix
    confusion = [[0] * len(tag2id) for _ in tag2id]

    # txt <- medical text; annotations <- predictions; gold <- gold standard
    for txt, annotations, gold in files:
        # Read predictions and gold standard data
        cnote = Document(txt, annotations)
        rnote = Document(txt, gold)

        predictions = cnote.conlist()
        gold_labels = rnote.conlist()

        for pline, gline in zip(predictions, gold_labels):
            for p, g in zip(pline, gline):
                if args.do_concept:
                    # Strip the IOB prefix ('B-'/'I-') for concept-level eval.
                    p = p[2:]
                    g = g[2:]
                if p == '':
                    p = 'none'
                if g == '':
                    g = 'none'
                confusion[tag2id[g]][tag2id[p]] += 1

    # Display the confusion matrix
    if args.do_concept:
        choice = 'CONCEPT'
    else:
        choice = '7-way'

    out = args.output
    print("", file=out)
    print("", file=out)
    print("", file=out)
    print("================", file=out)
    print("%s RESULTS" % choice, file=out)
    print("================", file=out)
    print("", file=out)
    print("Confusion Matrix", file=out)
    pad = max(len(l) for l in tag2id) + 6
    print("%s %s" % (' ' * pad, "\t".join([s[:5] for s in tag2id.keys()])), file=out)
    for act, act_v in tag2id.items():
        print("%s %s" % (act.rjust(pad),
                         "\t".join([str(confusion[act_v][pre_v])
                                    for pre, pre_v in tag2id.items()])), file=out)
    print("", file=out)

    # Compute the analysis stuff
    precision = []
    recall = []
    specificity = []
    f1 = []

    print("Analysis", file=out)
    print(" " * pad, "%10s%10s%10s" % ("Precision", "Recall", "F1"), file=out)

    for lab, lab_v in tag2id.items():
        tp = confusion[lab_v][lab_v]
        fp = sum(confusion[v][lab_v] for k, v in tag2id.items() if v != lab_v)
        fn = sum(confusion[lab_v][v] for k, v in tag2id.items() if v != lab_v)
        tn = sum(confusion[v1][v2] for k1, v1 in tag2id.items()
                 for k2, v2 in tag2id.items() if v1 != lab_v and v2 != lab_v)
        # The 1e-100 term guards against division by zero for empty classes.
        precision += [float(tp) / (tp + fp + 1e-100)]
        recall += [float(tp) / (tp + fn + 1e-100)]
        specificity += [float(tn) / (tn + fp + 1e-100)]
        f1 += [float(2 * tp) / (2 * tp + fp + fn + 1e-100)]
        print("%s %10.4f%10.4f%10.4f" % (lab.rjust(pad), precision[-1],
                                         recall[-1], f1[-1]), file=out)

    print("--------", file=out)
    # Macro-averages over all labels.
    precision = sum(precision) / len(precision)
    recall = sum(recall) / len(recall)
    specificity = sum(specificity) / len(specificity)
    f1 = sum(f1) / len(f1)
    print("Average: %.4f\t%.4f\t%.4f" % (precision, recall, f1), file=out)
def main():
    """Command-line entry point for training a CliNER model.

    Parses paths to training (and optional validation/test) text and
    annotation files, pairs them by base name, and hands them to ``train``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--txt", dest="txt",
                        help="The files that contain the training examples")
    parser.add_argument("--annotations", dest="con",
                        help="The files that contain the labels for the training examples")
    parser.add_argument("--val-txt", dest="val_txt",
                        help="The files that contain the validation examples")
    parser.add_argument("--val-annotations", dest="val_con",
                        help="The files that contain the labels for the validation examples")
    parser.add_argument("--test-txt", dest="test_txt",
                        help="The files that contain the test examples")
    parser.add_argument("--test-annotations", dest="test_con",
                        help="The files that contain the labels for the test examples")
    parser.add_argument("--model", dest="model",
                        help="Path to the model that should be generated")
    parser.add_argument("--log", dest="log",
                        help="Path to the log file for training info",
                        default=os.path.join(CLINER_DIR, 'models', 'train.log'))
    # BUG FIX: default was True, which made this store_true flag a no-op and
    # forced LSTM mode on every run; sibling train scripts default to False.
    parser.add_argument("--use-lstm", dest="use_lstm",
                        help="Whether to use an LSTM model",
                        action='store_true', default=False)
    parser.add_argument("--format", dest="format", help="Data format ( i2b2 )")

    # Parse the command line arguments
    args = parser.parse_args()

    # Error check: Ensure that file paths are specified
    if not args.txt:
        sys.stderr.write('\n\tError: Must provide text files\n')
        sys.stderr.write('\n')
        exit(1)
    if not args.con:
        sys.stderr.write('\n\tError: Must provide annotations for text files\n')
        sys.stderr.write('\n')
        exit(1)
    if not args.model:
        sys.stderr.write('\n\tError: Must provide valid path to store model\n')
        sys.stderr.write('\n')
        exit(1)
    modeldir = os.path.dirname(args.model)
    if (not os.path.exists(modeldir)) and (modeldir != ''):
        sys.stderr.write('\n\tError: Model dir does not exist: %s\n' % modeldir)
        sys.stderr.write('\n')
        exit(1)

    # A list of txt and concept file paths
    train_txt_files = glob.glob(args.txt)
    train_con_files = glob.glob(args.con)

    # Must specify a supported data format (converted from py2 `print >>`).
    if args.format not in ['i2b2']:
        print('\n\tError: Must specify output format', file=sys.stderr)
        print('\tAvailable formats: i2b2', file=sys.stderr)
        sys.stderr.write('\n')
        exit(1)

    # Collect training data file paths
    train_txt_files_map = tools.map_files(train_txt_files)
    train_con_files_map = tools.map_files(train_con_files)
    training_list = [(train_txt_files_map[k], train_con_files_map[k])
                     for k in train_txt_files_map if k in train_con_files_map]

    # If validation data was specified
    val_list = []
    if args.val_txt and args.val_con:
        val_txt_files_map = tools.map_files(glob.glob(args.val_txt))
        val_con_files_map = tools.map_files(glob.glob(args.val_con))
        val_list = [(val_txt_files_map[k], val_con_files_map[k])
                    for k in val_txt_files_map if k in val_con_files_map]

    # If test data was specified
    test_list = []
    if args.test_txt and args.test_con:
        test_txt_files_map = tools.map_files(glob.glob(args.test_txt))
        test_con_files_map = tools.map_files(glob.glob(args.test_con))
        test_list = [(test_txt_files_map[k], test_con_files_map[k])
                     for k in test_txt_files_map if k in test_con_files_map]

    # Train the model
    train(training_list,
          args.model,
          args.format,
          args.use_lstm,
          logfile=args.log,
          val=val_list,
          test=test_list)
def main():
    """Evaluate predictions against a gold standard via the i2b2 eval jar.

    Ported from Python 2: `print` statements and the removed `commands`
    module are replaced with `print()` and `subprocess.getstatusoutput`.
    """
    import subprocess
    import tempfile

    # Parse command line arguments
    parser = argparse.ArgumentParser(prog='cliner evaluate')
    parser.add_argument("--predictions", dest="pred",
                        help="Directory where predictions are stored.")
    parser.add_argument("--gold", dest="gold",
                        help="Directory where gold standard is stored.")
    parser.add_argument("--format", dest="format", help="Data format ( con ) ")
    parser.add_argument("--output", dest="output",
                        help="Write the evaluation to a file rather than STDOUT")
    args = parser.parse_args()

    if not args.pred:
        print('\n\tERROR: must provide --pred argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if not args.gold:
        print('\n\tERROR: must provide --gold argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if args.format:
        format = args.format
    else:
        print('\n\tERROR: must provide --format argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout

    # Must specify output format
    if format not in ['i2b2']:
        print('\n\tError: Must specify output format', file=sys.stderr)
        print('\tAvailable formats: con', file=sys.stderr)
        print('', file=sys.stderr)
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # Pair up gold and prediction files by their shared base name.
    ref_files = [os.path.join(args.gold, f) for f in os.listdir(args.gold)]
    pred_files = [os.path.join(args.pred, f) for f in os.listdir(args.pred)]
    ref_files_map = tools.map_files(ref_files)
    pred_files_map = tools.map_files(pred_files)

    files = []
    for k in ref_files_map:
        if k in pred_files_map:
            files.append((pred_files_map[k], ref_files_map[k]))
    # BUG FIX: tuples are (prediction, reference); the old unpacking named
    # them the other way around, swapping the jar's system/reference dirs.
    pred_list, gold_list = zip(*files)

    # create temporary directory for these files (mkdtemp is collision-free,
    # unlike the old '/tmp/cliner_eval_%d' % randint(0, 256) scheme)
    tempdir_name = tempfile.mkdtemp(prefix='cliner_eval_')
    pred_dir = os.path.join(tempdir_name, 'pred/')
    gold_dir = os.path.join(tempdir_name, 'gold/')
    os.mkdir(pred_dir)
    os.mkdir(gold_dir)

    try:
        # copy files
        for pred_file in pred_list:
            shutil.copy(pred_file, pred_dir)
        for gold_file in gold_list:
            shutil.copy(gold_file, gold_dir)

        # eval jar
        cliner_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        eval_dir = os.path.join(cliner_dir, 'tools')
        eval_jar = os.path.join(eval_dir, 'i2b2va-eval.jar')
        cmd = 'java -jar %s -rcp %s -scp %s -ft con -ex all' % (eval_jar, gold_dir, pred_dir)
        # subprocess.getstatusoutput replaces the py2-only commands module.
        status, output = subprocess.getstatusoutput(cmd)
        print(output)
    finally:
        # cleanup after yourself
        shutil.rmtree(tempdir_name)
def main():
    """Print a token-level diff of predicted vs. gold concept labels.

    For each mismatched token, shows surrounding context plus the predicted
    and reference labels. Ported from Python 2 print syntax.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(prog='cliner evaluate')
    parser.add_argument("--txt", dest="txt",
                        help="Glob of .txt files of discharge summaries")
    parser.add_argument("--predictions", dest="pred",
                        help="Directory where predictions are stored.")
    parser.add_argument("--gold", dest="gold",
                        help="Directory where gold standard is stored.")
    parser.add_argument("--format", dest="format", help="Data format ( con )")
    parser.add_argument("--output", dest="output",
                        help="Write the evaluation to a file rather than STDOUT")
    args = parser.parse_args()

    if not args.txt:
        print('\n\tERROR: must provide --txt argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if not args.pred:
        print('\n\tERROR: must provide --pred argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if not args.gold:
        print('\n\tERROR: must provide --gold argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if args.format:
        format = args.format
    else:
        print('\n\tERROR: must provide --format argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout

    # Must specify output format
    if format not in ['i2b2']:
        print('\n\tError: Must specify output format', file=sys.stderr)
        print('\tAvailable formats: i2b2', file=sys.stderr)
        print('', file=sys.stderr)
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # List of medical text
    txt_files = glob.glob(args.txt)
    txt_files_map = tools.map_files(txt_files)

    wildcard = '*.con'

    # List of gold data
    ref_files = glob.glob(os.path.join(args.gold, wildcard))
    ref_files_map = tools.map_files(ref_files)

    # List of predictions
    pred_files = glob.glob(os.path.join(args.pred, wildcard))
    pred_files_map = tools.map_files(pred_files)

    # Grouping of text, predictions, gold
    files = []
    for k in txt_files_map:
        if k in pred_files_map and k in ref_files_map:
            files.append((txt_files_map[k], pred_files_map[k], ref_files_map[k]))

    # txt <- medical text; annotations <- predictions; gold <- gold standard
    if len(files) == 0:
        print("No files to be evaluated")
        exit()

    print()
    for txt, annotations, gold in files:
        # Read predictions and gold standard data
        cnote = Document(txt, annotations)
        rnote = Document(txt, gold)
        sents = rnote.getTokenizedSentences()

        # Note - can also get first pass (IOB labels)
        ref = rnote.conlist()
        pred = cnote.conlist()

        for i, toks, pline, rline in zip(range(len(sents)), sents, pred, ref):
            for j, token, rlab, plab in zip(range(len(pline)), toks, rline, pline):
                if rlab != plab:
                    # Show up to three tokens of left context before the mismatch.
                    ind = max(0, j - 3)
                    print(token)
                    for k in range(ind, j):
                        print(' ' * (len(toks[k]) + 4), end=' ')
                    print('<>')
                    # BUG FIX: use the clamped index so the slice does not
                    # wrap to the sentence end when j < 3 (toks[j-3:] would).
                    print(toks[ind:j + 3])
                    print('\tpred: ', plab)
                    print('\tref: ', rlab)
                    print('\n')
def main():
    """Parse CLI arguments and build a model from matched txt/concept files."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--txt", dest="txt")
    parser.add_argument("--annotations", dest="con")
    parser.add_argument("--val-txt", dest="val_txt")
    parser.add_argument("--val-annotations", dest="val_con")
    parser.add_argument("--test-txt", dest="test_txt")
    parser.add_argument("--test-annotations", dest="test_con")
    parser.add_argument("--model", dest="model")
    parser.add_argument("--log", dest="log")
    args = parser.parse_args()

    # Required arguments: training text, annotations, and a model path.
    if not (args.txt and args.con and args.model):
        parser.print_help(sys.stderr)
        sys.stderr.write('\n\tError in parsing arguments\n')
        sys.stderr.write('\n')
        exit(1)

    # The model's parent directory must already exist.
    model_dir = os.path.dirname(args.model)
    if model_dir != '' and not os.path.exists(model_dir):
        parser.print_help(sys.stderr)
        sys.stderr.write('\n\tNo such model directory:%s\n' % model_dir)
        sys.stderr.write('\n')
        exit(1)

    def paired(txt_glob, con_glob):
        # Match text and concept files that share a base name.
        txt_map = tools.map_files(glob.glob(txt_glob))
        con_map = tools.map_files(glob.glob(con_glob))
        return [(txt_map[k], con_map[k]) for k in txt_map if k in con_map]

    training_pairs = paired(args.txt, args.con)

    # Optional validation and test splits.
    if args.val_txt and args.val_con:
        val_pairs = paired(args.val_txt, args.val_con)
    else:
        val_pairs = []
    if args.test_txt and args.test_con:
        test_pairs = paired(args.test_txt, args.test_con)
    else:
        test_pairs = []

    build(training_pairs, args.model, logFile=args.log, val=val_pairs, test=test_pairs)
def main():
    """Compare predicted concept labels to a gold standard.

    Prints a confusion matrix plus per-label precision/recall/F1, either
    chunk-level (IOB tags) or concept-level (--concept).

    Converted from Python 2 `print >>` syntax, which is a syntax error in
    Python 3 (the rest of this file uses Python 3-only features).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        help="Text files that were used to generate predictions",
                        dest="txt")
    parser.add_argument("-c",
                        help="The directory that contains predicted concept files organized into subdirectories for svm, lin, srf",
                        dest="con")
    parser.add_argument("-r",
                        help="The directory that contains reference gold standard concept files",
                        dest="ref")
    parser.add_argument("-f", dest="format",
                        help="Data format (i2b2 or xml).", default='i2b2')
    parser.add_argument("--concept", dest="do_concept",
                        help="A flag indicating whether to evaluate chunk-level or concept-level",
                        action="store_true", default=False)
    parser.add_argument("-o",
                        help="Write the evaluation to a file rather than STDOUT",
                        dest="output", default=None)

    # Parse command line arguments
    args = parser.parse_args()

    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout

    # Which format to read?
    if args.format == 'i2b2':
        wildcard = '*.con'
    elif args.format == 'xml':
        wildcard = '*.xml'
    else:
        print('\n\tError: Must specify output format (i2b2 or xml)', file=sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # List of medical text
    txt_files = glob.glob(args.txt)
    txt_files_map = tools.map_files(txt_files)

    # List of gold data
    ref_files = glob.glob(os.path.join(args.ref, wildcard))
    ref_files_map = tools.map_files(ref_files)

    # List of predictions
    pred_files = glob.glob(os.path.join(args.con, wildcard))
    pred_files_map = tools.map_files(pred_files)

    # Grouping of text, predictions, gold
    files = []
    for k in txt_files_map:
        if k in pred_files_map and k in ref_files_map:
            files.append((txt_files_map[k], pred_files_map[k], ref_files_map[k]))

    if args.do_concept:
        tag2id = {'problem': 0, 'test': 1, 'treatment': 2, 'none': 3}
    else:
        from documents import labels as tag2id

    # Compute the confusion matrix
    confusion = [[0] * len(tag2id) for _ in tag2id]

    # txt <- medical text; annotations <- predictions; gold <- gold standard
    for txt, annotations, gold in files:
        # Read predictions and gold standard data
        cnote = Document(txt, annotations)
        rnote = Document(txt, gold)

        predictions = cnote.conlist()
        gold_labels = rnote.conlist()

        for pline, gline in zip(predictions, gold_labels):
            for p, g in zip(pline, gline):
                if args.do_concept:
                    # Strip the IOB prefix ('B-'/'I-') for concept-level eval.
                    p = p[2:]
                    g = g[2:]
                if p == '':
                    p = 'none'
                if g == '':
                    g = 'none'
                confusion[tag2id[g]][tag2id[p]] += 1

    # Display the confusion matrix
    if args.do_concept:
        choice = 'CONCEPT'
    else:
        choice = '7-way'

    out = args.output
    print("", file=out)
    print("", file=out)
    print("", file=out)
    print("================", file=out)
    print("%s RESULTS" % choice, file=out)
    print("================", file=out)
    print("", file=out)
    print("Confusion Matrix", file=out)
    pad = max(len(l) for l in tag2id) + 6
    print("%s %s" % (' ' * pad, "\t".join([s[:5] for s in tag2id.keys()])), file=out)
    for act, act_v in tag2id.items():
        print("%s %s" % (act.rjust(pad),
                         "\t".join([str(confusion[act_v][pre_v])
                                    for pre, pre_v in tag2id.items()])), file=out)
    print("", file=out)

    # Compute the analysis stuff
    precision = []
    recall = []
    specificity = []
    f1 = []

    print("Analysis", file=out)
    print(" " * pad, "%10s%10s%10s" % ("Precision", "Recall", "F1"), file=out)

    for lab, lab_v in tag2id.items():
        tp = confusion[lab_v][lab_v]
        fp = sum(confusion[v][lab_v] for k, v in tag2id.items() if v != lab_v)
        fn = sum(confusion[lab_v][v] for k, v in tag2id.items() if v != lab_v)
        tn = sum(confusion[v1][v2] for k1, v1 in tag2id.items()
                 for k2, v2 in tag2id.items() if v1 != lab_v and v2 != lab_v)
        # The 1e-100 term guards against division by zero for empty classes.
        precision += [float(tp) / (tp + fp + 1e-100)]
        recall += [float(tp) / (tp + fn + 1e-100)]
        specificity += [float(tn) / (tn + fp + 1e-100)]
        f1 += [float(2 * tp) / (2 * tp + fp + fn + 1e-100)]
        print("%s %10.4f%10.4f%10.4f" % (lab.rjust(pad), precision[-1],
                                         recall[-1], f1[-1]), file=out)

    print("--------", file=out)
    # Macro-averages over all labels.
    precision = sum(precision) / len(precision)
    recall = sum(recall) / len(recall)
    specificity = sum(specificity) / len(specificity)
    f1 = sum(f1) / len(f1)
    print("Average: %.4f\t%.4f\t%.4f" % (precision, recall, f1), file=out)
def main():
    """Evaluate predictions against a gold standard via the i2b2 eval jar.

    Ported from Python 2: `print` statements and the removed `commands`
    module are replaced with `print()` and `subprocess.getstatusoutput`.
    """
    import subprocess
    import tempfile

    # Parse command line arguments
    parser = argparse.ArgumentParser(prog='cliner evaluate')
    parser.add_argument("--predictions", dest="pred",
                        help="Directory where predictions are stored.")
    parser.add_argument("--gold", dest="gold",
                        help="Directory where gold standard is stored.")
    parser.add_argument("--format", dest="format", help="Data format ( con ) ")
    parser.add_argument("--output", dest="output",
                        help="Write the evaluation to a file rather than STDOUT")
    args = parser.parse_args()

    if not args.pred:
        print('\n\tERROR: must provide --pred argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if not args.gold:
        print('\n\tERROR: must provide --gold argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if args.format:
        format = args.format
    else:
        print('\n\tERROR: must provide --format argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout

    # Must specify output format
    if format not in ['i2b2']:
        print('\n\tError: Must specify output format', file=sys.stderr)
        print('\tAvailable formats: con', file=sys.stderr)
        print('', file=sys.stderr)
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # Pair up gold and prediction files by their shared base name.
    ref_files = [os.path.join(args.gold, f) for f in os.listdir(args.gold)]
    pred_files = [os.path.join(args.pred, f) for f in os.listdir(args.pred)]
    ref_files_map = tools.map_files(ref_files)
    pred_files_map = tools.map_files(pred_files)

    files = []
    for k in ref_files_map:
        if k in pred_files_map:
            files.append((pred_files_map[k], ref_files_map[k]))
    # BUG FIX: tuples are (prediction, reference); the old unpacking named
    # them the other way around, swapping the jar's system/reference dirs.
    pred_list, gold_list = zip(*files)

    # create temporary directory for these files (mkdtemp is collision-free,
    # unlike the old '/tmp/cliner_eval_%d' % randint(0, 256) scheme)
    tempdir_name = tempfile.mkdtemp(prefix='cliner_eval_')
    pred_dir = os.path.join(tempdir_name, 'pred/')
    gold_dir = os.path.join(tempdir_name, 'gold/')
    os.mkdir(pred_dir)
    os.mkdir(gold_dir)

    try:
        # copy files
        for pred_file in pred_list:
            shutil.copy(pred_file, pred_dir)
        for gold_file in gold_list:
            shutil.copy(gold_file, gold_dir)

        # eval jar
        cliner_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        eval_dir = os.path.join(cliner_dir, 'tools')
        eval_jar = os.path.join(eval_dir, 'i2b2va-eval.jar')
        cmd = 'java -jar %s -rcp %s -scp %s -ft con -ex all' % (eval_jar, gold_dir, pred_dir)
        # subprocess.getstatusoutput replaces the py2-only commands module.
        status, output = subprocess.getstatusoutput(cmd)
        print(output)
    finally:
        # cleanup after yourself
        shutil.rmtree(tempdir_name)
def main():
    """CLI entry point for `cliner train` (Python 3 port).

    Validates arguments, pairs .txt files with .con annotations by base
    name, and trains an i2b2-format model.
    """
    # Parse arguments
    parser = argparse.ArgumentParser(prog='cliner train')
    parser.add_argument("--txt", dest="txt",
                        help=".txt files of discharge summaries")
    parser.add_argument("--annotations", dest="con",
                        help="concept files for annotations of the .txt files")
    parser.add_argument("--model", dest="model",
                        help="Path to the model that should be stored")
    parser.add_argument("--log", dest="log",
                        help="Path to the log file for training info",
                        default=os.path.join(CLINER_DIR, 'models', 'train.log'))
    parser.add_argument("--format", dest="format", help="Data format ( i2b2 )")

    # Parse the command line arguments
    args = parser.parse_args()

    # Error check: Ensure that file paths are specified
    # (converted from Python 2 `print >>` syntax to print(..., file=...))
    if not args.txt:
        print('\n\tError: Must provide text files', file=sys.stderr)
        print('', file=sys.stderr)
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if not args.con:
        print('\n\tError: Must provide annotations for text files', file=sys.stderr)
        print('', file=sys.stderr)
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if not args.model:
        print('\n\tError: Must provide valid path to store model', file=sys.stderr)
        print('', file=sys.stderr)
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    modeldir = os.path.dirname(args.model)
    if (not os.path.exists(modeldir)) and (modeldir != ''):
        print('\n\tError: ClinerModel dir does not exist: %s' % modeldir, file=sys.stderr)
        print('', file=sys.stderr)
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # A list of text and concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)

    # data format
    if not args.format:
        print('\n\tERROR: must provide "format" argument\n')
        exit()

    # Must specify output format
    if args.format not in ['i2b2']:
        print('\n\tError: Must specify output format', file=sys.stderr)
        print('\tAvailable formats: i2b2', file=sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # Collect training data file paths
    txt_files_map = tools.map_files(txt_files)
    con_files_map = tools.map_files(con_files)
    training_list = []
    for k in txt_files_map:
        if k in con_files_map:
            training_list.append((txt_files_map[k], con_files_map[k]))

    # Train the model
    train(training_list, args.model, args.format, logfile=args.log)
def main():
    """CLI entry point for training a CliNER model (Python 3 port).

    Pairs training text with annotations by base name and calls ``train``;
    the LSTM path is parsed but explicitly rejected as unsupported.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--txt", dest="txt",
                        help="The files that contain the training examples")
    parser.add_argument("--annotations", dest="con",
                        help="The files that contain the labels for the training examples")
    parser.add_argument("--model", dest="model",
                        help="Path to the model that should be generated")
    parser.add_argument("--log", dest="log",
                        help="Path to the log file for training info",
                        default=os.path.join(CLINER_DIR, 'models', 'train.log'))
    parser.add_argument("--use-lstm", dest="use_lstm",
                        help="Whether to use an LSTM model",
                        action='store_true', default=False)
    parser.add_argument("--format", dest="format", help="Data format ( i2b2 )")

    # Parse the command line arguments
    args = parser.parse_args()

    # Error check: Ensure that file paths are specified
    # (converted from Python 2 `print >>` syntax to print(..., file=...))
    if not args.txt:
        print('\n\tError: Must provide text files', file=sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if not args.con:
        print('\n\tError: Must provide annotations for text files', file=sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if not args.model:
        print('\n\tError: Must provide valid path to store model', file=sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    modeldir = os.path.dirname(args.model)
    if (not os.path.exists(modeldir)) and (modeldir != ''):
        print('\n\tError: Model dir does not exist: %s' % modeldir, file=sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if args.use_lstm:
        print('\n\t --use-lstm not supported yet', file=sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # A list of txt and concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)

    # Must specify output format
    if args.format not in ['i2b2']:
        print('\n\tError: Must specify output format', file=sys.stderr)
        print('\tAvailable formats: i2b2', file=sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # Collect training data file paths
    txt_files_map = tools.map_files(txt_files)
    con_files_map = tools.map_files(con_files)
    training_list = []
    for k in txt_files_map:
        if k in con_files_map:
            training_list.append((txt_files_map[k], con_files_map[k]))

    # Train the model
    train(training_list, args.model, args.format, args.use_lstm, logfile=args.log)