def main():
    """Print how often each concept label occurs in the test and train sets.

    For each data set, text files and reference concept files that share a
    key (as computed by ``helper.map_files``) are paired, the labels of
    every pair are read with ``read_con`` and the per-label totals are
    printed as ``<set name> <dict of counts>``.
    """
    # All data paths are relative to the directory that holds this script.
    script_dir = os.path.dirname(os.path.realpath(__file__))

    # set name -> (glob of text files, directory of concept files)
    data_paths = {
        'test': ('../data/test_data/*',
                 '../data/reference_standard_for_test_data/concepts/'),
        'train': ('../data/concept_assertion_relation_training_data/merged/txt/*',
                  '../data/concept_assertion_relation_training_data/merged/concept'),
    }

    for set_name, (txt_glob, ref_dir) in data_paths.items():
        args_txt = os.path.join(script_dir, txt_glob)
        args_ref = os.path.join(script_dir, ref_dir)

        txt_files = glob.glob(args_txt)
        ref_files = [os.path.join(args_ref, f) for f in os.listdir(args_ref)]

        txt_files_map = helper.map_files(txt_files)
        ref_files_map = helper.map_files(ref_files)

        # Keep only the text files that have a matching concept file
        files = [(txt_files_map[k], ref_files_map[k])
                 for k in txt_files_map if k in ref_files_map]

        labels = {}
        for txt_path, ref_path in files:
            text = read_txt(txt_path)
            for line_labels in read_con(ref_path, text):
                for label in line_labels:
                    labels[label] = labels.get(label, 0) + 1

        # Same output as the Python 2 statement `print set_name, labels`
        print("%s %s" % (set_name, labels))
def main():
    """Evaluate predicted labels against the gold standard labels.

    Prediction files and reference files that share a key (as computed by
    ``helper.map_files``) are paired, read as ``Note`` objects, and their
    label lists are concatenated and handed to ``evaluate``.

    Command line options:
        -t  glob of the prediction files (default: BASE_DIR/data/predictions/*)
        -r  directory of the reference files (default: BASE_DIR/data)
        -o  file to write the evaluation to instead of STDOUT
        -e  additionally run ``error_analysis``
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        dest="txt",
                        help="Files containing predictions",
                        default=os.path.join(BASE_DIR, 'data/predictions/*'))
    parser.add_argument("-r",
                        dest="ref",
                        help="The directory that contains reference gold "
                             "standard concept files",
                        default=os.path.join(BASE_DIR, 'data'))
    parser.add_argument("-o",
                        dest="output",
                        help="Write the evaluation to a file rather than STDOUT",
                        default=None)
    parser.add_argument("-e",
                        dest="error",
                        help="Do error analysis",
                        action='store_true')

    # Parse command line arguments
    args = parser.parse_args()

    # Evaluation goes to the file given with -o, otherwise to STDOUT
    out = open(args.output, "w") if args.output else sys.stdout
    try:
        txt_files_map = helper.map_files(glob.glob(args.txt))

        ref_files = [os.path.join(args.ref, f) for f in os.listdir(args.ref)]
        ref_files_map = helper.map_files(ref_files)

        # Keep only the predictions that have a matching reference file
        files = [(txt_files_map[k], ref_files_map[k])
                 for k in txt_files_map if k in ref_files_map]

        print(files)

        text = []          # collected for the error analysis
        pred_labels = []   # one flat list of all predicted labels
        gold_labels = []   # one flat list of all actual labels

        # txt <- predicted labels
        # ref <- actual labels
        for txt, ref in files:
            # A note that represents the model's predictions
            pnote = Note()
            pnote.read(txt)

            # A note that is the actual concept labels
            gnote = Note()
            gnote.read(ref)

            pred_labels += pnote.label_list()
            gold_labels += gnote.label_list()
            text += pnote.text_list()

        # Compute results
        evaluate(pred_labels, gold_labels, out=out)
    finally:
        # Never close STDOUT; only a file opened here
        if out is not sys.stdout:
            out.close()

    # Error analysis
    if args.error:
        print('\n\n\n')
        error_analysis(text, pred_labels, gold_labels)
def main():
    """Train a model from paired text and concept files.

    Text and concept files that share a key (as computed by
    ``helper.map_files``) are paired and the list of pairs is passed to
    ``train``.

    Command line options:
        -t      glob of text files (default: $CLINER_DIR/data/train/txt/*)
        -c      glob of concept files (default: $CLINER_DIR/data/train/con/*)
        -m      path of the model to generate
                (default: $CLINER_DIR/models/run.model)
        -f      data format, one of ``Note.supportedFormats()`` (default: i2b2)
        -g      perform a grid search
        -no-crf sets ``is_crf=False`` in the call to ``train``

    Exits with status 1 when the format is unsupported, or when CLINER_DIR
    is unset and one of -t/-c/-m was not given.
    """
    cliner_dir = os.getenv('CLINER_DIR')

    def default_path(relative):
        # Defaults are anchored at $CLINER_DIR. Without it there is no
        # default, rather than a crash inside os.path.join(None, ...).
        if cliner_dir is None:
            return None
        return os.path.join(cliner_dir, relative)

    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        dest="txt",
                        help="The files that contain the training examples",
                        default=default_path('data/train/txt/*'))
    parser.add_argument("-c",
                        dest="con",
                        help="The files that contain the labels for the "
                             "training examples",
                        default=default_path('data/train/con/*'))
    parser.add_argument("-m",
                        dest="model",
                        help="Path to the model that should be generated",
                        default=default_path('models/run.model'))
    parser.add_argument("-f",
                        dest="format",
                        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
                        default='i2b2')
    parser.add_argument("-g",
                        dest="grid",
                        help="A flag indicating whether to perform a grid search",
                        action="store_true")
    parser.add_argument("-no-crf",
                        dest="nocrf",
                        help="A flag indicating whether to use crfsuite for pass one.",
                        action="store_true")

    # Parse the command line arguments
    args = parser.parse_args()
    is_crf = not args.nocrf

    # A path can only be None when CLINER_DIR is unset and the flag is omitted
    unset = [flag
             for flag, value in (('-t', args.txt), ('-c', args.con), ('-m', args.model))
             if value is None]
    if unset:
        sys.stderr.write('\n\tError: CLINER_DIR is not set; must provide %s\n\n'
                         % ', '.join(unset))
        sys.exit(1)

    # Lists of text file paths and concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)

    # Must specify a supported format
    data_format = args.format
    if data_format not in Note.supportedFormats():
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats:  %s\n'
                         % ' | '.join(Note.supportedFormats()))
        sys.stderr.write('\n')
        sys.exit(1)

    # Collect training data file paths
    txt_files_map = helper.map_files(txt_files)  # ex. {'record-13': 'record-13.con'}
    con_files_map = helper.map_files(con_files)

    # ex. training_list = [ ('record-13.txt', 'record-13.con') ]
    training_list = [(txt_files_map[k], con_files_map[k])
                     for k in txt_files_map if k in con_files_map]

    # Display file names (for user to see data was properly located).
    # Same output as the Python 2 statement `print '\n', training_list, '\n'`
    print('\n%s \n' % (training_list,))

    # Train the model
    train(training_list, args.model, data_format, is_crf=is_crf, grid=args.grid)
def main():
    """Train SVM/LIN/CRF models from paired text and concept files.

    Text and concept files that share a key (as computed by
    ``helper.map_files``) are paired, read with ``read_txt``/``read_con``,
    and the accumulated data and labels are used to train a ``Model``.

    Command line options:
        -t        glob of text files
        -c        glob of concept files
        -m        path of the model to generate
        -d        features that should not be used
        -e        features that should be used (takes precedence over -d)
        --no-svm  leave ``libml.SVM`` out of the model type bit mask
        --no-lin  leave ``libml.LIN`` out of the model type bit mask
        --no-crf  leave ``libml.CRF`` out of the model type bit mask
    """
    # Default paths are relative to the directory that holds this script.
    script_dir = os.path.dirname(os.path.realpath(__file__))

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        dest="txt",
        help="The files that contain the training examples",
        default=os.path.join(
            script_dir,
            '../data/concept_assertion_relation_training_data/merged/txt/*'))
    parser.add_argument(
        "-c",
        dest="con",
        help="The files that contain the labels for the training examples",
        default=os.path.join(
            script_dir,
            '../data/concept_assertion_relation_training_data/merged/concept/*'))
    parser.add_argument(
        "-m",
        dest="model",
        help="Path to the model that should be generated",
        default=os.path.join(script_dir, '../models/awesome.model'))
    parser.add_argument(
        "-d",
        dest="disabled_features",
        help="The features that should not be used",
        nargs="+",
        default=None)
    parser.add_argument(
        "-e",
        dest="enabled_features",
        help="The features that should be used. This option trumps -d",
        nargs="+",
        default=None)
    parser.add_argument(
        "--no-svm",
        dest="no_svm",
        action="store_true",
        help="Disable SVM model generation")
    parser.add_argument(
        "--no-lin",
        dest="no_lin",
        action="store_true",
        help="Disable LIN model generation")
    parser.add_argument(
        "--no-crf",
        dest="no_crf",
        action="store_true",
        help="Disable CRF model generation")

    args = parser.parse_args()

    txt_files_map = helper.map_files(glob.glob(args.txt))
    con_files_map = helper.map_files(glob.glob(args.con))

    # Keep only the text files that have a matching concept file
    training_list = [(txt_files_map[k], con_files_map[k])
                     for k in txt_files_map if k in con_files_map]

    # Bit mask of the model kinds to generate
    model_type = 0
    if not args.no_svm:
        model_type |= libml.SVM
    if not args.no_lin:
        model_type |= libml.LIN
    if not args.no_crf:
        model_type |= libml.CRF

    # Get data and labels from files
    data = []
    labels = []
    for txt, con in training_list:
        datum = read_txt(txt)
        data += datum
        labels += read_con(con, datum)

    # Train a model on the data and labels
    model = Model(filename=args.model, type=model_type)

    # -e trumps -d, so the -d subtraction is only needed when -e is absent
    if args.enabled_features is not None:
        model.enabled_features = Set(args.enabled_features)
    elif args.disabled_features is not None:
        model.enabled_features = model.enabled_features - Set(
            args.disabled_features)

    model.train(data, labels)
def main():
    """Evaluate predicted concept spans against the gold standard.

    Text, prediction and gold files that share a key (as computed by
    ``helper.map_files``) are grouped. For every group the counts and the
    per-label matrices returned by ``evaluate`` are accumulated, once for
    exact and once for inexact span matching. The totals, the
    recall/precision/F scores and both matrices are then written to the
    output destination.

    Command line options:
        -t  glob of the text files (default: $CLINER_DIR/data/test_data/*)
        -c  directory of predicted concept files
            (default: $CLINER_DIR/data/predictions/)
        -r  directory of gold standard concept files (default:
            $CLINER_DIR/data/reference_standard_for_test_data/concepts/)
        -f  data format, one of ``Note.supportedFormats()`` (required)
        -o  file to write the evaluation to instead of STDOUT

    Exits with a non-zero status when the format is missing or
    unsupported, when CLINER_DIR is unset and one of -t/-c/-r was not
    given, or when there are no files to evaluate.
    """
    cliner_dir = os.getenv('CLINER_DIR')

    def default_path(relative):
        # Defaults are anchored at $CLINER_DIR. Without it there is no
        # default, rather than a crash inside os.path.join(None, ...).
        if cliner_dir is None:
            return None
        return os.path.join(cliner_dir, relative)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        dest="txt",
        help="Text files that were used to generate predictions",
        default=default_path('data/test_data/*'))
    parser.add_argument(
        "-c",
        dest="con",
        help="The directory that contains predicted concept files organized "
             "into subdirectories for svm, lin, srf",
        default=default_path('data/predictions/'))
    parser.add_argument(
        "-r",
        dest="ref",
        help="The directory that contains reference gold standard concept files",
        default=default_path('data/reference_standard_for_test_data/concepts/'))
    parser.add_argument(
        "-f",
        dest="format",
        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )")
    parser.add_argument(
        "-o",
        dest="output",
        help="Write the evaluation to a file rather than STDOUT",
        default=None)

    # Parse command line arguments
    args = parser.parse_args()

    # A path can only be None when CLINER_DIR is unset and the flag is omitted
    unset = [flag
             for flag, value in (('-t', args.txt), ('-c', args.con), ('-r', args.ref))
             if value is None]
    if unset:
        sys.stderr.write('\n\tError: CLINER_DIR is not set; must provide %s\n\n'
                         % ', '.join(unset))
        sys.exit(1)

    # Validate the format before the output file is opened, so a usage
    # error neither truncates nor leaks it.
    data_format = args.format
    if not data_format:
        sys.stderr.write('\n\tERROR: must provide "format" argument\n\n')
        sys.exit(1)

    if data_format not in Note.supportedFormats():
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats:  %s\n'
                         % ' | '.join(Note.supportedFormats()))
        sys.stderr.write('\n')
        sys.exit(1)

    # List of medical text
    txt_files_map = helper.map_files(glob.glob(args.txt))

    wildcard = '*.' + Note.dictOfFormatToExtensions()[data_format]

    # List of gold data
    ref_files_map = helper.map_files(glob.glob(os.path.join(args.ref, wildcard)))

    # List of predictions
    pred_files_map = helper.map_files(glob.glob(os.path.join(args.con, wildcard)))

    # Grouping of text, predictions, gold
    files = [(txt_files_map[k], pred_files_map[k], ref_files_map[k])
             for k in txt_files_map
             if k in pred_files_map and k in ref_files_map]

    if not files:
        sys.exit("No files to be evaluated")

    count_keys = ("True Positives", "False Negatives", "False Positives")
    exact_counts = dict.fromkeys(count_keys, 0)
    inexact_counts = dict.fromkeys(count_keys, 0)

    # NOTE(review): `labels` is not defined in this function; presumably a
    # module-level collection of the concept labels - confirm.
    confusion_exact = [[0] * len(labels) for _ in labels]
    confusion_inexact = [[0] * len(labels) for _ in labels]

    def add_matrix(total, part):
        # Element-wise `total += part` for two equally shaped nested lists.
        for total_row, part_row in zip(total, part):
            for i, value in enumerate(part_row):
                total_row[i] += value

    # txt         <- medical text
    # annotations <- predictions
    # gold        <- gold standard
    for txt, annotations, gold in files:
        # Read predictions and gold standard data
        cnote = Note(data_format)
        rnote = Note(data_format)
        cnote.read(txt, annotations)
        rnote.read(txt, gold)

        referenceSpans = getConceptSpans(rnote.getIOBLabels(), rnote.conlist())
        predictedSpans = getConceptSpans(cnote.getIOBLabels(), cnote.conlist())

        # evaluate() always receives copies so that both span collections
        # can be reused for the next call.
        exact_results = evaluate(deepcopy(referenceSpans),
                                 deepcopy(predictedSpans),
                                 exactMatch=True,
                                 reportSeperately=False)
        for key in count_keys:
            exact_counts[key] += exact_results[key]

        # NOTE(review): the original ran this inexact evaluation twice and
        # discarded the first result; a single call is kept on the
        # assumption that evaluate() only touches the copies it is handed.
        inexact_results = evaluate(deepcopy(referenceSpans),
                                   deepcopy(predictedSpans),
                                   exactMatch=False,
                                   reportSeperately=False)
        for key in count_keys:
            inexact_counts[key] += inexact_results[key]

        add_matrix(confusion_inexact,
                   evaluate(deepcopy(referenceSpans),
                            deepcopy(predictedSpans),
                            exactMatch=False,
                            reportSeperately=True))
        add_matrix(confusion_exact,
                   evaluate(deepcopy(referenceSpans),
                            deepcopy(predictedSpans),
                            exactMatch=True,
                            reportSeperately=True))

    # Evaluation goes to the file given with -o, otherwise to STDOUT
    out = open(args.output, "w") if args.output else sys.stdout
    try:
        def report(span_kind, counts):
            # Write the accumulated counts and the scores derived from
            # them. The text matches what the Python 2 print statements
            # produced (two spaces after each colon).
            scores = generateResultsForExactSpans(counts["True Positives"],
                                                  counts["False Negatives"],
                                                  counts["False Positives"])
            out.write("\nResults for %s span for concepts together.\n\n" % span_kind)
            out.write("True Positives:  %s\n" % counts["True Positives"])
            out.write("False Negatives:  %s\n" % counts["False Negatives"])
            out.write("False Positives:  %s\n" % counts["False Positives"])
            out.write("Recall:  %s\n" % scores["Recall"])
            out.write("Precision:  %s\n" % scores["Precision"])
            out.write("F Measure:  %s\n" % scores["F Score"])

        # The summary is written to the same destination as the matrices,
        # so -o captures the whole evaluation.
        report("exact", exact_counts)
        report("inexact", inexact_counts)

        # TODO: ensure number of FP, FN, TP is equal to number of predicted spans
        # TODO: number of FP, FN, TP is not same between exact and inexact
        displayMatrix(out, 'Exact', confusion_exact)
        displayMatrix(out, 'Inexact', confusion_inexact)
    finally:
        # Never close STDOUT; only a file opened here
        if out is not sys.stdout:
            out.close()

    return
def main():
    """Train a model from paired text and concept files.

    Text and concept files that share a key (as computed by
    ``helper.map_files``) are paired and the list of pairs is passed to
    ``train``.

    Command line options:
        -t                    glob of text files (required)
        -c                    glob of concept files (required)
        -m                    path of the model to generate (required)
        -f                    data format, one of ``Note.supportedFormats()``
                              (required)
        -g                    perform a grid search
        -no-crf               sets ``is_crf=False`` in the call to ``train``
        -discontiguous_spans  enable the third/clustering pass; needs the
                              PY4J_DIR_PATH environment variable and is
                              rejected for the i2b2 format
        -umls_disambiguation  passed to ``train`` as ``disambiguate``

    Every usage error is reported on stderr and ends the program with a
    non-zero exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        dest="txt",
        help="The files that contain the training examples")
    parser.add_argument(
        "-c",
        dest="con",
        help="The files that contain the labels for the training examples")
    parser.add_argument(
        "-m",
        dest="model",
        help="Path to the model that should be generated")
    parser.add_argument(
        "-f",
        dest="format",
        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )")
    parser.add_argument(
        "-g",
        dest="grid",
        help="A flag indicating whether to perform a grid search",
        action="store_true")
    parser.add_argument(
        "-no-crf",
        dest="nocrf",
        help="A flag indicating whether to use crfsuite for pass one.",
        action="store_true")
    parser.add_argument(
        "-discontiguous_spans",
        dest="third",
        help="A flag indicating whether to have third/clustering pass",
        action="store_true")
    parser.add_argument(
        "-umls_disambiguation",
        dest="umls_disambiguation",
        action="store_true",
        help="A flag indicating wheter to disambiguate CUI id for detected "
             "entities in semeval format")

    # Disabled option, kept for reference:
    # parser.add_argument("-unlabeled",
    #     dest = "unlabeled",
    #     help = "Path to dir containing unlabelled data used for unsupervised methods",
    # )

    # Parse the command line arguments
    args = parser.parse_args()
    is_crf = not args.nocrf
    third = args.third

    def fail(message):
        # Report a usage error on stderr and stop with a non-zero status.
        sys.stderr.write('\n\t%s\n\n' % message)
        sys.exit(1)

    # Error check: ensure that file paths are specified
    if not args.txt:
        fail('Error: Must provide text files')
    if not args.con:
        fail('Error: Must provide annotations for text files')
    if not args.model:
        fail('Error: Must provide valid path to store model')

    # An empty dirname means the model is written to the current directory
    modeldir = os.path.dirname(args.model)
    if modeldir != '' and not os.path.exists(modeldir):
        fail('Error: Model dir does not exist: %s' % modeldir)

    if third and "PY4J_DIR_PATH" not in os.environ:
        sys.exit("please set environ var PY4J_DIR_PATH to the dir of the "
                 "folder containg py4j<version>.jar")

    # Lists of text file paths and concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)

    # A missing format is an error like the ones above: stderr, status 1
    data_format = args.format
    if not data_format:
        fail('ERROR: must provide "format" argument')

    if third and data_format == "i2b2":
        sys.exit("i2b2 formatting does not support disjoint spans")

    # Must specify a supported format
    if data_format not in Note.supportedFormats():
        fail('Error: Must specify output format\n\tAvailable formats:  %s'
             % ' | '.join(Note.supportedFormats()))

    # Collect training data file paths
    txt_files_map = helper.map_files(txt_files)  # ex. {'record-13': 'record-13.con'}
    con_files_map = helper.map_files(con_files)

    # ex. training_list = [ ('record-13.txt', 'record-13.con') ]
    training_list = [(txt_files_map[k], con_files_map[k])
                     for k in txt_files_map if k in con_files_map]

    # Train the model
    train(training_list,
          args.model,
          data_format,
          is_crf=is_crf,
          grid=args.grid,
          third=third,
          disambiguate=args.umls_disambiguation)
def main():
    """Evaluate predicted labels against the gold standard labels.

    Prediction files and reference files that share a key (as computed by
    ``helper.map_files``) are paired, read as ``Note`` objects, and their
    label lists are concatenated and handed to ``evaluate``.

    Command line options:
        -t  glob of the prediction files (default: BASE_DIR/data/predictions/*)
        -r  directory of the reference files (default: BASE_DIR/data)
        -o  file to write the evaluation to instead of STDOUT
        -e  additionally run ``error_analysis``
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        dest="txt",
                        help="Files containing predictions",
                        default=os.path.join(BASE_DIR, 'data/predictions/*'))
    parser.add_argument("-r",
                        dest="ref",
                        help="The directory that contains reference gold "
                             "standard concept files",
                        default=os.path.join(BASE_DIR, 'data'))
    parser.add_argument("-o",
                        dest="output",
                        help="Write the evaluation to a file rather than STDOUT",
                        default=None)
    parser.add_argument("-e",
                        dest="error",
                        help="Do error analysis",
                        action='store_true')

    # Parse command line arguments
    args = parser.parse_args()

    # Evaluation goes to the file given with -o, otherwise to STDOUT
    out = open(args.output, "w") if args.output else sys.stdout
    try:
        txt_files_map = helper.map_files(glob.glob(args.txt))

        ref_files = [os.path.join(args.ref, f) for f in os.listdir(args.ref)]
        ref_files_map = helper.map_files(ref_files)

        # Keep only the predictions that have a matching reference file
        files = [(txt_files_map[k], ref_files_map[k])
                 for k in txt_files_map if k in ref_files_map]

        print(files)

        text = []          # collected for the error analysis
        pred_labels = []   # one flat list of all predicted labels
        gold_labels = []   # one flat list of all actual labels

        # txt <- predicted labels
        # ref <- actual labels
        for txt, ref in files:
            # A note that represents the model's predictions
            pnote = Note()
            pnote.read(txt)

            # A note that is the actual concept labels
            gnote = Note()
            gnote.read(ref)

            pred_labels += pnote.label_list()
            gold_labels += gnote.label_list()
            text += pnote.text_list()

        # Compute results
        evaluate(pred_labels, gold_labels, out=out)
    finally:
        # Never close STDOUT; only a file opened here
        if out is not sys.stdout:
            out.close()

    # Error analysis
    if args.error:
        print('\n\n\n')
        error_analysis(text, pred_labels, gold_labels)
def main():
    """Train SVM/LIN/CRF models from paired text and concept files.

    Text and concept files that share a key (as computed by
    ``helper.map_files``) are paired, read with ``read_txt``/``read_con``,
    and the accumulated data and labels are used to train a ``Model``.

    Command line options:
        -t        glob of text files
        -c        glob of concept files
        -m        path of the model to generate
        -d        features that should not be used
        -e        features that should be used (takes precedence over -d)
        --no-svm  leave ``libml.SVM`` out of the model type bit mask
        --no-lin  leave ``libml.LIN`` out of the model type bit mask
        --no-crf  leave ``libml.CRF`` out of the model type bit mask
    """
    # Default paths are relative to the directory that holds this script.
    script_dir = os.path.dirname(os.path.realpath(__file__))

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        dest="txt",
        help="The files that contain the training examples",
        default=os.path.join(
            script_dir,
            "../data/concept_assertion_relation_training_data/merged/txt/*"),
    )
    parser.add_argument(
        "-c",
        dest="con",
        help="The files that contain the labels for the training examples",
        default=os.path.join(
            script_dir,
            "../data/concept_assertion_relation_training_data/merged/concept/*"),
    )
    parser.add_argument(
        "-m",
        dest="model",
        help="Path to the model that should be generated",
        default=os.path.join(script_dir, "../models/awesome.model"),
    )
    parser.add_argument(
        "-d",
        dest="disabled_features",
        help="The features that should not be used",
        nargs="+",
        default=None,
    )
    parser.add_argument(
        "-e",
        dest="enabled_features",
        help="The features that should be used. This option trumps -d",
        nargs="+",
        default=None,
    )
    parser.add_argument("--no-svm", dest="no_svm", action="store_true",
                        help="Disable SVM model generation")
    parser.add_argument("--no-lin", dest="no_lin", action="store_true",
                        help="Disable LIN model generation")
    parser.add_argument("--no-crf", dest="no_crf", action="store_true",
                        help="Disable CRF model generation")

    args = parser.parse_args()

    txt_files_map = helper.map_files(glob.glob(args.txt))
    con_files_map = helper.map_files(glob.glob(args.con))

    # Keep only the text files that have a matching concept file
    training_list = [(txt_files_map[k], con_files_map[k])
                     for k in txt_files_map if k in con_files_map]

    # Bit mask of the model kinds to generate
    model_type = 0
    if not args.no_svm:
        model_type |= libml.SVM
    if not args.no_lin:
        model_type |= libml.LIN
    if not args.no_crf:
        model_type |= libml.CRF

    # Get data and labels from files
    data = []
    labels = []
    for txt, con in training_list:
        datum = read_txt(txt)
        data += datum
        labels += read_con(con, datum)

    # Train a model on the data and labels
    model = Model(filename=args.model, type=model_type)

    # -e trumps -d, so the -d subtraction is only needed when -e is absent
    if args.enabled_features is not None:
        model.enabled_features = Set(args.enabled_features)
    elif args.disabled_features is not None:
        model.enabled_features = model.enabled_features - Set(
            args.disabled_features)

    model.train(data, labels)
def main():
    """Evaluate predicted concept spans against the gold standard.

    Text, prediction and gold files that share a key (as computed by
    ``helper.map_files``) are grouped. For every group the counts and the
    per-label matrices returned by ``evaluate`` are accumulated, once for
    exact and once for inexact span matching. The totals, the
    recall/precision/F scores and both matrices are then written to the
    output destination.

    Command line options:
        -t  glob of the text files (default: $CLINER_DIR/data/test_data/*)
        -c  directory of predicted concept files
            (default: $CLINER_DIR/data/predictions/)
        -r  directory of gold standard concept files (default:
            $CLINER_DIR/data/reference_standard_for_test_data/concepts/)
        -f  data format, one of ``Note.supportedFormats()`` (default: i2b2)
        -o  file to write the evaluation to instead of STDOUT

    Exits with a non-zero status when the format is unsupported, when
    CLINER_DIR is unset and one of -t/-c/-r was not given, or when there
    are no files to evaluate.
    """
    cliner_dir = os.getenv('CLINER_DIR')

    def default_path(relative):
        # Defaults are anchored at $CLINER_DIR. Without it there is no
        # default, rather than a crash inside os.path.join(None, ...).
        if cliner_dir is None:
            return None
        return os.path.join(cliner_dir, relative)

    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        dest="txt",
                        help="Text files that were used to generate predictions",
                        default=default_path('data/test_data/*'))
    parser.add_argument("-c",
                        dest="con",
                        help="The directory that contains predicted concept "
                             "files organized into subdirectories for svm, lin, srf",
                        default=default_path('data/predictions/'))
    parser.add_argument("-r",
                        dest="ref",
                        help="The directory that contains reference gold "
                             "standard concept files",
                        default=default_path(
                            'data/reference_standard_for_test_data/concepts/'))
    parser.add_argument("-f",
                        dest="format",
                        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
                        default='i2b2')
    parser.add_argument("-o",
                        dest="output",
                        help="Write the evaluation to a file rather than STDOUT",
                        default=None)

    # Parse command line arguments
    args = parser.parse_args()

    # A path can only be None when CLINER_DIR is unset and the flag is omitted
    unset = [flag
             for flag, value in (('-t', args.txt), ('-c', args.con), ('-r', args.ref))
             if value is None]
    if unset:
        sys.stderr.write('\n\tError: CLINER_DIR is not set; must provide %s\n\n'
                         % ', '.join(unset))
        sys.exit(1)

    # Validate the format before the output file is opened, so a usage
    # error neither truncates nor leaks it.
    data_format = args.format
    if data_format not in Note.supportedFormats():
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats:  %s\n'
                         % ' | '.join(Note.supportedFormats()))
        sys.stderr.write('\n')
        sys.exit(1)

    # List of medical text
    txt_files_map = helper.map_files(glob.glob(args.txt))

    wildcard = '*.' + Note.dictOfFormatToExtensions()[data_format]

    # List of gold data
    ref_files_map = helper.map_files(glob.glob(os.path.join(args.ref, wildcard)))

    # List of predictions
    pred_files_map = helper.map_files(glob.glob(os.path.join(args.con, wildcard)))

    # Grouping of text, predictions, gold
    files = [(txt_files_map[k], pred_files_map[k], ref_files_map[k])
             for k in txt_files_map
             if k in pred_files_map and k in ref_files_map]

    # Without files every count is zero and there is nothing to report
    if not files:
        sys.exit("No files to be evaluated")

    count_keys = ("True Positives", "False Negatives", "False Positives")
    exact_counts = dict.fromkeys(count_keys, 0)
    inexact_counts = dict.fromkeys(count_keys, 0)

    # NOTE(review): `labels` is not defined in this function; presumably a
    # module-level collection of the concept labels - confirm.
    confusion_exact = [[0] * len(labels) for _ in labels]
    confusion_inexact = [[0] * len(labels) for _ in labels]

    def add_matrix(total, part):
        # Element-wise `total += part` for two equally shaped nested lists.
        for total_row, part_row in zip(total, part):
            for i, value in enumerate(part_row):
                total_row[i] += value

    # txt         <- medical text
    # annotations <- predictions
    # gold        <- gold standard
    for txt, annotations, gold in files:
        # Read predictions and gold standard data
        cnote = Note(data_format)
        rnote = Note(data_format)
        cnote.read(txt, annotations)
        rnote.read(txt, gold)

        referenceSpans = getConceptSpans(rnote.getIOBLabels(), rnote.conlist())
        predictedSpans = getConceptSpans(cnote.getIOBLabels(), cnote.conlist())

        # evaluate() always receives copies so that both span collections
        # can be reused for the next call.
        exact_results = evaluate(deepcopy(referenceSpans),
                                 deepcopy(predictedSpans),
                                 exactMatch=True,
                                 reportSeperately=False)
        for key in count_keys:
            exact_counts[key] += exact_results[key]

        # NOTE(review): the original ran this inexact evaluation twice and
        # discarded the first result; a single call is kept on the
        # assumption that evaluate() only touches the copies it is handed.
        inexact_results = evaluate(deepcopy(referenceSpans),
                                   deepcopy(predictedSpans),
                                   exactMatch=False,
                                   reportSeperately=False)
        for key in count_keys:
            inexact_counts[key] += inexact_results[key]

        add_matrix(confusion_inexact,
                   evaluate(deepcopy(referenceSpans),
                            deepcopy(predictedSpans),
                            exactMatch=False,
                            reportSeperately=True))
        add_matrix(confusion_exact,
                   evaluate(deepcopy(referenceSpans),
                            deepcopy(predictedSpans),
                            exactMatch=True,
                            reportSeperately=True))

    # Evaluation goes to the file given with -o, otherwise to STDOUT
    out = open(args.output, "w") if args.output else sys.stdout
    try:
        def report(span_kind, counts):
            # Write the accumulated counts and the scores derived from
            # them. The text matches what the Python 2 print statements
            # produced (two spaces after each colon).
            scores = generateResultsForExactSpans(counts["True Positives"],
                                                  counts["False Negatives"],
                                                  counts["False Positives"])
            out.write("\nResults for %s span for concepts together.\n\n" % span_kind)
            out.write("True Positives:  %s\n" % counts["True Positives"])
            out.write("False Negatives:  %s\n" % counts["False Negatives"])
            out.write("False Positives:  %s\n" % counts["False Positives"])
            out.write("Recall:  %s\n" % scores["Recall"])
            out.write("Precision:  %s\n" % scores["Precision"])
            out.write("F Measure:  %s\n" % scores["F Score"])

        # The summary is written to the same destination as the matrices,
        # so -o captures the whole evaluation.
        report("exact", exact_counts)
        report("inexact", inexact_counts)

        # TODO: ensure number of FP, FN, TP is equal to number of predicted spans
        # TODO: number of FP, FN, TP is not same between exact and inexact
        displayMatrix(out, 'Exact', confusion_exact)
        displayMatrix(out, 'Inexact', confusion_inexact)
    finally:
        # Never close STDOUT; only a file opened here
        if out is not sys.stdout:
            out.close()

    return
def main():
    """Train a model from paired text and concept files.

    Text and concept files that share a key (as computed by
    ``helper.map_files``) are paired and the list of pairs is passed to
    ``train``.

    Command line options:
        -t      glob of text files (default: $CLINER_DIR/data/train/txt/*)
        -c      glob of concept files (default: $CLINER_DIR/data/train/con/*)
        -m      path of the model to generate
                (default: $CLINER_DIR/models/run.model)
        -f      data format, one of ``Note.supportedFormats()`` (default: i2b2)
        -g      perform a grid search
        -no-crf sets ``is_crf=False`` in the call to ``train``

    Exits with status 1 when the format is unsupported, or when CLINER_DIR
    is unset and one of -t/-c/-m was not given.
    """
    cliner_dir = os.getenv('CLINER_DIR')

    def default_path(relative):
        # Defaults are anchored at $CLINER_DIR. Without it there is no
        # default, rather than a crash inside os.path.join(None, ...).
        if cliner_dir is None:
            return None
        return os.path.join(cliner_dir, relative)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        dest="txt",
        help="The files that contain the training examples",
        default=default_path('data/train/txt/*'))
    parser.add_argument(
        "-c",
        dest="con",
        help="The files that contain the labels for the training examples",
        default=default_path('data/train/con/*'))
    parser.add_argument(
        "-m",
        dest="model",
        help="Path to the model that should be generated",
        default=default_path('models/run.model'))
    parser.add_argument(
        "-f",
        dest="format",
        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
        default='i2b2')
    parser.add_argument(
        "-g",
        dest="grid",
        help="A flag indicating whether to perform a grid search",
        action="store_true")
    parser.add_argument(
        "-no-crf",
        dest="nocrf",
        help="A flag indicating whether to use crfsuite for pass one.",
        action="store_true")

    # Parse the command line arguments
    args = parser.parse_args()
    is_crf = not args.nocrf

    # A path can only be None when CLINER_DIR is unset and the flag is omitted
    unset = [flag
             for flag, value in (('-t', args.txt), ('-c', args.con), ('-m', args.model))
             if value is None]
    if unset:
        sys.stderr.write('\n\tError: CLINER_DIR is not set; must provide %s\n\n'
                         % ', '.join(unset))
        sys.exit(1)

    # Lists of text file paths and concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)

    # Must specify a supported format
    data_format = args.format
    if data_format not in Note.supportedFormats():
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats:  %s\n'
                         % ' | '.join(Note.supportedFormats()))
        sys.stderr.write('\n')
        sys.exit(1)

    # Collect training data file paths
    txt_files_map = helper.map_files(txt_files)  # ex. {'record-13': 'record-13.con'}
    con_files_map = helper.map_files(con_files)

    # ex. training_list = [ ('record-13.txt', 'record-13.con') ]
    training_list = [(txt_files_map[k], con_files_map[k])
                     for k in txt_files_map if k in con_files_map]

    # Display file names (for user to see data was properly located).
    # Same output as the Python 2 statement `print '\n', training_list, '\n'`
    print('\n%s \n' % (training_list,))

    # Train the model
    train(training_list, args.model, data_format, is_crf=is_crf, grid=args.grid)
def main():
    """Report a confusion matrix and per-label scores for each model kind.

    For every ``svm``, ``crf`` or ``lin`` subdirectory of the prediction
    directory, predicted and reference labels of the files that share a
    key (as computed by ``helper.map_files``) are compared token by token.
    The confusion matrix, the per-label precision, recall, specificity and
    F1, and their unweighted averages are written to the output.

    Command line options:
        -t  glob of the text files
        -c  directory holding the per-model prediction subdirectories
        -r  directory of the reference concept files
        -o  file to write the evaluation to instead of STDOUT
    """
    # Default paths are relative to the directory that holds this script.
    script_dir = os.path.dirname(os.path.realpath(__file__))

    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        dest="txt",
                        help="Test files that were used to generate predictions",
                        default=os.path.join(script_dir, '../data/test_data/*'))
    parser.add_argument("-c",
                        dest="con",
                        help="The directory that contains predicted concept "
                             "files organized into subdirectories for svm, lin, srf",
                        default=os.path.join(script_dir, '../data/test_predictions/'))
    parser.add_argument("-r",
                        dest="ref",
                        help="The directory that contains reference gold "
                             "standard concept files",
                        default=os.path.join(
                            script_dir,
                            '../data/reference_standard_for_test_data/concepts/'))
    parser.add_argument("-o",
                        dest="output",
                        help="Write the evaluation to a file rather than STDOUT",
                        default=None)

    args = parser.parse_args()

    # Evaluation goes to the file given with -o, otherwise to STDOUT
    out = open(args.output, "w") if args.output else sys.stdout

    def emit(line=""):
        # Write one line of the report.
        out.write(line + "\n")

    # Added to every denominator so that an all-zero count yields 0.0
    # instead of a ZeroDivisionError.
    eps = 1e-100

    try:
        txt_files_map = helper.map_files(glob.glob(args.txt))
        ref_files = [os.path.join(args.ref, f) for f in os.listdir(args.ref)]
        ref_files_map = helper.map_files(ref_files)

        # Maps a label name to its row/column index in the confusion matrix
        labels = Model.labels
        label_items = list(labels.items())
        indices = [index for _, index in label_items]
        pad = max(len(name) for name in labels) + 6

        for con_directory in os.listdir(args.con):
            directory_name = os.path.basename(con_directory)
            if directory_name not in ("svm", "crf", "lin"):
                continue

            con_path = os.path.join(args.con, con_directory)
            con_files = [os.path.join(con_path, f) for f in os.listdir(con_path)]
            con_files_map = helper.map_files(con_files)

            files = [(txt_files_map[k], con_files_map[k], ref_files_map[k])
                     for k in txt_files_map
                     if k in con_files_map and k in ref_files_map]

            # Compute the confusion matrix: rows are actual labels,
            # columns are predicted labels
            confusion = [[0] * len(labels) for _ in labels]
            for txt, con, ref in files:
                text = read_txt(txt)
                for pred_line, gold_line in zip(read_con(con, text),
                                                read_con(ref, text)):
                    for pred, gold in zip(pred_line, gold_line):
                        confusion[labels[gold]][labels[pred]] += 1

            # Display the confusion matrix
            emit()
            emit()
            emit()
            emit("================")
            emit(directory_name.upper() + " RESULTS")
            emit("================")
            emit()
            emit("Confusion Matrix")
            emit("%s %s" % (' ' * pad, "\t".join(labels.keys())))
            for actual, actual_idx in label_items:
                row = "\t".join(str(confusion[actual_idx][pred_idx])
                                for _, pred_idx in label_items)
                emit("%s %s" % (actual.rjust(pad), row))
            emit()

            # Compute the analysis stuff
            precision = []
            recall = []
            specificity = []
            f1 = []

            emit("Analysis")
            # Header names all four columns that each row prints
            emit("%s %s" % (" " * pad, "Precision\tRecall\tSpecificity\tF1"))

            for lab, lab_v in label_items:
                tp = confusion[lab_v][lab_v]
                fp = sum(confusion[v][lab_v] for v in indices if v != lab_v)
                fn = sum(confusion[lab_v][v] for v in indices if v != lab_v)
                tn = sum(confusion[v1][v2]
                         for v1 in indices for v2 in indices
                         if v1 != lab_v and v2 != lab_v)

                precision.append(float(tp) / (tp + fp + eps))
                recall.append(float(tp) / (tp + fn + eps))
                specificity.append(float(tn) / (tn + fp + eps))
                f1.append(float(2 * tp) / (2 * tp + fp + fn + eps))

                emit("%s %.4f\t%.4f\t%.4f\t%.4f" % (lab.rjust(pad),
                                                    precision[-1],
                                                    recall[-1],
                                                    specificity[-1],
                                                    f1[-1]))

            emit("--------")

            # Unweighted (macro) averages over the labels
            emit("Average: %.4f\t%.4f\t%.4f\t%.4f" % (
                sum(precision) / len(precision),
                sum(recall) / len(recall),
                sum(specificity) / len(specificity),
                sum(f1) / len(f1)))
    finally:
        # Never close STDOUT; only a file opened here
        if out is not sys.stdout:
            out.close()
def main():
    """Report a confusion matrix and per-label scores for each model kind.

    For every ``svm``, ``crf`` or ``lin`` subdirectory of the prediction
    directory, predicted and reference labels of the files that share a
    key (as computed by ``helper.map_files``) are compared token by token.
    The confusion matrix, the per-label precision, recall, specificity and
    F1, and their unweighted averages are written to the output.

    Command line options:
        -t  glob of the text files
        -c  directory holding the per-model prediction subdirectories
        -r  directory of the reference concept files
        -o  file to write the evaluation to instead of STDOUT
    """
    # Default paths are relative to the directory that holds this script.
    script_dir = os.path.dirname(os.path.realpath(__file__))

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        dest="txt",
        help="Test files that were used to generate predictions",
        default=os.path.join(script_dir, '../data/test_data/*'))
    parser.add_argument(
        "-c",
        dest="con",
        help="The directory that contains predicted concept files organized "
             "into subdirectories for svm, lin, srf",
        default=os.path.join(script_dir, '../data/test_predictions/'))
    parser.add_argument(
        "-r",
        dest="ref",
        help="The directory that contains reference gold standard concept files",
        default=os.path.join(
            script_dir,
            '../data/reference_standard_for_test_data/concepts/'))
    parser.add_argument(
        "-o",
        dest="output",
        help="Write the evaluation to a file rather than STDOUT",
        default=None)

    args = parser.parse_args()

    # Evaluation goes to the file given with -o, otherwise to STDOUT
    out = open(args.output, "w") if args.output else sys.stdout

    def emit(line=""):
        # Write one line of the report.
        out.write(line + "\n")

    # Added to every denominator so that an all-zero count yields 0.0
    # instead of a ZeroDivisionError.
    eps = 1e-100

    try:
        txt_files_map = helper.map_files(glob.glob(args.txt))
        ref_files = [os.path.join(args.ref, f) for f in os.listdir(args.ref)]
        ref_files_map = helper.map_files(ref_files)

        # Maps a label name to its row/column index in the confusion matrix
        labels = Model.labels
        label_items = list(labels.items())
        indices = [index for _, index in label_items]
        pad = max(len(name) for name in labels) + 6

        for con_directory in os.listdir(args.con):
            directory_name = os.path.basename(con_directory)
            if directory_name not in ("svm", "crf", "lin"):
                continue

            con_path = os.path.join(args.con, con_directory)
            con_files = [os.path.join(con_path, f) for f in os.listdir(con_path)]
            con_files_map = helper.map_files(con_files)

            files = [(txt_files_map[k], con_files_map[k], ref_files_map[k])
                     for k in txt_files_map
                     if k in con_files_map and k in ref_files_map]

            # Compute the confusion matrix: rows are actual labels,
            # columns are predicted labels
            confusion = [[0] * len(labels) for _ in labels]
            for txt, con, ref in files:
                text = read_txt(txt)
                for pred_line, gold_line in zip(read_con(con, text),
                                                read_con(ref, text)):
                    for pred, gold in zip(pred_line, gold_line):
                        confusion[labels[gold]][labels[pred]] += 1

            # Display the confusion matrix
            emit()
            emit()
            emit()
            emit("================")
            emit(directory_name.upper() + " RESULTS")
            emit("================")
            emit()
            emit("Confusion Matrix")
            emit("%s %s" % (' ' * pad, "\t".join(labels.keys())))
            for actual, actual_idx in label_items:
                row = "\t".join(str(confusion[actual_idx][pred_idx])
                                for _, pred_idx in label_items)
                emit("%s %s" % (actual.rjust(pad), row))
            emit()

            # Compute the analysis stuff
            precision = []
            recall = []
            specificity = []
            f1 = []

            emit("Analysis")
            # Header names all four columns that each row prints
            emit("%s %s" % (" " * pad, "Precision\tRecall\tSpecificity\tF1"))

            for lab, lab_v in label_items:
                tp = confusion[lab_v][lab_v]
                fp = sum(confusion[v][lab_v] for v in indices if v != lab_v)
                fn = sum(confusion[lab_v][v] for v in indices if v != lab_v)
                tn = sum(confusion[v1][v2]
                         for v1 in indices for v2 in indices
                         if v1 != lab_v and v2 != lab_v)

                precision.append(float(tp) / (tp + fp + eps))
                recall.append(float(tp) / (tp + fn + eps))
                specificity.append(float(tn) / (tn + fp + eps))
                f1.append(float(2 * tp) / (2 * tp + fp + fn + eps))

                emit("%s %.4f\t%.4f\t%.4f\t%.4f" % (lab.rjust(pad),
                                                    precision[-1],
                                                    recall[-1],
                                                    specificity[-1],
                                                    f1[-1]))

            emit("--------")

            # Unweighted (macro) averages over the labels
            emit("Average: %.4f\t%.4f\t%.4f\t%.4f" % (
                sum(precision) / len(precision),
                sum(recall) / len(recall),
                sum(specificity) / len(specificity),
                sum(f1) / len(f1)))
    finally:
        # Never close STDOUT; only a file opened here
        if out is not sys.stdout:
            out.close()