def train(training_list, model_path, format, is_crf=True, grid=False): # Read the data into a Note object notes = [] for txt, con in training_list: note_tmp = Note(format) # Create Note note_tmp.read(txt, con) # Read data into Note notes.append(note_tmp) # Add the Note to the list # file names if not notes: print 'Error: Cannot train on 0 files. Terminating train.' return 1 # Create a Machine Learning model model = Model(is_crf=is_crf) # Train the model using the Note's data model.train(notes, grid) # Pickle dump print 'pickle dump' with open(model_path, "wb") as m_file: pickle.dump(model, m_file)
def find_note(self, name): note = None aux = self.db.get_note(name) self.db.close_db() print type(aux), aux if aux: note = Note(aux[0][1], aux[0][3]) note.id = aux[0][0] note.creation_date = aux[0][2] return note return note
def train(training_list, model_path, format, is_crf=True, grid=False, third=False, disambiguate=False): """ train() Purpose: Train a model for given clinical data. @param training_list list of (txt,con) file path tuples (training instances) @param model_path string filename of where to pickle model object @param format concept file data format (ex. i2b2, semeval) @param is_crf whether first pass should use CRF classifier @param grid whether second pass should perform grid search @param third whether to perform third/clustering pass """ # Read the data into a Note object notes = [] for txt, con in training_list: note_tmp = Note(format) # Create Note note_tmp.read(txt, con) # Read data into Note notes.append(note_tmp) # Add the Note to the list # file names if not notes: print 'Error: Cannot train on 0 files. Terminating train.' return 1 # Create a Machine Learning model model = Model(is_crf=is_crf) # disambiguation if format == "semeval" and disambiguate is True and enabled["UMLS"] != None: model.set_cui_freq(cui_disambiguation.calcFreqOfCuis(training_list)) # Train the model using the Note's data model.train(notes, grid, do_third=third) # Pickle dump print '\nserializing model to %s\n' % model_path with open(model_path, "wb") as m_file: pickle.dump(model, m_file) # return trained model return model
def predict(files, model_path, output_dir, format): # Must specify output format if format not in Note.supportedFormats(): print >>sys.stderr, '\n\tError: Must specify output format' print >>sys.stderr, '\tAvailable formats: ', ' | '.join(Note.supportedFormats()) print >>sys.stderr, '' exit(1) # Load model model = Model.load(model_path) # Tell user if not predicting if not files: print >>sys.stderr, "\n\tNote: You did not supply any input files\n" exit() # For each file, predict concept labels n = len(files) for i,txt in enumerate(sorted(files)): # Read the data into a Note object note = Note(format) note.read(txt) print '-' * 30 print '\n\t%d of %d' % (i+1,n) print '\t', txt, '\n' # Predict concept labels labels = model.predict(note) # Get predictions in proper format extension = note.getExtension() output = note.write(labels) #print output # Output file fname = os.path.splitext(os.path.basename(txt))[0] + '.' + extension out_path = os.path.join(output_dir, fname) # Output the concept predictions print '\n\nwriting to: ', out_path with open(out_path, 'w') as f: print >>f, output print
def main():
    """Command-line entry point: parse arguments and run prediction."""
    parser = argparse.ArgumentParser()
    # NOTE: defaults assume the CLINER_DIR environment variable is set
    parser.add_argument("-i", dest="input",
                        help="The input files to predict",
                        default=os.path.join(os.getenv('CLINER_DIR'), 'data/test_data/*'))
    parser.add_argument("-o", dest="output",
                        help="The directory to write the output",
                        default=os.path.join(os.getenv('CLINER_DIR'), 'data/test_predictions'))
    parser.add_argument("-m", dest="model",
                        help="The model to use for prediction",
                        default=os.path.join(os.getenv('CLINER_DIR'), 'models/run.model'))
    parser.add_argument("-f", dest="format",
                        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
                        default='i2b2')
    parser.add_argument("-crf", dest="with_crf",
                        help="Specify where to find crfsuite",
                        default=None)
    args = parser.parse_args()

    # Expand the input glob and make sure the output directory exists
    input_files = glob.glob(args.input)
    helper.mkpath(args.output)

    # Predict
    predict(input_files, args.model, args.output, format=args.format)
def main(): parser = argparse.ArgumentParser() parser.add_argument("-t", dest = "txt", help = "The files that contain the training examples", default = os.path.join(os.getenv('CLINER_DIR'), 'data/train/txt/*') ) parser.add_argument("-c", dest = "con", help = "The files that contain the labels for the training examples", default = os.path.join(os.getenv('CLINER_DIR'), 'data/train/con/*') ) parser.add_argument("-m", dest = "model", help = "Path to the model that should be generated", default = os.path.join(os.getenv('CLINER_DIR'), 'models/run.model') ) parser.add_argument("-f", dest = "format", help = "Data format ( " + ' | '.join(Note.supportedFormats()) + " )", default = 'i2b2' ) parser.add_argument("-g", dest = "grid", help = "A flag indicating whether to perform a grid search", action = "store_true" ) parser.add_argument("-no-crf", dest = "nocrf", help = "A flag indicating whether to use crfsuite for pass one.", action = "store_true" ) # Parse the command line arguments args = parser.parse_args() is_crf = not args.nocrf # A list of text file paths # A list of concept file paths txt_files = glob.glob(args.txt) con_files = glob.glob(args.con) # data format format = args.format # Must specify output format if format not in Note.supportedFormats(): print >>sys.stderr, '\n\tError: Must specify output format' print >>sys.stderr, '\tAvailable formats: ', ' | '.join(Note.supportedFormats()) print >>sys.stderr, '' exit(1) # Collect training data file paths txt_files_map = helper.map_files(txt_files) # ex. {'record-13': 'record-13.con'} con_files_map = helper.map_files(con_files) training_list = [] # ex. training_list = [ ('record-13.txt', 'record-13.con') ] for k in txt_files_map: if k in con_files_map: training_list.append((txt_files_map[k], con_files_map[k])) # display file names (for user to see data was properly located) print '\n', training_list, '\n' # Train the model train(training_list, args.model, format, is_crf=is_crf, grid=args.grid)
def main():
    """
    Convert an annotation file from its detected input format to a
    requested output format, going through the internal standard format.

    Reads -txt (source text) and -annotations (labels), writes converted
    annotations to -out (or stdout when -out is omitted).
    """
    # Argument Parser
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-txt",
        dest="txt",
        help="The files that contain the training examples",
    )
    parser.add_argument(
        "-annotations",
        dest="annotations",
        help="The files that contain the labels for the training examples",
    )
    parser.add_argument(
        "-out",
        dest="out",
        default=None,
        help="Directory to output data",
    )
    parser.add_argument(
        "-format",
        dest="format",
        help="Output format (%s)" % str(' or '.join(Note.supportedFormats())),
    )

    # Parse the command line arguments
    args = parser.parse_args()
    txt = args.txt
    annotations = args.annotations
    out_file = args.out
    format = args.format

    # Ensure text file is specified and exists
    if not txt:
        print >> sys.stderr, '\n\tError: Must supply text file'
        print >> sys.stderr
        exit(1)
    elif not os.path.exists(txt):
        print >> sys.stderr, '\n\tError: Given text file does not exist'
        print >> sys.stderr
        exit(1)

    # Ensure annotations are specified and exist
    extensions = Note.supportedFormatExtensions()
    if not annotations:
        print >> sys.stderr, '\n\tError: Must supply annotations'
        print >> sys.stderr
        exit(2)
    # BUG FIX: previously tested os.path.exists(txt) here, so a missing
    # annotation file was never detected
    elif not os.path.exists(annotations):
        print >> sys.stderr, '\n\tError: Given annotation file does not exist'
        print >> sys.stderr
        exit(2)
    elif os.path.splitext(annotations)[1][1:] not in extensions:
        print >> sys.stderr, '\n\tError: annotation must be a supported format'
        print >> sys.stderr, '\t\t(.%s)' % str(' or .'.join(extensions))
        print >> sys.stderr
        exit(2)

    # Ensure output format is specified
    if (not format) or (format not in Note.supportedFormats()):
        print >> sys.stderr, '\n\tError: Must specify supported output format'
        print >> sys.stderr, '\t\t(%s)' % str(' or '.join(
            Note.supportedFormats()))
        print >> sys.stderr
        exit(3)

    # Automatically find the input file format from the annotation extension
    # (guaranteed to match: the extension was validated against
    #  supportedFormatExtensions() above)
    in_extension = os.path.splitext(annotations)[1][1:]
    for f, ext in Note.dictOfFormatToExtensions().items():
        if ext == in_extension:
            in_format = f

    # Read input data into note object
    in_note = Note(in_format)
    in_note.read(txt, annotations)

    # Convert data to standard format and stage it in a temp file
    internal_output = in_note.write_standard()
    os_handle, tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="format_temp")
    with open(tmp_file, 'w') as f:
        f.write(internal_output)
    os.close(os_handle)

    # Read internal standard data into new file with given output format
    out_note = Note(format)
    out_note.read_standard(txt, tmp_file)

    # Output data (file if requested, stdout otherwise)
    out = out_note.write()
    if out_file:
        with open(out_file, 'w') as out_f:
            out_f.write(out)
    else:
        sys.stdout.write(out)

    # Clean up the temp file
    # (the redundant out_f.close() was removed: the `with` block above
    #  already closed the handle)
    os.remove(tmp_file)
def main(): parser = argparse.ArgumentParser() parser.add_argument( "-t", help="Text files that were used to generate predictions", dest="txt", default=os.path.join(os.getenv('CLINER_DIR'), 'data/test_data/*')) parser.add_argument( "-c", help= "The directory that contains predicted concept files organized into subdirectories for svm, lin, srf", dest="con", default=os.path.join(os.getenv('CLINER_DIR'), 'data/predictions/')) parser.add_argument( "-r", help= "The directory that contains reference gold standard concept files", dest="ref", default=os.path.join( os.getenv('CLINER_DIR'), 'data/reference_standard_for_test_data/concepts/')) parser.add_argument( "-f", dest="format", help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )", ) parser.add_argument( "-o", help="Write the evaluation to a file rather than STDOUT", dest="output", default=None) # Parse command line arguments args = parser.parse_args() if args.format: format = args.format else: print '\n\tERROR: must provide "format" argument\n' exit() # Is output destination specified? if args.output: args.output = open(args.output, "w") else: args.output = sys.stdout # Must specify output format if format not in Note.supportedFormats(): print >> sys.stderr, '\n\tError: Must specify output format' print >> sys.stderr, '\tAvailable formats: ', ' | '.join( Note.supportedFormats()) print >> sys.stderr, '' exit(1) # List of medical text txt_files = glob.glob(args.txt) txt_files_map = helper.map_files(txt_files) wildcard = '*.' 
+ Note.dictOfFormatToExtensions()[format] # List of gold data ref_files = glob.glob(os.path.join(args.ref, wildcard)) ref_files_map = helper.map_files(ref_files) # List of predictions pred_files = glob.glob(os.path.join(args.con, wildcard)) pred_files_map = helper.map_files(pred_files) # Grouping of text, predictions, gold files = [] for k in txt_files_map: if k in pred_files_map and k in ref_files_map: files.append( (txt_files_map[k], pred_files_map[k], ref_files_map[k])) # txt <- medical text # annotations <- predictions # gold <- gold standard truePositivesExactSpan = 0 falseNegativesExactSpan = 0 falsePositivesExactSpan = 0 truePositivesInexactSpan = 0 falseNegativesInexactSpan = 0 falsePositivesInexactSpan = 0 confusion = [[0] * len(labels) for e in labels] confusionMatrixExactSpan = deepcopy(confusion) confusionMatrixInexactSpan = deepcopy(confusion) if len(files) == 0: exit("No files to be evaluated") for txt, annotations, gold in files: # Read predictions and gols standard data cnote = Note(format) rnote = Note(format) cnote.read(txt, annotations) rnote.read(txt, gold) referenceSpans = getConceptSpans(rnote.getIOBLabels(), rnote.conlist()) predictedSpans = getConceptSpans(cnote.getIOBLabels(), cnote.conlist()) #TO DO: i need to generate a cumulative total accross all of the files #modify my functions slightly and have it return the number of true positive and etc... 
#then call generate results exactResults = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=True, reportSeperately=False) inexactResults = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=False) truePositivesExactSpan += exactResults["True Positives"] falseNegativesExactSpan += exactResults["False Negatives"] falsePositivesExactSpan += exactResults["False Positives"] inexactResults = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=False) truePositivesInexactSpan += inexactResults["True Positives"] falseNegativesInexactSpan += inexactResults["False Negatives"] falsePositivesInexactSpan += inexactResults["False Positives"] MatrixInexactSpan = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True) for sublist1, sublist2 in zip(confusionMatrixInexactSpan, MatrixInexactSpan): for i, int2 in enumerate(sublist2): sublist1[i] += int2 MatrixExactSpan = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=True, reportSeperately=True) for sublist1, sublist2 in zip(confusionMatrixExactSpan, MatrixExactSpan): for i, int2 in enumerate(sublist2): sublist1[i] += int2 print "\nResults for exact span for concepts together.\n" print "True Positives: ", truePositivesExactSpan print "False Negatives: ", falseNegativesExactSpan print "False Positives: ", falsePositivesExactSpan exactSpan = generateResultsForExactSpans(truePositivesExactSpan, falseNegativesExactSpan, falsePositivesExactSpan) print "Recall: ", exactSpan["Recall"] print "Precision: ", exactSpan["Precision"] print "F Measure: ", exactSpan["F Score"] inexactSpan = generateResultsForExactSpans(truePositivesInexactSpan, falseNegativesInexactSpan, falsePositivesInexactSpan) print "\nResults for inexact span for concepts together.\n" print "True Positives: ", truePositivesInexactSpan print "False Negatives: ", falseNegativesInexactSpan print "False 
Positives: ", falsePositivesInexactSpan print "Recall: ", inexactSpan["Recall"] print "Precision: ", inexactSpan["Precision"] print "F Measure: ", inexactSpan["F Score"] #TO DO: ENSURE NUMBER OF FP,FN,TP is equal to number of predicted spans #TO DO: number of FP, FN, TP is not same between exact and inexact. #LEFT OFF HERE. FIX DISPLAY FUNCTION displayMatrix(args.output, 'Exact', confusionMatrixExactSpan) displayMatrix(args.output, 'Inexact', confusionMatrixInexactSpan) #print evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True) return
def main():
    """
    Command-line entry point for training the extended CliNER model
    (CRF toggle, grid search, third/clustering pass, and optional
    UMLS-based CUI disambiguation for semeval data).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        dest="txt",
        help="The files that contain the training examples",
    )
    parser.add_argument(
        "-c",
        dest="con",
        help="The files that contain the labels for the training examples",
    )
    parser.add_argument(
        "-m",
        dest="model",
        help="Path to the model that should be generated",
    )
    parser.add_argument(
        "-f",
        dest="format",
        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
    )
    parser.add_argument(
        "-g",
        dest="grid",
        help="A flag indicating whether to perform a grid search",
        action="store_true")
    parser.add_argument(
        "-no-crf",
        dest="nocrf",
        help="A flag indicating whether to use crfsuite for pass one.",
        action="store_true")
    parser.add_argument(
        "-discontiguous_spans",
        dest="third",
        help="A flag indicating whether to have third/clustering pass",
        action="store_true")
    parser.add_argument(
        "-umls_disambiguation",
        dest="umls_disambiguation",
        action="store_true",
        help=
        "A flag indicating wheter to disambiguate CUI id for detected entities in semeval format",
    )

    # (disabled option kept for reference)
    """
    parser.add_argument("-unlabeled",
        dest = "unlabeled",
        help = "Path to dir containing unlabelled data used for unsupervised methods",
    )
    """

    # Parse the command line arguments
    args = parser.parse_args()
    # -no-crf disables the CRF classifier for pass one
    is_crf = not args.nocrf
    third = args.third

    # Error check: Ensure that file paths are specified
    if not args.txt:
        print >> sys.stderr, '\n\tError: Must provide text files'
        print >> sys.stderr, ''
        exit(1)
    if not args.con:
        print >> sys.stderr, '\n\tError: Must provide annotations for text files'
        print >> sys.stderr, ''
        exit(1)
    if not args.model:
        print >> sys.stderr, '\n\tError: Must provide valid path to store model'
        print >> sys.stderr, ''
        exit(1)
    # The directory holding the model file must already exist
    modeldir = os.path.dirname(args.model)
    if (not os.path.exists(modeldir)) and (modeldir != ''):
        print >> sys.stderr, '\n\tError: Model dir does not exist: %s' % modeldir
        print >> sys.stderr, ''
        exit(1)

    # The third/clustering pass depends on a py4j JVM bridge
    if "PY4J_DIR_PATH" not in os.environ and args.third is True:
        exit(
            "please set environ var PY4J_DIR_PATH to the dir of the folder containg py4j<version>.jar"
        )

    # A list of text file paths
    # A list of concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)

    # data format
    if args.format:
        format = args.format
    else:
        print '\n\tERROR: must provide "format" argument\n'
        exit()

    # i2b2 annotations cannot represent discontiguous (disjoint) spans
    if third is True and args.format == "i2b2":
        exit("i2b2 formatting does not support disjoint spans")

    # Must specify output format
    if format not in Note.supportedFormats():
        print >> sys.stderr, '\n\tError: Must specify output format'
        print >> sys.stderr, '\tAvailable formats: ', ' | '.join(
            Note.supportedFormats())
        print >> sys.stderr, ''
        exit(1)

    # Collect training data file paths, keyed by shared basename
    txt_files_map = helper.map_files(
        txt_files)  # ex. {'record-13': 'record-13.con'}
    con_files_map = helper.map_files(con_files)

    training_list = [
    ]  # ex. training_list = [ ('record-13.txt', 'record-13.con') ]

    for k in txt_files_map:
        if k in con_files_map:
            training_list.append((txt_files_map[k], con_files_map[k]))

    # Train the model
    train(training_list,
          args.model,
          format,
          is_crf=is_crf,
          grid=args.grid,
          third=third,
          disambiguate=args.umls_disambiguation)
def main(): parser = argparse.ArgumentParser() parser.add_argument("-t", help = "Text files that were used to generate predictions", dest = "txt", default = os.path.join(os.getenv('CLINER_DIR'), 'data/test_data/*') ) parser.add_argument("-c", help = "The directory that contains predicted concept files organized into subdirectories for svm, lin, srf", dest = "con", default = os.path.join(os.getenv('CLINER_DIR'), 'data/predictions/') ) parser.add_argument("-r", help = "The directory that contains reference gold standard concept files", dest = "ref", default = os.path.join(os.getenv('CLINER_DIR'), 'data/reference_standard_for_test_data/concepts/') ) parser.add_argument("-f", dest = "format", help = "Data format ( " + ' | '.join(Note.supportedFormats()) + " )", default = 'i2b2' ) parser.add_argument("-o", help = "Write the evaluation to a file rather than STDOUT", dest = "output", default = None ) # Parse command line arguments args = parser.parse_args() format = args.format # Is output destination specified? if args.output: args.output = open(args.output, "w") else: args.output = sys.stdout # Must specify output format if format not in Note.supportedFormats(): print >>sys.stderr, '\n\tError: Must specify output format' print >>sys.stderr, '\tAvailable formats: ', ' | '.join(Note.supportedFormats()) print >>sys.stderr, '' exit(1) # List of medical text txt_files = glob.glob(args.txt) txt_files_map = helper.map_files(txt_files) wildcard = '*.' 
+ Note.dictOfFormatToExtensions()[format] # List of gold data ref_files = glob.glob( os.path.join(args.ref, wildcard) ) ref_files_map = helper.map_files(ref_files) # List of predictions pred_files = glob.glob( os.path.join(args.con, wildcard) ) pred_files_map = helper.map_files(pred_files) # Grouping of text, predictions, gold files = [] for k in txt_files_map: if k in pred_files_map and k in ref_files_map: files.append((txt_files_map[k], pred_files_map[k], ref_files_map[k])) # txt <- medical text # annotations <- predictions # gold <- gold standard truePositivesExactSpan = 0 falseNegativesExactSpan = 0 falsePositivesExactSpan = 0 truePositivesInexactSpan = 0 falseNegativesInexactSpan = 0 falsePositivesInexactSpan = 0 confusion = [[0] * len(labels) for e in labels] confusionMatrixExactSpan = deepcopy(confusion) confusionMatrixInexactSpan = deepcopy(confusion) for txt, annotations, gold in files: # Read predictions and gols standard data cnote = Note(format) rnote = Note(format) cnote.read(txt, annotations) rnote.read(txt, gold) referenceSpans = getConceptSpans(rnote.getIOBLabels(), rnote.conlist()) predictedSpans = getConceptSpans(cnote.getIOBLabels(), cnote.conlist()) #TO DO: i need to generate a cumulative total accross all of the files #modify my functions slightly and have it return the number of true positive and etc... 
#then call generate results exactResults = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=True, reportSeperately=False) inexactResults = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=False) truePositivesExactSpan += exactResults["True Positives"] falseNegativesExactSpan += exactResults["False Negatives"] falsePositivesExactSpan += exactResults["False Positives"] inexactResults = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=False) truePositivesInexactSpan += inexactResults["True Positives"] falseNegativesInexactSpan += inexactResults["False Negatives"] falsePositivesInexactSpan += inexactResults["False Positives"] MatrixInexactSpan = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True) for sublist1, sublist2 in zip(confusionMatrixInexactSpan, MatrixInexactSpan): for i,int2 in enumerate(sublist2): sublist1[i] += int2 MatrixExactSpan = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=True, reportSeperately=True) for sublist1, sublist2 in zip(confusionMatrixExactSpan, MatrixExactSpan): for i,int2 in enumerate(sublist2): sublist1[i] += int2 print "\nResults for exact span for concepts together.\n" print "True Positives: ", truePositivesExactSpan print "False Negatives: ", falseNegativesExactSpan print "False Positives: ", falsePositivesExactSpan exactSpan = generateResultsForExactSpans(truePositivesExactSpan, falseNegativesExactSpan, falsePositivesExactSpan) print "Recall: ", exactSpan["Recall"] print "Precision: ", exactSpan["Precision"] print "F Measure: ", exactSpan["F Score"] inexactSpan = generateResultsForExactSpans(truePositivesInexactSpan, falseNegativesInexactSpan, falsePositivesInexactSpan) print "\nResults for inexact span for concepts together.\n" print "True Positives: ", truePositivesInexactSpan print "False Negatives: ", falseNegativesInexactSpan print "False 
Positives: ", falsePositivesInexactSpan print "Recall: ", inexactSpan["Recall"] print "Precision: ", inexactSpan["Precision"] print "F Measure: ", inexactSpan["F Score"] #TO DO: ENSURE NUMBER OF FP,FN,TP is equal to number of predicted spans #TO DO: number of FP, FN, TP is not same between exact and inexact. #LEFT OFF HERE. FIX DISPLAY FUNCTION displayMatrix(args.output, 'Exact' , confusionMatrixExactSpan) displayMatrix(args.output, 'Inexact', confusionMatrixInexactSpan) #print evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True) return
def main(): parser = argparse.ArgumentParser() parser.add_argument("-t", dest="txt", help="The files that contain the training examples", default=os.path.join(os.getenv('CLINER_DIR'), 'data/train/txt/*')) parser.add_argument( "-c", dest="con", help="The files that contain the labels for the training examples", default=os.path.join(os.getenv('CLINER_DIR'), 'data/train/con/*')) parser.add_argument("-m", dest="model", help="Path to the model that should be generated", default=os.path.join(os.getenv('CLINER_DIR'), 'models/run.model')) parser.add_argument("-f", dest="format", help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )", default='i2b2') parser.add_argument( "-g", dest="grid", help="A flag indicating whether to perform a grid search", action="store_true") parser.add_argument( "-no-crf", dest="nocrf", help="A flag indicating whether to use crfsuite for pass one.", action="store_true") # Parse the command line arguments args = parser.parse_args() is_crf = not args.nocrf # A list of text file paths # A list of concept file paths txt_files = glob.glob(args.txt) con_files = glob.glob(args.con) # data format format = args.format # Must specify output format if format not in Note.supportedFormats(): print >> sys.stderr, '\n\tError: Must specify output format' print >> sys.stderr, '\tAvailable formats: ', ' | '.join( Note.supportedFormats()) print >> sys.stderr, '' exit(1) # Collect training data file paths txt_files_map = helper.map_files( txt_files) # ex. {'record-13': 'record-13.con'} con_files_map = helper.map_files(con_files) training_list = [ ] # ex. training_list = [ ('record-13.txt', 'record-13.con') ] for k in txt_files_map: if k in con_files_map: training_list.append((txt_files_map[k], con_files_map[k])) # display file names (for user to see data was properly located) print '\n', training_list, '\n' # Train the model train(training_list, args.model, format, is_crf=is_crf, grid=args.grid)
def main():
    """
    Convert an annotation file from its detected input format to a
    requested output format, going through the internal standard format.

    Reads -t (source text) and -a (labels), writes converted annotations
    to -o (or stdout when -o is omitted).
    """
    # Argument Parser
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
        dest = "txt",
        help = "The files that contain the training examples",
    )
    parser.add_argument("-a",
        dest = "annotations",
        help = "The files that contain the labels for the training examples",
    )
    parser.add_argument("-o",
        dest = "out",
        default = None,
        help = "Directory to output data",
    )
    parser.add_argument("-f",
        dest = "format",
        help = "Output format (%s)"%str(' or '.join(Note.supportedFormats())),
    )

    # Parse the command line arguments
    args = parser.parse_args()
    txt = args.txt
    annotations = args.annotations
    out_file = args.out
    format = args.format

    # Ensure text file is specified and exists
    if not txt:
        print >>sys.stderr, '\n\tError: Must supply text file'
        print >>sys.stderr
        exit(1)
    elif not os.path.exists(txt):
        print >>sys.stderr, '\n\tError: Given text file does not exist'
        print >>sys.stderr
        exit(1)

    # Ensure annotations are specified and exist
    extensions = Note.supportedFormatExtensions()
    if not annotations:
        print >>sys.stderr, '\n\tError: Must supply annotations'
        print >>sys.stderr
        exit(2)
    # BUG FIX: previously tested os.path.exists(txt) here, so a missing
    # annotation file was never detected
    elif not os.path.exists(annotations):
        print >>sys.stderr, '\n\tError: Given annotation file does not exist'
        print >>sys.stderr
        exit(2)
    elif os.path.splitext(annotations)[1][1:] not in extensions:
        print >>sys.stderr, '\n\tError: annotation must be a supported format'
        print >>sys.stderr, '\t\t(.%s)' %str(' or .'.join(extensions) )
        print >>sys.stderr
        exit(2)

    # Ensure output format is specified
    if (not format) or (format not in Note.supportedFormats()):
        print >>sys.stderr, '\n\tError: Must specify supported output format'
        print >>sys.stderr, '\t\t(%s)' %str(' or '.join(Note.supportedFormats()))
        print >>sys.stderr
        exit(3)

    # Automatically find the input file format from the annotation extension
    # (guaranteed to match: the extension was validated against
    #  supportedFormatExtensions() above)
    in_extension = os.path.splitext(annotations)[1][1:]
    for f,ext in Note.dictOfFormatToExtensions().items():
        if ext == in_extension:
            in_format = f

    # Read input data into note object
    in_note = Note(in_format)
    in_note.read(txt,annotations)

    # Convert data to standard format and stage it in a temp file
    internal_output = in_note.write_standard()
    os_handle,tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="format_temp")
    with open(tmp_file, 'w') as f:
        f.write(internal_output)
    os.close(os_handle)

    # Read internal standard data into new file with given output format
    out_note = Note(format)
    out_note.read_standard(txt,tmp_file)

    # Output data (file if requested, stdout otherwise)
    out = out_note.write()
    if out_file:
        with open(out_file, 'w') as out_f:
            out_f.write(out)
    else:
        sys.stdout.write(out)

    # Clean up the temp file
    # (the redundant out_f.close() was removed: the `with` block above
    #  already closed the handle)
    os.remove(tmp_file)
def main(): parser = argparse.ArgumentParser() parser.add_argument( "-i", dest="input", help="The input files to predict", ) parser.add_argument( "-o", dest="output", help="The directory to write the output", ) parser.add_argument( "-m", dest="model", help="The model to use for prediction", ) parser.add_argument( "-f", dest="format", help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )", ) parser.add_argument("-crf", dest="with_crf", help="Specify where to find crfsuite", default=None) parser.add_argument( "-discontiguous_spans", dest="third", help="A flag indicating whether to have third/clustering pass", action="store_true") parser.add_argument( "-umls_disambiguation", dest="disambiguate", help= "A flag indicating whether to disambiguate CUI ID for identified entities in semeval", action="store_true") args = parser.parse_args() # Error check: Ensure that file paths are specified if not args.input: print >> sys.stderr, '\n\tError: Must provide text files\n' exit(1) if not args.output: print >> sys.stderr, '\n\tError: Must provide output directory\n' exit(1) if not args.model: print >> sys.stderr, '\n\tError: Must provide path to model\n' exit(1) if not os.path.exists(args.model): print >> sys.stderr, '\n\tError: Model does not exist: %s\n' % args.model exit(1) # Parse arguments files = glob.glob(args.input) helper.mkpath(args.output) third = args.third if args.format: format = args.format else: print '\n\tERROR: must provide "format" argument\n' exit() if third is True and args.format == "i2b2": exit("i2b2 formatting does not support disjoint spans") # Tell user if not predicting if not files: print >> sys.stderr, "\n\tNote: You did not supply any input files\n" exit() # Predict predict(files, args.model, args.output, format=format, third=third, disambiguate=args.disambiguate)
def predict(files, model_path, output_dir, format, third=False, disambiguate=False): # Must specify output format if format not in Note.supportedFormats(): print >> sys.stderr, '\n\tError: Must specify output format' print >> sys.stderr, '\tAvailable formats: ', ' | '.join( Note.supportedFormats()) print >> sys.stderr, '' exit(1) # Load model model = Model.load(model_path) # Tell user if not predicting if not files: print >> sys.stderr, "\n\tNote: You did not supply any input files\n" exit() if enabled["UMLS"] is not None and disambiguate is True: from disambiguation import cui_disambiguation # For each file, predict concept labels n = len(files) for i, txt in enumerate(sorted(files)): note = Note(format) note.read(txt) # Output file extension = note.getExtension() fname = os.path.splitext(os.path.basename(txt))[0] + '.' + extension out_path = os.path.join(output_dir, fname) #if os.path.exists(out_path): # print '\tWARNING: prediction file already exists (%s)' % out_path # continue if format == "semevaL": note.setFileName(os.path.split(txt)[-1]) # Predict concept labels labels = model.predict(note, third) # Get predictions in proper format output = note.write(labels) # TODO: make a flag to enable or disable looking up concept ids. if format == "semeval": print "\nencoding concept ids" if enabled["UMLS"] is not None and disambiguate is True: output = cui_disambiguation.disambiguate( output, txt, model.get_cui_freq()) # Output the concept predictions print '\n\nwriting to: ', out_path with open(out_path, 'w') as f: print >> f, output print