Esempio n. 1
0
def main():
    """Train a model on the bundled example document using fixed paths.

    Globs the hard-coded example text and concept paths, pairs the entries
    whose keys appear in both ``tools.map_files`` mappings, and passes the
    pairs to ``train`` with the 'i2b2' format, the LSTM flag off, the default
    log path under ``CLINER_DIR/models`` and empty validation/test lists.
    """
    txt_pattern = "F:/Programming Documnets/AI/Telehealth/CliNER-master/data/examples/ex_doc.txt"
    txt_paths = glob.glob(txt_pattern)

    con_pattern = "F:/Programming Documnets/AI/Telehealth/CliNER-master/data/examples/ex_doc.con"
    con_paths = glob.glob(con_pattern)

    # Key -> path mappings for both file kinds
    txt_by_key = tools.map_files(txt_paths)
    con_by_key = tools.map_files(con_paths)

    # Keep only the documents that have both a text and a concept file
    pairs = [(txt_by_key[key], con_by_key[key])
             for key in txt_by_key
             if key in con_by_key]

    log_path = os.path.join(CLINER_DIR, 'models', 'train.log')
    model_path = os.fspath(
        "F:/Programming Documnets/AI/Telehealth/CliNER-master/models/foo.model")

    # Positional order: pairs, model path, format, use-LSTM flag, log file
    train(pairs, model_path, 'i2b2', False, log_path, val=[], test=[])
Esempio n. 2
0
def train_model(txt_path, con_path, model_path):
    """Train an 'i2b2'-format model from glob patterns of text/concept files.

    Args:
        txt_path: Glob pattern selecting the training text files.
        con_path: Glob pattern selecting the training concept files.
        model_path: Path passed to ``train`` as the model location. When it
            has a directory component, that directory must already exist.

    Raises:
        SystemExit: with status 1 when the model directory does not exist.
    """
    modeldir = os.path.dirname(model_path)
    if modeldir != '' and not os.path.exists(modeldir):
        sys.stderr.write('\n\tError: Model dir does not exist: %s\n' %
                         modeldir)
        sys.stderr.write('\n')
        # sys.exit instead of the site-provided exit(), which is meant for
        # the interactive shell and is absent when ``site`` is not loaded.
        sys.exit(1)

    # Key -> path mappings for the text and concept files
    train_txt_files_map = tools.map_files(glob.glob(txt_path))
    train_con_files_map = tools.map_files(glob.glob(con_path))

    # Only documents present in both mappings can be used for training
    training_list = [(train_txt_files_map[k], train_con_files_map[k])
                     for k in train_txt_files_map
                     if k in train_con_files_map]

    # Train the model: no LSTM, no log file, no validation or test data
    train(training_list,
          model_path,
          "i2b2",
          False,
          logfile=None,
          val=[],
          test=[])
Esempio n. 3
0
def main():
    """Score predicted concept files against gold files with the i2b2 jar.

    Command-line arguments (all required):
        --predictions  directory holding the predicted files
        --gold         directory holding the gold-standard files
        --format       data format; only 'i2b2' is accepted

    Files of both directories are paired through ``tools.map_files``
    (presumably keyed by base file name -- confirm against ``tools``), copied
    into a scratch directory below ``<cliner>/data/tmp`` and scored by
    ``tools/i2b2va-eval.jar``. The scratch directory is removed afterwards,
    even when copying or the evaluation fails.

    Raises:
        SystemExit: with status 1 when an argument is missing, the format is
            not supported, or no prediction file has a gold counterpart.
    """
    # Local import: tempfile is not used elsewhere in this block's file view.
    import tempfile

    # Parse command line arguments
    parser = argparse.ArgumentParser(prog='cliner evaluate')
    parser.add_argument(
        "--predictions",
        dest="pred",
        help="Directory where predictions  are stored.",
    )
    parser.add_argument(
        "--gold",
        dest="gold",
        help="Directory where gold standard is stored.",
    )
    parser.add_argument("--format", dest="format", help="Data format ( con ) ")

    args = parser.parse_args()

    def usage_error(message):
        # Report the problem, show the usage text and stop with a failure
        # status (the bare sys.exit() used before reported success).
        sys.stderr.write(message)
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        sys.exit(1)

    if not args.pred:
        usage_error('\n\tERROR: must provide --pred argument\n\n')
    if not args.gold:
        usage_error('\n\tERROR: must provide --gold argument\n\n')
    if not args.format:
        usage_error('\n\tERROR: must provide --format argument\n\n')

    # Must specify output format
    if args.format not in ('i2b2',):
        usage_error('\n\tError: Must specify output format\n'
                    '\tAvailable formats: i2b2\n'
                    '\n')

    ref_files = [os.path.join(args.gold, f) for f in os.listdir(args.gold)]
    pred_files = [os.path.join(args.pred, f) for f in os.listdir(args.pred)]

    ref_files_map = tools.map_files(ref_files)
    pred_files_map = tools.map_files(pred_files)

    # (prediction, gold) pairs for every key present in both directories
    pairs = [(pred_files_map[k], ref_files_map[k])
             for k in ref_files_map
             if k in pred_files_map]
    if not pairs:
        sys.stderr.write(
            '\n\tERROR: no prediction file matches a gold file\n\n')
        sys.exit(1)

    # Unpack in the same order the pairs were built; the previous code
    # unpacked them as (gold, pred) and so swapped the two sides.
    pred_list, gold_list = zip(*pairs)

    cliner_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    tmp_dir = os.path.join(cliner_dir, 'data', 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    # mkdtemp guarantees a fresh, uniquely named directory (a random suffix
    # in 0..256 could collide with a concurrent or crashed run).
    tempdir_name = tempfile.mkdtemp(prefix='cliner_eval_', dir=tmp_dir)
    try:
        pred_dir = os.path.join(tempdir_name, 'pred/')
        gold_dir = os.path.join(tempdir_name, 'gold/')
        os.mkdir(pred_dir)
        os.mkdir(gold_dir)

        # copy files
        for pred_file in pred_list:
            shutil.copy(pred_file, pred_dir)
        for gold_file in gold_list:
            shutil.copy(gold_file, gold_dir)

        # eval jar
        eval_jar = os.path.join(cliner_dir, 'tools', 'i2b2va-eval.jar')

        # Argument list instead of a shell string so that paths containing
        # spaces or shell metacharacters are passed through unchanged.
        cmd = ['java', '-jar', eval_jar,
               '-rcp', gold_dir,
               '-scp', pred_dir,
               '-ft', 'con',
               '-ex', 'all']
        status = subprocess.call(cmd, stdout=sys.stdout)
        if status != 0:
            sys.stderr.write(
                '\n\tWARNING: evaluation exited with status %d\n\n' % status)
    finally:
        # cleanup after yourself
        shutil.rmtree(tempdir_name)
Esempio n. 4
0
def main():
    """Print a confusion matrix and per-label precision/recall/F1.

    Command-line arguments:
        -t         glob of the text files the predictions were made for
        -c         directory with the predicted concept files
        -r         directory with the gold-standard concept files
        -f         data format, 'i2b2' (``*.con``) or 'xml' (``*.xml``)
        --concept  score concept classes instead of the full label set
        -o         write the report to this file instead of stdout

    Text, prediction and gold files are grouped through ``tools.map_files``;
    each group is loaded as two ``Document`` objects and their ``conlist()``
    label sequences are compared position by position.

    Raises:
        SystemExit: with status 1 on an unknown format or a missing
            -t / -c / -r argument.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-t",
        help="Text files that were used to generate predictions",
        dest="txt",
    )

    parser.add_argument(
        "-c",
        help=
        "The directory that contains predicted concept files organized into subdirectories for svm, lin, srf",
        dest="con",
    )

    parser.add_argument(
        "-r",
        help=
        "The directory that contains reference gold standard concept files",
        dest="ref",
    )

    parser.add_argument("-f",
                        dest="format",
                        help="Data format (i2b2 or xml).",
                        default='i2b2')

    parser.add_argument(
        "--concept",
        dest="do_concept",
        help=
        "A flag indicating whether to evaluate chunk-level or concept-level",
        action="store_true",
        default=False)

    parser.add_argument(
        "-o",
        help="Write the evaluation to a file rather than STDOUT",
        dest="output",
        default=None)

    # Parse command line arguments
    args = parser.parse_args()

    # Which format to read?
    wildcards = {'i2b2': '*.con', 'xml': '*.xml'}
    if args.format not in wildcards:
        sys.stderr.write(
            '\n\tError: Must specify output format (i2b2 or xml)\n')
        sys.stderr.write('\n')
        sys.exit(1)
    wildcard = wildcards[args.format]

    # glob/os.path.join would fail with a TypeError on a missing argument
    if not (args.txt and args.con and args.ref):
        sys.stderr.write('\n\tError: Must provide -t, -c and -r arguments\n')
        sys.stderr.write('\n')
        parser.print_help(sys.stderr)
        sys.exit(1)

    # Medical text, gold data and predictions, each as a key -> path mapping
    txt_files_map = tools.map_files(glob.glob(args.txt))
    ref_files_map = tools.map_files(
        glob.glob(os.path.join(args.ref, wildcard)))
    pred_files_map = tools.map_files(
        glob.glob(os.path.join(args.con, wildcard)))

    # Grouping of text, predictions, gold
    files = [(txt_files_map[k], pred_files_map[k], ref_files_map[k])
             for k in txt_files_map
             if k in pred_files_map and k in ref_files_map]

    # Label -> row/column index of the confusion matrix
    if args.do_concept:
        tag2id = {'problem': 0, 'test': 1, 'treatment': 2, 'none': 3}
    else:
        from documents import labels as tag2id

    # confusion[gold index][predicted index] = number of tokens
    confusion = [[0] * len(tag2id) for _ in tag2id]

    for txt, annotations, gold in files:
        # Read predictions and gold standard data
        cnote = Document(txt, annotations)
        rnote = Document(txt, gold)

        for pline, gline in zip(cnote.conlist(), rnote.conlist()):
            for p, g in zip(pline, gline):
                if args.do_concept:
                    # Drop the two-character label prefix; a label that
                    # becomes empty counts as 'none'.
                    p = p[2:] or 'none'
                    g = g[2:] or 'none'
                confusion[tag2id[g]][tag2id[p]] += 1

    choice = 'CONCEPT' if args.do_concept else '7-way'

    # Is output destination specified?
    out = open(args.output, "w") if args.output else sys.stdout

    def emit(line=""):
        # Version-independent replacement for the Python 2 ``print >>``
        # statements, which raise TypeError on Python 3.
        out.write(line + "\n")

    try:
        # Display the confusion matrix
        emit()
        emit()
        emit()
        emit("================")
        emit("%s RESULTS" % choice)
        emit("================")
        emit()
        emit("Confusion Matrix")
        pad = max(len(l) for l in tag2id) + 6
        emit("%s %s" % (' ' * pad, "\t".join(s[:5] for s in tag2id)))
        for act, act_v in tag2id.items():
            emit("%s %s" % (act.rjust(pad), "\t".join(
                str(confusion[act_v][pre_v]) for pre_v in tag2id.values())))
        emit()

        # Compute the analysis stuff
        precision = []
        recall = []
        f1 = []
        ids = list(tag2id.values())

        emit("Analysis")
        emit("%s %10s%10s%10s" % (" " * pad, "Precision", "Recall", "F1"))

        for lab, lab_v in tag2id.items():
            tp = confusion[lab_v][lab_v]
            fp = sum(confusion[v][lab_v] for v in ids if v != lab_v)
            fn = sum(confusion[lab_v][v] for v in ids if v != lab_v)
            # The tiny epsilon keeps the ratios defined for unseen labels
            precision.append(float(tp) / (tp + fp + 1e-100))
            recall.append(float(tp) / (tp + fn + 1e-100))
            f1.append(float(2 * tp) / (2 * tp + fp + fn + 1e-100))
            emit("%s %10.4f%10.4f%10.4f" % (lab.rjust(pad), precision[-1],
                                            recall[-1], f1[-1]))

        emit("--------")

        # Unweighted averages over all labels
        emit("Average: %.4f\t%.4f\t%.4f" % (sum(precision) / len(precision),
                                            sum(recall) / len(recall),
                                            sum(f1) / len(f1)))
    finally:
        # Close the report file (never stdout) so its content is flushed
        if out is not sys.stdout:
            out.close()
Esempio n. 5
0
def main():
    """Command-line entry point: train a model from text/annotation globs.

    Required arguments are ``--txt``, ``--annotations``, ``--model`` and
    ``--format`` (only 'i2b2' is accepted). ``--val-txt``/``--val-annotations``
    and ``--test-txt``/``--test-annotations`` optionally add validation and
    test data; each data set is used only when both of its globs are given.
    Text and annotation files are paired through ``tools.map_files`` and the
    resulting lists are passed to ``train``.

    Raises:
        SystemExit: with status 1 when a required argument is missing, the
            model directory does not exist, or the format is unsupported.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--txt",
        dest="txt",
        help="The files that contain the training examples",
    )
    parser.add_argument(
        "--annotations",
        dest="con",
        help="The files that contain the labels for the training examples",
    )
    parser.add_argument(
        "--val-txt",
        dest="val_txt",
        help="The files that contain the validation examples",
    )
    parser.add_argument(
        "--val-annotations",
        dest="val_con",
        help="The files that contain the labels for the validation examples",
    )
    parser.add_argument(
        "--test-txt",
        dest="test_txt",
        help="The files that contain the test examples",
    )
    parser.add_argument(
        "--test-annotations",
        dest="test_con",
        help="The files that contain the labels for the test examples",
    )
    parser.add_argument(
        "--model",
        dest="model",
        help="Path to the model that should be generated",
    )
    parser.add_argument("--log",
                        dest="log",
                        help="Path to the log file for training info",
                        default=os.path.join(CLINER_DIR, 'models',
                                             'train.log'))
    # NOTE(review): with action='store_true' and default=True this flag can
    # never be switched off; the default is kept because existing invocations
    # may rely on the LSTM always being used -- confirm the intent.
    parser.add_argument("--use-lstm",
                        dest="use_lstm",
                        help="Whether to use an LSTM model",
                        action='store_true',
                        default=True)
    parser.add_argument("--format", dest="format", help="Data format ( i2b2 )")

    # Parse the command line arguments
    args = parser.parse_args()

    def fail(message):
        # Write the message plus a blank line to stderr and stop with
        # status 1 (sys.exit rather than the interactive-only exit()).
        sys.stderr.write(message)
        sys.stderr.write('\n')
        sys.exit(1)

    def paired(txt_glob, con_glob):
        # Pair the text and annotation files that share a map_files key.
        txt_map = tools.map_files(glob.glob(txt_glob))
        con_map = tools.map_files(glob.glob(con_glob))
        return [(txt_map[k], con_map[k]) for k in txt_map if k in con_map]

    # Error check: Ensure that file paths are specified
    if not args.txt:
        fail('\n\tError: Must provide text files\n')
    if not args.con:
        fail('\n\tError: Must provide annotations for text files\n')
    if not args.model:
        fail('\n\tError: Must provide valid path to store model\n')
    modeldir = os.path.dirname(args.model)
    if modeldir != '' and not os.path.exists(modeldir):
        fail('\n\tError: Model dir does not exist: %s\n' % modeldir)

    # Must specify output format. Plain writes replace the Python 2
    # ``print >> sys.stderr`` statements, which fail on Python 3.
    if args.format not in ['i2b2']:
        fail('\n\tError: Must specify output format\n'
             '\tAvailable formats: i2b2\n')

    # Collect training data file paths
    training_list = paired(args.txt, args.con)

    # Validation and test data are optional
    if args.val_txt and args.val_con:
        val_list = paired(args.val_txt, args.val_con)
    else:
        val_list = []

    if args.test_txt and args.test_con:
        test_list = paired(args.test_txt, args.test_con)
    else:
        test_list = []

    # Train the model
    train(training_list,
          args.model,
          args.format,
          args.use_lstm,
          logfile=args.log,
          val=val_list,
          test=test_list)
Esempio n. 6
0
def main():
    """Score predicted concept files against gold files with the i2b2 jar.

    Command-line arguments:
        --predictions  directory holding the predicted files (required)
        --gold         directory holding the gold-standard files (required)
        --format       data format; only 'i2b2' is accepted (required)
        --output       write the evaluation to this file instead of stdout

    Files of both directories are paired through ``tools.map_files``
    (presumably keyed by base file name -- confirm against ``tools``), copied
    into a fresh temporary directory and scored by ``tools/i2b2va-eval.jar``.
    The jar's combined stdout/stderr is written to the chosen destination and
    the temporary directory is always removed.

    Raises:
        SystemExit: with status 1 when an argument is missing, the format is
            not supported, or no prediction file has a gold counterpart.
    """
    # Local imports: ``subprocess`` replaces the Python-2-only ``commands``
    # module and ``tempfile`` replaces the hand-built /tmp directory name.
    import subprocess
    import tempfile

    # Parse command line arguments
    parser = argparse.ArgumentParser(prog='cliner evaluate')
    parser.add_argument("--predictions",
        dest = "pred",
        help = "Directory where predictions  are stored.",
    )
    parser.add_argument("--gold",
        dest = "gold",
        help = "Directory where gold standard is stored.",
    )
    parser.add_argument("--format",
        dest = "format",
        help = "Data format ( con ) "
    )
    parser.add_argument("--output",
        dest = "output",
        help = "Write the evaluation to a file rather than STDOUT",
    )
    args = parser.parse_args()

    def usage_error(message):
        # Errors go to stderr (they were printed to stdout before), followed
        # by the usage text; the process then stops with status 1.
        sys.stderr.write(message)
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        sys.exit(1)

    if not args.pred:
        usage_error('\n\tERROR: must provide --pred argument\n\n')
    if not args.gold:
        usage_error('\n\tERROR: must provide --gold argument\n\n')
    if not args.format:
        usage_error('\n\tERROR: must provide --format argument\n\n')

    # Must specify output format. The message now names the value that is
    # actually accepted (it used to say 'con').
    if args.format not in ['i2b2']:
        usage_error('\n\tError: Must specify output format\n'
                    '\tAvailable formats: i2b2\n'
                    '\n')

    ref_files = [os.path.join(args.gold, f) for f in os.listdir(args.gold)]
    pred_files = [os.path.join(args.pred, f) for f in os.listdir(args.pred)]

    ref_files_map = tools.map_files(ref_files)
    pred_files_map = tools.map_files(pred_files)

    # (prediction, gold) pairs for every key present in both directories
    pairs = [(pred_files_map[k], ref_files_map[k])
             for k in ref_files_map
             if k in pred_files_map]
    if not pairs:
        sys.stderr.write(
            '\n\tERROR: no prediction file matches a gold file\n\n')
        sys.exit(1)

    # Unpack in the order the pairs were built; the previous code unpacked
    # them as (gold, pred) and so swapped the two sides.
    pred_list, gold_list = zip(*pairs)

    # Is output destination specified?
    out = open(args.output, "w") if args.output else sys.stdout
    tempdir_name = None
    try:
        # create temporary directory for these files
        tempdir_name = tempfile.mkdtemp(prefix='cliner_eval_')
        pred_dir = os.path.join(tempdir_name, 'pred/')
        gold_dir = os.path.join(tempdir_name, 'gold/')
        os.mkdir(pred_dir)
        os.mkdir(gold_dir)

        # copy files
        for pred_file in pred_list:
            shutil.copy(pred_file, pred_dir)
        for gold_file in gold_list:
            shutil.copy(gold_file, gold_dir)

        # eval jar
        cliner_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        eval_jar = os.path.join(cliner_dir, 'tools', 'i2b2va-eval.jar')

        # Argument list (no shell) so paths with spaces are passed intact;
        # stderr is merged into stdout as commands.getstatusoutput did.
        cmd = ['java', '-jar', eval_jar,
               '-rcp', gold_dir,
               '-scp', pred_dir,
               '-ft', 'con',
               '-ex', 'all']
        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)
        output, _ = proc.communicate()

        # Honour --output: the report used to be printed to stdout only,
        # leaving the requested file empty.
        if not output.endswith('\n'):
            output += '\n'
        out.write(output)
    finally:
        # cleanup after yourself
        if tempdir_name is not None:
            shutil.rmtree(tempdir_name)
        if out is not sys.stdout:
            out.close()
Esempio n. 7
0
def main():
    """List every token whose predicted label differs from the gold label.

    Command-line arguments:
        --txt          glob of the text files (required)
        --predictions  directory with predicted ``*.con`` files (required)
        --gold         directory with gold ``*.con`` files (required)
        --format       data format; only 'i2b2' is accepted (required)
        --output       write the listing to this file instead of stdout

    Text, prediction and gold files are grouped through ``tools.map_files``.
    Each group is loaded as two ``Document`` objects; their ``conlist()``
    labels are compared token by token and, for each mismatch, the token, a
    small window of surrounding tokens and both labels are written out.

    Raises:
        SystemExit: with status 1 on a missing argument or unsupported
            format; with status 0 when there is nothing to evaluate.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(prog='cliner evaluate')
    parser.add_argument(
        "--txt",
        dest="txt",
        help="Glob of .txt files of discharge summaries",
    )
    parser.add_argument(
        "--predictions",
        dest="pred",
        help="Directory where predictions  are stored.",
    )
    parser.add_argument(
        "--gold",
        dest="gold",
        help="Directory where gold standard is stored.",
    )
    parser.add_argument("--format", dest="format", help="Data format ( con )")
    parser.add_argument(
        "--output",
        dest="output",
        help="Write the evaluation to a file rather than STDOUT",
    )
    args = parser.parse_args()

    def usage_error(message):
        # Report on stderr, show the usage text and stop with status 1.
        sys.stderr.write(message)
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        sys.exit(1)

    if not args.txt:
        usage_error('\n\tERROR: must provide --txt argument\n\n')
    if not args.pred:
        usage_error('\n\tERROR: must provide --pred argument\n\n')
    if not args.gold:
        usage_error('\n\tERROR: must provide --gold argument\n\n')
    if not args.format:
        usage_error('\n\tERROR: must provide --format argument\n\n')

    # Must specify output format
    if args.format not in ['i2b2']:
        usage_error('\n\tError: Must specify output format\n'
                    '\tAvailable formats: i2b2\n'
                    '\n')

    wildcard = '*.con'

    # Medical text, gold data and predictions, each as a key -> path mapping
    txt_files_map = tools.map_files(glob.glob(args.txt))
    ref_files_map = tools.map_files(
        glob.glob(os.path.join(args.gold, wildcard)))
    pred_files_map = tools.map_files(
        glob.glob(os.path.join(args.pred, wildcard)))

    # Grouping of text, predictions, gold
    files = [(txt_files_map[k], pred_files_map[k], ref_files_map[k])
             for k in txt_files_map
             if k in pred_files_map and k in ref_files_map]

    if not files:
        sys.stdout.write("No files to be evaluated\n")
        sys.exit(0)

    # Is output destination specified? The file used to be opened but the
    # listing was still printed to stdout; it is now written to the file.
    out = open(args.output, "w") if args.output else sys.stdout

    def emit(line=""):
        # Version-independent replacement for the Python 2 print statements.
        out.write(line + "\n")

    try:
        emit()
        for txt, annotations, gold in files:

            # Read predictions and gold standard data
            cnote = Document(txt, annotations)
            rnote = Document(txt, gold)

            sents = rnote.getTokenizedSentences()

            # Note - can also get first pass (IOB labels)
            ref = rnote.conlist()
            pred = cnote.conlist()

            for toks, pline, rline in zip(sents, pred, ref):
                for j, (token, rlab, plab) in enumerate(
                        zip(toks, rline, pline)):
                    if rlab == plab:
                        continue

                    # Window start, clamped so tokens near the start of a
                    # sentence do not produce a negative slice index.
                    start = max(0, j - 3)

                    emit(str(token))
                    # One blank run per preceding window token, then '<>'
                    # as a marker under the mismatching position.
                    marker = [' ' * (len(toks[k]) + 4)
                              for k in range(start, j)]
                    marker.append('<>')
                    emit(' '.join(marker))
                    emit(str(toks[start:j + 3]))
                    emit('\tpred:  ' + str(plab))
                    emit('\tref:   ' + str(rlab))
                    emit('\n')
    finally:
        # Close the listing file (never stdout) so its content is flushed
        if out is not sys.stdout:
            out.close()
Esempio n. 8
0
def main():
    """Command-line entry point: collect data files and call ``build``.

    ``--txt``, ``--annotations`` and ``--model`` are required. The optional
    ``--val-txt``/``--val-annotations`` and ``--test-txt``/``--test-annotations``
    pairs add validation and test data; each set is used only when both of
    its globs are given. ``--log`` is forwarded to ``build`` as ``logFile``.
    Text and concept files are paired through ``tools.map_files``.

    Raises:
        SystemExit: with status 1 when a required argument is missing or the
            model directory does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--txt", dest="txt")
    parser.add_argument(
        "--annotations",
        dest="con",
    )
    parser.add_argument("--val-txt", dest="val_txt")
    parser.add_argument("--val-annotations", dest="val_con")
    parser.add_argument("--test-txt", dest="test_txt")
    parser.add_argument("--test-annotations", dest="test_con")
    parser.add_argument("--model", dest="model")
    parser.add_argument("--log", dest="log")

    args = parser.parse_args()

    def fail(message):
        # Show the usage text and the message on stderr, then stop with
        # status 1 (sys.exit rather than the interactive-only exit()).
        parser.print_help(sys.stderr)
        sys.stderr.write(message)
        sys.stderr.write('\n')
        sys.exit(1)

    def paired(txt_glob, con_glob):
        # Pair the text and concept files that share a map_files key.
        text_map = tools.map_files(glob.glob(txt_glob))
        concept_map = tools.map_files(glob.glob(con_glob))
        return [(text_map[k], concept_map[k])
                for k in text_map
                if k in concept_map]

    if not (args.txt and args.con and args.model):
        fail('\n\tError in parsing arguments\n')

    m_dir = os.path.dirname(args.model)
    if m_dir != '' and not os.path.exists(m_dir):
        fail('\n\tNo such model directory:%s\n' % m_dir)

    trainingList = paired(args.txt, args.con)

    # Validation and test data are optional
    if args.val_txt and args.val_con:
        valList = paired(args.val_txt, args.val_con)
    else:
        valList = []

    if args.test_txt and args.test_con:
        testList = paired(args.test_txt, args.test_con)
    else:
        testList = []

    build(trainingList,
          args.model,
          logFile=args.log,
          val=valList,
          test=testList)
Esempio n. 9
0
def main():
    """Print a confusion matrix and per-label precision/recall/F1.

    Command-line arguments:
        -t         glob of the text files the predictions were made for
        -c         directory with the predicted concept files
        -r         directory with the gold-standard concept files
        -f         data format, 'i2b2' (``*.con``) or 'xml' (``*.xml``)
        --concept  score concept classes instead of the full label set
        -o         write the report to this file instead of stdout

    Text, prediction and gold files are grouped through ``tools.map_files``;
    each group is loaded as two ``Document`` objects and their ``conlist()``
    label sequences are compared position by position.

    Raises:
        SystemExit: with status 1 on an unknown format or a missing
            -t / -c / -r argument.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
        help = "Text files that were used to generate predictions",
        dest = "txt",
    )

    parser.add_argument("-c",
        help = "The directory that contains predicted concept files organized into subdirectories for svm, lin, srf",
        dest = "con",
    )

    parser.add_argument("-r",
        help = "The directory that contains reference gold standard concept files",
        dest = "ref",
    )

    parser.add_argument("-f",
        dest = "format",
        help = "Data format (i2b2 or xml).",
        default = 'i2b2'
    )

    parser.add_argument("--concept",
        dest = "do_concept",
        help = "A flag indicating whether to evaluate chunk-level or concept-level",
        action = "store_true",
        default = False
    )

    parser.add_argument("-o",
        help = "Write the evaluation to a file rather than STDOUT",
        dest = "output",
        default = None
    )

    # Parse command line arguments
    args = parser.parse_args()

    # Which format to read?
    wildcards = {'i2b2': '*.con', 'xml': '*.xml'}
    if args.format not in wildcards:
        sys.stderr.write(
            '\n\tError: Must specify output format (i2b2 or xml)\n')
        sys.stderr.write('\n')
        sys.exit(1)
    wildcard = wildcards[args.format]

    # glob/os.path.join would fail with a TypeError on a missing argument
    if not (args.txt and args.con and args.ref):
        sys.stderr.write('\n\tError: Must provide -t, -c and -r arguments\n')
        sys.stderr.write('\n')
        parser.print_help(sys.stderr)
        sys.exit(1)

    # Medical text, gold data and predictions, each as a key -> path mapping
    txt_files_map = tools.map_files(glob.glob(args.txt))
    ref_files_map = tools.map_files(
        glob.glob(os.path.join(args.ref, wildcard)))
    pred_files_map = tools.map_files(
        glob.glob(os.path.join(args.con, wildcard)))

    # Grouping of text, predictions, gold
    files = [(txt_files_map[k], pred_files_map[k], ref_files_map[k])
             for k in txt_files_map
             if k in pred_files_map and k in ref_files_map]

    # Label -> row/column index of the confusion matrix
    if args.do_concept:
        tag2id = { 'problem':0, 'test':1, 'treatment':2, 'none':3 }
    else:
        from documents import labels as tag2id

    # confusion[gold index][predicted index] = number of tokens
    confusion = [[0] * len(tag2id) for _ in tag2id]

    for txt, annotations, gold in files:
        # Read predictions and gold standard data
        cnote = Document(txt, annotations)
        rnote = Document(txt, gold)

        for pline, gline in zip(cnote.conlist(), rnote.conlist()):
            for p, g in zip(pline, gline):
                if args.do_concept:
                    # Drop the two-character label prefix; a label that
                    # becomes empty counts as 'none'.
                    p = p[2:] or 'none'
                    g = g[2:] or 'none'
                confusion[tag2id[g]][tag2id[p]] += 1

    choice = 'CONCEPT' if args.do_concept else '7-way'

    # Is output destination specified?
    out = open(args.output, "w") if args.output else sys.stdout

    def emit(line=""):
        # Version-independent replacement for the Python 2 ``print >>``
        # statements, which raise TypeError on Python 3.
        out.write(line + "\n")

    try:
        # Display the confusion matrix
        emit()
        emit()
        emit()
        emit("================")
        emit("%s RESULTS" % choice)
        emit("================")
        emit()
        emit("Confusion Matrix")
        pad = max(len(l) for l in tag2id) + 6
        emit("%s %s" % (' ' * pad, "\t".join(s[:5] for s in tag2id)))
        for act, act_v in tag2id.items():
            emit("%s %s" % (act.rjust(pad), "\t".join(
                str(confusion[act_v][pre_v]) for pre_v in tag2id.values())))
        emit()

        # Compute the analysis stuff
        precision = []
        recall = []
        f1 = []
        ids = list(tag2id.values())

        emit("Analysis")
        emit("%s %10s%10s%10s" % (" " * pad, "Precision", "Recall", "F1"))

        for lab, lab_v in tag2id.items():
            tp = confusion[lab_v][lab_v]
            fp = sum(confusion[v][lab_v] for v in ids if v != lab_v)
            fn = sum(confusion[lab_v][v] for v in ids if v != lab_v)
            # The tiny epsilon keeps the ratios defined for unseen labels
            precision.append(float(tp) / (tp + fp + 1e-100))
            recall.append(float(tp) / (tp + fn + 1e-100))
            f1.append(float(2 * tp) / (2 * tp + fp + fn + 1e-100))
            emit("%s %10.4f%10.4f%10.4f" % (lab.rjust(pad), precision[-1],
                                            recall[-1], f1[-1]))

        emit("--------")

        # Unweighted averages over all labels
        emit("Average: %.4f\t%.4f\t%.4f" % (sum(precision) / len(precision),
                                            sum(recall) / len(recall),
                                            sum(f1) / len(f1)))
    finally:
        # Close the report file (never stdout) so its content is flushed
        if out is not sys.stdout:
            out.close()
Esempio n. 10
0
def main():
    """Evaluate predicted concept files against a gold standard.

    Command-line arguments:
        --predictions  directory holding the predicted files (required)
        --gold         directory holding the gold-standard files (required)
        --format       data format; only 'i2b2' is accepted (required)
        --output       optional path; the evaluation report is written there
                       instead of STDOUT

    Prediction and gold files are paired through ``tools.map_files`` (files
    whose keys appear on both sides are kept), copied into two scratch
    directories, and handed to ``tools/i2b2va-eval.jar``.

    Exits with status 1 when a required argument is missing, the format is
    not supported, no prediction file matches a gold file, or ``java``
    cannot be started.
    """
    # Function-scope imports: these modules are only needed here and may not
    # be imported at the top of the file.
    import subprocess
    import tempfile

    # Parse command line arguments
    parser = argparse.ArgumentParser(prog='cliner evaluate')
    parser.add_argument(
        "--predictions",
        dest="pred",
        help="Directory where predictions  are stored.",
    )
    parser.add_argument(
        "--gold",
        dest="gold",
        help="Directory where gold standard is stored.",
    )
    parser.add_argument("--format", dest="format",
                        help="Data format ( i2b2 )")
    parser.add_argument(
        "--output",
        dest="output",
        help="Write the evaluation to a file rather than STDOUT",
    )
    args = parser.parse_args()

    def usage_error(message):
        # Report a bad invocation on stderr, show the usage, exit non-zero.
        sys.stderr.write(message + '\n')
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        sys.exit(1)

    # argparse accepts unambiguous prefixes, so "--pred" is a valid spelling
    # of "--predictions".
    if not args.pred:
        usage_error('\n\tERROR: must provide --pred argument\n')
    if not args.gold:
        usage_error('\n\tERROR: must provide --gold argument\n')
    if not args.format:
        usage_error('\n\tERROR: must provide --format argument\n')

    # Validate the format BEFORE touching --output so that a bad invocation
    # never truncates an existing report file.
    if args.format not in ['i2b2']:
        usage_error('\n\tError: Must specify output format\n'
                    '\tAvailable formats: i2b2\n')

    ref_files = [os.path.join(args.gold, f) for f in os.listdir(args.gold)]
    pred_files = [os.path.join(args.pred, f) for f in os.listdir(args.pred)]

    ref_files_map = tools.map_files(ref_files)
    pred_files_map = tools.map_files(pred_files)

    # Each entry is (prediction_path, gold_path) -- keep this order below.
    pairs = [(pred_files_map[k], ref_files_map[k])
             for k in ref_files_map if k in pred_files_map]
    if not pairs:
        sys.stderr.write(
            '\n\tError: no prediction file matches a gold file\n\n')
        sys.exit(1)

    # eval jar lives in <cliner root>/tools/
    cliner_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    eval_jar = os.path.join(cliner_dir, 'tools', 'i2b2va-eval.jar')

    # Is output destination specified?
    report_stream = open(args.output, "w") if args.output else sys.stdout

    # mkdtemp gives a unique, private directory (no name collisions between
    # concurrent runs).
    tempdir_name = tempfile.mkdtemp(prefix='cliner_eval_')
    try:
        pred_dir = os.path.join(tempdir_name, 'pred/')
        gold_dir = os.path.join(tempdir_name, 'gold/')
        os.mkdir(pred_dir)
        os.mkdir(gold_dir)

        # copy files: predictions into pred/, gold standard into gold/
        for pred_file, gold_file in pairs:
            shutil.copy(pred_file, pred_dir)
            shutil.copy(gold_file, gold_dir)

        # Argument list (no shell), so paths containing spaces or shell
        # metacharacters are passed through verbatim.
        cmd = ['java', '-jar', eval_jar,
               '-rcp', gold_dir,
               '-scp', pred_dir,
               '-ft', 'con',
               '-ex', 'all']
        try:
            proc = subprocess.Popen(cmd,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True)
        except OSError as err:
            sys.stderr.write('\n\tError: could not run java: %s\n\n' % err)
            sys.exit(1)
        report, _ = proc.communicate()

        if not report.endswith('\n'):
            report += '\n'
        report_stream.write(report)

        if proc.returncode != 0:
            sys.stderr.write('\n\tWarning: evaluation jar exited with '
                             'status %d\n\n' % proc.returncode)
    finally:
        # cleanup after yourself, even when a step above failed
        shutil.rmtree(tempdir_name, ignore_errors=True)
        if report_stream is not sys.stdout:
            report_stream.close()
Esempio n. 11
0
def main():
    """Command-line entry point for ``cliner train``.

    Command-line arguments:
        --txt          glob pattern for the .txt discharge summaries (required)
        --annotations  glob pattern for the concept files (required)
        --model        path where the model should be stored (required; its
                       directory must already exist)
        --log          path of the training log
                       (default: <CLINER_DIR>/models/train.log)
        --format       data format; only 'i2b2' is accepted (required)

    Text and concept files are paired through ``tools.map_files`` (keys that
    appear on both sides are kept) and the resulting list of
    ``(txt_path, con_path)`` tuples is passed to ``train``.

    Every validation failure is reported on stderr and exits with status 1.
    """
    # Parse arguments
    parser = argparse.ArgumentParser(prog='cliner train')
    parser.add_argument("--txt",
        dest = "txt",
        help = ".txt files of discharge summaries"
    )
    parser.add_argument("--annotations",
        dest = "con",
        help = "concept files for annotations of the .txt files",
    )
    parser.add_argument("--model",
        dest = "model",
        help = "Path to the model that should be stored",
    )
    parser.add_argument("--log",
        dest = "log",
        help = "Path to the log file for training info",
        default = os.path.join(CLINER_DIR, 'models', 'train.log')
    )
    parser.add_argument("--format",
        dest = "format",
        help = "Data format ( i2b2 )"
    )

    # Parse the command line arguments
    args = parser.parse_args()

    def fail(message, show_help=True):
        # Write the message plus a blank line to stderr, optionally followed
        # by the usage text, then exit with a failure status.
        sys.stderr.write(message + '\n\n')
        if show_help:
            parser.print_help(sys.stderr)
            sys.stderr.write('\n')
        sys.exit(1)

    # Error check: Ensure that file paths are specified
    if not args.txt:
        fail('\n\tError: Must provide text files')
    if not args.con:
        fail('\n\tError: Must provide annotations for text files')
    if not args.model:
        fail('\n\tError: Must provide valid path to store model')

    # An empty dirname means "current directory", which always exists.
    modeldir = os.path.dirname(args.model)
    if modeldir != '' and not os.path.exists(modeldir):
        fail('\n\tError: ClinerModel dir does not exist: %s' % modeldir)

    # data format: a missing value is an error like any other (stderr,
    # non-zero exit status)
    if not args.format:
        fail('\n\tERROR: must provide "format" argument', show_help=False)

    # Must specify output format
    if args.format not in ['i2b2']:
        fail('\n\tError: Must specify output format\n'
             '\tAvailable formats: i2b2', show_help=False)

    # A list of text and concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)

    # Collect training data file paths
    txt_files_map = tools.map_files(txt_files)
    con_files_map = tools.map_files(con_files)

    # Keep only documents that have both a text file and an annotation file.
    training_list = [(txt_files_map[k], con_files_map[k])
                     for k in txt_files_map if k in con_files_map]

    # Train the model
    train(training_list, args.model, args.format, logfile=args.log)
Esempio n. 12
0
def main():
    """Command-line entry point that trains a model from annotated text.

    Command-line arguments:
        --txt          glob pattern for the training example files (required)
        --annotations  glob pattern for the label files (required)
        --model        path of the model to generate (required; its
                       directory must already exist)
        --log          path of the training log
                       (default: <CLINER_DIR>/models/train.log)
        --use-lstm     flag; currently rejected as not supported
        --format       data format; only 'i2b2' is accepted

    Text and label files are paired through ``tools.map_files`` (keys that
    appear on both sides are kept) and the resulting list of
    ``(txt_path, con_path)`` tuples is passed to ``train``.

    Every validation failure is reported on stderr and exits with status 1.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--txt",
        dest="txt",
        help="The files that contain the training examples",
    )
    parser.add_argument(
        "--annotations",
        dest="con",
        help="The files that contain the labels for the training examples",
    )
    parser.add_argument(
        "--model",
        dest="model",
        help="Path to the model that should be generated",
    )
    parser.add_argument("--log",
                        dest="log",
                        help="Path to the log file for training info",
                        default=os.path.join(CLINER_DIR, 'models',
                                             'train.log'))
    parser.add_argument("--use-lstm",
                        dest="use_lstm",
                        help="Whether to use an LSTM model",
                        action='store_true',
                        default=False)
    parser.add_argument("--format", dest="format", help="Data format ( i2b2 )")

    # Parse the command line arguments
    args = parser.parse_args()

    def fail(message):
        # Write the message followed by a blank line to stderr, then exit
        # with a failure status.
        sys.stderr.write(message + '\n\n')
        sys.exit(1)

    # Error check: Ensure that file paths are specified
    if not args.txt:
        fail('\n\tError: Must provide text files')
    if not args.con:
        fail('\n\tError: Must provide annotations for text files')
    if not args.model:
        fail('\n\tError: Must provide valid path to store model')

    # An empty dirname means "current directory", which always exists.
    modeldir = os.path.dirname(args.model)
    if modeldir != '' and not os.path.exists(modeldir):
        fail('\n\tError: Model dir does not exist: %s' % modeldir)

    if args.use_lstm:
        fail('\n\t --use-lstm not supported yet')

    # Must specify output format (a missing --format is None and is
    # rejected here as well)
    if args.format not in ['i2b2']:
        fail('\n\tError: Must specify output format\n'
             '\tAvailable formats: i2b2')

    # A list of txt and concept file paths
    txt_files = glob.glob(args.txt)
    con_files = glob.glob(args.con)

    # Collect training data file paths
    txt_files_map = tools.map_files(txt_files)
    con_files_map = tools.map_files(con_files)

    # Keep only documents that have both a text file and a label file.
    training_list = [(txt_files_map[k], con_files_map[k])
                     for k in txt_files_map if k in con_files_map]

    # Train the model
    train(training_list,
          args.model,
          args.format,
          args.use_lstm,
          logfile=args.log)