def main():

    # Argument Parser
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-txt",
        dest="txt",
        help="The files that contain the training examples",
    )

    parser.add_argument(
        "-annotations",
        dest="annotations",
        help="The files that contain the labels for the training examples",
    )

    parser.add_argument(
        "-out",
        dest="out",
        default=None,
        help="Directory to output data",
    )

    parser.add_argument(
        "-format",
        dest="format",
        help="Output format (%s)" % str(' or '.join(Note.supportedFormats())),
    )

    # Parse the command line arguments
    args = parser.parse_args()

    # Parse arguments
    txt = args.txt
    annotations = args.annotations
    out_file = args.out
    format = args.format

    # Ensure annotations are specified
    if not txt:
        print >> sys.stderr, '\n\tError: Must supply text file'
        print >> sys.stderr
        exit(1)
    elif not os.path.exists(txt):
        print >> sys.stderr, '\n\tError: Given text file does not exist'
        print >> sys.stderr
        exit(1)

    # Ensure annotations are specified
    extensions = Note.supportedFormatExtensions()
    if not annotations:
        print >> sys.stderr, '\n\tError: Must supply annotations'
        print >> sys.stderr
        exit(2)
    elif not os.path.exists(txt):
        print >> sys.stderr, '\n\tError: Given annotation file does not exist'
        print >> sys.stderr
        exit(2)
    elif os.path.splitext(annotations)[1][1:] not in extensions:
        print >> sys.stderr, '\n\tError: annotation must be a supported format'
        print >> sys.stderr, '\t\t(.%s)' % str(' or .'.join(extensions))
        print >> sys.stderr
        exit(2)

    # Ensure output format is specified
    if (not format) or (format not in Note.supportedFormats()):
        print >> sys.stderr, '\n\tError: Must specify supported output format'
        print >> sys.stderr, '\t\t(%s)' % str(' or '.join(
            Note.supportedFormats()))
        print >> sys.stderr
        exit(3)

    # Automatically find the input file format
    in_extension = os.path.splitext(annotations)[1][1:]
    for f, ext in Note.dictOfFormatToExtensions().items():
        if ext == in_extension:
            in_format = f

    # Read input data into note object
    in_note = Note(in_format)
    in_note.read(txt, annotations)

    # Convert data to standard format
    internal_output = in_note.write_standard()

    os_handle, tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="format_temp")
    with open(tmp_file, 'w') as f:
        f.write(internal_output)
    os.close(os_handle)

    #print internal_output

    # Read internal standard data into new file with given output format
    out_note = Note(format)
    out_note.read_standard(txt, tmp_file)

    # Output data
    out = out_note.write()
    if out_file:
        with open(out_file, 'w') as out_f:
            out_f.write(out)
    else:
        sys.stdout.write(out)

    # Clean up
    os.remove(tmp_file)
    if out_file:
        out_f.close()
Beispiel #2
0
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
        help = "Text files that were used to generate predictions",
        dest = "txt",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/test_data/*')
    )

    parser.add_argument("-c",
        help = "The directory that contains predicted concept files organized into subdirectories for svm, lin, srf",
        dest = "con",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/predictions/')
    )

    parser.add_argument("-r",
        help = "The directory that contains reference gold standard concept files",
        dest = "ref",
        default = os.path.join(os.getenv('CLINER_DIR'), 'data/reference_standard_for_test_data/concepts/')
    )

    parser.add_argument("-f",
        dest = "format",
        help = "Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
        default = 'i2b2'
    )

    parser.add_argument("-o",
        help = "Write the evaluation to a file rather than STDOUT",
        dest = "output",
        default = None
    )

    # Parse command line arguments
    args = parser.parse_args()
    format = args.format


    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout


    # Must specify output format
    if format not in Note.supportedFormats():
        print >>sys.stderr, '\n\tError: Must specify output format'
        print >>sys.stderr,   '\tAvailable formats: ', ' | '.join(Note.supportedFormats())
        print >>sys.stderr, ''
        exit(1)


    # List of medical text
    txt_files = glob.glob(args.txt)
    txt_files_map = helper.map_files(txt_files)
    wildcard = '*.' + Note.dictOfFormatToExtensions()[format]


    # List of gold data
    ref_files = glob.glob( os.path.join(args.ref, wildcard) )
    ref_files_map = helper.map_files(ref_files)


    # List of predictions
    pred_files = glob.glob( os.path.join(args.con, wildcard) )
    pred_files_map = helper.map_files(pred_files)


    # Grouping of text, predictions, gold
    files = []
    for k in txt_files_map:
        if k in pred_files_map and k in ref_files_map:
            files.append((txt_files_map[k], pred_files_map[k], ref_files_map[k]))


    # txt          <- medical text
    # annotations  <- predictions
    # gold         <- gold standard


    truePositivesExactSpan = 0
    falseNegativesExactSpan = 0
    falsePositivesExactSpan = 0

    truePositivesInexactSpan = 0
    falseNegativesInexactSpan = 0
    falsePositivesInexactSpan = 0

    confusion = [[0] * len(labels) for e in labels]

    confusionMatrixExactSpan = deepcopy(confusion)
    confusionMatrixInexactSpan = deepcopy(confusion)



    for txt, annotations, gold in files:

        # Read predictions and gols standard data
        cnote = Note(format)
        rnote = Note(format)
        cnote.read(txt, annotations)
        rnote.read(txt,        gold)

        referenceSpans = getConceptSpans(rnote.getIOBLabels(), rnote.conlist())
        predictedSpans = getConceptSpans(cnote.getIOBLabels(), cnote.conlist())

        #TO DO: i need to generate a cumulative total accross all of the files
        #modify my functions slightly and have it return the number of true positive and etc...
        #then call generate results

        exactResults =  evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=True, reportSeperately=False)

        inexactResults =  evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=False)


        truePositivesExactSpan += exactResults["True Positives"]
        falseNegativesExactSpan += exactResults["False Negatives"]
        falsePositivesExactSpan += exactResults["False Positives"]


        inexactResults = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=False)

        truePositivesInexactSpan += inexactResults["True Positives"]
        falseNegativesInexactSpan += inexactResults["False Negatives"]
        falsePositivesInexactSpan += inexactResults["False Positives"]

        MatrixInexactSpan = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True)

        for sublist1, sublist2 in zip(confusionMatrixInexactSpan, MatrixInexactSpan):
            for i,int2 in enumerate(sublist2):
                sublist1[i] += int2

        MatrixExactSpan = evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=True, reportSeperately=True)

        for sublist1, sublist2 in zip(confusionMatrixExactSpan, MatrixExactSpan):
            for i,int2 in enumerate(sublist2):
                sublist1[i] += int2

    print "\nResults for exact span for concepts together.\n"

    print "True Positives: ", truePositivesExactSpan
    print "False Negatives: ", falseNegativesExactSpan
    print "False Positives: ", falsePositivesExactSpan

    exactSpan = generateResultsForExactSpans(truePositivesExactSpan,
                                 falseNegativesExactSpan,
                                 falsePositivesExactSpan)

    print "Recall: ", exactSpan["Recall"]
    print "Precision: ", exactSpan["Precision"]
    print "F Measure: ", exactSpan["F Score"]

    inexactSpan = generateResultsForExactSpans(truePositivesInexactSpan,
                                 falseNegativesInexactSpan,
                                 falsePositivesInexactSpan)

    print "\nResults for inexact span for concepts together.\n"

    print "True Positives: ", truePositivesInexactSpan
    print "False Negatives: ", falseNegativesInexactSpan
    print "False Positives: ", falsePositivesInexactSpan

    print "Recall: ", inexactSpan["Recall"]
    print "Precision: ", inexactSpan["Precision"]
    print "F Measure: ", inexactSpan["F Score"]

    #TO DO: ENSURE NUMBER OF FP,FN,TP is equal to number of predicted spans
    #TO DO: number of FP, FN, TP is not same between exact and inexact.

    #LEFT OFF HERE. FIX DISPLAY FUNCTION

    displayMatrix(args.output, 'Exact'  , confusionMatrixExactSpan)
    displayMatrix(args.output, 'Inexact', confusionMatrixInexactSpan)


        #print evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True)

    return
Beispiel #3
0
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-t",
        help="Text files that were used to generate predictions",
        dest="txt",
        default=os.path.join(os.getenv('CLINER_DIR'), 'data/test_data/*'))

    parser.add_argument(
        "-c",
        help=
        "The directory that contains predicted concept files organized into subdirectories for svm, lin, srf",
        dest="con",
        default=os.path.join(os.getenv('CLINER_DIR'), 'data/predictions/'))

    parser.add_argument(
        "-r",
        help=
        "The directory that contains reference gold standard concept files",
        dest="ref",
        default=os.path.join(
            os.getenv('CLINER_DIR'),
            'data/reference_standard_for_test_data/concepts/'))

    parser.add_argument(
        "-f",
        dest="format",
        help="Data format ( " + ' | '.join(Note.supportedFormats()) + " )",
    )

    parser.add_argument(
        "-o",
        help="Write the evaluation to a file rather than STDOUT",
        dest="output",
        default=None)

    # Parse command line arguments
    args = parser.parse_args()

    if args.format:
        format = args.format
    else:
        print '\n\tERROR: must provide "format" argument\n'
        exit()

    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout

    # Must specify output format
    if format not in Note.supportedFormats():
        print >> sys.stderr, '\n\tError: Must specify output format'
        print >> sys.stderr, '\tAvailable formats: ', ' | '.join(
            Note.supportedFormats())
        print >> sys.stderr, ''
        exit(1)

    # List of medical text
    txt_files = glob.glob(args.txt)
    txt_files_map = helper.map_files(txt_files)
    wildcard = '*.' + Note.dictOfFormatToExtensions()[format]

    # List of gold data
    ref_files = glob.glob(os.path.join(args.ref, wildcard))
    ref_files_map = helper.map_files(ref_files)

    # List of predictions
    pred_files = glob.glob(os.path.join(args.con, wildcard))
    pred_files_map = helper.map_files(pred_files)

    # Grouping of text, predictions, gold
    files = []
    for k in txt_files_map:
        if k in pred_files_map and k in ref_files_map:
            files.append(
                (txt_files_map[k], pred_files_map[k], ref_files_map[k]))

    # txt          <- medical text
    # annotations  <- predictions
    # gold         <- gold standard

    truePositivesExactSpan = 0
    falseNegativesExactSpan = 0
    falsePositivesExactSpan = 0

    truePositivesInexactSpan = 0
    falseNegativesInexactSpan = 0
    falsePositivesInexactSpan = 0

    confusion = [[0] * len(labels) for e in labels]

    confusionMatrixExactSpan = deepcopy(confusion)
    confusionMatrixInexactSpan = deepcopy(confusion)

    if len(files) == 0:
        exit("No files to be evaluated")

    for txt, annotations, gold in files:

        # Read predictions and gols standard data
        cnote = Note(format)
        rnote = Note(format)
        cnote.read(txt, annotations)
        rnote.read(txt, gold)

        referenceSpans = getConceptSpans(rnote.getIOBLabels(), rnote.conlist())
        predictedSpans = getConceptSpans(cnote.getIOBLabels(), cnote.conlist())

        #TO DO: i need to generate a cumulative total accross all of the files
        #modify my functions slightly and have it return the number of true positive and etc...
        #then call generate results

        exactResults = evaluate(deepcopy(referenceSpans),
                                deepcopy(predictedSpans),
                                exactMatch=True,
                                reportSeperately=False)

        inexactResults = evaluate(deepcopy(referenceSpans),
                                  deepcopy(predictedSpans),
                                  exactMatch=False,
                                  reportSeperately=False)

        truePositivesExactSpan += exactResults["True Positives"]
        falseNegativesExactSpan += exactResults["False Negatives"]
        falsePositivesExactSpan += exactResults["False Positives"]

        inexactResults = evaluate(deepcopy(referenceSpans),
                                  deepcopy(predictedSpans),
                                  exactMatch=False,
                                  reportSeperately=False)

        truePositivesInexactSpan += inexactResults["True Positives"]
        falseNegativesInexactSpan += inexactResults["False Negatives"]
        falsePositivesInexactSpan += inexactResults["False Positives"]

        MatrixInexactSpan = evaluate(deepcopy(referenceSpans),
                                     deepcopy(predictedSpans),
                                     exactMatch=False,
                                     reportSeperately=True)

        for sublist1, sublist2 in zip(confusionMatrixInexactSpan,
                                      MatrixInexactSpan):
            for i, int2 in enumerate(sublist2):
                sublist1[i] += int2

        MatrixExactSpan = evaluate(deepcopy(referenceSpans),
                                   deepcopy(predictedSpans),
                                   exactMatch=True,
                                   reportSeperately=True)

        for sublist1, sublist2 in zip(confusionMatrixExactSpan,
                                      MatrixExactSpan):
            for i, int2 in enumerate(sublist2):
                sublist1[i] += int2

    print "\nResults for exact span for concepts together.\n"

    print "True Positives: ", truePositivesExactSpan
    print "False Negatives: ", falseNegativesExactSpan
    print "False Positives: ", falsePositivesExactSpan

    exactSpan = generateResultsForExactSpans(truePositivesExactSpan,
                                             falseNegativesExactSpan,
                                             falsePositivesExactSpan)

    print "Recall: ", exactSpan["Recall"]
    print "Precision: ", exactSpan["Precision"]
    print "F Measure: ", exactSpan["F Score"]

    inexactSpan = generateResultsForExactSpans(truePositivesInexactSpan,
                                               falseNegativesInexactSpan,
                                               falsePositivesInexactSpan)

    print "\nResults for inexact span for concepts together.\n"

    print "True Positives: ", truePositivesInexactSpan
    print "False Negatives: ", falseNegativesInexactSpan
    print "False Positives: ", falsePositivesInexactSpan

    print "Recall: ", inexactSpan["Recall"]
    print "Precision: ", inexactSpan["Precision"]
    print "F Measure: ", inexactSpan["F Score"]

    #TO DO: ENSURE NUMBER OF FP,FN,TP is equal to number of predicted spans
    #TO DO: number of FP, FN, TP is not same between exact and inexact.

    #LEFT OFF HERE. FIX DISPLAY FUNCTION

    displayMatrix(args.output, 'Exact', confusionMatrixExactSpan)
    displayMatrix(args.output, 'Inexact', confusionMatrixInexactSpan)

    #print evaluate(deepcopy(referenceSpans), deepcopy(predictedSpans), exactMatch=False, reportSeperately=True)

    return
Beispiel #4
0
def main():

    # Argument Parser
    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
        dest = "txt",
        help = "The files that contain the training examples",
    )

    parser.add_argument("-a",
        dest = "annotations",
        help = "The files that contain the labels for the training examples",
    )

    parser.add_argument("-o",
        dest = "out",
        default = None,
        help = "Directory to output data",
    )

    parser.add_argument("-f",
        dest = "format",
        help = "Output format (%s)"%str(' or '.join(Note.supportedFormats())),
    )

    # Parse the command line arguments
    args = parser.parse_args()


    # Parse arguments
    txt         = args.txt
    annotations = args.annotations
    out_file    = args.out
    format      = args.format


    # Ensure annotations are specified
    if not txt:
        print >>sys.stderr, '\n\tError: Must supply text file'
        print >>sys.stderr
        exit(1)
    elif not os.path.exists(txt):
        print >>sys.stderr, '\n\tError: Given text file does not exist'
        print >>sys.stderr
        exit(1)

    # Ensure annotations are specified
    extensions = Note.supportedFormatExtensions()
    if not annotations:
        print >>sys.stderr, '\n\tError: Must supply annotations'
        print >>sys.stderr
        exit(2)
    elif not os.path.exists(txt):
        print >>sys.stderr, '\n\tError: Given annotation file does not exist'
        print >>sys.stderr
        exit(2)
    elif os.path.splitext(annotations)[1][1:] not in extensions:
        print >>sys.stderr, '\n\tError: annotation must be a supported format'
        print >>sys.stderr, '\t\t(.%s)' %str(' or .'.join(extensions) )
        print >>sys.stderr
        exit(2)

    # Ensure output format is specified
    if (not format) or (format not in Note.supportedFormats()):
        print >>sys.stderr, '\n\tError: Must specify supported output format'
        print >>sys.stderr, '\t\t(%s)' %str(' or '.join(Note.supportedFormats()))
        print >>sys.stderr
        exit(3)


    # Automatically find the input file format
    in_extension =  os.path.splitext(annotations)[1][1:]
    for f,ext in Note.dictOfFormatToExtensions().items():
        if ext == in_extension:
            in_format = f

    # Read input data into note object
    in_note = Note(in_format)
    in_note.read(txt,annotations)


    # Convert data to standard format
    internal_output = in_note.write_standard()

    os_handle,tmp_file = tempfile.mkstemp(dir=tmp_dir, suffix="format_temp")
    with open(tmp_file, 'w') as f:
        f.write(internal_output)
    os.close(os_handle)

    #print internal_output

    # Read internal standard data into new file with given output format
    out_note = Note(format)
    out_note.read_standard(txt,tmp_file)


    # Output data
    out = out_note.write()
    if out_file:
        with open(out_file, 'w') as out_f:
            out_f.write(out)
    else:
        sys.stdout.write(out)


    # Clean up
    os.remove(tmp_file)
    if out_file:
        out_f.close()