Beispiel #1
0
def predict(files,
            model_path,
            output_dir,
            format,
            third=False,
            disambiguate=False):

    # Must specify output format
    if format not in Note.supportedFormats():
        print >> sys.stderr, '\n\tError: Must specify output format'
        print >> sys.stderr, '\tAvailable formats: ', ' | '.join(
            Note.supportedFormats())
        print >> sys.stderr, ''
        exit(1)

    # Load model
    model = Model.load(model_path)

    # Tell user if not predicting
    if not files:
        print >> sys.stderr, "\n\tNote: You did not supply any input files\n"
        exit()

    if enabled["UMLS"] is not None and disambiguate is True:
        from disambiguation import cui_disambiguation

    # For each file, predict concept labels
    n = len(files)
    for i, txt in enumerate(sorted(files)):

        note = Note(format)
        note.read(txt)

        # Output file
        extension = note.getExtension()
        fname = os.path.splitext(os.path.basename(txt))[0] + '.' + extension
        out_path = os.path.join(output_dir, fname)
        #if os.path.exists(out_path):
        #    print '\tWARNING: prediction file already exists (%s)' % out_path
        #    continue

        if format == "semevaL":
            note.setFileName(os.path.split(txt)[-1])

        # Predict concept labels
        labels = model.predict(note, third)

        # Get predictions in proper format
        output = note.write(labels)

        # TODO: make a flag to enable or disable looking up concept ids.
        if format == "semeval":

            print "\nencoding concept ids"
            if enabled["UMLS"] is not None and disambiguate is True:
                output = cui_disambiguation.disambiguate(
                    output, txt, model.get_cui_freq())

        # Output the concept predictions
        print '\n\nwriting to: ', out_path
        with open(out_path, 'w') as f:
            print >> f, output
        print