Beispiel #1
0
def main():
    """Fit a linear model on learned representations plus additional features.

    Command line: ``<repr_hdf5> <data_hdf5> <target_index>``.

    Reads the ``train_repr``/``test_repr`` datasets from the representation
    HDF5, the ``train_add``/``test_add``/``add_labels`` datasets from the data
    HDF5, and column ``target_index`` of ``train_out``/``test_out`` from the
    data HDF5 (or from the ``-t`` HDF5 when given).

    Depending on ``-r`` it fits either a ``BayesianRidge`` regressor or an
    L2-penalized ``LogisticRegression`` classifier and writes into the output
    directory:

    * regression: ``r2.txt``, ``scatter.pdf``, ``residuals.pdf``
    * classification: ``auc.txt``, ``roc.txt``, ``prc.txt``
    * always: ``model.pkl`` and ``add_coefs.txt`` (one line per additional
      feature: label and fitted coefficient).

    Invalid command lines are reported through ``parser.error``, which exits
    the process.
    """
    usage = 'usage: %prog [options] <repr_hdf5> <data_hdf5> <target_index>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='add_only', default=False, action='store_true', help='Use additional features only; no sequence features')
    parser.add_option('-b', dest='balance', default=False, action='store_true', help='Downsample the negative set to balance [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='postmodel', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='regression', default=False, action='store_true', help='Regression mode [Default: %default]')
    parser.add_option('-s', dest='seq_only', default=False, action='store_true', help='Use sequence features only; no additional features [Default: %default]')
    parser.add_option('--sample', dest='sample', default=None, type='int', help='Sample from the training set [Default: %default]')
    parser.add_option('-t', dest='target_hdf5', default=None, help='Extract targets from this HDF5 rather than data_hdf5 argument')
    parser.add_option('-x', dest='regex_add', default=None, help='Filter additional features using a comma-separated list of regular expressions')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide representation HDF5, data HDF5, and target index')

    repr_hdf5_file, data_hdf5_file = args[0], args[1]

    # The target is used as a column index below, so it must be an integer;
    # the raw command-line string cannot index a NumPy array.
    try:
        target_i = int(args[2])
    except ValueError:
        parser.error('Target index must be an integer, got "%s"' % args[2])

    # With both flags set no feature matrix would be loaded at all.
    if options.add_only and options.seq_only:
        parser.error('Options -a and -s are mutually exclusive')

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    #######################################################
    # preprocessing
    #######################################################

    # load sequence representations
    train_x = test_x = None
    if not options.add_only:
        with h5py.File(repr_hdf5_file, 'r') as repr_hdf5_in:
            train_x = np.array(repr_hdf5_in['train_repr'])
            test_x = np.array(repr_hdf5_in['test_repr'])

    add_labels = []
    # column offset of the first additional feature within the design matrix
    add_i = 0

    data_hdf5_in = h5py.File(data_hdf5_file, 'r')
    try:
        # load targets, optionally from a separate HDF5
        if options.target_hdf5:
            target_hdf5_in = h5py.File(options.target_hdf5, 'r')
        else:
            target_hdf5_in = data_hdf5_in
        try:
            train_y = np.array(target_hdf5_in['train_out'])[:, target_i]
            test_y = np.array(target_hdf5_in['test_out'])[:, target_i]
        finally:
            # only close the handle here if it is a distinct file
            if target_hdf5_in is not data_hdf5_in:
                target_hdf5_in.close()

        if not options.seq_only:
            # load additional features
            train_a = np.array(data_hdf5_in['train_add'])
            test_a = np.array(data_hdf5_in['test_add'])
            add_labels = np.array(data_hdf5_in['add_labels'])

            if options.regex_add:
                # filter_regex is defined elsewhere; its result is used as a
                # column selector for the additional features and labels
                fi = filter_regex(options.regex_add, add_labels)
                train_a, test_a, add_labels = train_a[:, fi], test_a[:, fi], add_labels[fi]

            # append additional features
            if options.add_only:
                train_x, test_x = train_a, test_a
            else:
                add_i = train_x.shape[1]
                train_x = np.concatenate((train_x, train_a), axis=1)
                test_x = np.concatenate((test_x, test_a), axis=1)
    finally:
        data_hdf5_in.close()

    # balance (helper defined elsewhere in the file)
    if options.balance:
        train_x, train_y = balance(train_x, train_y)

    # sample
    if options.sample is not None and options.sample < train_x.shape[0]:
        sample_indexes = random.sample(range(train_x.shape[0]), options.sample)
        train_x = train_x[sample_indexes]
        train_y = train_y[sample_indexes]

    #######################################################
    # model
    #######################################################
    if options.regression:
        # fit
        model = BayesianRidge(fit_intercept=True)
        model.fit(train_x, train_y)

        # accuracy
        with open(os.path.join(options.out_dir, 'r2.txt'), 'w') as acc_out:
            acc_out.write('%s\n' % model.score(test_x, test_y))

        test_preds = model.predict(test_x)

        # plot a sample of predictions versus actual; jointplot creates its
        # own figure, so no plt.figure() call is needed beforehand
        sns.jointplot(x=test_preds[:5000], y=test_y[:5000], joint_kws={'alpha': 0.3})
        plt.savefig(os.path.join(options.out_dir, 'scatter.pdf'))
        plt.close()

        # plot the distribution of residuals
        plt.figure()
        sns.distplot(test_y - test_preds)
        plt.savefig(os.path.join(options.out_dir, 'residuals.pdf'))
        plt.close()

    else:
        # fit
        model = LogisticRegression(penalty='l2', C=1000)
        model.fit(train_x, train_y)

        # accuracy
        test_preds = model.predict_proba(test_x)[:, 1].flatten()
        with open(os.path.join(options.out_dir, 'auc.txt'), 'w') as acc_out:
            acc_out.write('%s\n' % roc_auc_score(test_y, test_preds))

        # compute and print ROC curve
        fpr, tpr, thresholds = roc_curve(test_y, test_preds)
        with open(os.path.join(options.out_dir, 'roc.txt'), 'w') as roc_out:
            for fpr_i, tpr_i, thresh_i in zip(fpr, tpr, thresholds):
                roc_out.write('%f\t%f\t%f\n' % (fpr_i, tpr_i, thresh_i))

        # compute and print precision-recall curve
        precision, recall, thresholds = precision_recall_curve(test_y, test_preds)
        with open(os.path.join(options.out_dir, 'prc.txt'), 'w') as prc_out:
            for prec_i, rec_i in zip(precision, recall):
                prc_out.write('%f\t%f\n' % (prec_i, rec_i))

    # save model
    joblib.dump(model, os.path.join(options.out_dir, 'model.pkl'))

    #######################################################
    # analyze
    #######################################################
    # print coefficients table; the classifier's coef_ is indexed with a
    # leading row (coef_[0, j]) whereas the regressor's is indexed flat
    coefs = model.coef_ if options.regression else model.coef_[0]
    with open(os.path.join(options.out_dir, 'add_coefs.txt'), 'w') as coef_out:
        for ai, label in enumerate(add_labels):
            coef_out.write('%s %s\n' % (label, coefs[add_i + ai]))