Esempio n. 1
0
def do_cross_validation(data, k, target, algorithm, estimators, name, outdir):
    """Run stratified k-fold cross-validation and report the mean ROC AUC.

    For each fold a model is trained on the training split (``train_svm``
    when ``algorithm == 'svm'``, ``train_random_forest`` otherwise) and
    scored on the held-out split.  Per-sample predictions are handed to
    ``save_predictions``; the per-fold ROC curves plus one curve pooled over
    all folds are handed to ``plot_roc``.

    Args:
        data: Indexable collection of samples, aligned with ``target``.
        k: Number of folds.
        target: Class label of each sample in ``data``.
        algorithm: ``'svm'`` selects ``train_svm``; any other value selects
            ``train_random_forest``.
        estimators: Forwarded to ``train_random_forest``.
        name: Data set name; only used in the plot title.
        outdir: Output location forwarded to ``save_predictions`` and
            ``plot_roc``.

    Returns:
        Tuple ``(mean_auc, mean_cut_value)``: the mean of the per-fold AUCs
        and the sum of the per-fold ``cut_at`` values divided by ``k``.
    """
    kf = cross_validation.StratifiedKFold(target, k)
    auc_list = []
    rocs = []
    mean_roc = []
    cut_value = 0.0

    # One slot per sample, so predictions end up in the original sample order
    # regardless of how the folds interleave.
    predictions = [None] * len(target)
    for train_index, test_index in kf:
        data_train = [data[index] for index in train_index]
        data_test = [data[index] for index in test_index]
        target_train = [target[index] for index in train_index]
        target_test = [target[index] for index in test_index]
        if algorithm == 'svm':
            predicted = train_svm(data_train, target_train, data_test)
        else:
            predicted = train_random_forest(data_train, target_train,
                                            data_test, estimators)

        cut_value += cut_at(target_test, predicted)

        # NOTE(review): column 1 of `predicted` is used as the score; this
        # assumes the training helpers return a 2-D array with the positive
        # class in column 1 -- confirm against train_svm/train_random_forest.
        for pos, sample_index in enumerate(test_index):
            predictions[sample_index] = (sample_index, predicted[pos, 1],
                                         target_test[pos])

        # (label, score) pairs for this fold, built once and used both for the
        # fold's own ROC curve and for the curve pooled over all folds.
        fold_points = [(target_test[i], predicted[i, 1])
                       for i in range(len(predicted))]
        mean_roc.extend(fold_points)
        roc = pyroc.ROCData(fold_points)
        rocs.append(roc)
        auc_list.append(roc.auc())

    mean_auc = sum(auc_list) / len(auc_list)
    mean_cut_value = cut_value / k
    # The pooled curve is appended last and drawn with the 'r-' line style.
    rocs.append(pyroc.ROCData(mean_roc, 'r-'))
    print("Averaged AUC: %f" % mean_auc)
    print("Averaged cut_value: %f" % mean_cut_value)

    save_predictions(predictions, outdir)

    fig_title = 'ROC Curve for %s on %s \n (mean area = %0.2f)' % (
        algorithm, name, mean_auc)
    plot_roc(rocs, fig_title, outdir)

    return mean_auc, mean_cut_value
Esempio n. 2
0
def do_cross_validation(pos1, neg1, pos2, neg2, k, algorithm, estimators, name, outdir):
    """Cross-validate on two feature sets and report the mean ROC AUC.

    The label vector is ``len_n`` ones followed by ``len_n`` zeros, where
    ``len_n`` is the smallest row count among the four inputs.  The inputs
    are merged by ``join`` into two data sets that are split with the same
    stratified folds.  On the random-forest path one forest is trained per
    data set and the two outputs are merged with ``combine_predictions``; the
    SVM path uses the first data set only.

    Args:
        pos1, neg1: Positive / negative samples of the first feature set
            (objects exposing ``.shape[0]``).
        pos2, neg2: Positive / negative samples of the second feature set.
        k: Number of folds.
        algorithm: ``'svm'`` selects ``train_svm``; any other value selects
            the two-forest path.
        estimators: Forwarded to ``train_random_forest``.
        name: Currently unused (plotting is disabled).
        outdir: Currently unused (plotting is disabled).

    Returns:
        Tuple ``(mean_auc, mean_cut_value)``: the mean of the per-fold AUCs
        and the sum of the per-fold ``cut_at`` values divided by ``k``.
    """
    # TODO: consider lowering k for small data sets (fewer than ~20 samples
    # per fold).

    len_n = min(pos1.shape[0], pos2.shape[0], neg1.shape[0], neg2.shape[0])
    target = [1] * len_n + [0] * len_n

    # NOTE(review): presumably `join` returns two data sets whose rows line up
    # with `target` (positives first, len_n of each) -- confirm in `join`.
    data1, data2 = join(pos1, neg1, pos2, neg2)

    kf = cross_validation.StratifiedKFold(target, k)
    auc_list = []
    rocs = []
    mean_roc = []
    cut_value = 0.0

    for train_index, test_index in kf:
        data_train = [data1[index] for index in train_index]
        data_test = [data1[index] for index in test_index]
        target_train = [target[index] for index in train_index]
        target_test = [target[index] for index in test_index]

        if algorithm == 'svm':
            # TODO: the SVM path ignores the second feature set.
            predicted = train_svm(data_train, target_train, data_test)
        else:
            # Only the forest path needs the second feature set.
            data_train2 = [data2[index] for index in train_index]
            data_test2 = [data2[index] for index in test_index]
            predicted = train_random_forest(data_train, target_train, data_test, estimators)
            predicted2 = train_random_forest(data_train2, target_train, data_test2, estimators)
            predicted = combine_predictions(predicted, predicted2)
        cut_value += cut_at(target_test, predicted)

        # (label, score) pairs for this fold, built once and used both for the
        # fold's own ROC curve and for the curve pooled over all folds.
        fold_points = [(target_test[i], predicted[i][1])
                       for i in range(len(predicted))]
        mean_roc.extend(fold_points)
        roc = pyroc.ROCData(fold_points)
        rocs.append(roc)
        auc_list.append(roc.auc())

    mean_auc = sum(auc_list) / len(auc_list)
    mean_cut_value = cut_value / k
    # Plotting is disabled in this variant; the per-fold curves and the pooled
    # curve ('r-' line style) are still collected in `rocs` so that a
    # plot_roc call can be reinstated without further changes.
    rocs.append(pyroc.ROCData(mean_roc, 'r-'))
    print("Averaged AUC: %f" % mean_auc)
    print("Averaged cut_value: %f" % mean_cut_value)

    return mean_auc, mean_cut_value
Esempio n. 3
0
 def testBupaData(self):
     """AdaBoost over decision stumps must reach AUC > 0.9 on the BUPA data.

     The classifier is scored on the same data returned by
     ``load_bupa_dataset`` for several values of ``t`` (third argument of
     ``test_on_training_set`` -- presumably the number of boosting rounds;
     confirm against ``AdaBoost``).
     """
     X, Y = load_bupa_dataset()
     classifier = AdaBoost(DecisionStump)
     for t in [100, 200, 300, 400, 500]:
         score = classifier.test_on_training_set(X, Y, t)
         # Materialise the pairs so ROCData always receives a real list.
         auc = pyroc.ROCData(list(zip(Y, score))).auc()
         print(auc)
         # assertTrue replaces the deprecated failUnless alias.
         self.assertTrue(auc > .9, "AUC %s is not above 0.9 for t=%d" % (auc, t))
Esempio n. 4
0
def auc2(pos, neg):
    """Return the ROC AUC separating two groups of scores.

    Every entry of ``pos`` is labelled 1 and every entry of ``neg`` is
    labelled 0; the labelled scores are then passed to ``pyroc.ROCData``.
    """
    labelled = []
    for idx in xrange(len(pos)):
        labelled.append((1, pos[idx]))
    for idx in xrange(len(neg)):
        labelled.append((0, neg[idx]))
    return pyroc.ROCData(labelled).auc()
Esempio n. 5
0
def predict(datapos, dataneg, class_dir, outdir):
    """Score positive/negative data with a previously pickled classifier.

    The first file found under ``RESULTSPATH + class_dir`` is unpickled and
    its ``predict_proba`` is applied to the joined data.  Per-sample
    ``(index, score, label)`` triples are written with
    ``train.save_predictions`` and the ROC AUC is printed.

    Args:
        datapos: Positive samples, forwarded to ``train.join_and_balance``.
        dataneg: Negative samples, forwarded to ``train.join_and_balance``.
        class_dir: Directory (relative to ``RESULTSPATH``) holding the
            pickled classifier.
        outdir: Output location forwarded to ``train.save_predictions``.

    Returns:
        ``[auc, 0]``; the second element is a constant placeholder.

    Raises:
        IOError: If no file is found in the classifier directory.
    """
    matches = glob(RESULTSPATH + class_dir + '/*')
    if not matches:
        # The previous guard was `assert True, ...` inside a bare `except:`,
        # which could never fire; the failure only surfaced later as an
        # unbound `class_filename`.  Fail here with the intended message.
        raise IOError('No classifier in %s' % class_dir)
    class_filename = matches[0]

    print("Predicting using classifier from  %s" % (class_filename,))
    # NOTE(security): unpickling runs arbitrary code from the file; only load
    # classifier files from a trusted location.
    # Pickles are binary data, so open in 'rb' and close the handle promptly.
    with open(class_filename, 'rb') as class_file:
        classifier = pickle.load(class_file)

    data, target = train.join_and_balance(datapos, dataneg, False)
    data, _names = my_transpose(data)
    predicted = classifier.predict_proba(data)

    # Column 1 of `predicted` is used as the score for every sample.
    n_samples = len(predicted)
    roc = pyroc.ROCData([(target[i], predicted[i, 1])
                         for i in range(n_samples)])

    train.save_predictions([(i, predicted[i, 1], target[i])
                            for i in range(n_samples)], outdir)

    auc_value = roc.auc()
    print("AUC= %s" % (auc_value,))
    return [auc_value, 0]
Esempio n. 6
0
# Training loop with AUC-based checkpointing and early stopping.
# Names such as trX, trY, vX, vY, train, predict, lr, init_lr, decay_factor,
# min_lr, batches_seen, min_error, current_epoch, params and end_momentum are
# defined outside this excerpt.

# Best validation AUC seen so far and the epoch that produced it. Any epoch
# whose AUC exceeds 0.01 replaces this initial value.
max_AUC = 0.01
best_epoch = 0

# At most 1000 epochs.
for i in range(1000):
    # Mini-batches of 100 samples.
    # NOTE(review): the `end` range stops before len(trX), so the final slice
    # (the partial tail, or the last full batch when len(trX) is a multiple
    # of 100) is never trained on -- confirm this is intended.
    for start, end in zip(range(0, len(trX), 100), range(100, len(trX), 100)):
        cost = train(trX[start:end], trY[start:end])
        batches_seen += 1
        # Learning rate decay: init_lr / decay_factor**batches_seen, but never
        # below min_lr.
        lr.set_value(
            floatX(np.amax((init_lr / (decay_factor**batches_seen), min_lr))))
    # Evaluate on the validation inputs. pred[1] is compared against vY for
    # the error rate and pred[0] is used as the ROC score (presumably hard
    # labels and scores respectively -- confirm against `predict`).
    pred = predict(vX)
    current_epoch.set_value(floatX(i))
    this_error = 1 - np.mean(vY == pred[1])
    if this_error < min_error:
        min_error = this_error
    this_AUC = pyroc.ROCData(zip(vY, pred[0])).auc()
    if this_AUC > max_AUC:
        # New best AUC: remember the epoch and dump every parameter value, one
        # pickle record per parameter, into '<this script's name>_best.pkl'
        # (relative path, so it lands in the current working directory).
        max_AUC = this_AUC
        best_epoch = i
        # NOTE(review): `file` is the Python 2 builtin; the handle is not
        # closed if a dump raises.
        f = file(os.path.basename(__file__) + '_best.pkl', 'wb')
        for p in params:
            cPickle.dump(p.get_value(), f, protocol=cPickle.HIGHEST_PROTOCOL)
        f.close()
    # Early stopping: past epoch `end_momentum` and more than 10 epochs since
    # the last AUC improvement. The break happens before the prints below, so
    # the stopping epoch's statistics are not printed here.
    if i > end_momentum and (i - best_epoch) > 10:
        break

    print "Epoch {0} \t Error={1}".format(i, this_error)
    print "          \t AUC={0}".format(this_AUC)
    print "          \t Learning rate={0}".format(lr.get_value())
    if not (i % 5):
        print "          \t \t min Error={0} \t max AUC={1}".format(
Esempio n. 7
0
        #print mod.predict(Xte).ravel() == yte.ravel()
        #if fold == 4: break
    # Flatten the collected arrays (built before this excerpt, presumably one
    # per cross-validation fold) into single vectors.
    y_pred = np.concatenate(y_pred)
    y_true = np.concatenate(y_true)
    y_prob_pred = np.concatenate(y_prob_pred)
    # average=None keeps one value per class; the class means are printed
    # alongside the per-class values below.
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    fpr, tpr, thresholds = roc_curve(y_true, y_prob_pred)
    roc_auc = auc(fpr, tpr)
    print "Precisions: ", p, p.mean(), "\n",\
          "Recalls:    ", r, r.mean(), "\n",\
          "F:          ", f, f.mean(), "\n",\
          "AUC:        ", roc_auc, "\n",\
          "Support:    ", s
    # Build the ROC a second time with pyroc, from (label, score) rows.
    import pyroc
    sample = np.c_[y_true, y_prob_pred]
    roc = pyroc.ROCData(sample)  # Create the ROC object
    # NOTE(review): the return value of this call is discarded; the AUC is
    # computed again for the plot title below.
    roc.auc()  # Get the area under the curve
    # 0.9829545454545454 (value noted by the original author)
    roc.plot('ROC Curve (AUC= %.2f)' % roc.auc(), True,
             True)  # Create a plot of the ROC curve

############################################################################
## Permutations + 10 CV
############################################################################
if MODE == "permutations":
    k, l, g = ALPHA * np.array([1 - L1_RATIO, L1_RATIO, 0])
    mod = LogisticRegressionL1L2TV(k=k,
                                   l=l,
                                   g=g,
                                   A=A,
                                   penalty_start=1,
Esempio n. 8
0
    # Evaluate a pickled model on the test split and report its ROC AUC.
    # The first command-line argument is the path of the pickled model.
    filename = sys.argv[1]
    # Hard-coded, so `benchmark` below is always 1 as written.
    dataname = 'higgs'

    # Load pylearn2 model object.
    # NOTE(review): the file is opened in text mode ('r') and never closed;
    # binary pickles need 'rb' -- confirm how the model files were written.
    print 'Loading model...'
    model = pkl.load(open(filename, 'r'))

    # Determine which features were used to train the model from filename.
    # The checks run in order, so a name containing both 'all' and 'raw'
    # resolves to True. Names matching none of the three markers fail the
    # assert.
    if 'all' in filename or 'True' in filename:
        derived_feat = True
    elif 'raw' in filename or 'False' in filename:
        derived_feat = False
    else:
        assert 'only' in filename
        derived_feat = 'only'
    print 'Loading dataset %s...' % dataname
    benchmark = 1 if dataname == 'higgs' else 2
    dataset = physics.PHYSICS(benchmark=benchmark,
                              which_set='test',
                              derived_feat=derived_feat)

    # Predict. `fprop` is defined outside this excerpt.
    print 'Making predictions...'
    Yhat = fprop(model, dataset.X)
    # Compute area under the ROC curve from (label, prediction) pairs.
    print 'Computing AUC...'
    auc = pyroc.ROCData(zip(dataset.y, Yhat)).auc()
    # Last recorded value of the model monitor's 'test_y_kl' channel.
    error_test = model.monitor.channels['test_y_kl'].val_record[-1]
    print 'AUC=%f, Error=%f, Dataset=%s, Model File=%s' % (auc, error_test,
                                                           dataname, filename)
Esempio n. 9
0
def auc(target, predicted):
    """Return the ROC AUC of ``predicted`` against ``target``.

    Row ``i`` contributes the pair ``(target[i], predicted[i, 1])``, i.e.
    column 1 of ``predicted`` is used as the score.
    """
    scored = []
    for row in xrange(len(predicted)):
        scored.append((target[row], predicted[row, 1]))
    return pyroc.ROCData(scored).auc()
Esempio n. 10
0
# Compare prediction files against a ground-truth table and plot their ROC
# curves together. Each command-line argument is one prediction file.

# Ground truth: tab-separated lines with the integer label in the first
# column and the sample id in the second. Blank lines are ignored.
true_vals = {}
with open("Paper/pred_results/truth.csv") as truth:
    for line in truth:
        line = line.strip()
        if line == '':
            continue
        vals = line.split("\t")
        true_vals[vals[1]] = int(vals[0])

roclist = []
labels = []
for fname in sys.argv[1:]:
    # Prediction file: comma-separated lines with the sample id first and the
    # score second. Ids missing from the ground truth are skipped.
    data = []
    with open(fname) as infile:
        for line in infile:
            line = line.strip()
            if line == '':
                continue
            vals = line.split(",")
            if vals[0] not in true_vals:
                continue
            data.append((true_vals[vals[0]], float(vals[1])))

    roc = pyroc.ROCData(data)
    print(fname + ": " + str(roc.auc()))
    roclist.append(roc)
    # Plot label: the file's base name split on '_', with the last piece
    # dropped and empty pieces removed, re-joined with ';'.
    labels.append(";".join(
        [x for x in fname.split("/")[-1].split("_")[0:-1] if x != ""]))

pyroc.plot_multiple_roc(roclist, labels=labels)