Example #1
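This snippet begins after its data setup has been cut off. A minimal sketch of the presumed preamble, mirroring Example #6 below; the breast-cancer dataset is our stand-in for whatever data the original used:

import random
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import SGDClassifier

# hypothetical setup: any binary-classification dataset works here
X, ytrue = load_breast_cancer(return_X_y=True)

labeled_N = 30
ys = np.array([-1] * len(ytrue))  # -1 marks a point as unlabeled
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
                        random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)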
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
#basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss='log',  # renamed to 'log_loss' in scikit-learn >= 1.1
                          penalty='l1')  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(
    WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue))

# semi-supervised score, RBF SVM model
ssmodel = CPLELearningModel(sklearn.svm.SVC(kernel="rbf", probability=True),
                            predict_from_probabilities=True)  # RBF SVM
ssmodel.fit(X, ys)
print("CPLE semi-supervised RBF SVM score", ssmodel.score(X, ytrue))
Example #2
    else:
        X = X_raw

    #   Split the data into test and train data
    test_pcnt = 0.15
    X_train = X[:int(len(X) * (1 - test_pcnt)), :]
    X_test = X[int(len(X) * (1 - test_pcnt)):, :]
    y_train = y[:int(len(X) * (1 - test_pcnt))]
    y_test = y[int(len(X) * (1 - test_pcnt)):]

    ytrue = y_train

    print(X_train.shape, y_train.shape)

    # Just supervised score
    basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
    #basemodel = SGDClassifier(loss='log', penalty='l1') # scikit logistic regression
    basemodel.fit(X_train, ytrue)
    print "full labeled wqda score", basemodel.score(X_test, y_test)
    print "standard error of wqda", 1.96 * np.sqrt(
        basemodel.score(X_test, y_test) *
        (1 - basemodel.score(X_test, y_test)) / X_test.shape[0])

    # Just supervised score
    #basemodel = WQDA() # weighted Quadratic Discriminant Analysis
    basemodel = SGDClassifier(loss='log',
                              penalty='l1')  # scikit logistic regression
    basemodel.fit(X_train, ytrue)
    logreg_score = basemodel.score(X_test, y_test)
    print("fully labeled log.reg. score", logreg_score)
    print("log.reg. 95% CI half-width",
          1.96 * np.sqrt(logreg_score * (1 - logreg_score) / X_test.shape[0]))
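The same 1.96 * sqrt(p * (1 - p) / n) interval is computed for each model, so it could be factored into a small helper; a minimal sketch (the accuracy_ci95 name is ours, not the example's):

import numpy as np

def accuracy_ci95(acc, n):
    """Half-width of a 95% normal-approximation confidence interval
    for an accuracy estimate computed on n test samples."""
    return 1.96 * np.sqrt(acc * (1 - acc) / n)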
Example #3

ytrue = np.array([0] * N + [1] * N)

ys = np.array([-1] * (2 * N))
# draw random indices to label; a point may be drawn twice, so slightly
# fewer than supervised_data_points labels can result
for i in range(supervised_data_points // 2):
    ys[np.random.randint(0, N)] = 0
for i in range(supervised_data_points // 2):
    ys[np.random.randint(N, 2 * N)] = 1

Xsupervised = Xs[ys != -1, :]
ysupervised = ys[ys != -1]

# compare models

lbl = "Purely supervised QDA:"
print(lbl)
model = WQDA()
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)

lbl = "SelfLearning QDA:"
print(lbl)
model = SelfLearningModel(WQDA())
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)

lbl = "CPLE(pessimistic) QDA:"
print(lbl)
model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)
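evaluate_and_plot is a helper defined elsewhere in the original script. A plausible minimal sketch, with the signature taken from the calls above and a body of our own devising:

import numpy as np
import matplotlib.pyplot as plt

def evaluate_and_plot(model, Xs, ys, ytrue, lbl, subplot):
    # accuracy against the full ground truth, one panel per method
    acc = np.mean(model.predict(Xs) == ytrue)
    print(lbl, "accuracy:", acc)
    plt.subplot(2, 2, subplot)
    plt.scatter(Xs[:, 0], Xs[:, 1], c=model.predict(Xs), s=10)
    plt.title("%s acc=%.3f" % (lbl, acc))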
    
Example #4

Xsupervised = Xs[ys != -1, :]
ysupervised = ys[ys != -1]

plt.figure()
cols = [np.array([1, 0, 0]), np.array([0, 1, 0])]  # colors

# loop through and compare methods
for i in range(4):
    plt.subplot(2, 2, i + 1)

    t1 = time.time()
    # train model
    if i == 0:
        lbl = "Purely supervised QDA:"
        model = WQDA()
        model.fit(Xsupervised, ysupervised)
    else:
        if i == 1:
            lbl = "SelfLearning QDA:"
            model = SelfLearningModel(WQDA())
        elif i == 2:
            lbl = "CPLE(pessimistic) QDA:"
            model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
        elif i == 3:
            lbl = "CPLE(optimistic) QDA:"
            CPLELearningModel.pessimistic = False
            model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
        model.fit(Xs, ys)
    print()
    print(lbl)
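The only difference between the last two branches is the pessimism switch. CPLE (contrastive pessimistic likelihood estimation) scores candidate soft labelings of the unlabeled data by the likelihood gain over the purely supervised model; the pessimistic variant guards against the worst-case labeling, while the optimistic variant assumes the best case. A toy illustration with invented numbers:

import numpy as np

# hypothetical likelihood gains over the supervised baseline,
# one per candidate soft labeling of the unlabeled data
gains = np.array([0.8, -0.3, 0.1])

print("pessimistic CPLE optimizes the worst case:", gains.min())  # -0.3
print("optimistic CPLE optimizes the best case:", gains.max())    # 0.8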
Example #5

Xsupervised = Xs[ys != -1, :]
ysupervised = ys[ys != -1]

plt.figure()
cols = [np.array([1, 0, 0]), np.array([0, 1, 0])]  # colors

# loop through and compare methods
for i in range(4):
    plt.subplot(2, 2, i + 1)

    t1 = time.time()
    # train model
    if i == 0:
        lbl = "Purely supervised QDA:"
        model = WQDA()
        model.fit(Xsupervised, ysupervised)
    else:
        if i == 1:
            lbl = "SelfLearning QDA:"
            model = SelfLearningModel(WQDA())
        elif i == 2:
            lbl = "CPLE(pessimistic) QDA:"
            model = CPLELearningModel(WQDA())
        elif i == 3:
            lbl = "CPLE(optimistic) QDA:"
            CPLELearningModel.pessimistic = False
            model = CPLELearningModel(WQDA())
        model.fit(Xs, ys)
    print ""
    print lbl
Example #6
# load data
heart = fetch_mldata("heart")  # note: fetch_mldata was removed from later scikit-learn versions; fetch_openml is its replacement
X = heart.data
ytrue = np.copy(heart.target)
ytrue[ytrue == -1] = 0

# label a few points
labeled_N = 30
ys = np.array([-1] * len(ytrue))  # -1 denotes an unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
                        random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score 
basemodel = WQDA() # weighted Quadratic Discriminant Analysis
#basemodel = SGDClassifier(loss='log', penalty='l1') # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
#print "supervised log.reg. score", basemodel.score(X, ytrue)

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
y_score = ssmodel.predict(heart.data)
#print("heart.target", heart.target)
#print("predicted labels", y_score)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

fpr = dict()
tpr = dict()
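The snippet breaks off right after initializing the fpr/tpr dictionaries, presumably on its way to a ROC curve. A sketch of the likely continuation with scikit-learn, assuming probability scores are available (the hard labels from predict() would give only a degenerate two-point curve):

from sklearn.metrics import roc_curve, auc

# prefer class-1 probabilities when the model exposes them
scores = ssmodel.predict_proba(X)[:, 1] if hasattr(ssmodel, "predict_proba") else y_score
fpr, tpr, thresholds = roc_curve(ytrue, scores)
print("self-learning AUC:", auc(fpr, tpr))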
Example #7

    # Extra methods which were used for initial evaluation purposes
    ################################################################################
    # Create the semi-supervised RBF (radial basis function) classifier
    lbl = "Label Propagation(RBF):"
    print(lbl)
    rbf_model = label_propagation.LabelSpreading(kernel='rbf', gamma=20,
                                                 max_iter=50, tol=0.0001)
    rbf_model.fit(train_data, train_labels)
    y_predict = rbf_model.predict(X_extra)

    accuracy = accuracy_score(y_extra, y_predict)
    error_rate = 1 - accuracy
    logLik = -np.sum(stats.norm.logpdf(y_extra, loc=y_predict, scale=sd))
    print('RBF Error Rate:', error_rate, logLik)
    ################################################################################
    ###############################################################################
    lbl = "Self Learning Model:"
    print(lbl)
    model = SelfLearningModel(WQDA())
    model.fit(train_data, train_labels)
    y_predict = model.predict(X_extra)
    # Calculate the negative log-likelihood as the negative sum of the log of a
    # normal PDF where the observed values are normally distributed around the
    # mean (y_predict) with a standard deviation of sd

    accuracy = accuracy_score(y_extra, y_predict)
    error_rate = 1 - accuracy
    logLik = -np.sum(stats.norm.logpdf(y_extra, loc=y_predict, scale=sd))
    print('Self Learning Error Rate:', error_rate, logLik)
    ###############################################################################
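As a sanity check on the logLik quantity used above: the normal log-PDF penalizes predictions quadratically in their distance from the observed labels, so the negative sum grows with the error count. A toy illustration (the scale value is our assumption; sd is defined elsewhere in the original script):

import numpy as np
from scipy import stats

y_obs = np.array([0, 1, 1, 0])
for y_hat in ([0, 1, 1, 0], [0, 1, 1, 1], [1, 0, 0, 1]):
    nll = -np.sum(stats.norm.logpdf(y_obs, loc=np.array(y_hat), scale=0.5))
    print(y_hat, "->", round(nll, 3))  # 0, 1 and 4 wrong labels respectively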