# Reveal the true label at the sampled positions only; all other entries of
# ys are left untouched.
ys[random_labeled_points] = ytrue[random_labeled_points]

# Supervised baseline: scikit-learn logistic regression (SGD, L1 penalty)
# trained on the labeled rows alone. WQDA() (weighted Quadratic Discriminant
# Analysis) can be swapped in here as the base model instead.
basemodel = SGDClassifier(loss='log', penalty='l1')
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# Semi-supervised variants, evaluated in this order. Each entry pairs the
# caption to print with a factory, so that every model is only constructed
# right before it is fitted.
#   1. fast (but naive, unsafe) self-learning around the base model
#   2. CPLE around the base model (base model has to be able to take
#      weighted samples)
#   3. CPLE around weighted Quadratic Discriminant Analysis
#   4. CPLE around an RBF SVM
semi_supervised_runs = (
    ("self-learning log.reg. score",
     lambda: SelfLearningModel(basemodel)),
    ("CPLE semi-supervised log.reg. score",
     lambda: CPLELearningModel(basemodel)),
    ("CPLE semi-supervised WQDA score",
     lambda: CPLELearningModel(WQDA(), predict_from_probabilities=True)),
    ("CPLE semi-supervised RBF SVM score",
     lambda: CPLELearningModel(sklearn.svm.SVC(kernel="rbf", probability=True),
                               predict_from_probabilities=True)),
)
for caption, build_model in semi_supervised_runs:
    ssmodel = build_model()
    ssmodel.fit(X, ys)
    print(caption, ssmodel.score(X, ytrue))
else: X = X_raw # Split the data into test and train data test_pcnt = 0.15 X_train = X[:int(len(X) * (1 - test_pcnt)), :] X_test = X[int(len(X) * (1 - test_pcnt)):, :] y_train = y[:int(len(X) * (1 - test_pcnt))] y_test = y[int(len(X) * (1 - test_pcnt)):] ytrue = y_train print X_train.shape, y_train.shape # Just supervised score basemodel = WQDA() # weighted Quadratic Discriminant Analysis #basemodel = SGDClassifier(loss='log', penalty='l1') # scikit logistic regression basemodel.fit(X_train, ytrue) print "full labeled wqda score", basemodel.score(X_test, y_test) print "standard error of wqda", 1.96 * np.sqrt( basemodel.score(X_test, y_test) * (1 - basemodel.score(X_test, y_test)) / X_test.shape[0]) # Just supervised score #basemodel = WQDA() # weighted Quadratic Discriminant Analysis basemodel = SGDClassifier(loss='log', penalty='l1') # scikit logistic regression basemodel.fit(X_train, ytrue) print "full labeled log.reg. score", basemodel.score(X_test, y_test) print "standard error of log reg", 1.96 * np.sqrt( basemodel.score(X_test, y_test) *
# Ground truth: the first N entries are class 0, the remaining N are class 1.
ytrue = np.array([0] * N + [1] * N)

# Training targets for the semi-supervised models; every entry starts at -1
# and only the randomly chosen ones below receive a class label.
ys = np.array([-1] * (2 * N))

# Spend half of the labeling budget on each class. Floor division keeps the
# loop count an integer on Python 3 and is identical to `/` on Python 2 ints.
# NOTE: indices are drawn with replacement, so a repeated draw leaves fewer
# than `supevised_data_points` entries labeled.
labels_per_class = supevised_data_points // 2
for _ in range(labels_per_class):
    ys[np.random.randint(0, N)] = 0
for _ in range(labels_per_class):
    ys[np.random.randint(N, 2 * N)] = 1

# Rows that received a label form the purely supervised training set.
labeled_mask = ys != -1
Xsupervised = Xs[labeled_mask, :]
ysupervised = ys[labeled_mask]

# compare models
# `print(lbl)` with a single argument prints the same text on Python 2 and 3.
lbl = "Purely supervised QDA:"
print(lbl)
model = WQDA()
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)

lbl = "SelfLearning QDA:"
print(lbl)
model = SelfLearningModel(WQDA())
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)

lbl = "CPLE(pessimistic) QDA:"
print(lbl)
model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)
# Rows whose entry in ys is not -1 form the purely supervised training set.
Xsupervised = Xs[ys != -1, :]
ysupervised = ys[ys != -1]

plt.figure()
cols = [np.array([1, 0, 0]), np.array([0, 1, 0])]  # colors

# loop through and compare methods, one subplot of a 2x2 grid per method
for i in range(4):
    plt.subplot(2, 2, i + 1)
    t1 = time.time()

    # train model
    if i == 0:
        lbl = "Purely supervised QDA:"
        model = WQDA()
        model.fit(Xsupervised, ysupervised)
    else:
        if i == 1:
            lbl = "SelfLearning QDA:"
            model = SelfLearningModel(WQDA())
        elif i == 2:
            lbl = "CPLE(pessimistic) QDA:"
            model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
        else:  # i == 3
            lbl = "CPLE(optimistic) QDA:"
            # Ask for the optimistic variant per instance. Assigning
            # `CPLELearningModel.pessimistic = False` on the class would leak
            # into every CPLELearningModel created afterwards, and would be
            # ignored if __init__ stores its own `pessimistic` value.
            # NOTE(review): assumes the constructor accepts `pessimistic` -
            # confirm against the installed CPLELearningModel.
            model = CPLELearningModel(WQDA(), pessimistic=False,
                                      predict_from_probabilities=True)
        # All semi-supervised variants train on the full, partly labeled set.
        model.fit(Xs, ys)

    # Single-argument print calls emit the same text on Python 2 and 3.
    print("")
    print(lbl)
# Rows whose entry in ys is not -1 form the purely supervised training set.
Xsupervised = Xs[ys != -1, :]
ysupervised = ys[ys != -1]

plt.figure()
cols = [np.array([1, 0, 0]), np.array([0, 1, 0])]  # colors

# loop through and compare methods, one subplot of a 2x2 grid per method
for i in range(4):
    plt.subplot(2, 2, i + 1)
    t1 = time.time()

    # train model
    if i == 0:
        lbl = "Purely supervised QDA:"
        model = WQDA()
        model.fit(Xsupervised, ysupervised)
    else:
        if i == 1:
            lbl = "SelfLearning QDA:"
            model = SelfLearningModel(WQDA())
        elif i == 2:
            lbl = "CPLE(pessimistic) QDA:"
            model = CPLELearningModel(WQDA())
        else:  # i == 3
            lbl = "CPLE(optimistic) QDA:"
            # Select the optimistic variant through the constructor rather
            # than by assigning `CPLELearningModel.pessimistic` on the class:
            # the class-level assignment affects every later instance and is
            # ignored if __init__ stores its own `pessimistic` value.
            # NOTE(review): assumes the constructor accepts `pessimistic` -
            # confirm against the installed CPLELearningModel.
            model = CPLELearningModel(WQDA(), pessimistic=False)
        # All semi-supervised variants train on the full, partly labeled set.
        model.fit(Xs, ys)

    # Single-argument print calls emit the same text on Python 2 and 3.
    print("")
    print(lbl)
# load data
heart = fetch_mldata("heart")
X = heart.data
ytrue = np.copy(heart.target)
# Recode -1 targets as 0; the sampling below looks for classes 0 and 1.
ytrue[ytrue == -1] = 0

# label a few points
labeled_N = 30
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point

# Draw the same number of labeled points from each class. Floor division
# keeps the sample size an integer on Python 3, and the index arrays are
# converted to lists because random.sample expects a sequence.
per_class = labeled_N // 2
random_labeled_points = (
    random.sample(list(np.where(ytrue == 0)[0]), per_class)
    + random.sample(list(np.where(ytrue == 1)[0]), per_class)
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# Weighted Quadratic Discriminant Analysis is the base model here;
# SGDClassifier(loss='log', penalty='l1') is the logistic-regression
# alternative.
basemodel = WQDA()
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
# Train once and keep whatever fit() returns for the debug print below; the
# original trained the model a second time only to print that value.
fit_result = ssmodel.fit(X, ys)
print("this is the fitted thing", fit_result)

y_score = ssmodel.predict(heart.data)
# NOTE(review): the caption says "log.reg." although the base model above is
# WQDA - confirm which wording is intended.
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

fpr = {}
tpr = {}
# Ground truth: the first N entries are class 0, the remaining N are class 1.
ytrue = np.array([0] * N + [1] * N)

# Training targets; every entry starts at -1 and only the randomly chosen
# ones below receive a class label.
ys = np.array([-1] * (2 * N))

# Half of the labeling budget goes to each class. `//` is identical to `/`
# for Python 2 ints and keeps range() working on Python 3.
# NOTE: indices are drawn with replacement, so duplicates reduce the number
# of entries that actually end up labeled.
labels_per_class = supevised_data_points // 2
for _ in range(labels_per_class):
    ys[np.random.randint(0, N)] = 0
for _ in range(labels_per_class):
    ys[np.random.randint(N, 2 * N)] = 1

# Rows that received a label form the purely supervised training set.
Xsupervised = Xs[ys != -1, :]
ysupervised = ys[ys != -1]

# compare models
# Each entry: caption, model factory, training inputs, training targets.
# The supervised baseline sees only the labeled rows; the semi-supervised
# wrappers see everything. Factories delay construction until just before
# fitting, matching the original statement order.
comparisons = (
    ("Purely supervised QDA:",
     WQDA,
     Xsupervised, ysupervised),
    ("SelfLearning QDA:",
     lambda: SelfLearningModel(WQDA()),
     Xs, ys),
    ("CPLE(pessimistic) QDA:",
     lambda: CPLELearningModel(WQDA(), predict_from_probabilities=True),
     Xs, ys),
)
for panel, (lbl, make_model, train_X, train_y) in enumerate(comparisons, 1):
    # Single-argument print call: same output on Python 2 and Python 3.
    print(lbl)
    model = make_model()
    model.fit(train_X, train_y)
    evaluate_and_plot(model, Xs, ys, ytrue, lbl, panel)
# EXTRA METHODS WHICH WERE USED FOR INITIAL EVALUATION PURPOSE
################################################################################
# Create the semi supervised RBF (Radial basis function) classifier.
# NOTE(review): the caption says "Label Propagation" while the estimator
# constructed below is LabelSpreading - confirm which one is intended.
lbl = "Label Propagation(RBF):"
print(lbl)
rbf_model = label_propagation.LabelSpreading(
    kernel='rbf', gamma=20, max_iter=50, tol=0.0001)
rbf_model.fit(train_data, train_labels)
y_predict = rbf_model.predict(X_extra)

accuracy = accuracy_score(y_extra, y_predict)
error_rate = 1 - accuracy
# Despite its name, logLik holds the NEGATIVE log-likelihood (see the
# explanation in the self-learning section below).
logLik = -np.sum(stats.norm.logpdf(y_extra, loc=y_predict, scale=sd))
# Joining str() of each item with single spaces emits the same text as the
# Python 2 statement `print a, b, c`, and also runs on Python 3.
print(" ".join(str(part) for part in ('RBF Error Rate:', error_rate, logLik)))
################################################################################

###############################################################################
lbl = "Self Learning Model:"
print(lbl)
model = SelfLearningModel(WQDA())
model.fit(train_data, train_labels)
y_predict = model.predict(X_extra)

# Calculate the negative log-likelihood as the negative sum of the log of a
# normal PDF where the observed values (y_extra) are normally distributed
# around the mean (y_predict) with a standard deviation of sd.
accuracy = accuracy_score(y_extra, y_predict)
error_rate = 1 - accuracy
logLik = -np.sum(stats.norm.logpdf(y_extra, loc=y_predict, scale=sd))
print(" ".join(str(part)
               for part in ('Self Learning Error Rate:', error_rate, logLik)))
###############################################################################