def SelfTrainingWrapper(X_train, y_train, X_test):
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from frameworks.SelfLearning import SelfLearningModel

    clf = RandomForestClassifier(warm_start=True, n_estimators=1000)
    ssmodel = SelfLearningModel(clf, prob_threshold=0.9)
    # mark the test points as unlabeled (-1) and let self-training label them
    newlabels = np.concatenate((np.array(y_train), -np.ones(len(X_test))))
    ssmodel.fit(np.concatenate((X_train, X_test)), newlabels)
    return ssmodel.predict(X_test)
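A quick smoke test for this wrapper, using an illustrative synthetic dataset (the make_classification data and the 50/50 split are assumptions for the demo, not part of the original):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# illustrative data; any binary classification set works here
X, y = make_classification(n_samples=400, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

preds = SelfTrainingWrapper(X_train, y_train, X_test)
print("self-training accuracy:", np.mean(preds == y_test))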
Example #2
# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
                        random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
#basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss='log',
                          penalty='l1')  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True
                            )  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue))

# semi-supervised score, RBF SVM model
ssmodel = CPLELearningModel(sklearn.svm.SVC(kernel="rbf", probability=True),
                            predict_from_probabilities=True)  # RBF SVM
ssmodel.fit(X, ys)
print("CPLE semi-supervised RBF SVM score", ssmodel.score(X, ytrue))
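Every snippet on this page assumes the same preamble: a feature matrix X, true labels ytrue, and the semi-supervised classes imported from the semisup-learn project. A minimal setup sketch, assuming that package layout (only frameworks.SelfLearning is confirmed by Example #1; the other module paths and the make_classification stand-in data are assumptions):

import random
import numpy as np
import sklearn.svm
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification

from frameworks.SelfLearning import SelfLearningModel   # as in Example #1
from frameworks.CPLELearning import CPLELearningModel   # assumed semisup-learn layout
from methods.scikitWQDA import WQDA                     # assumed semisup-learn layout

# illustrative data standing in for the X / ytrue used throughout
X, ytrue = make_classification(n_samples=200, n_features=5, random_state=1)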
Example #3
# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + random.sample(
    list(np.where(ytrue == 1)[0]), labeled_N // 2
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print "supervised log.reg. score", basemodel.score(X, ytrue)

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print "self-learning log.reg. score", ssmodel.score(X, ytrue)

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print "CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue)

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print "CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue)

# semi-supervised score, RBF SVM model
ssmodel = CPLELearningModel(sklearn.svm.SVC(kernel="rbf", probability=True), predict_from_probabilities=True)  # RBF SVM
Example #4
        S_with_lowest_Q_statistic = S_lowest_new.copy()
        sim_schemas_lowest = sim_schemas_lowest_new.copy()
        Xms_lowest = Xms_lowest_new.copy()
        Xus_lowest = Xus_lowest_new.copy()
    return sim_schemas_lowest, Xms_lowest, Xus_lowest


sim_schemas, Xms, Xus = select_sim_schemas_with_high_Q_statistic(
    sim_schemas, Xms, Xus, 10)

#### Jurek_step 5 the self learning training process
models = []
labels_per_schema = []
probs_per_schema = []
for i, schema in enumerate(sim_schemas):
    model = SelfLearningModel(LogisticRegression(tol=1e-3, solver='liblinear'))
    models.append(model)
    x = pairs[schema].values
    y_df = pd.DataFrame(list(-1 for i in range(x.shape[0])),
                        columns=['y'],
                        index=pairs.index)
    y_df.loc[Xms[i]] = 1
    y_df.loc[Xus[i]] = 0
    y = y_df['y'].values
    model.fit(x, y)
    labels = model.predict(x)
    labels_per_schema.append(labels)
    probs_per_schema.append(model.predict_proba(x)[:, 1])

# majority vote over the per-schema labels (tail of the comprehension reconstructed)
labels_ensemble = np.array([round(sum(labels_per_schema[i][j]
                                      for i in range(len(labels_per_schema)))
                                  / len(labels_per_schema))
                            for j in range(len(labels_per_schema[0]))])
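probs_per_schema is collected above but the fragment ends before it is used; a plausible probability-averaging ensemble mirroring the label vote (an assumption, not shown in the source):

# average the positive-class probabilities across schemas, then threshold at 0.5
probs_ensemble = np.mean(np.vstack(probs_per_schema), axis=0)
labels_from_probs = (probs_ensemble >= 0.5).astype(int)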
Example #5
# set the labels of the labeled samples
ys[random_labeled_points] = ytrue[random_labeled_points]  # 2

# print(X[random_labeled_points])
# supervised score
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
# SGDClassifier
basemodel = SGDClassifier(loss='log',
                          penalty='l1')  # scikit logistic regression
# model fit
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# self learning framework
# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)  # default: uses sample weighting
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print "CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue)

# semi-supervised score, WQDA model
# WQDA: Weighted Quadratic Discriminant Analysis
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True
                            )  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print "CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue)
Example #6
kernel = "rbf"
Xsupervised = X[ys != -1, :]
ysupervised = ys[ys != -1]

lbl = "Base model SVM(kernel=rbf):"
print(lbl)
basemodel = sklearn.svm.SVC(kernel=kernel, probability=True)
basemodel.fit(Xsupervised, ysupervised)
evaluate(basemodel, X, ys, ytrue, lbl)
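evaluate is not defined in these snippets; a minimal stand-in consistent with how it is called on this page, evaluate(model, X, ys, ytrue, lbl), might look like this (an assumed helper, not the project's actual one):

def evaluate(model, X, ys, ytrue, lbl):
    # accuracy over all points and over the unlabeled (-1) points only
    acc = model.score(X, ytrue)
    unlabeled = ys == -1
    print(lbl, "all: %.3f  unlabeled: %.3f"
          % (acc, model.score(X[unlabeled, :], ytrue[unlabeled])))
    return acc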

# basemodel = SGDClassifier(loss='hinge', penalty='l1', tol=1e-3, max_iter=1000) # linear SVM via SGD (hinge loss)
# basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
# print ("supervised log.reg. score", basemodel.score(X, ytrue))
#
# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print ("self-learning log.reg. score", ssmodel.score(X, ytrue))

kernel = "rbf"

Xsupervised = X[ys != -1, :]
ysupervised = ys[ys != -1]

lbl = "Purely supervised SVM:"
print(lbl)
model = sklearn.svm.SVC(kernel=kernel, probability=True)
model.fit(Xsupervised, ysupervised)
evaluate(model, X, ys, ytrue, lbl)

lbl =  "S3VM (Gieseke et al. 2012):"
    ys[np.random.randint(N, 2 * N)] = 1

Xsupervised = Xs[ys != -1, :]
ysupervised = ys[ys != -1]

# compare models

lbl = "Purely supervised QDA:"
print(lbl)
model = WQDA()
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)

lbl = "SelfLearning QDA:"
print(lbl)
model = SelfLearningModel(WQDA())
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)

lbl = "CPLE(pessimistic) QDA:"
print(lbl)
model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)

lbl = "CPLE(optimistic) QDA:"
print(lbl)
CPLELearningModel.pessimistic = False
model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 4, block=True)
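evaluate_and_plot is likewise undefined here; a hypothetical stand-in matching the calls above, evaluate_and_plot(model, Xs, ys, ytrue, lbl, subplot, block=False), assuming 2-D data and matplotlib:

import matplotlib.pyplot as plt

def evaluate_and_plot(model, Xs, ys, ytrue, lbl, subplot, block=False):
    acc = model.score(Xs, ytrue)
    print(lbl, "accuracy: %.3f" % acc)
    plt.subplot(2, 2, subplot)
    plt.title("%s %.3f" % (lbl, acc))
    # predictions for all points, with the few labeled points outlined
    plt.scatter(Xs[:, 0], Xs[:, 1], c=model.predict(Xs), alpha=0.4)
    plt.scatter(Xs[ys != -1, 0], Xs[ys != -1, 1],
                c=ys[ys != -1], edgecolors="k", s=60)
    if block:
        plt.show()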
Example #8
        ys[random_labeled_points] = ytrue[random_labeled_points]

        # supervised score
        basemodel = SGDClassifier(loss='hinge',
                                  penalty='l1',
                                  tol=1e-3,
                                  max_iter=1000)  # linear SVM trained via SGD (hinge loss)
        basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
        acc = basemodel.score(X, ytrue)
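        # note: `if acc:` below treats an accuracy of exactly 0.0 as falsy and skips it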
        if acc:
            sgd_active.append(acc)

        kernel = "rbf"

        svm_model = sklearn.svm.SVC(kernel=kernel, probability=True)
        ssmodel = SelfLearningModel(svm_model)
        ssmodel.fit(X, ys)
        acc = ssmodel.score(X, ytrue)
        if acc:
            self_learning_active.append(acc)

        Xsupervised = X[ys != -1, :]
        ysupervised = ys[ys != -1]

        lbl = "Purely supervised SVM:"
        model = sklearn.svm.SVC(kernel=kernel, probability=True)
        model.fit(Xsupervised, ysupervised)
        acc = evaluate(model, X, ys, ytrue, lbl)
        print("SVM Accuracy:{}".format(acc))
        if acc:
            svm_active.append(acc)
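The *_active lists accumulated in this loop are presumably averaged after it; a minimal summary sketch (assumed, not part of the snippet):

import numpy as np

print("mean SGD accuracy:", np.mean(sgd_active))
print("mean self-learning accuracy:", np.mean(self_learning_active))
print("mean supervised SVM accuracy:", np.mean(svm_active))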
Example #9
        basemodel.fit(X_model[random_labeled_points, :],
                      ys[random_labeled_points])
        #print "supervised log.reg. score", basemodel.score(X_test, y_test)

        #if j == 2:
        #Plot the base model
        #    evaluate_and_plot(basemodel, X_model, ys, ytrue, "Logistic Regression", subplot = 1, block=True)

        #Calculate accuracy
        super_acc[i] = basemodel.score(X_test, y_test)
        sum_super += super_acc[i]
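        # 1.96 is the z-value for a ~95% normal-approximation (Wald) interval on accuracy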
        sum_super_err += 1.96 * np.sqrt(super_acc[i] *
                                        (1 - super_acc[i]) / X_test.shape[0])

        # fast (but naive, unsafe) self learning framework
        ssmodel = SelfLearningModel(basemodel)
        ssmodel.fit(X_model, ys)
        #print "self-learning log.reg. score", ssmodel.score(X_test, y_test)

        #if j == 2:
        #Plot the ssmodel
        #    evaluate_and_plot(ssmodel, X_model, ys, ytrue, "Self-Learning", subplot = 2, block=True)

        #Calculate accuracy
        semi_acc[i] = ssmodel.score(X_test, y_test)
        sum_semi += semi_acc[i]
        sum_semi_err += 1.96 * np.sqrt(semi_acc[i] *
                                       (1 - semi_acc[i]) / X_test.shape[0])

        #if j==2:
        #Save the figure
Example #10
        ys[random_labeled_points] = ytrue[random_labeled_points]

        # supervised score
        basemodel = SGDClassifier(loss='hinge',
                                  penalty='l1',
                                  tol=1e-3,
                                  max_iter=1000)  # linear SVM trained via SGD (hinge loss)
        basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])

        sgd_active.append(basemodel.score(X, ytrue))

        kernel = "rbf"

        svm_model = sklearn.svm.SVC(kernel=kernel, probability=True)
        ssmodel = SelfLearningModel(svm_model)
        ssmodel.fit(X, ys)
        self_learning_active.append(ssmodel.score(X, ytrue))

        Xsupervised = X[ys != -1, :]
        ysupervised = ys[ys != -1]

        lbl = "Purely supervised SVM:"
        model = sklearn.svm.SVC(kernel=kernel, probability=True)
        model.fit(Xsupervised, ysupervised)
        acc = evaluate(model, X, ys, ytrue, lbl)
        svm_active.append(acc)

        lbl = "S3VM (Gieseke et al. 2012):"
        model = scikitTSVM.SKTSVM(kernel=kernel)
        model.fit(X, ys)
Example #11
# label a few points 
labeled_N = 30
ys = np.array([-1]*len(ytrue)) # -1 denotes unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
                        random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score 
basemodel = WQDA() # weighted Quadratic Discriminant Analysis
#basemodel = SGDClassifier(loss='log', penalty='l1') # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
#print "supervised log.reg. score", basemodel.score(X, ytrue)

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("this is the fitted model", ssmodel)
y_score = ssmodel.predict(heart.data)
#print "heart.target", heart.target
#print "this is the prediction", y_score
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(2):
    fpr[i], tpr[i], _ = roc_curve(label_binarize(heart.target, classes=[0, 1]).ravel(),
                                  label_binarize(y_score, classes=[0, 1]).ravel())
    roc_auc[i] = auc(fpr[i], tpr[i])
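For a binary problem this loop computes the identical curve for both values of i; a simpler equivalent, assuming heart.target and y_score are plain 0/1 vectors, is:

fpr_bin, tpr_bin, _ = roc_curve(heart.target, y_score)
print("self-learning ROC AUC:", auc(fpr_bin, tpr_bin))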

for i in range(2):