def SelfTrainingWrapper(X_train, y_train, X_test):
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from frameworks.SelfLearning import SelfLearningModel
    # base classifier; warm_start lets the forest grow incrementally across fits
    clf = RandomForestClassifier(warm_start=True, n_estimators=1000)
    ssmodel = SelfLearningModel(clf, prob_threshold=0.9)
    # mark the test points as unlabeled (-1) and fit on the combined data
    newlabels = np.concatenate((np.array(y_train), -np.ones(len(X_test))))
    ssmodel.fit(np.concatenate((X_train, X_test)), newlabels)
    return ssmodel.predict(X_test)
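A minimal usage sketch for the wrapper above, assuming a standard scikit-learn train/test split; the dataset below is synthetic and purely illustrative, not from the source:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# synthetic data just to exercise the wrapper (illustrative only)
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
preds = SelfTrainingWrapper(X_train, y_train, X_test)
print("test accuracy:", np.mean(preds == y_test))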
Example #2
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
# sample an equal number of labeled points from each class
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
                        random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
#basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss='log',
                          penalty='l1')  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True
                            )  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue))

# semi-supervised score, RBF SVM model
ssmodel = CPLELearningModel(sklearn.svm.SVC(kernel="rbf", probability=True),
                            predict_from_probabilities=True)  # RBF SVM
ssmodel.fit(X, ys)
print("CPLE semi-supervised RBF SVM score", ssmodel.score(X, ytrue))
Example #3
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
# sample an equal number of labeled points from each class
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + random.sample(
    list(np.where(ytrue == 1)[0]), labeled_N // 2
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print "supervised log.reg. score", basemodel.score(X, ytrue)

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print "self-learning log.reg. score", ssmodel.score(X, ytrue)

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print "CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue)

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print "CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue)

# semi-supervised score, RBF SVM model
ssmodel = CPLELearningModel(sklearn.svm.SVC(kernel="rbf", probability=True), predict_from_probabilities=True)  # RBF SVM
ssmodel.fit(X, ys)
Example #4
#### Jurek step 5: the self-learning training process
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from frameworks.SelfLearning import SelfLearningModel
models = []
labels_per_schema = []
probs_per_schema = []
for i, schema in enumerate(sim_schemas):
    model = SelfLearningModel(LogisticRegression(tol=1e-3, solver='liblinear'))
    models.append(model)
    x = pairs[schema].values
    # start with every candidate pair unlabeled (-1)
    y_df = pd.DataFrame([-1] * x.shape[0],
                        columns=['y'],
                        index=pairs.index)
    y_df.loc[Xms[i]] = 1
    y_df.loc[Xus[i]] = 0
    y = y_df['y'].values
    model.fit(x, y)
    labels = model.predict(x)
    labels_per_schema.append(labels)
    probs_per_schema.append(model.predict_proba(x)[:, 1])

# majority vote across schemas: round the mean of the per-schema predictions
labels_ensemble = np.rint(np.mean(labels_per_schema, axis=0)).astype(int)


### Jurek step 6: drop classifiers whose predictions disagree with the others beyond a threshold
def calculate_CRs(labels_per_schema, labels_ensemble):
    """Return each schema model's agreement rate with the ensemble vote."""
    CRs = []
    for labels in labels_per_schema:
        CRs.append(np.mean(labels == labels_ensemble))
    return CRs
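The step-6 comment describes removing disagreeing classifiers, but the snippet stops at computing the agreement rates. A minimal sketch of how that threshold might be applied; `cr_threshold` and its value are hypothetical, not from the source:

# hypothetical filtering step: keep only the models whose predictions agree
# with the ensemble vote at least cr_threshold of the time
cr_threshold = 0.9  # illustrative cutoff, not from the source
CRs = calculate_CRs(labels_per_schema, labels_ensemble)
kept = [i for i, cr in enumerate(CRs) if cr >= cr_threshold]
models = [models[i] for i in kept]
labels_per_schema = [labels_per_schema[i] for i in kept]
probs_per_schema = [probs_per_schema[i] for i in kept]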
Example #5
        basemodel.fit(X_model[random_labeled_points, :],
                      ys[random_labeled_points])
        #print("supervised log.reg. score", basemodel.score(X_test, y_test))

        #if j == 2:
        #Plot the base model
        #    evaluate_and_plot(basemodel, X_model, ys, ytrue, "Logistic Regression", subplot = 1, block=True)

        #Calculate accuracy
        sum_super += basemodel.score(X_test, y_test)
        super_acc[i] = basemodel.score(X_test, y_test)
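        # accumulate the 95% normal-approximation confidence half-width
        # of the accuracy estimate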
        sum_super_err += 1.96 * np.sqrt(super_acc[i] *
                                        (1 - super_acc[i]) / X_test.shape[0])

        # fast (but naive, unsafe) self learning framework
        ssmodel = SelfLearningModel(basemodel)
        ssmodel.fit(X_model, ys)
        #print "self-learning log.reg. score", ssmodel.score(X_test, y_test)

        #if j == 2:
        #Plot the ssmodel
        #    evaluate_and_plot(ssmodel, X_model, ys, ytrue, "Self-Learning", subplot = 2, block=True)

        #Calculate accuracy
        sum_semi += ssmodel.score(X_test, y_test)
        semi_acc[i] = ssmodel.score(X_test, y_test)
        sum_semi_err += 1.96 * np.sqrt(semi_acc[i] *
                                       (1 - semi_acc[i]) / X_test.shape[0])

        #if j==2:
        #Save the figure
        #    plt.savefig(('comparisons_' + str(j) + '_' + str(i) + '.png'))
Example #6
# label a few points 
labeled_N = 30
ys = np.array([-1]*len(ytrue)) # -1 denotes unlabeled point
# sample an equal number of labeled points from each class
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
                        random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score 
basemodel = WQDA() # weighted Quadratic Discriminant Analysis
#basemodel = SGDClassifier(loss='log', penalty='l1') # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
#print "supervised log.reg. score", basemodel.score(X, ytrue)

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("this is the fitted thing", ssmodel.fit(X,ys))
y_score = ssmodel.predict(heart.data)
#print "heart.target", heart.target
#print "this is the prediction", y_score
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(2):
    # label_binarize returns an (n, 1) column for two classes; ravel to 1-D.
    # Note: for a binary problem both iterations trace the same curve.
    fpr[i], tpr[i], _ = roc_curve(label_binarize(heart.target, classes=[0, 1]).ravel(),
                                  label_binarize(y_score, classes=[0, 1]).ravel())
    roc_auc[i] = auc(fpr[i], tpr[i])

for i in range(2):
    plt.plot(fpr[i], tpr[i],
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
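The listing breaks off here; typical finishing touches for a plot like this might look as follows (assumed boilerplate, not part of the original example):

# assumed plot boilerplate, not from the original snippet
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()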