def SelfTraingWrapper(X_train, y_train, X_test):
    """Self-training wrapper: fit a random forest on the labeled training
    rows plus the unlabeled test rows (marked -1), then label the test set.

    Returns the predicted labels for ``X_test``.
    """
    from frameworks.SelfLearning import SelfLearningModel

    base_clf = RandomForestClassifier(warm_start=True, n_estimators=1000)
    self_trainer = SelfLearningModel(base_clf, prob_threshold=0.9)
    # -1 is the convention for "unlabeled" in the self-learning framework.
    X_all = np.concatenate((X_train, X_test))
    y_all = np.concatenate((np.asarray(y_train), np.full(len(X_test), -1.0)))
    self_trainer.fit(X_all, y_all)
    return self_trainer.predict(X_test)
#### Jurek_step 5: the self-learning training process
models = []
labels_per_schema = []
probs_per_schema = []
for schema_idx, schema in enumerate(sim_schemas):
    # One self-learning logistic-regression model per similarity schema.
    clf = SelfLearningModel(LogisticRegression(tol=1e-3, solver='liblinear'))
    models.append(clf)
    features = pairs[schema].values
    # Every pair starts unlabeled (-1); known matches become 1 and known
    # non-matches become 0 via label-based .loc assignment on pairs.index.
    label_df = pd.DataFrame({'y': [-1] * features.shape[0]}, index=pairs.index)
    label_df.loc[Xms[schema_idx]] = 1
    label_df.loc[Xus[schema_idx]] = 0
    targets = label_df['y'].values
    clf.fit(features, targets)
    predicted = clf.predict(features)
    labels_per_schema.append(predicted)
    probs_per_schema.append(clf.predict_proba(features)[:, 1])

# Majority vote across schemas: for each pair, average the per-schema 0/1
# labels and round to the nearest integer (Python round(), i.e. half-to-even,
# exactly as the original index-based comprehension did).
n_schemas = len(labels_per_schema)
labels_ensemble = np.array(
    [round(sum(votes) / n_schemas) for votes in zip(*labels_per_schema)],
    int,
)


### Jurek_step 6: remove the classifiers whose predictions disagree with the other classifiers beyond a threshold
def calculate_CRs(labels_per_schema, labels_ensemble):
    """Concordance rate of each schema's labels against the ensemble vote.

    Parameters
    ----------
    labels_per_schema : sequence of array-like
        Predicted 0/1 labels from each schema's classifier, one array per
        schema, all the same length as ``labels_ensemble``.
    labels_ensemble : array-like
        Majority-vote ensemble labels.

    Returns
    -------
    list of float
        For each schema, the fraction of positions agreeing with the ensemble.
    """
    ensemble = np.asarray(labels_ensemble)
    # Mean of the boolean agreement mask IS the concordance rate; the original
    # wrapped the comparison in a one-element list before np.average, which
    # only added a redundant leading axis that averaging flattened away.
    return [np.mean(np.asarray(labels) == ensemble)
            for labels in labels_per_schema]
# Example #3
# (stray "0" output line from the original source removed from execution)
ys = np.array([-1] * len(ytrue))  # -1 denotes an unlabeled point
# Seed the semi-supervised model with labeled_N known labels, split evenly
# between the two classes.  Two Python-3 fixes: labeled_N / 2 is a float and
# random.sample() requires an integer k (floor-divide instead), and
# random.sample() rejects ndarray populations ("Population must be a
# sequence"), so the index arrays are converted to lists first.
random_labeled_points = (
    random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2)
    + random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# Supervised baseline: fit only on the seeded labeled points.
basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
#basemodel = SGDClassifier(loss='log', penalty='l1') # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])

# Fast (but naive, unsafe) self-learning framework.
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
# fit() follows the sklearn convention of returning self, so the original's
# second ssmodel.fit(X, ys) call inside the print re-trained the model for no
# benefit; print the already-fitted estimator instead.
print("this is the fitted thing", ssmodel)
# NOTE(review): prediction uses heart.data while scoring uses X — presumably
# X is heart.data; confirm upstream.
y_score = ssmodel.predict(heart.data)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

fpr = dict()
tpr = dict()
roc_auc = dict()
# The binarized arrays do not depend on the loop variable, so they are
# computed once instead of per iteration.  NOTE(review): the loop therefore
# stores the identical curve under both class keys — a true per-class ROC
# would need per-class scores (e.g. predict_proba[:, i]); y_score here is
# hard labels, so only one distinct curve exists.  Behavior preserved.
true_bin = label_binarize(heart.target, classes=[0, 1])
pred_bin = label_binarize(y_score, classes=[0, 1])
for i in range(2):
    fpr[i], tpr[i], _ = roc_curve(true_bin, pred_bin)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Draw one ROC trace per class, annotated with its AUC.
for class_idx in range(2):
    curve_label = f'ROC curve of class {class_idx} (area = {roc_auc[class_idx]:0.2f})'
    plt.plot(fpr[class_idx], tpr[class_idx], label=curve_label)