Example #1
0
def tsvm(train_examples, train_labels, test_examples, test_labels, verbose):
    """Fit a CPLE-wrapped RBF SVM and return its accuracy on the test set.

    The base classifier is an SVC with fixed hyper-parameters (C=10,
    gamma=0.01); CPLE uses its predicted probabilities during the
    semi-supervised fit. The `verbose` argument is accepted for interface
    compatibility but is not used.
    """
    base_svc = SVC(kernel="rbf", C=10, gamma=0.01, probability=True)
    classifier = CPLELearningModel(base_svc, predict_from_probabilities=True)
    classifier.fit(train_examples, train_labels)
    return classifier.score(test_examples, test_labels)
 def CPLELearningWrapper(X_train, y_train, X_test):
     """Semi-supervised prediction for X_test via CPLE over logistic SGD.

     Labelled training data and the (unlabelled) test points are fitted
     together; the test points are marked with label -1, which the CPLE
     framework treats as "unlabelled". Returns the predicted labels for
     X_test.
     """
     from frameworks.CPLELearning import CPLELearningModel
     # 'sklearn.linear_model.stochastic_gradient' was a private module path
     # removed in scikit-learn 0.24; the public path below works on all
     # scikit-learn versions.
     from sklearn.linear_model import SGDClassifier
     # NOTE(review): loss='log' (logistic regression) was renamed to
     # 'log_loss' in scikit-learn 1.3 -- update if targeting recent releases.
     clf = SGDClassifier(loss='log', penalty='l1')
     ssmodel = CPLELearningModel(clf)
     # -1 marks the unlabelled (test) points for the semi-supervised fit.
     newlabels = np.concatenate((np.array(y_train), -np.ones(len(X_test))))
     ssmodel.fit(np.concatenate((X_train, X_test)), newlabels)
     return ssmodel.predict(X_test)
Example #3
0
def run3(file_path):
    """Extract audio features from each folder, train a CPLE RBF SVM, pickle it.

    The first folder in `file_path` is given label -1 (the CPLE framework's
    "unlabelled" marker); subsequent folders are labelled 0, 1, ... A
    progress line is printed after each folder is processed, and the fitted
    model is saved to "tsvm.pkl" in the working directory.
    """
    feature_rows = []
    row_labels = []
    # enumerate from -1 so the first folder acts as the unlabelled set
    for folder_label, path in enumerate(file_path, start=-1):
        frames, _b, _c = fe(path, 1, 1, 0.05, 0.05, compute_beat=False)
        for frame in frames:
            feature_rows.append(frame.tolist())
            row_labels.append(folder_label)
        print(folder_label + 1, " FOLDER FEATURE EXTRACTED")

    features = np.asarray(feature_rows)
    label = np.asarray(row_labels)

    base_svc = SVC(kernel="rbf", C=10, gamma=0.01, probability=True)
    model = CPLELearningModel(base_svc, predict_from_probabilities=True)
    model.fit(features, label)

    with open("tsvm.pkl", 'wb') as out_file:
        pickle.dump(model, out_file)
        print("MODEL SAVED")
# Script fragment: compare a purely supervised logistic regression against
# self-learning and CPLE semi-supervised variants on the same data set.
# X, ys (-1 = unlabelled), ytrue and random_labeled_points are defined
# earlier in the file (not visible in this chunk).
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
#basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss='log',
                          penalty='l1')  # scikit logistic regression
# Train only on the small labelled subset, then score against ALL true labels.
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True
                            )  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue))

# semi-supervised score, RBF SVM model
ssmodel = CPLELearningModel(sklearn.svm.SVC(kernel="rbf", probability=True),
                            predict_from_probabilities=True)  # RBF SVM
ssmodel.fit(X, ys)
print("CPLE semi-supervised RBF SVM score", ssmodel.score(X, ytrue))
    # Fragment of a loop body (the enclosing `for` over `i` is not visible in
    # this chunk). Python 2 code (print statements). For each i it trains one
    # of four models on the same data and reports timing and accuracy in a
    # 2x2 subplot grid.
    plt.subplot(2,2,i+1)
    # NOTE(review): plt.hold() was removed in matplotlib 3.0 -- this line
    # only runs on old matplotlib versions.
    plt.hold(True)
    
    t1=time.time()
    # train model
    if i == 0:
        # Baseline: supervised SVM trained on the labelled subset only.
        lbl = "Purely supervised SVM:"
        model = sklearn.svm.SVC(kernel=kernel, probability=True)
        model.fit(Xsupervised, ysupervised)
    else:
        if i==1:
            lbl =  "S3VM (Gieseke et al. 2012):"
            model = scikitTSVM.SKTSVM(kernel=kernel)
        elif i == 2:
            lbl = "CPLE(pessimistic) SVM:"
            model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True))
        elif i == 3:
            lbl = "CPLE(optimistic) SVM:"
            # Class-level flag: switches ALL subsequent CPLE instances to
            # optimistic mode, not just this one.
            CPLELearningModel.pessimistic = False
            model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True))
        # Semi-supervised models see the full data; -1 entries in ys mark
        # unlabelled points.
        model.fit(Xs, ys.astype(int))
    print ""
    print lbl
    print "Model training time: ", round(time.time()-t1, 3)

    # predict, and evaluate
    pred = model.predict(Xs)
    
    # Accuracy is measured against the full ground truth, including points
    # that were unlabelled during training.
    acc = np.mean(pred==ytrue)
    print "accuracy:", round(acc, 3)
    
    
# Script fragment (Python 2): compare supervised / self-learning / CPLE
# variants of weighted QDA. Xs, ys (-1 = unlabelled), ytrue, WQDA,
# SelfLearningModel, CPLELearningModel and evaluate_and_plot come from
# earlier in the file (not visible here).
Xsupervised = Xs[ys != -1, :]  # keep only the labelled rows
ysupervised = ys[ys != -1]

# compare models

lbl = "Purely supervised QDA:"
print lbl
model = WQDA()
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)

lbl = "SelfLearning QDA:"
print lbl
model = SelfLearningModel(WQDA())
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)

lbl = "CPLE(pessimistic) QDA:"
print lbl
model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)

lbl = "CPLE(optimistic) QDA:"
print lbl
# Class-level flag: affects every CPLELearningModel created afterwards.
CPLELearningModel.pessimistic = False
model = CPLELearningModel(WQDA(), predict_from_probabilities=True)
model.fit(Xs, ys)
# block=True presumably keeps the final plot window open -- TODO confirm
# against evaluate_and_plot's definition.
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 4, block=True)
# Script fragment (Python 3): supervised vs S3VM vs CPLE SVM comparison.
# The first two lines fit a model created above this chunk (not visible);
# the final optimistic CPLE model is constructed but the fragment is cut
# off before it is fitted.
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)


lbl = "S3VM (Gieseke et al. 2012):"
print(lbl)
model = scikitTSVM.SKTSVM(kernel=kernel)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)


lbl = "CPLE(pessimistic) SVM:"
print(lbl)
model = CPLELearningModel(
    sklearn.svm.SVC(
        kernel=kernel,
        probability=True,
        gamma='auto'),
    predict_from_probabilities=True)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)


lbl = "CPLE(optimistic) SVM:"
print(lbl)
# Class-level flag: switches all subsequently created CPLE models to
# optimistic mode.
CPLELearningModel.pessimistic = False
model = CPLELearningModel(
    sklearn.svm.SVC(
        kernel=kernel,
        probability=True,
        gamma='auto'),
    predict_from_probabilities=True)
Example #8
0
# Script fragment (Python 2: print statements, integer division labeled_N / 2).
# Loads the "heart" dataset, labels only two points, and compares supervised
# vs CPLE semi-supervised logistic regression.
# NOTE(review): 'sklearn.linear_model.stochastic_gradient' is a private path
# removed in scikit-learn 0.24, and fetch_mldata was removed in 0.22
# (mldata.org is gone; fetch_openml is the replacement) -- this snippet only
# runs on old scikit-learn versions.
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from methods.scikitWQDA import WQDA

# load data
heart = fetch_mldata("heart")
X = heart.data
ytrue = np.copy(heart.target)
ytrue[ytrue == -1] = 0  # remap {-1, 1} targets to {0, 1}

# label a few points
labeled_N = 2
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
# Pick labeled_N/2 random points from each class so both classes are seen.
random_labeled_points = random.sample(np.where(ytrue == 0)[0], labeled_N / 2) + random.sample(
    np.where(ytrue == 1)[0], labeled_N / 2
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print "supervised score", basemodel.score(X, ytrue)

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print "semi-supervised score", ssmodel.score(X, ytrue)

# Sample output from a past run:
# supervised score 0.418518518519
# semi-supervised score 0.555555555556
# Script fragment (Python 2): supervised SVM vs S3VM vs CPLE SVM comparison.
# Xs, ys (-1 = unlabelled), ytrue, Xsupervised, kernel, scikitTSVM,
# CPLELearningModel and evaluate_and_plot are defined outside this chunk.
ysupervised = ys[ys != -1]  # labels of the labelled subset only

# compare models
lbl = "Purely supervised SVM:"
print lbl
model = sklearn.svm.SVC(kernel=kernel, probability=True)
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)


lbl = "S3VM (Gieseke et al. 2012):"
print lbl
model = scikitTSVM.SKTSVM(kernel=kernel)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)


lbl = "CPLE(pessimistic) SVM:"
print lbl
model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True), predict_from_probabilities=True)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)


lbl = "CPLE(optimistic) SVM:"
print lbl
# Class-level flag: all CPLE models created after this are optimistic.
CPLELearningModel.pessimistic = False
model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True), predict_from_probabilities=True)
model.fit(Xs, ys)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 4, block=True)
Example #10
0
# Script fragment (Python 3): same four-way SVM comparison, with labels
# cast to int before the semi-supervised fits. sidx, Xs, ys, ytrue, kernel
# and the model classes are defined outside this chunk.
ys[sidx] = ytrue[sidx]  # reveal the true labels at the sampled indices

Xsupervised = Xs[ys!=-1, :]  # labelled rows only
ysupervised = ys[ys!=-1]

# compare models
lbl = "Purely supervised SVM:"
print(lbl)
model = sklearn.svm.SVC(kernel=kernel, probability=True)
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)

lbl =  "S3VM (Gieseke et al. 2012):"
print(lbl)
model = scikitTSVM.SKTSVM(kernel=kernel)
model.fit(Xs, ys.astype(int))
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)

lbl = "CPLE(pessimistic) SVM:"
print(lbl)
model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True), predict_from_probabilities=True)
model.fit(Xs, ys.astype(int))
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)

lbl = "CPLE(optimistic) SVM:"
print(lbl)
# Class-level flag: all CPLE models created after this are optimistic.
CPLELearningModel.pessimistic = False
model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True), predict_from_probabilities=True)
model.fit(Xs, ys.astype(int))
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 4, block=True)
Example #11
0
def main(centroid_name, cluster_name, feature_root, centroid_KPI_label_count):
    """PU-learning + CPLE anomaly-detection pipeline (Python 2 code).

    Builds PU labels for the centroid KPI's feature file, then for every
    other KPI file in `feature_root` trains a CPLE model (centroid data
    labelled, target data unlabelled), evaluates with a delay-aware F-score,
    and writes a per-KPI summary CSV.

    Relies on module-level names not visible in this chunk:
    label_positive, PULearningModel, CPLELearningModel, delay_eva,
    save_proba, cal_fscore, config, target_test_ratio, target_train_ratio,
    model_root, result_root.
    """
    # 2x2 confusion-matrix accumulators across all target KPIs.
    matrix = np.array([[0, 0], [0, 0]])
    summary_cluster = []
    summary_curve = []
    df_source = pd.read_csv(os.path.join(feature_root, centroid_name))
    # Feature columns follow the "F#..." naming convention.
    feature_name = [i for i in df_source.columns if i.startswith("F#")]
    cluster_matrix = np.array([[0, 0], [0, 0]])
    # Keep a copy of the ground-truth labels before PU relabelling.
    real_label = df_source['label'].values.copy()
    df_source, centroid_KPI_real_label_count = label_positive(
        df_source, centroid_KPI_label_count, 'random')
    centroid_train = df_source.copy()
    '''Start PU learning'''
    PU_model = PULearningModel(centroid_train[feature_name].values,
                               centroid_train['label'].values,
                               len(centroid_train))
    print Counter(centroid_train['label'].values)
    PU_model.pre_training(0.2)
    print Counter(real_label)
    RF_model = RandomForestClassifier(n_estimators=100)
    # Expand the positive set with samples the random forest is confident
    # about; real_label is presumably used only for progress reporting --
    # TODO confirm against PULearningModel.
    PU_labels, positive_label_count = PU_model.add_reliable_samples_using_RandomForest(
        0.015, 200, 0.7, real_label, RF_model)
    train_data = centroid_train[feature_name].values
    centroid_train['label'] = PU_labels
    print 'Finish PU learning for centroid:', Counter(
        centroid_train['label'].values)
    '''Finish PU learning'''

    # One CPLE model per non-centroid KPI file in the cluster.
    for name_suffix in os.listdir(feature_root):
        if name_suffix == centroid_name:
            continue
        print('*' * 30)
        print(name_suffix)
        print('*' * 30)
        df_target = pd.read_csv(os.path.join(feature_root, name_suffix))

        # Tail of the target series is held out for testing; head is used
        # (unlabelled) for training.
        target_test_length = int(target_test_ratio * len(df_target))
        test = df_target[-target_test_length:].copy()
        target_train_length = int(target_train_ratio * len(df_target))
        target_train = df_target[:target_train_length].copy()
        target_train_with_label = target_train.copy()
        target_train['label'] = -1  # -1 = unlabelled for CPLE

        train = pd.concat([centroid_train, target_train]).copy()
        print Counter(train['label'].values)
        model = CPLELearningModel(basemodel=RandomForestClassifier(
            config.RF_n_trees, n_jobs=15),
                                  max_iter=50,
                                  predict_from_probabilities=True,
                                  real_label=None)
        train_data = train[feature_name].values
        train_label = train['label'].values
        print 'start training CPLE model:', Counter(train_label)
        model.fit(train_data, train_label)
        print("finish train")
        # exit()
        name = name_suffix + '_PU'
        # Persist the model and immediately reload it as a sanity check.
        joblib.dump(model, model_root + '/' + name + ".sav")
        model1 = joblib.load(model_root + '/' + name + '.sav')
        print("model is :", model1)

        # Anomaly score = probability of the positive class.
        proba = model.predict_proba(test[feature_name])
        proba = proba[:, 1]

        eva = delay_eva(test["label"].values, proba)
        print(proba)
        _, best_threshold = eva.best_fscore_threshold()
        threshold = best_threshold
        print "threshold is", threshold

        predict_ans = eva.predict_for_threshold(threshold)
        save_proba(model, test, name + "_test" + ".csv", predict_ans)

        fscore = eva.fscore_for_threshold(threshold)
        # Delay converted from sample intervals to minutes.
        average_detection_delay = eva.average_detection_delay(
            threshold) * config.interval / 60
        print("PUAD fscore of test is %f", fscore)
        print("PUAD average_detection_delay is %f", average_detection_delay)
        temp_matrix = eva.confusion_matrix_for_threshold(threshold)
        matrix = matrix + temp_matrix
        cluster_matrix = cluster_matrix + temp_matrix

        _, pre, rec = cal_fscore(temp_matrix)
        TP = temp_matrix[1][1]
        FP = temp_matrix[0][1]
        FN = temp_matrix[1][0]
        print 'TP:', TP
        print 'FP:', FP
        print 'FN:', FN
        # Per-KPI summary row for the result CSV.
        temp = OrderedDict([("name", name), ("medios", 0),
                            ("label", test["label"].values.sum()),
                            ("PU_fscore", fscore),
                            ("delay", average_detection_delay), ("pre", pre),
                            ("rec", rec), ("TP", TP), ("FP", FP), ("FN", FN),
                            ("centroid_KPI_label_count",
                             centroid_KPI_label_count),
                            ("threshold", threshold)])
        summary_curve.append(temp)

        # the per-target-KPI loop above ends here

    df_curveresults = pd.DataFrame(summary_curve)
    df_curveresults.to_csv(os.path.join(
        result_root, 'PUAD_%s_%d_%f_result.csv' %
        (cluster_name, centroid_KPI_label_count, 0.015)),
                           index=False)
# Script fragment (Python 2): sample an equal number of supervised points per
# class, then run the four-way SVM comparison.
# NOTE(review): 'supevised_data_points' is misspelled but must match its
# definition earlier in the file (not visible here) -- renaming would break it.
sidx = random.sample(np.where(ytrue == 0)[0], supevised_data_points/2)+random.sample(np.where(ytrue == 1)[0], supevised_data_points/2)
ys[sidx] = ytrue[sidx]  # reveal true labels at the sampled indices

Xsupervised = Xs[ys!=-1, :]  # labelled rows only
ysupervised = ys[ys!=-1]
    
# compare models     
lbl = "Purely supervised SVM:"
print lbl
model = sklearn.svm.SVC(kernel=kernel, probability=True)
model.fit(Xsupervised, ysupervised)
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 1)

lbl =  "S3VM (Gieseke et al. 2012):"
print lbl
model = scikitTSVM.SKTSVM(kernel=kernel)
model.fit(Xs, ys.astype(int))
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 2)

lbl = "CPLE(pessimistic) SVM:"
print lbl
model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True), predict_from_probabilities=True)
model.fit(Xs, ys.astype(int))
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 3)

lbl = "CPLE(optimistic) SVM:"
print lbl
# Class-level flag: all CPLE models created after this are optimistic.
CPLELearningModel.pessimistic = False
model = CPLELearningModel(sklearn.svm.SVC(kernel=kernel, probability=True), predict_from_probabilities=True)
model.fit(Xs, ys.astype(int))
evaluate_and_plot(model, Xs, ys, ytrue, lbl, 4, block=True)
Example #13
0
# Script fragment (Python 3): supervised vs self-learning vs CPLE variants of
# logistic regression, WQDA and an RBF SVM. X, ys (-1 = unlabelled), ytrue
# and random_labeled_points are defined earlier in the file (not visible).
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
# scikit logistic regression
basemodel = SGDClassifier(loss='log', penalty='l1')
# Train on the labelled subset only; score against the full ground truth.
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
# weighted Quadratic Discriminant Analysis
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)
ssmodel.fit(X, ys)
print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue))

# semi-supervised score, RBF SVM model
ssmodel = CPLELearningModel(sklearn.svm.SVC(kernel="rbf", probability=True),
                            predict_from_probabilities=True)  # RBF SVM
ssmodel.fit(X, ys)
print("CPLE semi-supervised RBF SVM score", ssmodel.score(X, ytrue))
    # Fragment of a larger function/loop body (Python 2; the enclosing scope
    # and the definitions of X_labelled, X_unlabelled, y_labelled, y_minusone,
    # X_extra, y_extra, i, sd, error_rate_svm and logLik_svm are not visible
    # in this chunk). Builds a combined train set, fits a CPLE SVM, then
    # starts fitting a label-propagation KNN baseline on the same data.
    train_data = np.concatenate((X_labelled, X_unlabelled),axis =0)
    len_train = len(train_data)
    print 'No. of training data:', len_train
    # Final training labels: y_minusone presumably holds -1 markers for the
    # unlabelled block -- confirm against the caller.
    train_labels = np.concatenate((y_labelled, y_minusone), axis = 0)
    len_labels = len(train_labels)
    print 'No. of training labels:', len_labels
    ## Print the number of test data
    print 'No. of test data:', len(y_extra)
    ################################################################################


    ################################################################################
    lbl = "CPLE(pessimistic) SVM:"
    print lbl
    model = CPLELearningModel(svm.SVC(kernel="rbf", probability=True), predict_from_probabilities=True, max_iter = 5000 )
    model.fit(train_data, train_labels)
    y_predict = model.predict(X_extra)

    # Accuracy / error rate on the held-out set; i indexes the outer
    # experiment loop.
    accuracy = accuracy_score(y_extra, y_predict)
    print accuracy
    error_rate_svm[i] = 1 - accuracy
    # Negative log-likelihood under a Gaussian centred on the predictions.
    logLik_svm[i] = -np.sum( stats.norm.logpdf(y_extra, loc=y_predict, scale=sd) )
    print 'CPLE Error Rate:', error_rate_svm[i], logLik_svm[i]
    ###############################################################################
    ################################################################################
    # Create the semi-supervised KNN classifier
    lbl = "Label Propagation(KNN):"
    print lbl
    knn_model = label_propagation.LabelSpreading(kernel='knn', alpha=0.0001, max_iter=3000)
    knn_model.fit(train_data, train_labels)