Example no. 1
# imports this snippet needs (the later examples assume similar context)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.semi_supervised import LabelSpreading

def test_LabelSpreading_knn(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))  # color palette: a distinct color per curve
    # train and plot the curves
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelSpreading(max_iter=100, n_neighbors=K, alpha=alpha, kernel='knn')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # configure the plot
    ax.set_xlabel(r"k")
    ax.set_ylabel("score")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading knn kernel")
    plt.show()
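These test_LabelSpreading_* helpers all unpack the same (x, y, unlabeled_indices) tuple. As a minimal sketch of how that tuple might be prepared (the iris dataset and the 30% unlabeled split below are illustrative assumptions, not part of the original snippets):

import numpy as np
from sklearn import datasets

def load_data(unlabeled_fraction=0.3, seed=0):
    # hypothetical helper: load a toy dataset and pick a random
    # subset of indices whose labels will later be hidden with -1
    iris = datasets.load_iris()
    x, y = iris.data, iris.target
    rng = np.random.RandomState(seed)
    n_unlabeled = int(unlabeled_fraction * len(y))
    unlabeled_indices = rng.choice(len(y), size=n_unlabeled, replace=False)
    return x, y, unlabeled_indices

# test_LabelSpreading_knn(*load_data())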
Example no. 2
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy/10
    avg_f1 = avg_f1/10
    str_list.extend(["KNN BOW runtime voc Avg f1: " + str(avg_f1),
                     "KNN BOW runtime voc Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Example no. 3
def test_LabelSpreading_rbf(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))  # color palette: a distinct color per curve
    # train and plot the curves
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelSpreading(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # configure the plot
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading rbf kernel")
    plt.show()
Example no. 4
def test_LabelSpreading(*data):
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    clf = LabelSpreading(max_iter=1000, kernel='knn', gamma=0.1)  # gamma is ignored by the 'knn' kernel
    clf.fit(X, y_train)
    true_labels = y[unlabeled_indices]
    predicted_labels = clf.transduction_[unlabeled_indices]
    print('Accuracy (score): %f' % clf.score(X[unlabeled_indices], true_labels))
    print('Accuracy (transduction): %f' % metrics.accuracy_score(true_labels, predicted_labels))
Example no. 5
def test_LabelSpreading_alpha_gamma(*data):
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    alphas = np.logspace(-2, -1, num=10)
    gammas = np.logspace(-2, 2, num=10)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    for alpha in alphas:
        scores = []
        for gamma in gammas:
            # gamma only matters for the 'rbf' kernel ('knn' ignores it)
            clf = LabelSpreading(max_iter=1000, kernel='rbf', gamma=gamma, alpha=alpha)
            clf.fit(X, y_train)
            true_labels = y[unlabeled_indices]
            scores.append(clf.score(X[unlabeled_indices], true_labels))
        # each curve varies over gamma, so plot scores against gammas, not alphas
        ax.plot(gammas, scores, label='alpha = %f' % alpha)
    ax.set_xlabel(r'$\gamma$')
    ax.set_xscale('log')
    ax.legend()
    plt.show()
Example no. 6
def run_lp_tfidf(nbr, str_list, neighbors):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.split_train_true(nbr)
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy/10
    avg_f1 = avg_f1/10
    str_list.extend(["KNN TF-IDF Avg f1: " + str(avg_f1), "KNN TF-IDF Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Example no. 7
def label_spreading(x_train, y_train, x_test, y_test):
    from sklearn.semi_supervised import LabelSpreading
    sel = LabelSpreading()
    sel.fit(x_train, y_train)
    value = sel.score(x_test, y_test)
    return "{0:.2f}".format(value)
Example no. 8

idxs = np.random.choice(X_train.shape[0], replace=False, size=n_unlabeled)

y = np.asarray(Y_train)
y[idxs] = -1  # mark the sampled rows as unlabeled

Y_train = y


# Train model and print statistics (use 'knn' as kernel)

from sklearn.semi_supervised import LabelSpreading

model = LabelSpreading(kernel = 'knn', n_neighbors = 10, max_iter=1000).fit(X_train, Y_train)

print("Percentage of correct predictions = {}".format(round(100*model.score(X_test, Y_test),2)))
pred = model.predict(X_test) == Y_test
print("Correct: {}".format(np.count_nonzero(pred)), "/",
      "Incorrect: {}".format(np.count_nonzero(~pred)))

Z1 = model.predict(X_test).reshape(Y_test.size,1)
Z2 = np.asarray(Y_test).reshape(Y_test.size,1)
Z3 = np.around(model.predict_proba(X_test),decimals=2)
data = np.concatenate((Z1,Z2,Z3),axis=1)
outcome = pd.DataFrame(data, columns = ["Predicted Label", 
                                        "Actual Label", 
                                        "Prob. Label = 0.0", 
                                        "Prob. Label = 1.0"])
indicesToKeep = outcome["Predicted Label"] != outcome["Actual Label"]

print("False predictions with associated class probabilities:\n{}".format(outcome[indicesToKeep]))
Example no. 9

hist, bins = np.histogram(
    labels,
    bins=[-0.1, 0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.1])
print(hist)
print(bins)

print(train_labeled.shape)
print(train_labeled[:, 0])

train_unlabeled = sklearn.preprocessing.scale(train_unlabeled)
features = sklearn.preprocessing.scale(features)

lp = LabelSpreading(kernel='knn',
                    gamma=20,
                    n_neighbors=7,
                    alpha=0.2,
                    max_iter=50,
                    tol=0.01,
                    n_jobs=-1)

y = labels
# append 21000 unlabeled markers (-1) in one step rather than in a loop
y = np.concatenate((y, np.full(21000, -1)), axis=0)

all_data = np.concatenate((features, train_unlabeled), axis=0)

lp.fit(all_data, y)
Yresult = lp.predict(all_data)
# scoring the model against its own predictions is trivially 1.0;
# score against the known labels on the labeled portion instead
print(lp.score(all_data[:len(labels)], labels))

np.savetxt('semiLabelsOfUnlabeled2.csv', Yresult, delimiter=",")
Example no. 10
                                              gamma=p_gamma,
                                              n_neighbors=p_neighbors,
                                              alpha=p_alpha)
        elif (p_ss_mod == 'LabSpr' and p_ss_kern == 'rbf'):
            label_prop_model = LabelPropagation(kernel=p_ss_kern,
                                                gamma=p_gamma,
                                                n_neighbors=p_neighbors,
                                                alpha=p_alpha,
                                                max_iter=70)
        else:
            label_prop_model = dic_ss_mod[p_ss_mod](kernel=p_ss_kern,
                                                    gamma=p_gamma,
                                                    n_neighbors=p_neighbors)
        print('Start to fit. Run for shelter!')
        label_prop_model.fit(X_tot, y_tot)
        temp_acc = label_prop_model.score(X_valid_lab, y_valid)
        print('{} / {} :accuracy = {}'.format(i, p_manyfit, temp_acc))
        RESULT_ACC_SS += temp_acc
    y_tot = label_prop_model.transduction_
    y_submit = label_prop_model.predict(X_submit)
    save_to_csv(X_tot, y_tot, X_valid_lab, y_valid)
    RESULT_ACC_SS /= p_manyfit
    json_dict['ss_accuracy'] = RESULT_ACC_SS
    print('accuracy obtained on the test set of the ss algo:', RESULT_ACC_SS)
else:
    init_variables()
    #PCA preprocessing
    if (PCA_MODE):
        pca_preprocess()
    X_tot, y_tot, X_valid, y_valid = load_xy()
Example no. 11
def LabelData(LabelX,
              unLabelX,
              Labely,
              unLabely,
              testX,
              testy,
              batch_id=0,
              save_model=False):
    LabelXLen = LabelX.shape[0]

    print("LabeledCellNames", LabelX)

    X = pd.concat([LabelX, unLabelX], axis=0, join='inner')
    y = np.append(Labely, unLabely)

    Features = X.columns.values.tolist()
    testX = testX.loc[:, Features]

    # Knn LabelSpreading
    label_spread = LabelSpreading(kernel='knn', alpha=0.8, max_iter=5)

    print("==== X ====")
    print(X)

    print("==== Y ====")
    print(y)

    label_spread.fit(X, y)
    output_labels = label_spread.transduction_
    score = label_spread.score(testX, testy)

    output_labels = le.inverse_transform(output_labels)
    CellNames = X.index.values.tolist()
    CellResult = {
        "CellName": CellNames[LabelXLen:],  # unlabeled rows start at index LabelXLen, not LabelXLen + 1
        "CellType": output_labels[LabelXLen:]
    }

    # Result = pd.DataFrame(data=CellResult)
    # Result.to_csv("./result/%d.csv"%(batch_id), columns=['CellName','CellType'], index=False)

    # accuracy
    print("score : ", score)

    PredictY = label_spread.predict(testX)
    PredictYLabels = le.inverse_transform(PredictY)
    TrueYLabels = le.inverse_transform(testy)
    PredictResult = {"trueLabel": TrueYLabels, "predictLabel": PredictYLabels}

    # PredictResult = pd.DataFrame(data=PredictResult)
    # PredictResult.to_csv("./result/predict_result_%d.csv"%(batch_id), columns=['trueLabel','predictLabel'], index=False)

    # Label Distribution
    print("======= label_spread.label_distributions_ =======")
    print(label_spread.label_distributions_)

    LabelXIndexs = LabelX.index
    indexs = X.index
    ClassLabels = le.inverse_transform(label_spread.classes_)
    print(ClassLabels)

    LabelDistribution = pd.DataFrame(data=label_spread.label_distributions_,
                                     index=indexs,
                                     columns=ClassLabels)
    LabelDistribution = LabelDistribution.drop(index=LabelXIndexs)

    # LabelDistribution.to_csv("./result/test/LabelDistribution.csv")

    if save_model:
        with open('./result/%s/clf.pickle' % (FileName), 'wb') as f:
            pickle.dump(label_spread, f)

    return CellResult, LabelDistribution
Example no. 12
# classification
# use max_iter=10 when 20 categories
clf_rbf = LabelPropagation(kernel='rbf', gamma=5).fit(vectors_rbf.todense(), dataset_rbf.train['target'])
clf_knn = LabelSpreading(kernel='knn', n_neighbors=10).fit(vectors_knn.todense(), dataset_knn.train['target'])
test_vec_rbf = vectorizer_rbf.transform(dataset_rbf.test['data'])
test_vec_knn = vectorizer_knn.transform(dataset_knn.test['data'])

print('----PREDICTIONS----')
pred_rbf = clf_rbf.predict(test_vec_rbf.todense())
pred_knn = clf_knn.predict(test_vec_knn.todense())

print('f1 score rbf: ', metrics.f1_score(dataset_rbf.test['target'], pred_rbf, average='macro'))
print('clf score rbf: ', clf_rbf.score(test_vec_rbf.todense(), dataset_rbf.test['target']))
print('f1 score knn: ', metrics.f1_score(dataset_knn.test['target'], pred_knn, average='macro'))
print('clf score knn: ', clf_knn.score(test_vec_knn.todense(), dataset_knn.test['target']))

np.set_printoptions(precision=2)

""""
# Plot non-normalized confusion matrix
plot_confusion_matrix(dataset_rbf.test['target'], pred_rbf, classes=categories,
                      title='Confusion matrix (RBF), without normalization')

# Plot normalized confusion matrix
plot_confusion_matrix(dataset_rbf.test['target'], pred_rbf, classes=categories, normalize=True,
                      title='Normalized confusion matrix (RBF)')
plt.show()
"""

Example no. 13
    #print(gridsearch.best_params_)

    print('got Vectors')
    model = LabelSpreading(kernel='rbf')
    params = {'gamma': [0.1, 1.0, 10.0, 30.0, 50.0, 80.0, 100.0, 300.0], 'max_iter': [10, 100, 1000], 'alpha': [0.2, 0.4, 0.6, 0.8]}

    scoreDict = {}

    for max_iter in params['max_iter']:
        model.max_iter = max_iter
        for alpha in params['alpha']:
            model.alpha = alpha
            for gamma in params['gamma']:
                model.gamma = gamma
                model.fit(list(data), list(unlab))
                score = model.score(list(testData), list(testLabels))
                print(score, ' gamma = ', gamma, ' max_iter = ', max_iter, ' alpha = ', alpha)
                scoreDict.setdefault(score, []).append(
                    'gamma = ' + str(gamma) + ' max_iter = ' + str(max_iter) + ' alpha = ' + str(alpha))

    knnModel = LabelSpreading(kernel='knn')
    knnParams = {'n_neighbors': [1, 4, 9, 16], 'max_iter': [10, 100, 1000], 'alpha': [0.2, 0.4, 0.6, 0.8]}
    for max_iter in knnParams['max_iter']:
        knnModel.max_iter = max_iter  # was mistakenly set on `model`; this loop tunes knnModel
        for alpha in knnParams['alpha']:
            knnModel.alpha = alpha
            for n_neighbors in knnParams['n_neighbors']:
Example no. 14
def run_methods(x_c, y, x_e, z_c, z_y, z_e):
    x = np.concatenate((x_c, x_e), axis=1)
    z = np.concatenate((z_c, z_e), axis=1)

    # Baseline: Linear Logistic Regression
    lin_lr = LogisticRegression(random_state=0,
                                solver='liblinear').fit(x, y.ravel())
    acc_lin_lr = lin_lr.score(z, z_y)
    # hard_label_lin_lr = lin_lr.predict(z)
    # soft_label_lin_lr = lin_lr.predict_proba(z)[:, 1]

    # TRANSDUCTIVE APPROACHES
    # merge labelled and unlabelled data (with label -1) for transductive methods
    x_merged = np.concatenate((x, z))
    y_merged = np.concatenate((y, -1 * np.ones(
        (z.shape[0], 1)))).ravel().astype(int)

    # Baseline: Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    lin_tsvm = SKTSVM(kernel='linear')
    lin_tsvm.fit(x_merged, y_merged)
    acc_lin_tsvm = lin_tsvm.score(z, z_y)
    # hard_label_lin_tsvm = lin_tsvm.predict(z)
    # soft_label_lin_tsvm = lin_tsvm.predict_proba(z)[:, 1]

    # Baseline: Non-Linear TSVM:  https://github.com/tmadl/semisup-learn/tree/master/methods
    rbf_tsvm = SKTSVM(kernel='RBF')
    rbf_tsvm.fit(x_merged, y_merged)
    acc_rbf_tsvm = rbf_tsvm.score(z, z_y)
    # hard_label_rbf_tsvm = rbf_tsvm.predict(z)
    # soft_label_rbf_tsvm = rbf_tsvm.predict_proba(z)[:, 1]

    # Baseline: Label Propagation RBF weights
    try:
        rbf_label_prop = LabelPropagation(kernel='rbf')
        rbf_label_prop.fit(x_merged, y_merged)
        acc_rbf_label_prop = rbf_label_prop.score(z, z_y)
        # hard_label_rbf_label_prop= rbf_label_prop.predict(z)
        # soft_label_rbf_label_prop = rbf_label_prop.predict_proba(z)[:, 1]
    except Exception:
        acc_rbf_label_prop = []
        print('rbf label prop did not work')

    # Baseline: Label Spreading with RBF weights
    try:
        rbf_label_spread = LabelSpreading(kernel='rbf')
        rbf_label_spread.fit(x_merged, y_merged)
        acc_rbf_label_spread = rbf_label_spread.score(z, z_y)
        # hard_label_rbf_label_spread = rbf_label_spread.predict(z)
        # soft_label_rbf_label_spread = rbf_label_spread.predict_proba(z)[:, 1]
    except Exception:
        acc_rbf_label_spread = []
        print('rbf label spread did not work')

    # THE K-NN VERSIONS ARE UNSTABLE UNLESS USING LARGE K
    # Baseline: Label Propagation with k-NN weights
    try:
        knn_label_prop = LabelPropagation(kernel='knn', n_neighbors=11)
        knn_label_prop.fit(x_merged, y_merged)
        acc_knn_label_prop = knn_label_prop.score(z, z_y)
        # hard_label_knn_label_prop = knn_label_prop.predict(z)
        # soft_label_knn_label_prop = knn_label_prop.predict_proba(z)[:, 1]
    except Exception:
        acc_knn_label_prop = []
        print('knn label prop did not work')

    # Baseline: Label Spreading with k-NN weights
    try:
        knn_label_spread = LabelSpreading(kernel='knn', n_neighbors=11)
        knn_label_spread.fit(x_merged, y_merged)
        acc_knn_label_spread = knn_label_spread.score(z, z_y)
        # hard_label_knn_label_spread = knn_label_spread.predict(z)
        # soft_label_knn_label_spread = knn_label_spread.predict_proba(z)[:, 1]
    except Exception:
        acc_knn_label_spread = []
        print('knn label spread did not work')

    # Generative Models
    # Semi-generative model on labelled data only
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e, converged=True)
    soft_label_semigen = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_semigen = soft_label_semigen > 0.5
    acc_semigen_labelled = np.mean(hard_label_semigen == z_y)

    # EM with soft labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_soft_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_soft_EM = soft_label_soft_EM > 0.5
    acc_soft_EM = np.mean(hard_label_soft_EM == z_y)

    # EM with hard labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = hard_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_hard_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_hard_EM = soft_label_hard_EM > 0.5
    acc_hard_EM = np.mean(hard_label_hard_EM == z_y)

    # Conditional label prop
    acc_cond_prop = conditional_prop(x_c, y, x_e, z_c, z_y, z_e)

    return acc_lin_lr, acc_lin_tsvm, acc_rbf_tsvm, acc_rbf_label_prop, acc_rbf_label_spread, acc_knn_label_prop,\
           acc_knn_label_spread, acc_semigen_labelled, acc_soft_EM, acc_hard_EM, acc_cond_prop
Example no. 15

# Test Label Spreading by cross validation
skf = StratifiedKFold(n_splits=5)
score0 = []
score0_holdout = []
score1 = []
score1_holdout = []
for i_train, i_test in skf.split(X_train2_, Y_train2_.argmax(axis=1)):
    X_train3, y_train3 = X_train2_[i_train], Y_train2_[i_train].argmax(axis=1)
    X_holdout3, y_holdout3 = X_train2_[i_test], Y_train2_[i_test].argmax(
        axis=1)
    n_holdout3 = len(y_holdout3)
    ls0 = LabelSpreading(kernel='rbf', gamma=2, n_neighbors=4)
    ls0.fit(X_train3, y_train3)
    score0.append(ls0.score(X_holdout3, y_holdout3))
    score0_holdout.append(ls0.score(X_holdout2, Y_holdout2.argmax(axis=1)))
    print('   Supervised score: {:.4f} (holdout {:.4f})'.format(
        score0[-1], score0_holdout[-1]))
    ls1 = LabelSpreading(kernel='rbf', gamma=2, n_neighbors=4)
    ls1.fit(np.vstack((X_train3, X_holdout3)),
            np.concatenate((y_train3, np.full(n_holdout3, -1))))
    score1.append(ls1.score(X_holdout3, y_holdout3))
    score1_holdout.append(ls1.score(X_holdout2, Y_holdout2.argmax(axis=1)))
    print('   Semi-Supervised score: {:.4f} (holdout {:.4f})'.format(
        score1[-1], score1_holdout[-1]))
print('Mean supervised: {:.4f} (holdout {:.4f})'.format(
    np.mean(score0), np.mean(score0_holdout)))
print('Mean semi-supervised: {:.4f} (holdout {:.4f})'.format(
    np.mean(score1), np.mean(score1_holdout)))
Example no. 16
    def label_spreading(self,
                        kernel='rbf',
                        gamma=20,
                        n_neighbors=7,
                        alpha=0.2,
                        max_iter=30,
                        tol=0.001,
                        n_jobs=1):
        """
            LabelSpreading model for semi-supervised learning
            This model is similar to the basic Label Propagation algorithm,
            but uses affinity matrix based on the normalized graph Laplacian
            and soft clamping across the labels.

            Parameters
            ----------
            kernel : {'knn', 'rbf', callable}
                String identifier for kernel function to use or the kernel function
                itself. Only 'rbf' and 'knn' strings are valid inputs. The function
                passed should take two inputs, each of shape [n_samples, n_features],
                and return a [n_samples, n_samples] shaped weight matrix

            gamma : float
              parameter for rbf kernel

            n_neighbors : integer > 0
              parameter for knn kernel

            alpha : float
              Clamping factor. A value in [0, 1] that specifies the relative amount
              that an instance should adopt the information from its neighbors as
              opposed to its initial label.
              alpha=0 means keeping the initial label information; alpha=1 means
              replacing all initial information.

            max_iter : integer
              maximum number of iterations allowed

            tol : float
              Convergence tolerance: threshold to consider the system at steady
              state

            n_jobs : int or None, optional (default=None)
                The number of parallel jobs to run.
                ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
                ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
                for more details.

            Returns
            -------
            score : float
                The score of the learned model on the test data.

            Example
            --------
            >>> labeled_path = "../data/labeled.csv"
            >>> unlabeled_path = "../data/unlabeled.csv"
            >>> mtl = MultiTaskLearner(labeled_path, unlabeled_path)
            >>> encoding = mtl.embed(word_length=5)
            >>> X, y, X_t, y_t = train_test_split(mtl.sequences, mtl.labels, test_size=0.33)
            >>> score = mtl.semi_supervised_learner(X, y, X_t, y_t, ssl="label_spreading")
        """
        model = LabelSpreading(kernel=kernel,
                               gamma=gamma,
                               n_neighbors=n_neighbors,
                               alpha=alpha,
                               max_iter=max_iter,
                               tol=tol,
                               n_jobs=n_jobs)
        model.fit(self.X, self.y)
        return model.score(self.X_t, self.y_t)
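To make the soft-clamping role of alpha described in the docstring concrete, here is a small standalone sketch (the toy blobs and the 50-sample labeled subset are assumptions): low alpha keeps the initial labels almost fixed, while high alpha lets the propagated information overwrite them.

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.semi_supervised import LabelSpreading

X, y = make_blobs(n_samples=200, centers=2, random_state=0)
y_train = np.copy(y)
y_train[50:] = -1  # keep only the first 50 labels
for alpha in (0.1, 0.5, 0.9):
    model = LabelSpreading(kernel='rbf', gamma=20, alpha=alpha).fit(X, y_train)
    acc = (model.transduction_[50:] == y[50:]).mean()
    print('alpha=%.1f: transductive accuracy %.3f' % (alpha, acc))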
Example no. 17

pred = model4.predict(X_test) == Y_test

statistics.loc[3] = ["SS Naive Bayes",
                     round(100*model4.score(X_test, Y_test), 2),
                     np.count_nonzero(pred),
                     np.count_nonzero(~pred),
                     round(100*idxs.size/Y_train.size, 2),
                     training_time4]

# Train semi-supervised LabelSpreading model, predict (use 'knn' as kernel) and collect statistics

from sklearn.semi_supervised import LabelSpreading

start_time5 = time.time_ns()
model5 = LabelSpreading(kernel = 'knn', n_neighbors = 10, max_iter=1000).fit(X_train, Y_train)
training_time5 = time.time_ns() - start_time5
pred = model5.predict(X_test) == Y_test

statistics.loc[4] = ["SS Label Spreading",
                     round(100*model5.score(X_test, Y_test), 2),
                     np.count_nonzero(pred),
                     np.count_nonzero(~pred),
                     round(100*idxs.size/Y_train.size, 2),
                     training_time5]

# Print summary statistics

print(statistics)


Example no. 18
                         max_iter=1000).fit(vectors.todense(),
                                            dataset.train['target'])
clf_knn = LabelSpreading(kernel='knn', n_neighbors=5,
                         max_iter=1000).fit(vectors_knn.todense(),
                                            dataset_knn.train['target'])
test_vec_rbf = vectorizer_rbf.transform(dataset.test['data'])
test_vec_knn = vectorizer_knn.transform(dataset_knn.test['data'])

print('----PREDICTIONS----')
pred_rbf = clf_rbf.predict(test_vec_rbf.todense())
pred_knn = clf_knn.predict(test_vec_knn.todense())

print('f1 score rbf: ',
      metrics.f1_score(dataset.test['target'], pred_rbf, average='macro'))
print('clf score rbf: ',
      clf_rbf.score(test_vec_rbf.todense(), dataset.test['target']))
print('f1 score knn: ',
      metrics.f1_score(dataset_knn.test['target'], pred_knn, average='macro'))
print('clf score knn: ',
      clf_knn.score(test_vec_knn.todense(), dataset_knn.test['target']))

np.set_printoptions(precision=2)
"""
# Plot non-normalized confusion matrix
plot_confusion_matrix(dataset.test['target'], pred_rbf, classes=categories,
                      title='Confusion matrix (RBF with vocabulary), without normalization')

# Plot normalized confusion matrix
plot_confusion_matrix(dataset.test['target'], pred_rbf, classes=categories, normalize=True,
                      title='Normalized confusion matrix (RBF with vocabulary)')
plt.show()
"""
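The plot_confusion_matrix helper referenced in the commented-out blocks is not shown anywhere in these examples. A sketch of the same two plots using scikit-learn's built-in ConfusionMatrixDisplay (available since scikit-learn 1.0; treating `categories` as the display labels is an assumption):

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# non-normalized confusion matrix
ConfusionMatrixDisplay.from_predictions(
    dataset.test['target'], pred_rbf, display_labels=categories)
plt.title('Confusion matrix (RBF with vocabulary), without normalization')

# row-normalized confusion matrix
ConfusionMatrixDisplay.from_predictions(
    dataset.test['target'], pred_rbf, display_labels=categories,
    normalize='true')
plt.title('Normalized confusion matrix (RBF with vocabulary)')
plt.show()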