def test_LabelSpreading_knn(*data):
    """Plot LabelSpreading accuracy (knn kernel) versus k, one curve per alpha.

    ``data`` unpacks to (x, y, unlabeled_indices); the accuracy is measured
    on the samples whose labels were hidden before fitting.
    """
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1  # -1 marks samples as unlabeled

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    # One distinct RGB color per alpha curve.
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))

    # Fit one model per (alpha, K) pair and plot score against K.
    for alpha, color in zip(alphas, colors):
        scores = [
            LabelSpreading(max_iter=100, n_neighbors=K, alpha=alpha, kernel='knn')
            .fit(x, y_train)
            .score(x[unlabeled_indices], y[unlabeled_indices])
            for K in Ks
        ]
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)

    ax.set_xlabel(r"k")
    ax.set_ylabel("score")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading knn kernel")
    plt.show()
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    """Average LabelSpreading (knn kernel) metrics over 10 bag-of-words runs.

    Each run rebuilds the dataset with ``nbr`` true-labeled training docs,
    vectorizes with the runtime vocabulary, fits LabelSpreading, and
    accumulates macro-f1 and accuracy on the test split.

    Parameters
    ----------
    nbr : number of labeled training documents per run
    str_list : list, mutated in place — summary strings are appended
    neighbors : n_neighbors for the knn kernel
    """
    n_runs = 10
    avg_f1 = 0.0
    avg_accuracy = 0.0
    for _ in range(n_runs):  # counter loop instead of manual while/i += 1
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])
        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(
            vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_accuracy /= n_runs
    avg_f1 /= n_runs
    # BUGFIX: the accuracy label previously read "vod" instead of "voc".
    str_list.extend(["KNN BOW runtime voc Avg f1: " + str(avg_f1),
                     "KNN BOW runtime voc Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
def test_LabelSpreading_rbf(*data):
    """Plot LabelSpreading accuracy (rbf kernel) versus gamma, one curve per alpha.

    ``data`` unpacks to (x, y, unlabeled_indices); the accuracy is measured
    on the samples whose labels were hidden before fitting.
    """
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1  # -1 marks samples as unlabeled

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    # One distinct RGB color per alpha curve.
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))

    # Fit one model per (alpha, gamma) pair and plot score against gamma.
    for alpha, color in zip(alphas, colors):
        scores = [
            LabelSpreading(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            .fit(x, y_train)
            .score(x[unlabeled_indices], y[unlabeled_indices])
            for gamma in gammas
        ]
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)

    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading rbf kernel")
    plt.show()
def test_LabelSpreading(*data):
    """Fit LabelSpreading once and report accuracy on the unlabeled subset.

    Prints the same accuracy twice: once via ``score`` on the hidden samples
    and once by comparing the transduction against the true labels.
    """
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1  # -1 marks samples as unlabeled
    # NOTE: gamma only affects the rbf kernel, so it is inert with kernel='knn'.
    clf = LabelSpreading(max_iter=1000, kernel='knn', gamma=0.1)
    clf.fit(X, y_train)
    true_labels = y[unlabeled_indices]
    predicted_labels = clf.transduction_[unlabeled_indices]
    print('Accuracy : %f' % clf.score(X[unlabeled_indices], true_labels))
    print('Accuracy : %f' % metrics.accuracy_score(true_labels, predicted_labels))
def test_LabelSpreading_alpha_gamma(*data):
    """Plot LabelSpreading accuracy as a function of gamma, one curve per alpha.

    ``data`` unpacks to (X, y, unlabeled_indices); accuracy is measured on the
    samples whose labels were hidden (set to -1) before fitting.
    """
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1  # -1 marks samples as unlabeled
    alphas = np.logspace(-2, -1, num=10)
    gammas = np.logspace(-2, 2, num=10)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    for alpha in alphas:  # enumerate index was unused; dropped
        scores = []
        for gamma in gammas:
            # NOTE: gamma only affects the rbf kernel and is inert with 'knn'.
            clf = LabelSpreading(max_iter=1000, kernel='knn', gamma=gamma,
                                 alpha=alpha)
            clf.fit(X, y_train)
            true_labels = y[unlabeled_indices]
            scores.append(clf.score(X[unlabeled_indices], true_labels))
        # BUGFIX: scores holds one value per gamma, so plot against gammas.
        # The original plotted against alphas, which only "worked" because
        # both arrays happen to contain 10 entries.
        ax.plot(gammas, scores, label='alpha = %f' % alpha)
    ax.set_xlabel(r'$\gamma$')
    ax.set_xscale('log')
    ax.legend()
def run_lp_tfidf(nbr, str_list, neighbors):
    """Average LabelSpreading (knn kernel) metrics over 10 tf-idf runs.

    Each run rebuilds the dataset with ``nbr`` true-labeled training docs,
    vectorizes with tf-idf, fits LabelSpreading, and accumulates macro-f1
    and accuracy on the test split; averages are appended to ``str_list``
    and printed.
    """
    total_f1 = 0.0
    total_acc = 0.0
    for _ in range(10):
        dataset = Dataset(categories)
        dataset.split_train_true(nbr)
        vectorizer = TfidfVectorizer()
        train_vecs = vectorizer.fit_transform(dataset.train['data'])
        model = LabelSpreading(kernel='knn', n_neighbors=neighbors)
        model.fit(train_vecs.todense(), dataset.train['target'])
        test_vecs = vectorizer.transform(dataset.test['data'])
        predictions = model.predict(test_vecs.todense())
        total_f1 += metrics.f1_score(dataset.test['target'], predictions,
                                     average='macro')
        total_acc += model.score(test_vecs.todense(), dataset.test['target'])
    avg_f1 = total_f1 / 10
    avg_accuracy = total_acc / 10
    str_list.extend(["KNN TF-IDF Avg f1: " + str(avg_f1),
                     "KNN TF-IDF Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
def label_spreading(x_train, y_train, x_test, y_test):
    """Fit a default LabelSpreading model and return its test accuracy.

    Returns the accuracy formatted as a two-decimal string.
    """
    from sklearn.semi_supervised import LabelSpreading
    model = LabelSpreading()
    model.fit(x_train, y_train)
    return "{0:.2f}".format(model.score(x_test, y_test))
# Randomly choose n_unlabeled distinct training rows and overwrite their
# labels with -1, scikit-learn's marker for "unlabeled".
idxs = np.random.choice(X_train.shape[0], replace = False, size=n_unlabeled)
y = np.asarray(Y_train)
for i in idxs:
    y[i] = -1
Y_train = y
# Train model and print statistics (use 'knn' as kernel)
from sklearn.semi_supervised import LabelSpreading
model = LabelSpreading(kernel = 'knn', n_neighbors = 10, max_iter=1000).fit(X_train, Y_train)
print("Percentage of correct predictions = {}".format(round(100*model.score(X_test, Y_test),2)))
# Boolean vector: True where the prediction matches the true test label.
pred = model.predict(X_test) == Y_test
print("Correct: {}".format(np.count_nonzero(pred==True)),"/",
      "Incorrect: {}".format(np.count_nonzero(pred==False)))
# Build a side-by-side table: predicted label, actual label, and the two
# class probabilities (rounded to 2 decimals).
# NOTE(review): the probability column names assume a binary 0.0/1.0 label
# problem — confirm against the data loading code.
Z1 = model.predict(X_test).reshape(Y_test.size,1)
Z2 = np.asarray(Y_test).reshape(Y_test.size,1)
Z3 = np.around(model.predict_proba(X_test),decimals=2)
data = np.concatenate((Z1,Z2,Z3),axis=1)
outcome = pd.DataFrame(data, columns = ["Predicted Label", "Actual Label",
                                        "Prob. Label = 0.0", "Prob. Label = 1.0"])
# Show only the misclassified rows together with their class probabilities.
indicesToKeep = outcome["Predicted Label"] != outcome["Actual Label"]
print("False predictions with associated class probabilities:\n{}".format(outcome[indicesToKeep]))
# Histogram of the label distribution; the bin edges isolate each integer
# class value 0..10 into its own bin.
hist, bins = np.histogram(
    lables, bins=[-0.1, 0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.1])
print(hist)
print(bins)
print(train_labeled.shape)
print(train_labeled[:, 0])
# Standardize both feature matrices (zero mean, unit variance per column).
train_unlabeled = sklearn.preprocessing.scale(train_unlabeled)
features = sklearn.preprocessing.scale(features)
lp = LabelSpreading(kernel='knn', gamma=20, n_neighbors=7, alpha=0.2,
                    max_iter=50, tol=0.01, n_jobs=-1)
# Extend the label vector with 21000 "-1" (unlabeled) markers — presumably
# one per row of train_unlabeled; verify that count against the data file.
y = lables
for i in range(21000):
    y = np.concatenate((y, np.array([-1])), axis=0)
all_data = np.concatenate((features, train_unlabeled), axis=0)
lp.fit(all_data, y)
Yresult = lp.predict(all_data)
# NOTE(review): this scores the model against its own predictions, so it
# always prints 1.0 — it does not measure real accuracy.
print(lp.score(all_data, Yresult))
np.savetxt('semiLabelsOfUnlabeled2.csv', Yresult, delimiter=",")
gamma=p_gamma, n_neighbors=p_neighbors, alpha=p_alpha) elif (p_ss_mod == 'LabSpr' and p_ss_kern == 'rbf'): label_prop_model = LabelPropagation(kernel=p_ss_kern, gamma=p_gamma, n_neighbors=p_neighbors, alpha=p_alpha, max_iter=70) else: label_prop_model = dic_ss_mod[p_ss_mod](kernel=p_ss_kern, gamma=p_gamma, n_neighbors=p_neighbors) print('Start to fit. Run for shelter!') label_prop_model.fit(X_tot, y_tot) temp_acc = label_prop_model.score(X_valid_lab, y_valid) print('{} / {} :accuracy = {}'.format(i, p_manyfit, temp_acc)) RESULT_ACC_SS += temp_acc y_tot = label_prop_model.transduction_ y_submit = label_prop_model.predict(X_submit) save_to_csv(X_tot, y_tot, X_valid_lab, y_valid) RESULT_ACC_SS /= p_manyfit json_dict['ss_accuracy'] = RESULT_ACC_SS print('accuracy obtained on the test set of the ss algo:', RESULT_ACC_SS) else: init_variables() #PCA preprocessing if (PCA_MODE): pca_preprocess() X_tot, y_tot, X_valid, y_valid = load_xy()
def LabelData(LabelX, unLabelX, Labely, unLabely, testX, testy, batch_id=0, save_model=False):
    """Spread labels from the labeled cells onto the unlabeled cells.

    Fits a knn-kernel LabelSpreading model on the concatenation of labeled
    and unlabeled rows, scores it on (testX, testy), and returns the
    inferred cell types plus the per-class label distribution for the
    originally-unlabeled cells.

    Parameters
    ----------
    LabelX, unLabelX : DataFrames of labeled / unlabeled feature rows
    Labely, unLabely : corresponding label arrays (unlabeled entries are
        expected to carry the -1 "unlabeled" marker — confirm at call site)
    testX, testy : held-out evaluation data
    batch_id : only used by the commented-out CSV export paths
    save_model : pickle the fitted model under ./result/<FileName>/ when True

    Returns
    -------
    (CellResult, LabelDistribution) : dict of inferred cell names/types, and
        a DataFrame of per-class probabilities for the unlabeled cells.
    """
    LabelXLen = LabelX.shape[0]
    print("LabeledCellNames", LabelX)
    # Stack labeled rows on top of unlabeled rows, keeping only shared columns.
    X = pd.concat([LabelX, unLabelX], axis=0, join='inner')
    y = np.append(Labely, unLabely)
    Features = X.columns.values.tolist()
    testX = testX.loc[:, Features]  # align test columns with training columns
    # Knn LabelSpreading
    label_spread = LabelSpreading(kernel='knn', alpha=0.8, max_iter=5)
    print("==== X ====")
    print(X)
    print("==== Y ====")
    print(y)
    label_spread.fit(X, y)
    output_labels = label_spread.transduction_
    score = label_spread.score(testX, testy)
    output_labels = le.inverse_transform(output_labels)
    CellNames = X.index.values.tolist()
    # BUGFIX: the unlabeled rows start at 0-based index LabelXLen; the
    # original sliced from LabelXLen + 1 and silently dropped the first
    # unlabeled cell from the result.
    CellResult = {
        "CellName": CellNames[LabelXLen:],
        "CellType": output_labels[LabelXLen:]
    }
    # Result = pd.DataFrame(data=CellResult)
    # Result.to_csv("./result/%d.csv"%(batch_id), columns=['CellName','CellType'], index=False)
    # accuracy
    print("score : ", score)
    PredictY = label_spread.predict(testX)
    PredictYLabels = le.inverse_transform(PredictY)
    TrueYLabels = le.inverse_transform(testy)
    PredictResult = {"trueLabel": TrueYLabels, "predictLabel": PredictYLabels}
    # PredictResult = pd.DataFrame(data=PredictResult)
    # PredictResult.to_csv("./result/predict_result_%d.csv"%(batch_id), columns=['trueLabel','predictLabel'], index=False)
    # Label Distribution
    print("======= label_spread.label_distributions_ =======")
    print(label_spread.label_distributions_)
    LabelXIndexs = LabelX.index
    indexs = X.index
    ClassLabels = le.inverse_transform(label_spread.classes_)
    print(ClassLabels)
    LabelDistribution = pd.DataFrame(data=label_spread.label_distributions_,
                                     index=indexs, columns=ClassLabels)
    # Keep distributions only for the originally-unlabeled cells.
    LabelDistribution = LabelDistribution.drop(index=LabelXIndexs)
    # LabelDistribution.to_csv("./result/test/LabelDistribution.csv")
    if save_model:
        with open('./result/%s/clf.pickle' % (FileName), 'wb') as f:
            pickle.dump(label_spread, f)
    return CellResult, LabelDistribution
# classification # use max_iter=10 when 20 categories clf_rbf = LabelPropagation(kernel='rbf', gamma=5).fit(vectors_rbf.todense(), dataset_rbf.train['target']) clf_knn = LabelSpreading(kernel='knn', n_neighbors=10).fit(vectors_knn.todense(), dataset_knn.train['target']) test_vec_rbf = vectorizer_rbf.transform(dataset_rbf.test['data']) test_vec_knn = vectorizer_knn.transform(dataset_knn.test['data']) print('----PREDICTIONS----') pred_rbf = clf_rbf.predict(test_vec_rbf.todense()) pred_knn = clf_knn.predict(test_vec_knn.todense()) print('f1 score rbf: ', metrics.f1_score(dataset_rbf.test['target'], pred_rbf, average='macro')) print('clf score rbf: ', clf_rbf.score(test_vec_rbf.todense(), dataset_rbf.test['target'])) print('f1 score knn: ', metrics.f1_score(dataset_knn.test['target'], pred_knn, average='macro')) print('clf score knn: ', clf_knn.score(test_vec_knn.todense(), dataset_knn.test['target'])) np.set_printoptions(precision=2) """" # Plot non-normalized confusion matrix plot_confusion_matrix(dataset_rbf.test['target'], pred_rbf, classes=categories, title='Confusion matrix (RBF), without normalization') # Plot normalized confusion matrix plot_confusion_matrix(dataset_rbf.test['target'], pred_rbf, classes=categories, normalize=True, title='Normalized confusion matrix (RBF)') plt.show() # Plot non-normalized confusion matrix
#print(gridsearch.best_params_) print('got Vectors') model = LabelSpreading(kernel='rbf') params = {'gamma': [0.1, 1.0, 10.0, 30.0, 50.0, 80.0, 100.0, 300.0], 'max_iter': [10, 100, 1000], 'alpha': [0.2, 0.4, 0.6, 0.8]} scoreDict = {} for max_iter in params['max_iter']: model.max_iter = max_iter for alpha in params['alpha']: model.alpha = alpha for gamma in params['gamma']: model.gamma = gamma model.fit(list(data), list(unlab)) score = model.score(list(testData), list(testLabels)) print(score, ' gamma = ', gamma, ' max_iter = ', max_iter, ' alpha = ', alpha) if (score in scoreDict): scoreDict[score].append( 'gamma = ' + str(gamma) + ' max_iter = ' + str(max_iter) + ' alpha = ' + str(alpha)) else: scoreDict[score] = [ 'gamma = ' + str(gamma) + ' max_iter = ' + str(max_iter) + ' alpha = ' + str(alpha)] knnModel = LabelSpreading(kernel='knn') knnParams = {'n_neighbors': [1, 4, 9, 16], 'max_iter': [10, 100, 1000], 'alpha': [0.2, 0.4, 0.6, 0.8]} for max_iter in knnParams['max_iter']: model.max_iter = max_iter for alpha in knnParams['alpha']: model.alpha = alpha for n_neighbors in knnParams['n_neighbors']:
def run_methods(x_c, y, x_e, z_c, z_y, z_e):
    # Benchmark a suite of supervised / semi-supervised / generative
    # classifiers on the same split and return one accuracy per method.
    #   x_c / x_e : cause / effect features of the labelled training set
    #   z_c / z_e : cause / effect features of the test set; z_y its labels
    # NOTE: Python 2 code (print statements below).
    x = np.concatenate((x_c, x_e), axis=1)
    z = np.concatenate((z_c, z_e), axis=1)
    # Baseline: Linear Logistic Regression
    lin_lr = LogisticRegression(random_state=0, solver='liblinear').fit(x, y.ravel())
    acc_lin_lr = lin_lr.score(z, z_y)
    # hard_label_lin_lr = lin_lr.predict(z)
    # soft_label_lin_lr = lin_lr.predict_proba(z)[:, 1]
    # TRANSDUCTIVE APPROACHES
    # merge labelled and unlabelled data (with label -1) for transductive methods
    x_merged = np.concatenate((x, z))
    y_merged = np.concatenate((y, -1 * np.ones(
        (z.shape[0], 1)))).ravel().astype(int)
    # Baseline: Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    lin_tsvm = SKTSVM(kernel='linear')
    lin_tsvm.fit(x_merged, y_merged)
    acc_lin_tsvm = lin_tsvm.score(z, z_y)
    # hard_label_lin_tsvm = lin_tsvm.predict(z)
    # soft_label_lin_tsvm = lin_tsvm.predict_proba(z)[:, 1]
    # Baseline: Non-Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    rbf_tsvm = SKTSVM(kernel='RBF')
    rbf_tsvm.fit(x_merged, y_merged)
    acc_rbf_tsvm = rbf_tsvm.score(z, z_y)
    # hard_label_rbf_tsvm = rbf_tsvm.predict(z)
    # soft_label_rbf_tsvm = rbf_tsvm.predict_proba(z)[:, 1]
    # Baseline: Label Propagation RBF weights
    # NOTE(review): the bare excepts below swallow every error and use [] as
    # an "accuracy unavailable" sentinel; callers must handle that shape.
    try:
        rbf_label_prop = LabelPropagation(kernel='rbf')
        rbf_label_prop.fit(x_merged, y_merged)
        acc_rbf_label_prop = rbf_label_prop.score(z, z_y)
        # hard_label_rbf_label_prop= rbf_label_prop.predict(z)
        # soft_label_rbf_label_prop = rbf_label_prop.predict_proba(z)[:, 1]
    except:
        acc_rbf_label_prop = []
        print 'rbf label prop did not work'
    # Baseline: Label Spreading with RBF weights
    try:
        rbf_label_spread = LabelSpreading(kernel='rbf')
        rbf_label_spread.fit(x_merged, y_merged)
        acc_rbf_label_spread = rbf_label_spread.score(z, z_y)
        # hard_label_rbf_label_spread = rbf_label_spread.predict(z)
        # soft_label_rbf_label_spread = rbf_label_spread.predict_proba(z)[:, 1]
    except:
        acc_rbf_label_spread = []
        print 'rbf label spread did not work '
    # THE K-NN VERSIONS ARE UNSTABLE UNLESS USING LARGE K
    # Baseline: Label Propagation with k-NN weights
    try:
        knn_label_prop = LabelPropagation(kernel='knn', n_neighbors=11)
        knn_label_prop.fit(x_merged, y_merged)
        acc_knn_label_prop = knn_label_prop.score(z, z_y)
        # hard_label_knn_label_prop = knn_label_prop.predict(z)
        # soft_label_knn_label_prop = knn_label_prop.predict_proba(z)[:, 1]
    except:
        acc_knn_label_prop = []
        print 'knn label prop did not work'
    # Baseline: Label Spreading with k-NN weights
    try:
        knn_label_spread = LabelSpreading(kernel='knn', n_neighbors=11)
        knn_label_spread.fit(x_merged, y_merged)
        acc_knn_label_spread = knn_label_spread.score(z, z_y)
        # hard_label_knn_label_spread = knn_label_spread.predict(z)
        # soft_label_knn_label_spread = knn_label_spread.predict_proba(z)[:, 1]
    except:
        acc_knn_label_spread = []
        print 'knn label spread did not work'
    # Generative Models
    # Semi-generative model on labelled data only
    # (converged=True presumably skips EM iterations — confirm in soft_label_EM)
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e, converged=True)
    soft_label_semigen = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_semigen = soft_label_semigen > 0.5
    acc_semigen_labelled = np.mean(hard_label_semigen == z_y)
    # EM with soft labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_soft_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_soft_EM = soft_label_soft_EM > 0.5
    acc_soft_EM = np.mean(hard_label_soft_EM == z_y)
    # EM with hard labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = hard_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_hard_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_hard_EM = soft_label_hard_EM > 0.5
    acc_hard_EM = np.mean(hard_label_hard_EM == z_y)
    # Conditional label prop
    acc_cond_prop = conditional_prop(x_c, y, x_e, z_c, z_y, z_e)
    return acc_lin_lr, acc_lin_tsvm, acc_rbf_tsvm, acc_rbf_label_prop, acc_rbf_label_spread, acc_knn_label_prop,\
        acc_knn_label_spread, acc_semigen_labelled, acc_soft_EM, acc_hard_EM, acc_cond_prop
# In[356]: # Test Label Spreading by cross validation skf = StratifiedKFold(n_splits=5) score0 = [] score0_holdout = [] score1 = [] score1_holdout = [] for i_train, i_test in skf.split(X_train2_, Y_train2_.argmax(axis=1)): X_train3, y_train3 = X_train2_[i_train], Y_train2_[i_train].argmax(axis=1) X_holdout3, y_holdout3 = X_train2_[i_test], Y_train2_[i_test].argmax( axis=1) n_holdout3 = len(y_holdout3) ls0 = LabelSpreading(kernel='rbf', gamma=2, n_neighbors=4) ls0.fit(X_train3, y_train3) score0.append(ls0.score(X_holdout3, y_holdout3)) score0_holdout.append(ls0.score(X_holdout2, Y_holdout2.argmax(axis=1))) print(' Supervised score: {:.4f} (holdout {:.4f})'.format( score0[-1], score0_holdout[-1])) ls1 = LabelSpreading(kernel='rbf', gamma=2, n_neighbors=4) ls1.fit(np.vstack((X_train3, X_holdout3)), np.concatenate((y_train3, np.full(n_holdout3, -1)))) score1.append(ls1.score(X_holdout3, y_holdout3)) score1_holdout.append(ls1.score(X_holdout2, Y_holdout2.argmax(axis=1))) print(' Semi-Supervised score: {:.4f} (holdout {:.4f})'.format( score1[-1], score1_holdout[-1])) print('Mean supervised: {:.4f} (holdout {:.4f})'.format( np.mean(score0), np.mean(score0_holdout))) print('Mean semi-supervised: {:.4f} (holdout {:.4f})'.format( np.mean(score1), np.mean(score1_holdout)))
def label_spreading(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2,
                    max_iter=30, tol=0.001, n_jobs=1):
    """
    LabelSpreading model for semi-supervised learning

    This model is similar to the basic Label Propagation algorithm,
    but uses affinity matrix based on the normalized graph Laplacian
    and soft clamping across the labels.

    Parameters
    ----------
    kernel : {'knn', 'rbf', callable}
        String identifier for kernel function to use or the kernel
        function itself. Only 'rbf' and 'knn' strings are valid inputs.
        The function passed should take two inputs, each of shape
        [n_samples, n_features], and return a [n_samples, n_samples]
        shaped weight matrix
    gamma : float
        parameter for rbf kernel
    n_neighbors : integer > 0
        parameter for knn kernel
    alpha : float
        Clamping factor. A value in [0, 1] that specifies the relative
        amount that an instance should adopt the information from its
        neighbors as opposed to its initial label. alpha=0 means keeping
        the initial label information; alpha=1 means replacing all
        initial information.
    max_iter : integer
        maximum number of iterations allowed
    tol : float
        Convergence tolerance: threshold to consider the system at
        steady state
    n_jobs : int or None, optional (default=None)
        The number of parallel jobs to run. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. See :term:`Glossary <n_jobs>` for more details.

    Returns
    -------
    score : the score of learning model on test data

    Example
    --------
    >>> labeled_path = "../data/labeled.csv"
    >>> unlabeled_path = "../data/unlabeled.csv"
    >>> mtl = MultiTaskLearner(labeled_path, unlabeled_path)
    >>> encoding = mtl.embed(word_length=5)
    >>> X, y, X_t, y_t = train_test_split(mtl.sequences, mtl.labels, test_size=0.33)
    >>> score = mtl.semi_supervised_learner(X, y, X_t, y_t, ssl="label_spreading")
    """
    model = LabelSpreading(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors,
                           alpha=alpha, max_iter=max_iter, tol=tol,
                           n_jobs=n_jobs)
    # Fit on the learner's stored training split and score on its test split.
    model.fit(self.X, self.y)
    return model.score(self.X_t, self.y_t)
# Record test-set statistics for the semi-supervised Naive Bayes model:
# accuracy %, correct / incorrect counts, % of rows left unlabeled, and
# the training wall time in nanoseconds.
pred = model4.predict(X_test) == Y_test
statistics.loc[3] = ["SS Naive Bayes", round(100*model4.score(X_test, Y_test),2),
                     np.count_nonzero(pred==True), np.count_nonzero(pred==False),
                     round(100*idxs.size/Y_train.size,2), training_time4]
# Train semi-supervised LabelSpreading model, predict (use 'knn' as kernel) and collect statistics
from sklearn.semi_supervised import LabelSpreading
start_time5 = time.time_ns()
model5 = LabelSpreading(kernel = 'knn', n_neighbors = 10, max_iter=1000).fit(X_train, Y_train)
training_time5 = time.time_ns() - start_time5
pred = model5.predict(X_test) == Y_test
statistics.loc[4] = ["SS Label Spreading", round(100*model5.score(X_test, Y_test),2),
                     np.count_nonzero(pred==True), np.count_nonzero(pred==False),
                     round(100*idxs.size/Y_train.size,2), training_time5]
# Print summary statistics
print(statistics)
max_iter=1000).fit(vectors.todense(), dataset.train['target']) clf_knn = LabelSpreading(kernel='knn', n_neighbors=5, max_iter=1000).fit(vectors_knn.todense(), dataset_knn.train['target']) test_vec_rbf = vectorizer_rbf.transform(dataset.test['data']) test_vec_knn = vectorizer_knn.transform(dataset_knn.test['data']) print('----PREDICTIONS----') pred_rbf = clf_rbf.predict(test_vec_rbf.todense()) pred_knn = clf_knn.predict(test_vec_knn.todense()) print('f1 score rbf: ', metrics.f1_score(dataset.test['target'], pred_rbf, average='macro')) print('clf score rbf: ', clf_rbf.score(test_vec_rbf.todense(), dataset.test['target'])) print('f1 score knn: ', metrics.f1_score(dataset_knn.test['target'], pred_knn, average='macro')) print('clf score knn: ', clf_knn.score(test_vec_knn.todense(), dataset_knn.test['target'])) np.set_printoptions(precision=2) """ # Plot non-normalized confusion matrix plot_confusion_matrix(dataset.test['target'], pred_rbf, classes=categories, title='Confusion matrix (RBF with vocabulary), without normalization') # Plot normalized confusion matrix plot_confusion_matrix(dataset.test['target'], pred_rbf, classes=categories, normalize=True, title='Normalized confusion matrix (RBF with vocabulary)') plt.show()