def lb_prop_classify(network, labels):
    """Evaluate LabelPropagation over a 10-fold stratified split.

    For every split the model is fitted on the second index array yielded by
    ``StratifiedKFold.split`` and scored on the first one.  The accuracies'
    mean and standard deviation, and the confusion matrix produced by
    ``get_percentile_cm(get_average_cm(cms))``, are printed.

    NOTE(review): ``split`` yields ``(train, test)``; the loop below consumes
    the pair in swapped roles, exactly as the original code did, so each model
    is fitted on one fold and scored on the other nine. Confirm that this
    low-label setting is intended.

    Parameters
    ----------
    network : array-like
        Samples, indexable along axis 0.
    labels : array-like
        Class label of every sample.

    Returns
    -------
    list
        One accuracy value per split.
    """
    network = np.asarray(network)
    labels = np.asarray(labels)

    kf = StratifiedKFold(n_splits=10)
    scores = []
    cms = []
    for test_index, train_index in kf.split(network, labels):
        # Index with the fold arrays themselves.  Slicing min(idx):max(idx)
        # dropped the last sample of the fold and, because stratified folds
        # are not contiguous, pulled samples of other folds into the fit set.
        train_dataset, train_labels = network[train_index], labels[train_index]
        test_dataset, test_labels = network[test_index], labels[test_index]

        label_spreading_model = LabelPropagation()
        label_spreading_model.fit(train_dataset, train_labels)
        scores.append(label_spreading_model.score(test_dataset, test_labels))

        prediction = label_spreading_model.predict(test_dataset)
        # `labels` is passed by keyword: recent scikit-learn releases reject
        # it as a third positional argument.
        cms.append(confusion_matrix(test_labels, prediction,
                                    labels=label_spreading_model.classes_))

    print('label propagation media {}'.format(np.average(scores)))
    print('label propagation desvio padrao {}'.format(np.std(scores)))
    print('label propagation matriz de confusao')
    print(get_percentile_cm(get_average_cm(cms)))
    print('\n')
    return scores
def test_LabelPropagation_knn(*data):
    """Plot LabelPropagation accuracy (knn kernel) against ``n_neighbors``.

    :param data: a 3-tuple ``(X, y, unlabeled_indices)``: the samples, their
        true labels, and the indices of the samples treated as unlabeled.
    :return: None (the figure is shown with ``plt.show()``)
    """
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # copy is required: the true labels in y are used for scoring
    y_train[unlabeled_indices] = -1  # -1 marks a sample as unlabeled

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    scores = []
    for K in Ks:
        clf = LabelPropagation(max_iter=100, n_neighbors=K, kernel='knn')
        clf.fit(X, y_train)
        scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))

    # The curve carries a label so that the legend() call below has an entry
    # to draw; without one matplotlib only emits a warning.
    ax.plot(Ks, scores, label="score")

    # Figure decoration.
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation knn kernel")
    plt.show()
def test_LabelPropagation_rbf(*data):
    """Plot rbf-kernel LabelPropagation accuracy over gamma, one curve per alpha.

    ``data`` unpacks to ``(x, y, unlabeled_indices)``.  The labels at
    ``unlabeled_indices`` are hidden (set to -1) for fitting, and the score is
    computed on exactly those samples against their true labels.

    NOTE(review): newer scikit-learn releases no longer accept the ``alpha``
    argument of LabelPropagation - confirm the installed version.
    """
    x, y, unlabeled_indices = data
    masked_labels = np.copy(y)
    masked_labels[unlabeled_indices] = -1

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    alpha_values = np.linspace(0.01, 1, num=10, endpoint=True)
    gamma_values = np.logspace(-2, 2, num=50)
    # One colour per curve.
    palette = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    def accuracy(gamma, alpha):
        # Fit on the masked labels, score on the hidden samples only.
        model = LabelPropagation(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
        model.fit(x, masked_labels)
        return model.score(x[unlabeled_indices], y[unlabeled_indices])

    # Train and draw one curve per alpha.
    for alpha, color in zip(alpha_values, palette):
        curve = [accuracy(gamma, alpha) for gamma in gamma_values]
        axes.plot(gamma_values, curve, label=r"$\alpha=%s$" % alpha, color=color)

    # Figure decoration.
    axes.set_xlabel(r"$\gamma$")
    axes.set_ylabel("score")
    axes.set_xscale("log")
    axes.legend(loc='best')
    axes.set_title("LabelPropagation rbf kernel")
    plt.show()
def test_LabelPropagation_rbf(*data):
    """Plot LabelPropagation accuracy (rbf kernel) against ``gamma``.

    :param data: a 3-tuple ``(X, y, unlabeled_indices)``: the samples, their
        true labels, and the indices of the samples treated as unlabeled.
    :return: None (the figure is shown with ``plt.show()``)
    """
    X, y, unlabeled_indices = data
    # A copy is required: the true labels in y are used for scoring.
    y_train = np.copy(y)
    # -1 marks a sample as unlabeled.
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    gammas = np.logspace(-2, 2, num=50)
    scores = []
    for gamma in gammas:
        clf = LabelPropagation(max_iter=100, gamma=gamma, kernel='rbf')
        clf.fit(X, y_train)
        scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))

    # The curve carries a label so that the legend() call below has an entry
    # to draw; without one matplotlib only emits a warning.
    ax.plot(gammas, scores, label="score")

    # Figure decoration.
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
def test_LabelPropagation(*data):
    """Fit an rbf LabelPropagation model and print its accuracy.

    ``data`` unpacks to ``(X, y, unlabeled_indices)``.  The labels at
    ``unlabeled_indices`` are replaced by -1 for fitting; the printed
    accuracy is measured on those same samples against their true labels.
    """
    X, y, unlabeled_indices = data

    # Work on a copy so the true labels in y stay available for scoring.
    masked_labels = np.copy(y)
    masked_labels[unlabeled_indices] = -1

    model = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    model.fit(X, masked_labels)

    hidden_samples = X[unlabeled_indices]
    hidden_truth = y[unlabeled_indices]
    accuracy = model.score(hidden_samples, hidden_truth)
    print('Accuracy : %.2f' % accuracy)
def test_LabelPropagation(*data):
    """Fit an rbf LabelPropagation model and print its accuracy.

    ``data`` unpacks to ``(x, y, unlabeled_indices)``.  The labels at
    ``unlabeled_indices`` are replaced by -1 for fitting; the printed
    accuracy is measured on those same samples against their true labels.
    """
    x, y, unlabeled_indices = data

    # Copy first: y itself is still needed below for the true labels.
    masked_labels = np.copy(y)
    # -1 marks a sample as unlabeled.
    masked_labels[unlabeled_indices] = -1

    model = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    model.fit(x, masked_labels)

    # Accuracy on the samples whose labels were hidden.
    hidden_truth = y[unlabeled_indices]
    accuracy = model.score(x[unlabeled_indices], hidden_truth)
    print("Accuracy: %f" % accuracy)
def process(self, n_components):
    """Fit LabelPropagation on the preprocessed data and record its accuracy.

    The data comes from ``self.preprocess(n_components)``.  A confusion matrix
    is plotted to ``lp_comps_<n_components>.png`` via
    ``plot_confusion_matrix``, the test accuracy is appended to
    ``self.m_acc`` and the model parameters are printed.
    """
    train_samples, train_targets, test_samples, test_targets = self.preprocess(n_components)

    model = LabelPropagation(n_jobs=-1)
    model.fit(train_samples, train_targets)

    predicted = model.predict(test_samples)
    accuracy = model.score(test_samples, test_targets)

    figure_name = 'lp_comps_%d.png' % n_components
    plot_confusion_matrix(test_targets, predicted, self.labels,
                          normalize=False, figname=figure_name)

    self.m_acc.append(accuracy)
    print(model.get_params())
def khren3(G):
    """Collect common neighbours per edge and run a two-seed LabelPropagation.

    Every edge ``(u, v)`` is visited once and mapped to the set of common
    neighbours of ``u`` and ``v``.  Edges whose endpoints share the same
    ``ground_label`` node attribute go to the first dictionary, all others to
    the second.  The first edge with the most common neighbours among the
    differently-labelled edges supplies the two seed nodes (labels 0 and 1);
    LabelPropagation is then fitted on the adjacency matrix and its score
    against the ground labels is printed.

    NOTE(review): the seeds are always labelled 0 and 1 regardless of the
    seed nodes' own ``ground_label`` values - confirm this is intended.

    Parameters
    ----------
    G : networkx graph whose nodes carry a ``ground_label`` attribute.

    Returns
    -------
    tuple
        ``(result_s, result_d)``: dicts mapping ``(u, v)`` to the set of
        common neighbours for same-label and different-label edges.

    Raises
    ------
    ValueError
        If the graph has no same-label edge or no different-label edge.
    """
    result_s = {}
    result_d = {}
    neighbours_of = {v: set(nx.neighbors(G, v)) for v in G.nodes}

    # A set gives O(1) membership tests; the original list made the inner
    # test linear in the number of nodes already processed.
    visited = set()
    for u in G.nodes:
        visited.add(u)
        for v in neighbours_of[u]:
            if v in visited:
                continue
            common = neighbours_of[u] & neighbours_of[v]
            if G.nodes[u]["ground_label"] == G.nodes[v]['ground_label']:
                result_s[(u, v)] = common
            else:
                result_d[(u, v)] = common

    if not result_s or not result_d:
        raise ValueError(
            "khren3 needs at least one same-label edge and one "
            "different-label edge")

    min_s = min(len(common) for common in result_s.values())
    min_d = min(len(common) for common in result_d.values())
    # max() returns the first maximal item, i.e. the same pair the original
    # first-match scan selected.
    max_pair = max(result_d, key=lambda pair: len(result_d[pair]))
    print(min_s, min_d)

    adj_matrix = nx.adjacency_matrix(G).toarray()
    node_order = list(G.nodes)
    # Rows of the adjacency matrix follow the graph's node order, so the seed
    # labels are placed by row position rather than by raw node id.
    position = {node: index for index, node in enumerate(node_order)}
    labels = [-1] * len(node_order)
    true_labels = [G.nodes[node]['ground_label'] for node in node_order]
    labels[position[max_pair[0]]] = 0
    labels[position[max_pair[1]]] = 1

    lp = LabelPropagation(kernel='rbf', gamma=0.7, max_iter=1000)
    lp.fit(adj_matrix, labels)
    print(lp.score(adj_matrix, true_labels))
    return (result_s, result_d)
def test_LabelPropagation_rbf(*data):
    '''
    Show how the accuracy of an rbf-kernel LabelPropagation varies with
    alpha and gamma.

    NOTE(review): newer scikit-learn releases no longer accept the ``alpha``
    argument of LabelPropagation - confirm the installed version.

    :param data: a tuple holding, in order: the samples, the sample labels,
        the indices of the unlabeled samples
    :return: None
    '''
    X, y, unlabeled_indices = data
    # Copy is mandatory: y is still needed for scoring.
    hidden_y = np.copy(y)
    # -1 marks a sample as unlabeled.
    hidden_y[unlabeled_indices] = -1

    canvas = plt.figure()
    panel = canvas.add_subplot(1, 1, 1)

    alpha_grid = np.linspace(0.01, 1, num=10, endpoint=True)
    gamma_grid = np.logspace(-2, 2, num=50)
    # Colour set: a different colour for every curve.
    curve_colors = (
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (0.5, 0.5, 0),
        (0, 0.5, 0.5),
        (0.5, 0, 0.5),
        (0.4, 0.6, 0),
        (0.6, 0.4, 0),
        (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    def evaluate(alpha, gamma):
        # Fit with hidden labels, score on the hidden samples only.
        estimator = LabelPropagation(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
        estimator.fit(X, hidden_y)
        return estimator.score(X[unlabeled_indices], y[unlabeled_indices])

    # Train and plot.
    for alpha, color in zip(alpha_grid, curve_colors):
        accuracies = [evaluate(alpha, gamma) for gamma in gamma_grid]
        panel.plot(gamma_grid, accuracies, label=r"$\alpha=%s$" % alpha, color=color)

    # Figure decoration.
    panel.set_xlabel(r"$\gamma$")
    panel.set_ylabel("score")
    panel.set_xscale("log")
    panel.legend(loc="best")
    panel.set_title("LabelPropagation rbf kernel")
    plt.show()
def test_LabelPropagation_knn(*data):
    '''
    Show how the accuracy of a knn-kernel LabelPropagation varies with
    alpha and n_neighbors.

    NOTE(review): newer scikit-learn releases no longer accept the ``alpha``
    argument of LabelPropagation - confirm the installed version.

    :param data: a tuple holding, in order: the samples, the sample labels,
        the indices of the unlabeled samples
    :return: None
    '''
    X, y, unlabeled_indices = data
    # Copy is mandatory: y is still needed for scoring.
    hidden_y = np.copy(y)
    # -1 marks a sample as unlabeled.
    hidden_y[unlabeled_indices] = -1

    canvas = plt.figure()
    panel = canvas.add_subplot(1, 1, 1)

    alpha_grid = np.linspace(0.01, 1, num=10, endpoint=True)
    neighbor_counts = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    # Colour set: a different colour for every curve.
    curve_colors = (
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (0.5, 0.5, 0),
        (0, 0.5, 0.5),
        (0.5, 0, 0.5),
        (0.4, 0.6, 0),
        (0.6, 0.4, 0),
        (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    def evaluate(alpha, k):
        # Fit with hidden labels, score on the hidden samples only.
        estimator = LabelPropagation(max_iter=100, n_neighbors=k, alpha=alpha, kernel='knn')
        estimator.fit(X, hidden_y)
        return estimator.score(X[unlabeled_indices], y[unlabeled_indices])

    # Train and plot.
    for alpha, color in zip(alpha_grid, curve_colors):
        accuracies = [evaluate(alpha, k) for k in neighbor_counts]
        panel.plot(neighbor_counts, accuracies, label=r"$\alpha=%s$" % alpha, color=color)

    # Figure decoration.
    panel.set_xlabel(r"$k$")
    panel.set_ylabel("score")
    panel.legend(loc="best")
    panel.set_title("LabelPropagation knn kernel")
    plt.show()
def ssl_label_prop(unlabel, clfs, true, x, y, test):
    """Fit a knn LabelPropagation on mixed data and score it on ``test``.

    ``shuffle.run`` receives the unlabeled samples (each paired with label
    -1) together with ``x`` and ``y`` and returns the training samples and
    labels used for fitting.  Every row of ``test`` is split into its first
    element (the true label) and the remaining elements (the features).

    ``clfs`` and ``true`` are not used by this function.

    Returns the score of the fitted model on the test rows.
    """
    # The converted values are discarded; the loop only has an effect when an
    # element of y cannot be converted by int().
    # NOTE(review): this looks like an attempted in-place cast of y - confirm.
    for raw_label in y:
        int(raw_label)

    unlabeled_targets = [-1] * len(unlabel)
    train_samples, train_targets, _noisy_labels = shuffle.run(unlabel, unlabeled_targets, x, y)

    # Split each test row into (label, features) in a single pass.
    split_rows = [(record[0], record[1:]) for record in test]
    ground = [label for label, _ in split_rows]
    point = [features for _, features in split_rows]

    # scikit-learn estimator.
    model = LabelPropagation(kernel='knn', n_neighbors=2, max_iter=400, tol=0.01)
    model.fit(train_samples, train_targets)
    return model.score(point, ground)
def test_LabelPropagation_alpha_n_neighbors(*data):
    """Plot knn-kernel LabelPropagation accuracy over n_neighbors per alpha.

    ``data`` unpacks to ``(X, y, unlabeled_indices)``.  Labels at
    ``unlabeled_indices`` are set to -1 for fitting and the score is taken on
    those samples against their true labels.  The figure is built but this
    function does not call ``plt.show()``.

    NOTE(review): newer scikit-learn releases no longer accept the ``alpha``
    argument of LabelPropagation - confirm the installed version.
    """
    X, y, unlabeled_indices = data
    masked_labels = np.copy(y)
    masked_labels[unlabeled_indices] = -1

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    alpha_values = np.linspace(0.01, 1, num=2, endpoint=True)
    neighbor_counts = [1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 30, 40, 50]

    def accuracy(alpha, k):
        # Fit on the masked labels, score on the hidden samples only.
        model = LabelPropagation(max_iter=1000, kernel='knn', n_neighbors=k, alpha=alpha)
        model.fit(X, masked_labels)
        hidden_truth = y[unlabeled_indices]
        return model.score(X[unlabeled_indices], hidden_truth)

    for alpha in alpha_values:
        curve = [accuracy(alpha, k) for k in neighbor_counts]
        axes.plot(neighbor_counts, curve, label='alpha = %s' % alpha)

    axes.set_xlabel('n_neighbors')
    axes.set_ylabel('score')
    axes.set_xscale('log')
    axes.legend()
def test_LabelPropagation_alpha_gamma(*data):
    """Plot rbf-kernel LabelPropagation accuracy over gamma per alpha.

    ``data`` unpacks to ``(X, y, unlabeled_indices)``.  Labels at
    ``unlabeled_indices`` are set to -1 for fitting and the score is taken on
    those samples against their true labels.  The figure is built but this
    function does not call ``plt.show()``.

    NOTE(review): newer scikit-learn releases no longer accept the ``alpha``
    argument of LabelPropagation - confirm the installed version.
    """
    X, y, unlabeled_indices = data
    masked_labels = np.copy(y)
    masked_labels[unlabeled_indices] = -1

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    alpha_values = np.linspace(0.01, 1, num=10, endpoint=True)
    gamma_values = np.logspace(-2, 2, num=5)

    def accuracy(alpha, gamma):
        # Fit on the masked labels, score on the hidden samples only.
        model = LabelPropagation(max_iter=100, kernel='rbf', gamma=gamma, alpha=alpha)
        model.fit(X, masked_labels)
        hidden_truth = y[unlabeled_indices]
        return model.score(X[unlabeled_indices], hidden_truth)

    for alpha in alpha_values:
        curve = [accuracy(alpha, gamma) for gamma in gamma_values]
        axes.plot(gamma_values, curve, label='alpha = %s' % alpha)

    axes.set_xlabel('gamma')
    axes.set_ylabel('score')
    axes.set_xscale('log')
    axes.legend()
def test_LabelPropagation_knn(*data):
    """Plot knn-kernel LabelPropagation accuracy over k, one curve per alpha.

    ``data`` unpacks to ``(x, y, unlabeled_indices)``.  The labels at
    ``unlabeled_indices`` are hidden (set to -1) for fitting, and the score is
    computed on exactly those samples against their true labels.

    NOTE(review): newer scikit-learn releases no longer accept the ``alpha``
    argument of LabelPropagation - confirm the installed version.
    """
    x, y, unlabeled_indices = data
    masked_labels = np.copy(y)
    masked_labels[unlabeled_indices] = -1

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    alpha_values = np.linspace(0.01, 1, num=10, endpoint=True)
    neighbor_counts = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    # One colour per curve.
    palette = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    def accuracy(k, alpha):
        # Fit on the masked labels, score on the hidden samples only.
        model = LabelPropagation(max_iter=100, n_neighbors=k, alpha=alpha, kernel='knn')
        model.fit(x, masked_labels)
        return model.score(x[unlabeled_indices], y[unlabeled_indices])

    # Train and draw one curve per alpha.
    for alpha, color in zip(alpha_values, palette):
        curve = [accuracy(k, alpha) for k in neighbor_counts]
        axes.plot(neighbor_counts, curve, label=r"$\alpha=%s$" % alpha, color=color)

    # Figure decoration.
    axes.set_xlabel(r"k")
    axes.set_ylabel("score")
    axes.legend(loc='best')
    axes.set_title("LabelPropagation knn kernel")
    plt.show()
def experemint_2(l=8):
    """Run 100 randomised trials comparing four semi-supervised classifiers.

    The data comes from ``get_data()`` as two groups ("mac" and "win").  The
    last 500 samples of each group form the fixed test set; of the remaining
    samples the first ``l`` per group are used as labeled data and the rest as
    unlabeled data.  In every trial both groups are shuffled in place and four
    accuracies are recorded and printed: the LabelPropagation model (named
    ``tSVM`` in this code), ``random_walk.random_walk``, ``evaluate_kernel``
    with the "polyStep" cluster kernel and ``evaluate_kernel_SVM`` with the
    "linear" cluster kernel.  Mean and standard deviation over the trials are
    printed at the end.

    :param l: number of labeled samples taken from each group
    :return: None
    """
    # Experiment comparing random walk, tSVM, SVM and our cluster kernel:
    tSVM = LabelPropagation(max_iter=5000)
    np.random.seed(133769)  # reproducibility
    x_mac, x_win, y_mac, y_win = get_data()
    # Hold out the last 500 samples of each group as the test set.
    x_test = np.vstack((x_mac[-500:], x_win[-500:]))
    y_test = np.hstack((y_mac[-500:], y_win[-500:]))
    # Test targets for the LabelPropagation model: the mac labels are zeroed.
    y_test_tsvm = np.hstack((0.0 * y_mac[-500:], y_win[-500:]))
    x_mac, x_win, y_mac, y_win = x_mac[: -500], x_win[: -500], y_mac[: -500], y_win[: -500]
    y_mac_tsvm = np.zeros((y_mac.shape))  # change -1 to zero
    x_labeled = np.vstack((x_mac[:l], x_win[:l]))
    x_unlabeled = np.vstack((x_mac[l:], x_win[l:]))
    X = np.vstack((x_labeled, x_unlabeled))
    y_labeled = np.hstack((y_mac[:l], y_win[:l]))
    y_labeled_tsvm = np.hstack((y_mac_tsvm[:l], y_win[:l]))
    y_unlabeled = np.hstack((y_mac[l:], y_win[l:]))
    y_unlabeled_tsvm = -np.ones((y_unlabeled.shape))  # Set unlabeled points
    # Built once and reused in every trial, in the same row order as X.
    # NOTE(review): the shuffles below reorder the samples but not this
    # vector; that presumably relies on all labels within a group being
    # equal - confirm.
    labels_tsvm = np.hstack((y_labeled_tsvm, y_unlabeled_tsvm))
    # One accuracy slot per trial for each method.
    acc_tSVM = np.array([None] * 100)
    acc_random_walk = np.array([None] * 100)
    acc_polyStep = np.array([None] * 100)
    acc_linear = np.array([None] * 100)
    kernel1 = lambda x: cluster_kernel.kernel(x, 10, "polyStep", 16)
    kernel2 = lambda x: cluster_kernel.kernel(x, 10, "linear", 16)
    for test in range(100):
        # Draw a new labeled/unlabeled partition by shuffling each group in place.
        np.random.shuffle(x_mac)
        np.random.shuffle(x_win)
        x_labeled = np.vstack((x_mac[:l], x_win[:l]))
        x_unlabeled = np.vstack((x_mac[l:], x_win[l:]))
        y_labeled = np.hstack((y_mac[:l], y_win[:l]))
        X = np.vstack((x_labeled, x_unlabeled))
        tSVM.fit(X, labels_tsvm)
        acc_tSVM[test] = tSVM.score(x_test, y_test_tsvm)
        print(f'accuracy = {acc_tSVM[test] * 100}% () tSVM')
        acc_random_walk[test] = random_walk.random_walk(
            x_labeled, x_unlabeled, x_test, y_labeled, y_test)
        print(f'accuracy = {acc_random_walk[test] * 100}% () Random Walk')
        acc_polyStep[test] = evaluate_kernel(x_labeled, x_unlabeled, x_test, y_labeled, y_test, kernel1)
        print(f'accuracy = {acc_polyStep[test] * 100}% () Poly Step')
        acc_linear[test] = evaluate_kernel_SVM(x_labeled, x_unlabeled, x_test, y_labeled,
                                               y_test, kernel2)
        print(f'accuracy = {acc_linear[test] * 100}% () Linear')
        # acc[test] = evaluate_kernel_2(x_labeled_i, x_test, y_labeled, y_test, k)
        #acc[test] = evaluate_kernel(x_labeled, x_unlabeled, x_test, y_labeled, y_test, kernel)
        # acc[test] = random_walk.random_walk(x_labeled, x_unlabeled, x_test, y_labeled, y_test)
    # Summary over all trials: mean accuracy and standard deviation per method.
    print(
        f'normal SVM: accuracy = {acc_linear.mean() * 100}% (±{acc_linear.std() * 100:.2})'
    )
    print(
        f'tSVM: accuracy = {acc_tSVM.mean() * 100}% (±{acc_tSVM.std() * 100:.2})'
    )
    print(
        f'random walk: accuracy = {acc_random_walk.mean() * 100}% (±{acc_random_walk.std() * 100:.2})'
    )
    print(
        f'Cluster kernel: accuracy = {acc_polyStep.mean() * 100}% (±{acc_polyStep.std() * 100:.2})'
    )
# Preview the data set: feature names and the first five samples.
print(iris.feature_names)
print(iris.data[:5])

# In[2]:

rng = np.random.RandomState(42)
# Boolean mask with one entry per sample; True entries get their label hidden.
random_unlabeled_points = rng.rand(len(iris.target)) < 0.8
# Copy so that iris.target keeps the true labels for scoring.
labels = np.copy(iris.target)
print('supervised labels: ')
print(labels)
# -1 marks a sample as unlabeled.
labels[random_unlabeled_points] = -1
print('semi supervised labels: ')
print(labels)

# In[3]:

from sklearn.semi_supervised import LabelPropagation
label_prop_model = LabelPropagation()

# In[4]:

label_prop_model.fit(iris.data, labels)

# In[5]:

# Bare expression: its value is not stored or printed by this script.
label_prop_model.transduction_

# In[6]:

# Scored against the true labels; the value is not stored or printed here.
label_prop_model.score(X=iris.data, y=iris.target)
Y_test = list(test.Category.values)

# First model: LabelPropagation with the knn kernel, fitted on the labeled
# training data.
label_prop_model = LabelPropagation(kernel="knn")
labels = np.copy(Y_train_l)
label_prop_model.fit(X_train_l, labels)
# The predictions and the score below are not stored or printed here.
label_prop_model.predict(X_train_ul)
#y = test.Category.values.reshape(-1,1)
label_prop_model.score(X_test, test.Category.values)

# Second model: LabelSpreading with default parameters, same data.
# The name label_prop_model is reused for it.
label_prop_model = LabelSpreading()
labels = np.copy(Y_train_l)
label_prop_model.fit(X_train_l, labels)
# The predictions and the score below are not stored or printed here.
label_prop_model.predict(X_train_ul)
#y = test.Category.values.reshape(-1,1)
label_prop_model.score(X_test, test.Category.values)
def _summarise_predictions(name, model, test_X, test_Y, predicted):
    """Build the result dictionary for one fitted classifier."""
    return {
        'name': name,
        'score': model.score(test_X, test_Y),
        # NOTE(review): despite the key name this is the fraction of
        # MISCLASSIFIED samples (uses `!=`).  Kept unchanged because callers
        # may rely on the current value.
        'accuracy_naive': (test_Y != predicted).sum() * 1.0 / len(predicted),
        'accuracy_score': accuracy_score(test_Y, predicted),
        'classification_report': classification_report(test_Y, predicted)
    }


def _fit_and_summarise(name, model, train_X, fit_Y, test_X, test_Y):
    """Fit ``model``, predict the test set and return its result dictionary."""
    model.fit(train_X, fit_Y)
    predicted = model.predict(test_X)
    return _summarise_predictions(name, model, test_X, test_Y, predicted)


def do_machinea_leaning_stuff(train_X, train_Y, test_X, test_Y):
    """Fit a fixed series of classifiers and report their test performance.

    Every classifier is fitted on ``(train_X, train_Y)``, except
    LabelPropagation, which is fitted on a copy of ``train_Y`` in which a
    random subset of labels (seeded ``RandomState(42)``, threshold 0.3) has
    been replaced by -1.

    Parameters
    ----------
    train_X, train_Y : training samples and labels
    test_X, test_Y : test samples and labels; ``test_Y`` must support
        element-wise ``!=`` against the prediction array.

    Returns
    -------
    list of dict
        One dictionary per classifier, in evaluation order, with the keys
        ``name``, ``score``, ``accuracy_naive``, ``accuracy_score`` and
        ``classification_report``.
    """
    returnValue = []

    def evaluate(name, model, fit_Y=None):
        # Fit/predict/summarise one model, append its entry, and hand the
        # fitted model back for optional further use.
        targets = train_Y if fit_Y is None else fit_Y
        returnValue.append(
            _fit_and_summarise(name, model, train_X, targets, test_X, test_Y))
        return model

    # Classification algorithms, evaluated in a fixed order.
    evaluate("RandomForestClassifier",
             RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0))
    evaluate("ExtraTreesClassifier", ExtraTreesClassifier())

    gpc = evaluate("GaussianProcessClassifier",
                   GaussianProcessClassifier(random_state=0))
    # TODO: maybe use these probabilities as well.
    print(gpc.predict_proba(test_X))

    evaluate("PassiveAggressiveClassifier",
             PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3))
    evaluate("RidgeClassifier", RidgeClassifier())
    evaluate("SGDClassifier", SGDClassifier(max_iter=1000, tol=1e-3))
    evaluate("BernoulliNB", BernoulliNB())
    evaluate("KNeighborsClassifier", KNeighborsClassifier(n_neighbors=3))
    evaluate("MLPClassifier", MLPClassifier())

    # Semi-supervised model: hide a random subset of the training labels.
    label_prop_model = LabelPropagation()
    rng = np.random.RandomState(42)
    random_unlabeled_points = rng.rand(len(train_Y)) < 0.3
    labels = np.copy(train_Y)
    labels[random_unlabeled_points] = -1
    evaluate("LabelPropagation", label_prop_model, fit_Y=labels)

    # The original reported label_prop_model.score() under "LinearSVC"
    # (copy-paste slip); the entry now carries LinearSVC's own score.
    evaluate("LinearSVC", LinearSVC(random_state=0, tol=1e-5))
    evaluate("SVC", SVC(gamma='auto'))
    evaluate("DecisionTreeClassifier", DecisionTreeClassifier(random_state=0))
    evaluate("CalibratedClassifierCV", CalibratedClassifierCV())

    return returnValue
# Encode the learning-style names as integer class ids.
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Indefinido', 'estilo_de_aprendizagem'] = 0
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Ativo', 'estilo_de_aprendizagem'] = 1
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Teorico', 'estilo_de_aprendizagem'] = 2
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Reflexivo', 'estilo_de_aprendizagem'] = 3
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Pragmatico', 'estilo_de_aprendizagem'] = 4
datatrain = datatrain.apply(pd.to_numeric)

# DataFrame.as_matrix() was removed in pandas 1.0; `.values` returns the same
# ndarray and is available on old and new pandas alike.
datatrain_array = datatrain.values

# Columns 0-13 are the features; column 14 is kept as an (n, 1) column of targets.
X = datatrain_array[:, :14]
y = datatrain_array[:, 14:15]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Despite its name, `cv` is a LabelPropagation classifier.
cv = LabelPropagation()
cv.fit(X_train, y_train)
precisao = cv.score(X_test, y_test)
print("------Acurácia-------: %f" % (precisao))
# Throughout this section the values returned by predict() and score() are
# not stored or printed.
sgd.score(x_test_3, y_test_3)

# SGD classifier with loss='log'.
sgd = SGDClassifier(loss='log', shuffle=True, random_state=171)
sgd.fit(x_train_3, y_train_3)
sgd.predict(x_train_3)
sgd.score(x_test_3, y_test_3)

# SGD classifier with the default loss.
sgd = SGDClassifier(shuffle=True, random_state=171)
sgd.fit(x_train_3, y_train_3)
sgd.predict(x_train_3)
sgd.score(x_test_3, y_test_3)

# Write the two submission files from predictions computed earlier.
submission = pd.DataFrame({'Id': test.Id, 'Cover_Type': ensemble_test_pred})
submission.head()
submission.to_csv('submission.csv', index=False)
submission_tree = pd.DataFrame({'Id': test.Id, 'Cover_Type': tree_test_pred})
submission_tree.head()
submission_tree.to_csv('submission2.csv', index=False)

#Extra tree classifier is a tree based model for classification problems
et = ExtraTreeClassifier()
et.fit(x_train_3, y_train_3)
et.predict(x_train_3)
et.score(x_test_3, y_test_3)

# Label propagation, fitted here on fully labeled training data.
from sklearn.semi_supervised import LabelPropagation
lb = LabelPropagation()
lb.fit(x_train_3, y_train_3)
lb.predict(x_train_3)
lb.score(x_test_3, y_test_3)

# k-nearest-neighbours classifier with default parameters.
from sklearn.neighbors import KNeighborsClassifier
knng = KNeighborsClassifier()
knng.fit(x_train_3, y_train_3)
knng.predict(x_train_3)
knng.score(x_test_3, y_test_3)
call_times.append(str(i) + '点通话次数') df = df[['次均通话时长', '在网时长(单位:秒)', '当月活跃基站个数', '交往圈数量', 'avg']] #df= df.drop(call_times,axis = 1 ) #df = df[['月累计短信发送数量','月累计流量使用情况(单位:字节)']] print('start pca...') pca = PCA(n_components=2) reduced_X = pca.fit_transform(df) reduced_X_1 = reduced_X[:, 0] reduced_X_2 = reduced_X[:, 1] conponent = pd.DataFrame({'p1': reduced_X_1, 'p2': reduced_X_2, 'label': y}) X = conponent[['p1', 'p2']] y = conponent['label'] sss = StratifiedShuffleSplit(n_splits=3, train_size=0.0025, test_size=0.0025, random_state=0) sss.get_n_splits(X, y) #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) for train_index, test_index in sss.split(X, y): print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :] y_train, y_test = y.iloc[train_index], y.iloc[test_index] label_prop_model = LabelPropagation(max_iter=5000) label_prop_model.fit(X_train, y_train) print(label_prop_model.score(X_test, y_test))
def label_propagation(x_train, y_train, x_test, y_test):
    """Fit a default LabelPropagation model and return its formatted test score.

    The model is fitted on ``(x_train, y_train)`` and scored on
    ``(x_test, y_test)``.

    Returns
    -------
    str
        The score rendered with two decimal places.
    """
    from sklearn.semi_supervised import LabelPropagation

    classifier = LabelPropagation()
    classifier.fit(x_train, y_train)
    accuracy = classifier.score(x_test, y_test)
    return format(accuracy, ".2f")
def label_propagation(self, kernel='rbf', gamma=20, n_neighbors=7, max_iter=30, tol=1e-3, n_jobs=1):
    """
    Fit a LabelPropagation classifier on ``self.X`` / ``self.y`` and score it
    on ``self.X_t`` / ``self.y_t``.

    All arguments are forwarded unchanged to
    ``LabelPropagation``.

    Parameters
    ----------
    kernel : {'knn', 'rbf'} or callable
        Kernel identifier, or a function taking two arrays of shape
        [n_samples, n_features] and returning a [n_samples, n_samples]
        weight matrix.
    gamma : float
        Parameter for the rbf kernel.
    n_neighbors : integer > 0
        Parameter for the knn kernel.
    max_iter : integer
        Maximum number of iterations allowed.
    tol : float
        Convergence tolerance: threshold to consider the system at steady
        state.
    n_jobs : int or None
        Number of parallel jobs to run; ``-1`` means using all processors.

    Returns
    -------
    score : the score of the fitted model on the test data

    Example
    --------
    >>> labeled_path = "../data/labeled.csv"
    >>> unlabeled_path = "../data/unlabeled.csv"
    >>> mtl = MultiTaskLearner(labeled_path, unlabeled_path)
    >>> encoding = mtl.embed(word_length=5)
    >>> X, y, X_t, y_t = train_test_split(mtl.sequences, mtl.labels, test_size=0.33)
    >>> score = mtl.semi_supervised_learner(X, y, X_t, y_t, ssl="label_propagation")
    """
    settings = {
        'kernel': kernel,
        'gamma': gamma,
        'n_neighbors': n_neighbors,
        'max_iter': max_iter,
        'tol': tol,
        'n_jobs': n_jobs,
    }
    classifier = LabelPropagation(**settings)
    classifier.fit(self.X, self.y)
    test_score = classifier.score(self.X_t, self.y_t)
    return test_score
words = f.read().split("\n") while i < len(words): j = 0 while j < len(newsgroups_train.data): newsgroups_train.data[j] = re.sub(words[i], '', newsgroups_train.data[j]) j += 1 i += 1 f.close() print([newsgroups_train.data[0]]) # feature extraction vectorizer = TfidfVectorizer(stop_words=get_stopwords()) vectors = vectorizer.fit_transform(newsgroups_train.data) clf = LabelPropagation(kernel='rbf', gamma=0.89).fit(vectors.todense(), newsgroups_train.target) test_vec = vectorizer.transform(newsgroups_test.data) pred = clf.predict(test_vec) print(clf.score(test_vec, newsgroups_test.target)) print('f1 score: ', metrics.f1_score(newsgroups_test.target, pred, average='macro')) remove_regex_words() vectors = vectorizer.fit_transform(newsgroups_train.data) clf = LabelPropagation(kernel='rbf', gamma=0.89).fit(vectors.todense(), newsgroups_train.target) test_vec = vectorizer.transform(newsgroups_test.data) pred = clf.predict(test_vec) print(clf.score(test_vec, newsgroups_test.target)) print('f1 score: ', metrics.f1_score(newsgroups_test.target, pred, average='macro'))
def _score_or_empty(make_model, x_merged, y_merged, z, z_y, failure_message):
    """Build, fit and score a transductive model; return [] if any step fails.

    ``make_model`` is a zero-argument callable so that construction errors
    are handled the same way as fitting or scoring errors.  On failure
    ``failure_message`` is printed and an empty list is returned, which is
    the sentinel callers of ``run_methods`` already receive.
    """
    try:
        model = make_model()
        model.fit(x_merged, y_merged)
        return model.score(z, z_y)
    except Exception:
        # Narrower than the former bare `except:` so that KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        print(failure_message)
        return []


def run_methods(x_c, y, x_e, z_c, z_y, z_e):
    """Compute the test accuracy of every baseline and proposed method.

    The labelled features are the column-wise concatenation of ``x_c`` and
    ``x_e`` with labels ``y``; the unlabelled/test features are the
    concatenation of ``z_c`` and ``z_e`` with ground truth ``z_y``.
    Transductive methods are fitted on labelled and unlabelled data together,
    with the unlabelled samples marked by label -1.

    Returns a tuple of eleven accuracies, in this order: linear logistic
    regression, linear TSVM, RBF TSVM, RBF label propagation, RBF label
    spreading, k-NN label propagation, k-NN label spreading, semi-generative
    model on labelled data, soft-label EM, hard-label EM, conditional
    propagation.  A label propagation/spreading entry is ``[]`` when that
    method raised an exception.
    """
    x = np.concatenate((x_c, x_e), axis=1)
    z = np.concatenate((z_c, z_e), axis=1)

    # Baseline: Linear Logistic Regression
    lin_lr = LogisticRegression(random_state=0, solver='liblinear').fit(x, y.ravel())
    acc_lin_lr = lin_lr.score(z, z_y)

    # TRANSDUCTIVE APPROACHES
    # merge labelled and unlabelled data (with label -1) for transductive methods
    x_merged = np.concatenate((x, z))
    unlabelled_marks = -1 * np.ones((z.shape[0], 1))
    y_merged = np.concatenate((y, unlabelled_marks)).ravel().astype(int)

    # Baseline: Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    lin_tsvm = SKTSVM(kernel='linear')
    lin_tsvm.fit(x_merged, y_merged)
    acc_lin_tsvm = lin_tsvm.score(z, z_y)

    # Baseline: Non-Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    rbf_tsvm = SKTSVM(kernel='RBF')
    rbf_tsvm.fit(x_merged, y_merged)
    acc_rbf_tsvm = rbf_tsvm.score(z, z_y)

    # Baseline: Label Propagation RBF weights
    acc_rbf_label_prop = _score_or_empty(
        lambda: LabelPropagation(kernel='rbf'),
        x_merged, y_merged, z, z_y, 'rbf label prop did not work')

    # Baseline: Label Spreading with RBF weights
    acc_rbf_label_spread = _score_or_empty(
        lambda: LabelSpreading(kernel='rbf'),
        x_merged, y_merged, z, z_y, 'rbf label spread did not work ')

    # THE K-NN VERSIONS ARE UNSTABLE UNLESS USING LARGE K
    # Baseline: Label Propagation with k-NN weights
    acc_knn_label_prop = _score_or_empty(
        lambda: LabelPropagation(kernel='knn', n_neighbors=11),
        x_merged, y_merged, z, z_y, 'knn label prop did not work')

    # Baseline: Label Spreading with k-NN weights
    acc_knn_label_spread = _score_or_empty(
        lambda: LabelSpreading(kernel='knn', n_neighbors=11),
        x_merged, y_merged, z, z_y, 'knn label spread did not work')

    # Generative Models
    # Semi-generative model on labelled data only
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e, converged=True)
    soft_label_semigen = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_semigen = soft_label_semigen > 0.5
    acc_semigen_labelled = np.mean(hard_label_semigen == z_y)

    # EM with soft labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_soft_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_soft_EM = soft_label_soft_EM > 0.5
    acc_soft_EM = np.mean(hard_label_soft_EM == z_y)

    # EM with hard labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = hard_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_hard_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_hard_EM = soft_label_hard_EM > 0.5
    acc_hard_EM = np.mean(hard_label_hard_EM == z_y)

    # Conditional label prop
    acc_cond_prop = conditional_prop(x_c, y, x_e, z_c, z_y, z_e)

    return (acc_lin_lr, acc_lin_tsvm, acc_rbf_tsvm, acc_rbf_label_prop,
            acc_rbf_label_spread, acc_knn_label_prop, acc_knn_label_spread,
            acc_semigen_labelled, acc_soft_EM, acc_hard_EM, acc_cond_prop)