def main():
    """Train a Word2Vec model on reply text and evaluate LabelSpreading.

    Loads the "sheet_4_labeled" data set, trains and saves a Word2Vec model
    built from the reply lines, fits a LabelSpreading classifier on the
    ``interpretability`` target using both annotated and unannotated records,
    persists the classifier and prints its accuracy on a held-out part of the
    annotated records.
    """
    stop_words = fetch_default_stop_words()
    comments = fetch_data(ds_name="sheet_4_labeled", cut_all=False, stop_words=stop_words)

    # Records without an annotation get -1 for both targets.
    for _, record in comments.items():
        if record.integrity is None:
            record.integrity = -1
            record.interpretability = -1

    # Split the records by whether they carry an annotation.
    labeled, unlabeled = {}, {}
    for key, record in comments.items():
        bucket = unlabeled if record.integrity == -1 else labeled
        bucket[key] = record

    # Build the Word2Vec model from the reply text and save it.
    # (Alternative kept from the original notes: load a previously saved model
    # with gensim.models.KeyedVectors.load_word2vec_format instead of training.)
    line_sents = fetch_data(
        "sheet_4_labeled",
        cut_all=False,
        stop_words=stop_words,
        mode="reply_lines",
        remove_duplicates=False,
    )
    wv_model = Word2Vec(line_sents, size=400, window=5, sg=1, min_count=5)
    wv_model.wv.save_word2vec_format("../resources/wv_reply_text", binary=False)
    print("model is saved.")

    # Document vectors and (all -1) targets of the unannotated records.
    x, y = [], []
    for record in unlabeled.values():
        x.append(doc_vec(record.seg_reply, model=wv_model))
        y.append(record.interpretability)

    # Document vectors and targets of the annotated records.
    x_labeled, y_labeled = [], []
    for record in labeled.values():
        x_labeled.append(doc_vec(record.seg_reply, model=wv_model))
        y_labeled.append(record.interpretability)

    # Hold out 30% of the annotated records as the test set
    # (the original note mentions 30 out of 100 annotated samples).
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
        x_labeled, y_labeled, test_size=0.3
    )

    # The training set is the remaining annotated records plus all unannotated ones.
    x_train.extend(x)
    y_train.extend(y)

    # Train the label-spreading model. Alternatives noted in the original:
    # LabelPropagation(gamma=30), or LabelSpreading with
    # max_iter=100, kernel='rbf', gamma=0.1.
    clf = LabelSpreading()
    clf.fit(x_train, y_train)
    joblib.dump(clf, "../resources/label_spreading_interpretability_clf")

    accuracy = clf.score(x_test, y_test)
    print(f"Accuracy:{accuracy}")
def fit(self, X, y=None):
    """Create a fresh underlying model from the stored hyperparameters and fit it.

    :param X: training samples, forwarded unchanged to the underlying model
    :param y: optional targets; omitted from the underlying call when None
    :return: self
    """
    self._sklearn_model = SKLModel(**self._hyperparams)
    # Only pass y along when the caller actually supplied it.
    fit_args = (X,) if y is None else (X, y)
    self._sklearn_model.fit(*fit_args)
    return self
def test_LabelSpreading(*data):
    '''
    Fit LabelSpreading on partially masked labels and print the accuracy
    of the transduced labels on the masked samples.

    :param data: a tuple of (samples, sample labels, indices of the samples to treat as unlabeled)
    :return: None
    '''
    samples, targets, hidden_idx = data
    # Work on a copy: the untouched labels are needed for scoring below.
    masked_targets = np.copy(targets)
    masked_targets[hidden_idx] = -1

    model = LabelSpreading(max_iter=100, kernel='rbf', gamma=0.1)
    model.fit(samples, masked_targets)

    transduced = model.transduction_[hidden_idx]
    expected = targets[hidden_idx]
    accuracy = metrics.accuracy_score(expected, transduced)
    print("Accuracy:%f" % accuracy)
def test_LabelSpreading_rbf(*data):
    '''
    Plot how the accuracy of LabelSpreading with the rbf kernel changes
    with alpha and gamma.

    :param data: a tuple of (samples, sample labels, indices of the unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # must copy: y is still needed for scoring below
    y_train[unlabeled_indices] = -1  # unlabeled samples are marked with -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # LabelSpreading only accepts alpha strictly inside (0, 1); the previous
    # upper bound of 1 made the last sweep fail, so the grid stops at 0.99.
    alphas = np.linspace(0.01, 0.99, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    # One colour per alpha curve.
    colors = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    # Train one model per (alpha, gamma) pair and draw one curve per alpha.
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelSpreading(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            clf.fit(X, y_train)
            scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)

    # Figure decoration.
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelSpreading rbf kernel")
    plt.show()
def test_LabelSpreading_knn(*data):
    '''
    Plot how the accuracy of LabelSpreading with the knn kernel changes
    with alpha and n_neighbors.

    :param data: a tuple of (samples, sample labels, indices of the unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # must copy: y is still needed for scoring below
    y_train[unlabeled_indices] = -1  # unlabeled samples are marked with -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # LabelSpreading only accepts alpha strictly inside (0, 1); the previous
    # upper bound of 1 made the last sweep fail, so the grid stops at 0.99.
    alphas = np.linspace(0.01, 0.99, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    # One colour per alpha curve.
    colors = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    # Train one model per (alpha, K) pair and draw one curve per alpha.
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelSpreading(kernel='knn', max_iter=100, n_neighbors=K, alpha=alpha)
            clf.fit(X, y_train)
            scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)

    # Figure decoration.
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelSpreading knn kernel")
    plt.show()
def test_LabelSpreading(*data):
    '''
    Demonstrate the use of LabelSpreading: hide some labels, fit, and print
    the accuracy on the hidden samples.

    :param data: a tuple of (samples, sample labels, indices of the unlabeled samples)
    :return: None
    '''
    features, labels, unlabeled_idx = data

    # A copy is required because the original labels are used for scoring later.
    train_labels = np.copy(labels)
    # Unlabeled samples are marked with -1.
    train_labels[unlabeled_idx] = -1

    spreader = LabelSpreading(max_iter=100, kernel='rbf', gamma=0.1)
    spreader.fit(features, train_labels)

    # Compare the labels assigned during fitting with the true ones.
    predicted = spreader.transduction_[unlabeled_idx]
    truth = labels[unlabeled_idx]
    score = metrics.accuracy_score(truth, predicted)
    print("Accuracy:%f" % score)
def test_LabelSpreading_rbf(*data):
    '''
    test LabelSpreading with rbf kernel and different alpha, gamma;
    plots one score-vs-gamma curve per alpha

    :param data: data( have target), data_target, data( not have target)
    :return: None
    '''
    X, y, unlabeled_indices = data
    # Keep y intact for scoring; only the training copy gets the -1 marker.
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # alpha must lie strictly inside (0, 1) for LabelSpreading, so the grid
    # ends at 0.99 instead of 1 (which would make the final sweep raise).
    alphas = np.linspace(0.01, 0.99, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    # One colour per alpha curve.
    colors = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelSpreading(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            clf.fit(X, y_train)
            # Score only on the samples whose labels were hidden.
            scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)

    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelSpreading rbf kernel")
    plt.show()
def main():
    """Train and persist the integrity and interpretability classifiers.

    Loads the "sheet_4_labeled" data set and a saved Word2Vec model, then
    fits LabelPropagation on the ``integrity`` target and LabelSpreading on
    the ``interpretability`` target, each on annotated plus unannotated
    records, and dumps both classifiers with joblib.
    """
    stop_words = fetch_default_stop_words()
    comments = fetch_data(ds_name="sheet_4_labeled", cut_all=False, stop_words=stop_words)

    # Records without an annotation get -1 for both targets.
    for _, record in comments.items():
        if record.integrity is None:
            record.integrity = -1
            record.interpretability = -1

    # Split the records by whether they carry an annotation.
    labeled, unlabeled = {}, {}
    for key, record in comments.items():
        bucket = unlabeled if record.integrity == -1 else labeled
        bucket[key] = record

    # Load the Word2Vec model built from the reply text.
    wv_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=False)

    def vectorize(records, target):
        """Return (document vectors, target values) for every record in *records*."""
        vectors, targets = [], []
        for record in records.values():
            vectors.append(doc_vec(record.seg_reply, model=wv_model))
            targets.append(getattr(record, target))
        return vectors, targets

    def training_set(target):
        """Return annotated samples followed by unannotated ones for *target*."""
        x_unlabeled, y_unlabeled = vectorize(unlabeled, target)
        x_train, y_train = vectorize(labeled, target)
        # Mix annotated and unannotated data into one training set.
        x_train.extend(x_unlabeled)
        y_train.extend(y_unlabeled)
        return x_train, y_train

    # Model 1: label propagation on the integrity target.
    x_train, y_train = training_set("integrity")
    integrity_clf = LabelPropagation(gamma=30)
    integrity_clf.fit(x_train, y_train)
    joblib.dump(integrity_clf, integrity_clf_path)

    # Model 2: label spreading on the interpretability target.
    x_train, y_train = training_set("interpretability")
    interpretability_clf = LabelSpreading()
    interpretability_clf.fit(x_train, y_train)
    joblib.dump(interpretability_clf, interpretability_clf_path)
def test_LabelSpreading_knn(*data):
    '''
    test LabelSpreading with knn kernel, and different alpha , n_neighbors;
    plots one score-vs-k curve per alpha

    :param data: data( have target), data_target, data( not have target)
    :return: None
    '''
    X, y, unlabeled_indices = data
    # Keep y intact for scoring; only the training copy gets the -1 marker.
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # alpha must lie strictly inside (0, 1) for LabelSpreading, so the grid
    # ends at 0.99 instead of 1 (which would make the final sweep raise).
    alphas = np.linspace(0.01, 0.99, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    # One colour per alpha curve.
    colors = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelSpreading(kernel='knn', max_iter=100, n_neighbors=K, alpha=alpha)
            clf.fit(X, y_train)
            # Score only on the samples whose labels were hidden.
            scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)

    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelSpreading knn kernel")
    plt.show()
def _semi_supervised(self, label: str) -> List[Stats]:
    """Run the active-learning routine with LabelSpreading for every sampling strategy.

    For each entry of ``self.active_learning_strategies`` a new
    ``LabelSpreading`` learner and a new data split for *label* are created
    and passed on with ``semi_sup=True``.

    :param label: forwarded to the data split and to the active-learning routine
    :return: one result per sampling strategy, in strategy order
    """
    results = []
    for sampling_strategy in self.active_learning_strategies:
        # Fresh learner and split per strategy, created in this order.
        learner = LabelSpreading()
        data_split = self._active_learning_data_split(label)
        outcome = self._active_learning_for_learner_strategy(
            label, learner, sampling_strategy, data_split, semi_sup=True
        )
        results.append(outcome)
    return results
class LabelSpreadingImpl:
    """Thin wrapper that stores hyperparameters and delegates to a wrapped ``Op`` model."""

    def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2,
                 max_iter=30, tol=0.001, n_jobs=None):
        """Record the hyperparameters and build the wrapped model from them."""
        self._hyperparams = dict(
            kernel=kernel,
            gamma=gamma,
            n_neighbors=n_neighbors,
            alpha=alpha,
            max_iter=max_iter,
            tol=tol,
            n_jobs=n_jobs,
        )
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model, passing ``y`` only when it is not None; return self."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's ``predict`` result for ``X``."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba`` result for ``X``."""
        return self._wrapped_model.predict_proba(X)
def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2,
             max_iter=30, tol=0.001, n_jobs=None):
    """Record the hyperparameters and build the wrapped ``Op`` model from them."""
    hyperparams = dict(
        kernel=kernel,
        gamma=gamma,
        n_neighbors=n_neighbors,
        alpha=alpha,
        max_iter=max_iter,
        tol=tol,
        n_jobs=n_jobs,
    )
    self._hyperparams = hyperparams
    # The same mapping is expanded into keyword arguments for the wrapped model.
    self._wrapped_model = Op(**self._hyperparams)
warnings.filterwarnings('ignore') classifiers = [ AdaBoostClassifier(), BaggingClassifier(), BernoulliNB(), CalibratedClassifierCV(), DecisionTreeClassifier(), ExtraTreeClassifier(), ExtraTreesClassifier(), GaussianNB(), GaussianProcessClassifier(), GradientBoostingClassifier(), KNeighborsClassifier(), LabelPropagation(), LabelSpreading(), LinearDiscriminantAnalysis(), LogisticRegression(), LogisticRegressionCV(), MLPClassifier(), NuSVC(probability=True), QuadraticDiscriminantAnalysis(), RandomForestClassifier(), SGDClassifier(loss='log'), SVC(probability=True), XGBClassifier() ] names = [ 'AdaBoostClassifier', 'BaggingClassifier', 'BernoulliNB', 'CalibratedClassifierCV', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
'GraphLassoCV':GraphLassoCV(), 'HuberRegressor':HuberRegressor(), 'Imputer':Imputer(), 'IncrementalPCA':IncrementalPCA(), 'IsolationForest':IsolationForest(), 'Isomap':Isomap(), 'KMeans':KMeans(), 'KNeighborsClassifier':KNeighborsClassifier(), 'KNeighborsRegressor':KNeighborsRegressor(), 'KernelCenterer':KernelCenterer(), 'KernelDensity':KernelDensity(), 'KernelPCA':KernelPCA(), 'KernelRidge':KernelRidge(), 'LSHForest':LSHForest(), 'LabelPropagation':LabelPropagation(), 'LabelSpreading':LabelSpreading(), 'Lars':Lars(), 'LarsCV':LarsCV(), 'Lasso':Lasso(), 'LassoCV':LassoCV(), 'LassoLars':LassoLars(), 'LassoLarsCV':LassoLarsCV(), 'LassoLarsIC':LassoLarsIC(), 'LatentDirichletAllocation':LatentDirichletAllocation(), 'LedoitWolf':LedoitWolf(), 'LinearDiscriminantAnalysis':LinearDiscriminantAnalysis(), 'LinearRegression':LinearRegression(), 'LinearSVC':LinearSVC(), 'LinearSVR':LinearSVR(), 'LocallyLinearEmbedding':LocallyLinearEmbedding(), 'LogisticRegression':LogisticRegression(),