def main():
    """Train and score a LabelSpreading classifier for reply interpretability.

    Steps, in order: load the ``sheet_4_labeled`` data set, mark samples
    without annotation with -1, train and save a Word2Vec model on the reply
    text, hold out 30% of the annotated samples for testing, fit
    LabelSpreading on the remaining annotated samples plus all unannotated
    ones, persist the classifier and print its accuracy on the held-out part.
    """
    stop_words = fetch_default_stop_words()
    comments = fetch_data(ds_name="sheet_4_labeled", cut_all=False, stop_words=stop_words)

    # Samples that carry no annotation get -1 on both label fields.
    for comment in comments.values():
        if comment.integrity is None:
            comment.integrity = -1
            comment.interpretability = -1

    # Partition the samples by whether they are annotated.
    labeled = {}
    unlabeled = {}
    for key, comment in comments.items():
        if comment.integrity != -1:
            labeled[key] = comment
        else:
            unlabeled[key] = comment

    # Build a Word2Vec model from the reply text and save it to disk.
    # NOTE(review): the `size=` keyword is spelled `vector_size=` in newer
    # gensim releases -- confirm against the pinned gensim version.
    reply_sentences = fetch_data(
        "sheet_4_labeled",
        cut_all=False,
        stop_words=stop_words,
        mode="reply_lines",
        remove_duplicates=False,
    )
    wv_model = Word2Vec(reply_sentences, size=400, window=5, sg=1, min_count=5)
    wv_model.wv.save_word2vec_format("../resources/wv_reply_text", binary=False)
    print("model is saved.")

    # Alternative noted by the original author: instead of retraining, load
    # the saved vectors with gensim.models.KeyedVectors.load_word2vec_format.

    def embed(comment):
        # Document vector of one comment's segmented reply.
        return doc_vec(comment.seg_reply, model=wv_model)

    # Features and (placeholder) targets of the unannotated samples.
    x_unlabeled = [embed(c) for c in unlabeled.values()]
    y_unlabeled = [c.interpretability for c in unlabeled.values()]

    # Features and targets of the annotated samples.
    x_labeled = [embed(c) for c in labeled.values()]
    y_labeled = [c.interpretability for c in labeled.values()]

    # Hold out 30% of the annotated samples as the test set (the original
    # note mentions 30 out of 100 annotated samples).
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
        x_labeled, y_labeled, test_size=0.3
    )

    # The training set is the remaining annotated samples plus every
    # unannotated sample.
    x_train += x_unlabeled
    y_train += y_unlabeled

    # Train the label spreading model. Alternatives noted by the original
    # author: LabelPropagation(gamma=30), or LabelSpreading with
    # max_iter=100, kernel='rbf', gamma=0.1.
    clf = LabelSpreading()
    clf.fit(x_train, y_train)
    joblib.dump(clf, "../resources/label_spreading_interpretability_clf")

    accuracy = clf.score(x_test, y_test)
    print(f"Accuracy:{accuracy}")
 def fit(self, X, y=None):
     """Create a fresh ``SKLModel`` from the stored hyperparameters and fit it.

     :param X: training data, forwarded unchanged to the model's ``fit``
     :param y: optional targets; when None the model is fitted on ``X`` alone
     :return: self
     """
     model = SKLModel(**self._hyperparams)
     self._sklearn_model = model
     if y is None:
         model.fit(X)
     else:
         model.fit(X, y)
     return self
Exemple #3
0
def test_LabelSpreading(*data):
    '''
    Fit LabelSpreading on partially labeled data and print its accuracy.

    :param data: in order: the samples, the sample labels, and the indices of
        the samples to treat as unlabeled
    :return: None
    '''
    samples, targets, hidden_idx = data

    # Train on a copy so the true labels stay available for scoring.
    masked_targets = np.copy(targets)
    masked_targets[hidden_idx] = -1

    model = LabelSpreading(max_iter=100, kernel='rbf', gamma=0.1)
    model.fit(samples, masked_targets)

    # Compare the labels assigned during fitting with the true ones.
    accuracy = metrics.accuracy_score(targets[hidden_idx],
                                      model.transduction_[hidden_idx])
    print("Accuracy:%f" % accuracy)
Exemple #4
0
def test_LabelSpreading_rbf(*data):
    '''
    Plot how LabelSpreading with the rbf kernel scores as alpha and gamma vary.

    :param data: a tuple holding, in order: the samples, the sample labels,
        and the indices of the samples to treat as unlabeled
    :return: None
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # copy is required: y itself is used for scoring below
    y_train[unlabeled_indices] = -1  # unlabeled samples are marked with -1

    # The evaluation subset is the same for every model, so select it once.
    X_eval = X[unlabeled_indices]
    y_eval = y[unlabeled_indices]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # LabelSpreading requires alpha strictly inside (0, 1); a grid ending at
    # exactly 1 makes the last fit fail, so stop at 0.99.
    alphas = np.linspace(0.01, 0.99, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = (
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (0.5, 0.5, 0),
        (0, 0.5, 0.5),
        (0.5, 0, 0.5),
        (0.4, 0.6, 0),
        (0.6, 0.4, 0),
        (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )  # one color per alpha curve
    # Train and plot: one curve per alpha, one point per gamma.
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelSpreading(max_iter=100,
                                 gamma=gamma,
                                 alpha=alpha,
                                 kernel='rbf')
            clf.fit(X, y_train)
            scores.append(clf.score(X_eval, y_eval))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)

    # Configure the figure.
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelSpreading rbf kernel")
    plt.show()
Exemple #5
0
def test_LabelSpreading_knn(*data):
    '''
    Plot how LabelSpreading with the knn kernel scores as alpha and
    n_neighbors vary.

    :param data: a tuple holding, in order: the samples, the sample labels,
        and the indices of the samples to treat as unlabeled
    :return: None
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # copy is required: y itself is used for scoring below
    y_train[unlabeled_indices] = -1  # unlabeled samples are marked with -1

    # The evaluation subset is the same for every model, so select it once.
    X_eval = X[unlabeled_indices]
    y_eval = y[unlabeled_indices]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # LabelSpreading requires alpha strictly inside (0, 1); a grid ending at
    # exactly 1 makes the last fit fail, so stop at 0.99.
    alphas = np.linspace(0.01, 0.99, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    colors = (
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (0.5, 0.5, 0),
        (0, 0.5, 0.5),
        (0.5, 0, 0.5),
        (0.4, 0.6, 0),
        (0.6, 0.4, 0),
        (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )  # one color per alpha curve
    # Train and plot: one curve per alpha, one point per neighbor count.
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelSpreading(kernel='knn',
                                 max_iter=100,
                                 n_neighbors=K,
                                 alpha=alpha)
            clf.fit(X, y_train)
            scores.append(clf.score(X_eval, y_eval))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)

    # Configure the figure.
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelSpreading knn kernel")
    plt.show()
Exemple #6
0
def test_LabelSpreading(*data):
    '''
    Demonstrate basic LabelSpreading usage and print the resulting accuracy.

    :param data: a tuple holding, in order: the samples, the sample labels,
        and the indices of the samples to treat as unlabeled
    :return: None
    '''
    features, labels, unlabeled_idx = data

    # Hide the chosen labels in a copy; the originals are needed for scoring.
    train_labels = np.copy(labels)
    train_labels[unlabeled_idx] = -1

    spreader = LabelSpreading(max_iter=100, kernel='rbf', gamma=0.1)
    spreader.fit(features, train_labels)

    # Accuracy of the labels assigned to the hidden samples during fitting.
    assigned = spreader.transduction_[unlabeled_idx]
    expected = labels[unlabeled_idx]
    score = metrics.accuracy_score(expected, assigned)
    print("Accuracy:%f" % score)
Exemple #7
0
def test_LabelSpreading_rbf(*data):
    '''
    Plot how LabelSpreading with the rbf kernel scores for different alpha
    and gamma values.

    :param data: in order: the samples, the sample labels, and the indices of
        the samples to treat as unlabeled
    :return: None
    '''
    X, y, unlabeled_indices = data
    # Train on a copy so the true labels in y remain available for scoring.
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    # The evaluation subset is the same for every model, so select it once.
    X_eval = X[unlabeled_indices]
    y_eval = y[unlabeled_indices]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # LabelSpreading requires alpha strictly inside (0, 1); a grid ending at
    # exactly 1 makes the last fit fail, so stop at 0.99.
    alphas = np.linspace(0.01, 0.99, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    # One color per alpha curve.
    colors = (
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (0.5, 0.5, 0),
        (0, 0.5, 0.5),
        (0.5, 0, 0.5),
        (0.4, 0.6, 0),
        (0.6, 0.4, 0),
        (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    # One curve per alpha, one point per gamma.
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelSpreading(max_iter=100,
                                 gamma=gamma,
                                 alpha=alpha,
                                 kernel='rbf')
            clf.fit(X, y_train)
            scores.append(clf.score(X_eval, y_eval))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)

    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelSpreading rbf kernel")
    plt.show()
def main():
    """Train and persist the integrity and interpretability classifiers.

    Loads the ``sheet_4_labeled`` data set and the saved reply-text word
    vectors, then fits LabelPropagation on the ``integrity`` labels and
    LabelSpreading on the ``interpretability`` labels. Each model is trained
    on the annotated samples followed by the unannotated ones and is dumped
    to its configured path.
    """
    stop_words = fetch_default_stop_words()
    comments = fetch_data(ds_name="sheet_4_labeled", cut_all=False, stop_words=stop_words)

    # Samples that carry no annotation get -1 on both label fields.
    for comment in comments.values():
        if comment.integrity is None:
            comment.integrity = -1
            comment.interpretability = -1

    # Partition the samples by whether they are annotated.
    labeled = {}
    unlabeled = {}
    for key, comment in comments.items():
        if comment.integrity != -1:
            labeled[key] = comment
        else:
            unlabeled[key] = comment

    # Load the word-vector model built from the reply text.
    wv_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=False)

    def build_training_set(label_attr):
        # Return (features, targets): annotated samples first, then the
        # unannotated ones, with targets read from attribute `label_attr`.
        x_unlabeled = [doc_vec(c.seg_reply, model=wv_model) for c in unlabeled.values()]
        y_unlabeled = [getattr(c, label_attr) for c in unlabeled.values()]
        x_labeled = [doc_vec(c.seg_reply, model=wv_model) for c in labeled.values()]
        y_labeled = [getattr(c, label_attr) for c in labeled.values()]
        return x_labeled + x_unlabeled, y_labeled + y_unlabeled

    # Model 1: label propagation on the integrity labels.
    x_train, y_train = build_training_set("integrity")
    integrity_clf = LabelPropagation(gamma=30)
    integrity_clf.fit(x_train, y_train)
    joblib.dump(integrity_clf, integrity_clf_path)

    # Model 2: label spreading on the interpretability labels.
    x_train, y_train = build_training_set("interpretability")
    interpretability_clf = LabelSpreading()
    interpretability_clf.fit(x_train, y_train)
    joblib.dump(interpretability_clf, interpretability_clf_path)
Exemple #9
0
def test_LabelSpreading_knn(*data):
    '''
    Plot how LabelSpreading with the knn kernel scores for different alpha
    and n_neighbors values.

    :param data: in order: the samples, the sample labels, and the indices of
        the samples to treat as unlabeled
    :return: None
    '''
    X, y, unlabeled_indices = data
    # Train on a copy so the true labels in y remain available for scoring.
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    # The evaluation subset is the same for every model, so select it once.
    X_eval = X[unlabeled_indices]
    y_eval = y[unlabeled_indices]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # LabelSpreading requires alpha strictly inside (0, 1); a grid ending at
    # exactly 1 makes the last fit fail, so stop at 0.99.
    alphas = np.linspace(0.01, 0.99, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    # One color per alpha curve.
    colors = (
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (0.5, 0.5, 0),
        (0, 0.5, 0.5),
        (0.5, 0, 0.5),
        (0.4, 0.6, 0),
        (0.6, 0.4, 0),
        (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    # One curve per alpha, one point per neighbor count.
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelSpreading(kernel='knn',
                                 max_iter=100,
                                 n_neighbors=K,
                                 alpha=alpha)
            clf.fit(X, y_train)
            scores.append(clf.score(X_eval, y_eval))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)

    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelSpreading knn kernel")
    plt.show()
Exemple #10
0
 def _semi_supervised(self, label: str) -> List[Stats]:
     """Run the semi-supervised active-learning pass for every strategy.

     For each entry of ``self.active_learning_strategies`` a fresh
     ``LabelSpreading`` learner and a fresh data split for ``label`` are
     handed to ``_active_learning_for_learner_strategy`` with
     ``semi_sup=True``; the results are returned in strategy order.
     """
     results = []
     for sampling_strategy in self.active_learning_strategies:
         learner = LabelSpreading()
         data_split = self._active_learning_data_split(label)
         stats = self._active_learning_for_learner_strategy(
             label,
             learner,
             sampling_strategy,
             data_split,
             semi_sup=True)
         results.append(stats)
     return results
Exemple #11
0
class LabelSpreadingImpl():
    """Adapter that stores hyperparameters and delegates to a wrapped ``Op``.

    The constructor arguments are collected into ``self._hyperparams`` and
    passed as keyword arguments to ``Op``; ``fit``, ``predict`` and
    ``predict_proba`` forward to that wrapped model.
    """

    def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2,
                 max_iter=30, tol=0.001, n_jobs=None):
        """Record the hyperparameters and build the wrapped model from them."""
        self._hyperparams = dict(
            kernel=kernel,
            gamma=gamma,
            n_neighbors=n_neighbors,
            alpha=alpha,
            max_iter=max_iter,
            tol=tol,
            n_jobs=n_jobs,
        )
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is only forwarded when it is not None.

        :return: self
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's ``predict`` result for ``X``."""
        wrapped = self._wrapped_model
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba`` result for ``X``."""
        wrapped = self._wrapped_model
        return wrapped.predict_proba(X)
Exemple #12
0
 def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2,
              max_iter=30, tol=0.001, n_jobs=None):
     """Record the hyperparameters and build the wrapped ``Op`` from them.

     Every argument is stored under its own name in ``self._hyperparams``,
     which is then passed as keyword arguments to ``Op``.
     """
     self._hyperparams = dict(
         kernel=kernel,
         gamma=gamma,
         n_neighbors=n_neighbors,
         alpha=alpha,
         max_iter=max_iter,
         tol=tol,
         n_jobs=n_jobs,
     )
     self._wrapped_model = Op(**self._hyperparams)
Exemple #13
0
# Suppress every warning raised from here on.
warnings.filterwarnings('ignore')

# Classifier instances to try, mostly with default settings; the ones that
# take arguments are noted inline.
# NOTE(review): the `names` list that follows appears to mirror this list
# entry by entry -- keep the order in sync; confirm against its full contents.
classifiers = [
    AdaBoostClassifier(),
    BaggingClassifier(),
    BernoulliNB(),
    CalibratedClassifierCV(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    ExtraTreesClassifier(),
    GaussianNB(),
    GaussianProcessClassifier(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    LabelPropagation(),
    LabelSpreading(),
    LinearDiscriminantAnalysis(),
    LogisticRegression(),
    LogisticRegressionCV(),
    MLPClassifier(),
    # probability=True is set explicitly here (and on SVC below).
    NuSVC(probability=True),
    QuadraticDiscriminantAnalysis(),
    RandomForestClassifier(),
    # NOTE(review): newer scikit-learn releases spell this loss 'log_loss' --
    # confirm against the pinned scikit-learn version.
    SGDClassifier(loss='log'),
    SVC(probability=True),
    XGBClassifier()
]

names = [
    'AdaBoostClassifier', 'BaggingClassifier', 'BernoulliNB',
    'CalibratedClassifierCV', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
Exemple #14
0
			'GraphLassoCV':GraphLassoCV(),
			'HuberRegressor':HuberRegressor(),
			'Imputer':Imputer(),
			'IncrementalPCA':IncrementalPCA(),
			'IsolationForest':IsolationForest(),
			'Isomap':Isomap(),
			'KMeans':KMeans(),
			'KNeighborsClassifier':KNeighborsClassifier(),
			'KNeighborsRegressor':KNeighborsRegressor(),
			'KernelCenterer':KernelCenterer(),
			'KernelDensity':KernelDensity(),
			'KernelPCA':KernelPCA(),
			'KernelRidge':KernelRidge(),
			'LSHForest':LSHForest(),
			'LabelPropagation':LabelPropagation(),
			'LabelSpreading':LabelSpreading(),
			'Lars':Lars(),
			'LarsCV':LarsCV(),
			'Lasso':Lasso(),
			'LassoCV':LassoCV(),
			'LassoLars':LassoLars(),
			'LassoLarsCV':LassoLarsCV(),
			'LassoLarsIC':LassoLarsIC(),
			'LatentDirichletAllocation':LatentDirichletAllocation(),
			'LedoitWolf':LedoitWolf(),
			'LinearDiscriminantAnalysis':LinearDiscriminantAnalysis(),
			'LinearRegression':LinearRegression(),
			'LinearSVC':LinearSVC(),
			'LinearSVR':LinearSVR(),
			'LocallyLinearEmbedding':LocallyLinearEmbedding(),
			'LogisticRegression':LogisticRegression(),