Example #1
def test_LabelPropagation_rbf(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))  # color palette: one color per curve
    # train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelPropagation(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # configure the plot
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc='best')
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
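Several of the test helpers in this collection take data as a tuple (X, y, unlabeled_indices). A minimal driver sketch, assuming scikit-learn's digits dataset (load_data itself is illustrative, not part of the original example):

import numpy as np
from sklearn import datasets

def load_data():
    # Shuffle the digits dataset and treat the last 90% of rows as unlabeled.
    digits = datasets.load_digits()
    rng = np.random.RandomState(0)
    indices = np.arange(len(digits.data))
    rng.shuffle(indices)
    X, y = digits.data[indices], digits.target[indices]
    n_labeled = len(y) // 10
    unlabeled_indices = np.arange(len(y))[n_labeled:]
    return X, y, unlabeled_indices

# e.g. test_LabelPropagation_rbf(*load_data())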
Example #2
def load_all_data():
    # Read and partition the matrix
    data = pd.read_feather('../feature_stage_data_all.ftr')
    x = data[data.columns[3:]]
    y = data['stage']
    o = data.observation
    x = x.values
    x = normalize(x)
    y = y.values
    x_va = x[4977:4977+3000]
    y_va = y[4977:4977+3000]
    x = np.concatenate((x[:4977],x[4977+3000:]))
    y = np.concatenate((y[:4977],y[4977+3000:]))
    
    
    nnl = lambda a: np.invert(np.isnan(a))
    nul = lambda a: np.isnan(a)
    x_obs = x[nnl(y)]
    y_obs = y[nnl(y)]
    
    # apply Label Spreading
    x_nuls = x[nul(y)]
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs, y_obs)
    x = np.concatenate([x_obs, x_nuls], axis=0)
    y = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)
    
    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size = 0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
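The fit-then-predict imputation above can be collapsed into one pass: fit on every row with the missing labels set to -1 and read the imputed labels from transduction_. A sketch under the same x/y, assuming LabelPropagation's hard clamping preserves the observed labels:

y_full = np.where(np.isnan(y), -1, y)
lp = LabelPropagation(kernel='knn')
lp.fit(x, y_full)
y_imputed = lp.transduction_  # one label per row; observed labels are kept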
Example #3
def test_LabelPropagation_rbf(*data):
    '''
    Test how LabelPropagation's rbf-kernel performance varies with gamma
    '''
    X, y, unlabeled_indices = data
    # must copy: y is needed later
    y_train = np.copy(y)
    # unlabeled samples get label -1
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    gammas = np.logspace(-2, 2, num=50)

    scores = []
    for gamma in gammas:
        clf = LabelPropagation(max_iter=100, gamma=gamma, kernel='rbf')
        clf.fit(X, y_train)
        scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
    ax.plot(gammas, scores)

    ### configure the plot
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
Example #4
def semiLabelPropagation(feature_extractor, generator, val_generator, kernel,
                         neighbors, gamma):
    semi = LabelPropagation(kernel=kernel,
                            n_neighbors=neighbors,
                            gamma=gamma,
                            alpha=None,
                            tol=0.001,
                            max_iter=1000000)

    features = feature_extractor.predict_generator(generator,
                                                   steps=generator.samples /
                                                   generator.batch_size,
                                                   verbose=1)

    classes = generator.classes

    for i in range(0, generator.samples):
        if (generator.filenames[i][0] == 'N'):
            classes[i] = -1

    semi.fit(features, classes)

    val_features = feature_extractor.predict_generator(
        val_generator,
        steps=val_generator.samples / val_generator.batch_size,
        verbose=1)
    predicted_classes = semi.predict(val_features)

    return predicted_classes
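Note that alpha was deprecated for LabelPropagation in scikit-learn 0.19 and removed in 0.21, so alpha=None fails on recent releases. A version-tolerant construction sketch (the inspect guard is an illustration, not from the original):

import inspect

kwargs = dict(kernel=kernel, n_neighbors=neighbors, gamma=gamma,
              tol=0.001, max_iter=1000000)
if 'alpha' in inspect.signature(LabelPropagation.__init__).parameters:
    kwargs['alpha'] = None  # only pass alpha where the parameter still exists
semi = LabelPropagation(**kwargs)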
Example #5
    def testing_predictions(self,
                            test_data,
                            model,
                            num_pcs,
                            gamma=False,
                            max_iter=1000000,
                            mean=False):

        pca_data = self.principal_components(test_data, self.pca, num_pcs)
        if not mean:
            return np.array([p[1] for p in model.predict_proba(pca_data)])

        train_pca_data = self.principal_components(self.X, self.pca, num_pcs)

        predicted_probs = None
        for seed in self.seeds:
            np.random.seed(seed)

            model = LabelPropagation(kernel='rbf',
                                     gamma=gamma,
                                     max_iter=max_iter)
            model.fit(train_pca_data, self.Y)

            predicted_prob = np.array(
                [p[1] for p in model.predict_proba(pca_data)])
            if predicted_probs is None:
                predicted_probs = predicted_prob
            else:
                predicted_probs = np.vstack((predicted_probs, predicted_prob))

        # get the mean across all seeded runs:
        mean_probs = np.mean(predicted_probs, axis=0)
        return mean_probs
Example #6
def test_LabelPropagation_knn(*data):
    '''
    Test how LabelPropagation's knn-kernel performance varies with n_neighbors
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # must copy: y is needed later
    y_train[unlabeled_indices] = -1  # unlabeled samples get label -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]

    scores = []
    for K in Ks:
        clf = LabelPropagation(max_iter=100, n_neighbors=K, kernel='knn')
        clf.fit(X, y_train)
        scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
    ax.plot(Ks, scores)

    ### configure the plot
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation knn kernel")
    plt.show()
Example #7
def test_LabelPropagation(*data):
    '''
    Demonstrate basic LabelPropagation usage
    '''
    X, y, unlabeled_indices, XPredict, yTrue = data
    #print("get ytrue")
    #print(yTrue)
    # must copy: y is needed later
    y_train = np.copy(y)
    # unlabeled samples get label -1
    y_train[unlabeled_indices] = -1
    print(y_train)
    #clf = LabelPropagation(max_iter=1000, kernel='rbf', gamma=0.1)
    clf = LabelPropagation(max_iter=5, kernel='knn', n_neighbors=3, tol=1e-5)
    #clf = LabelPropagation.LabelSpreading(gamma = 0.25, max_iter = 20)
    clf.fit(X, y_train)
    ### compute prediction accuracy
    # predicted labels
    predicted_labels = clf.predict(XPredict)
    print(XPredict)
    #predicted_labels = clf.transduction_[unlabeled_indices]
    # true labels
    #yTrue
    #true_labels = y[unlabeled_indices]
    print("Accuracy:%f" % metrics.accuracy_score(yTrue, predicted_labels))
Example #8
def lb_prop_classify(network, labels):
	kf = StratifiedKFold(n_splits=10)
	scores = []
	cms = []

	for train_index, test_index in kf.split(network, labels):
		first_train_index, last_train_index = min(train_index), max(train_index)

		train_dataset = network[first_train_index:last_train_index]
		train_labels = labels[first_train_index:last_train_index]

		test_dataset = np.delete(network, np.s_[first_train_index:last_train_index], 0)
		test_labels = np.delete(labels, np.s_[first_train_index:last_train_index], 0)

		label_spreading_model = LabelPropagation()
		label_spreading_model.fit(train_dataset, train_labels)
		scores.append(label_spreading_model.score(test_dataset, test_labels))

		prediction = label_spreading_model.predict(test_dataset)
		cms.append(confusion_matrix(test_labels, prediction, labels=label_spreading_model.classes_))

	print('label propagation mean {}'.format(np.average(scores)))
	print('label propagation standard deviation {}'.format(np.std(scores)))
	print('label propagation confusion matrix')
	print(get_percentile_cm(get_average_cm(cms)))
	print('\n')

	return scores
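The manual fold loop above approximates each fold with a contiguous index range; cross_val_score uses StratifiedKFold's folds exactly. A shorter equivalent sketch over the same network/labels arrays:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(LabelPropagation(), network, labels,
                         cv=StratifiedKFold(n_splits=10))
print('label propagation mean {}'.format(scores.mean()))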
Example #9
def sklearn_lp(X, y,
            output=None,
            kernel='knn', 
            gamma=None,
            n_neighbors=10, 
            alpha=1, 
            max_iter=1000, 
            tol=0.00001):

    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=3)
    label_prop_model = LabelPropagation(kernel=kernel, 
                                        gamma=gamma, 
                                        n_neighbors=n_neighbors, 
                                        alpha=alpha, 
                                        max_iter=max_iter, 
                                        tol=tol)
    label_prop_model.fit(X_train, y_train)

    y_predict = label_prop_model.predict(X_test)
    print('y_train: ', y_train)
    print('y_predict: ', y_predict)

    print('+--------------------------------------------------------+')
    print('|                         Report                         |')
    print('+--------------------------------------------------------+')
    print(classification_report(y_test, y_predict))
    print('accuracy: ' + str(accuracy_score(y_test, y_predict)))
    print('\n\n')
Example #10
def hard_clamping(kernel, k, xTrain, yTrain, MI=10000, g=0.6):
    prop = LabelPropagation(kernel=kernel,
                            n_neighbors=k,
                            gamma=g,
                            max_iter=MI,
                            n_jobs=-1)
    prop.fit(xTrain, yTrain)
    predY = prop.predict_proba(xTrain)
    norm_Y = normalize(yTrain, predY)
    labels = []
    for i in norm_Y:
        if i[0] > i[1]:
            labels.append(benign)
        elif i[0] < i[1]:
            labels.append(malware)

    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, labels, yExpect, day_one)

    results = [
        'HC', kernel, k, g, lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1,
        missed_day1
    ]

    file_name = 'HC_CMN_5per_' + str(rate) + '.csv'
    write_csv(file_name, results)
Example #11
def load_all_data():
    # Read and partition the matrix
    data = pd.read_feather('./feature_stage_data_all.ftr')
    x = data[data.columns[3:]]
    y = data['stage']
    o = data.observation
    x = x.values
    x = normalize(x)
    y = y.values
    x_va = x[[i in [8, 9] for i in o.values]]
    y_va = y[[i in [8, 9] for i in o.values]]
    x = x[[i not in [8, 9] for i in o.values]]
    y = y[[i not in [8, 9] for i in o.values]]

    nnl = lambda a: np.invert(np.isnan(a))
    nul = lambda a: np.isnan(a)
    x_obs = x[nnl(y)]
    y_obs = y[nnl(y)]

    # apply Label Spreading
    x_nuls = x[nul(y)]
    label_spread = LabelPropagation(kernel='knn')
    label_spread.fit(x_obs, y_obs)
    x_all = np.concatenate([x_obs, x_nuls], axis=0)
    y_all = np.concatenate([y_obs, label_spread.predict(x_nuls)], axis=0)

    # Over sample the stages
    zen = SMOTE(random_state=8675309)
    x, y = zen.fit_resample(x_all, y_all)
    x, y = shuffle(x, y, random_state=42)
    x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.20)
    return x_tr, y_tr, x_te, y_te, x_va, y_va
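A sketch of the imports this snippet assumes (SMOTE comes from the imbalanced-learn package; the rest are NumPy, pandas, and scikit-learn):

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.semi_supervised import LabelPropagation
from sklearn.utils import shuffle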
Example #12
    def doLabelPropagation(self, X, y, **kwargs):
        label_prop_model = LabelPropagation(**kwargs)
        if self.verbose > 2:
            print("X, y shapes: ", X.shape, y.shape)
            print(" y hist: ", np.histogram(y))
        label_prop_model.fit(X, y)
        if self.verbose > 2:
            print("lp_predict:", np.histogram(label_prop_model.predict(X)))
        return label_prop_model.predict_proba(X)
Example #13
def _label_propagation(df):
    X = _generate_features(df)
    labels = _generate_labels(df)
    # for some reason pandas returns NaN for -1 values
    labels = labels.fillna(-1)
    label_prop_model = LabelPropagation()
    label_prop_model.fit(X.toarray(), labels)
    return label_prop_model.predict(X.toarray())
Example #14
def main():
    stop = fetch_default_stop_words()
    comm = fetch_data(ds_name="sheet_4_labeled", cut_all=False, stop_words=stop)

    # set labels of unlabeled data to -1
    for key in comm.keys():
        if comm[key].integrity is None:
            comm[key].integrity = -1
            comm[key].interpretability = -1

    labeled = {key: c for key, c in comm.items() if c.integrity != -1}
    unlabeled = {key: c for key, c in comm.items() if c.integrity == -1}

    # load the word2vec model trained on reply text
    wv_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=False)

    # -------------------------------------------------------
    # model 1
    xy = [(doc_vec(c.seg_reply, model=wv_model), c.integrity) for c in unlabeled.values()]
    x = [t[0] for t in xy]
    y = [t[1] for t in xy]

    xy_labeled = [(doc_vec(c.seg_reply, model=wv_model), c.integrity) for c in labeled.values()]
    x_labeled = [t[0] for t in xy_labeled]
    y_labeled = [t[1] for t in xy_labeled]

    x_train, y_train = (x_labeled, y_labeled)

    # merge labeled and unlabeled data into the training set
    x_train += x
    y_train += y

    # train the label propagation model
    clf = LabelPropagation(gamma=30)  # model 1
    clf.fit(x_train, y_train)
    joblib.dump(clf, integrity_clf_path)

    # --------------------------------------------------------------
    # model 2
    xy = [(doc_vec(c.seg_reply, model=wv_model), c.interpretability) for c in unlabeled.values()]
    x = [t[0] for t in xy]
    y = [t[1] for t in xy]

    xy_labeled = [(doc_vec(c.seg_reply, model=wv_model), c.interpretability) for c in labeled.values()]
    x_labeled = [t[0] for t in xy_labeled]
    y_labeled = [t[1] for t in xy_labeled]

    x_train, y_train = (x_labeled, y_labeled)

    # merge labeled and unlabeled data into the training set
    x_train += x
    y_train += y

    # train the label spreading model
    clf = LabelSpreading()  # model 2
    clf.fit(x_train, y_train)
    joblib.dump(clf, interpretability_clf_path)
Example #15
def test_LabelPropagation(*data):
    X,y,unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    
    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(X,y_train)
    true_labels = y[unlabeled_indices]
    print('Accuracy : %.2f' %clf.score(X[unlabeled_indices],true_labels))
Example #16
def test_LabelPropagation(*data):
    x, y ,unlabeled_indices = data
    y_train = np.copy(y)  # copy: y is needed later
    y_train[unlabeled_indices] = -1  # unlabeled samples get label -1
    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(x, y_train)
    # compute prediction accuracy
    true_labels = y[unlabeled_indices]  # the true labels
    print("Accuracy: %f" % clf.score(x[unlabeled_indices], true_labels))
Example #17
def ss_test(images, labels, unlabeled_images, test_images):
    all_images = np.vstack((images, unlabeled_images))
    neg_ones = -np.ones((unlabeled_images.shape[0], ))
    all_labels = np.concatenate((labels, neg_ones), axis=0)

    model = LabelPropagation()
    model.fit(all_images, all_labels)

    test_labels = model.predict(test_images)
    create_submission(test_labels)
Example #19
def LP(source_train, target_test, label1, label3):
    label_prop_model = LabelPropagation()
    label_prop_model.fit(source_train, label1)
    source_predict = label_prop_model.predict(target_test)
    # evaluation metrics
    accuracy = metrics.accuracy_score(label3, source_predict)
    recall = metrics.recall_score(label3, source_predict, average='weighted')
    f1 = metrics.f1_score(label3, source_predict, average='weighted')
    precision = metrics.precision_score(label3, source_predict, average='weighted')
    print("LP:", accuracy, recall, f1, precision)
    return accuracy, recall, f1, precision
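A hedged usage sketch with synthetic arrays (all names below are illustrative, not from the original project):

import numpy as np
rng = np.random.RandomState(0)
source_train, target_test = rng.rand(100, 5), rng.rand(40, 5)
label1, label3 = rng.randint(0, 2, 100), rng.randint(0, 2, 40)
LP(source_train, target_test, label1, label3)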
Example #20
def create_label_prop(dataset):
    vectors, labels = make_vectors(dataset)

    Q_labels = -1 * np.ones(dataset.Q.shape[0] + dataset.test_X.shape[0])
    labels = np.concatenate((labels, Q_labels))
    vectors = np.concatenate((vectors, dataset.Q, dataset.test_X))

    label_prop = LabelPropagation()
    label_prop.fit(vectors, labels)
    print("\tLabel Propogation accuracy:")

    return label_prop
Example #21
def model_and_fit(type, train_vector, classes):
    if type == SemiSupervisedAlgorithms.LABEL_PROPAGATION:
        model = LabelPropagation()
        model.fit(train_vector, classes)
        return model
    elif type == SemiSupervisedAlgorithms.LABEL_SPREADING:
        model = LabelSpreading(kernel='rbf')
        model.fit(train_vector, classes)
        return model
    else:
        raise ValueError('Wrong semi supervised model type!')
Example #22
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    ########################
    ### SEMI-SUPERVISED ###
    ########################
    # Label Propagation
    label_prop_model = LabelPropagation(kernel='knn')
    label_prop_model.fit(trainX, trainY)
    Y_Pred = label_prop_model.predict(testX)
    acc_ss_propagation, acc_ss_propagation_INFO = check_accuracy(testY, Y_Pred)
    # Label Spreading
    label_prop_models_spr = LabelSpreading(kernel='knn')
    label_prop_models_spr.fit(trainX, trainY)
    Y_Pred = label_prop_models_spr.predict(testX)
    acc_ss_spreading, acc_ss_spreading_INFO = check_accuracy(testY, Y_Pred)
    ########################
    #### WITHOUT TL ########
    ########################
    # LogisticRegression 
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    predLR = modelLR.predict(testX)
    accLR, acc_LR_INFO = check_accuracy(testY, predLR)
    # DecisionTreeClassifier
    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    predDT = modelDT.predict(testX)
    accDT, acc_DT_INFO = check_accuracy(testY, predDT)
    # BernoulliNB
    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    predND = modelNB.predict(testX)
    accNB, acc_NB_INFO = check_accuracy(testY, predND)
    #
    return pd.DataFrame(
        [{ 
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,

        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO':acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO':acc_ss_spreading_INFO,
        'acc_LR':accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO)       

        }]
    )
Example #23
def do_evaluation(X, y, 
                kernel='knn',
                output=None, 
                gamma=None,
                n_neighbors=10, 
                alpha=1, 
                max_iter=1000, 
                tol=0.00001):
    # from sklearn.cross_validation import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.metrics import accuracy_score
    import random

    size = len(X)

    random_seeds = np.random.randint(1, 1000, size=10)
    for i in range(len(random_seeds)):
        
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=random_seeds[i])
        labels = np.copy(y)
        tmp = np.arange(size)
        np.random.shuffle(tmp)
        train_test_split_rate = int(size*.9)
        random_unlabeled_points = tmp[:train_test_split_rate]
        labeled_points = tmp[train_test_split_rate:]
        random_unlabeled_points.sort()
        X_test = [X[_] for _ in range(size) if _ in random_unlabeled_points]
        y_test = [y[_] for _ in range(size) if _ in random_unlabeled_points]
        y_train = [y[_] for _ in range(size) if _ in labeled_points]

        labels[random_unlabeled_points] = -1

        label_prop_model = LabelPropagation(kernel=kernel, 
                                            gamma=gamma, 
                                            n_neighbors=n_neighbors, 
                                            alpha=alpha, 
                                            max_iter=max_iter, 
                                            tol=tol)
        label_prop_model.fit(X, labels)

        y_predict = label_prop_model.predict(X_test)
        
        print('+--------------------------------------------------------+')
        print('|                         Report                         |')
        print('+--------------------------------------------------------+')
        print('test round:', (i + 1), 'with random seed:', random_seeds[i])
        print('training label: ', y_train)
        print('training post id: ', [_ + 1 for _ in labeled_points])
        print('predict label: ', y_predict)
        print(classification_report(y_test, y_predict))
        print('accuracy: ' + str(accuracy_score(y_test, y_predict)))
        print('\n\n')
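The list comprehensions above test membership with '_ in random_unlabeled_points', which scans the index array once per element; an equivalent boolean-mask sketch, assuming X and y are NumPy arrays:

mask = np.zeros(size, dtype=bool)
mask[random_unlabeled_points] = True
X_test, y_test = X[mask], y[mask]
y_train = y[~mask]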
Example #24
    def process(self, n_components):
        X_train, y_train, X_test, y_test = self.preprocess(n_components)
        label_prop_model = LabelPropagation(n_jobs=-1)
        label_prop_model.fit(X_train, y_train)
        y_pred = label_prop_model.predict(X_test)
        mean_acc = label_prop_model.score(X_test, y_test)
        plot_confusion_matrix(y_test,
                              y_pred,
                              self.labels,
                              normalize=False,
                              figname=('lp_comps_%d.png' % n_components))
        self.m_acc.append(mean_acc)
        print(label_prop_model.get_params())
Example #25
def khren3(G):

    result_s = {}
    result_d = {}
    passed_set = []

    list_neighbrs = {}

    for v in G.nodes:
        list_neighbrs.update({v: set(nx.neighbors(G, v))})

    for u in G.nodes:
        passed_set.append(u)
        for v in nx.neighbors(G, u):
            if v not in passed_set:
                cmn_nmbr = list_neighbrs[u] & list_neighbrs[v]
                # dist = nx.shortest_path_length(G,u,v)
                # if dist == 2:
                # cmn_nmbr = G.distance(u,v)
                if G.nodes[u]["ground_label"] == G.nodes[v]['ground_label']:
                    result_s.update({(u, v): cmn_nmbr})
                else:
                    result_d.update({(u, v): cmn_nmbr})

    # max_s = max(len(result_s.values()))
    min_s = len(min(result_s.values(), key=len))
    min_d = len(min(result_d.values(), key=len))
    max_d = len(max(result_d.values(), key=len))

    for (pair, vertex_list) in result_d.items():
        if len(vertex_list) == max_d:
            max_pair = pair
            break

    print(min_s, min_d)

    adj_matrix = nx.adjacency_matrix(G).toarray()
    labels = [-1 for node in G.nodes]
    true_labels = [G.nodes[node]['ground_label'] for node in G.nodes]
    # labels[[0]] = 0
    labels[max_pair[0]] = 0
    labels[max_pair[1]] = 1
    # labels[0:10] = [0 for i in range(10)]
    # labels[900:910] = [1 for i in range(10)]

    lp = LabelPropagation(kernel='rbf', gamma=0.7, max_iter=1000)
    lp.fit(adj_matrix, labels)
    print(lp.score(adj_matrix, true_labels))

    return (result_s, result_d)
Example #26
    def execute(self, function_context: FunctionContext,
                input_list: List) -> List:
        x_train = input_list[0]
        y_label = input_list[1]
        input_dim = 512

        x_train_columns = list()
        x_train_columns.append('face_id')
        for i in range(1, input_dim + 1):
            x_train_columns.append('col' + str(i))
        trainDf = pd.DataFrame(x_train, columns=x_train_columns)
        labelDf = pd.DataFrame(y_label, columns=('face_id', 'label'))

        trainDf = pd.merge(trainDf,
                           labelDf,
                           on=['face_id'],
                           how='inner',
                           suffixes=('_x', '_y'))
        y_label = trainDf['label'].values.astype(int)
        trainDf = trainDf.drop('face_id', 1)
        x_train = trainDf.drop('label', 1).values

        label_prop_model = None
        score = 0.0
        while score < 0.95:
            print('before train ACC:', score)
            random_unlabeled_points = np.random.rand(len(y_label))
            random_unlabeled_points = random_unlabeled_points < 0.3  # uniform [0, 1) draws; True where the draw is < 0.3
            Y = y_label[random_unlabeled_points]  # labels before masking
            y_label[random_unlabeled_points] = -1  # reset the selected labels to -1

            label_prop_model = LabelPropagation()
            label_prop_model.fit(x_train, y_label)

            Y_pred = label_prop_model.predict(x_train)
            Y_pred = Y_pred[random_unlabeled_points]
            score = accuracy_score(Y, Y_pred)

            y_label[random_unlabeled_points] = Y

        model_path = os.path.dirname(os.path.abspath(__file__)) + '/model'
        print('Save trained model to {}'.format(model_path))
        if not os.path.exists(model_path):
            joblib.dump(label_prop_model, model_path)

        model_meta: ModelMeta = function_context.node_spec.output_model
        # Register model version to notify that cluster serving is ready to start loading the registered model version.
        register_model_version(model=model_meta, model_path=model_path)
        return []
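A hedged sketch of loading the dumped model back with joblib (model_path and x_train mirror the names in the snippet above):

import joblib
model = joblib.load(model_path)  # model_path as defined in the snippet
print(model.predict(x_train[:5]))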
Example #27
    def label_propagation(self, X_train, y, X_test):

        clf = LabelPropagation()
        print("X_train Shape :", X_train.shape, type(X_train))
        print("X_test shape : ", X_test.shape, type(X_test))
        print("y shape : ", y.shape)

        X = np.concatenate((X_train.todense(), X_test.todense()), axis=0)
        print("X shape now ", X.shape)
        print("Y shape now ", y.shape)
        clf.fit(X, y)
        final_labels = clf.predict(X_test)
        label_prob = clf.predict_proba(X_test)
        print(compare_labels_probabilities().compare(label_prob, final_labels))
        return final_labels, clf
Example #28
def label_prop():

    labels = df9.loc[df9['Leak Found'].notnull(), ['Leak Found']]
    model = LabelPropagation(kernel=rbf_kernel_safe)
    model.fit(df10, labels.values.ravel())
    pred = np.array(model.predict(df12))
    df13 = pd.DataFrame(pred, columns=['Prediction'])
    df14 = pd.concat([df12, df13], axis=1)
    print(df14[['ID', 'Prediction']])
    # print(df14.loc[df14['Prediction'] == 'Y'])
    plt.style.use('seaborn')
    df14['Prediction'].value_counts().plot(kind='bar')
    plt.xticks([0, 1, 2], ['NO', 'YES', 'N-PRV'])
    plt.ylabel('Number of occurrences after prediction by RBF algorithm')
    plt.show()
Example #29
def propagate_labels(X_u, y_l, X_l, num_unlabeled):
    # unlabeled samples are represented by -1 in label propagation
    y_u_placeholder = np.zeros(num_unlabeled) - 1

    X_train_prop = np.concatenate((X_l, X_u), axis=0)
    y_train_prop = np.concatenate((y_l, y_u_placeholder), axis=0)

    prop = LabelPropagation(gamma=15)
    prop.fit(X_train_prop, y_train_prop)

    y_train_lda = prop.transduction_

    X_train_lda = np.concatenate((X_l, X_u), axis=0)

    return X_train_lda, y_train_lda
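A hedged usage sketch for propagate_labels (the arrays below are illustrative):

import numpy as np
X_l = np.random.rand(50, 4)
y_l = np.random.randint(0, 2, 50)
X_u = np.random.rand(200, 4)
X_train, y_train = propagate_labels(X_u, y_l, X_l, num_unlabeled=200)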
Example #30
def test_LabelPropagation(*data):
    '''
    Demonstrate basic LabelPropagation usage
    :param data: a tuple of (samples, labels, indices of unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # must copy: y is needed later
    y_train[unlabeled_indices] = -1  # unlabeled samples get label -1
    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(X, y_train)
    ### compute prediction accuracy
    predicted_labels = clf.transduction_[unlabeled_indices]  # predicted labels
    true_labels = y[unlabeled_indices]  # true labels
    print("Accuracy:%f" %
          metrics.accuracy_score(true_labels, predicted_labels))
Example #31
def test_LabelPropagation_rbf(*data):
    '''
    Test how LabelPropagation's rbf-kernel performance varies with alpha and gamma

    :param data: a tuple of (samples, labels, indices of unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # must copy: y is needed later
    y_train[unlabeled_indices] = -1  # unlabeled samples get label -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = (
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (0.5, 0.5, 0),
        (0, 0.5, 0.5),
        (0.5, 0, 0.5),
        (0.4, 0.6, 0),
        (0.6, 0.4, 0),
        (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )  # color palette: one color per curve
    ## train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelPropagation(max_iter=100,
                                   gamma=gamma,
                                   alpha=alpha,
                                   kernel='rbf')
            clf.fit(X, y_train)
            scores.append(clf.score(X[unlabeled_indices],
                                    y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)

    ### configure the plot
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
Example #32
class _LabelPropagationImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
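Op is presumably bound to scikit-learn's LabelPropagation in the original module; a minimal binding sketch:

from sklearn.semi_supervised import LabelPropagation as Op

impl = _LabelPropagationImpl(kernel='knn', n_neighbors=7)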
Example #33
def test_LabelPropagation_knn(*data):
    '''
    Test how LabelPropagation's knn-kernel performance varies with alpha and n_neighbors

    :param data: a tuple of (samples, labels, indices of unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # must copy: y is needed later
    y_train[unlabeled_indices] = -1  # unlabeled samples get label -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    colors = (
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (0.5, 0.5, 0),
        (0, 0.5, 0.5),
        (0.5, 0, 0.5),
        (0.4, 0.6, 0),
        (0.6, 0.4, 0),
        (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )  # color palette: one color per curve
    ## train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelPropagation(max_iter=100,
                                   n_neighbors=K,
                                   alpha=alpha,
                                   kernel='knn')
            clf.fit(X, y_train)
            scores.append(clf.score(X[unlabeled_indices],
                                    y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)

    ### configure the plot
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation knn kernel")
    plt.show()
Example #34
def do_label_propagation(input_data,
                        input_label,
                        output=None,
                        kernel='knn', 
                        gamma=None,
                        n_neighbors=10, 
                        alpha=1, 
                        max_iter=30, 
                        tol=0.001):
    n_neighbors += 1

    # input label: one integer per line
    with open(input_label) as input_label_fh:
        label_lines = input_label_fh.readlines()
    label_lines = [int(_.strip()) for _ in label_lines]
    y = np.array(label_lines)

    size = len(y)

    # input data: one whitespace-separated matrix row per line
    with open(input_data) as input_data_fh:
        data_lines = input_data_fh.readlines()[:size]
    data_lines = [_.strip() for _ in data_lines]
    X = np.array(np.mat(';'.join(data_lines)))

    label_prop_model = LabelPropagation(kernel=kernel, 
                                        gamma=gamma, 
                                        n_neighbors=n_neighbors, 
                                        alpha=alpha, 
                                        max_iter=max_iter, 
                                        tol=tol)
    label_prop_model.fit(X, y)

    prediction = label_prop_model.predict(X)

    if output:
        with open(output, 'w') as output_fh:
            for p in prediction:
                output_fh.write(str(p) + '\n')

    return label_prop_model
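do_label_propagation expects one integer label per line in input_label and one whitespace-separated matrix row per line in input_data (rows are joined with ';' and parsed by np.mat). A toy-file sketch (paths and sizes are illustrative; note the alpha argument above requires a scikit-learn release older than 0.21, where alpha was removed from LabelPropagation):

import numpy as np

rng = np.random.RandomState(0)
with open('data.txt', 'w') as fh:
    for row in rng.rand(20, 3):
        fh.write(' '.join(map(str, row)) + '\n')
with open('labels.txt', 'w') as fh:
    fh.write('\n'.join(map(str, [0] * 5 + [1] * 5 + [-1] * 10)))
do_label_propagation('data.txt', 'labels.txt', output='pred.txt')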
Example #35
def tryLabelPropagation(goFast):
  from sklearn.datasets import dump_svmlight_file, load_svmlight_file
  if goFast:
    training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
  else:
    training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True)

  from sklearn.semi_supervised import LabelPropagation
  from sklearn.metrics import accuracy_score
  from sklearn.model_selection import ParameterGrid

  propOperator = LabelPropagation(gamma=150)

  propOperator.fit(training_data[:3000],training_labels[:3000])
  score = accuracy_score(validation_labels, propOperator.predict(validation_data))
  print(score)
Example #36
from sklearn.semi_supervised import LabelPropagation
from sklearn import metrics
import numpy as np
 
#K nearest neighbors model ensures we dont run over our memory
#rbf Kernel needs complete graph, so requires feature selection
lp_model = LabelPropagation(kernel='knn')  # Label Propagation model
Xtr = np.genfromtxt("data/Kaggle.X1.train.txt", delimiter=',')  # get X training data
Ytr_labels = np.genfromtxt("data/Kaggle.Y.labels.train.txt", delimiter=',')  # get classification labels


#Unlabeled points - random size for now.
unlabeled_points = np.where(np.random.randint(0, 2, size=len(Ytr_labels)))
labels = np.copy(Ytr_labels) #Save training labels for testing
labels[unlabeled_points] = -1  #Set unlabeled value, classes : 0, 1
lp_model.fit(Xtr,labels) #Train

#############################################
#   Models use n_neighbors and max_iteration to control kernel
#############################################

#############################################
#   Test Functions
#############################################
#Mean squared Error
yhat = lp_model.predict(Xtr)
mse = metrics.mean_squared_error(Ytr_labels, yhat)

###############################################
# Cross Validation
###############################################
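The cross-validation section is empty in the source; a minimal sketch, assuming the Xtr and Ytr_labels arrays above:

from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(lp_model, Xtr, Ytr_labels, cv=5)
print('CV accuracy: %.3f +/- %.3f' % (cv_scores.mean(), cv_scores.std()))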
Example #37
noLabels = [1 for i in Y_train if i==-1]
labels = [1 for i in Y_train if not i==-1]
print('unlabeled: ',np.sum(noLabels))
print('labeled: ',np.sum(labels))


# In[8]:

from sklearn.semi_supervised import LabelPropagation as LP
from sklearn.semi_supervised import LabelSpreading as LS


# In[17]:

lspr = LP(gamma=70)
lspr.fit(X_norm, Y_train)


# In[15]:

print('classes: ', lspr.classes_)


# In[16]:

pred = lspr.predict(X_norm)
notN = [1 for i in pred if i>0.0]
print(sum(notN))


# In[12]:
Example #38
File: pr4.py Project: tobiagru/LIS
                else:
                    clf = LabelSpreading(kernel=param["kernel"],
                                           n_neighbors=param["n_neighbors"])
                extra_param = param["n_neighbors"]

            now = datetime.datetime.now()
            date_time = '{0:02d}_{1:02d}_{2:02d}_{3:02d}_{4:02d}'.format((now.year%2000),
                                                                         now.month, now.day,
                                                                         now.hour, now.minute)

            #classification Type
            #clf = OneVsOneClassifier(clf)
            #clf = OneVsRestClassifier(clf)

            logging.info("start with training ")
            clf.fit(X_train, y_train)
            #y_pred = clf.predict(X_valid)
            #print("min:{0}  max:{0}".format(y_pred.min(),y_pred.max()))
            #score = accuracy_score(y_valid, y_pred, True)

            print("found classes are {0}".format(clf.classes_))

            y_test = clf.predict(X_test)
            y_test = y_test.astype(np.uint32)

            lib_IO.write_Y("Data/pr4/{0}_{1}_{2}_{3}".format(name,param["kernel"],extra_param,date_time),y_test,Ids=ids)
            #Gridsearch
            #grid_search = GridSearchCV(clf, param, scoring='accuracy',cv=10, n_jobs=-1, verbose=1)
            #grid_search.fit(X_train, y_train)

            #clf_tmp = grid_search.best_estimator_
class IndicatorIdentifier(object):
    '''
    Identify an indicator for a document
    '''


    def __init__(self):
        self.model = LabelPropagation() #(kernel='knn', alpha=1.0)
        #self.model = LabelSpreading()
    
    def readingDatabase(self):
        da = DocumentsAccess()
        
        labeledFile = "Database/Indicator/Indicators.xlsx"
        sheet = "Sheet1"
        df = da.readingDatabaseTetum(labeledFile, sheet, head= 0)


        cut = int(0.8*df.shape[0])    
        # duplicate multi-label rows so each training document carries exactly one label
        columns = df.columns.tolist()
        columns.remove("Content")
        print(columns)
        X_train = []
        Y_train = []
        X_test = []
        Y_test = []
        for index, row in df.iterrows():
            labels = list(set([row[col] for col in columns if not pd.isnull(row[col])]))
            content = row["Content"]
            if index < cut: # training part
                for label in labels:
                    X_train.append(content)
                    Y_train.append(label)
            else:
                X_test.append(content)
                Y_test.append(labels)
                
                
           
       
        fileUnlabeled = "Database/Clean Master Cleaner 2222.xlsx"
        sheet = "Sheet1"
        unlabeledData = da.readingDatabaseTetum(fileUnlabeled, sheet)
        unlabeledData = unlabeledData[0].tolist() 
        print(len(unlabeledData))
        
        fileUnlabeled2 = "Database/SAPO.xlsx"
        unlabeledData2 = da.readingDatabaseTetum(fileUnlabeled2, sheet)
        unlabeledData2 = unlabeledData2[0].tolist() 
        print(len(unlabeledData2))
        
        fileUnlabeled3 = "Database/Suara News.xlsx"
        unlabeledData3 = da.readingDatabaseTetum(fileUnlabeled3, sheet)
        unlabeledData3 = unlabeledData3[0].tolist() 
        
        '''
        fileUnlabeled4 = "Database/Haksesuk.xlsx"
        unlabeledData4 = da.readingDatabaseTetum(fileUnlabeled4, sheet)
        unlabeledData4 = unlabeledData4[0].tolist()
        print len(unlabeledData4)
        ''' 

        unlabeledData = unlabeledData + unlabeledData2 + unlabeledData3
        
        print(len(unlabeledData))
        #print unlabeledData[0]
        
        return (X_train, Y_train, X_test, Y_test, unlabeledData) 
    
    def preprocessData(self, X):   
        return tp.preprocess_dataset(X, fold=True, specials=False, min_size=2)       
    def train(self, X, Y):
        '''
            Goal: batch training (use once at the start; afterwards use updateNewInformation to fold in new data)
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionTrain(X)
        X = X.toarray()
        self.model.fit(X, Y)    
    def test(self, X):
        '''
            Goal: predict a new document
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionPredict(X)
        X = X.toarray()
        predictedY = self.model.predict(X)   
        return predictedY      
       
    def updateNewInformation(self, x1, y1):
        '''
            Goal: Update the information from the new data (Online Learning)
            Re-train the model at the weekend
        '''
        #self.model.partial_fit(x1,y1)
        pass

    def featureExtractionTrain(self, X):
        self.vsm = VectorSpaceModel.createInstance("TFIDF")#("BooleanWeighting") #("TFIDF")
        trainTable = self.vsm.train(X)
        return trainTable
    
    def featureExtractionPredict(self, X):
        testTable = self.vsm.test(X)
        return testTable

    def evaluation(self, trueLabels, predictedLabels):
        accuracy = metrics.accuracy_score(trueLabels,predictedLabels)
        precision = metrics.precision_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        recall = metrics.recall_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        f1 = metrics.f1_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        accuracy = round(accuracy,4)
        precision = round(precision,4)
        recall = round(recall,4)
        f1 = round(f1,4)
        result = [("Accuracy",accuracy),("Precision",precision),("Recall",recall),("f1",f1)]
        return result    
       
    def run(self):
        # Reading data
        (X_train, Y_train, X_test, Y_test, unlabeledData)   = self.readingDatabase()
        print "Training size: " + str(len(X_train))
        print "Test size: " + str(len(X_test))
        '''
        X_train = X_train[:100]
        Y_train = Y_train[:100]
        X_test = X_test[:100]
        Y_test = Y_test[:100]
        '''
        print "Finish reading database."
        #print FreqDist(indicators).most_common()
        k = 0
        dictLabel = FreqDist(Y_train)
        for key in dictLabel:
            dictLabel[key] = k
            k+=1
        Y_train = [dictLabel[ind] for ind in Y_train]
        Y_test = [[dictLabel[ind] for ind in labels] for labels in Y_test]
        
        '''       
        random.seed(123456)
        # Training
        z = zip(labeledData, indicators)
        random.shuffle(z)
        labeledData, indicators = zip(*z)
        
        X_train = list(labeledData[:cut])
        Y_train = list(indicators[:cut])
        X_test = list(labeledData[cut:])
        Y_test = list(indicators[cut:])
        '''
        X_train += unlabeledData
        Y_train += (-1*np.ones((len(unlabeledData)), dtype = int)).tolist()

        #pprint(X_train)
        #print Y_train
        
        #print X_train[cut-2:cut+2]
        #print Y_train[cut-2:cut+2]
        print "Training..."
        self.train(X_train, Y_train)
        
        # Testing
        print "Testing..."
        Y_predicted = self.test(X_test)

        print(Y_predicted)

        # The Y_predicted only need to be one of the true labels in order to be calculated as correctness
        for i in range(len(Y_predicted)):
            lab = Y_predicted[i]
            if lab in Y_test[i]:
                Y_test[i] = lab
            else:
                Y_test[i] = -1
        # evaluation returns (name, value) pairs, so unpack the first pair
        (_, accuracy), _, _, _ = self.evaluation(Y_test, Y_predicted)
        print(accuracy)
class SentimentAnalysis(object):
    '''
    Identify a sentiment for each document.
    '''


    def __init__(self):
        self.model = LabelPropagation()#(kernel='knn', alpha=1.0)
        #self.model = LabelSpreading()
    
    def readingDatabase(self):
        da = DocumentsAccess()
        
        filePos = "Database/Sentiment/Sentiment/PoliceRelations/positive.xlsx"
        sheet = "Sheet1"
        posData = da.readingDatabaseTetum(filePos, sheet)
        posData = posData[0].tolist()
        print(len(posData))
        print(posData[0])

        
 
        fileNeg = "Database/Sentiment/Sentiment/PoliceRelations/negative.xlsx"
        sheet = "Sheet1"
        negData = da.readingDatabaseTetum(fileNeg, sheet)  
        negData = negData[0].tolist() 
        print(len(negData))
        print(negData[0])
       
        fileUnlabeled = "Database/Clean Master Cleaner 2222.xlsx"
        sheet = "Sheet1"
        unlabeledData = da.readingDatabaseTetum(fileUnlabeled, sheet)
        unlabeledData = unlabeledData[0].tolist() 
        print(len(unlabeledData))
        
        fileUnlabeled2 = "Database/SAPO.xlsx"
        unlabeledData2 = da.readingDatabaseTetum(fileUnlabeled2, sheet)
        unlabeledData2 = unlabeledData2[0].tolist() 
        print(len(unlabeledData2))
        
        fileUnlabeled3 = "Database/Suara News.xlsx"
        unlabeledData3 = da.readingDatabaseTetum(fileUnlabeled3, sheet)
        unlabeledData3 = unlabeledData3[0].tolist() 
        
        '''
        fileUnlabeled4 = "Database/Haksesuk.xlsx"
        unlabeledData4 = da.readingDatabaseTetum(fileUnlabeled4, sheet)
        unlabeledData4 = unlabeledData4[0].tolist()
        print len(unlabeledData4)
        ''' 

        unlabeledData = unlabeledData + unlabeledData2 + unlabeledData3
        
        print(len(unlabeledData))
        print(unlabeledData[0])

       
        return (posData, negData, unlabeledData) 
    
    def preprocessData(self, X):   
        return tp.preprocess_dataset(X, fold=True, specials=False, min_size=2)       
    def train(self, X, Y):
        '''
            Goal: batch training (use once at the start; afterwards use updateNewInformation to fold in new data)
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionTrain(X)
        X = X.toarray()
        self.model.fit(X, Y)    
    def test(self, X):
        '''
            Goal: predict a new document
        '''
        X = self.preprocessData(X)
        X = self.featureExtractionPredict(X)
        X = X.toarray()
        predictedY = self.model.predict(X)   
        return predictedY      
       
    def updateNewInformation(self, x1, y1):
        '''
            Goal: Update the information from the new data (Online Learning)
            Re-train the model at the weekend
        '''
        #self.model.partial_fit(x1,y1)
        pass

    def featureExtractionTrain(self, X):
        self.vsm = VectorSpaceModel.createInstance("TFIDF")#("BooleanWeighting") #("TFIDF")
        trainTable = self.vsm.train(X)
        return trainTable
    
    def featureExtractionPredict(self, X):
        testTable = self.vsm.test(X)
        return testTable

    def evaluation(self, trueLabels, predictedLabels):
        accuracy = metrics.accuracy_score(trueLabels,predictedLabels)
        precision = metrics.precision_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        recall = metrics.recall_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        f1 = metrics.f1_score(trueLabels,predictedLabels,pos_label=None, average='weighted')
        accuracy = round(accuracy,4)
        precision = round(precision,4)
        recall = round(recall,4)
        f1 = round(f1,4)
        result = [("Accuracy",accuracy),("Precision",precision),("Recall",recall),("f1",f1)]
        return result    
       
    def run(self):
        # Reading data
        (posData, negData, unlabeledData)  = self.readingDatabase()
        print "Finish reading database."
           
        # Divide training and test data
        cut = 10
        posDataTrain = posData[:cut]
        negDataTrain = negData[:cut]
        posDataTest = posData[cut:]
        negDataTest = negData[cut:]        
        
        random.seed(123456)
        # Training
        X_train = posDataTrain + negDataTrain + unlabeledData
        Y_train = np.ones((len(posDataTrain)), dtype=int).tolist() + np.zeros((len(negDataTrain)), dtype=int).tolist() + (-1 * np.ones((len(unlabeledData)), dtype=int)).tolist()
        z = list(zip(X_train, Y_train))  # list() so shuffle works on Python 3
        random.shuffle(z)
        X_train, Y_train = zip(*z)
        self.train(X_train, Y_train)
        
        # Testing
        X_test = posDataTest + negDataTest
        Y_test = np.ones((len(posDataTest)), dtype = int).tolist() + np.zeros((len(negDataTest)), dtype = int).tolist()
        z = list(zip(X_test, Y_test))  # list() so shuffle works on Python 3
        random.shuffle(z)
        X_test, Y_test = zip(*z)
        Y_predicted = self.test(X_test)
        print(Y_predicted)

        # evaluation returns a list of (name, value) pairs, not four scalars
        results = self.evaluation(Y_test, Y_predicted)
        print(results)