Example #1
    def source_to_target_label_prop(self,
                                    train_feat_space='embeds',
                                    kernel_param={
                                        'type': 'rbf',
                                        'gamma': 20
                                    }):
        print(
            '-----------------------------------------------------------------------'
        )
        print('Propagating labels from source to target in {0} space'.format(
            train_feat_space))
        if train_feat_space == 'encoded':
            if not hasattr(self, 'source_encoded_reps'):
                self.dim_red_autoencode()
            concat_embs = np.concatenate(
                (self.source_encoded_reps, self.target_encoded_reps))
        elif train_feat_space == 'embeds':
            concat_embs = np.concatenate(
                (self.source_embds_vec, self.target_embds_vec))
        elif train_feat_space == 'embeds_tsne':
            if self.tsne_computed == 0:
                self.compute_tsne()
            feat_cols = []
            for idx in range(self.n_tsne_components):
                feat_cols.append('embeds_tsne_' + str(idx))
            source_data_feats = self.source_data[feat_cols].to_numpy()
            target_data_feats = self.target_data[feat_cols].to_numpy()
            concat_embs = np.concatenate(
                (source_data_feats, target_data_feats))
        else:
            raise NotImplementedError
        unknown_labels = np.ones_like(self.target_labels) * -1
        label_prop_train_labels = np.concatenate(
            (self.source_labels, unknown_labels))
        lp_model = LabelSpreading(kernel=kernel_param['type'],
                                  gamma=kernel_param['gamma'])
        lp_model.fit(concat_embs, label_prop_train_labels)
        transduction_labels = lp_model.transduction_
        label_distributions = lp_model.label_distributions_

        print(label_distributions[0:10, :])
        self.source_data[
            train_feat_space +
            'Space_prop_pred'] = transduction_labels[:self.n_source]
        self.target_data[
            train_feat_space +
            'Space_prop_pred'] = transduction_labels[self.n_source:]
        # self.source_data[train_feat_space+'label_prop_groups'] = label_distributions[:self.n_source]
        # self.target_data[train_feat_space + 'label_prop_groups'] = label_distributions[self.n_source:]

        # self.embds_space_grouping.append(train_feat_space + 'label_prop_groups')
        # self.embds_space_classifiers.append(train_feat_space+'Space_prop_pred')
        if self.inter_save:
            print('Saving propagated labels')
            self.save_performance(self.serving_dir, suffix=self.save_suffix)

        print('Completed source to target label propagation in {0} space'
              .format(train_feat_space))
        print(
            '-----------------------------------------------------------------------'
        )
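A minimal, self-contained sketch of the same pattern (labeled source rows stacked above unlabeled target rows, with -1 as the unlabeled sentinel). Data and names here are illustrative, not from the original repo:

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.semi_supervised import LabelSpreading

# labeled "source" points and unlabeled "target" points from the same blobs
X, y = make_blobs(n_samples=100, centers=2, random_state=0)
source_X, source_y = X[:60], y[:60]
target_X, target_y = X[60:], y[60:]

concat_X = np.concatenate((source_X, target_X))
train_y = np.concatenate((source_y, -np.ones_like(target_y)))  # -1 = unlabeled

lp = LabelSpreading()
lp.fit(concat_X, train_y)
target_pred = lp.transduction_[len(source_X):]  # transduced target labels
print('target accuracy:', (target_pred == target_y).mean())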
Example #2
def test_LabelSpreading_rbf(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))  # color palette: a distinct color per curve
    # train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelSpreading(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # configure the figure
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading rbf kernel")
    plt.show()
Example #3
    def label_spread(self, X_train, y_train, gamma=20, max_iter=30):
        """
        Train Label Spreading model from scikit-learn

        Parameters
        ----------
        X_train: Scaled training data
        y_train: Class label
        gamma: Parameter for rbf kernel
        max_iter: Maximum number of iterations allowed

        Returns
        -------
        Predicted labels and probability
        """
        # Label spreading model
        model = LabelSpreading(kernel='rbf', gamma=gamma, max_iter=max_iter, n_jobs=-1)

        # Fit the training set
        model.fit(X_train, y_train)

        # Predict the labels of the unlabeled data points
        predicted_labels = model.transduction_

        # Predict probability
        predicted_proba = model.predict_proba(X_train)
        return predicted_labels, predicted_proba
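A standalone sketch of the call the method above makes, on synthetic data (parameter values here are illustrative):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.semi_supervised import LabelSpreading

X_train, y = make_classification(n_samples=200, random_state=0)
y_train = np.copy(y)
y_train[150:] = -1  # treat the last 50 rows as unlabeled

model = LabelSpreading(kernel='rbf', gamma=0.05, max_iter=50, n_jobs=-1)
model.fit(X_train, y_train)
predicted_labels = model.transduction_          # labels for every row
predicted_proba = model.predict_proba(X_train)  # per-class probabilities
print(predicted_labels[150:155], predicted_proba.shape)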
Example #4
def semiLabelSpreading(feature_extractor, generator, val_generator, kernel,
                      neighbors, gamma, alpha):
    semi = LabelSpreading(kernel=kernel,
                          n_neighbors=neighbors,
                          gamma=gamma,
                          alpha=alpha,
                          tol=0.001,
                          max_iter=1000000)

    features = feature_extractor.predict_generator(generator,
                                                   steps=generator.samples /
                                                   generator.batch_size,
                                                   verbose=1)

    classes = generator.classes

    # filenames starting with 'N' mark unlabeled samples (-1)
    for i in range(0, generator.samples):
        if generator.filenames[i][0] == 'N':
            classes[i] = -1

    semi.fit(features, classes)

    val_features = feature_extractor.predict_generator(
        val_generator,
        steps=val_generator.samples / val_generator.batch_size,
        verbose=1)
    predicted_classes = semi.predict(val_features)

    return predicted_classes
Example #5
class ModelLabelSpreading:
    def __init__(self):
        np.random.seed(1102)
        self.model = LabelSpreading(
            kernel="rbf",
            n_jobs=int(np.max([multiprocessing.cpu_count() - 2, 1])),
            alpha=0.2,
            n_neighbors=10,
            max_iter=15)
        self.name = "LABEL-SPREADING"
        self.scaler = MinMaxScaler()

    def fit(self, X, y, Xu=None):
        # transductive wrapper: just store the labeled data; the actual
        # spreading happens in predict(), once the test rows are known
        np.random.seed(1102)
        self.Xl = X
        self.yl = y
        #self.Xu = Xu

    def predict(self, X):
        np.random.seed(1102)
        self.Xt = X
        X = self.scaler.fit_transform(np.vstack((self.Xl, self.Xt)))
        y = np.append(self.yl, np.repeat(-1, self.Xt.shape[0]))
        #y = np.append(y, np.repeat(-1, self.Xt.shape[0]))
        y = np.int64(y)

        assert X.shape[0] == len(y)

        self.model.fit(X, y)

        return np.array(
            self.model.label_distributions_)[(-self.Xt.shape[0]):, :]
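A hedged usage sketch for the wrapper above, on synthetic data; the class defers the actual spreading to predict(), which returns the label distributions of the test rows:

import numpy as np

rng = np.random.RandomState(0)
X_labeled = rng.rand(60, 4)
y_labeled = (X_labeled[:, 0] > 0.5).astype(int)
X_test = rng.rand(20, 4)

m = ModelLabelSpreading()
m.fit(X_labeled, y_labeled)
dists = m.predict(X_test)
print(dists.shape)  # (20, n_classes)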
Example #6
def label(filenames, train_path='../data/train_molecules_30.mat'):
    """
    Label data with the provided filenames.

    :param filenames: List of filenames containing data to label.
    :param train_path: Path to the labeled training data (.mat file).
    :return: Newly labeled and conglomerate datasets
    """
    unlabeled = [scipy.io.loadmat(fname) for fname in filenames]
    unlabeled_X = np.vstack([data['X'] for data in unlabeled])
    X, Y = load_data(train_path, shape=(-1, 30, 30, 30))

    num_unlabeled = unlabeled_X.shape[0]
    unlabeled_Y = np.zeros(num_unlabeled) - 1
    unlabeled_Y = unlabeled_Y.reshape((-1, 1))
    Y = Y.reshape((-1, 1))
    Y_all = np.vstack((Y, unlabeled_Y))

    X_all = np.vstack((X, unlabeled_X))
    X_all = X_all.reshape((-1, 27000))

    label_prop_model = LabelSpreading()
    label_prop_model.fit(X_all, Y_all.ravel())
    Y_all = label_prop_model.transduction_
    # the unlabeled rows were stacked after the labeled ones
    unlabeled_Y = Y_all[-num_unlabeled:]
    return (unlabeled_X, unlabeled_Y), (X_all, Y_all)
Example #7
    def predict_ssl(self, x_sup, y_sup, x_unsup, y_unsup, x_test, y_test):

        ls_model = LabelSpreading(kernel='knn', n_neighbors=5)
        indices = np.arange(self.train_size)
        unlabeled_indices = indices[x_sup.shape[0]:]
        y_sup_unsup = np.concatenate([y_sup, y_unsup])
        y_sup_unsup_train = np.copy(y_sup_unsup)
        y_sup_unsup_train[unlabeled_indices] = -1

        x_fit = np.concatenate([x_sup, x_unsup], axis=0)
        h_fit = self.model_e.predict(x_fit)
        h_fit = np.reshape(h_fit,
                           (h_fit.shape[0], h_fit.shape[1] * h_fit.shape[2]))
        ls_model.fit(h_fit, y_sup_unsup_train)
        y_unsup_pred = ls_model.transduction_[unlabeled_indices]

        #print("LabelSpread Accuracy is ", accuracy_score(y_unsup, y_unsup_pred))

        h_test = self.model_e.predict(x_test)
        h_test = np.reshape(
            h_test, (h_test.shape[0], h_test.shape[1] * h_test.shape[2]))

        #SVM
        clf_svc = svm.SVC(kernel='linear')
        y_fit_true = ls_model.transduction_
        clf_svc.fit(h_fit, y_fit_true)
        acc_svm = accuracy_score(y_test, clf_svc.predict(h_test))

        clf_svc = svm.LinearSVC()
        clf_svc.fit(h_fit, y_fit_true)
        acc_svm_linear = accuracy_score(y_test, clf_svc.predict(h_test))
        print('acc_svm is ', max(acc_svm, acc_svm_linear))
Example #8
def testLabelPropagation():
    from sklearn.semi_supervised import LabelSpreading
    from sklearn import preprocessing
    label_enc = preprocessing.LabelEncoder()

    label_prop_model = LabelSpreading(kernel='knn')
    train_iter = getDocumentIterator1("published = 0 and is_test = 1")
    validation_iter = getDocumentIterator1("published = 1 and is_test = 1")
    semantic_model = gensim_tests.SemanticModel.load(
        'gensim/full_corpus_300000')
    all_profiles, labels = [], []
    propagation_labels = []
    for doc in train_iter:
        all_profiles.append(semantic_model.inferProfile(doc.rawtext))
        labels.append(doc.learned_category[0])
        propagation_labels.append(doc.learned_category[0])

    label_enc.fit(propagation_labels)
    propagation_labels = label_enc.transform(propagation_labels).tolist()

    for doc in validation_iter:
        all_profiles.append(semantic_model.inferProfile(doc.rawtext))
        labels.append(doc.learned_category[0])
        propagation_labels.append(-1)
    print(propagation_labels)
    print("Fitting")
    label_prop_model.fit(all_profiles, propagation_labels)
    output_labels = label_prop_model.transduction_
    for propagated, orig in zip(label_enc.inverse_transform(output_labels),
                                labels):
        print(propagated, orig)
Example #9
def test_LabelSpreading_knn(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))  # color palette: a distinct color per curve
    # train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelSpreading(max_iter=100, n_neighbors=K, alpha=alpha, kernel='knn')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # configure the figure
    ax.set_xlabel(r"k")
    ax.set_ylabel("score")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading knn kernel")
    plt.show()
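The test_LabelSpreading* functions above unpack *data into (x, y, unlabeled_indices), but the loader itself is not shown; a plausible stand-in based on scikit-learn's digits dataset:

import numpy as np
from sklearn import datasets

def load_data():
    digits = datasets.load_digits()
    rng = np.random.RandomState(0)
    indices = np.arange(len(digits.data))
    rng.shuffle(indices)
    x = digits.data[indices[:330]]
    y = digits.target[indices[:330]]
    n_labeled_points = int(len(y) / 10)  # keep 10% of the points labeled
    unlabeled_indices = np.arange(len(y))[n_labeled_points:]
    return x, y, unlabeled_indices

# e.g. test_LabelSpreading_knn(*load_data())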
Example #10
def soft_clamping(kernel, xTrain, yTrain, MI=10000, k=3, g=0.6, a=0.1):
    spread = LabelSpreading(kernel=kernel,
                            n_neighbors=k,
                            gamma=g,
                            alpha=a,
                            max_iter=MI,
                            n_jobs=-1)
    spread.fit(xTrain, yTrain)
    predY = spread.predict_proba(xTrain)

    norm_Y = normalize(yTrain, predY)
    labels = []
    # ties (i[0] == i[1]) are skipped, so labels can end up shorter than yTrain
    for i in norm_Y:
        if i[0] > i[1]:
            labels.append(benign)
        elif i[0] < i[1]:
            labels.append(malware)

    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, labels, yExpect, day_one)

    results = [
        'SC', kernel, k, g, a, lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1,
        missed_day1
    ]

    file_name = 'SC_CMN_5per_' + str(rate) + '.csv'
    write_csv(file_name, results)
Example #11
def label_spreading(X_train, y_train, Xunlabelled, X_test, y_test):
    #pca = randomized_PCA(X_train)
    #X_train, X_test, y_train, y_test = cross_validation.train_test_split(tr_images, tr_labels, test_size=0.3)
    #X = pca.transform(X)
    #val_images = pca.transform(val_images)
    #y= y[:]

    X_train = X_train[:, :]
    y_train = y_train[:]
    Xunlabelled = Xunlabelled[:10000,:]

    #import ipdb; ipdb.set_trace()

    X_both = np.vstack((X_train, Xunlabelled))

    y_both = np.append(y_train, -np.ones((Xunlabelled.shape[0],)))


    label_prop_model = LabelSpreading(max_iter=100)
    #random_unlabeled_points = np.where(np.random.random_integers(0, 1, size=len(y_train)))
    #labels = np.copy(y_train)
    #labels[random_unlabeled_points] = -1
    label_prop_model.fit(np.copy(X_both), np.copy(y_both))
    y_pred = label_prop_model.predict(np.copy(X_both))
    print(y_pred)
Example #12
class LabelSpreadingModel(SupervisedW2VModel):
    def fit_with_test(self, test_data):
        xs, ys = [], []
        self.ans_mapping = []
        for ans, cvs in self.context_vectors.items():
            xs.extend(cvs)
            if ans not in self.ans_mapping:
                y = len(self.ans_mapping)
                self.ans_mapping.append(ans)
            else:
                y = self.ans_mapping.index(ans)
            ys.extend(y for _ in cvs)
        for ctx in test_data:
            xs.append(self.cv(ctx))
            ys.append(-1)  # unlabeled
        self.ls_clf = LabelSpreading(kernel='knn', n_neighbors=11)
        self.ls_clf.fit(xs, ys)

    def __call__(self, x, ans=None, with_confidence=False):
        v = self.cv(x)
        probs = self.ls_clf.predict_proba([v])[0]
        pred = probs.argmax()
        m_ans = self.ans_mapping[pred]
        # TODO - get confidence as difference between probs[pred] and next
        return (m_ans, 0.0) if with_confidence else m_ans
Example #13
 def doLabelSpreading(self, X, y, **kwargs):
     label_spread_model = LabelSpreading(**kwargs)
     if self.verbose > 2:
         print("X, y shapes: ", X.shape, y.shape)
         print(" y hist: ", np.histogram(y))
     label_spread_model.fit(X, y)
     if self.verbose > 2:
         print("ls_predict:", np.histogram(label_spread_model.predict(X)))
     return label_spread_model.predict_proba(X)
Example #14
def test_LabelSpreading(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    clf = LabelSpreading(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(x, y_train)
    predicted_labels = clf.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]
    print("Accuracy: %f" % metrics.accuracy_score(true_labels, predicted_labels))
def semi_supervised():
	features,labels = separate_cols_with_unknown(gtd)
	features = process_nontext(features)
	features = convertDType(features)
	model = LabelPropagation(kernel="knn")
	model2 = LabelSpreading(kernel="knn")
	model2.fit(features,labels)
	preds = cross_val_predict(model2,features,labels,cv=5)
	print('5 fold cross val accuracy of model: %0.2f ' % accuracy_score(labels,preds))
Example #16
 def LabelSpreadingWrapper(X_train, y_train, X_test):
     clf = LabelSpreading(kernel='knn',
                          n_neighbors=10,
                          n_jobs=-1,
                          max_iter=1000,
                          alpha=0.1)
     newlabels = np.concatenate((np.array(y_train), -np.ones(len(X_test))))
     clf.fit(np.concatenate((X_train, X_test)), newlabels)
     return clf.transduction_[-len(X_test):]
Example #17
def semi_supervised():
    features, labels = separate_cols_with_unknown(gtd)
    features = process_nontext(features)
    features = convertDType(features)
    model = LabelPropagation(kernel="knn")
    model2 = LabelSpreading(kernel="knn")
    model2.fit(features, labels)
    preds = cross_val_predict(model2, features, labels, cv=5)
    print('5 fold cross val accuracy of model: %0.2f ' %
          accuracy_score(labels, preds))
Example #18
def test_LabelSpreading(*data):
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    clf = LabelSpreading(max_iter=1000, kernel='knn', gamma=0.1)  # gamma is ignored by the knn kernel
    clf.fit(X, y_train)
    true_labels = y[unlabeled_indices]
    predicted_labels = clf.transduction_[unlabeled_indices]
    print('Accuracy : %f' % clf.score(X[unlabeled_indices], true_labels))
    print('Accuracy : %f' % metrics.accuracy_score(true_labels, predicted_labels))
Example #19
 def label_spreading(self, X_train, y, X_test):
     clf = LabelSpreading()
     X = np.concatenate((X_train.todense(), X_test.todense()), axis=0)
     print("X shape now ", X.shape)
     print("Y shape now ", y.shape)
     clf.fit(X, y)
     final_labels = clf.predict(X_test)
     label_prob = clf.predict_proba(X_test)
     print(compare_labels_probabilities().compare(label_prob, final_labels))
     return final_labels, clf
Example #20
def knn(X, labels):
    # #############################################################################
    # Learn with LabelSpreading
    label_spread = LabelSpreading(kernel='knn', alpha=0.6, max_iter=100)
    label_spread.fit(X, labels)

    # #############################################################################
    # Plot output labels
    output_labels = label_spread.transduction_

    return output_labels
Example #21
def propagate_labels(
    features,
    labels,
):
    label_prop_model = LabelSpreading(kernel=construct_graph, n_jobs=-1)
    label_prop_model.fit(features, labels)
    logger.debug(label_prop_model.classes_)
    # preds = label_prop_model.predict(features)
    preds = label_prop_model.predict_proba(features)
    # logger.debug(label_prop_model.classes_)

    return preds
Example #22
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    #######################
    ### SEMI-SUPERVISED ###
    ########################
    # Label Propagation
    label_prop_model = LabelPropagation(kernel='knn')
    label_prop_model.fit(trainX, trainY)
    Y_Pred = label_prop_model.predict(testX)
    acc_ss_propagation, acc_ss_propagation_INFO = check_accuracy(testY, Y_Pred)
    # Label Spreading
    label_prop_models_spr = LabelSpreading(kernel='knn')
    label_prop_models_spr.fit(trainX, trainY)
    Y_Pred = label_prop_models_spr.predict(testX)
    acc_ss_spreading, acc_ss_spreading_INFO = check_accuracy(testY, Y_Pred)
    ########################
    #### WITHOUT TL ########
    ########################
    # LogisticRegression 
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    predLR = modelLR.predict(testX)
    accLR, acc_LR_INFO = check_accuracy(testY, predLR)
    # DecisionTreeClassifier
    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    predDT = modelDT.predict(testX)
    accDT, acc_DT_INFO = check_accuracy(testY, predDT)
    # BernoulliNB
    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    predND = modelNB.predict(testX)
    accNB, acc_NB_INFO = check_accuracy(testY, predND)
    #
    return pd.DataFrame(
        [{ 
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,

        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO':acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO':acc_ss_spreading_INFO,
        'acc_LR':accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO)       

        }]
    )
Example #23
 def _semi_supervised_learning(self, data_matrix, target):
     n_classes = len(set(target))
     # bail out and return the original target unless there are at least
     # two real classes besides the -1 (unlabeled) marker; one cannot
     # meaningfully spread the information of only one class
     if n_classes > 2:
         semi_supervised_estimator = LabelSpreading(kernel='knn', n_neighbors=self.n_neighbors)
         semi_supervised_estimator.fit(data_matrix, target)
         predicted_target = semi_supervised_estimator.predict(data_matrix)
         predicted_target = self._clamp(target, predicted_target)
         return predicted_target
     else:
         return target
Example #24
 def _semi_supervised_learning(self, data_matrix, target):
     n_classes = len(set(target))
     # bail out and return the original target unless there are at least
     # two real classes besides the -1 (unlabeled) marker; one cannot
     # meaningfully spread the information of only one class
     if n_classes > 2:
         semi_supervised_estimator = LabelSpreading(
             kernel='knn', n_neighbors=self.n_neighbors)
         semi_supervised_estimator.fit(data_matrix, target)
         predicted_target = semi_supervised_estimator.predict(data_matrix)
         predicted_target = self._clamp(target, predicted_target)
         return predicted_target
     else:
         return target
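Both variants above rely on a _clamp helper that is not shown; a plausible stand-in (written as a free function here) that restores every given label and keeps predictions only for the unlabeled rows:

import numpy as np

def _clamp(target, predicted_target):
    target = np.asarray(target)
    predicted = np.asarray(predicted_target).copy()
    labeled = target != -1
    predicted[labeled] = target[labeled]  # never overwrite a known label
    return predicted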
Example #25
def LabelPropagation(support, support_ys, query):
    alpha = 0.3
    k_neighbours = 38
    all_embeddings = np.concatenate((support, query), axis=0)
    #X = all_embeddings.cpu().detach().numpy()
    labels = np.full(all_embeddings.shape[0], -1.)
    labels[:support.shape[0]] = support_ys
    label_propagation = LabelSpreading(kernel='knn',
                                       alpha=alpha,
                                       n_neighbors=k_neighbours,
                                       tol=0.000001)
    label_propagation.fit(all_embeddings, labels)
    predicted_labels = label_propagation.transduction_
    query_prop = predicted_labels[support.shape[0]:]
    return query_prop
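A hedged call sketch for the few-shot helper above, with random embeddings (note the function name shadows sklearn.semi_supervised.LabelPropagation). A 5-way, 5-shot support set plus 75 queries keeps n_neighbors=38 below the sample count:

import numpy as np

rng = np.random.RandomState(0)
support = rng.rand(25, 64)               # 5 classes x 5 shots
support_ys = np.repeat(np.arange(5), 5)
query = rng.rand(75, 64)

query_pred = LabelPropagation(support, support_ys, query)
print(query_pred.shape)  # (75,)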
Example #26
class SemiSupervised(BaselineModel):
    """
    LabelSpreading Implementation
    """
    def fit(self):
        #Need to concatenate labeled and unlabeled data
        #unlabeled data labels are set to -1
        X = np.concatenate(
            (self.val_primitive_matrix, self.train_primitive_matrix))
        val_labels = (self.val_ground + 1) / 2.  # map labels from {-1, +1} to {0, 1}
        train_labels = -1. * np.ones(np.shape(self.train_primitive_matrix)[0])
        y = np.concatenate((val_labels, train_labels))

        self.model = LabelSpreading(kernel='knn')
        self.model.fit(X, y)
Example #27
def computeSimilarities2(vect, matrix_values, numLine, numRan=10):
    """Build the model with the semi-supervised approach LabelSpreading.

    Args:
      matrix_values: descriptor matrix, i.e. the probabilities from all IR models
      vect: the answer vector (0 for false links, 1 for true links)
      numLine: number of pairs of artefacts
      numRan: number of random subsamples to draw
    Returns:
      preds: probability that a pair of artefacts is linked
    """
    #number of iterations

    allPrediction = []
    model = LabelSpreading()

    #compute multiple (10) random vector of the matrix_values
    for i in range(0, numRan):
        subVect, subMatrix_values = computeRandom(vect, matrix_values, numLine)
        #compute the prediction function of each random vector
        computeModel = model.fit(subMatrix_values, subVect)
        print("new predicted function computed")
        #compute the prediction of each pair of artefact with the random model
        preds0 = computeModel.predict_proba(matrix_values)
        allPrediction.append(preds0[:, 1])

    # by the "vote majoritaire"
    preds = vote(allPrediction, len(vect), numRan)

    print(preds)

    return preds
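The vote helper (the "vote majoritaire") used above is not shown; a plausible stand-in that averages the per-run probabilities:

import numpy as np

def vote(all_predictions, n_pairs, n_runs):
    preds = np.zeros(n_pairs)
    for run_preds in all_predictions:
        preds += run_preds
    return preds / n_runs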
Example #28
def semi_supervised_learning(data_matrix, target):
    if -1 in list(target):
        # if -1 is present in target do label spreading
        from sklearn.semi_supervised import LabelSpreading
        label_prop_model = LabelSpreading(kernel='knn', n_neighbors=5)
        label_prop_model.fit(data_matrix, target)
        pred_target = label_prop_model.predict(data_matrix)
        extended_target = []
        for pred_label, label in zip(pred_target, target):
            if label != -1 and pred_label != label:
                extended_target.append(label)
            else:
                extended_target.append(pred_label)
    else:
        extended_target = target
    return np.array(extended_target)
Example #29
    def test_label_spreading_algorithms():
        """
        Compare scikit's algorithm and our algorithm
        """
        x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

        # scikit takes different input than our algorithm
        y_sklearn = np.array([1, 2, -1, -1])
        y_custom = np.array([[1, 0], [0, 1], [0, 0], [0, 0]])

        # scikit's algorithm
        alpha = 0.2
        max_iter = 30
        tol = 1e-3
        label_spreading = LabelSpreadingSKLearn(kernel="rbf",
                                                max_iter=max_iter,
                                                alpha=alpha, tol=tol)
        model = label_spreading.fit(x, y_sklearn)
        expected = model.predict(x)

        # our algorithm
        w = distance_matrix(x, measure=rbf_distance)
        ls = LabelSpreadingCustom(alpha=alpha, max_iter=max_iter, tol=tol)
        ls = ls.fit(w, y_custom)
        actual = ls.predict(y_custom)
        actual = np.array(actual) + 1  # add plus 1 to every prediction
        assert_array_equal(actual, expected)
Example #30
def semi_supervised_learning(data_matrix, target):
    if -1 in list(target):
        # if -1 is present in target do label spreading
        from sklearn.semi_supervised import LabelSpreading
        label_prop_model = LabelSpreading(kernel='knn', n_neighbors=5)
        label_prop_model.fit(data_matrix, target)
        pred_target = label_prop_model.predict(data_matrix)
        extended_target = []
        for pred_label, label in zip(pred_target, target):
            if label != -1 and pred_label != label:
                extended_target.append(label)
            else:
                extended_target.append(pred_label)
    else:
        extended_target = target
    return np.array(extended_target)
Example #31
    def augment_instances(self, X_train, y_train):

        if self.args.num_unlabeled == 0:
            return X_train, y_train

        X_unlabeled = self.dataset.X_train_unlabeled
        y_unlabeled = self.dataset.y_train_unlabeled

        X_unlabeled = X_unlabeled.values
        y_unlabeled = y_unlabeled.values

        X_train_text = X_train[:, self.args.text_col]
        self.fit_text(X_train_text, y_train)
        X_train_rep = self.transform_text(X_train_text)
        X_train_rep = self.augment_features(X_train_rep, X_train)

        chunk_size = 1000
        num_instances = X_unlabeled.shape[0]
        num_cols = y_train.shape[1]
        for row in tqdm(range(0, self.args.num_unlabeled, chunk_size),
                        desc='spreading labels in rows',
                        total=int(self.args.num_unlabeled / chunk_size)):
            end_row = row + chunk_size
            end_row = np.minimum(end_row, num_instances)
            for col in tqdm(range(num_cols),
                            desc='spreading labels in cols',
                            leave=False):

                X_unlabeled_rep = self.transform_text(
                    X_unlabeled[row:end_row, self.args.text_col])
                X_unlabeled_rep = self.augment_features(
                    X_unlabeled_rep, X_unlabeled[row:end_row, :])

                X_spread = np.append(X_train_rep, X_unlabeled_rep, axis=0)
                y_spread = np.append(y_train[:, col],
                                     y_unlabeled[row:end_row, col],
                                     axis=0)

                labeling = LabelSpreading()
                labeling.fit(X_spread, y_spread)
                y_unlabeled[row:end_row,
                            col] = labeling.predict(X_unlabeled_rep)

        X_train = np.append(X_train, X_unlabeled[:row + chunk_size], axis=0)
        y_train = np.append(y_train, y_unlabeled[:row + chunk_size], axis=0)
        return X_train, y_train
Example #32
def runLabelSpreading(data, assignment):
    lp_model = LabelSpreading(kernel='knn', n_neighbors=10)
    labels = [-1] * len(data)
    for x, y in assignment:
        labels[x - 1] = y
    labels = np.array(labels)
    lp_model.fit(data, labels)
    pred = lp_model.transduction_

    result = []
    d = {}
    for i in range(6000, len(pred)):
        c = d.setdefault(int(pred[i]), 0)
        d[int(pred[i])] = c + 1
        result.append([i + 1, int(pred[i])])
    print(d)
    return result
Example #33
class LabelSpreadingImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
Example #34
    def objective(self, x):
        """
        Objective function for hyper-parameter selection/evaluation

        Parameters
        ----------
        x : hyper-parameter under test - gamma

        Returns
        -------
        float
            a measure directly proportional to entropy
        """
        model = LabelSpreading(kernel=self.kernel, alpha=self.alpha,
                               gamma=x)
        model.fit(self.x, self.y)
        label_prob = model.label_distributions_
        return get_average_label_entropy(label_prob) + self.learning_rate * x ** 2
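get_average_label_entropy is not defined in the snippet above; a plausible stand-in computing the mean Shannon entropy of the per-sample label distributions:

import numpy as np

def get_average_label_entropy(label_distributions, eps=1e-12):
    p = np.clip(label_distributions, eps, 1.0)
    return -np.mean(np.sum(p * np.log(p), axis=1))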
Example #35
def train_model(nodes, datasets):
    y = np.array(range(len(nodes)))
    nodes = list(nodes)
    vectorizer = DictVectorizer(sparse=True)
    for i, dataset in enumerate(datasets):
        g = compute_dataset(dataset)
        nodes.extend(g.classes)
        sys.stdout.write('\r')
        sys.stdout.write(str(i + 1))
        sys.stdout.flush()
    X = vectorizer.fit_transform([dict(node.concept_vector) for node in nodes])
    y = np.concatenate([y, [-1] * (len(nodes) - len(y))])

    unlabeled = []
    model = LabelSpreading()
    model.fit(X, y)
    model.vectorizer = vectorizer
    return model
Example #36
def train_model(nodes, datasets):
    y = np.array(range(len(nodes)))
    nodes = list(nodes)
    vectorizer = DictVectorizer(sparse=True)
    for i, dataset in enumerate(datasets):
        g = compute_dataset(dataset)
        nodes.extend(g.classes)
        sys.stdout.write('\r')
        sys.stdout.write(str(i+1))
        sys.stdout.flush()
    X = vectorizer.fit_transform([dict(node.concept_vector) for node in nodes])
    y = np.concatenate([y, [-1] * (len(nodes) - len(y))])

    unlabeled = []
    model = LabelSpreading()
    model.fit(X, y)
    model.vectorizer = vectorizer
    return model
Example #37
def propogation(model, uids, labeled_ids):
    X, y1, y2 = [], [], []
    pool = []
    for uid in labeled_ids:
        X.append(model.docvecs[uid])
        y1.append(1)
    for uid in uids:
        if uid not in labeled_ids:
            X.append(model.docvecs[uid])
            y2.append(-1)
    label_prop_model = LabelSpreading(kernel='knn', alpha=1.0)  # note: recent scikit-learn requires 0 < alpha < 1
    y2 = np.array(y2)
    y2[0:(len(y1)-1)] = 0
    print(len(y1) + len(y2))
    for i in range(5):
        np.random.shuffle(y2)
        label_prop_model.fit(X, y1 + y2.tolist())
        pool.append(label_prop_model.transduction_)
    pickle.dump(pool, open('data/propagation.pick', 'wb'))
    pool = pickle.load(open('data/propagation.pick', 'rb'))
    pool = np.array(pool)
    for column in pool.T:
        print(column)
Example #38
"""
(truncated output: transduction labels printed from the propagation pool; the
X and y used below apparently have shapes (500, 100) and (500,))
"""

from sklearn.semi_supervised import LabelSpreading

label_propagation_model = LabelSpreading()
label_propagation_model.fit(X, y)

# make predictions for first twenty samples (some will be known, some unknown)
for i in range(20):
    print('y: ', y[i], '\t', 'y_hat: ', label_propagation_model.predict(X[i].reshape(1, -1)))
"""
y:  6.0     y_hat:  [6.]
y:  6.0     y_hat:  [6.]
y:  2.0     y_hat:  [2.]
y:  1.0     y_hat:  [1.]
y:  -1.0    y_hat:  [6.]    *
y:  2.0     y_hat:  [2.]
y:  6.0     y_hat:  [6.]
y:  4.0     y_hat:  [4.]
y:  3.0     y_hat:  [3.]
y:  5.0     y_hat:  [5.]
y:  6.0     y_hat:  [6.]
"""
Example #39
def main():
    usage = "usage prog [options] arg"
    parser = OptionParser(usage=usage)

    parser.add_option("-t", "--task", dest="task", help="the task name")
    parser.add_option("-o", "--output", dest="output", help="the output file")

    (options, remainder) = parser.parse_args()

    train_paths = [
        "../data/train_simple_feature.csv",
        "../data/train_plus_feature.csv",
        "../data/train_azure_plus_feature.csv",
        #"../data/train_azure_feature.csv",
        #"../data/train_module_feature.csv",
        #"../data/train_course_feature.csv",
        #"./blend_train.csv"
        ]
    label_path = "../data/truth_train.csv"

    test_paths = [
        "../data/test_simple_feature.csv",
        "../data/test_plus_feature.csv",
        "../data/test_azure_plus_feature.csv",
        #"../data/test_azure_feature.csv",
        #"../data/test_module_feature.csv",
        #"../data/test_course_feature.csv",
        #"./blend_test.csv"
        ]

    train = merge_features(train_paths, label_path)
    train = train.drop(['user_drop_ratio'], axis=1)
    #train['user_drop_ratio'] = (train['user_drop_ratio'] + 8.0 / train['user_courses']) / (1.0 + 10.0 / train['user_courses'])
    y = encode_labels(train.dropout.values)
    train = train.drop('dropout', axis=1)
    tr_ids = train.enrollment_id.values
    X = train.drop('enrollment_id', axis=1)
    m, n = X.shape
    print('train.shape=%s' % str(X.shape))


    test = merge_features(test_paths)
    test = test.drop(['user_drop_ratio'], axis=1)
    #test['user_drop_ratio'] = (test['user_drop_ratio'] + 8.0 / test['user_courses']) / (1.0 + 10.0 / test['user_courses'])
    tt_ids = test.enrollment_id.values
    X_test = test.drop('enrollment_id', axis=1)
    print('test.shape=%s' % str(X_test.shape))

    scaler = StandardScaler().fit(np.vstack((X, X_test)))

    task = options.task
    if not task:
        task = "blend"

    if task == 'blend':

        clf_list = [
            #("knn_p2_10", create_clf('knn', {"n_neighbors": 10, "p": 2})),
            #("knn_p2_10_scaler", create_clf('knn', {"n_neighbors": 10, "p": 2, "scaler": scaler})),
            #("knn_p2_100", create_clf('knn', {"n_neighbors": 100, "p": 2})),
            #("knn_p2_100_scaler", create_clf('knn', {"n_neighbors": 100, "p": 2, "scaler": scaler})),
            #("knn_p2_500", create_clf('knn', {"n_neighbors": 500, "p": 2})),
            #("knn_p2_500_scaler", create_clf('knn', {"n_neighbors": 500, "p": 2, "scaler": scaler})),
            #("knn_p2_100", create_clf('knn', {"n_neighbors": 100, "p": 2})),
            #("knn_p2_800", create_clf('knn', {"n_neighbors": 800, "p": 2})),
            #("knn_p1_10", create_clf('knn', {"n_neighbors": 10, "p": 1})),
            #("knn_p1_10_scaler", create_clf('knn', {"n_neighbors": 10, "p": 1, "scaler": scaler})),
            #("knn_p1_100", create_clf('knn', {"n_neighbors": 100, "p": 1})),
            #("knn_p1_100_scaler", create_clf('knn', {"n_neighbors": 100, "p": 1, "scaler": scaler})),
            #("knn_p1_500", create_clf('knn', {"n_neighbors": 500, "p": 1})),
            #("knn_p1_500_scaler", create_clf('knn', {"n_neighbors": 500, "p": 1, "scaler": scaler})),
            #("knn_p1_800", create_clf('knn', {"n_neighbors": 800, "p": 1})),
            #("knn_p1_800_scaler", create_clf('knn', {"n_neighbors": 800, "p": 1, "scaler": scaler})),
            #("extra_gini_10depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 10})),
            #("extra_entropy_10depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 10})),
            #("extra_gini_20depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 20})),
            #("extra_entropy_20depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 20})),
            ("extra_gini_30depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 30})),
            ("extra_entropy_30depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 30})),
            #("rfc_gini_3depth", create_clf("rfc", {"criterion": "gini", "max_depth": 3, "n_estimators": 200})),
            #("rfc_entropy_3depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 3, "n_estimators": 200})),
            ("rfc_gini_5depth", create_clf("rfc", {"criterion": "gini", "max_depth": 5, "n_estimators": 200})),
            ("rfc_entropy_5depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 5, "n_estimators": 200})),
            ("rfc_gini_6depth", create_clf("rfc", {"criterion": "gini", "max_depth": 6, "n_estimators": 200})),
            ("rfc_entropy_6depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 6, "n_estimators": 200})),
            ("rfc_gini_8depth", create_clf("rfc", {"criterion": "gini", "max_depth": 8, "n_estimators": 200})),
            ("rfc_entropy_8depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 8, "n_estimators": 200})),
            ("rfc_gini_10depth", create_clf("rfc", {"criterion": "gini", "max_depth": 10, "n_estimators": 200})),
            ("rfc_entropy_10depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 200})),
            ("rfc_gini_12depth", create_clf("rfc", {"criterion": "gini", "max_depth": 12, "n_estimators": 200})),
            ("rfc_entropy_12depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 12, "n_estimators": 200})),
            #("xgb_1500_2depth", create_clf("xgb", {"max_depth": 2, "n_estimators": 1500, "learning_rate": 0.03})),
            #("xgb_600_3depth", create_clf("xgb", {"max_depth": 3, "n_estimators": 600, "learning_rate": 0.03})),
            #("xgb_600_4depth", create_clf("xgb", {"max_depth": 4, "n_estimators": 600, "learning_rate": 0.03})),
            ("xgb_600_5depth", create_clf("xgb", {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03})),
            #("xgb_600_6depth", create_clf("xgb", {"max_depth": 6, "n_estimators": 600, "learning_rate": 0.02})),
            #("xgb_600_7depth", create_clf("xgb", {"max_depth": 7, "n_estimators": 600, "learning_rate": 0.01})),
            #("xgb_600_8depth", create_clf("xgb", {"max_depth": 8, "n_estimators": 600, "learning_rate": 0.01})),
            #("lgc_1c_scale", create_clf("lgc", {"C": 1.0, "scaler": scaler})),
            #("lgc_1c", create_clf("lgc", {"C": 1.0})),
            #("lgc_1c_l1", create_clf("lgc", {"C": 1.0, "penalty": "l1"})),
            #("lgc_3c_scale", create_clf("lgc", {"C": 3.0, "scaler": scaler})),
            #("lgc_3c", create_clf("lgc", {"C": 3.0})),
            ("lgc_3c_l1", create_clf("lgc", {"C": 3.0, "penalty": "l1"})),
            #("lgc_5c_scale", create_clf("lgc", {"C": 5.0, "scaler": scaler})),
            #("lgc_5c", create_clf("lgc", {"C": 5.0})),
            ]

        X = X.values
        blend_train, blend_test = train_blend(X, y, X_test, clf_list, 5)
        print('blend_train.shape=%s' % str(blend_train.shape))
        print('blend_test.shape=%s' % str(blend_test.shape))


        cols = [cname for cname, clf in clf_list]
        cols = ['enrollment_id'] + cols
        blend_train_ids = np.hstack((np.matrix(tr_ids).T, blend_train))
        blend_test_ids = np.hstack((np.matrix(tt_ids).T, blend_test))
        dump_data(blend_train_ids, cols, "new_blend_train.csv")
        dump_data(blend_test_ids, cols, "new_blend_test.csv")


        blender = create_clf('lgc', {"C": 1.0, "penalty": "l1"})
        auc = cv_loop(blend_train, y, blender)
        print('AUC (LGC blend): %f' % auc)

        blender = create_clf('ext', {"max_depth": 10, "criterion": "entropy", "n_estimators": 100})
        auc = cv_loop(blend_train, y, blender)
        print('AUC (EXT blend): %f' % auc)

        blender = create_clf('xgb', {'max_depth': 2, "n_estimators": 150, "learning_rate": 0.05})
        auc = cv_loop(blend_train, y, blender)
        print "AUC (XGB blend {d: %d, n: %d}): %f" % (2, 150, auc)
        blender = create_clf('xgb', {'max_depth': 3, "n_estimators": 200, "learning_rate": 0.05})
        auc = cv_loop(blend_train, y, blender)
        print('AUC (XGB blend {d: %d, n: %d}): %f' % (3, 200, auc))

        blender = create_clf('xgb', {'max_depth': 3, "n_estimators": 100, "learning_rate": 0.1})
        blender = blender.fit(blend_train, y)
        preds = blender.predict_proba(blend_test)[:,1]
        write_submission(tt_ids, preds, "new_blend_submission.csv")

        combined_train = np.hstack((X, blend_train))
        combined_test = np.hstack((X_test, blend_test))
        blender = create_clf('xgb', {'max_depth': 5, "n_estimators": 600, "learning_rate": 0.03})
        blender = blender.fit(combined_train, y)
        preds = blender.predict_proba(combined_test)[:,1]
        write_submission(tt_ids, preds, "new_combined_blend_submission.csv")

    elif task == 'lgc':
        print('Try logistic regression ..')
        clf = create_clf("lgc", {"C": 3, "scaler": scaler, "penalty": "l1"})
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == "ext":
        print('Try ExtraTreeClassifier')
        #clf = create_clf("ext", {"max_depth": 10}) # 0.86261
        #clf = create_clf("ext", {"max_depth": 20}) # 0.862636
        #clf = create_clf("ext", {"max_depth": 30}) # 0.860944
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 10}) # 0.862610
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20}) # 0.862564
        clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 100}) # 0.861795
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 2000}) # 0.862695
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 30, "n_estimators": 2000}) # 0.860
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == 'rfc':
        print('Try RFC ..')
        #clf = create_clf('rfc', {'max_depth': 5}) # 0.859583
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10}) # 0.863369
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 200}) # 0.863285
        # clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 100}) # 0.863207
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "max_features": None, "n_estimators": 200}) # 0.863341
        clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 40, "max_features": None, "n_estimators": 10000, "min_samples_split": 100}) # 0.863291
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == 'knn':
        clf = create_clf('knn', {"n_neighbors": 800, "p": 2, "scaler": scaler})
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == "gbt":
        paras = json.load(open('paras/gbt.json', 'r'))
        clf = create_clf("gbt", paras)
        clf = clf.fit(X, y)
        preds = clf.predict_proba(X_test)[:,1]
        write_submission(tt_ids, preds, "gbt_submission.csv")

    elif task == "xgb":
        #clf = create_clf('xgb', {"max_depth": 2, "n_estimators": 1500, "learning_rate": 0.03}) # 0.860279
        #clf = create_clf('xgb', {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03}) # public: 0.8891443712867697;
        clf = create_clf('xgb', {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03}) # public:
        auc = cv_loop(X, y, clf, 5)
        print "AUC (all): %f" % auc
        #sys.exit()
        clf = clf.fit(X, y)
        preds = clf.predict_proba(X_test)[:,1]
        write_submission(tt_ids, preds, 'xgb_new_submission.csv')
    elif task == "deep":
        clf = create_clf('deep', {"neuro_num": 512, "nb_epoch": 20, "scaler": scaler, "optimizer": "adadelta"})
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)
        sys.exit(0)

        clf = clf.fit(X, y)
        preds = clf.predict_proba(X_test)[:,1]
        write_submission(tt_ids, preds, 'deep_submission.csv')

    elif task == 'semi':
        clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 100}) # 0.861795
        train_semi(X, y, X_test, clf, 5)

    elif task == 'gbc':
        from sklearn.ensemble import GradientBoostingClassifier
        clf = GradientBoostingClassifier(n_estimators=300,
                                        learning_rate=0.1,
                                        min_samples_split=50,
                                        min_samples_leaf=50,
                                        max_depth=10,
                                        subsample=0.6,
                                        max_features='log2',
                                        verbose=1)
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == 'label':
        from sklearn.semi_supervised import LabelPropagation
        from sklearn.semi_supervised import LabelSpreading
        label_prop_model = LabelSpreading()
        all_X = np.vstack((X, X_test))
        tm, tn = X_test.shape
        unlabeled = [-1] * tm
        ys = [list(y)]
        ys.append(unlabeled)
        labels = np.concatenate(ys)
        print('ALL shape=%s' % str(all_X.shape))
        print('ALL y shape=%s' % str(labels.shape))
        label_prop_model.fit(all_X, labels)
Example #40
# (truncated: the snippet opens mid-way through printed iris label output)
print(cutdown_labels)
'''
[ 0  0  0  0 -1 -1 -1  0  0  0 -1  0  0 -1 -1 -1  0  0  0 -1  0 -1 -1  0  0
  0 -1  0  0 -1  0 -1 -1  0  0  0  0 -1  0  0 -1  0 -1  0 -1  0  0  0  0 -1
  1  1  1  1  1  1 -1 -1 -1  1  1 -1  1  1 -1  1 -1  1 -1  1  1 -1 -1  1  1
  1  1 -1  1 -1  1  1  1 -1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1 -1 -1
 -1  2  2  2  2 -1  2  2 -1 -1 -1 -1  2  2  2  2  2 -1  2  2  2  2  2 -1 -1
  2  2  2 -1  2  2 -1 -1  2  2  2  2  2  2  2  2 -1  2  2 -1 -1  2  2 -1 -1]
'''

# fit LabelSpreading model
label_propagation_model.fit(iris['data'], cutdown_labels)

# quick test
print('y: ', full_labels[-1])
print('y_hat: ', label_propagation_model.predict(iris['data'][-1].reshape(1, -1)))
'''
y:  2
y_hat:  [2]
'''

# overall accuracy
correct = 0.0
for i in range(len(iris['data'])):
    if label_propagation_model.predict(iris['data'][i].reshape(1, -1))[0] == full_labels[i]:
        correct += 1
# assumed completion -- the original snippet is cut off after the loop
print('accuracy: ', correct / len(iris['data']))
Example #41
rate = 1

train_comb = train
# select 10% of test data
selected_test = test.sample(test.shape[0] // rate, replace=False, random_state=20422438)
train_comb = train_comb.append(selected_test)
train_comb_label = train_label
train_comb_unlabeled = pd.DataFrame(np.array([-1] * (test.shape[0] // rate)))
train_comb_label = np.array(train_comb_label.append(train_comb_unlabeled))
train_comb_label = train_comb_label.reshape(len(train_comb_label))


a_level = 1  # note: recent scikit-learn requires 0 < alpha < 1 for LabelSpreading
label_prop_model = LabelSpreading(kernel="knn", alpha=a_level)
label_prop_model.fit(train_comb, train_comb_label)
pred_y = label_prop_model.transduction_
pred_y[:train.shape[0]] = train_label



X_train, X_test, y_train, y_test = train_test_split(label_prop_model.X_, pred_y, test_size=0.10, random_state=20422438)

model_erf = se.ExtraTreesClassifier(random_state=20422438,n_jobs=-1,n_estimators=1000)
model_erf.fit(X_train,y_train)

model_erf_pred = model_erf.predict(X_test)
model_erf_error = errFn(model_erf_pred,y_test)

print(a_level)
print(model_erf_error)