def test_label_spreading_algorithms():
        """
        Compare scikit's algorithm and our algorithm
        """
        x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

        # scikit-learn takes different input than our algorithm
        y_sklearn = np.array([1, 2, -1, -1])
        y_custom = np.array([[1, 0], [0, 1], [0, 0], [0, 0]])

        # scikit's algorithm
        alpha = 0.2
        max_iter = 30
        tol = 1e-3
        label_spreading = LabelSpreadingSKLearn(kernel="rbf",
                                                max_iter=max_iter,
                                                alpha=alpha, tol=tol)
        model = label_spreading.fit(x, y_sklearn)
        expected = model.predict(x)

        # our algorithm
        w = distance_matrix(x, measure=rbf_distance)
        ls = LabelSpreadingCustom(alpha=alpha, max_iter=max_iter, tol=tol)
        ls = ls.fit(w, y_custom)
        actual = ls.predict(y_custom)
        actual = np.array(actual) + 1  # shift to scikit-learn's 1-based labels
        assert_array_equal(actual, expected)
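The test relies on helpers that are not shown (distance_matrix, rbf_distance, and the LabelSpreadingSKLearn/LabelSpreadingCustom aliases). A minimal sketch of what the two helpers might look like, assuming a dense Gaussian affinity matrix; the source repo's versions may differ:

import numpy as np

def rbf_distance(a, b, gamma=20.0):
    # Gaussian (RBF) similarity between two sample vectors; gamma=20
    # mirrors scikit-learn's default for the rbf kernel
    return np.exp(-gamma * np.sum((a - b) ** 2))

def distance_matrix(x, measure):
    # dense pairwise affinity matrix w with w[i, j] = measure(x[i], x[j])
    n = x.shape[0]
    w = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            w[i, j] = measure(x[i], x[j])
    return w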
Example no. 2
def test_LabelSpreading_rbf(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))  # color set: one color per curve
    # train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelSpreading(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # configure the plot
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading rbf kernel")
    plt.show()
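The test_LabelSpreading_* functions here and below unpack a (x, y, unlabeled_indices) tuple prepared elsewhere. One plausible loader, sketched on scikit-learn's digits dataset (the original data preparation is not shown):

import numpy as np
from sklearn import datasets

def load_data():
    digits = datasets.load_digits()
    rng = np.random.RandomState(0)
    indices = np.arange(len(digits.data))
    rng.shuffle(indices)
    x, y = digits.data[indices], digits.target[indices]
    unlabeled_indices = np.arange(len(y))[::2]  # hide every other label
    return x, y, unlabeled_indices

# test_LabelSpreading_rbf(*load_data())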
Example no. 3
def test_LabelSpreading_knn(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))  # color set: one color per curve
    # train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelSpreading(max_iter=100, n_neighbors=K, alpha=alpha, kernel='knn')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # configure the plot
    ax.set_xlabel(r"k")
    ax.set_ylabel("score")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading knn kernel")
    plt.show()
Example no. 4
def soft_clamping(kernel, xTrain, yTrain, MI=10000, k=3, g=0.6, a=0.1):
    spread = LabelSpreading(kernel=kernel,
                            n_neighbors=k,
                            gamma=g,
                            alpha=a,
                            max_iter=MI,
                            n_jobs=-1)
    spread.fit(xTrain, yTrain)
    predY = spread.predict_proba(xTrain)

    norm_Y = normalize(yTrain, predY)
    labels = []
    for i in norm_Y:
        if i[0] > i[1]:
            labels.append(benign)
        elif i[0] < i[1]:
            labels.append(malware)
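        # note: exact ties (i[0] == i[1]) are appended to neither class,
        # which can leave labels shorter than yTrain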

    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, labels, yExpect, day_one)

    results = [
        'SC', kernel, k, g, a, lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1,
        missed_day1
    ]

    file_name = 'SC_CMN_5per_' + str(rate) + '.csv'
    write_csv(file_name, results)
Example no. 5
def semiLabelSpreding(feature_extractor, generator, val_generator, kernel,
                      neighbors, gamma, alpha):
    semi = LabelSpreading(kernel=kernel,
                          n_neighbors=neighbors,
                          gamma=gamma,
                          alpha=alpha,
                          tol=0.001,
                          max_iter=1000000)

    features = feature_extractor.predict_generator(generator,
                                                   steps=generator.samples /
                                                   generator.batch_size,
                                                   verbose=1)

    classes = generator.classes

    for i in range(0, generator.samples):
        if (generator.filenames[i][0] == 'N'):
            classes[i] = -1

    semi.fit(features, classes)

    val_features = feature_extractor.predict_generator(
        val_generator,
        steps=val_generator.samples / val_generator.batch_size,
        verbose=1)
    predicted_classes = semi.predict(val_features)

    return predicted_classes
Example no. 6
def computeSimilarities2(vect, matrix_values, numLine, numRan=10):
    """ build the model with the semi supervised approach labelSpreading     
    Args:
    matrix_values: descriptor matrix i.e. all the probability of all ir models
    vect: the answer vector (value 0 for false links and 1 for true links);
    numLine: number of pairs of artefacts
  Returns:
    preds: probability that a pair of artefact is linked
  """
    #number of iterations

    allPrediction = []
    model = LabelSpreading()

    #compute multiple (10) random vector of the matrix_values
    for i in range(0, numRan):
        subVect, subMatrix_values = computeRandom(vect, matrix_values, numLine)
        #compute the prediction function of each random vector
        computeModel = model.fit(subMatrix_values, subVect)
        print("new predicted function computed")
        #compute the prediction of each pair of artefact with the random model
        preds0 = computeModel.predict_proba(matrix_values)
        allPrediction.append(preds0[:, 1])

    # by the "vote majoritaire"
    preds = vote(allPrediction, len(vect), numRan)

    print(preds)

    return preds
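The vote helper used above is not shown. One plausible reading of the majority vote it implements, matching the usage in this example, sketched under that assumption (the repo's actual version may differ):

import numpy as np

def vote(all_prediction, num_line, num_ran):
    # stack the positive-class probabilities of the num_ran random models
    stacked = np.vstack(all_prediction)  # shape (num_ran, num_line)
    # majority vote: report the fraction of models that call each pair a link
    return (stacked > 0.5).mean(axis=0)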
Example no. 7

def label_spreading(X_train, y_train, Xunlabelled, X_test, y_test):
    #pca = randomized_PCA(X_train)
    #X_train, X_test, y_train, y_test = cross_validation.train_test_split(tr_images, tr_labels, test_size=0.3)
    #X = pca.transform(X)
    #val_images = pca.transform(val_images)
    #y= y[:]

    X_train = X_train[:, :]
    y_train = y_train[:]
    Xunlabelled = Xunlabelled[:10000,:]

    #import ipdb; ipdb.set_trace()

    X_both = np.vstack((X_train, Xunlabelled))

    y_both = np.append(y_train, -np.ones((Xunlabelled.shape[0],)))


    label_prop_model = LabelSpreading(max_iter=100)
    #random_unlabeled_points = np.where(np.random.random_integers(0, 1, size=len(y_train)))
    #labels = np.copy(y_train)
    #labels[random_unlabeled_points] = -1
    label_prop_model.fit(np.copy(X_both), np.copy(y_both))
    y_pred = label_prop_model.predict(np.copy(X_both))
    print(y_pred)
Example no. 8
def testLabelPropagation():
    from sklearn.semi_supervised import LabelSpreading
    from sklearn import preprocessing
    label_enc = preprocessing.LabelEncoder()

    label_prop_model = LabelSpreading(kernel='knn')
    train_iter = getDocumentIterator1("published = 0 and is_test = 1")
    validation_iter = getDocumentIterator1("published = 1 and is_test = 1")
    semantic_model = gensim_tests.SemanticModel.load(
        'gensim/full_corpus_300000')
    all_profiles, labels = [], []
    propagation_labels = []
    for doc in train_iter:
        all_profiles.append(semantic_model.inferProfile(doc.rawtext))
        labels.append(doc.learned_category[0])
        propagation_labels.append(doc.learned_category[0])

    label_enc.fit(propagation_labels)
    propagation_labels = label_enc.transform(propagation_labels).tolist()

    for doc in validation_iter:
        all_profiles.append(semantic_model.inferProfile(doc.rawtext))
        labels.append(doc.learned_category[0])
        propagation_labels.append(-1)
    print(propagation_labels)
    print("Fitting")
    label_prop_model.fit(all_profiles, propagation_labels)
    output_labels = label_prop_model.transduction_
    for propagated, orig in zip(label_enc.inverse_transform(output_labels),
                                labels):
        print(propagated, orig)
Example no. 9
    def predict_ssl(self, x_sup, y_sup, x_unsup, y_unsup, x_test, y_test):

        ls_model = LabelSpreading(kernel='knn', n_neighbors=5)
        indices = np.arange(self.train_size)
        unlabeled_indices = indices[x_sup.shape[0]:]
        y_sup_unsup = np.concatenate([y_sup, y_unsup])
        y_sup_unsup_train = np.copy(y_sup_unsup)
        y_sup_unsup_train[unlabeled_indices] = -1

        x_fit = np.concatenate([x_sup, x_unsup], axis=0)
        h_fit = self.model_e.predict(x_fit)
        h_fit = np.reshape(h_fit,
                           (h_fit.shape[0], h_fit.shape[1] * h_fit.shape[2]))
        ls_model.fit(h_fit, y_sup_unsup_train)
        y_unsup_pred = ls_model.transduction_[unlabeled_indices]

        #print("LabelSpread Accuracy is ", accuracy_score(y_unsup, y_unsup_pred))

        h_test = self.model_e.predict(x_test)
        h_test = np.reshape(
            h_test, (h_test.shape[0], h_test.shape[1] * h_test.shape[2]))

        #SVM
        clf_svc = svm.SVC(kernel='linear')
        y_fit_true = ls_model.transduction_
        clf_svc.fit(h_fit, y_fit_true)
        acc_svm = accuracy_score(y_test, clf_svc.predict(h_test))

        clf_svc = svm.LinearSVC()
        clf_svc.fit(h_fit, y_fit_true)
        acc_svm_linear = accuracy_score(y_test, clf_svc.predict(h_test))
        print('acc_svm is ', max(acc_svm, acc_svm_linear))
Example no. 10
    def source_to_target_label_prop(self,
                                    train_feat_space='embeds',
                                    kernel_param={
                                        'type': 'rbf',
                                        'gamma': 20
                                    }):
        print(
            '-----------------------------------------------------------------------'
        )
        print('Propagating labels from source to target in {0} space'.format(
            train_feat_space))
        if train_feat_space == 'encoded':
            if not hasattr(self, 'source_encoded_reps'):
                self.dim_red_autoencode()
            concat_embs = np.concatenate(
                (self.source_encoded_reps, self.target_encoded_reps))
        elif train_feat_space == 'embeds':
            concat_embs = np.concatenate(
                (self.source_embds_vec, self.target_embds_vec))
        elif train_feat_space == 'embeds_tsne':
            if self.tsne_computed == 0:
                self.compute_tsne()
            feat_cols = []
            for idx in range(self.n_tsne_components):
                feat_cols.append('embeds_tsne_' + str(idx))
            source_data_feats = self.source_data[feat_cols].as_matrix()
            target_data_feats = self.target_data[feat_cols].as_matrix()
            concat_embs = np.concatenate(
                (source_data_feats, target_data_feats))
        else:
            raise NotImplementedError
        unknown_labels = np.ones_like(self.target_labels) * -1
        label_prop_train_labels = np.concatenate(
            (self.source_labels, unknown_labels))
        lp_model = LabelSpreading()
        lp_model.fit(concat_embs, label_prop_train_labels)
        transduction_labels = lp_model.transduction_
        label_distributions = lp_model.label_distributions_

        print(label_distributions[0:10, :])
        self.source_data[
            train_feat_space +
            'Space_prop_pred'] = transduction_labels[:self.n_source]
        self.target_data[
            train_feat_space +
            'Space_prop_pred'] = transduction_labels[self.n_source:]
        # self.source_data[train_feat_space+'label_prop_groups'] = label_distributions[:self.n_source]
        # self.target_data[train_feat_space + 'label_prop_groups'] = label_distributions[self.n_source:]

        # self.embds_space_grouping.append(train_feat_space + 'label_prop_groups')
        # self.embds_space_classifiers.append(train_feat_space+'Space_prop_pred')
        if self.inter_save:
            print('Saving propagated labels')
            self.save_perforamance(self.serving_dir, suffix=self.save_suffix)

        print('Completed source to target label propagation in {0} space'.format(
            train_feat_space))
        print(
            '-----------------------------------------------------------------------'
        )
Example no. 11
class LabelSpreadingModel(SupervisedW2VModel):
    def fit_with_test(self, test_data):
        xs, ys = [], []
        self.ans_mapping = []
        for ans, cvs in self.context_vectors.items():
            xs.extend(cvs)
            if ans not in self.ans_mapping:
                y = len(self.ans_mapping)
                self.ans_mapping.append(ans)
            else:
                y = self.ans_mapping.index(ans)
            ys.extend(y for _ in cvs)
        for ctx in test_data:
            xs.append(self.cv(ctx))
            ys.append(-1)  # unlabeled
        self.ls_clf = LabelSpreading(kernel='knn', n_neighbors=11)
        self.ls_clf.fit(xs, ys)

    def __call__(self, x, ans=None, with_confidence=False):
        v = self.cv(x)
        probs = self.ls_clf.predict_proba([v])[0]
        pred = probs.argmax()
        m_ans = self.ans_mapping[pred]
        # TODO - get confidence as difference between probs[pred] and next
        return (m_ans, 0.0) if with_confidence else m_ans
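The TODO above can be resolved by taking the gap between the two largest probabilities. A minimal sketch of such a helper (hypothetical, not part of the original class):

import numpy as np

def prediction_margin(probs):
    # confidence as the margin between the best and second-best class
    # probabilities; a value near 0 means the model is torn between two answers
    top2 = np.sort(probs)[-2:]
    return float(top2[1] - top2[0])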
Example no. 12
    def train(self, inputs, targets, min_=0.01, max_=30, niter=10, stepsize=0.1):
        """
        Train the LP model given the data

        Parameters
        ----------
        inputs : nd-array
            independent variables
        targets : vector
            dependent variable
        min_ : float
            lower bound for the gamma search
        max_ : float
            upper bound for the gamma search
        niter : int
            number of basinhopping iterations
        stepsize : float
            basinhopping step size
        """
        # Scale the training data
        self.x = inputs
        self.y = targets

        # Tune gamma in RBF using basinhopping 
        self.gamma = self.optimize(min_, max_, niter, stepsize)[0]

        # Propagate labels
        self.model = LabelSpreading(kernel=self.kernel, alpha=self.alpha,
                                                      gamma=self.gamma)
        self.model.fit(self.x, self.y)
        if self.use_logger:
            self.logger.info("Label Propagation model trained with {} samples".format(len(self.y)))
Example no. 13
def label(filenames, train_path='../data/train_molecules_30.mat'):
    """
    Label data with the provided filenames.

    :param filenames: List of filenames containing data to label.
    :return: Newly labeled and conglomerate datasets
    """
    unlabeled = [scipy.io.loadmat(fname) for fname in filenames]
    unlabeled_X = np.vstack([data['X'] for data in unlabeled])
    X, Y = load_data(train_path, shape=(-1, 30, 30, 30))

    num_unlabeled = unlabeled_X.shape[0]
    unlabeled_Y = np.zeros(num_unlabeled) - 1
    unlabeled_Y = unlabeled_Y.reshape((-1, 1))
    Y = Y.reshape((-1, 1))
    Y_all = np.vstack((Y, unlabeled_Y))

    X_all = np.vstack((X, unlabeled_X))
    X_all = X_all.reshape((-1, 27000))

    label_prop_model = LabelSpreading()
    label_prop_model.fit(X_all, Y_all.ravel())
    Y_all = label_prop_model.transduction_
    unlabeled_Y = Y_all[-num_unlabeled:]  # unlabeled rows were stacked last
    return (unlabeled_X, unlabeled_Y), (X_all, Y_all)
Example no. 14
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy / 10
    avg_f1 = avg_f1 / 10
    str_list.extend(["KNN BOW runtime voc Avg f1: " + str(avg_f1),
                     "KNN BOW runtime voc Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Example no. 15
    def label_spread(self, X_train, y_train, gamma = None, max_iter = None):
        """
        Train Label Spreading model from scikit-learn

        Parameters
        ----------
        X_train: Scaled training data
        y_train: Class label
        gamma: Parameter for rbf kernel
        max_iter: Maximum number of iterations allowed

        Returns
        -------
        Predicted labels and probability
        """
        # Label spreading model
        model = LabelSpreading(kernel='rbf', gamma = gamma, max_iter = max_iter, n_jobs= -1)

        # Fit the training set
        model.fit(X_train, y_train)

        # Predict the labels of the unlabeled data points
        predicted_labels = model.transduction_

        # Predict probability
        predicted_proba = model.predict_proba(X_train)
        return predicted_labels, predicted_proba
Example no. 16
class ModelLabelSpreading:
    def __init__(self):
        np.random.seed(1102)
        self.model = LabelSpreading(
            kernel="rbf",
            n_jobs=int(np.max([multiprocessing.cpu_count() - 2, 1])),
            alpha=0.2,
            n_neighbors=10,
            max_iter=15)
        self.name = "LABEL-SPREADING"
        self.scaler = MinMaxScaler()

    def fit(self, X, y, Xu=None):
        np.random.seed(1102)
        self.Xl = X
        self.yl = y
        #self.Xu = Xu
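        # transductive setup: fit() only stores the labeled data; the real
        # model.fit happens in predict(), with test rows appended as -1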

    def predict(self, X):
        np.random.seed(1102)
        self.Xt = X
        X = self.scaler.fit_transform(np.vstack((self.Xl, self.Xt)))
        y = np.append(self.yl, np.repeat(-1, self.Xt.shape[0]))
        #y = np.append(y, np.repeat(-1, self.Xt.shape[0]))
        y = np.int64(y)

        assert X.shape[0] == len(y)

        self.model.fit(X, y)

        return np.array(
            self.model.label_distributions_)[(-self.Xt.shape[0]):, :]
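predict above returns raw rows of label_distributions_ for the test block. To recover hard labels, one can take the argmax against the estimator's classes_; a sketch, assuming a fitted ModelLabelSpreading instance as above:

import numpy as np

def distributions_to_labels(wrapper, distributions):
    # map each probability row to the class with the largest mass,
    # using the fitted estimator's own class ordering
    return wrapper.model.classes_[np.argmax(distributions, axis=1)]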
Example no. 18
def test_LabelSpreading(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    clf = LabelSpreading(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(x, y_train)
    predicted_labels = clf.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]
    print("Accuracy: %f" % metrics.accuracy_score(true_labels, predicted_labels))
Example no. 19

 def LabelSpreadingWrapper(X_train, y_train, X_test):
     clf = LabelSpreading(kernel='knn',
                          n_neighbors=10,
                          n_jobs=-1,
                          max_iter=1000,
                          alpha=0.1)
     newlabels = np.concatenate((np.array(y_train), -np.ones(len(X_test))))
     clf.fit(np.concatenate((X_train, X_test)), newlabels)
     return clf.transduction_[-len(X_test):]
Example no. 20
def test_LabelSpreading(*data):
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    clf = LabelSpreading(max_iter=1000, kernel='knn', gamma=0.1)  # gamma is ignored by the knn kernel
    clf.fit(X, y_train)
    true_labels = y[unlabeled_indices]
    predicted_labels = clf.transduction_[unlabeled_indices]
    print('Accuracy : %f' % clf.score(X[unlabeled_indices], true_labels))
    print('Accuracy : %f' % metrics.accuracy_score(true_labels, predicted_labels))
Example no. 21

def semi_supervised():
    features, labels = separate_cols_with_unknown(gtd)
    features = process_nontext(features)
    features = convertDType(features)
    model = LabelPropagation(kernel="knn")
    model2 = LabelSpreading(kernel="knn")
    model2.fit(features, labels)
    preds = cross_val_predict(model2, features, labels, cv=5)
    print('5 fold cross val accuracy of model: %0.2f ' %
          accuracy_score(labels, preds))
Example no. 24
 def __init__(self, lmnn=False, max_iter=1000, lm_num=200):
     # self.clf =  LabelPropagation(kernel='knn',max_iter=1000,n_jobs=10,n_neighbors=25)
     self.clf = LabelSpreading(kernel='knn',
                               n_neighbors=25,
                               max_iter=max_iter,
                               alpha=0.2,
                               n_jobs=-1)
     self.lmnn = lmnn
     self.lm_num = lm_num
     if lmnn:
         self.ml = LMNN(use_pca=False, max_iter=2000)
Example no. 25
def knn(X, labels):
    # #############################################################################
    # Learn with LabelSpreading
    label_spread = LabelSpreading(kernel='knn', alpha=0.6, max_iter=100)
    label_spread.fit(X, labels)

    # #############################################################################
    # Plot output labels
    output_labels = label_spread.transduction_

    return output_labels
Example no. 26

def propagate_labels(
    features,
    labels,
):
    label_prop_model = LabelSpreading(kernel=construct_graph, n_jobs=-1)
    label_prop_model.fit(features, labels)
    logger.debug(label_prop_model.classes_)
    # preds = label_prop_model.predict(features)
    preds = label_prop_model.predict_proba(features)
    # logger.debug(label_prop_model.classes_)

    return preds
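construct_graph is not defined in this snippet; scikit-learn's LabelSpreading accepts a callable kernel that takes two arrays and returns an affinity matrix. A minimal RBF-based sketch of what it might look like:

from sklearn.metrics.pairwise import rbf_kernel

def construct_graph(X, Y):
    # dense Gaussian affinity matrix between the two sample sets
    return rbf_kernel(X, Y, gamma=20)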
Example no. 27
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    #######################
    ### SEMI-SUPERVISED ###
    ########################
    # Label Propagation
    label_prop_model = LabelPropagation(kernel='knn')
    label_prop_model.fit(trainX, trainY)
    Y_Pred = label_prop_model.predict(testX)
    acc_ss_propagation, acc_ss_propagation_INFO = check_accuracy(testY, Y_Pred)
    # Label Spreading
    label_prop_models_spr = LabelSpreading(kernel='knn')
    label_prop_models_spr.fit(trainX, trainY)
    Y_Pred = label_prop_models_spr.predict(testX)
    acc_ss_spreading, acc_ss_spreading_INFO = check_accuracy(testY, Y_Pred)
    ########################
    #### WITHOUT TL ########
    ########################
    # LogisticRegression 
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    predLR = modelLR.predict(testX)
    accLR, acc_LR_INFO = check_accuracy(testY, predLR)
    # DecisionTreeClassifier
    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    predDT = modelDT.predict(testX)
    accDT, acc_DT_INFO = check_accuracy(testY, predDT)
    # BernoulliNB
    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    predND = modelNB.predict(testX)
    accNB, acc_NB_INFO = check_accuracy(testY, predND)
    #
    return pd.DataFrame(
        [{ 
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,

        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO':acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO':acc_ss_spreading_INFO,
        'acc_LR':accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO)       

        }]
    )
 def _semi_supervised_learning(self, data_matrix, target):
     n_classes = len(set(target))
     # if there are too few classes (i.e. fewer than two real classes besides
     # the unlabeled -1 marker) then just bail out and return the original
     # target, since one cannot meaningfully spread a single class
     if n_classes > 2:
         semi_supervised_estimator = LabelSpreading(kernel='knn', n_neighbors=self.n_neighbors)
         semi_supervised_estimator.fit(data_matrix, target)
         predicted_target = semi_supervised_estimator.predict(data_matrix)
         predicted_target = self._clamp(target, predicted_target)
         return predicted_target
     else:
         return target
Example no. 29
def computeSimilarities2(lsi_align,
                         vsm_align,
                         lda_align,
                         f1_align,
                         f2_align,
                         f3_align,
                         f4_align,
                         f5_align,
                         f6_align,
                         numLine,
                         numRan=10):
    """ build the model with the semi supervised approach labelSpreading     
    Args:
    matrix_values: descriptor matrix i.e. all the probability of all ir models
    vect: the answer vector (value 0 for false links and 1 for true links);
    numLine: number of pairs of artefacts
  Returns:
    preds: probability that a pair of artefact is linked
  """
    allPrediction = []
    model = LabelSpreading()

    #compute multiple (10) random vector of the matrix_values
    for i in range(0, numRan):
        subVect, subMatrix_values, matrix_values = buildingTrainingSet2(
            lsi_align, vsm_align, lda_align, f1_align, f2_align, f3_align,
            f4_align, f5_align, f6_align, numLine)
        #compute the prediction function of each random vector
        print(len(subVect))
        print(len(subMatrix_values))
        computeModel = model.fit(subMatrix_values, subVect)
        print("new predicted function computed")
        #compute the prediction of each pair of artefact with the random model
        preds0 = computeModel.predict_proba(matrix_values)
        allPrediction.append(preds0[:, 1])

    # by the "vote majoritaire"
    preds = vote(allPrediction, len(lsi_align), numRan)

    result = []

    print(len(preds))

    for p in preds:
        for i in range(0, len(p)):
            result.append(p[i])

    print(len(result))

    return result
Example no. 31
def LabelPropagation(support, support_ys, query):
    alpha = 0.3
    k_neighbours = 38
    all_embeddings = np.concatenate((support, query), axis=0)
    #X = all_embeddings.cpu().detach().numpy()
    labels = np.full(all_embeddings.shape[0], -1.)
    labels[:support.shape[0]] = support_ys
    label_propagation = LabelSpreading(kernel='knn',
                                       alpha=alpha,
                                       n_neighbors=k_neighbours,
                                       tol=0.000001)
    label_propagation.fit(all_embeddings, labels)
    predicted_labels = label_propagation.transduction_
    query_prop = predicted_labels[support.shape[0]:]
    return query_prop
Example no. 32
def computeSimilarities(vect, matrix_values):
    """ build the model with the semi supervised approach labelSpreading     
    Args:
    matrix_values: descriptor matrix i.e. all the probability of all ir models
    vect: the answer vector (value 0 for false links and 1 for true links);
  Returns:
    preds: probability that a pair of artefact is linked
  """
    model = LabelSpreading()
    computeModel = model.fit(matrix_values, vect)
    print("model built")
    preds = computeModel.predict_proba(matrix_values)
    print(preds)

    return preds[:, 1]
Example no. 33
class SemiSupervised(BaselineModel):
    """
    LabelSpreading Implementation
    """
    def fit(self):
        #Need to concatenate labeled and unlabeled data
        #unlabeled data labels are set to -1
        X = np.concatenate(
            (self.val_primitive_matrix, self.train_primitive_matrix))
        val_labels = (self.val_ground + 1) / 2.
        train_labels = -1. * np.ones(np.shape(self.train_primitive_matrix)[0])
        y = np.concatenate((val_labels, train_labels))

        self.model = LabelSpreading(kernel='knn')
        self.model.fit(X, y)
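Since fit rescales the ground labels from {-1, 1} into {0, 1}, downstream code presumably maps the transduced labels back; a sketch under that assumption (helper name is hypothetical):

import numpy as np

def train_labels_from_model(model, n_val):
    # rows after the first n_val validation samples are the training data;
    # map the {0, 1} transduced labels back to the original {-1, 1} scale
    return 2 * np.asarray(model.transduction_[n_val:]) - 1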
Example no. 34
def compute_confident_measure(vect,
                              lsi_align,
                              lda_align,
                              vsm_align,
                              emb_align,
                              matrixSize=4):
    """ compute confidence measure for each pair of artefacts.
  Args:
    vect: the answer vector (value 0 for false links and 1 for true links);
    lsi_align, vsm_align, lda_align: IR models aligned;
    filename: file where the aggreg similarity scores results will be saved;
  Returns:
     preds: probability that a pair of artefact is linked."""

    vect0 = np.zeros((len(lsi_align), 1))
    vect0[:] = -1

    trueL, falseL = CreateTraining_set.create_link_class(vect, lsi_align)
    print(len(trueL))
    print(len(falseL))
    for i in falseL:
        vect0[i] = 0

    for i in trueL:
        vect0[i] = 1

    print("annoted_vectors_ok")

    emb_values = compValues(emb_align)
    vsm_values = compValues(vsm_align)
    lda_values = compValues(lda_align)
    lsi_values = compValues(lsi_align)

    matrix_values = np.zeros((len(lsi_align), matrixSize))

    matrix_values[:, 0] = lda_values
    matrix_values[:, 1] = emb_values
    matrix_values[:, 2] = vsm_values
    matrix_values[:, 3] = lsi_values

    model = LabelSpreading()
    computeModel = model.fit(matrix_values, vect0)
    print("models built")
    preds = computeModel.predict_proba(matrix_values)

    print(preds)

    return preds[:, 1]
Example no. 35
def semi_supervised_learning(data_matrix, target):
    if -1 in list(target):
        # if -1 is present in target do label spreading
        from sklearn.semi_supervised import LabelSpreading
        label_prop_model = LabelSpreading(kernel='knn', n_neighbors=5)
        label_prop_model.fit(data_matrix, target)
        pred_target = label_prop_model.predict(data_matrix)
        extended_target = []
        for pred_label, label in zip(pred_target, target):
            if label != -1 and pred_label != label:
                extended_target.append(label)
            else:
                extended_target.append(pred_label)
    else:
        extended_target = target
    return np.array(extended_target)
Example no. 36

def train_model(nodes, datasets):
    y = np.array(range(len(nodes)))
    nodes = list(nodes)
    vectorizer = DictVectorizer(sparse=True)
    for i, dataset in enumerate(datasets):
        g = compute_dataset(dataset)
        nodes.extend(g.classes)
        sys.stdout.write('\r')
        sys.stdout.write(str(i+1))
        sys.stdout.flush()
    X = vectorizer.fit_transform([dict(node.concept_vector) for node in nodes])
    y = np.concatenate([y, -np.ones(len(nodes) - len(y), dtype=int)])  # -1 marks unlabeled nodes

    unlabeled = []
    model = LabelSpreading()
    model.fit(X, y)
    model.vectorizer = vectorizer
    return model
Example no. 37
def propogation(model, uids, labeled_ids):
    X, y1, y2 = [], [], []
    pool = []
    for uid in labeled_ids:
        X.append(model.docvecs[uid])
        y1.append(1)
    for uid in uids:
        if uid not in labeled_ids:
            X.append(model.docvecs[uid])
            y2.append(-1)
    label_prop_model = LabelSpreading(kernel='knn', alpha=1.0)
    y2 = np.array(y2)
    y2[0:(len(y1)-1)] = 0
    print(len(y1) + len(y2))
    for i in range(5):
        np.random.shuffle(y2)
        label_prop_model.fit(X, y1 + y2.tolist())
        pool.append(label_prop_model.transduction_)
    pickle.dump(pool, open('data/propagation.pick', 'wb'))
    pool = pickle.load(open('data/propagation.pick', 'rb'))
    pool = np.array(pool)
    for column in pool.T:
        print(column)
Example no. 38
def main():
    usage = "usage prog [options] arg"
    parser = OptionParser(usage=usage)

    parser.add_option("-t", "--task", dest="task", help="the task name")
    parser.add_option("-o", "--output", dest="output", help="the output file")

    (options, remainder) = parser.parse_args()

    train_paths = [
        "../data/train_simple_feature.csv",
        "../data/train_plus_feature.csv",
        "../data/train_azure_plus_feature.csv",
        #"../data/train_azure_feature.csv",
        #"../data/train_module_feature.csv",
        #"../data/train_course_feature.csv",
        #"./blend_train.csv"
        ]
    label_path = "../data/truth_train.csv"

    test_paths = [
        "../data/test_simple_feature.csv",
        "../data/test_plus_feature.csv",
        "../data/test_azure_plus_feature.csv",
        #"../data/test_azure_feature.csv",
        #"../data/test_module_feature.csv",
        #"../data/test_course_feature.csv",
        #"./blend_test.csv"
        ]

    train = merge_features(train_paths, label_path)
    train = train.drop(['user_drop_ratio'], axis=1)
    #train['user_drop_ratio'] = (train['user_drop_ratio'] + 8.0 / train['user_courses']) / (1.0 + 10.0 / train['user_courses'])
    y = encode_labels(train.dropout.values)
    train = train.drop('dropout', axis=1)
    tr_ids = train.enrollment_id.values
    X = train.drop('enrollment_id', axis=1)
    m, n = X.shape
    print('train.shape=%s' % str(X.shape))


    test = merge_features(test_paths)
    test = test.drop(['user_drop_ratio'], axis=1)
    #test['user_drop_ratio'] = (test['user_drop_ratio'] + 8.0 / test['user_courses']) / (1.0 + 10.0 / test['user_courses'])
    tt_ids = test.enrollment_id.values
    X_test = test.drop('enrollment_id', axis=1)
    print('test.shape=%s' % str(X_test.shape))

    scaler = StandardScaler().fit(np.vstack((X, X_test)))

    task = options.task
    if not task:
        task = "blend"

    if task == 'blend':

        clf_list = [
            #("knn_p2_10", create_clf('knn', {"n_neighbors": 10, "p": 2})),
            #("knn_p2_10_scaler", create_clf('knn', {"n_neighbors": 10, "p": 2, "scaler": scaler})),
            #("knn_p2_100", create_clf('knn', {"n_neighbors": 100, "p": 2})),
            #("knn_p2_100_scaler", create_clf('knn', {"n_neighbors": 100, "p": 2, "scaler": scaler})),
            #("knn_p2_500", create_clf('knn', {"n_neighbors": 500, "p": 2})),
            #("knn_p2_500_scaler", create_clf('knn', {"n_neighbors": 500, "p": 2, "scaler": scaler})),
            #("knn_p2_100", create_clf('knn', {"n_neighbors": 100, "p": 2})),
            #("knn_p2_800", create_clf('knn', {"n_neighbors": 800, "p": 2})),
            #("knn_p1_10", create_clf('knn', {"n_neighbors": 10, "p": 1})),
            #("knn_p1_10_scaler", create_clf('knn', {"n_neighbors": 10, "p": 1, "scaler": scaler})),
            #("knn_p1_100", create_clf('knn', {"n_neighbors": 100, "p": 1})),
            #("knn_p1_100_scaler", create_clf('knn', {"n_neighbors": 100, "p": 1, "scaler": scaler})),
            #("knn_p1_500", create_clf('knn', {"n_neighbors": 500, "p": 1})),
            #("knn_p1_500_scaler", create_clf('knn', {"n_neighbors": 500, "p": 1, "scaler": scaler})),
            #("knn_p1_800", create_clf('knn', {"n_neighbors": 800, "p": 1})),
            #("knn_p1_800_scaler", create_clf('knn', {"n_neighbors": 800, "p": 1, "scaler": scaler})),
            #("extra_gini_10depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 10})),
            #("extra_entropy_10depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 10})),
            #("extra_gini_20depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 20})),
            #("extra_entropy_20depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 20})),
            ("extra_gini_30depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 30})),
            ("extra_entropy_30depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 30})),
            #("rfc_gini_3depth", create_clf("rfc", {"criterion": "gini", "max_depth": 3, "n_estimators": 200})),
            #("rfc_entropy_3depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 3, "n_estimators": 200})),
            ("rfc_gini_5depth", create_clf("rfc", {"criterion": "gini", "max_depth": 5, "n_estimators": 200})),
            ("rfc_entropy_5depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 5, "n_estimators": 200})),
            ("rfc_gini_6depth", create_clf("rfc", {"criterion": "gini", "max_depth": 6, "n_estimators": 200})),
            ("rfc_entropy_6depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 6, "n_estimators": 200})),
            ("rfc_gini_8depth", create_clf("rfc", {"criterion": "gini", "max_depth": 8, "n_estimators": 200})),
            ("rfc_entropy_8depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 8, "n_estimators": 200})),
            ("rfc_gini_10depth", create_clf("rfc", {"criterion": "gini", "max_depth": 10, "n_estimators": 200})),
            ("rfc_entropy_10depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 200})),
            ("rfc_gini_12depth", create_clf("rfc", {"criterion": "gini", "max_depth": 12, "n_estimators": 200})),
            ("rfc_entropy_12depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 12, "n_estimators": 200})),
            #("xgb_1500_2depth", create_clf("xgb", {"max_depth": 2, "n_estimators": 1500, "learning_rate": 0.03})),
            #("xgb_600_3depth", create_clf("xgb", {"max_depth": 3, "n_estimators": 600, "learning_rate": 0.03})),
            #("xgb_600_4depth", create_clf("xgb", {"max_depth": 4, "n_estimators": 600, "learning_rate": 0.03})),
            ("xgb_600_5depth", create_clf("xgb", {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03})),
            #("xgb_600_6depth", create_clf("xgb", {"max_depth": 6, "n_estimators": 600, "learning_rate": 0.02})),
            #("xgb_600_7depth", create_clf("xgb", {"max_depth": 7, "n_estimators": 600, "learning_rate": 0.01})),
            #("xgb_600_8depth", create_clf("xgb", {"max_depth": 8, "n_estimators": 600, "learning_rate": 0.01})),
            #("lgc_1c_scale", create_clf("lgc", {"C": 1.0, "scaler": scaler})),
            #("lgc_1c", create_clf("lgc", {"C": 1.0})),
            #("lgc_1c_l1", create_clf("lgc", {"C": 1.0, "penalty": "l1"})),
            #("lgc_3c_scale", create_clf("lgc", {"C": 3.0, "scaler": scaler})),
            #("lgc_3c", create_clf("lgc", {"C": 3.0})),
            ("lgc_3c_l1", create_clf("lgc", {"C": 3.0, "penalty": "l1"})),
            #("lgc_5c_scale", create_clf("lgc", {"C": 5.0, "scaler": scaler})),
            #("lgc_5c", create_clf("lgc", {"C": 5.0})),
            ]

        X = X.values
        blend_train, blend_test = train_blend(X, y, X_test, clf_list, 5)
        print('blend_train.shape=%s' % str(blend_train.shape))
        print('blend_test.shape=%s' % str(blend_test.shape))


        cols = [cname for cname, clf in clf_list]
        cols = ['enrollment_id'] + cols
        blend_train_ids = np.hstack((np.matrix(tr_ids).T, blend_train))
        blend_test_ids = np.hstack((np.matrix(tt_ids).T, blend_test))
        dump_data(blend_train_ids, cols, "new_blend_train.csv")
        dump_data(blend_test_ids, cols, "new_blend_test.csv")


        blender = create_clf('lgc', {"C": 1.0, "penalty": "l1"})
        auc = cv_loop(blend_train, y, blender)
        print('AUC (LGC blend): %f' % auc)

        blender = create_clf('ext', {"max_depth": 10, "criterion": "entropy", "n_estimators": 100})
        auc = cv_loop(blend_train, y, blender)
        print('AUC (EXT blend): %f' % auc)

        blender = create_clf('xgb', {'max_depth': 2, "n_estimators": 150, "learning_rate": 0.05})
        auc = cv_loop(blend_train, y, blender)
        print("AUC (XGB blend {d: %d, n: %d}): %f" % (2, 150, auc))
        blender = create_clf('xgb', {'max_depth': 3, "n_estimators": 200, "learning_rate": 0.05})
        auc = cv_loop(blend_train, y, blender)
        print('AUC (XGB blend {d: %d, n: %d}): %f' % (3, 200, auc))

        blender = create_clf('xgb', {'max_depth': 3, "n_estimators": 100, "learning_rate": 0.1})
        blender = blender.fit(blend_train, y)
        preds = blender.predict_proba(blend_test)[:,1]
        write_submission(tt_ids, preds, "new_blend_submission.csv")

        combined_train = np.hstack((X, blend_train))
        combined_test = np.hstack((X_test, blend_test))
        blender = create_clf('xgb', {'max_depth': 5, "n_estimators": 600, "learning_rate": 0.03})
        blender = blender.fit(combined_train, y)
        preds = blender.predict_proba(combined_test)[:,1]
        write_submission(tt_ids, preds, "new_combined_blend_submission.csv")

    elif task == 'lgc':
        print('Try logistic regression ..')
        clf = create_clf("lgc", {"C": 3, "scaler": scaler, "penalty": "l1"})
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == "ext":
        print('Try ExtraTreesClassifier')
        #clf = create_clf("ext", {"max_depth": 10}) # 0.86261
        #clf = create_clf("ext", {"max_depth": 20}) # 0.862636
        #clf = create_clf("ext", {"max_depth": 30}) # 0.860944
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 10}) # 0.862610
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20}) # 0.862564
        clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 100}) # 0.861795
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 2000}) # 0.862695
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 30, "n_estimators": 2000}) # 0.860
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == 'rfc':
        print('Try RFC ..')
        #clf = create_clf('rfc', {'max_depth': 5}) # 0.859583
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10}) # 0.863369
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 200}) # 0.863285
        # clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 100}) # 0.863207
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "max_features": None, "n_estimators": 200}) # 0.863341
        clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 40, "max_features": None, "n_estimators": 10000, "min_samples_split": 100}) # 0.863291
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == 'knn':
        clf = create_clf('knn', {"n_neighbors": 800, "p": 2, "scaler": scaler})
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == "gbt":
        paras = json.load(open('paras/gbt.json', 'r'))
        clf = create_clf("gbt", paras)
        clf = clf.fit(X, y)
        preds = clf.predict_proba(X_test)[:,1]
        write_submission(tt_ids, preds, "gbt_submission.csv")

    elif task == "xgb":
        #clf = create_clf('xgb', {"max_depth": 2, "n_estimators": 1500, "learning_rate": 0.03}) # 0.860279
        #clf = create_clf('xgb', {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03}) # public: 0.8891443712867697;
        clf = create_clf('xgb', {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03}) # public:
        auc = cv_loop(X, y, clf, 5)
        print "AUC (all): %f" % auc
        #sys.exit()
        clf = clf.fit(X, y)
        preds = clf.predict_proba(X_test)[:,1]
        write_submission(tt_ids, preds, 'xgb_new_submission.csv')
    elif task == "deep":
        clf = create_clf('deep', {"neuro_num": 512, "nb_epoch": 20, "scaler": scaler, "optimizer": "adadelta"})
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)
        sys.exit(0)

        clf = clf.fit(X, y)
        preds = clf.predict_proba(X_test)[:,1]
        write_submission(tt_ids, preds, 'deep_submission.csv')

    elif task == 'semi':
        clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 100}) # 0.861795
        train_semi(X, y, X_test, clf, 5)

    elif task == 'gbc':
        from sklearn.ensemble import GradientBoostingClassifier
        clf = GradientBoostingClassifier(n_estimators=300,
                                        learning_rate=0.1,
                                        min_samples_split=50,
                                        min_samples_leaf=50,
                                        max_depth=10,
                                        subsample=0.6,
                                        max_features='log2',
                                        verbose=1)
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == 'label':
        from sklearn.semi_supervised import LabelPropagation
        from sklearn.semi_supervised import LabelSpreading
        label_prop_model = LabelSpreading()
        all_X = np.vstack((X, X_test))
        tm, tn = X_test.shape
        unlabeled = [-1] * tm
        ys = [list(y)]
        ys.append(unlabeled)
        labels = np.concatenate(ys)
        print('ALL shape=%s' % str(all_X.shape))
        print('ALL y shape=%s' % str(labels.shape))
        label_prop_model.fit(all_X, labels)
Example no. 39

"""
Sample transduced labels (output truncated):
  4.  6.  6.  4.  2.  3.  1.  6.  6.  1. ...
(500, 100)
(500,)
"""

from sklearn.semi_supervised import LabelSpreading

label_propagation_model = LabelSpreading()
label_propagation_model.fit(X, y)

# make predictions for the first twenty samples (some labeled, some unlabeled)
for i in range(20):
    print('y: ', y[i], '\t', 'y_hat: ', label_propagation_model.predict(X[i].reshape(1, -1)))
"""
y:  6.0     y_hat:  [6.]
y:  6.0     y_hat:  [6.]
y:  2.0     y_hat:  [2.]
y:  1.0     y_hat:  [1.]
y:  -1.0    y_hat:  [6.]    *
y:  2.0     y_hat:  [2.]
y:  6.0     y_hat:  [6.]
y:  4.0     y_hat:  [4.]
y:  3.0     y_hat:  [3.]
y:  5.0     y_hat:  [5.]
"""
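The loop above issues one predict call per sample; a single vectorized call gives the same predictions:

y_hat = label_propagation_model.predict(X[:20])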
Example no. 40

rate = 1

train_comb = train
# select a 1/rate fraction of the test data
selected_test = test.sample(test.shape[0] // rate, replace=False, random_state=20422438)
train_comb = train_comb.append(selected_test)
train_comb_label = train_label
train_comb_unlabeled = pd.DataFrame(np.array([-1] * (test.shape[0] // rate)))
train_comb_label = np.array(train_comb_label.append(train_comb_unlabeled))
train_comb_label = train_comb_label.reshape(len(train_comb_label))


a_level = 1
label_prop_model = LabelSpreading(kernel="knn",alpha=a_level)
label_prop_model.fit(train_comb, train_comb_label)
pred_y = label_prop_model.transduction_
pred_y[:train.shape[0]] = train_label



X_train, X_test, y_train, y_test = train_test_split(label_prop_model.X_, pred_y, test_size=0.10, random_state=20422438)

model_erf = se.ExtraTreesClassifier(random_state=20422438,n_jobs=-1,n_estimators=1000)
model_erf.fit(X_train,y_train)

model_erf_pred = model_erf.predict(X_test)
model_erf_error = errFn(model_erf_pred,y_test)

print(a_level)
# Scikit-learn's LabelSpreading method for semi-supervised learning

import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import LabelSpreading

label_propagation_model = LabelSpreading()
iris = datasets.load_iris()

'''
Unlabeled entries in y

It is important to assign an identifier to unlabeled points along
with the labeled data when training the model with the fit method. 
The identifier that this implementation uses is the integer value -1.
'''

# generate a boolean mask where roughly 30% of entries are True
rand_numgen = np.random.RandomState(42)
random_unlabeled_points = rand_numgen.rand(len(iris['target'])) < 0.3

# create the unlabelled data in the labels (setting to -1)
full_labels = iris['target']
cutdown_labels = np.copy(iris['target'])
cutdown_labels[random_unlabeled_points] = -1

print(full_labels)
'''
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2