def test_label_spreading_algorithms():
        """
        Compare scikit's algorithm and our algorithm
        """
        x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

        # scikit-learn takes different input than our algorithm
        y_sklearn = np.array([1, 2, -1, -1])
        y_custom = np.array([[1, 0], [0, 1], [0, 0], [0, 0]])

        # scikit's algorithm
        alpha = 0.2
        max_iter = 30
        tol = 1e-3
        label_spreading = LabelSpreadingSKLearn(kernel="rbf",
                                                max_iter=max_iter,
                                                alpha=alpha, tol=tol)
        model = label_spreading.fit(x, y_sklearn)
        expected = model.predict(x)

        # our algorithm
        w = distance_matrix(x, measure=rbf_distance)
        ls = LabelSpreadingCustom(alpha=alpha, max_iter=max_iter, tol=tol)
        ls = ls.fit(w, y_custom)
        actual = ls.predict(y_custom)
        actual = np.array(actual) + 1  # shift to scikit-learn's 1-based labels
        assert_array_equal(actual, expected)
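The test relies on helpers that are not shown (distance_matrix, rbf_distance, and the LabelSpreadingSKLearn/LabelSpreadingCustom aliases). A minimal sketch of what the two helpers might look like, assuming a dense Gaussian affinity matrix; the source repo's versions may differ:

import numpy as np

def rbf_distance(a, b, gamma=20.0):
    # Gaussian (RBF) similarity between two sample vectors; gamma=20
    # mirrors scikit-learn's default for the rbf kernel
    return np.exp(-gamma * np.sum((a - b) ** 2))

def distance_matrix(x, measure):
    # dense pairwise affinity matrix w with w[i, j] = measure(x[i], x[j])
    n = x.shape[0]
    w = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            w[i, j] = measure(x[i], x[j])
    return w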
Example no. 2
def test_LabelSpreading_rbf(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))  # color set: one color per curve
    # train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelSpreading(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # configure the plot
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading rbf kernel")
    plt.show()
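The test_LabelSpreading_* functions here and below unpack a (x, y, unlabeled_indices) tuple prepared elsewhere. One plausible loader, sketched on scikit-learn's digits dataset (the original data preparation is not shown):

import numpy as np
from sklearn import datasets

def load_data():
    digits = datasets.load_digits()
    rng = np.random.RandomState(0)
    indices = np.arange(len(digits.data))
    rng.shuffle(indices)
    x, y = digits.data[indices], digits.target[indices]
    unlabeled_indices = np.arange(len(y))[::2]  # hide every other label
    return x, y, unlabeled_indices

# test_LabelSpreading_rbf(*load_data())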
Example no. 3
def test_LabelSpreading_knn(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))  # color set: one color per curve
    # train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelSpreading(max_iter=100, n_neighbors=K, alpha=alpha, kernel='knn')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # configure the plot
    ax.set_xlabel(r"k")
    ax.set_ylabel("score")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading knn kernel")
    plt.show()
Example no. 4
def soft_clamping(kernel, xTrain, yTrain, MI=10000, k=3, g=0.6, a=0.1):
    spread = LabelSpreading(kernel=kernel,
                            n_neighbors=k,
                            gamma=g,
                            alpha=a,
                            max_iter=MI,
                            n_jobs=-1)
    spread.fit(xTrain, yTrain)
    predY = spread.predict_proba(xTrain)

    norm_Y = normalize(yTrain, predY)
    labels = []
    for i in norm_Y:
        if i[0] > i[1]:
            labels.append(benign)
        elif i[0] < i[1]:
            labels.append(malware)
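        # note: exact ties (i[0] == i[1]) are appended to neither class,
        # which can leave labels shorter than yTrain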

    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, labels, yExpect, day_one)

    results = [
        'SC', kernel, k, g, a, lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1,
        missed_day1
    ]

    file_name = 'SC_CMN_5per_' + str(rate) + '.csv'
    write_csv(file_name, results)
Example no. 5
def semiLabelSpreding(feature_extractor, generator, val_generator, kernel,
                      neighbors, gamma, alpha):
    semi = LabelSpreading(kernel=kernel,
                          n_neighbors=neighbors,
                          gamma=gamma,
                          alpha=alpha,
                          tol=0.001,
                          max_iter=1000000)

    features = feature_extractor.predict_generator(generator,
                                                   steps=generator.samples /
                                                   generator.batch_size,
                                                   verbose=1)

    classes = generator.classes

    for i in range(0, generator.samples):
        if (generator.filenames[i][0] == 'N'):
            classes[i] = -1

    semi.fit(features, classes)

    val_features = feature_extractor.predict_generator(
        val_generator,
        steps=val_generator.samples / val_generator.batch_size,
        verbose=1)
    predicted_classes = semi.predict(val_features)

    return predicted_classes
Example no. 6
def computeSimilarities2(vect, matrix_values, numLine, numRan=10):
    """ build the model with the semi supervised approach labelSpreading     
    Args:
    matrix_values: descriptor matrix i.e. all the probability of all ir models
    vect: the answer vector (value 0 for false links and 1 for true links);
    numLine: number of pairs of artefacts
  Returns:
    preds: probability that a pair of artefact is linked
  """
    #number of iterations

    allPrediction = []
    model = LabelSpreading()

    #compute multiple (10) random vector of the matrix_values
    for i in range(0, numRan):
        subVect, subMatrix_values = computeRandom(vect, matrix_values, numLine)
        #compute the prediction function of each random vector
        computeModel = model.fit(subMatrix_values, subVect)
        print("new predicted function computed")
        #compute the prediction of each pair of artefact with the random model
        preds0 = computeModel.predict_proba(matrix_values)
        allPrediction.append(preds0[:, 1])

    # by the "vote majoritaire"
    preds = vote(allPrediction, len(vect), numRan)

    print(preds)

    return preds
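The vote helper used above is not shown. One plausible reading of the majority vote it implements, matching the usage in this example, sketched under that assumption (the repo's actual version may differ):

import numpy as np

def vote(all_prediction, num_line, num_ran):
    # stack the positive-class probabilities of the num_ran random models
    stacked = np.vstack(all_prediction)  # shape (num_ran, num_line)
    # majority vote: report the fraction of models that call each pair a link
    return (stacked > 0.5).mean(axis=0)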
Example no. 7

def label_spreading(X_train, y_train, Xunlabelled, X_test, y_test):
    #pca = randomized_PCA(X_train)
    #X_train, X_test, y_train, y_test = cross_validation.train_test_split(tr_images, tr_labels, test_size=0.3)
    #X = pca.transform(X)
    #val_images = pca.transform(val_images)
    #y= y[:]

    X_train = X_train[:, :]
    y_train = y_train[:]
    Xunlabelled = Xunlabelled[:10000,:]

    #import ipdb; ipdb.set_trace()

    X_both = np.vstack((X_train, Xunlabelled))

    y_both = np.append(y_train, -np.ones((Xunlabelled.shape[0],)))


    label_prop_model = LabelSpreading(max_iter=100)
    #random_unlabeled_points = np.where(np.random.random_integers(0, 1, size=len(y_train)))
    #labels = np.copy(y_train)
    #labels[random_unlabeled_points] = -1
    label_prop_model.fit(np.copy(X_both), np.copy(y_both))
    y_pred = label_prop_model.predict(np.copy(X_both))
    print(y_pred)
Example no. 8
def testLabelPropagation():
    from sklearn.semi_supervised import LabelSpreading
    from sklearn import preprocessing
    label_enc = preprocessing.LabelEncoder()

    label_prop_model = LabelSpreading(kernel='knn')
    train_iter = getDocumentIterator1("published = 0 and is_test = 1")
    validation_iter = getDocumentIterator1("published = 1 and is_test = 1")
    semantic_model = gensim_tests.SemanticModel.load(
        'gensim/full_corpus_300000')
    all_profiles, labels = [], []
    propagation_labels = []
    for doc in train_iter:
        all_profiles.append(semantic_model.inferProfile(doc.rawtext))
        labels.append(doc.learned_category[0])
        propagation_labels.append(doc.learned_category[0])

    label_enc.fit(propagation_labels)
    propagation_labels = label_enc.transform(propagation_labels).tolist()

    for doc in validation_iter:
        all_profiles.append(semantic_model.inferProfile(doc.rawtext))
        labels.append(doc.learned_category[0])
        propagation_labels.append(-1)
    print(propagation_labels)
    print("Fitting")
    label_prop_model.fit(all_profiles, propagation_labels)
    output_labels = label_prop_model.transduction_
    for propagated, orig in zip(label_enc.inverse_transform(output_labels),
                                labels):
        print(propagated, orig)
Example no. 9
    def predict_ssl(self, x_sup, y_sup, x_unsup, y_unsup, x_test, y_test):

        ls_model = LabelSpreading(kernel='knn', n_neighbors=5)
        indices = np.arange(self.train_size)
        unlabeled_indices = indices[x_sup.shape[0]:]
        y_sup_unsup = np.concatenate([y_sup, y_unsup])
        y_sup_unsup_train = np.copy(y_sup_unsup)
        y_sup_unsup_train[unlabeled_indices] = -1

        x_fit = np.concatenate([x_sup, x_unsup], axis=0)
        h_fit = self.model_e.predict(x_fit)
        h_fit = np.reshape(h_fit,
                           (h_fit.shape[0], h_fit.shape[1] * h_fit.shape[2]))
        ls_model.fit(h_fit, y_sup_unsup_train)
        y_unsup_pred = ls_model.transduction_[unlabeled_indices]

        #print("LabelSpread Accuracy is ", accuracy_score(y_unsup, y_unsup_pred))

        h_test = self.model_e.predict(x_test)
        h_test = np.reshape(
            h_test, (h_test.shape[0], h_test.shape[1] * h_test.shape[2]))

        #SVM
        clf_svc = svm.SVC(kernel='linear')
        y_fit_true = ls_model.transduction_
        clf_svc.fit(h_fit, y_fit_true)
        acc_svm = accuracy_score(y_test, clf_svc.predict(h_test))

        clf_svc = svm.LinearSVC()
        clf_svc.fit(h_fit, y_fit_true)
        acc_svm_linear = accuracy_score(y_test, clf_svc.predict(h_test))
        print('acc_svm is ', max(acc_svm, acc_svm_linear))
Example no. 10
    def source_to_target_label_prop(self,
                                    train_feat_space='embeds',
                                    kernel_param={
                                        'type': 'rbf',
                                        'gamma': 20
                                    }):
        print(
            '-----------------------------------------------------------------------'
        )
        print('Propagating labels from source to target in {0} space'.format(
            train_feat_space))
        if train_feat_space == 'encoded':
            if not hasattr(self, 'source_encoded_reps'):
                self.dim_red_autoencode()
            concat_embs = np.concatenate(
                (self.source_encoded_reps, self.target_encoded_reps))
        elif train_feat_space == 'embeds':
            concat_embs = np.concatenate(
                (self.source_embds_vec, self.target_embds_vec))
        elif train_feat_space == 'embeds_tsne':
            if self.tsne_computed == 0:
                self.compute_tsne()
            feat_cols = []
            for idx in range(self.n_tsne_components):
                feat_cols.append('embeds_tsne_' + str(idx))
            source_data_feats = self.source_data[feat_cols].as_matrix()
            target_data_feats = self.target_data[feat_cols].as_matrix()
            concat_embs = np.concatenate(
                (source_data_feats, target_data_feats))
        else:
            raise NotImplementedError
        unknown_labels = np.ones_like(self.target_labels) * -1
        label_prop_train_labels = np.concatenate(
            (self.source_labels, unknown_labels))
        lp_model = LabelSpreading()
        lp_model.fit(concat_embs, label_prop_train_labels)
        transduction_labels = lp_model.transduction_
        label_distributions = lp_model.label_distributions_

        print(label_distributions[0:10, :])
        self.source_data[
            train_feat_space +
            'Space_prop_pred'] = transduction_labels[:self.n_source]
        self.target_data[
            train_feat_space +
            'Space_prop_pred'] = transduction_labels[self.n_source:]
        # self.source_data[train_feat_space+'label_prop_groups'] = label_distributions[:self.n_source]
        # self.target_data[train_feat_space + 'label_prop_groups'] = label_distributions[self.n_source:]

        # self.embds_space_grouping.append(train_feat_space + 'label_prop_groups')
        # self.embds_space_classifiers.append(train_feat_space+'Space_prop_pred')
        if self.inter_save:
            print('Saving propagated labels')
            self.save_perforamance(self.serving_dir, suffix=self.save_suffix)

        print('Completed source to target label propagation in {0} space'.format(
            train_feat_space))
        print(
            '-----------------------------------------------------------------------'
        )
Example no. 11
class LabelSpreadingModel(SupervisedW2VModel):
    def fit_with_test(self, test_data):
        xs, ys = [], []
        self.ans_mapping = []
        for ans, cvs in self.context_vectors.items():
            xs.extend(cvs)
            if ans not in self.ans_mapping:
                y = len(self.ans_mapping)
                self.ans_mapping.append(ans)
            else:
                y = self.ans_mapping.index(ans)
            ys.extend(y for _ in cvs)
        for ctx in test_data:
            xs.append(self.cv(ctx))
            ys.append(-1)  # unlabeled
        self.ls_clf = LabelSpreading(kernel='knn', n_neighbors=11)
        self.ls_clf.fit(xs, ys)

    def __call__(self, x, ans=None, with_confidence=False):
        v = self.cv(x)
        probs = self.ls_clf.predict_proba([v])[0]
        pred = probs.argmax()
        m_ans = self.ans_mapping[pred]
        # TODO - get confidence as difference between probs[pred] and next
        return (m_ans, 0.0) if with_confidence else m_ans
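The TODO above can be resolved by taking the gap between the two largest probabilities. A minimal sketch of such a helper (hypothetical, not part of the original class):

import numpy as np

def prediction_margin(probs):
    # confidence as the margin between the best and second-best class
    # probabilities; a value near 0 means the model is torn between two answers
    top2 = np.sort(probs)[-2:]
    return float(top2[1] - top2[0])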
Example no. 12
    def train(self, inputs, targets, min_=0.01, max_=30, niter=10, stepsize=0.1):
        """
        Train the LP model given the data

        Parameters
        ----------
        inputs : nd-array
            independent variables
        targets : vector
            dependent variable
        min_ : float
            lower bound for the gamma search
        max_ : float
            upper bound for the gamma search
        niter : int
            number of basinhopping iterations
        stepsize : float
            basinhopping step size
        """
        # Scale the training data
        self.x = inputs
        self.y = targets

        # Tune gamma in RBF using basinhopping 
        self.gamma = self.optimize(min_, max_, niter, stepsize)[0]

        # Propagate labels
        self.model = LabelSpreading(kernel=self.kernel, alpha=self.alpha,
                                                      gamma=self.gamma)
        self.model.fit(self.x, self.y)
        if self.use_logger:
            self.logger.info("Label Propagation model trained with {} samples".format(len(self.y)))
Example no. 13
def label(filenames, train_path='../data/train_molecules_30.mat'):
    """
    Label data with the provided filenames.

    :param filenames: List of filenames containing data to label.
    :return: Newly labeled and conglomerate datasets
    """
    unlabeled = [scipy.io.loadmat(fname) for fname in filenames]
    unlabeled_X = np.vstack([data['X'] for data in unlabeled])
    X, Y = load_data(train_path, shape=(-1, 30, 30, 30))

    num_unlabeled = unlabeled_X.shape[0]
    unlabeled_Y = np.zeros(num_unlabeled) - 1
    unlabeled_Y = unlabeled_Y.reshape((-1, 1))
    Y = Y.reshape((-1, 1))
    Y_all = np.vstack((Y, unlabeled_Y))

    X_all = np.vstack((X, unlabeled_X))
    X_all = X_all.reshape((-1, 27000))

    label_prop_model = LabelSpreading()
    label_prop_model.fit(X_all, Y_all.ravel())
    Y_all = label_prop_model.transduction_
    unlabeled_Y = Y_all[-num_unlabeled:]  # unlabeled rows were stacked last
    return (unlabeled_X, unlabeled_Y), (X_all, Y_all)
Example no. 14
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy / 10
    avg_f1 = avg_f1 / 10
    str_list.extend(["KNN BOW runtime voc Avg f1: " + str(avg_f1),
                     "KNN BOW runtime voc Avg acc: " + str(avg_accuracy)])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
Example no. 15
    def label_spread(self, X_train, y_train, gamma = None, max_iter = None):
        """
        Train Label Spreading model from scikit-learn

        Parameters
        ----------
        X_train: Scaled training data
        y_train: Class label
        gamma: Parameter for rbf kernel
        max_iter: Maximum number of iterations allowed

        Returns
        -------
        Predicted labels and probability
        """
        # Label spreading model
        model = LabelSpreading(kernel='rbf', gamma = gamma, max_iter = max_iter, n_jobs= -1)

        # Fit the training set
        model.fit(X_train, y_train)

        # Predict the labels of the unlabeled data points
        predicted_labels = model.transduction_

        # Predict probability
        predicted_proba = model.predict_proba(X_train)
        return predicted_labels, predicted_proba
Example no. 16
class ModelLabelSpreading:
    def __init__(self):
        np.random.seed(1102)
        self.model = LabelSpreading(
            kernel="rbf",
            n_jobs=int(np.max([multiprocessing.cpu_count() - 2, 1])),
            alpha=0.2,
            n_neighbors=10,
            max_iter=15)
        self.name = "LABEL-SPREADING"
        self.scaler = MinMaxScaler()

    def fit(self, X, y, Xu=None):
        np.random.seed(1102)
        self.Xl = X
        self.yl = y
        #self.Xu = Xu
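        # transductive setup: fit() only stores the labeled data; the real
        # model.fit happens in predict(), with test rows appended as -1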

    def predict(self, X):
        np.random.seed(1102)
        self.Xt = X
        X = self.scaler.fit_transform(np.vstack((self.Xl, self.Xt)))
        y = np.append(self.yl, np.repeat(-1, self.Xt.shape[0]))
        #y = np.append(y, np.repeat(-1, self.Xt.shape[0]))
        y = np.int64(y)

        assert X.shape[0] == len(y)

        self.model.fit(X, y)

        return np.array(
            self.model.label_distributions_)[(-self.Xt.shape[0]):, :]
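predict above returns raw rows of label_distributions_ for the test block. To recover hard labels, one can take the argmax against the estimator's classes_; a sketch, assuming a fitted ModelLabelSpreading instance as above:

import numpy as np

def distributions_to_labels(wrapper, distributions):
    # map each probability row to the class with the largest mass,
    # using the fitted estimator's own class ordering
    return wrapper.model.classes_[np.argmax(distributions, axis=1)]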
Example no. 18
def test_LabelSpreading(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    clf = LabelSpreading(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(x, y_train)
    predicted_labels = clf.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]
    print("Accuracy: %f" % metrics.accuracy_score(true_labels, predicted_labels))
Example no. 19

 def LabelSpreadingWrapper(X_train, y_train, X_test):
     clf = LabelSpreading(kernel='knn',
                          n_neighbors=10,
                          n_jobs=-1,
                          max_iter=1000,
                          alpha=0.1)
     newlabels = np.concatenate((np.array(y_train), -np.ones(len(X_test))))
     clf.fit(np.concatenate((X_train, X_test)), newlabels)
     return clf.transduction_[-len(X_test):]
Example no. 20
def test_LabelSpreading(*data):
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    clf = LabelSpreading(max_iter=1000, kernel='knn', gamma=0.1)  # gamma is ignored by the knn kernel
    clf.fit(X, y_train)
    true_labels = y[unlabeled_indices]
    predicted_labels = clf.transduction_[unlabeled_indices]
    print('Accuracy : %f' % clf.score(X[unlabeled_indices], true_labels))
    print('Accuracy : %f' % metrics.accuracy_score(true_labels, predicted_labels))
Example no. 21

def semi_supervised():
    features, labels = separate_cols_with_unknown(gtd)
    features = process_nontext(features)
    features = convertDType(features)
    model = LabelPropagation(kernel="knn")
    model2 = LabelSpreading(kernel="knn")
    model2.fit(features, labels)
    preds = cross_val_predict(model2, features, labels, cv=5)
    print('5 fold cross val accuracy of model: %0.2f ' %
          accuracy_score(labels, preds))
Example no. 24
 def __init__(self, lmnn=False, max_iter=1000, lm_num=200):
     # self.clf =  LabelPropagation(kernel='knn',max_iter=1000,n_jobs=10,n_neighbors=25)
     self.clf = LabelSpreading(kernel='knn',
                               n_neighbors=25,
                               max_iter=max_iter,
                               alpha=0.2,
                               n_jobs=-1)
     self.lmnn = lmnn
     self.lm_num = lm_num
     if lmnn:
         self.ml = LMNN(use_pca=False, max_iter=2000)
Example no. 25
def knn(X, labels):
    # #############################################################################
    # Learn with LabelSpreading
    label_spread = LabelSpreading(kernel='knn', alpha=0.6, max_iter=100)
    label_spread.fit(X, labels)

    # #############################################################################
    # Plot output labels
    output_labels = label_spread.transduction_

    return output_labels
Example no. 26

def propagate_labels(
    features,
    labels,
):
    label_prop_model = LabelSpreading(kernel=construct_graph, n_jobs=-1)
    label_prop_model.fit(features, labels)
    logger.debug(label_prop_model.classes_)
    # preds = label_prop_model.predict(features)
    preds = label_prop_model.predict_proba(features)
    # logger.debug(label_prop_model.classes_)

    return preds
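construct_graph is not defined in this snippet; scikit-learn's LabelSpreading accepts a callable kernel that takes two arrays and returns an affinity matrix. A minimal RBF-based sketch of what it might look like:

from sklearn.metrics.pairwise import rbf_kernel

def construct_graph(X, Y):
    # dense Gaussian affinity matrix between the two sample sets
    return rbf_kernel(X, Y, gamma=20)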
Example no. 27
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    #######################
    ### SEMI-SUPERVISED ###
    ########################
    # Label Propagation
    label_prop_model = LabelPropagation(kernel='knn')
    label_prop_model.fit(trainX, trainY)
    Y_Pred = label_prop_model.predict(testX)
    acc_ss_propagation, acc_ss_propagation_INFO = check_accuracy(testY, Y_Pred)
    # Label Spreading
    label_prop_models_spr = LabelSpreading(kernel='knn')
    label_prop_models_spr.fit(trainX, trainY)
    Y_Pred = label_prop_models_spr.predict(testX)
    acc_ss_spreading, acc_ss_spreading_INFO = check_accuracy(testY, Y_Pred)
    ########################
    #### WITHOUT TL ########
    ########################
    # LogisticRegression 
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    predLR = modelLR.predict(testX)
    accLR, acc_LR_INFO = check_accuracy(testY, predLR)
    # DecisionTreeClassifier
    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    predDT = modelDT.predict(testX)
    accDT, acc_DT_INFO = check_accuracy(testY, predDT)
    # BernoulliNB
    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    predND = modelNB.predict(testX)
    accNB, acc_NB_INFO = check_accuracy(testY, predND)
    #
    return pd.DataFrame(
        [{ 
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,

        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO':acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO':acc_ss_spreading_INFO,
        'acc_LR':accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO)       

        }]
    )
 def _semi_supervised_learning(self, data_matrix, target):
     n_classes = len(set(target))
     # if there are too few classes (i.e. fewer than two real classes besides
     # the unlabeled -1 marker) then just bail out and return the original
     # target, since one cannot meaningfully spread a single class
     if n_classes > 2:
         semi_supervised_estimator = LabelSpreading(kernel='knn', n_neighbors=self.n_neighbors)
         semi_supervised_estimator.fit(data_matrix, target)
         predicted_target = semi_supervised_estimator.predict(data_matrix)
         predicted_target = self._clamp(target, predicted_target)
         return predicted_target
     else:
         return target
Example no. 29
def computeSimilarities2(lsi_align,
                         vsm_align,
                         lda_align,
                         f1_align,
                         f2_align,
                         f3_align,
                         f4_align,
                         f5_align,
                         f6_align,
                         numLine,
                         numRan=10):
    """ build the model with the semi supervised approach labelSpreading     
    Args:
    matrix_values: descriptor matrix i.e. all the probability of all ir models
    vect: the answer vector (value 0 for false links and 1 for true links);
    numLine: number of pairs of artefacts
  Returns:
    preds: probability that a pair of artefact is linked
  """
    allPrediction = []
    model = LabelSpreading()

    #compute multiple (10) random vector of the matrix_values
    for i in range(0, numRan):
        subVect, subMatrix_values, matrix_values = buildingTrainingSet2(
            lsi_align, vsm_align, lda_align, f1_align, f2_align, f3_align,
            f4_align, f5_align, f6_align, numLine)
        #compute the prediction function of each random vector
        print(len(subVect))
        print(len(subMatrix_values))
        computeModel = model.fit(subMatrix_values, subVect)
        print("new predicted function computed")
        #compute the prediction of each pair of artefact with the random model
        preds0 = computeModel.predict_proba(matrix_values)
        allPrediction.append(preds0[:, 1])

    # by the "vote majoritaire"
    preds = vote(allPrediction, len(lsi_align), numRan)

    result = []

    print(len(preds))

    for p in preds:
        for i in range(0, len(p)):
            result.append(p[i])

    print(len(result))

    return result
Example no. 31
def LabelPropagation(support, support_ys, query):
    alpha = 0.3
    k_neighbours = 38
    all_embeddings = np.concatenate((support, query), axis=0)
    #X = all_embeddings.cpu().detach().numpy()
    labels = np.full(all_embeddings.shape[0], -1.)
    labels[:support.shape[0]] = support_ys
    label_propagation = LabelSpreading(kernel='knn',
                                       alpha=alpha,
                                       n_neighbors=k_neighbours,
                                       tol=0.000001)
    label_propagation.fit(all_embeddings, labels)
    predicted_labels = label_propagation.transduction_
    query_prop = predicted_labels[support.shape[0]:]
    return query_prop
Example no. 32
def computeSimilarities(vect, matrix_values):
    """ build the model with the semi supervised approach labelSpreading     
    Args:
    matrix_values: descriptor matrix i.e. all the probability of all ir models
    vect: the answer vector (value 0 for false links and 1 for true links);
  Returns:
    preds: probability that a pair of artefact is linked
  """
    model = LabelSpreading()
    computeModel = model.fit(matrix_values, vect)
    print("model built")
    preds = computeModel.predict_proba(matrix_values)
    print(preds)

    return preds[:, 1]
Example no. 33
class SemiSupervised(BaselineModel):
    """
    LabelSpreading Implementation
    """
    def fit(self):
        #Need to concatenate labeled and unlabeled data
        #unlabeled data labels are set to -1
        X = np.concatenate(
            (self.val_primitive_matrix, self.train_primitive_matrix))
        val_labels = (self.val_ground + 1) / 2.
        train_labels = -1. * np.ones(np.shape(self.train_primitive_matrix)[0])
        y = np.concatenate((val_labels, train_labels))

        self.model = LabelSpreading(kernel='knn')
        self.model.fit(X, y)
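Since fit rescales the ground labels from {-1, 1} into {0, 1}, downstream code presumably maps the transduced labels back; a sketch under that assumption (helper name is hypothetical):

import numpy as np

def train_labels_from_model(model, n_val):
    # rows after the first n_val validation samples are the training data;
    # map the {0, 1} transduced labels back to the original {-1, 1} scale
    return 2 * np.asarray(model.transduction_[n_val:]) - 1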
Example no. 34
def compute_confident_measure(vect,
                              lsi_align,
                              lda_align,
                              vsm_align,
                              emb_align,
                              matrixSize=4):
    """ compute confidence measure for each pair of artefacts.
  Args:
    vect: the answer vector (value 0 for false links and 1 for true links);
    lsi_align, vsm_align, lda_align: IR models aligned;
    filename: file where the aggreg similarity scores results will be saved;
  Returns:
     preds: probability that a pair of artefact is linked."""

    vect0 = np.zeros((len(lsi_align), 1))
    vect0[:] = -1

    trueL, falseL = CreateTraining_set.create_link_class(vect, lsi_align)
    print(len(trueL))
    print(len(falseL))
    for i in falseL:
        vect0[i] = 0

    for i in trueL:
        vect0[i] = 1

    print("annoted_vectors_ok")

    emb_values = compValues(emb_align)
    vsm_values = compValues(vsm_align)
    lda_values = compValues(lda_align)
    lsi_values = compValues(lsi_align)

    matrix_values = np.zeros((len(lsi_align), matrixSize))

    matrix_values[:, 0] = lda_values
    matrix_values[:, 1] = emb_values
    matrix_values[:, 2] = vsm_values
    matrix_values[:, 3] = lsi_values

    model = LabelSpreading()
    computeModel = model.fit(matrix_values, vect0)
    print("models built")
    preds = computeModel.predict_proba(matrix_values)

    print(preds)

    return preds[:, 1]
Example no. 35
def semi_supervised_learning(data_matrix, target):
    if -1 in list(target):
        # if -1 is present in target do label spreading
        from sklearn.semi_supervised import LabelSpreading
        label_prop_model = LabelSpreading(kernel='knn', n_neighbors=5)
        label_prop_model.fit(data_matrix, target)
        pred_target = label_prop_model.predict(data_matrix)
        extended_target = []
        for pred_label, label in zip(pred_target, target):
            if label != -1 and pred_label != label:
                extended_target.append(label)
            else:
                extended_target.append(pred_label)
    else:
        extended_target = target
    return np.array(extended_target)
Example no. 36

def train_model(nodes, datasets):
    y = np.array(range(len(nodes)))
    nodes = list(nodes)
    vectorizer = DictVectorizer(sparse=True)
    for i, dataset in enumerate(datasets):
        g = compute_dataset(dataset)
        nodes.extend(g.classes)
        sys.stdout.write('\r')
        sys.stdout.write(str(i+1))
        sys.stdout.flush()
    X = vectorizer.fit_transform([dict(node.concept_vector) for node in nodes])
    y = np.concatenate([y, -np.ones(len(nodes) - len(y), dtype=int)])  # -1 marks unlabeled nodes

    unlabeled = []
    model = LabelSpreading()
    model.fit(X, y)
    model.vectorizer = vectorizer
    return model
Example no. 37
def propogation(model, uids, labeled_ids):
    X, y1, y2 = [], [], []
    pool = []
    for uid in labeled_ids:
        X.append(model.docvecs[uid])
        y1.append(1)
    for uid in uids:
        if uid not in labeled_ids:
            X.append(model.docvecs[uid])
            y2.append(-1)
    label_prop_model = LabelSpreading(kernel='knn', alpha=1.0)
    y2 = np.array(y2)
    y2[0:(len(y1)-1)] = 0
    print(len(y1) + len(y2))
    for i in range(5):
        np.random.shuffle(y2)
        label_prop_model.fit(X, y1 + y2.tolist())
        pool.append(label_prop_model.transduction_)
    pickle.dump(pool, open('data/propagation.pick', 'wb'))
    pool = pickle.load(open('data/propagation.pick', 'rb'))
    pool = np.array(pool)
    for column in pool.T:
        print(column)
Example no. 38
def main():
    usage = "usage prog [options] arg"
    parser = OptionParser(usage=usage)

    parser.add_option("-t", "--task", dest="task", help="the task name")
    parser.add_option("-o", "--output", dest="output", help="the output file")

    (options, remainder) = parser.parse_args()

    train_paths = [
        "../data/train_simple_feature.csv",
        "../data/train_plus_feature.csv",
        "../data/train_azure_plus_feature.csv",
        #"../data/train_azure_feature.csv",
        #"../data/train_module_feature.csv",
        #"../data/train_course_feature.csv",
        #"./blend_train.csv"
        ]
    label_path = "../data/truth_train.csv"

    test_paths = [
        "../data/test_simple_feature.csv",
        "../data/test_plus_feature.csv",
        "../data/test_azure_plus_feature.csv",
        #"../data/test_azure_feature.csv",
        #"../data/test_module_feature.csv",
        #"../data/test_course_feature.csv",
        #"./blend_test.csv"
        ]

    train = merge_features(train_paths, label_path)
    train = train.drop(['user_drop_ratio'], axis=1)
    #train['user_drop_ratio'] = (train['user_drop_ratio'] + 8.0 / train['user_courses']) / (1.0 + 10.0 / train['user_courses'])
    y = encode_labels(train.dropout.values)
    train = train.drop('dropout', axis=1)
    tr_ids = train.enrollment_id.values
    X = train.drop('enrollment_id', axis=1)
    m, n = X.shape
    print('train.shape=%s' % str(X.shape))


    test = merge_features(test_paths)
    test = test.drop(['user_drop_ratio'], axis=1)
    #test['user_drop_ratio'] = (test['user_drop_ratio'] + 8.0 / test['user_courses']) / (1.0 + 10.0 / test['user_courses'])
    tt_ids = test.enrollment_id.values
    X_test = test.drop('enrollment_id', axis=1)
    print('test.shape=%s' % str(X_test.shape))

    scaler = StandardScaler().fit(np.vstack((X, X_test)))

    task = options.task
    if not task:
        task = "blend"

    if task == 'blend':

        clf_list = [
            #("knn_p2_10", create_clf('knn', {"n_neighbors": 10, "p": 2})),
            #("knn_p2_10_scaler", create_clf('knn', {"n_neighbors": 10, "p": 2, "scaler": scaler})),
            #("knn_p2_100", create_clf('knn', {"n_neighbors": 100, "p": 2})),
            #("knn_p2_100_scaler", create_clf('knn', {"n_neighbors": 100, "p": 2, "scaler": scaler})),
            #("knn_p2_500", create_clf('knn', {"n_neighbors": 500, "p": 2})),
            #("knn_p2_500_scaler", create_clf('knn', {"n_neighbors": 500, "p": 2, "scaler": scaler})),
            #("knn_p2_100", create_clf('knn', {"n_neighbors": 100, "p": 2})),
            #("knn_p2_800", create_clf('knn', {"n_neighbors": 800, "p": 2})),
            #("knn_p1_10", create_clf('knn', {"n_neighbors": 10, "p": 1})),
            #("knn_p1_10_scaler", create_clf('knn', {"n_neighbors": 10, "p": 1, "scaler": scaler})),
            #("knn_p1_100", create_clf('knn', {"n_neighbors": 100, "p": 1})),
            #("knn_p1_100_scaler", create_clf('knn', {"n_neighbors": 100, "p": 1, "scaler": scaler})),
            #("knn_p1_500", create_clf('knn', {"n_neighbors": 500, "p": 1})),
            #("knn_p1_500_scaler", create_clf('knn', {"n_neighbors": 500, "p": 1, "scaler": scaler})),
            #("knn_p1_800", create_clf('knn', {"n_neighbors": 800, "p": 1})),
            #("knn_p1_800_scaler", create_clf('knn', {"n_neighbors": 800, "p": 1, "scaler": scaler})),
            #("extra_gini_10depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 10})),
            #("extra_entropy_10depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 10})),
            #("extra_gini_20depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 20})),
            #("extra_entropy_20depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 20})),
            ("extra_gini_30depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 30})),
            ("extra_entropy_30depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 30})),
            #("rfc_gini_3depth", create_clf("rfc", {"criterion": "gini", "max_depth": 3, "n_estimators": 200})),
            #("rfc_entropy_3depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 3, "n_estimators": 200})),
            ("rfc_gini_5depth", create_clf("rfc", {"criterion": "gini", "max_depth": 5, "n_estimators": 200})),
            ("rfc_entropy_5depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 5, "n_estimators": 200})),
            ("rfc_gini_6depth", create_clf("rfc", {"criterion": "gini", "max_depth": 6, "n_estimators": 200})),
            ("rfc_entropy_6depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 6, "n_estimators": 200})),
            ("rfc_gini_8depth", create_clf("rfc", {"criterion": "gini", "max_depth": 8, "n_estimators": 200})),
            ("rfc_entropy_8depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 8, "n_estimators": 200})),
            ("rfc_gini_10depth", create_clf("rfc", {"criterion": "gini", "max_depth": 10, "n_estimators": 200})),
            ("rfc_entropy_10depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 200})),
            ("rfc_gini_12depth", create_clf("rfc", {"criterion": "gini", "max_depth": 12, "n_estimators": 200})),
            ("rfc_entropy_12depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 12, "n_estimators": 200})),
            #("xgb_1500_2depth", create_clf("xgb", {"max_depth": 2, "n_estimators": 1500, "learning_rate": 0.03})),
            #("xgb_600_3depth", create_clf("xgb", {"max_depth": 3, "n_estimators": 600, "learning_rate": 0.03})),
            #("xgb_600_4depth", create_clf("xgb", {"max_depth": 4, "n_estimators": 600, "learning_rate": 0.03})),
            ("xgb_600_5depth", create_clf("xgb", {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03})),
            #("xgb_600_6depth", create_clf("xgb", {"max_depth": 6, "n_estimators": 600, "learning_rate": 0.02})),
            #("xgb_600_7depth", create_clf("xgb", {"max_depth": 7, "n_estimators": 600, "learning_rate": 0.01})),
            #("xgb_600_8depth", create_clf("xgb", {"max_depth": 8, "n_estimators": 600, "learning_rate": 0.01})),
            #("lgc_1c_scale", create_clf("lgc", {"C": 1.0, "scaler": scaler})),
            #("lgc_1c", create_clf("lgc", {"C": 1.0})),
            #("lgc_1c_l1", create_clf("lgc", {"C": 1.0, "penalty": "l1"})),
            #("lgc_3c_scale", create_clf("lgc", {"C": 3.0, "scaler": scaler})),
            #("lgc_3c", create_clf("lgc", {"C": 3.0})),
            ("lgc_3c_l1", create_clf("lgc", {"C": 3.0, "penalty": "l1"})),
            #("lgc_5c_scale", create_clf("lgc", {"C": 5.0, "scaler": scaler})),
            #("lgc_5c", create_clf("lgc", {"C": 5.0})),
            ]

        X = X.values
        blend_train, blend_test = train_blend(X, y, X_test, clf_list, 5)
        print('blend_train.shape=%s' % str(blend_train.shape))
        print('blend_test.shape=%s' % str(blend_test.shape))


        cols = [cname for cname, clf in clf_list]
        cols = ['enrollment_id'] + cols
        blend_train_ids = np.hstack((np.matrix(tr_ids).T, blend_train))
        blend_test_ids = np.hstack((np.matrix(tt_ids).T, blend_test))
        dump_data(blend_train_ids, cols, "new_blend_train.csv")
        dump_data(blend_test_ids, cols, "new_blend_test.csv")


        blender = create_clf('lgc', {"C": 1.0, "penalty": "l1"})
        auc = cv_loop(blend_train, y, blender)
        print('AUC (LGC blend): %f' % auc)

        blender = create_clf('ext', {"max_depth": 10, "criterion": "entropy", "n_estimators": 100})
        auc = cv_loop(blend_train, y, blender)
        print('AUC (EXT blend): %f' % auc)

        blender = create_clf('xgb', {'max_depth': 2, "n_estimators": 150, "learning_rate": 0.05})
        auc = cv_loop(blend_train, y, blender)
        print("AUC (XGB blend {d: %d, n: %d}): %f" % (2, 150, auc))
        blender = create_clf('xgb', {'max_depth': 3, "n_estimators": 200, "learning_rate": 0.05})
        auc = cv_loop(blend_train, y, blender)
        print('AUC (XGB blend {d: %d, n: %d}): %f' % (3, 200, auc))

        blender = create_clf('xgb', {'max_depth': 3, "n_estimators": 100, "learning_rate": 0.1})
        blender = blender.fit(blend_train, y)
        preds = blender.predict_proba(blend_test)[:,1]
        write_submission(tt_ids, preds, "new_blend_submission.csv")

        combined_train = np.hstack((X, blend_train))
        combined_test = np.hstack((X_test, blend_test))
        blender = create_clf('xgb', {'max_depth': 5, "n_estimators": 600, "learning_rate": 0.03})
        blender = blender.fit(combined_train, y)
        preds = blender.predict_proba(combined_test)[:,1]
        write_submission(tt_ids, preds, "new_combined_blend_submission.csv")

    elif task == 'lgc':
        print('Try logistic regression ..')
        clf = create_clf("lgc", {"C": 3, "scaler": scaler, "penalty": "l1"})
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == "ext":
        print('Try ExtraTreesClassifier')
        #clf = create_clf("ext", {"max_depth": 10}) # 0.86261
        #clf = create_clf("ext", {"max_depth": 20}) # 0.862636
        #clf = create_clf("ext", {"max_depth": 30}) # 0.860944
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 10}) # 0.862610
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20}) # 0.862564
        clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 100}) # 0.861795
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 2000}) # 0.862695
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 30, "n_estimators": 2000}) # 0.860
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == 'rfc':
        print('Try RFC ..')
        #clf = create_clf('rfc', {'max_depth': 5}) # 0.859583
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10}) # 0.863369
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 200}) # 0.863285
        # clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 100}) # 0.863207
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "max_features": None, "n_estimators": 200}) # 0.863341
        clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 40, "max_features": None, "n_estimators": 10000, "min_samples_split": 100}) # 0.863291
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == 'knn':
        clf = create_clf('knn', {"n_neighbors": 800, "p": 2, "scaler": scaler})
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == "gbt":
        paras = json.load(open('paras/gbt.json', 'r'))
        clf = create_clf("gbt", paras)
        clf = clf.fit(X, y)
        preds = clf.predict_proba(X_test)[:,1]
        write_submission(tt_ids, preds, "gbt_submission.csv")

    elif task == "xgb":
        #clf = create_clf('xgb', {"max_depth": 2, "n_estimators": 1500, "learning_rate": 0.03}) # 0.860279
        #clf = create_clf('xgb', {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03}) # public: 0.8891443712867697;
        clf = create_clf('xgb', {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03}) # public:
        auc = cv_loop(X, y, clf, 5)
        print "AUC (all): %f" % auc
        #sys.exit()
        clf = clf.fit(X, y)
        preds = clf.predict_proba(X_test)[:,1]
        write_submission(tt_ids, preds, 'xgb_new_submission.csv')
    elif task == "deep":
        clf = create_clf('deep', {"neuro_num": 512, "nb_epoch": 20, "scaler": scaler, "optimizer": "adadelta"})
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)
        sys.exit(0)

        clf = clf.fit(X, y)
        preds = clf.predict_proba(X_test)[:,1]
        write_submission(tt_ids, preds, 'deep_submission.csv')

    elif task == 'semi':
        clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 100}) # 0.861795
        train_semi(X, y, X_test, clf, 5)

    elif task == 'gbc':
        from sklearn.ensemble import GradientBoostingClassifier
        clf = GradientBoostingClassifier(n_estimators=300,
                                        learning_rate=0.1,
                                        min_samples_split=50,
                                        min_samples_leaf=50,
                                        max_depth=10,
                                        subsample=0.6,
                                        max_features='log2',
                                        verbose=1)
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)

    elif task == 'label':
        from sklearn.semi_supervised import LabelPropagation
        from sklearn.semi_supervised import LabelSpreading
        label_prop_model = LabelSpreading()
        all_X = np.vstack((X, X_test))
        tm, tn = X_test.shape
        unlabeled = [-1] * tm
        ys = [list(y)]
        ys.append(unlabeled)
        labels = np.concatenate(ys)
        print('ALL shape=%s' % str(all_X.shape))
        print('ALL y shape=%s' % str(labels.shape))
        label_prop_model.fit(all_X, labels)
Example no. 39

"""
Sample transduced labels (output truncated):
  4.  6.  6.  4.  2.  3.  1.  6.  6.  1. ...
(500, 100)
(500,)
"""

from sklearn.semi_supervised import LabelSpreading

label_propagation_model = LabelSpreading()
label_propagation_model.fit(X, y)

# make predictions for the first twenty samples (some labeled, some unlabeled)
for i in range(20):
    print('y: ', y[i], '\t', 'y_hat: ', label_propagation_model.predict(X[i].reshape(1, -1)))
"""
y:  6.0     y_hat:  [6.]
y:  6.0     y_hat:  [6.]
y:  2.0     y_hat:  [2.]
y:  1.0     y_hat:  [1.]
y:  -1.0    y_hat:  [6.]    *
y:  2.0     y_hat:  [2.]
y:  6.0     y_hat:  [6.]
y:  4.0     y_hat:  [4.]
y:  3.0     y_hat:  [3.]
y:  5.0     y_hat:  [5.]
"""
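The loop above issues one predict call per sample; a single vectorized call gives the same predictions:

y_hat = label_propagation_model.predict(X[:20])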
Example no. 40

rate = 1

train_comb = train
# select a 1/rate fraction of the test data
selected_test = test.sample(test.shape[0] // rate, replace=False, random_state=20422438)
train_comb = train_comb.append(selected_test)
train_comb_label = train_label
train_comb_unlabeled = pd.DataFrame(np.array([-1] * (test.shape[0] // rate)))
train_comb_label = np.array(train_comb_label.append(train_comb_unlabeled))
train_comb_label = train_comb_label.reshape(len(train_comb_label))


a_level = 1
label_prop_model = LabelSpreading(kernel="knn",alpha=a_level)
label_prop_model.fit(train_comb, train_comb_label)
pred_y = label_prop_model.transduction_
pred_y[:train.shape[0]] = train_label



X_train, X_test, y_train, y_test = train_test_split(label_prop_model.X_, pred_y, test_size=0.10, random_state=20422438)

model_erf = se.ExtraTreesClassifier(random_state=20422438,n_jobs=-1,n_estimators=1000)
model_erf.fit(X_train,y_train)

model_erf_pred = model_erf.predict(X_test)
model_erf_error = errFn(model_erf_pred,y_test)

print(a_level)
# Scikit-learn's LabelSpreading method for semi-supervised learning

import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import LabelSpreading

label_propagation_model = LabelSpreading()
iris = datasets.load_iris()

'''
Unlabeled entries in y

It is important to assign an identifier to unlabeled points along
with the labeled data when training the model with the fit method. 
The identifier that this implementation uses is the integer value -1.
'''

# generate a boolean mask where roughly 30% of entries are True
rand_numgen = np.random.RandomState(42)
random_unlabeled_points = rand_numgen.rand(len(iris['target'])) < 0.3

# create the unlabelled data in the labels (setting to -1)
full_labels = iris['target']
cutdown_labels = np.copy(iris['target'])
cutdown_labels[random_unlabeled_points] = -1

print(full_labels)
'''
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2