Example #1
    def fit(self, min_k=1, max_k=9, verbose=True):
        acc_ones = []
        acc_zeros = []
        accs = []
        if verbose:
            print(
                '-----------------------------------START KNN OPTIMIZATION------------------------------------'
            )
        best = dict()
        for (n, a) in [(x, a) for x in range(min_k, max_k + 1, 2)
                       for a in [0.1, 0.01, 0.001, 0.0001]]:
            metric = []
            for i in range(self.K):
                train_X = self.train_Xs[i].copy()
                train_y = self.train_ys[i].copy()

                test_X = self.val_Xs[i].copy()
                test_y = self.val_ys[i].copy()

                model = label_propagation.LabelSpreading(kernel='knn',
                                                         n_neighbors=n,
                                                         alpha=a)
                model.fit(train_X, train_y)
                y_pred = model.predict(test_X)

                # evaluate by accuracy:
                # count true positives, true negatives, and total correct predictions
                ones = [(x, y) for (x, y) in zip(y_pred, test_y)
                        if x == y and y == 1.0]
                zeros = [(x, y) for (x, y) in zip(y_pred, test_y)
                         if x == y and y == 0.0]
                nc = [(x, y) for (x, y) in zip(y_pred, test_y) if x == y]

                one = test_y.count(1.0)
                zero = test_y.count(0.0)

                # accuracy calculation
                if verbose:
                    print(f'{i} - fold')

                acc_ones.append(division(len(ones), one))
                acc_zeros.append(division(len(zeros), zero))
                accs.append(division(len(nc), len(y_pred)))
                metric.append(
                    avg([division(len(ones), one),
                         division(len(zeros), zero)]))
            best.setdefault(avg(metric), (n, a))
            if verbose:
                print(f'----------------------{n}----------------------------')
                print(f'total one: {avg(acc_ones)}')
                print(f'total zero: {avg(acc_zeros)}')
                print(f'total accuracy: {geo_avg(accs)}')
        conf = best[max(best)]
        if verbose:
            print(
                '-----------------------------------RESUME KNN OPTIMIZATION------------------------------------'
            )
            print(f"best n: {conf[0]} - best alpha: {conf[1]}")
        print('knn learned')
        return conf
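The fold loop above relies on three helpers (division, avg, geo_avg) that are not shown in this example. Below is a minimal sketch of what they are assumed to do (zero-safe division, arithmetic mean, geometric mean); the behaviour is inferred from the call sites rather than taken from the original code.

import numpy as np

def division(numerator, denominator):
    # zero-safe division: a fold may contain no positive (or no negative) samples
    return numerator / denominator if denominator else 0.0

def avg(values):
    # arithmetic mean of a list of fold-level scores
    return sum(values) / len(values)

def geo_avg(values):
    # geometric mean, used for the overall accuracy print-out
    if any(v <= 0 for v in values):
        return 0.0
    return float(np.exp(np.mean(np.log(values))))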
Example #2
 def learn(self, i, conf):
     train_X = self.train_Xs[i].copy()
     train_y = self.train_ys[i].copy()
     return label_propagation.LabelSpreading(kernel='knn',
                                             n_neighbors=conf[0],
                                             alpha=conf[1]).fit(
                                                 train_X, train_y)
Example #3
def RBFKernel_optimization_experiment():
    best_gamma = 0.0
    best_acc = -1
    gammas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    X_labeled = training_dataset[:500]
    y_labeled = training_labels_copy[0:500]
    for j in range(500, 5000):
        training_labels[j] = -1
    for gamma in gammas:
        label_spread = label_propagation.LabelSpreading(kernel='rbf',
                                                        gamma=gamma,
                                                        alpha=0.8)
        label_spread.fit(training_dataset, training_labels)
        y_training_predicted = label_spread.predict(X_labeled)
        y_testing_predicted = label_spread.predict(training_dataset)
        count = 0
        count1 = 0
        for k in range(500):
            if y_labeled[k] == y_training_predicted[k]:
                count = count + 1
        for l in range(5000):
            if training_labels_copy[l] == y_testing_predicted[l]:
                count1 = count1 + 1
        if best_acc < (count1 / 5000):
            best_gamma = gamma
            best_acc = count1 / 5000
        print("when gamma is " + str(gamma) + ", the training accuracy is " +
              str(count / 500) + ", the testing accuracy is " +
              str(count1 / 5000))
    print("the best gamma for RBF kernel is " + str(best_gamma))
    return best_gamma, best_acc
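This experiment, like the alpha, n_neighbors and split experiments that follow, assumes module-level variables training_dataset, training_labels and training_labels_copy holding 5000 samples, where training_labels_copy keeps the original labels and training_labels is partially overwritten with -1 to mark unlabeled points. A hedged sketch of such a setup, using synthetic data purely for illustration:

import numpy as np
from sklearn.datasets import make_classification

# Hypothetical setup assumed by the optimization experiments:
# 5000 samples, with a copy of the labels kept before part of them is masked to -1.
training_dataset, training_labels = make_classification(n_samples=5000,
                                                        n_features=20,
                                                        n_classes=2,
                                                        random_state=0)
training_labels_copy = training_labels.copy()  # ground truth kept for evaluation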
Example #4
def alpha_optimization_experiment():
    best_acc = -1
    best_alpha = 0.0
    alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
    X_labeled = training_dataset[:500]
    y_labeled = training_labels_copy[0:500]
    for j in range(500, 5000):
        training_labels[j] = -1
    for alpha in alphas:
        label_spread = label_propagation.LabelSpreading(kernel='knn',
                                                        n_neighbors=10,
                                                        alpha=alpha)
        label_spread.fit(training_dataset, training_labels)
        y_training_predicted = label_spread.predict(X_labeled)
        y_testing_predicted = label_spread.predict(training_dataset)
        count = 0
        count1 = 0
        for k in range(500):
            if y_labeled[k] == y_training_predicted[k]:
                count = count + 1
        for l in range(5000):
            if training_labels_copy[l] == y_testing_predicted[l]:
                count1 = count1 + 1
        if best_acc < (count1 / 5000):
            best_alpha = alpha
            best_acc = count1 / 5000
        print("when alpha is " + str(alpha) + ", the training accuracy is " +
              str(count / 500) + ", the testing accuracy is " +
              str(count1 / 5000))
    print("the best alpha is " + str(best_alpha))
    return best_alpha
Example #5
def KNNKernel_optimization_experiment():
    best_numNei = 0
    best_acc = -1
    numNeis = [1, 3, 5, 7, 10, 20, 40, 60, 80, 100, 200, 400, 800, 1000]
    X_labeled = training_dataset[:500]
    y_labeled = training_labels_copy[0:500]
    for j in range(500, 5000):
        training_labels[j] = -1
    for numNei in numNeis:
        label_spread = label_propagation.LabelSpreading(kernel='knn',
                                                        n_neighbors=numNei,
                                                        alpha=0.8)
        label_spread.fit(training_dataset, training_labels)
        y_training_predicted = label_spread.predict(X_labeled)
        y_testing_predicted = label_spread.predict(training_dataset)
        count = 0
        count1 = 0
        for k in range(500):
            if y_labeled[k] == y_training_predicted[k]:
                count = count + 1
        for l in range(5000):
            if training_labels_copy[l] == y_testing_predicted[l]:
                count1 = count1 + 1
        if best_acc < (count1 / 5000):
            best_numNei = numNei
            best_acc = count1 / 5000
        print("When the number of neighbor is " + str(numNei) +
              ", the training accuracy is " + str(count / 500) +
              " the testing accuracy is " + str(count1 / 5000))
    print("the best number of neighbor is " + str(best_numNei))
    return best_numNei, best_acc
Example #6
    def labelSpreading(labeled, unlabeled, xcols, ycols, alpha_v=0.8):
        """
        KNN label spreading testing
        
        Arguments:
            labeled {array} -- labeled data
            unlabeled {array} -- unlabeled data
            xcols {array} -- x columns
            ycols {array} -- y columns
        
        Keyword Arguments:
            alpha_v {double} -- alpha parameter (default: {0.8})
        """

        x = labeled.loc[:, xcols]
        y = labeled.loc[:, ycols]

        #Using LabelSpreading
        label_spread = label_propagation.LabelSpreading(kernel="knn",
                                                        alpha=alpha_v)
        label_spread.fit(x, y.values.ravel())

        #output labels
        preds = label_spread.predict(unlabeled)

        unlabeled.loc[:, "label"] = preds

        labeled = pd.concat([labeled, unlabeled],
                            sort=False).reset_index(drop=True)  #Combining

        return (labeled)
Example #7
def test_valid_alpha():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes,
                               n_samples=200,
                               random_state=0)
    for alpha in [-0.1, 0, 1, 1.1, None]:
        with pytest.raises(ValueError):
            label_propagation.LabelSpreading(alpha=alpha).fit(X, y)
Example #8
def test_label_spreading_closed_form():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    y[::3] = -1
    clf = label_propagation.LabelSpreading().fit(X, y)
    # adopting notation from Zhou et al (2004):
    S = clf._build_graph()
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    Y = Y[:, :-1]
    for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
        expected = np.dot(np.linalg.inv(np.eye(len(S)) - alpha * S), Y)
        expected /= expected.sum(axis=1)[:, np.newaxis]
        clf = label_propagation.LabelSpreading(max_iter=10000, alpha=alpha)
        clf.fit(X, y)
        assert_array_almost_equal(expected, clf.label_distributions_, 4)
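In the notation of Zhou et al. (2004), S above is the symmetrically normalized affinity matrix S = D^(-1/2) W D^(-1/2) with a zeroed diagonal, and the closed form being checked is F* = (I - alpha*S)^(-1) Y followed by row normalization. A small illustrative sketch of that construction from an RBF affinity (this is the paper's construction, not the library's private _build_graph):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

def normalized_affinity(X, gamma=20):
    # S = D^(-1/2) W D^(-1/2), with W the RBF affinity and D its degree matrix
    W = rbf_kernel(X, X, gamma=gamma)
    np.fill_diagonal(W, 0.0)  # no self-loops, as in Zhou et al. (2004)
    d_inv_sqrt = 1.0 / np.sqrt(W.sum(axis=1))
    return W * d_inv_sqrt[:, np.newaxis] * d_inv_sqrt[np.newaxis, :]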
Example #9
 def __init__(self, _feats, max_iter=10):
     super(SSLBased, self).__init__(_feats)
     self.max_iter = max_iter
     self.ssl_model = label_propagation.LabelSpreading(
         kernel='rbf', n_neighbors=10, max_iter=self.max_iter)
Example #10
 def objective(self, x):
     model = label_propagation.LabelSpreading(kernel=self.kernel,
                                              alpha=self.alpha,
                                              gamma=x)
     model.fit(self.x, self.y)
     label_prob = model.label_distributions_
     return get_average_label_entropy(
         label_prob) + self.learning_rate * x**2
Example #11
def make_model3():
    model = label_propagation.LabelSpreading(kernel='knn', n_neighbors=15)
    sensor_data = dataset.load_data()
    X, y = sensor_data.data[:200], sensor_data.target[:200]
    model.fit(X, y)
    np.savetxt("X.csv", X, delimiter=",", fmt='%10.5f')
    np.savetxt("y_train.csv", y, delimiter=",", fmt='%10.1f')
    return model
Example #12
def test_convergence_warning():
    # This is a non-regression test for #5774
    X = np.array([[1., 0.], [0., 1.], [1., 2.5]])
    y = np.array([0, 1, -1])
    mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=1)
    assert_warns(ConvergenceWarning, mdl.fit, X, y)
    assert mdl.n_iter_ == mdl.max_iter

    mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=1)
    assert_warns(ConvergenceWarning, mdl.fit, X, y)
    assert mdl.n_iter_ == mdl.max_iter

    mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=500)
    assert_no_warnings(mdl.fit, X, y)

    mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=500)
    assert_no_warnings(mdl.fit, X, y)
Example #13
def test_valid_alpha():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    for alpha in [-0.1, 0, 1, 1.1, None]:
        assert_raises(ValueError,
                      lambda **kwargs:
                      label_propagation.LabelSpreading(**kwargs).fit(X, y),
                      alpha=alpha)
Example #14
def test_convergence_speed():
    # This is a non-regression test for #5774
    X = np.array([[1., 0.], [0., 1.], [1., 2.5]])
    y = np.array([0, 1, -1])
    mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=5000)
    mdl.fit(X, y)

    # this should converge quickly:
    assert mdl.n_iter_ < 10
    assert_array_equal(mdl.predict(X), [0, 1, 1])
Example #15
def get_85p_churn(train_15p, train_15p_churn, train_85p, **params):
    """
    半监督学习
    :param train_15p:
    :param train_15p_churn:
    :param train_85p:
    :param params:
    :return:
    """

    if DefaultConfig.semi_model == 'pseudo_labeler':
        from xgboost import XGBClassifier
        from lightgbm import LGBMClassifier
        from catboost import CatBoostClassifier

        model = None
        sample_rate = 0.3
        if DefaultConfig.select_model == 'xgb':
            model = XGBClassifier(nthread=10)
            sample_rate = 0.3

        elif DefaultConfig.select_model == 'lgb':
            model = LGBMClassifier(n_jobs=10)
            sample_rate = 0.3

        elif DefaultConfig.select_model == 'cat':
            model = CatBoostClassifier(thread_count=10)
            sample_rate = 0.3

        models = PseudoLabeler(
            model=model,
            unlabled_data=train_85p,
            features=train_85p.columns,
            target='Churn',
            sample_rate=sample_rate)

        models.fit(train_15p, train_15p_churn)
        train_85p['Churn'] = models.predict(train_85p)

    elif DefaultConfig.semi_model == 'label_spreading':
        from sklearn.semi_supervised import label_propagation

        label_spread = label_propagation.LabelSpreading(kernel='rbf', alpha=0.8, gamma=.25, max_iter=200, n_jobs=10)
        label_spread.fit(train_15p, train_15p_churn)
        train_85p['Churn'] = label_spread.predict(train_85p)

    elif DefaultConfig.semi_model == 'label_propagation':
        from sklearn.semi_supervised import LabelPropagation
        lp_model = LabelPropagation(kernel='knn', gamma=.25, max_iter=200, n_jobs=10)
        lp_model.fit(train_15p, train_15p_churn)
        train_85p['Churn'] = lp_model.predict(train_85p)

    return train_85p
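The pseudo_labeler branch above depends on a PseudoLabeler wrapper that is not shown here. The following is only a rough sketch of such a wrapper (constructor arguments copied from the call site, including the unlabled_data spelling; the body is an assumption, not the original implementation): fit on the labeled data, pseudo-label a random fraction of the unlabeled pool, then refit on the augmented set.

import pandas as pd
from sklearn.base import BaseEstimator

class PseudoLabeler(BaseEstimator):
    """Hypothetical sketch of a pseudo-labelling wrapper."""

    def __init__(self, model, unlabled_data, features, target, sample_rate=0.3):
        self.model = model
        self.unlabled_data = unlabled_data
        self.features = features
        self.target = target
        self.sample_rate = sample_rate

    def fit(self, X, y):
        # fit on the labeled data, pseudo-label a sample of the unlabeled pool,
        # then refit the base model on the augmented training set
        self.model.fit(X[self.features], y)
        sampled = self.unlabled_data.sample(frac=self.sample_rate)
        pseudo = pd.Series(self.model.predict(sampled[self.features]),
                           index=sampled.index, name=self.target)
        self.model.fit(pd.concat([X[self.features], sampled[self.features]]),
                       pd.concat([y, pseudo]))
        return self

    def predict(self, X):
        return self.model.predict(X[self.features])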
Example #16
def loadOrCreateModel(pkl_model_filename):
    """Checks if there is already an existing model, otherwise creates a new one"""
    # Check if a pickle file for a model is available already
    if is_file_accessible(pkl_model_filename):
        # Load from file
        with open(pkl_model_filename, 'rb') as file:
            pickle_model = pickle.load(file)
        print("Loading model from file.")
        lp_model = pickle_model
    else:
        lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
        print("building a new model")
    return lp_model
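loadOrCreateModel only reads the pickle; it does not write one. A typical companion step (a sketch, with a hypothetical file name) is to dump the freshly fitted model back to the same path so the next run takes the fast path:

import pickle

pkl_model_filename = "lp_model.pkl"  # hypothetical path
lp_model = loadOrCreateModel(pkl_model_filename)
# ... fit lp_model on the current data ...
with open(pkl_model_filename, 'wb') as file:
    pickle.dump(lp_model, file)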
Example #17
def make_model2():
    sensor_data = dataset.load_data()
    rng = np.random.RandomState(0)
    indices = np.arange(len(sensor_data.data))
    rng.shuffle(indices)
    print(len(sensor_data.data))
    sm = SMOTE(random_state=42)
    X, y  = sm.fit_sample(sensor_data.data[indices[:2000]], sensor_data.target[indices[:2000]])

    n_total_samples = len(y)
    print(len(y))
    n_labeled_points = 200
    max_iterations = 50
    unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]
    lp_model = label_propagation.LabelSpreading(kernel='knn', n_neighbors=15)

    for i in range(max_iterations):
        if len(unlabeled_indices) == 0:
            print("No unlabeled items left to label.")
            break
        y_train = np.copy(y)
        y_train[unlabeled_indices] = -1
        lp_model.fit(X, y_train)
        p = lp_model.predict_proba(X[unlabeled_indices])
        # predicted_labels = [1 if x > 0.57 else 0 for x in p[:, 1]]
        predicted_labels = lp_model.predict(X[unlabeled_indices])

        true_labels = y[unlabeled_indices]
        # print("#"*20 + "Iteration :: " + str(i) + "#"*20)
        # print(classification_report(true_labels, predicted_labels))

        pred_entropies = stats.distributions.entropy(
            lp_model.label_distributions_.T)
        uncertainty_index = np.argsort(pred_entropies)[::-1]
        uncertainty_index = uncertainty_index[
                                np.in1d(uncertainty_index, unlabeled_indices)][:40]
        delete_indices = np.array([])
        for index in uncertainty_index:
            delete_index, = np.where(unlabeled_indices == index)
            delete_indices = np.concatenate((delete_indices, delete_index))
        unlabeled_indices = np.delete(unlabeled_indices, delete_indices)
        n_labeled_points += len(uncertainty_index)

    np.savetxt("X.csv", X, delimiter=",", fmt='%10.5f')
    np.savetxt("y_train.csv", y_train, delimiter=",", fmt='%10.1f')
    return lp_model
Example #18
def main():
    print('loading dataset')
    train, test = load_data(dconf)

    print('training model')
    lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
    lp_model.fit(train['x'], train['y'])

    print('testing model')
    pred = lp_model.predict(test['x'])

    print(int_accuracy(test['y'], pred))

    for i in range(len(pred)):
        print((pred[i], test['y'][i]), end='')
Example #19
def clustershops(labelsamazon, labelsgoogle):
    shopkeys = clusterimages(labelsamazon, labelsgoogle)[4]  # getting the shop keys
    shopkeyslabeled = shopkeysfinal  # labeled keys from my saved data   #513
    shopkeys = shopkeyslabeled + shopkeys
    indicesboard = [i for i, x in enumerate(shopkeys) if x in boardkeys]  # indices of board
    indicesactivity = [i for i, x in enumerate(shopkeys) if x in activitykeys]
    indicesselfie = [i for i, x in enumerate(shopkeys) if x in selfiekeys]
    indicesstock = [i for i, x in enumerate(shopkeys) if x in stockkeys]

    arr = []
    for i in range(len(shopkeys)):
        arr.append(-1)
    #updating array with labels
    for i in indicesboard:
        arr[i] = 0
    for i in indicesactivity:
        arr[i] = 1    
    for i in indicesselfie:
        arr[i] = 2    
    for i in indicesstock:
        arr[i] = 3
    # so that we don't have to run the label function again,
    # we just subset the previously saved labels using the keys
    indices = [i for i, x in enumerate(akey) if x in shopkeys]
    shopamazon = []
    for i in indices:
        shopamazon.append(amazonlabels[i])
    shopgoogle = []
    for i in indices:
        shopgoogle.append(googlelabels[i])
    
    z1 = get_zmatrix_amazon(shopamazon,shopkeys)
    z2 = get_zmatrix_google(shopgoogle,shopkeys)
    z = pd.concat([z1,z2],axis= 1)
    lp_model = label_propagation.LabelSpreading(kernel='knn', alpha=0.8, n_neighbors=10)
    #acc= 0.6, kappa  = 0.47
    lp_model.fit(z,arr)
    indices = [i for i, x in enumerate(arr) if x == -1]
    predicted_labels = lp_model.transduction_[indices]
    pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
    uncertainty_index = np.argsort(pred_entropies)[-100:]
    keys=[]
    for i in indices:
        keys.append(shopkeys[i])
    silhouette = metrics.silhouette_score(z, predicted_labels, metric='euclidean')
    return (predicted_labels, silhouette, uncertainty_index, pred_entropies)
Example #20
def get_score(xtrain, xtest, ytrain, ytest):
    scaler = preprocessing.StandardScaler().fit(xtrain)
    xtrain = scaler.transform(xtrain)
    xtest = scaler.transform(xtest)

    # Use label propagation for semi supervised learning and random forest for supervised learning
    model = label_propagation.LabelSpreading(kernel='rbf', alpha=0.2)
    # model = RandomForestClassifier(max_depth=2, random_state=0)

    model.fit(xtrain, ytrain)
    test_pred = np.array(model.predict(xtest))
    # ytest = np.array(ytest)
    # if(test_pred[0] == ytest[0]):
    # 	return 1
    # else:
    # 	return 0
    return test_pred
Example #21
    def train(self,
              inputs,
              targets,
              min_=0.01,
              max_=30,
              niter=10,
              stepsize=0.1):
        # Scale the training data
        self.x = inputs
        self.y = targets

        # Tune gamma in RBF using basinhopping
        self.gamma = self.optimize(min_, max_, niter, stepsize)[0]

        # Propagate labels
        self.model = label_propagation.LabelSpreading(kernel=self.kernel,
                                                      alpha=self.alpha,
                                                      gamma=self.gamma)
        self.model.fit(self.x, self.y)
Example #22
    def objective(self, x):
        """
        Objective function for hyper-parameter selection/evaluation

        Parameters
        ----------
        x : hyper-parameter under test - gamma

        Returns
        -------
        float
            a measure directly proportional to entropy
        """
        model = label_propagation.LabelSpreading(kernel=self.kernel,
                                                 alpha=self.alpha,
                                                 gamma=x)
        model.fit(self.x, self.y)
        label_prob = model.label_distributions_
        return get_average_label_entropy(
            label_prob) + self.learning_rate * x**2
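Both objective methods above call a get_average_label_entropy helper that is not included in these snippets. A minimal sketch, under the assumption that it averages the Shannon entropy of the per-sample rows of label_distributions_:

import numpy as np
from scipy import stats

def get_average_label_entropy(label_prob):
    # mean Shannon entropy of the per-sample label distributions;
    # scipy computes entropy column-wise, hence the transpose
    return np.mean(stats.entropy(label_prob.T))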
Example #23
    def train(self,
              inputs,
              targets,
              min_=0.01,
              max_=30,
              niter=10,
              stepsize=0.1):
        """
        Train the LP model given the data

        Parameters
        ----------
        inputs : nd-array
            independent variables
        targets : vector
            dependent variable
        min : float
            []
        max : float
            []
        niter : int
            number of training iterations
        stepsize : float
            []
        """
        # Scale the training data
        self.x = inputs
        self.y = targets

        # Tune gamma in RBF using basinhopping
        self.gamma = self.optimize(min_, max_, niter, stepsize)[0]

        # Propagate labels
        self.model = label_propagation.LabelSpreading(kernel=self.kernel,
                                                      alpha=self.alpha,
                                                      gamma=self.gamma)
        self.model.fit(self.x, self.y)
        if self.use_logger:
            self.logger.info(
                "Label Propagation model trained with {} samples".format(
                    len(self.y)))
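train tunes gamma through a self.optimize(min_, max_, niter, stepsize) call that these examples do not show. One plausible sketch, assuming it wraps scipy.optimize.basinhopping around the objective method and returns the optimizer's x array (which would explain the [0] indexing in the caller):

import numpy as np
from scipy.optimize import basinhopping

def optimize_gamma(objective, min_, max_, niter=10, stepsize=0.1):
    # basinhopping over a single scalar gamma, kept inside [min_, max_]
    minimizer_kwargs = {"method": "L-BFGS-B", "bounds": [(min_, max_)]}
    result = basinhopping(lambda g: objective(float(g[0])),
                          x0=np.array([(min_ + max_) / 2.0]),
                          niter=niter,
                          stepsize=stepsize,
                          minimizer_kwargs=minimizer_kwargs)
    return result.x  # result.x[0] would be the tuned gamma

As a class method it would be invoked as self.gamma = self.optimize(min_, max_, niter, stepsize)[0], with self.objective standing in for the objective argument used here.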
Example #24
def retrieve_label_propagation(Lambda_res,
                               lam_h,
                               seed_set_high,
                               n_neighbours,
                               restrict_positive_hfs=False,
                               q=None):
    if Lambda_res < lam_h:
        return None

    labels = np.zeros(len(global_variables.embedding)) - 1
    remaining_elements = set(range(len(global_variables.embedding))) - set(
        [v for v, s in seed_set_high])
    if len(remaining_elements) == 0:
        return None  # assuming that undiscovered low fidelities are depleted before the high ones!
    remaining_elements = np.array(list(remaining_elements))
    if restrict_positive_hfs:
        remaining_elements = np.setdiff1d(
            remaining_elements,
            np.where(global_variables.all_hfs[q] > 0)[0])

    threshold = np.percentile([s for _, s in seed_set_high], 90) - 0.05

    labels = np.zeros(len(global_variables.embedding)) - 1
    for i, s in seed_set_high:
        if s > threshold:
            labels[i] = 1
        else:
            labels[i] = 0
    lp_model = label_propagation.LabelSpreading(kernel='knn',
                                                n_neighbors=n_neighbours,
                                                max_iter=10)
    lp_model.fit(global_variables.embedding, labels)
    class_1 = np.where(lp_model.classes_ == 1)[0]
    class_1_ind = class_1[0]
    v = remaining_elements[np.argmax(
        lp_model.label_distributions_[remaining_elements, class_1_ind])]

    return v
Example #25
def different_split_experiment():
    best_split = 0
    best_acc = 0
    for i in range(1, 10):
        X_labeled = training_dataset[:500 * i]
        y_labeled = training_labels_copy[0:500 * i]
        # X_unlabeled = training_dataset[500 * i : 5000]
        #y_unlabeled = training_labels_copy[500 * i : 5000]
        for j in range(500 * i, 5000):
            training_labels[j] = -1
        label_spread = label_propagation.LabelSpreading(kernel='knn',
                                                        n_neighbors=10,
                                                        alpha=0.8)
        label_spread.fit(training_dataset, training_labels)
        y_training_predicted = label_spread.predict(X_labeled)
        y_testing_predicted = label_spread.predict(training_dataset)
        count = 0
        count1 = 0
        for k in range(500 * i):
            if y_labeled[k] == y_training_predicted[k]:
                count = count + 1
        for l in range(5000):
            if training_labels_copy[l] == y_testing_predicted[l]:
                count1 = count1 + 1
        if best_acc < (count / (500 * i)):
            best_split = i
            best_acc = count / (500 * i)
        print("when the proportion of labeled data to the unlabeled data is " +
              str(i) + " : " + str((10 - i)) +
              ", the training accuracy of labeled training data is " +
              str(round(count / (500 * i), 5)) +
              ", the testing acuracy of unseen data is " +
              str(round(count1 / (5000), 5)))
    print("the best split proportion of labeled data to unlabeled data is " +
          str(best_split) + " : " + str((10 - best_split)))
    return best_split
Example #26
import numpy as np
import matplotlib.pyplot as plt
from sklearn.semi_supervised import label_propagation
from sklearn.datasets import make_circles
#%%
# generate ring with inner box
n_samples = 200
X, y = make_circles(n_samples=n_samples,
                    shuffle=False)  # X: coordinates at 2D plane [-1, 1]
# plt.scatter(X[:,0],X[:,1])
outer, inner = 0, 1
labels = np.full(n_samples, -1.)  # original labels
labels[0] = outer  # first point
labels[-1] = inner  # last point

# #############################################################################
# Learn with LabelSpreading
label_spread = label_propagation.LabelSpreading(kernel='knn', alpha=0.8)
label_spread.fit(X, labels)

# #############################################################################
# Plot output labels
output_labels = label_spread.transduction_
plt.figure(figsize=(8.5, 4))
plt.subplot(1, 2, 1)
plt.scatter(X[labels == outer, 0],
            X[labels == outer, 1],
            color='navy',
            marker='s',
            lw=0,
            label="outer labeled",
            s=10)
plt.scatter(X[labels == inner, 0],
            X[labels == inner, 1],
            color='c',
            marker='o',
            lw=0,
            label="inner labeled",
            s=10)
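The snippet above is cut off inside the second scatter call. A possible continuation, sketched after scikit-learn's circles demo and not part of the original example, finishes the first panel with the unlabeled points and plots the labels learned by LabelSpreading in the second panel:

plt.scatter(X[labels == -1, 0], X[labels == -1, 1],
            color='darkorange', marker='.', label='unlabeled')
plt.legend(scatterpoints=1, shadow=False, loc='upper right')
plt.title("Raw data (2 classes=outer and inner)")

plt.subplot(1, 2, 2)
outer_numbers = np.where(output_labels == outer)[0]
inner_numbers = np.where(output_labels == inner)[0]
plt.scatter(X[outer_numbers, 0], X[outer_numbers, 1], color='navy',
            marker='s', lw=0, s=10, label="outer learned")
plt.scatter(X[inner_numbers, 0], X[inner_numbers, 1], color='c',
            marker='o', lw=0, s=10, label="inner learned")
plt.legend(scatterpoints=1, shadow=False, loc='upper right')
plt.title("Labels learned with Label Spreading (KNN)")
plt.show()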
Example #27
n_total_samples = len(y)  # 330
n_labeled_points = 10  # 10 samples are already labeled
max_iterations = 5  # run 5 iterations

unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]  # the remaining 320 samples are unlabeled
# print('unlabeled_indices:',unlabeled_indices)
f = plt.figure()  # figure for plotting

for i in range(max_iterations):
    if len(unlabeled_indices) == 0:
        print("no unlabeled items left to label")  # nothing is unlabeled any more; everything has been labeled
        break
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1  # mark the unlabeled samples (the last 320) with -1

    lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)  # train the model
    lp_model.fit(X, y_train)

    predicted_labels = lp_model.transduction_[unlabeled_indices]  # predicted labels
    #     print('predicted_labels:',predicted_labels)
    true_labels = y[unlabeled_indices]  # true labels

    cm = confusion_matrix(true_labels,
                          predicted_labels,
                          labels=lp_model.classes_)

    print("iteration %i %s" % (i, 70 * "_"))  # print the iteration number
    print("Label Spreading model: %d labeled & %d unlabeled (%d total)" %
          (n_labeled_points, n_total_samples - n_labeled_points,
           n_total_samples))
Example #28
iris = datasets.load_iris()

X = iris.data[:, :2]
y = iris.target

# step size in the mesh
h = .02

y_30 = np.copy(y)
y_30[rng.rand(len(y)) < 0.3] = -1
y_50 = np.copy(y)
y_50[rng.rand(len(y)) < 0.5] = -1
# we create an instance of SVM and fit out data. We do not scale our
# data since we want to plot the support vectors
ls30 = (label_propagation.LabelSpreading().fit(X, y_30), y_30)
ls50 = (label_propagation.LabelSpreading().fit(X, y_50), y_50)
ls100 = (label_propagation.LabelSpreading().fit(X, y), y)
rbf_svc = (svm.SVC(kernel='rbf').fit(X, y), y)

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# title for the plots
titles = [
    'Label Spreading 30% data', 'Label Spreading 50% data',
    'Label Spreading 100% data', 'SVC with rbf kernel'
]
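The snippet above ends after the titles list. A possible continuation, sketched after scikit-learn's label-propagation-versus-SVM iris demo (assuming the usual numpy/matplotlib imports from earlier in that script), draws the four decision surfaces over the mesh:

for i, (clf, y_train) in enumerate((ls30, ls50, ls100, rbf_svc)):
    # plot the decision boundary by classifying each point in the mesh
    plt.subplot(2, 2, i + 1)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='black')
    plt.title(titles[i])
plt.show()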
Example #29
    def _label_propagation(self):
        
        digits = datasets.load_digits()
        rng = np.random.RandomState(0)
        indices = np.arange(len(digits.data))
        rng.shuffle(indices)

        X = digits.data[indices[:330]]
        y = digits.target[indices[:330]]
        images = digits.images[indices[:330]]

        n_total_samples = len(y)
        n_labeled_points = 10

        unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]
        f = plt.figure()

        for i in range(5):
            y_train = np.copy(y)
            y_train[unlabeled_indices] = -1

            lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
            lp_model.fit(X, y_train)

            predicted_labels = lp_model.transduction_[unlabeled_indices]
            true_labels = y[unlabeled_indices]

            cm = confusion_matrix(true_labels, predicted_labels,
                                  labels=lp_model.classes_)

            print('Iteration %i %s' % (i, 70 * '_'))
            print("Label Spreading model: %d labeled & %d unlabeled (%d total)"
                  % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples))

            print(classification_report(true_labels, predicted_labels))

            print("Confusion matrix")
            print(cm)

            # compute the entropies of transduced label distributions
            pred_entropies = stats.distributions.entropy(
                lp_model.label_distributions_.T)

            # select five digit examples that the classifier is most uncertain about
            uncertainty_index = np.argsort(pred_entropies)[-5:]

            # keep track of indices that we get labels for
            delete_indices = np.array([])

            f.text(.05, (1 - (i + 1) * .183),
                   "model %d\n\nfit with\n%d labels" % ((i + 1), i * 5 + 10), size=10)
            for index, image_index in enumerate(uncertainty_index):
                image = images[image_index]

                sub = f.add_subplot(5, 5, index + 1 + (5 * i))
                sub.imshow(image, cmap=plt.cm.gray_r)
                sub.set_title('predict: %i\ntrue: %i' % (
                    lp_model.transduction_[image_index], y[image_index]), size=10)
                sub.axis('off')

                # labeling 5 points, remote from labeled set
                delete_index, = np.where(unlabeled_indices == image_index)
                delete_indices = np.concatenate((delete_indices, delete_index))

            unlabeled_indices = np.delete(unlabeled_indices, delete_indices)
            n_labeled_points += 5

        f.suptitle("Active learning with Label Propagation.\nRows show 5 most "
                   "uncertain labels to learn with the next model.")
        plt.subplots_adjust(0.12, 0.03, 0.9, 0.8, 0.2, 0.45)
        plt.show()
Example #30
def main(merged_filename, real_file, boted_file):

    documents = []
    print merged_filename
    users_info = getUserIMDMessages(merged_filename)
    #print users_info.keys()
    #print "total users:" + str(len(users_info.keys()))
    considered_users_index = []
    index = 0
    for user in users_info.keys():
        if users_info[user]['m'] > 1:
            considered_users_index.append(index)
        index += 1
    real_users_index, bot_users_index, real_users, bot_users = labeling_data(
        merged_filename, real_file, boted_file)

    labels = []
    users = users_info.keys()
    for user in users_info.keys():
        if users.index(user) in considered_users_index:
            if user in real_users:
                labels.append(0)
            else:
                labels.append(1)

    users_dict = users_info

    ## For no of messages per user feature
    user_chats_ft = get_chats_features(users_dict)
    user_chats_ft = pd.DataFrame(user_chats_ft)

    # User windows
    uw_dict = user_windows(merged_filename)
    uw_ft = []
    for user in users_dict.keys():
        if users_dict[user]['m'] > 1:
            uw_ft.append(uw_dict[user])
    uw_ft = pd.DataFrame(uw_ft)

    ## For per user imds features
    user_imd_bins = pd.DataFrame(get_IMD_features(users_dict))

    # Feature of Entropy of imds of a user
    user_entropy = pd.DataFrame(np.array(
        get_entropy_features(merged_filename)))

    ## to get the features of users with no of messages > 1
    user_entropy = get_final_features(user_entropy, considered_users_index)

    # print(uw_ft, user_entropy)

    ## plot entropy feature of each user
    #plot_entropy_feature(user_entropy,considered_users_index,real_users,bot_users,users)

    final_features = pd.concat([user_chats_ft, user_imd_bins], axis=1)
    # final_features = pd.concat([user_chats_ft,user_imd_bins,uw_ft,user_entropy],axis=1)

    real_X = []
    real_Y = []
    bot_X = []
    bot_Y = []
    list_user_fts = final_features.values.tolist()
    #print len(list_user_fts)
    for i, index in enumerate(considered_users_index):
        if users[index] in real_users:
            real_X.append(list_user_fts[i][0])
            real_Y.append(list_user_fts[i][1])
        else:
            bot_X.append(list_user_fts[i][0])
            bot_Y.append(list_user_fts[i][1])
    real_tup = [(real_X[i], real_Y[i]) for i in range(len((real_X)))]
    bot_tup = [(bot_X[i], bot_Y[i]) for i in range(len((bot_X)))]
    #real_tup = sorted(real_tup)
    #bot_tup = sorted(bot_tup)
    #print real_tup
    #print bot_tup
    #plt.scatter(real_X,real_Y,c='red',s=7)
    #plt.scatter(bot_X,bot_Y,c='blue',s=7)
    #plt.axvline(x=initavg_f1,c='yellow')
    #lt.axhline(y=initavg_f2,c='green')
    #plt.title(filename)
    #plt.show()

    ## Cluster datapoints on desired new set of features
    # f1 = final_features.iloc[:,0].values
    # f2 = final_features.iloc[:,1].values
    # gt_X = np.array(list(zip(f1,f2)))
    # Xmeans = XMeans(kmax=7)
    # Xmeans.fit(list(gt_X))
    # XMeanslabels = Xmeans.labels_
    # plot_graph(gt_X,real_X,real_Y,bot_X,bot_Y,XMeanslabels)
    channel_followers = get_channel_followers(
        '../followers_cnt/',
        real_file.split('#')[1].split('database')[0])
    #real_users_index = set(real_users_index)
    users = set(users)
    for user in channel_followers:
        if user in users:
            real_users_index.append(list(users).index(user))
    print real_users_index
    #plot_graph(final_features,real_users_index,bot_users_index)

    print "#considered users:" + str(len(considered_users_index))

    label_X, label_Y = data_labelprop(final_features, real_users_index,
                                      bot_users_index)
    print len(label_X), len(label_Y)
    orig_labelX, orig_labelY = label_X[:], label_Y[:]
    label_X, label_Y = readjust(label_X, label_Y, uw_ft, user_entropy)
    real_X = []
    real_Y = []
    bot_X = []
    bot_Y = []
    for i in range(len(label_X)):
        if label_Y[i] == 0:
            real_X.append(final_features.iloc[i, 0])
            real_Y.append(final_features.iloc[i, 1])
        elif label_Y[i] == 1:
            bot_X.append(final_features.iloc[i, 0])
            bot_Y.append(final_features.iloc[i, 1])
    plt.scatter(real_X, real_Y, c='blue', s=25, label='real')
    plt.scatter(bot_X, bot_Y, c='red', s=25, label='bot')
    #plt.tick_params(labelsize=16)
    #hfont = {'fontname':'Helvetica'}
    plt.rc('font', family='sans-serif')
    plt.rc('xtick', labelsize='x-large')
    plt.rc('ytick', labelsize='x-large')
    plt.xlabel('Number  of  messages  per  user', fontsize='x-large')
    plt.ylabel('Mean  IMD  per  user', fontsize='x-large')
    #plt.title('Seed labels',fontsize='large',fontweight='bold')
    plt.legend(loc='lower right', prop={'size': 12})
    plt.show()
    # Learn with LabelSpreading
    label_spread = label_propagation.LabelSpreading(kernel='rbf', alpha=0.6)
    label_spread.fit(label_X, label_Y)
    output_labels = label_spread.transduction_
    label_spread.fit(orig_labelX, orig_labelY)
    orig_output_labels = label_spread.transduction_

    # print output_labels
    # print labels
    pred_real_X = []
    pred_real_Y = []
    pred_bot_X = []
    pred_bot_Y = []
    for i in range(len(output_labels)):
        if output_labels[i] == 0:
            pred_real_X.append(final_features.iloc[i, 0])
            pred_real_Y.append(final_features.iloc[i, 1])
        else:
            pred_bot_X.append(final_features.iloc[i, 0])
            pred_bot_Y.append(final_features.iloc[i, 1])

    plt.xlim(0, 80)
    plt.ylim(0, 1500)
    plt.scatter(pred_real_X, pred_real_Y, c='blue', s=25, label='real')
    plt.scatter(pred_bot_X, pred_bot_Y, c='red', s=25, label='bot')
    #plt.tick_params(labelsize=16)
    plt.legend(loc='upper right', prop={'size': 12})
    #hfont = {'fontname':'Helvetica'}
    plt.rc('font', family='sans-serif')
    plt.rc('xtick', labelsize='x-large')
    plt.rc('ytick', labelsize='x-large')
    plt.xlabel('Number  of  messages  per  user', fontsize='x-large')
    plt.ylabel('Mean  IMD  per  user', fontsize='x-large')
    #plt.title('Final labels after propagation',fontsize='large',fontweight='bold')
    plt.show()

    orig_tot, orig_cor = 0, 0
    total, correct = 0, 0
    for i in range(len(output_labels)):
        if label_Y[i] == -1:
            if labels[i] == output_labels[i]:
                correct += 1
            else:
                print label_X[i], i
            total += 1
        if orig_labelY[i] == -1:
            if labels[i] == orig_output_labels[i]:
                orig_cor += 1
            orig_tot += 1
    print correct, total
    print(float(correct) / total) * 100
    print orig_cor, orig_tot
    print(float(orig_cor) / orig_tot) * 100
    print accuracy_score(np.array(labels), output_labels) * 100
    print accuracy_score(np.array(labels), orig_output_labels) * 100
    return accuracy_score(np.array(labels), output_labels) * 100