def fit(self, min_k=1, max_k=9, verbose=True):
    acc_ones = []
    acc_zeros = []
    accs = []
    if verbose:
        print('-----------------------------------START KNN OPTIMIZATION------------------------------------')
    best = dict()
    # grid search over n_neighbors (step 2 from min_k) and a small set of alphas
    for (n, a) in [(x, a) for x in range(min_k, max_k + 1, 2)
                   for a in [0.1, 0.01, 0.001, 0.0001]]:
        metric = []
        for i in range(self.K):
            train_X = self.train_Xs[i].copy()
            train_y = self.train_ys[i].copy()
            test_X = self.val_Xs[i].copy()
            test_y = self.val_ys[i].copy()
            model = label_propagation.LabelSpreading(kernel='knn',
                                                     n_neighbors=n,
                                                     alpha=a)
            model.fit(train_X, train_y)
            y_pred = model.predict(test_X)
            # evaluating by accuracy:
            # counting true positives, true negatives and total correct predictions
            ones = [(x, y) for (x, y) in zip(y_pred, test_y)
                    if x == y and y == 1.0]
            zeros = [(x, y) for (x, y) in zip(y_pred, test_y)
                     if x == y and y == 0.0]
            nc = [(x, y) for (x, y) in zip(y_pred, test_y) if x == y]
            one = test_y.count(1.0)
            zero = test_y.count(0.0)
            # accuracy calculation
            if verbose:
                print(f'{i} - fold')
            acc_ones.append(division(len(ones), one))
            acc_zeros.append(division(len(zeros), zero))
            accs.append(division(len(nc), len(y_pred)))
            metric.append(avg([division(len(ones), one),
                               division(len(zeros), zero)]))
        # the balanced accuracy for this (n, alpha) configuration; setdefault
        # keeps the first configuration that reaches a given score
        best.setdefault(avg(metric), (n, a))
        if verbose:
            # note: acc_ones/acc_zeros/accs accumulate across configurations,
            # so these totals are running averages over all configs so far
            print(f'----------------------n={n}, alpha={a}----------------------------')
            print(f'total one: {avg(acc_ones)}')
            print(f'total zero: {avg(acc_zeros)}')
            print(f'total accuracy: {geo_avg(accs)}')
    conf = best.get(max(best.keys()))
    if verbose:
        print('-----------------------------------RESUME KNN OPTIMIZATION------------------------------------')
        print(f"best n: {conf[0]} - best alpha: {conf[1]}")
        print('knn learned')
    return conf
def learn(self, i, conf):
    train_X = self.train_Xs[i].copy()
    train_y = self.train_ys[i].copy()
    return label_propagation.LabelSpreading(kernel='knn',
                                            n_neighbors=conf[0],
                                            alpha=conf[1]).fit(train_X, train_y)
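# A minimal usage sketch for the fit()/learn() pair above, assuming a
# hypothetical wrapper class (here called KNNTuner) that stores K
# cross-validation folds as train_Xs/train_ys/val_Xs/val_ys; the class name
# and constructor are illustrative, not from the original source.
#
#   tuner = KNNTuner(train_Xs, train_ys, val_Xs, val_ys, K=5)
#   conf = tuner.fit(min_k=1, max_k=9, verbose=False)  # -> (n_neighbors, alpha)
#   model = tuner.learn(0, conf)                       # refit on fold 0
#   y_pred = model.predict(val_Xs[0])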
def RBFKernel_optimization_experiment():
    best_gamma = 0.0
    best_acc = -1
    gammas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    X_labeled = training_dataset[:500]
    y_labeled = training_labels_copy[:500]
    # mask everything past the first 500 samples as unlabeled
    for j in range(500, 5000):
        training_labels[j] = -1
    for gamma in gammas:
        label_spread = label_propagation.LabelSpreading(kernel='rbf',
                                                        gamma=gamma,
                                                        alpha=0.8)
        label_spread.fit(training_dataset, training_labels)
        y_training_predicted = label_spread.predict(X_labeled)
        y_testing_predicted = label_spread.predict(training_dataset)
        count = 0
        count1 = 0
        for k in range(500):  # correct predictions on the labeled slice
            if y_labeled[k] == y_training_predicted[k]:
                count += 1
        for l in range(5000):  # correct predictions on the full dataset
            if training_labels_copy[l] == y_testing_predicted[l]:
                count1 += 1
        if best_acc < count1 / 5000:
            best_gamma = gamma
            best_acc = count1 / 5000
        print("when gamma is " + str(gamma) + ", the training accuracy is "
              + str(count / 500) + ", the testing accuracy is "
              + str(count1 / 5000))
    print("the best gamma for RBF kernel is " + str(best_gamma))
    return best_gamma, best_acc
def alpha_optimization_experiment():
    best_acc = -1
    best_alpha = 0.0
    alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
    X_labeled = training_dataset[:500]
    y_labeled = training_labels_copy[:500]
    # mask everything past the first 500 samples as unlabeled
    for j in range(500, 5000):
        training_labels[j] = -1
    for alpha in alphas:
        label_spread = label_propagation.LabelSpreading(kernel='knn',
                                                        n_neighbors=10,
                                                        alpha=alpha)
        label_spread.fit(training_dataset, training_labels)
        y_training_predicted = label_spread.predict(X_labeled)
        y_testing_predicted = label_spread.predict(training_dataset)
        count = 0
        count1 = 0
        for k in range(500):  # correct predictions on the labeled slice
            if y_labeled[k] == y_training_predicted[k]:
                count += 1
        for l in range(5000):  # correct predictions on the full dataset
            if training_labels_copy[l] == y_testing_predicted[l]:
                count1 += 1
        if best_acc < count1 / 5000:
            best_alpha = alpha
            best_acc = count1 / 5000
        print("when alpha is " + str(alpha) + ", the training accuracy is "
              + str(count / 500) + ", the testing accuracy is "
              + str(count1 / 5000))
    print("the best alpha is " + str(best_alpha))
    return best_alpha
def KNNKernel_optimization_experiment():
    best_numNei = 0
    best_acc = -1
    numNeis = [1, 3, 5, 7, 10, 20, 40, 60, 80, 100, 200, 400, 800, 1000]
    X_labeled = training_dataset[:500]
    y_labeled = training_labels_copy[:500]
    # mask everything past the first 500 samples as unlabeled
    for j in range(500, 5000):
        training_labels[j] = -1
    for numNei in numNeis:
        label_spread = label_propagation.LabelSpreading(kernel='knn',
                                                        n_neighbors=numNei,
                                                        alpha=0.8)
        label_spread.fit(training_dataset, training_labels)
        y_training_predicted = label_spread.predict(X_labeled)
        y_testing_predicted = label_spread.predict(training_dataset)
        count = 0
        count1 = 0
        for k in range(500):  # correct predictions on the labeled slice
            if y_labeled[k] == y_training_predicted[k]:
                count += 1
        for l in range(5000):  # correct predictions on the full dataset
            if training_labels_copy[l] == y_testing_predicted[l]:
                count1 += 1
        if best_acc < count1 / 5000:
            best_numNei = numNei
            best_acc = count1 / 5000
        print("when the number of neighbors is " + str(numNei)
              + ", the training accuracy is " + str(count / 500)
              + ", the testing accuracy is " + str(count1 / 5000))
    print("the best number of neighbors is " + str(best_numNei))
    return best_numNei, best_acc
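# The three experiments above share the same evaluation loop; a small helper
# like the sketch below (the name and signature are ours, not from the
# original code) would remove the duplication. It assumes the module-level
# training_dataset, training_labels and training_labels_copy used above,
# with the labels past n_labeled already masked to -1.
import numpy as np

def evaluate_label_spreading(label_spread, n_labeled=500, n_total=5000):
    # fit on the partially-masked labels, then score on the labeled slice
    # and on the full dataset against the unmasked label copy
    label_spread.fit(training_dataset, training_labels)
    y_train_pred = label_spread.predict(training_dataset[:n_labeled])
    y_all_pred = label_spread.predict(training_dataset)
    train_acc = np.mean(np.asarray(training_labels_copy[:n_labeled]) == y_train_pred)
    test_acc = np.mean(np.asarray(training_labels_copy[:n_total]) == y_all_pred)
    return train_acc, test_acc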
def labelSpreading(labeled, unlabeled, xcols, ycols, alpha_v=0.8):
    """KNN label spreading.

    Arguments:
        labeled {DataFrame} -- labeled data
        unlabeled {DataFrame} -- unlabeled data
        xcols {array} -- feature columns
        ycols {array} -- label columns

    Keyword Arguments:
        alpha_v {float} -- alpha parameter (default: {0.8})
    """
    x = labeled.loc[:, xcols]
    y = labeled.loc[:, ycols]
    # Using LabelSpreading
    label_spread = label_propagation.LabelSpreading(kernel="knn", alpha=alpha_v)
    label_spread.fit(x, y.values.ravel())
    # Predict labels for the unlabeled rows
    preds = label_spread.predict(unlabeled)
    unlabeled.loc[:, "label"] = preds
    # Combining both frames
    labeled = pd.concat([labeled, unlabeled], sort=False).reset_index(drop=True)
    return labeled
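# A self-contained usage sketch for labelSpreading() above; the toy frames and
# column names ("f1", "f2", "y") are illustrative assumptions. Eight labeled
# rows are used because the knn kernel defaults to n_neighbors=7.
import pandas as pd

labeled = pd.DataFrame({
    "f1": [0.0, 0.1, 0.2, 0.1, 1.0, 1.1, 0.9, 1.0],
    "f2": [0.0, 0.2, 0.1, 0.1, 1.0, 0.9, 1.1, 1.0],
    "y":  [0, 0, 0, 0, 1, 1, 1, 1],
})
unlabeled = pd.DataFrame({"f1": [0.05, 1.05], "f2": [0.1, 0.95]})

combined = labelSpreading(labeled, unlabeled, xcols=["f1", "f2"], ycols=["y"])
print(combined[["f1", "f2", "y", "label"]])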
def test_valid_alpha():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    for alpha in [-0.1, 0, 1, 1.1, None]:
        with pytest.raises(ValueError):
            label_propagation.LabelSpreading(alpha=alpha).fit(X, y)
def test_label_spreading_closed_form():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    y[::3] = -1  # mark every third sample as unlabeled
    clf = label_propagation.LabelSpreading().fit(X, y)
    # adopting notation from Zhou et al (2004):
    S = clf._build_graph()
    # one-hot encode y; the extra column absorbs the -1 (unlabeled) entries
    # and is dropped afterwards
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    Y = Y[:, :-1]
    for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
        # closed-form solution: F* = (I - alpha * S)^{-1} Y
        expected = np.dot(np.linalg.inv(np.eye(len(S)) - alpha * S), Y)
        expected /= expected.sum(axis=1)[:, np.newaxis]
        clf = label_propagation.LabelSpreading(max_iter=10000, alpha=alpha)
        clf.fit(X, y)
        assert_array_almost_equal(expected, clf.label_distributions_, 4)
def __init__(self, _feats, max_iter=10):
    super(SSLBased, self).__init__(_feats)
    self.max_iter = max_iter
    # n_neighbors only matters for the 'knn' kernel; it is ignored with 'rbf'
    self.ssl_model = label_propagation.LabelSpreading(
        kernel='rbf', n_neighbors=10, max_iter=self.max_iter)
def make_model3():
    model = label_propagation.LabelSpreading(kernel='knn', n_neighbors=15)
    sensor_data = dataset.load_data()
    X, y = sensor_data.data[:200], sensor_data.target[:200]
    model.fit(X, y)
    np.savetxt("X.csv", X, delimiter=",", fmt='%10.5f')
    np.savetxt("y_train.csv", y, delimiter=",", fmt='%10.1f')
    return model
def test_convergence_warning():
    # This is a non-regression test for #5774
    X = np.array([[1., 0.], [0., 1.], [1., 2.5]])
    y = np.array([0, 1, -1])
    mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=1)
    assert_warns(ConvergenceWarning, mdl.fit, X, y)
    assert mdl.n_iter_ == mdl.max_iter

    mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=1)
    assert_warns(ConvergenceWarning, mdl.fit, X, y)
    assert mdl.n_iter_ == mdl.max_iter

    mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=500)
    assert_no_warnings(mdl.fit, X, y)

    mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=500)
    assert_no_warnings(mdl.fit, X, y)
def test_valid_alpha():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    for alpha in [-0.1, 0, 1, 1.1, None]:
        assert_raises(ValueError,
                      lambda **kwargs: label_propagation.LabelSpreading(
                          **kwargs).fit(X, y),
                      alpha=alpha)
def test_convergence_speed():
    # This is a non-regression test for #5774
    X = np.array([[1., 0.], [0., 1.], [1., 2.5]])
    y = np.array([0, 1, -1])
    mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=5000)
    mdl.fit(X, y)

    # this should converge quickly:
    assert mdl.n_iter_ < 10
    assert_array_equal(mdl.predict(X), [0, 1, 1])
def get_85p_churn(train_15p, train_15p_churn, train_85p, **params):
    """
    Semi-supervised learning: predict churn labels for the unlabeled 85% split.

    :param train_15p: labeled 15% of the training features
    :param train_15p_churn: churn labels for the 15% split
    :param train_85p: unlabeled 85% of the training features
    :param params: extra keyword arguments (unused)
    :return: train_85p with a predicted 'Churn' column
    """
    if DefaultConfig.semi_model == 'pseudo_labeler':
        from xgboost import XGBClassifier
        from lightgbm import LGBMClassifier
        from catboost import CatBoostClassifier

        model = None
        sample_rate = 0.3
        # string comparison with '==' (the original used 'is', which is
        # identity comparison and unreliable for strings)
        if DefaultConfig.select_model == 'xgb':
            model = XGBClassifier(nthread=10)
            sample_rate = 0.3
        elif DefaultConfig.select_model == 'lgb':
            model = LGBMClassifier(n_jobs=10)
            sample_rate = 0.3
        elif DefaultConfig.select_model == 'cat':
            model = CatBoostClassifier(thread_count=10)
            sample_rate = 0.3

        models = PseudoLabeler(model=model,
                               unlabled_data=train_85p,
                               features=train_85p.columns,
                               target='Churn',
                               sample_rate=sample_rate)
        models.fit(train_15p, train_15p_churn)
        train_85p['Churn'] = models.predict(train_85p)

    elif DefaultConfig.semi_model == 'label_spreading':
        from sklearn.semi_supervised import label_propagation

        label_spread = label_propagation.LabelSpreading(kernel='rbf',
                                                        alpha=0.8,
                                                        gamma=.25,
                                                        max_iter=200,
                                                        n_jobs=10)
        label_spread.fit(train_15p, train_15p_churn)
        train_85p['Churn'] = label_spread.predict(train_85p)

    elif DefaultConfig.semi_model == 'label_propagation':
        from sklearn.semi_supervised import LabelPropagation

        label_propagation = LabelPropagation(kernel='knn',
                                             gamma=.25,
                                             max_iter=200,
                                             n_jobs=10)
        label_propagation.fit(train_15p, train_15p_churn)
        train_85p['Churn'] = label_propagation.predict(train_85p)

    return train_85p
def loadOrCreateModel(pkl_model_filename):
    """Checks if there is already an existing model, otherwise creates a new one."""
    # Check if a pickle file for a model is already available
    if is_file_accessible(pkl_model_filename):
        # Load from file
        with open(pkl_model_filename, 'rb') as file:
            pickle_model = pickle.load(file)
        print("Loading model from file.")
        lp_model = pickle_model
    else:
        lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
        print("Building a new model.")
    return lp_model
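# Counterpart sketch for loadOrCreateModel(): fit the model elsewhere, then
# persist it so the next run can pick it up. is_file_accessible() is assumed
# to exist in the original module; the function name saveModel is ours.
import pickle

def saveModel(lp_model, pkl_model_filename):
    # Persist the (fitted) model; loadOrCreateModel() will find it next time
    with open(pkl_model_filename, 'wb') as file:
        pickle.dump(lp_model, file)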
def make_model2():
    sensor_data = dataset.load_data()
    rng = np.random.RandomState(0)
    indices = np.arange(len(sensor_data.data))
    rng.shuffle(indices)
    print(len(sensor_data.data))
    sm = SMOTE(random_state=42)
    X, y = sm.fit_sample(sensor_data.data[indices[:2000]],
                         sensor_data.target[indices[:2000]])
    n_total_samples = len(y)
    print(len(y))
    n_labeled_points = 200
    max_iterations = 50
    unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]
    lp_model = label_propagation.LabelSpreading(kernel='knn', n_neighbors=15)
    for i in range(max_iterations):
        if len(unlabeled_indices) == 0:
            print("No unlabeled items left to label.")
            break
        y_train = np.copy(y)
        y_train[unlabeled_indices] = -1
        lp_model.fit(X, y_train)
        p = lp_model.predict_proba(X[unlabeled_indices])
        # predicted_labels = [1 if x > 0.57 else 0 for x in p[:, 1]]
        predicted_labels = lp_model.predict(X[unlabeled_indices])
        true_labels = y[unlabeled_indices]
        # print("#" * 20 + "Iteration :: " + str(i) + "#" * 20)
        # print(classification_report(true_labels, predicted_labels))
        pred_entropies = stats.distributions.entropy(
            lp_model.label_distributions_.T)
        # pick the 40 most uncertain still-unlabeled points to label next
        uncertainty_index = np.argsort(pred_entropies)[::-1]
        uncertainty_index = uncertainty_index[
            np.in1d(uncertainty_index, unlabeled_indices)][:40]
        delete_indices = np.array([])
        for index in uncertainty_index:
            delete_index, = np.where(unlabeled_indices == index)
            delete_indices = np.concatenate((delete_indices, delete_index))
        unlabeled_indices = np.delete(unlabeled_indices, delete_indices)
        n_labeled_points += len(uncertainty_index)
    np.savetxt("X.csv", X, delimiter=",", fmt='%10.5f')
    np.savetxt("y_train.csv", y_train, delimiter=",", fmt='%10.1f')
    return lp_model
def main():
    print('loading dataset')
    train, test = load_data(dconf)
    print('training model')
    lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
    lp_model.fit(train['x'], train['y'])
    print('testing model')
    pred = lp_model.predict(test['x'])
    print(int_accuracy(test['y'], pred))
    for i in range(len(pred)):
        print((pred[i], test['y'][i]), end='')
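# Typical entry-point guard for the script above (load_data, dconf and
# int_accuracy are assumed to be defined elsewhere in the original module):
if __name__ == '__main__':
    main()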
def clustershops(labelsamazon, labelsgoogle):
    shopkeys = clusterimages(labelsamazon, labelsgoogle)[4]  # getting the shopkeys
    shopkeyslabeled = shopkeysfinal  # labeled keys from my saved data (513)
    shopkeys = shopkeyslabeled + shopkeys
    # indices of each labeled category
    indicesboard = [i for i, x in enumerate(shopkeys) if x in boardkeys]
    indicesactivity = [i for i, x in enumerate(shopkeys) if x in activitykeys]
    indicesselfie = [i for i, x in enumerate(shopkeys) if x in selfiekeys]
    indicesstock = [i for i, x in enumerate(shopkeys) if x in stockkeys]
    arr = [-1] * len(shopkeys)  # -1 marks unlabeled points
    # updating array with labels
    for i in indicesboard:
        arr[i] = 0
    for i in indicesactivity:
        arr[i] = 1
    for i in indicesselfie:
        arr[i] = 2
    for i in indicesstock:
        arr[i] = 3
    # so that we don't have to run the label function again,
    # we just subset the previously saved labels using the keys
    indices = [i for i, x in enumerate(akey) if x in shopkeys]
    shopamazon = [amazonlabels[i] for i in indices]
    shopgoogle = [googlelabels[i] for i in indices]
    z1 = get_zmatrix_amazon(shopamazon, shopkeys)
    z2 = get_zmatrix_google(shopgoogle, shopkeys)
    z = pd.concat([z1, z2], axis=1)
    lp_model = label_propagation.LabelSpreading(kernel='knn', alpha=0.8,
                                                n_neighbors=10)  # acc = 0.6, kappa = 0.47
    lp_model.fit(z, arr)
    indices = [i for i, x in enumerate(arr) if x == -1]
    predicted_labels = lp_model.transduction_[indices]
    pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)
    uncertainty_index = np.argsort(pred_entropies)[-100:]
    keys = [shopkeys[i] for i in indices]
    silhouette = metrics.silhouette_score(z, predicted_labels,
                                          metric='euclidean')
    return (predicted_labels, silhouette, uncertainty_index, pred_entropies)
def get_score(xtrain, xtest, ytrain, ytest):
    scaler = preprocessing.StandardScaler().fit(xtrain)
    xtrain = scaler.transform(xtrain)
    xtest = scaler.transform(xtest)
    # Use label propagation for semi-supervised learning
    # and random forest for supervised learning
    model = label_propagation.LabelSpreading(kernel='rbf', alpha=0.2)
    # model = RandomForestClassifier(max_depth=2, random_state=0)
    model.fit(xtrain, ytrain)
    test_pred = np.array(model.predict(xtest))
    # ytest = np.array(ytest)
    # if test_pred[0] == ytest[0]:
    #     return 1
    # else:
    #     return 0
    return test_pred
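# A small usage sketch for get_score(), assuming numeric feature arrays; the
# -1 entries in ytrain mark unlabeled samples, which is what makes
# LabelSpreading semi-supervised here. The toy data is illustrative.
import numpy as np

xtrain = np.array([[0.0, 0.1], [0.2, 0.0], [1.0, 1.1], [0.9, 1.0],
                   [0.1, 0.2], [1.1, 0.9], [0.0, 0.0], [1.0, 1.0]])
ytrain = np.array([0, 0, 1, 1, -1, -1, -1, -1])  # -1 = unlabeled
xtest = np.array([[0.05, 0.05], [1.05, 1.05]])
ytest = np.array([0, 1])
print(get_score(xtrain, xtest, ytrain, ytest))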
def objective(self, x):
    """
    Objective function for hyper-parameter selection/evaluation.

    Parameters
    ----------
    x : float
        hyper-parameter under test (gamma)

    Returns
    -------
    float
        a measure directly proportional to entropy
    """
    model = label_propagation.LabelSpreading(kernel=self.kernel,
                                             alpha=self.alpha,
                                             gamma=x)
    model.fit(self.x, self.y)
    label_prob = model.label_distributions_
    # average label entropy plus an L2 penalty on gamma
    return get_average_label_entropy(label_prob) + self.learning_rate * x**2
def train(self, inputs, targets, min_=0.01, max_=30, niter=10, stepsize=0.1):
    """
    Train the LP model given the data.

    Parameters
    ----------
    inputs : nd-array
        independent variables
    targets : vector
        dependent variable
    min_ : float
        lower bound of the gamma search range
    max_ : float
        upper bound of the gamma search range
    niter : int
        number of basinhopping iterations
    stepsize : float
        basinhopping step size
    """
    # Store the training data
    self.x = inputs
    self.y = targets
    # Tune gamma in RBF using basinhopping
    self.gamma = self.optimize(min_, max_, niter, stepsize)[0]
    # Propagate labels
    self.model = label_propagation.LabelSpreading(kernel=self.kernel,
                                                  alpha=self.alpha,
                                                  gamma=self.gamma)
    self.model.fit(self.x, self.y)
    if self.use_logger:
        self.logger.info(
            "Label Propagation model trained with {} samples".format(
                len(self.y)))
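# The optimize() method called by train() is not shown in this collection; a
# plausible method-body sketch using scipy's basinhopping over the entropy
# objective() above. Clamping the candidate gamma into [min_, max_] is an
# assumption on our part; np is assumed imported as elsewhere in the module.
from scipy.optimize import basinhopping

def optimize(self, min_, max_, niter, stepsize):
    def bounded_objective(g):
        # clamp the candidate gamma into the search range before evaluating
        gamma = float(np.clip(g[0], min_, max_))
        return self.objective(gamma)

    result = basinhopping(bounded_objective, x0=[(min_ + max_) / 2],
                          niter=niter, stepsize=stepsize)
    # train() reads element [0] of the returned array
    return np.clip(result.x, min_, max_)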
def retrieve_label_propagation(Lambda_res, lam_h, seed_set_high, n_neighbours,
                               restrict_positive_hfs=False, q=None):
    if Lambda_res < lam_h:
        return None
    remaining_elements = set(range(len(global_variables.embedding))) - set(
        [v for v, s in seed_set_high])
    if len(remaining_elements) == 0:
        return None
    # assuming that undiscovered low fidelities are depleted before the high ones!
    remaining_elements = np.array(list(remaining_elements))
    if restrict_positive_hfs:
        remaining_elements = np.setdiff1d(
            remaining_elements, np.where(global_variables.all_hfs[q] > 0)[0])
    # binarize the seed scores around (just under) their 90th percentile
    threshold = np.percentile([s for _, s in seed_set_high], 90) - 0.05
    labels = np.zeros(len(global_variables.embedding)) - 1  # -1 = unlabeled
    for i, s in seed_set_high:
        if s > threshold:
            labels[i] = 1
        else:
            labels[i] = 0
    lp_model = label_propagation.LabelSpreading(kernel='knn',
                                                n_neighbors=n_neighbours,
                                                max_iter=10)
    lp_model.fit(global_variables.embedding, labels)
    class_1 = np.where(lp_model.classes_ == 1)[0]
    class_1_ind = class_1[0]
    # pick the remaining element most confidently predicted as class 1
    v = remaining_elements[np.argmax(
        lp_model.label_distributions_[remaining_elements, class_1_ind])]
    return v
def different_split_experiment():
    best_split = 0
    best_acc = 0
    for i in range(1, 10):
        X_labeled = training_dataset[:500 * i]
        y_labeled = training_labels_copy[:500 * i]
        # X_unlabeled = training_dataset[500 * i:5000]
        # y_unlabeled = training_labels_copy[500 * i:5000]
        for j in range(500 * i, 5000):
            training_labels[j] = -1
        label_spread = label_propagation.LabelSpreading(kernel='knn',
                                                        n_neighbors=10,
                                                        alpha=0.8)
        label_spread.fit(training_dataset, training_labels)
        y_training_predicted = label_spread.predict(X_labeled)
        y_testing_predicted = label_spread.predict(training_dataset)
        count = 0
        count1 = 0
        for k in range(500 * i):  # correct predictions on the labeled slice
            if y_labeled[k] == y_training_predicted[k]:
                count += 1
        for l in range(5000):  # correct predictions on the full dataset
            if training_labels_copy[l] == y_testing_predicted[l]:
                count1 += 1
        if best_acc < count / (500 * i):
            best_split = i
            best_acc = count / (500 * i)
        print("when the proportion of labeled data to unlabeled data is "
              + str(i) + " : " + str(10 - i)
              + ", the training accuracy on labeled training data is "
              + str(round(count / (500 * i), 5))
              + ", the testing accuracy on unseen data is "
              + str(round(count1 / 5000, 5)))
    print("the best split proportion of labeled data to unlabeled data is "
          + str(best_split) + " : " + str(10 - best_split))
    return best_split
from sklearn.semi_supervised import label_propagation
from sklearn.datasets import make_circles

#%%
# generate ring with inner box
n_samples = 200
X, y = make_circles(n_samples=n_samples, shuffle=False)
# X: coordinates on the 2D plane [-1, 1]
# plt.scatter(X[:, 0], X[:, 1])
outer, inner = 0, 1
labels = np.full(n_samples, -1.)  # original labels: everything unlabeled
labels[0] = outer   # first point
labels[-1] = inner  # last point

# #############################################################################
# Learn with LabelSpreading
label_spread = label_propagation.LabelSpreading(kernel='knn', alpha=0.8)
label_spread.fit(X, labels)

# #############################################################################
# Plot output labels
output_labels = label_spread.transduction_
plt.figure(figsize=(8.5, 4))
plt.subplot(1, 2, 1)
plt.scatter(X[labels == outer, 0], X[labels == outer, 1], color='navy',
            marker='s', lw=0, label="outer labeled", s=10)
plt.scatter(X[labels == inner, 0], X[labels == inner, 1], color='c',
            marker='s', lw=0, label="inner labeled", s=10)
n_total_samples = len(y)  # 330
n_labeled_points = 10     # 10 samples are already labeled
max_iterations = 5        # run 5 iterations

unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]  # the 320 unlabeled samples
# print('unlabeled_indices:', unlabeled_indices)
f = plt.figure()  # figure used for plotting

for i in range(max_iterations):
    if len(unlabeled_indices) == 0:
        print("no unlabeled items left to label")  # everything has been labeled
        break
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1  # mark the unlabeled samples (the last 320) with -1
    lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)  # train the model
    lp_model.fit(X, y_train)
    predicted_labels = lp_model.transduction_[unlabeled_indices]  # predicted labels
    # print('predicted_labels:', predicted_labels)
    true_labels = y[unlabeled_indices]  # ground-truth labels
    cm = confusion_matrix(true_labels, predicted_labels,
                          labels=lp_model.classes_)
    print("iteration %i %s" % (i, 70 * "_"))  # print the iteration number
    print("Label Spreading model: %d labeled & %d unlabeled (%d total)"
          % (n_labeled_points, n_total_samples - n_labeled_points,
             n_total_samples))
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# step size in the mesh
h = .02

y_30 = np.copy(y)
y_30[rng.rand(len(y)) < 0.3] = -1
y_50 = np.copy(y)
y_50[rng.rand(len(y)) < 0.5] = -1

# we create an instance of SVM and fit our data. We do not scale our
# data since we want to plot the support vectors
ls30 = (label_propagation.LabelSpreading().fit(X, y_30), y_30)
ls50 = (label_propagation.LabelSpreading().fit(X, y_50), y_50)
ls100 = (label_propagation.LabelSpreading().fit(X, y), y)
rbf_svc = (svm.SVC(kernel='rbf').fit(X, y), y)

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# titles for the plots
titles = ['Label Spreading 30% data',
          'Label Spreading 50% data',
          'Label Spreading 100% data',
          'SVC with rbf kernel']
def _label_propagation(self):
    digits = datasets.load_digits()
    rng = np.random.RandomState(0)
    indices = np.arange(len(digits.data))
    rng.shuffle(indices)

    X = digits.data[indices[:330]]
    y = digits.target[indices[:330]]
    images = digits.images[indices[:330]]

    n_total_samples = len(y)
    n_labeled_points = 10
    unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]
    f = plt.figure()

    for i in range(5):
        y_train = np.copy(y)
        y_train[unlabeled_indices] = -1

        lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
        lp_model.fit(X, y_train)

        predicted_labels = lp_model.transduction_[unlabeled_indices]
        true_labels = y[unlabeled_indices]

        cm = confusion_matrix(true_labels, predicted_labels,
                              labels=lp_model.classes_)

        print('Iteration %i %s' % (i, 70 * '_'))
        print("Label Spreading model: %d labeled & %d unlabeled (%d total)"
              % (n_labeled_points, n_total_samples - n_labeled_points,
                 n_total_samples))
        print(classification_report(true_labels, predicted_labels))
        print("Confusion matrix")
        print(cm)

        # compute the entropies of transduced label distributions
        pred_entropies = stats.distributions.entropy(
            lp_model.label_distributions_.T)

        # select five digit examples that the classifier is most uncertain about
        uncertainty_index = np.argsort(pred_entropies)[-5:]

        # keep track of indices that we get labels for
        delete_indices = np.array([])

        f.text(.05, (1 - (i + 1) * .183),
               "model %d\n\nfit with\n%d labels" % ((i + 1), i * 5 + 10),
               size=10)
        for index, image_index in enumerate(uncertainty_index):
            image = images[image_index]

            sub = f.add_subplot(5, 5, index + 1 + (5 * i))
            sub.imshow(image, cmap=plt.cm.gray_r)
            sub.set_title('predict: %i\ntrue: %i' % (
                lp_model.transduction_[image_index], y[image_index]), size=10)
            sub.axis('off')

            # labeling 5 points, remote from labeled set
            delete_index, = np.where(unlabeled_indices == image_index)
            delete_indices = np.concatenate((delete_indices, delete_index))

        unlabeled_indices = np.delete(unlabeled_indices, delete_indices)
        n_labeled_points += 5

    f.suptitle("Active learning with Label Propagation.\nRows show 5 most "
               "uncertain labels to learn with the next model.")
    plt.subplots_adjust(0.12, 0.03, 0.9, 0.8, 0.2, 0.45)
    plt.show()
def main(merged_filename, real_file, boted_file):
    documents = []
    print(merged_filename)
    users_info = getUserIMDMessages(merged_filename)
    # print(users_info.keys())
    # print("total users:" + str(len(users_info.keys())))
    considered_users_index = []
    index = 0
    for user in users_info.keys():
        if users_info[user]['m'] > 1:
            considered_users_index.append(index)
        index += 1
    real_users_index, bot_users_index, real_users, bot_users = labeling_data(
        merged_filename, real_file, boted_file)
    labels = []
    users = list(users_info.keys())
    for user in users_info.keys():
        if users.index(user) in considered_users_index:
            if user in real_users:
                labels.append(0)
            else:
                labels.append(1)
    users_dict = users_info

    # Number of messages per user feature
    user_chats_ft = get_chats_features(users_dict)
    user_chats_ft = pd.DataFrame(user_chats_ft)

    # User windows
    uw_dict = user_windows(merged_filename)
    uw_ft = []
    for user in users_dict.keys():
        if users_dict[user]['m'] > 1:
            uw_ft.append(uw_dict[user])
    uw_ft = pd.DataFrame(uw_ft)

    # Per-user IMD features
    user_imd_bins = pd.DataFrame(get_IMD_features(users_dict))

    # Entropy of a user's IMDs as a feature
    user_entropy = pd.DataFrame(np.array(get_entropy_features(merged_filename)))
    # keep only the features of users with more than one message
    user_entropy = get_final_features(user_entropy, considered_users_index)
    # print(uw_ft, user_entropy)

    # plot the entropy feature of each user
    # plot_entropy_feature(user_entropy, considered_users_index, real_users, bot_users, users)

    final_features = pd.concat([user_chats_ft, user_imd_bins], axis=1)
    # final_features = pd.concat([user_chats_ft, user_imd_bins, uw_ft, user_entropy], axis=1)

    real_X = []
    real_Y = []
    bot_X = []
    bot_Y = []
    list_user_fts = final_features.values.tolist()
    # print(len(list_user_fts))
    for i, index in enumerate(considered_users_index):
        if users[index] in real_users:
            real_X.append(list_user_fts[i][0])
            real_Y.append(list_user_fts[i][1])
        else:
            bot_X.append(list_user_fts[i][0])
            bot_Y.append(list_user_fts[i][1])
    real_tup = [(real_X[i], real_Y[i]) for i in range(len(real_X))]
    bot_tup = [(bot_X[i], bot_Y[i]) for i in range(len(bot_X))]
    # real_tup = sorted(real_tup)
    # bot_tup = sorted(bot_tup)
    # print(real_tup)
    # print(bot_tup)
    # plt.scatter(real_X, real_Y, c='red', s=7)
    # plt.scatter(bot_X, bot_Y, c='blue', s=7)
    # plt.axvline(x=initavg_f1, c='yellow')
    # plt.axhline(y=initavg_f2, c='green')
    # plt.title(filename)
    # plt.show()

    # Cluster datapoints on the desired new set of features
    # f1 = final_features.iloc[:, 0].values
    # f2 = final_features.iloc[:, 1].values
    # gt_X = np.array(list(zip(f1, f2)))
    # Xmeans = XMeans(kmax=7)
    # Xmeans.fit(list(gt_X))
    # XMeanslabels = Xmeans.labels_
    # plot_graph(gt_X, real_X, real_Y, bot_X, bot_Y, XMeanslabels)

    channel_followers = get_channel_followers(
        '../followers_cnt/', real_file.split('#')[1].split('database')[0])
    # real_users_index = set(real_users_index)
    users = set(users)
    for user in channel_followers:
        if user in users:
            real_users_index.append(list(users).index(user))
    print(real_users_index)
    # plot_graph(final_features, real_users_index, bot_users_index)
    print("#considered users:" + str(len(considered_users_index)))
    label_X, label_Y = data_labelprop(final_features, real_users_index,
                                      bot_users_index)
    print(len(label_X), len(label_Y))
    orig_labelX, orig_labelY = label_X[:], label_Y[:]
    label_X, label_Y = readjust(label_X, label_Y, uw_ft, user_entropy)
    real_X = []
    real_Y = []
    bot_X = []
    bot_Y = []
    for i in range(len(label_X)):
        if label_Y[i] == 0:
            real_X.append(final_features.iloc[i, 0])
            real_Y.append(final_features.iloc[i, 1])
        elif label_Y[i] == 1:
            bot_X.append(final_features.iloc[i, 0])
            bot_Y.append(final_features.iloc[i, 1])

    plt.scatter(real_X, real_Y, c='blue', s=25, label='real')
    plt.scatter(bot_X, bot_Y, c='red', s=25, label='bot')
    # plt.tick_params(labelsize=16)
    # hfont = {'fontname': 'Helvetica'}
    plt.rc('font', family='sans-serif')
    plt.rc('xtick', labelsize='x-large')
    plt.rc('ytick', labelsize='x-large')
    plt.xlabel('Number of messages per user', fontsize='x-large')
    plt.ylabel('Mean IMD per user', fontsize='x-large')
    # plt.title('Seed labels', fontsize='large', fontweight='bold')
    plt.legend(loc='lower right', prop={'size': 12})
    plt.show()

    # Learn with LabelSpreading
    label_spread = label_propagation.LabelSpreading(kernel='rbf', alpha=0.6)
    label_spread.fit(label_X, label_Y)
    output_labels = label_spread.transduction_
    label_spread.fit(orig_labelX, orig_labelY)
    orig_output_labels = label_spread.transduction_
    # print(output_labels)
    # print(labels)

    pred_real_X = []
    pred_real_Y = []
    pred_bot_X = []
    pred_bot_Y = []
    for i in range(len(output_labels)):
        if output_labels[i] == 0:
            pred_real_X.append(final_features.iloc[i, 0])
            pred_real_Y.append(final_features.iloc[i, 1])
        else:
            pred_bot_X.append(final_features.iloc[i, 0])
            pred_bot_Y.append(final_features.iloc[i, 1])
    plt.xlim(0, 80)
    plt.ylim(0, 1500)
    plt.scatter(pred_real_X, pred_real_Y, c='blue', s=25, label='real')
    plt.scatter(pred_bot_X, pred_bot_Y, c='red', s=25, label='bot')
    # plt.tick_params(labelsize=16)
    plt.legend(loc='upper right', prop={'size': 12})
    # hfont = {'fontname': 'Helvetica'}
    plt.rc('font', family='sans-serif')
    plt.rc('xtick', labelsize='x-large')
    plt.rc('ytick', labelsize='x-large')
    plt.xlabel('Number of messages per user', fontsize='x-large')
    plt.ylabel('Mean IMD per user', fontsize='x-large')
    # plt.title('Final labels after propagation', fontsize='large', fontweight='bold')
    plt.show()

    orig_tot, orig_cor = 0, 0
    total, correct = 0, 0
    for i in range(len(output_labels)):
        if label_Y[i] == -1:
            if labels[i] == output_labels[i]:
                correct += 1
            else:
                print(label_X[i], i)
            total += 1
        if orig_labelY[i] == -1:
            if labels[i] == orig_output_labels[i]:
                orig_cor += 1
            orig_tot += 1
    print(correct, total)
    print((float(correct) / total) * 100)
    print(accuracy_score(np.array(labels), output_labels) * 100)
    # accuracy when the seed labels are used without readjustment
    print(orig_cor, orig_tot)
    print((float(orig_cor) / orig_tot) * 100)
    print(accuracy_score(np.array(labels), orig_output_labels) * 100)
    return accuracy_score(np.array(labels), output_labels) * 100