def label_spread(self, X_train, y_train, gamma=None, max_iter=None):
    """
    Train an RBF-kernel Label Spreading model from scikit-learn.

    Parameters
    ----------
    X_train : training feature matrix (the original notes call for scaled
        data).
    y_train : class labels, one per row of ``X_train``.
    gamma : RBF kernel parameter. ``None`` keeps scikit-learn's default.
    max_iter : maximum number of iterations allowed. ``None`` keeps
        scikit-learn's default.

    Returns
    -------
    tuple
        ``(predicted_labels, predicted_proba)``: the fitted model's
        ``transduction_`` labels and ``predict_proba(X_train)``.
    """
    # Forward only the hyper-parameters the caller supplied. Passing the
    # ``None`` defaults through would hand LabelSpreading values it does not
    # accept (``max_iter=None`` cannot bound the iteration loop).
    overrides = {}
    if gamma is not None:
        overrides['gamma'] = gamma
    if max_iter is not None:
        overrides['max_iter'] = max_iter

    model = LabelSpreading(kernel='rbf', n_jobs=-1, **overrides)
    model.fit(X_train, y_train)

    # Labels assigned to every training row during fitting.
    predicted_labels = model.transduction_
    predicted_proba = model.predict_proba(X_train)
    return predicted_labels, predicted_proba
def source_to_target_label_prop(self,
                                train_feat_space='embeds',
                                kernel_param={
                                    'type': 'rbf',
                                    'gamma': 20
                                }):
    """
    Propagate source labels onto the target set with LabelSpreading.

    Source and target features for the chosen space are stacked, every
    target row is given the label -1, and a default-configured
    LabelSpreading model is fitted. The transduced labels are written to the
    ``<train_feat_space>Space_prop_pred`` column of ``self.source_data`` and
    ``self.target_data``.

    Parameters
    ----------
    train_feat_space : str
        One of ``'encoded'``, ``'embeds'`` or ``'embeds_tsne'``.
    kernel_param : dict
        Accepted for interface compatibility; this body never reads it.

    Raises
    ------
    NotImplementedError
        If ``train_feat_space`` is not one of the supported values.
    """
    print(
        '-----------------------------------------------------------------------'
    )
    print('Propagating labels from source to target in {0} space'.format(
        train_feat_space))

    if train_feat_space == 'encoded':
        # Encoded representations are computed lazily on first use.
        if not hasattr(self, 'source_encoded_reps'):
            self.dim_red_autoencode()
        concat_embs = np.concatenate(
            (self.source_encoded_reps, self.target_encoded_reps))
    elif train_feat_space == 'embeds':
        concat_embs = np.concatenate(
            (self.source_embds_vec, self.target_embds_vec))
    elif train_feat_space == 'embeds_tsne':
        if self.tsne_computed == 0:
            self.compute_tsne()
        feat_cols = [
            'embeds_tsne_' + str(idx)
            for idx in range(self.n_tsne_components)
        ]
        # ``.values`` replaces ``DataFrame.as_matrix()``, which newer pandas
        # releases no longer provide.
        source_data_feats = self.source_data[feat_cols].values
        target_data_feats = self.target_data[feat_cols].values
        concat_embs = np.concatenate(
            (source_data_feats, target_data_feats))
    else:
        # ``NotImplemented`` is a constant, not an exception class.
        raise NotImplementedError(
            'Unsupported feature space: {0}'.format(train_feat_space))

    # Target rows are presented to the model as unlabeled (-1).
    unknown_labels = np.ones_like(self.target_labels) * -1
    label_prop_train_labels = np.concatenate(
        (self.source_labels, unknown_labels))

    lp_model = LabelSpreading()
    lp_model.fit(concat_embs, label_prop_train_labels)
    transduction_labels = lp_model.transduction_
    label_distributions = lp_model.label_distributions_
    print(label_distributions[0:10, :])

    pred_col = train_feat_space + 'Space_prop_pred'
    self.source_data[pred_col] = transduction_labels[:self.n_source]
    self.target_data[pred_col] = transduction_labels[self.n_source:]

    if self.inter_save:
        print('Saving propagated labels')
        self.save_perforamance(self.serving_dir, suffix=self.save_suffix)

    # Format the message before printing; ``print(...)`` returns None.
    print('Completed source to target label propagation in {0} space'.format(
        train_feat_space))
    print(
        '-----------------------------------------------------------------------'
    )
def test_generate_algokey():
    """Check the key generate_algokey builds for five scikit-learn estimators."""
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.semi_supervised import LabelSpreading
    from sklearn.tree import DecisionTreeClassifier

    n_weaks, tree_depth, K = 50, 3, 6

    # (estimator, expected key) pairs.
    cases = [
        (RandomForestClassifier(n_estimators=n_weaks, max_depth=tree_depth),
         'RandomForest_M50_D3'),
        (KNeighborsClassifier(n_neighbors=K),
         '6NearestNeighbors'),
        (DecisionTreeClassifier(max_depth=tree_depth),
         'DecisionTree_D3'),
        (AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=tree_depth),
            n_estimators=n_weaks,
        ),
         'AdaBoost_M50_D3'),
        (LabelSpreading(kernel="knn"),
         'LabelSpreading_knn'),
    ]

    funcRes = []
    for estimator, _ in cases:
        funcRes.append(generate_algokey(estimator))
    trueRes = [expected for _, expected in cases]

    assert funcRes == trueRes
def test_LabelSpreading_rbf(*data):
    """
    Plot RBF-kernel LabelSpreading accuracy against gamma, one curve per alpha.

    ``data`` unpacks to ``(x, y, unlabeled_indices)``. The labels at
    ``unlabeled_indices`` are replaced by -1 for training, and each fitted
    model is scored on those same samples against their true labels.
    """
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # LabelSpreading only accepts alpha strictly inside (0, 1), so the sweep
    # stops short of 1.0 instead of ending on it.
    alphas = np.linspace(0.01, 0.99, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    # Color set: a different color for each curve.
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))

    # Train and plot.
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelSpreading(max_iter=100, gamma=gamma, alpha=alpha,
                                 kernel='rbf')
            clf.fit(x, y_train)
            scores.append(
                clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)

    # Configure the figure.
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading rbf kernel")
    plt.show()
def semiLabelSpreding(feature_extractor, generator, val_generator, kernel,
                      neighbors, gamma, alpha):
    """
    Fit LabelSpreading on extracted features and classify the validation set.

    Features come from ``feature_extractor.predict_generator`` over
    ``generator``. Every sample whose filename starts with ``'N'`` is treated
    as unlabeled (class -1) before fitting.

    NOTE: ``generator.classes`` is modified in place by the relabeling, as in
    the original code.

    Returns
    -------
    Predicted classes for the features extracted from ``val_generator``.
    """
    import math

    semi = LabelSpreading(kernel=kernel, n_neighbors=neighbors, gamma=gamma,
                          alpha=alpha, tol=0.001, max_iter=1000000)

    # ``steps`` must be a whole number of batches. Rounding up includes the
    # final partial batch; plain division floors under Python 2 and yields a
    # float under Python 3.
    train_steps = int(
        math.ceil(generator.samples / float(generator.batch_size)))
    features = feature_extractor.predict_generator(generator,
                                                   steps=train_steps,
                                                   verbose=1)

    classes = generator.classes
    for i in range(generator.samples):
        if generator.filenames[i][0] == 'N':
            classes[i] = -1
    semi.fit(features, classes)

    val_steps = int(
        math.ceil(val_generator.samples / float(val_generator.batch_size)))
    val_features = feature_extractor.predict_generator(val_generator,
                                                       steps=val_steps,
                                                       verbose=1)
    predicted_classes = semi.predict(val_features)
    return predicted_classes
def testLabelPropagation():
    """
    Fit a knn LabelSpreading model on document profiles and print the result.

    Documents from the first iterator keep their encoded category as label;
    documents from the second iterator are given -1. After fitting, each
    transduced label is printed next to the document's original category.
    """
    from sklearn.semi_supervised import LabelSpreading
    from sklearn import preprocessing

    label_enc = preprocessing.LabelEncoder()
    label_prop_model = LabelSpreading(kernel='knn')
    train_iter = getDocumentIterator1("published = 0 and is_test = 1")
    validation_iter = getDocumentIterator1("published = 1 and is_test = 1")
    semantic_model = gensim_tests.SemanticModel.load(
        'gensim/full_corpus_300000')

    all_profiles, labels = [], []
    propagation_labels = []
    for doc in train_iter:
        all_profiles.append(semantic_model.inferProfile(doc.rawtext))
        labels.append(doc.learned_category[0])
        propagation_labels.append(doc.learned_category[0])

    # Encode the known categories as integers before adding the -1 markers.
    label_enc.fit(propagation_labels)
    propagation_labels = label_enc.transform(propagation_labels).tolist()

    for doc in validation_iter:
        all_profiles.append(semantic_model.inferProfile(doc.rawtext))
        labels.append(doc.learned_category[0])
        propagation_labels.append(-1)

    # Single-argument print calls give the same output under Python 2 and 3;
    # the original print statements are a syntax error in Python 3.
    print(propagation_labels)
    print("Fitting")
    label_prop_model.fit(all_profiles, propagation_labels)
    output_labels = label_prop_model.transduction_
    for propagated, orig in zip(label_enc.inverse_transform(output_labels),
                                labels):
        print("%s %s" % (propagated, orig))
def test_LabelSpreading_knn(*data):
    """
    Plot knn-kernel LabelSpreading accuracy against k, one curve per alpha.

    ``data`` unpacks to ``(x, y, unlabeled_indices)``. The labels at
    ``unlabeled_indices`` are replaced by -1 for training, and each fitted
    model is scored on those same samples against their true labels.
    """
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # LabelSpreading only accepts alpha strictly inside (0, 1), so the sweep
    # stops short of 1.0 instead of ending on it.
    alphas = np.linspace(0.01, 0.99, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    # Color set: a different color for each curve.
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))

    # Train and plot.
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelSpreading(max_iter=100, n_neighbors=K, alpha=alpha,
                                 kernel='knn')
            clf.fit(x, y_train)
            scores.append(
                clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)

    # Configure the figure.
    ax.set_xlabel(r"k")
    ax.set_ylabel("score")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading knn kernel")
    plt.show()
def computeSimilarities2(vect, matrix_values, numLine, numRan=10):
    """
    Score artefact pairs with repeated semi-supervised LabelSpreading fits.

    Args:
        vect: the answer vector (0 for false links, 1 for true links).
        matrix_values: descriptor matrix, i.e. the probabilities produced by
            all IR models.
        numLine: number of pairs of artefacts; forwarded to computeRandom.
        numRan: how many random sub-samples to fit (default 10).

    Returns:
        preds: the combined prediction returned by ``vote`` over all rounds.
    """
    model = LabelSpreading()
    allPrediction = []

    for _ in range(numRan):
        # Draw a fresh random subset and refit the same estimator on it.
        subVect, subMatrix_values = computeRandom(vect, matrix_values, numLine)
        fitted = model.fit(subMatrix_values, subVect)
        print("new predicted function computed")

        # Keep only the second probability column for every pair.
        proba = fitted.predict_proba(matrix_values)
        allPrediction.append(proba[:, 1])

    # Combine the per-round predictions ("vote majoritaire").
    preds = vote(allPrediction, len(vect), numRan)
    print(preds)
    return preds
def train(self, inputs, targets, min_=0.01, max_=30, niter=10, stepsize=0.1):
    """
    Train the label-propagation model on the given data.

    Parameters
    ----------
    inputs : nd-array
        Independent variables; stored as ``self.x``.
    targets : vector
        Dependent variable; stored as ``self.y``.
    min_, max_, niter, stepsize :
        Forwarded unchanged to ``self.optimize``, whose first result element
        becomes ``self.gamma``.
    """
    self.x, self.y = inputs, targets

    # Gamma is tuned by self.optimize (basinhopping, per the original notes).
    tuned = self.optimize(min_, max_, niter, stepsize)
    self.gamma = tuned[0]

    # Build and fit the spreading model with the tuned gamma.
    self.model = LabelSpreading(kernel=self.kernel,
                                alpha=self.alpha,
                                gamma=self.gamma)
    self.model.fit(self.x, self.y)

    if self.use_logger:
        message = "Label Propagation model trained with {} samples".format(
            len(self.y))
        self.logger.info(message)
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    """
    Average knn LabelSpreading macro-F1 and accuracy over ten bag-of-words runs.

    Each run reloads and re-splits the dataset, vectorizes it with the
    runtime vocabulary, fits LabelSpreading and evaluates it on the test
    documents. The two averages are appended to ``str_list`` as text and
    printed.

    Parameters
    ----------
    nbr : forwarded to ``dataset.split_train_true``.
    str_list : list extended in place with the two result strings.
    neighbors : ``n_neighbors`` for the knn kernel.
    """
    n_runs = 10
    avg_f1 = 0
    avg_accuracy = 0

    for _ in range(n_runs):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)

        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        # ``toarray()`` gives a plain ndarray. ``todense()`` returns
        # ``np.matrix``, which current scikit-learn refuses as input.
        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors)
        clf.fit(vectors.toarray(), dataset.train['target'])

        # Densify the test matrix once and reuse it for both metrics.
        test_dense = vectorizer.transform(dataset.test['data']).toarray()
        pred = clf.predict(test_dense)
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_dense, dataset.test['target'])

    avg_accuracy = avg_accuracy / n_runs
    avg_f1 = avg_f1 / n_runs
    str_list.extend([
        "KNN BOW runtime voc Avg f1: " + str(avg_f1),
        "KNN BOW runtime vod Avg acc: " + str(avg_accuracy)
    ])
    print("Avg f1: " + str(avg_f1))
    print("Avg acc: " + str(avg_accuracy))
def soft_clamping(kernel, xTrain, yTrain, MI=10000, k=3, g=0.6, a=0.1):
    """
    Run LabelSpreading on the training data and log the resulting statistics.

    The model's class probabilities are passed through ``normalize``, turned
    into benign/malware labels by comparing the two columns, summarized by
    ``stats`` and written as one CSV row.

    Parameters
    ----------
    kernel : kernel passed to LabelSpreading.
    xTrain, yTrain : training features and labels.
    MI : ``max_iter`` for the model.
    k : ``n_neighbors`` for the model.
    g : ``gamma`` for the model.
    a : ``alpha`` for the model.
    """
    spread = LabelSpreading(kernel=kernel,
                            n_neighbors=k,
                            gamma=g,
                            alpha=a,
                            max_iter=MI,
                            n_jobs=-1)
    spread.fit(xTrain, yTrain)
    norm_Y = normalize(yTrain, spread.predict_proba(xTrain))

    labels = []
    for row in norm_Y:
        first, second = row[0], row[1]
        # NOTE(review): rows where both scores are equal contribute no label,
        # so ``labels`` can end up shorter than ``yTrain`` - confirm intended.
        if first > second:
            labels.append(benign)
        elif second > first:
            labels.append(malware)

    (lm_to_b, lb_to_m, tp, tn, fp, fn,
     pred_day1, missed_day1) = stats(yTrain, labels, yExpect, day_one)

    results = ['SC', kernel, k, g, a]
    results += [lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1]

    file_name = 'SC_CMN_5per_' + str(rate) + '.csv'
    write_csv(file_name, results)
def label(filenames, train_path='../data/train_molecules_30.mat'):
    """
    Label data with the provided filenames.

    The labeled training set is stacked in front of the unlabeled samples
    (given label -1), and a default LabelSpreading model is fitted on the
    flattened features.

    :param filenames: List of filenames containing data to label.
    :param train_path: Path of the labeled training data for ``load_data``.
    :return: ``(unlabeled_X, unlabeled_Y), (X_all, Y_all)`` - the newly
        labeled data and the conglomerate dataset.
    """
    unlabeled = [scipy.io.loadmat(fname) for fname in filenames]
    unlabeled_X = np.vstack([data['X'] for data in unlabeled])
    X, Y = load_data(train_path, shape=(-1, 30, 30, 30))

    num_unlabeled = unlabeled_X.shape[0]
    # A flat label vector is what the estimator expects.
    Y = Y.reshape(-1)
    num_labeled = Y.shape[0]
    unlabeled_Y = np.zeros(num_unlabeled) - 1

    Y_all = np.concatenate((Y, unlabeled_Y))
    X_all = np.vstack((X, unlabeled_X))
    X_all = X_all.reshape((-1, 27000))

    label_prop_model = LabelSpreading()
    label_prop_model.fit(X_all, Y_all)
    Y_all = label_prop_model.transduction_

    # Unlabeled rows were appended after the labeled ones, so they start at
    # index ``num_labeled``. Slicing from ``num_unlabeled`` is only right
    # when both sets happen to be the same size.
    unlabeled_Y = Y_all[num_labeled:]
    return (unlabeled_X, unlabeled_Y), (X_all, Y_all)
def __init__(self,
             method="spreading",
             kernel="knn",
             alpha=0.2,
             gamma=20,
             n_neighbors=7,
             **kwargs):
    """
    Create one label propagation/spreading estimator per region.

    ``method`` is matched case-insensitively against ``"propagation"`` and
    ``"spreading"``. Any other value raises ``InitializationError``. The
    remaining keyword arguments go to the parent initializer.
    """
    super(LabSP, self).__init__(**kwargs)

    chosen = method.lower()
    if chosen == "propagation":
        # NOTE(review): newer scikit-learn releases may no longer accept
        # ``alpha`` for LabelPropagation - confirm against the pinned version.
        estimator_cls = LabelPropagation
    elif chosen == "spreading":
        estimator_cls = LabelSpreading
    else:
        raise InitializationError("Method %s not valid" % method)

    self.regressors = [
        estimator_cls(kernel=kernel,
                      alpha=alpha,
                      gamma=gamma,
                      n_neighbors=n_neighbors)
        for _ in range(len(self.regions))
    ]
def label_spreading(x_train_all,
                    y_train_all,
                    cv_semisupervised,
                    kernel="knn",
                    alpha=0.2,
                    name="LabelSpreading",
                    only_model=False,
                    **kwargs):
    """
    Fit a semi-supervised Label Spreading model and optionally evaluate it.

    Parameters:
        x_train_all (pd.DataFrame): features of both labelled and unlabelled
            data.
        y_train_all (pd.Series): labels of all rows; unlabelled rows must
            carry the label -1.
        cv_semisupervised (list): train/test index tuples, one per fold.
        kernel (str): either "rbf" or "knn".
        alpha (float): clamping factor between 0 and 1.
        name (str): name/description passed on to the metrics helper.
        only_model (bool): when True, return just the fitted model.
        **kwargs: accepted but not used by this function.

    Returns:
        The fitted model if ``only_model`` is set, otherwise whatever
        ``calculate_metrics_cv`` returns for it.
    """
    # TODO cv: use the same cv split but randomly assign the other unlabelled
    # data pieces to the other cv folds
    spreader = LabelSpreading(kernel=kernel,
                              alpha=alpha,
                              n_jobs=-1,
                              max_iter=100)
    ls_model = spreader.fit(x_train_all, y_train_all)

    if not only_model:
        return calculate_metrics_cv(model=ls_model,
                                    X=x_train_all,
                                    y_true=y_train_all,
                                    cv=cv_semisupervised,
                                    name=name)
    return ls_model
def predict_ssl(self, x_sup, y_sup, x_unsup, y_unsup, x_test, y_test):
    """
    Label the unsupervised split with LabelSpreading, then score two SVMs.

    Encoder outputs from ``self.model_e`` are flattened to 2-D, a knn
    LabelSpreading model is fitted with the unsupervised rows masked as -1,
    and its transduced labels train a linear-kernel SVC and a LinearSVC. The
    better of the two test accuracies is printed.
    """

    def encode_flat(batch):
        # Encoder output is reshaped to (samples, dim1 * dim2).
        encoded = self.model_e.predict(batch)
        return np.reshape(
            encoded, (encoded.shape[0], encoded.shape[1] * encoded.shape[2]))

    ls_model = LabelSpreading(kernel='knn', n_neighbors=5)

    # Rows after the supervised block are the ones to hide from the model.
    unlabeled_indices = np.arange(self.train_size)[x_sup.shape[0]:]
    y_sup_unsup_train = np.copy(np.concatenate([y_sup, y_unsup]))
    y_sup_unsup_train[unlabeled_indices] = -1

    h_fit = encode_flat(np.concatenate([x_sup, x_unsup], axis=0))
    ls_model.fit(h_fit, y_sup_unsup_train)
    # Kept from the original; only a commented-out accuracy print used it.
    y_unsup_pred = ls_model.transduction_[unlabeled_indices]

    h_test = encode_flat(x_test)

    # Train both SVM variants on the transduced labels.
    y_fit_true = ls_model.transduction_
    kernel_svc = svm.SVC(kernel='linear')
    kernel_svc.fit(h_fit, y_fit_true)
    acc_svm = accuracy_score(y_test, kernel_svc.predict(h_test))

    linear_svc = svm.LinearSVC()
    linear_svc.fit(h_fit, y_fit_true)
    acc_svm_linear = accuracy_score(y_test, linear_svc.predict(h_test))

    print('acc_svm is ', max(acc_svm, acc_svm_linear))
def doLabelSpreading(self, X, y, **kwargs):
    """
    Fit LabelSpreading with the given keyword options and return probabilities.

    When ``self.verbose`` exceeds 2, the input shapes, a histogram of ``y``
    and a histogram of the model's predictions on ``X`` are printed.

    Returns
    -------
    ``predict_proba(X)`` of the fitted model.
    """
    debug = self.verbose > 2
    label_spread_model = LabelSpreading(**kwargs)

    if debug:
        print("X, y shapes: ", X.shape, y.shape)
        print(" y hist: ", np.histogram(y))

    label_spread_model.fit(X, y)

    if debug:
        predicted = label_spread_model.predict(X)
        print("ls_predict:", np.histogram(predicted))

    return label_spread_model.predict_proba(X)
def label_spr(self):
    """
    Fit the configured semi-supervised model ``self.manyfit`` times.

    Each round re-initializes the variables, optionally applies PCA, builds
    the model selected by ``self.ss_mod`` / ``self.ss_kern``, fits it on
    ``self.X_tot`` / ``self.y_tot`` and scores it on the validation data.
    The mean accuracy is stored in ``self.json_dict['ss_accuracy']``.

    Afterwards the last model's transduced labels replace ``self.y_tot``,
    predictions for ``self.X_submit`` go to ``self.y_submit``, and the data
    is saved when ``self.datastate == "save"``.
    """
    RESULT_ACC_SS = 0
    for i in range(self.manyfit):
        # Initialising of variables:
        self.init_variables()

        # PCA preprocessing:
        if self.PCA_MODE:
            self.pca_preprocess(self.pca)

        # Semi supervised algo
        if self.ss_mod == 'LabSpr' and self.ss_kern == 'knn':
            self.label_prop_model = LabelSpreading(
                kernel='knn',
                gamma=self.gamma,
                n_neighbors=self.neighbors,
                alpha=self.alpha)
        elif self.ss_mod == 'LabProp' and self.ss_kern == 'rbf':
            # NOTE(review): newer scikit-learn releases may no longer accept
            # ``alpha`` for LabelPropagation - confirm against the pinned
            # version.
            self.label_prop_model = LabelPropagation(
                kernel='rbf',
                gamma=self.gamma,
                n_neighbors=self.neighbors,
                alpha=self.alpha,
                max_iter=10)
        else:
            # Fallback for every other combination. The class name was
            # misspelled here, which raised NameError.
            self.label_prop_model = LabelPropagation(
                kernel=self.ss_kern,
                gamma=self.gamma,
                n_neighbors=self.neighbors)

        print('Starting to fit. Run for shelter!')
        self.label_prop_model.fit(self.X_tot, self.y_tot)
        temp_acc = self.label_prop_model.score(self.X_valid_lab,
                                               self.y_valid)
        print('{} / {} :accuracy = {}'.format(i, self.manyfit, temp_acc))
        RESULT_ACC_SS += temp_acc

    self.y_tot = self.label_prop_model.transduction_
    self.y_submit = self.label_prop_model.predict(self.X_submit)
    if self.datastate == "save":
        self.save_to_csv(self.X_tot, self.y_tot, self.X_valid_lab,
                         self.y_valid)

    RESULT_ACC_SS /= self.manyfit
    self.json_dict['ss_accuracy'] = RESULT_ACC_SS
    print('accuracy obtained on the test set of the ss algo:', RESULT_ACC_SS)
def test_LabelSpreading(*data):
    """
    Fit an RBF LabelSpreading model and print its accuracy on hidden labels.

    ``data`` unpacks to ``(x, y, unlabeled_indices)``. Labels at
    ``unlabeled_indices`` are masked with -1 before fitting and then compared
    with the model's transduced labels.
    """
    x, y, unlabeled_indices = data

    masked_y = np.copy(y)
    masked_y[unlabeled_indices] = -1

    clf = LabelSpreading(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(x, masked_y)

    accuracy = metrics.accuracy_score(y[unlabeled_indices],
                                      clf.transduction_[unlabeled_indices])
    print("Accuracy: %f" % accuracy)
def LabelSpreadingWrapper(X_train, y_train, X_test):
    """
    Transductively label ``X_test`` with a knn LabelSpreading model.

    The test rows are appended to the training rows with label -1, the model
    is fitted on the combined data, and the transduced labels of the
    appended rows are returned.
    """
    clf = LabelSpreading(kernel='knn',
                         n_neighbors=10,
                         n_jobs=-1,
                         max_iter=1000,
                         alpha=0.1)
    n_test = len(X_test)
    newlabels = np.concatenate((np.array(y_train), -np.ones(n_test)))
    clf.fit(np.concatenate((X_train, X_test)), newlabels)

    # Slice from an explicit start index. ``[-n_test:]`` would return every
    # label when ``n_test`` is 0.
    first_test_row = len(newlabels) - n_test
    return clf.transduction_[first_test_row:]
def label_spreading(self, X_train, y, X_test):
    """
    Fit LabelSpreading on stacked train/test rows and predict the test rows.

    Parameters
    ----------
    X_train, X_test : sparse matrices (they are densified before stacking).
    y : labels for the stacked rows. Presumably the test portion is marked
        -1 by the caller - verify against the call site.

    Returns
    -------
    tuple
        ``(final_labels, clf)``: predicted test labels and the fitted model.
    """
    clf = LabelSpreading()

    # ``toarray()`` gives plain ndarrays. ``todense()`` returns ``np.matrix``,
    # which current scikit-learn refuses as input.
    test_dense = X_test.toarray()
    X = np.concatenate((X_train.toarray(), test_dense), axis=0)
    print("X shape now ", X.shape)
    print("Y shape now ", y.shape)

    clf.fit(X, y)
    final_labels = clf.predict(test_dense)
    label_prob = clf.predict_proba(test_dense)
    print(compare_labels_probabilities().compare(label_prob, final_labels))
    return final_labels, clf
def test_LabelSpreading(*data):
    """
    Fit a knn LabelSpreading model and print two accuracy figures.

    ``data`` unpacks to ``(X, y, unlabeled_indices)``. Labels at
    ``unlabeled_indices`` are masked with -1 before fitting. Accuracy on
    those samples is printed once via ``clf.score`` and once via the
    transduced labels.
    """
    X, y, unlabeled_indices = data

    masked_y = np.copy(y)
    masked_y[unlabeled_indices] = -1

    # ``gamma`` is passed exactly as in the original call.
    clf = LabelSpreading(max_iter=1000, kernel='knn', gamma=0.1)
    clf.fit(X, masked_y)

    hidden_truth = y[unlabeled_indices]
    transduced = clf.transduction_[unlabeled_indices]

    score_accuracy = clf.score(X[unlabeled_indices], hidden_truth)
    print('Accuracy : %f' % score_accuracy)
    transduction_accuracy = metrics.accuracy_score(hidden_truth, transduced)
    print('Accuracy : %f' % transduction_accuracy)
def labelSpreading(x, y):
    """
    Sweep LabelSpreading's gamma over ``tensValues`` and report the results.

    Each model is evaluated with ``runSet``. The accuracies and MSEs are
    plotted against gamma, and the best accuracy is registered through
    ``addModelComparison`` together with the last model built.
    """
    scores, mses = [], []

    for gamma_value in tensValues:
        model = LabelSpreading(gamma=gamma_value)
        outcome = runSet(model, x, y)
        accuracy, mse = outcome
        scores.append(accuracy)
        mses.append(mse)

    model_name = model.__class__.__name__
    showMSEGraph(tensValues, scores, mses, "gamma", model_name)
    best_score = max(scores)
    addModelComparison(model, best_score)
def __init__(self):
    """Seed NumPy's RNG and set up the RBF LabelSpreading model and scaler."""
    np.random.seed(1102)

    # Use all cores but two, and never fewer than one.
    worker_count = max(multiprocessing.cpu_count() - 2, 1)

    self.model = LabelSpreading(kernel="rbf",
                                n_jobs=worker_count,
                                alpha=0.2,
                                n_neighbors=10,
                                max_iter=15)
    self.name = "LABEL-SPREADING"
    self.scaler = MinMaxScaler()
def semi_supervised():
    """
    Cross-validate a knn LabelSpreading model on the ``gtd`` data.

    Features and labels come from ``separate_cols_with_unknown``; the
    features are then run through ``process_nontext`` and ``convertDType``.
    The 5-fold cross-validated accuracy is printed.
    """
    raw_features, labels = separate_cols_with_unknown(gtd)
    features = convertDType(process_nontext(raw_features))

    # Constructed as in the original, but never fitted or used afterwards.
    model = LabelPropagation(kernel="knn")

    model2 = LabelSpreading(kernel="knn")
    model2.fit(features, labels)

    preds = cross_val_predict(model2, features, labels, cv=5)
    accuracy = accuracy_score(labels, preds)
    print('5 fold cross val accuracy of model: %0.2f ' % accuracy)
def __init__(self, lmnn=False, max_iter=1000, lm_num=200):
    """
    Set up the knn LabelSpreading classifier and, optionally, an LMNN model.

    Parameters
    ----------
    lmnn : when truthy, an ``LMNN`` instance is also created as ``self.ml``.
    max_iter : iteration cap for the LabelSpreading classifier.
    lm_num : stored as ``self.lm_num``.
    """
    self.clf = LabelSpreading(kernel='knn',
                              n_neighbors=25,
                              max_iter=max_iter,
                              alpha=0.2,
                              n_jobs=-1)
    self.lmnn, self.lm_num = lmnn, lm_num

    if lmnn:
        self.ml = LMNN(use_pca=False, max_iter=2000)
def __init__(self,
             balanced=False,
             visual_expansion_use=True,
             re_score_alpha=0.15,
             re_score_proportional=True,
             regions=None,
             ve_estimators=20,
             ve_leafs=5,
             clf_estimators=20,
             clf_leafs=1,
             method="spreading",
             kernel="knn",
             lab_alpha=0.2,
             lab_gamma=20,
             lab_n_neighbors=7,
             weights=None):
    """
    Initialize the parent, then build per-region forests and label estimators.

    ``method`` is matched case-insensitively against ``"propagation"`` and
    ``"spreading"``. Any other value raises ``InitializationError``. When
    ``weights`` is falsy, ``[0.5, 0.5, 0.]`` is used.
    """
    super(SAL, self).__init__(balanced=balanced,
                              visual_expansion_use=visual_expansion_use,
                              re_score_alpha=re_score_alpha,
                              re_score_proportional=re_score_proportional,
                              regions=regions,
                              ve_estimators=ve_estimators,
                              ve_leafs=ve_leafs)

    self.weights = weights if weights else [0.5, 0.5, 0.]

    region_count = len(self.regions)
    self.cluster_forest = [
        RandomTreesEmbedding(n_estimators=clf_estimators,
                             min_samples_leaf=clf_leafs,
                             n_jobs=-1)
        for _ in range(region_count)
    ]
    self.affinity_matrix = []
    self.feature_matcher = None  # Initialized when set_ex

    chosen = method.lower()
    if chosen == "propagation":
        # NOTE(review): newer scikit-learn releases may no longer accept
        # ``alpha`` for LabelPropagation - confirm against the pinned version.
        estimator_cls = LabelPropagation
    elif chosen == "spreading":
        estimator_cls = LabelSpreading
    else:
        raise InitializationError("Method %s not valid" % method)

    self.regressors = [
        estimator_cls(kernel=kernel,
                      alpha=lab_alpha,
                      gamma=lab_gamma,
                      n_neighbors=lab_n_neighbors)
        for _ in range(len(self.regions))
    ]
def get_clf(self):
    """
    Build the classifier selected by ``self.clf_method``.

    Returns
    -------
    tuple
        ``(clf, semi_supervised)``: ``'LR'`` gives a LogisticRegression with
        ``semi_supervised`` False; ``'LP'`` gives a LabelSpreading with
        ``semi_supervised`` True.

    Raises
    ------
    ValueError
        For any other ``clf_method``.
    """
    method = self.clf_method

    if method == 'LR':
        return LogisticRegression(max_iter=200), False
    if method == 'LP':
        return LabelSpreading(), True

    raise ValueError("Non-implemented method")
def fit(self):
    """
    Fit a knn LabelSpreading model on validation plus training primitives.

    Validation rows come first and carry labels derived from
    ``self.val_ground`` as ``(val_ground + 1) / 2``. Training rows follow and
    are marked unlabeled with -1. The fitted model is stored in
    ``self.model``.
    """
    labeled_X = self.val_primitive_matrix
    unlabeled_X = self.train_primitive_matrix

    # Labeled and unlabeled data are stacked into a single design matrix.
    X = np.concatenate((labeled_X, unlabeled_X))

    val_labels = (self.val_ground + 1) / 2.
    train_labels = np.full(np.shape(unlabeled_X)[0], -1.)
    y = np.concatenate((val_labels, train_labels))

    self.model = LabelSpreading(kernel='knn')
    self.model.fit(X, y)
def knn(X, labels):
    """
    Fit a knn LabelSpreading model and return its transduced labels.

    Parameters
    ----------
    X : feature matrix.
    labels : label vector passed to ``fit`` together with ``X``.

    Returns
    -------
    The ``transduction_`` attribute of the fitted model.
    """
    spreader = LabelSpreading(kernel='knn', alpha=0.6, max_iter=100)
    spreader.fit(X, labels)
    return spreader.transduction_
def propagate_labels(
    features,
    labels,
):
    """
    Fit LabelSpreading with the ``construct_graph`` kernel.

    The fitted model's classes are logged at debug level.

    Returns
    -------
    ``predict_proba(features)`` of the fitted model.
    """
    spreader = LabelSpreading(kernel=construct_graph, n_jobs=-1)
    spreader.fit(features, labels)
    logger.debug(spreader.classes_)
    return spreader.predict_proba(features)