def test_label_spreading_algorithms():
    """Compare scikit-learn's LabelSpreading against our custom implementation."""
    x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
    # The two implementations take labels in different encodings:
    # sklearn uses -1 for unlabeled, ours uses one-hot rows (all-zero = unlabeled).
    y_sklearn = np.array([1, 2, -1, -1])
    y_custom = np.array([[1, 0], [0, 1], [0, 0], [0, 0]])
    # Shared hyper-parameters.
    alpha, max_iter, tol = 0.2, 30, 1e-3
    # Reference result from scikit-learn.
    sk_model = LabelSpreadingSKLearn(kernel="rbf", max_iter=max_iter,
                                     alpha=alpha, tol=tol)
    expected = sk_model.fit(x, y_sklearn).predict(x)
    # Our implementation works on a precomputed affinity matrix.
    w = distance_matrix(x, measure=rbf_distance)
    custom = LabelSpreadingCustom(alpha=alpha, max_iter=max_iter, tol=tol)
    custom = custom.fit(w, y_custom)
    # Shift class indices by one so they line up with sklearn's labels.
    actual = np.array(custom.predict(y_custom)) + 1
    assert_array_equal(actual, expected)
def test_LabelSpreading_rbf(*data):
    """Plot LabelSpreading (rbf kernel) accuracy over a gamma grid, one curve per alpha."""
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1  # mask the points to be transduced
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    # One distinct colour per alpha curve.
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))
    # Train one model per (alpha, gamma) pair; score on the held-out labels.
    for alpha, color in zip(alphas, colors):
        scores = [
            LabelSpreading(max_iter=100, gamma=gamma, alpha=alpha,
                           kernel='rbf')
            .fit(x, y_train)
            .score(x[unlabeled_indices], y[unlabeled_indices])
            for gamma in gammas
        ]
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # Figure cosmetics.
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading rbf kernel")
    plt.show()
def test_LabelSpreading_knn(*data):
    """Plot LabelSpreading (knn kernel) accuracy over neighbour counts, one curve per alpha."""
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1  # mask the points to be transduced
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    # One distinct colour per alpha curve.
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))
    # Train one model per (alpha, K) pair; score on the held-out labels.
    for alpha, color in zip(alphas, colors):
        scores = [
            LabelSpreading(max_iter=100, n_neighbors=K, alpha=alpha,
                           kernel='knn')
            .fit(x, y_train)
            .score(x[unlabeled_indices], y[unlabeled_indices])
            for K in Ks
        ]
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # Figure cosmetics.
    ax.set_xlabel(r"k")
    ax.set_ylabel("score")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading knn kernel")
    plt.show()
def soft_clamping(kernel, xTrain, yTrain, MI=10000, k=3, g=0.6, a=0.1):
    """Run soft-clamping LabelSpreading and write the resulting stats to CSV.

    NOTE(review): relies on module-level names (`benign`, `malware`, `yExpect`,
    `day_one`, `rate`) — confirm they are defined where this runs.
    """
    spread = LabelSpreading(kernel=kernel, n_neighbors=k, gamma=g, alpha=a,
                            max_iter=MI, n_jobs=-1)
    spread.fit(xTrain, yTrain)
    norm_Y = normalize(yTrain, spread.predict_proba(xTrain))
    # Map each probability pair to a class; exact ties are skipped, which
    # mirrors the original behaviour (labels may then be shorter than norm_Y).
    labels = []
    for row in norm_Y:
        if row[0] > row[1]:
            labels.append(benign)
        elif row[0] < row[1]:
            labels.append(malware)
    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, labels, yExpect, day_one)
    results = ['SC', kernel, k, g, a, lm_to_b, lb_to_m, tp, tn, fp, fn,
               pred_day1, missed_day1]
    file_name = 'SC_CMN_5per_' + str(rate) + '.csv'
    write_csv(file_name, results)
def semiLabelSpreding(feature_extractor, generator, val_generator, kernel, neighbors, gamma, alpha):
    """Fit LabelSpreading on extracted features, then predict the validation set."""
    semi = LabelSpreading(kernel=kernel, n_neighbors=neighbors, gamma=gamma,
                          alpha=alpha, tol=0.001, max_iter=1000000)
    features = feature_extractor.predict_generator(
        generator, steps=generator.samples / generator.batch_size, verbose=1)
    classes = generator.classes
    # Files whose name starts with 'N' are treated as unlabeled (-1).
    for i in range(generator.samples):
        if generator.filenames[i][0] == 'N':
            classes[i] = -1
    semi.fit(features, classes)
    val_features = feature_extractor.predict_generator(
        val_generator,
        steps=val_generator.samples / val_generator.batch_size,
        verbose=1)
    return semi.predict(val_features)
def computeSimilarities2(vect, matrix_values, numLine, numRan=10):
    """Ensemble LabelSpreading over random sub-samples of the training data.

    Args:
        vect: answer vector (0 for false links, 1 for true links).
        matrix_values: descriptor matrix, i.e. probabilities of all IR models.
        numLine: number of pairs of artefacts.
        numRan: how many random sub-samples to train and vote over.

    Returns:
        preds: probability that each pair of artefacts is linked.
    """
    allPrediction = []
    model = LabelSpreading()
    # Fit on `numRan` random subsets, predicting the full matrix each time.
    for _ in range(numRan):
        subVect, subMatrix_values = computeRandom(vect, matrix_values, numLine)
        fitted = model.fit(subMatrix_values, subVect)
        print("new predicted function computed")
        allPrediction.append(fitted.predict_proba(matrix_values)[:, 1])
    # Combine the ensemble members by majority vote.
    preds = vote(allPrediction, len(vect), numRan)
    print(preds)
    return preds
def label_spreading(X_train, y_train, Xunlabelled, X_test, y_test):
    """Fit LabelSpreading on labeled + unlabeled samples and print predictions.

    NOTE(review): X_test / y_test are accepted but never used here.
    """
    X_train = X_train[:, :]
    y_train = y_train[:]
    Xunlabelled = Xunlabelled[:10000, :]  # cap the unlabeled pool at 10k rows
    # Stack labeled and unlabeled samples; unlabeled targets are -1.
    X_both = np.vstack((X_train, Xunlabelled))
    y_both = np.append(y_train, -np.ones((Xunlabelled.shape[0],)))
    label_prop_model = LabelSpreading(max_iter=100)
    label_prop_model.fit(np.copy(X_both), np.copy(y_both))
    y_pred = label_prop_model.predict(np.copy(X_both))
    print(y_pred)
def testLabelPropagation():
    # Propagate learned categories from training docs to validation docs via
    # knn LabelSpreading over inferred semantic profiles.
    # NOTE: Python 2 syntax (print statements) — do not run under Python 3.
    from sklearn.semi_supervised import LabelSpreading
    from sklearn import preprocessing
    label_enc = preprocessing.LabelEncoder()
    label_prop_model = LabelSpreading(kernel='knn')
    train_iter = getDocumentIterator1("published = 0 and is_test = 1")
    validation_iter = getDocumentIterator1("published = 1 and is_test = 1")
    semantic_model = gensim_tests.SemanticModel.load(
        'gensim/full_corpus_300000')
    all_profiles, labels = [], []
    propagation_labels = []
    # Training docs keep their learned category as the propagation seed.
    for doc in train_iter:
        all_profiles.append(semantic_model.inferProfile(doc.rawtext))
        labels.append(doc.learned_category[0])
        propagation_labels.append(doc.learned_category[0])
    label_enc.fit(propagation_labels)
    propagation_labels = label_enc.transform(propagation_labels).tolist()
    # Validation docs enter the graph unlabeled (-1).
    for doc in validation_iter:
        all_profiles.append(semantic_model.inferProfile(doc.rawtext))
        labels.append(doc.learned_category[0])
        propagation_labels.append(-1)
    print propagation_labels
    print "Fitting"
    label_prop_model.fit(all_profiles, propagation_labels)
    output_labels = label_prop_model.transduction_
    # Compare propagated labels against the originally learned categories.
    for propagated, orig in zip(label_enc.inverse_transform(output_labels),
                                labels):
        print propagated, orig
def predict_ssl(self, x_sup, y_sup, x_unsup, y_unsup, x_test, y_test):
    """Label-spread over encoder features, then train SVMs on the transduction.

    Prints the better of SVC(linear) / LinearSVC test accuracy.
    """
    ls_model = LabelSpreading(kernel='knn', n_neighbors=5)
    indices = np.arange(self.train_size)
    unlabeled_indices = indices[x_sup.shape[0]:]
    # Concatenate supervised + unsupervised targets, masking the latter with -1.
    y_sup_unsup = np.concatenate([y_sup, y_unsup])
    y_sup_unsup_train = np.copy(y_sup_unsup)
    y_sup_unsup_train[unlabeled_indices] = -1
    x_fit = np.concatenate([x_sup, x_unsup], axis=0)
    # Encode and flatten to 2-D for the graph kernel.
    h_fit = self.model_e.predict(x_fit)
    h_fit = np.reshape(h_fit,
                       (h_fit.shape[0], h_fit.shape[1] * h_fit.shape[2]))
    ls_model.fit(h_fit, y_sup_unsup_train)
    y_unsup_pred = ls_model.transduction_[unlabeled_indices]
    h_test = self.model_e.predict(x_test)
    h_test = np.reshape(h_test,
                        (h_test.shape[0], h_test.shape[1] * h_test.shape[2]))
    # Train two SVM variants on the propagated labels; report the better one.
    y_fit_true = ls_model.transduction_
    clf_svc = svm.SVC(kernel='linear')
    clf_svc.fit(h_fit, y_fit_true)
    acc_svm = accuracy_score(y_test, clf_svc.predict(h_test))
    clf_svc = svm.LinearSVC()
    clf_svc.fit(h_fit, y_fit_true)
    acc_svm_linear = accuracy_score(y_test, clf_svc.predict(h_test))
    print('acc_svm is ', max(acc_svm, acc_svm_linear))
def source_to_target_label_prop(self, train_feat_space='embeds',
                                kernel_param={
                                    'type': 'rbf',
                                    'gamma': 20
                                }):
    """Propagate labels from source-domain samples to target-domain samples.

    Parameters
    ----------
    train_feat_space : str
        Feature space to propagate in: 'encoded', 'embeds' or 'embeds_tsne'.
    kernel_param : dict
        Kernel configuration. NOTE(review): currently unused by the body;
        kept for interface compatibility.

    Raises
    ------
    NotImplementedError
        If `train_feat_space` is not a supported space.
    """
    print(
        '-----------------------------------------------------------------------'
    )
    print('Propagating labels from source to target in {0} space'.format(
        train_feat_space))
    # Build the concatenated (source + target) feature matrix.
    if train_feat_space == 'encoded':
        if not hasattr(self, 'source_encoded_reps'):
            self.dim_red_autoencode()
        concat_embs = np.concatenate(
            (self.source_encoded_reps, self.target_encoded_reps))
    elif train_feat_space == 'embeds':
        concat_embs = np.concatenate(
            (self.source_embds_vec, self.target_embds_vec))
    elif train_feat_space == 'embeds_tsne':
        if self.tsne_computed == 0:
            self.compute_tsne()
        feat_cols = ['embeds_tsne_' + str(idx)
                     for idx in range(self.n_tsne_components)]
        source_data_feats = self.source_data[feat_cols].as_matrix()
        target_data_feats = self.target_data[feat_cols].as_matrix()
        concat_embs = np.concatenate(
            (source_data_feats, target_data_feats))
    else:
        # BUG FIX: `raise NotImplemented` raises TypeError (NotImplemented is
        # a constant, not an exception class) — raise the proper exception.
        raise NotImplementedError(train_feat_space)
    # Target labels are unknown: mark them with the -1 sentinel.
    unknown_labels = np.ones_like(self.target_labels) * -1
    label_prop_train_labels = np.concatenate(
        (self.source_labels, unknown_labels))
    lp_model = LabelSpreading()
    lp_model.fit(concat_embs, label_prop_train_labels)
    transduction_labels = lp_model.transduction_
    label_distributions = lp_model.label_distributions_
    print(label_distributions[0:10, :])
    # Split the transduced labels back into source / target frames.
    self.source_data[
        train_feat_space + 'Space_prop_pred'] = transduction_labels[:self.n_source]
    self.target_data[
        train_feat_space + 'Space_prop_pred'] = transduction_labels[self.n_source:]
    if self.inter_save:
        print('Saving propagated labels')
        self.save_perforamance(self.serving_dir, suffix=self.save_suffix)
    # BUG FIX: `.format` was previously chained onto print()'s None return
    # value, which raised AttributeError at runtime.
    print('Completed source to target label propagation in {0} space'.format(
        train_feat_space))
    print(
        '-----------------------------------------------------------------------'
    )
class LabelSpreadingModel(SupervisedW2VModel):
    """Word-sense model that spreads answer labels over context vectors."""

    def fit_with_test(self, test_data):
        """Fit a knn LabelSpreading classifier on labeled + unlabeled contexts."""
        xs, ys = [], []
        self.ans_mapping = []
        for ans, cvs in self.context_vectors.items():
            xs.extend(cvs)
            # Assign each distinct answer a stable integer class id.
            if ans in self.ans_mapping:
                y = self.ans_mapping.index(ans)
            else:
                y = len(self.ans_mapping)
                self.ans_mapping.append(ans)
            ys.extend(y for _ in cvs)
        # Test contexts participate unlabeled (-1) in the propagation.
        for ctx in test_data:
            xs.append(self.cv(ctx))
            ys.append(-1)
        self.ls_clf = LabelSpreading(kernel='knn', n_neighbors=11)
        self.ls_clf.fit(xs, ys)

    def __call__(self, x, ans=None, with_confidence=False):
        """Predict the answer for context `x`; confidence is a 0.0 placeholder."""
        probs = self.ls_clf.predict_proba([self.cv(x)])[0]
        m_ans = self.ans_mapping[probs.argmax()]
        # TODO - get confidence as difference between probs[pred] and next
        return (m_ans, 0.0) if with_confidence else m_ans
def train(self, inputs, targets, min_=0.01, max_=30, niter=10, stepsize=0.1):
    """Train the label-propagation model on the given data.

    Parameters
    ----------
    inputs : nd-array
        Independent variables.
    targets : vector
        Dependent variable.
    min_, max_ : float
        Bounds for the gamma search.
    niter : int
        Number of basinhopping iterations.
    stepsize : float
        Basinhopping step size.
    """
    self.x = inputs
    self.y = targets
    # Tune the RBF gamma via basinhopping, then fit the spreading model.
    self.gamma = self.optimize(min_, max_, niter, stepsize)[0]
    self.model = LabelSpreading(kernel=self.kernel, alpha=self.alpha,
                                gamma=self.gamma)
    self.model.fit(self.x, self.y)
    if self.use_logger:
        self.logger.info("Label Propagation model trained with {} samples".format(len(self.y)))
def label(filenames, train_path='../data/train_molecules_30.mat'):
    """Label data with the provided filenames.

    :param filenames: List of filenames containing data to label.
    :param train_path: Path to the labeled training .mat file.
    :return: ((unlabeled_X, unlabeled_Y), (X_all, Y_all)) — newly labeled
        data and the conglomerate dataset.
    """
    unlabeled = [scipy.io.loadmat(fname) for fname in filenames]
    unlabeled_X = np.vstack([data['X'] for data in unlabeled])
    X, Y = load_data(train_path, shape=(-1, 30, 30, 30))
    num_unlabeled = unlabeled_X.shape[0]
    # Unlabeled samples get the -1 sentinel label.
    unlabeled_Y = np.zeros(num_unlabeled) - 1
    unlabeled_Y = unlabeled_Y.reshape((-1, 1))
    Y = Y.reshape((-1, 1))
    Y_all = np.vstack((Y, unlabeled_Y))
    X_all = np.vstack((X, unlabeled_X))
    X_all = X_all.reshape((-1, 27000))
    label_prop_model = LabelSpreading()
    label_prop_model.fit(X_all, Y_all)
    Y_all = label_prop_model.transduction_
    # BUG FIX: the unlabeled block sits at the END of Y_all (labeled Y was
    # stacked first), so take the last num_unlabeled entries — the original
    # `Y_all[num_unlabeled:]` sliced from the wrong offset.
    unlabeled_Y = Y_all[-num_unlabeled:]
    return (unlabeled_X, unlabeled_Y), (X_all, Y_all)
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    """Average knn LabelSpreading macro-f1 and accuracy over 10 BOW runs."""
    avg_f1 = 0
    avg_accuracy = 0
    for _ in range(10):
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])
        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(
            vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
    avg_accuracy = avg_accuracy / 10
    avg_f1 = avg_f1 / 10
    str_list.extend(["KNN BOW runtime voc Avg f1: " + avg_f1.__str__(),
                     "KNN BOW runtime vod Avg acc: " + avg_accuracy.__str__()])
    print("Avg f1: " + avg_f1.__str__())
    print("Avg acc: " + avg_accuracy.__str__())
def label_spread(self, X_train, y_train, gamma=None, max_iter=None):
    """Train scikit-learn's LabelSpreading model (rbf kernel).

    Parameters
    ----------
    X_train : scaled training data
    y_train : class labels (unlabeled points marked -1)
    gamma : parameter for the rbf kernel
    max_iter : maximum number of iterations allowed

    Returns
    -------
    tuple
        (predicted labels, predicted probabilities) for the training set.
    """
    model = LabelSpreading(kernel='rbf', gamma=gamma, max_iter=max_iter,
                           n_jobs=-1)
    model.fit(X_train, y_train)
    # Transduced labels for every point, plus per-class probabilities.
    predicted_labels = model.transduction_
    predicted_proba = model.predict_proba(X_train)
    return predicted_labels, predicted_proba
class ModelLabelSpreading:
    """Transductive LabelSpreading wrapper: fit() stores data, predict() transduces."""

    def __init__(self):
        np.random.seed(1102)
        # Leave two cores free, but always use at least one worker.
        self.model = LabelSpreading(
            kernel="rbf",
            n_jobs=int(np.max([multiprocessing.cpu_count() - 2, 1])),
            alpha=0.2,
            n_neighbors=10,
            max_iter=15)
        self.name = "LABEL-SPREADING"
        self.scaler = MinMaxScaler()

    def fit(self, X, y, Xu=None):
        """Remember the labeled data; actual fitting happens inside predict()."""
        np.random.seed(1102)
        self.Xl = X
        self.yl = y

    def predict(self, X):
        """Fit on labeled + test data (test marked -1); return test label distributions."""
        np.random.seed(1102)
        self.Xt = X
        stacked = self.scaler.fit_transform(np.vstack((self.Xl, self.Xt)))
        y = np.int64(np.append(self.yl, np.repeat(-1, self.Xt.shape[0])))
        assert stacked.shape[0] == len(y)
        self.model.fit(stacked, y)
        return np.array(
            self.model.label_distributions_)[(-self.Xt.shape[0]):, :]
def semi_supervised():
    """Cross-validate a knn LabelSpreading model on the GTD features."""
    features, labels = separate_cols_with_unknown(gtd)
    features = process_nontext(features)
    features = convertDType(features)
    model = LabelPropagation(kernel="knn")  # built but unused, as in original
    model2 = LabelSpreading(kernel="knn")
    model2.fit(features, labels)
    preds = cross_val_predict(model2, features, labels, cv=5)
    print('5 fold cross val accuracy of model: %0.2f ' % accuracy_score(labels, preds))
def test_LabelSpreading(*data):
    """Report transduction accuracy of LabelSpreading (rbf kernel)."""
    samples, labels, hidden_idx = data
    train_labels = np.copy(labels)
    train_labels[hidden_idx] = -1  # mask the labels we want transduced
    clf = LabelSpreading(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(samples, train_labels)
    truth = labels[hidden_idx]
    guessed = clf.transduction_[hidden_idx]
    print("Accuracy: %f" % metrics.accuracy_score(truth, guessed))
def LabelSpreadingWrapper(X_train, y_train, X_test):
    """Transduce labels for X_test by spreading from X_train's labels."""
    clf = LabelSpreading(kernel='knn', n_neighbors=10, n_jobs=-1,
                         max_iter=1000, alpha=0.1)
    # Test points join the graph as unlabeled (-1) samples.
    combined_X = np.concatenate((X_train, X_test))
    combined_y = np.concatenate((np.array(y_train), -np.ones(len(X_test))))
    clf.fit(combined_X, combined_y)
    return clf.transduction_[-len(X_test):]
def test_LabelSpreading(*data):
    """Print knn LabelSpreading accuracy two ways (clf.score and metrics)."""
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1  # mark as unlabeled
    # NOTE(review): gamma is a no-op with the knn kernel — verify intent.
    clf = LabelSpreading(max_iter=1000, kernel='knn', gamma=0.1)
    clf.fit(X, y_train)
    true_labels = y[unlabeled_indices]
    predicted_labels = clf.transduction_[unlabeled_indices]
    print('Accuracy : %f' % clf.score(X[unlabeled_indices], true_labels))
    print('Accuracy : %f' % metrics.accuracy_score(true_labels, predicted_labels))
def semi_supervised():
    """5-fold cross-validation of knn LabelSpreading on the GTD features."""
    features, labels = separate_cols_with_unknown(gtd)
    features = process_nontext(features)
    features = convertDType(features)
    # The LabelPropagation model is constructed but never fitted (as original).
    model = LabelPropagation(kernel="knn")
    model2 = LabelSpreading(kernel="knn")
    model2.fit(features, labels)
    preds = cross_val_predict(model2, features, labels, cv=5)
    print('5 fold cross val accuracy of model: %0.2f ' % accuracy_score(labels, preds))
def __init__(self):
    """Seed the RNG and build an rbf LabelSpreading model plus a MinMax scaler."""
    np.random.seed(1102)
    # Leave two cores free, but always keep at least one worker.
    workers = int(np.max([multiprocessing.cpu_count() - 2, 1]))
    self.model = LabelSpreading(kernel="rbf", n_jobs=workers, alpha=0.2,
                                n_neighbors=10, max_iter=15)
    self.name = "LABEL-SPREADING"
    self.scaler = MinMaxScaler()
def fit(self):
    """Fit knn LabelSpreading on labeled validation + unlabeled training data."""
    # Unlabeled training points carry the -1 sentinel; validation ground
    # truth is remapped from {-1, 1} to {0, 1}.
    X = np.concatenate(
        (self.val_primitive_matrix, self.train_primitive_matrix))
    labeled = (self.val_ground + 1) / 2.
    unlabeled = -1. * np.ones(np.shape(self.train_primitive_matrix)[0])
    y = np.concatenate((labeled, unlabeled))
    self.model = LabelSpreading(kernel='knn')
    self.model.fit(X, y)
def __init__(self, lmnn=False, max_iter=1000, lm_num=200):
    """Build a knn LabelSpreading classifier, optionally with LMNN metric learning."""
    self.clf = LabelSpreading(kernel='knn', n_neighbors=25,
                              max_iter=max_iter, alpha=0.2, n_jobs=-1)
    self.lmnn = lmnn
    self.lm_num = lm_num
    if lmnn:
        # Large-margin nearest-neighbour metric learner, fit elsewhere.
        self.ml = LMNN(use_pca=False, max_iter=2000)
def knn(X, labels):
    """Spread `labels` over X with a knn kernel; return the transduced labels."""
    spreader = LabelSpreading(kernel='knn', alpha=0.6, max_iter=100)
    spreader.fit(X, labels)
    return spreader.transduction_
def propagate_labels(
    features,
    labels,
):
    """Spread labels over a custom-built graph; return per-class probabilities."""
    spreader = LabelSpreading(kernel=construct_graph, n_jobs=-1)
    spreader.fit(features, labels)
    logger.debug(spreader.classes_)
    return spreader.predict_proba(features)
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    """Baseline comparison: semi-supervised models vs. plain classifiers, no transfer learning."""
    # --- semi-supervised baselines ---
    prop = LabelPropagation(kernel='knn')
    prop.fit(trainX, trainY)
    acc_ss_propagation, acc_ss_propagation_INFO = check_accuracy(
        testY, prop.predict(testX))

    spread = LabelSpreading(kernel='knn')
    spread.fit(trainX, trainY)
    acc_ss_spreading, acc_ss_spreading_INFO = check_accuracy(
        testY, spread.predict(testX))

    # --- supervised baselines without transfer learning ---
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    accLR, acc_LR_INFO = check_accuracy(testY, modelLR.predict(testX))

    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    accDT, acc_DT_INFO = check_accuracy(testY, modelDT.predict(testX))

    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    accNB, acc_NB_INFO = check_accuracy(testY, modelNB.predict(testX))

    # One summary row per (window, source, target) combination.
    return pd.DataFrame([{
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,
        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO': acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO': acc_ss_spreading_INFO,
        'acc_LR': accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO)
    }])
def _semi_supervised_learning(self, data_matrix, target):
    """Spread labels when enough classes exist; otherwise return target as-is."""
    n_classes = len(set(target))
    # Need the -1 "unlabeled" marker plus at least two real classes —
    # with fewer there is nothing meaningful to spread.
    if n_classes <= 2:
        return target
    estimator = LabelSpreading(kernel='knn', n_neighbors=self.n_neighbors)
    estimator.fit(data_matrix, target)
    predicted_target = estimator.predict(data_matrix)
    # Clamp known labels back onto the prediction.
    return self._clamp(target, predicted_target)
def computeSimilarities2(lsi_align, vsm_align, lda_align, f1_align, f2_align,
                         f3_align, f4_align, f5_align, f6_align, numLine,
                         numRan=10):
    """Ensemble LabelSpreading over random training subsets; flatten the vote.

    Args:
        lsi_align, vsm_align, lda_align, f1..f6_align: aligned IR model scores.
        numLine: number of pairs of artefacts.
        numRan: how many random training subsets to train and vote over.

    Returns:
        result: flattened list of link probabilities.
    """
    allPrediction = []
    model = LabelSpreading()
    # Fit on `numRan` random subsets, predicting the full matrix each time.
    for _ in range(numRan):
        subVect, subMatrix_values, matrix_values = buildingTrainingSet2(
            lsi_align, vsm_align, lda_align, f1_align, f2_align, f3_align,
            f4_align, f5_align, f6_align, numLine)
        print(len(subVect))
        print(len(subMatrix_values))
        fitted = model.fit(subMatrix_values, subVect)
        print("new predicted function computed")
        allPrediction.append(fitted.predict_proba(matrix_values)[:, 1])
    # Majority vote, then flatten the per-vector predictions.
    preds = vote(allPrediction, len(lsi_align), numRan)
    print(len(preds))
    result = [p[i] for p in preds for i in range(len(p))]
    print(len(result))
    return result
def _semi_supervised_learning(self, data_matrix, target):
    """Propagate labels via knn LabelSpreading if the task has enough classes."""
    # With fewer than three distinct values (e.g. -1 plus a single class)
    # there is nothing to spread, so return the original target untouched.
    if len(set(target)) > 2:
        model = LabelSpreading(kernel='knn', n_neighbors=self.n_neighbors)
        model.fit(data_matrix, target)
        # Clamp known labels back onto the prediction.
        return self._clamp(target, model.predict(data_matrix))
    return target
def LabelPropagation(support, support_ys, query):
    """Spread support labels onto query embeddings; return query predictions."""
    alpha = 0.3
    k_neighbours = 38
    all_embeddings = np.concatenate((support, query), axis=0)
    # Everything past the support block starts unlabeled (-1).
    labels = np.full(all_embeddings.shape[0], -1.)
    labels[:support.shape[0]] = support_ys
    spreader = LabelSpreading(kernel='knn', alpha=alpha,
                              n_neighbors=k_neighbours, tol=0.000001)
    spreader.fit(all_embeddings, labels)
    return spreader.transduction_[support.shape[0]:]
def computeSimilarities(vect, matrix_values):
    """Fit LabelSpreading once and return the linked-class probabilities.

    Args:
        matrix_values: descriptor matrix, i.e. probabilities of all IR models.
        vect: answer vector (0 for false links, 1 for true links).

    Returns:
        Probability that each pair of artefacts is linked.
    """
    model = LabelSpreading()
    fitted = model.fit(matrix_values, vect)
    print("model built")
    preds = fitted.predict_proba(matrix_values)
    print(preds)
    return preds[:, 1]
class SemiSupervised(BaselineModel):
    """LabelSpreading implementation of the baseline model."""

    def fit(self):
        """Fit knn LabelSpreading on labeled validation + unlabeled training data."""
        # Unlabeled training samples carry the -1 sentinel; validation ground
        # truth is remapped from {-1, 1} to {0, 1}.
        X = np.concatenate(
            (self.val_primitive_matrix, self.train_primitive_matrix))
        labeled = (self.val_ground + 1) / 2.
        unlabeled = -1. * np.ones(np.shape(self.train_primitive_matrix)[0])
        y = np.concatenate((labeled, unlabeled))
        self.model = LabelSpreading(kernel='knn')
        self.model.fit(X, y)
def compute_confident_measure(vect, lsi_align, lda_align, vsm_align, emb_align,
                              matrixSize=4):
    """Compute a confidence measure for each pair of artefacts.

    Args:
        vect: answer vector (0 for false links, 1 for true links).
        lsi_align, lda_align, vsm_align, emb_align: aligned IR model scores.
        matrixSize: number of descriptor columns.

    Returns:
        Probability that each pair of artefacts is linked.
    """
    # -1 marks unannotated pairs; annotated pairs get 0 (false) or 1 (true).
    vect0 = np.zeros((len(lsi_align), 1))
    vect0[:] = -1
    trueL, falseL = CreateTraining_set.create_link_class(vect, lsi_align)
    print(len(trueL))
    print(len(falseL))
    for idx in falseL:
        vect0[idx] = 0
    for idx in trueL:
        vect0[idx] = 1
    print("annoted_vectors_ok")
    # Descriptor columns, in order: lda, emb, vsm, lsi.
    matrix_values = np.zeros((len(lsi_align), matrixSize))
    matrix_values[:, 0] = compValues(lda_align)
    matrix_values[:, 1] = compValues(emb_align)
    matrix_values[:, 2] = compValues(vsm_align)
    matrix_values[:, 3] = compValues(lsi_align)
    model = LabelSpreading()
    fitted = model.fit(matrix_values, vect0)
    print("models built")
    preds = fitted.predict_proba(matrix_values)
    print(preds)
    return preds[:, 1]
def semi_supervised_learning(data_matrix, target):
    """If any target is -1, spread labels; known labels are clamped afterwards."""
    if -1 not in list(target):
        # Nothing unlabeled — return the target unchanged (as an array).
        return np.array(target)
    from sklearn.semi_supervised import LabelSpreading
    label_prop_model = LabelSpreading(kernel='knn', n_neighbors=5)
    label_prop_model.fit(data_matrix, target)
    pred_target = label_prop_model.predict(data_matrix)
    # Clamp: keep the original label wherever one was provided.
    extended_target = [
        label if (label != -1 and pred_label != label) else pred_label
        for pred_label, label in zip(pred_target, target)
    ]
    return np.array(extended_target)
def train_model(nodes, datasets):
    """Fit a LabelSpreading model over seed nodes plus unlabeled dataset nodes.

    The initial `nodes` are labeled seeds (one distinct class id per node);
    every node harvested from `datasets` joins the graph unlabeled (-1).
    Returns the fitted model with its vectorizer attached.
    """
    y = list(range(len(nodes)))  # one distinct class id per seed node
    nodes = list(nodes)
    vectorizer = DictVectorizer(sparse=True)
    for i, dataset in enumerate(datasets):
        g = compute_dataset(dataset)
        nodes.extend(g.classes)
        # Single-line progress indicator.
        sys.stdout.write('\r')
        sys.stdout.write(str(i + 1))
        sys.stdout.flush()
    X = vectorizer.fit_transform([dict(node.concept_vector) for node in nodes])
    # BUG FIX: the original did `np.array(range(...)) + [-1, ...]`, which is
    # element-wise numpy addition (a ValueError for mismatched lengths), not
    # list concatenation. Extend the label list with -1 for unlabeled nodes.
    y = y + [-1] * (len(nodes) - len(y))
    model = LabelSpreading()
    model.fit(X, y)
    model.vectorizer = vectorizer
    return model
def propogation(model, uids, labeled_ids):
    # Spread a binary label over doc2vec vectors and dump transduction rounds.
    # NOTE: Python 2 syntax (print statement, xrange) — do not run on Python 3.
    X, y1, y2 = [], [], []
    pool = []
    # Labeled documents get class 1.
    for uid in labeled_ids:
        X.append(model.docvecs[uid])
        y1.append(1)
    # All remaining documents start as unlabeled (-1).
    for uid in uids:
        if uid not in labeled_ids:
            X.append(model.docvecs[uid])
            y2.append(-1)
    label_prop_model = LabelSpreading(kernel='knn', alpha=1.0)
    y2 = np.array(y2)
    # Flip the first len(y1)-1 sentinel entries to class 0; shuffling below
    # moves that 0-block around so a different subset acts as negatives each
    # round.
    y2[0:(len(y1)-1)] = 0
    print len(y1) + len(y2)
    # Five propagation rounds, collecting each round's transduced labels.
    for i in xrange(5):
        np.random.shuffle(y2)
        label_prop_model.fit(X, y1 + y2.tolist())
        pool.append(label_prop_model.transduction_)
    pickle.dump(pool, open('data/propagation.pick', 'w'))
    pool = pickle.load(open('data/propagation.pick', 'r'))
    pool = np.array(pool)
    # Print the per-document label trajectory across the five rounds.
    for column in pool.T:
        print column
def main(): usage = "usage prog [options] arg" parser = OptionParser(usage=usage) parser.add_option("-t", "--task", dest="task", help="the task name") parser.add_option("-o", "--output", dest="output", help="the output file") (options, remainder) = parser.parse_args() train_paths = [ "../data/train_simple_feature.csv", "../data/train_plus_feature.csv", "../data/train_azure_plus_feature.csv", #"../data/train_azure_feature.csv", #"../data/train_module_feature.csv", #"../data/train_course_feature.csv", #"./blend_train.csv" ] label_path = "../data/truth_train.csv" test_paths = [ "../data/test_simple_feature.csv", "../data/test_plus_feature.csv", "../data/test_azure_plus_feature.csv", #"../data/test_azure_feature.csv", #"../data/test_module_feature.csv", #"../data/test_course_feature.csv", #"./blend_test.csv" ] train = merge_features(train_paths, label_path) train = train.drop(['user_drop_ratio'], axis=1) #train['user_drop_ratio'] = (train['user_drop_ratio'] + 8.0 / train['user_courses']) / (1.0 + 10.0 / train['user_courses']) y = encode_labels(train.dropout.values) train = train.drop('dropout', axis=1) tr_ids = train.enrollment_id.values X = train.drop('enrollment_id', axis=1) m, n = X.shape print 'train.shape=%s' % (str(X.shape)) test = merge_features(test_paths) test = test.drop(['user_drop_ratio'], axis=1) #test['user_drop_ratio'] = (test['user_drop_ratio'] + 8.0 / test['user_courses']) / (1.0 + 10.0 / test['user_courses']) tt_ids = test.enrollment_id.values X_test = test.drop('enrollment_id', axis=1) print 'test.shape=%s' % (str(X_test.shape)) scaler = StandardScaler().fit(np.vstack((X, X_test))) task = options.task if not task: task = "blend" if task == 'blend': clf_list = [ #("knn_p2_10", create_clf('knn', {"n_neighbors": 10, "p": 2})), #("knn_p2_10_scaler", create_clf('knn', {"n_neighbors": 10, "p": 2, "scaler": scaler})), #("knn_p2_100", create_clf('knn', {"n_neighbors": 100, "p": 2})), #("knn_p2_100_scaler", create_clf('knn', {"n_neighbors": 100, "p": 2, 
"scaler": scaler})), #("knn_p2_500", create_clf('knn', {"n_neighbors": 500, "p": 2})), #("knn_p2_500_scaler", create_clf('knn', {"n_neighbors": 500, "p": 2, "scaler": scaler})), #("knn_p2_100", create_clf('knn', {"n_neighbors": 100, "p": 2})), #("knn_p2_800", create_clf('knn', {"n_neighbors": 800, "p": 2})), #("knn_p1_10", create_clf('knn', {"n_neighbors": 10, "p": 1})), #("knn_p1_10_scaler", create_clf('knn', {"n_neighbors": 10, "p": 1, "scaler": scaler})), #("knn_p1_100", create_clf('knn', {"n_neighbors": 100, "p": 1})), #("knn_p1_100_scaler", create_clf('knn', {"n_neighbors": 100, "p": 1, "scaler": scaler})), #("knn_p1_500", create_clf('knn', {"n_neighbors": 500, "p": 1})), #("knn_p1_500_scaler", create_clf('knn', {"n_neighbors": 500, "p": 1, "scaler": scaler})), #("knn_p1_800", create_clf('knn', {"n_neighbors": 800, "p": 1})), #("knn_p1_800_scaler", create_clf('knn', {"n_neighbors": 800, "p": 1, "scaler": scaler})), #("extra_gini_10depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 10})), #("extra_entropy_10depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 10})), #("extra_gini_20depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 20})), #("extra_entropy_20depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 20})), ("extra_gini_30depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 30})), ("extra_entropy_30depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 30})), #("rfc_gini_3depth", create_clf("rfc", {"criterion": "gini", "max_depth": 3, "n_estimators": 200})), #("rfc_entropy_3depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 3, "n_estimators": 200})), ("rfc_gini_5depth", create_clf("rfc", {"criterion": "gini", "max_depth": 5, "n_estimators": 200})), ("rfc_entropy_5depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 5, "n_estimators": 200})), 
("rfc_gini_6depth", create_clf("rfc", {"criterion": "gini", "max_depth": 6, "n_estimators": 200})), ("rfc_entropy_6depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 6, "n_estimators": 200})), ("rfc_gini_8depth", create_clf("rfc", {"criterion": "gini", "max_depth": 8, "n_estimators": 200})), ("rfc_entropy_8depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 8, "n_estimators": 200})), ("rfc_gini_10depth", create_clf("rfc", {"criterion": "gini", "max_depth": 10, "n_estimators": 200})), ("rfc_entropy_10depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 200})), ("rfc_gini_12depth", create_clf("rfc", {"criterion": "gini", "max_depth": 12, "n_estimators": 200})), ("rfc_entropy_12depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 12, "n_estimators": 200})), #("xgb_1500_2depth", create_clf("xgb", {"max_depth": 2, "n_estimators": 1500, "learning_rate": 0.03})), #("xgb_600_3depth", create_clf("xgb", {"max_depth": 3, "n_estimators": 600, "learning_rate": 0.03})), #("xgb_600_4depth", create_clf("xgb", {"max_depth": 4, "n_estimators": 600, "learning_rate": 0.03})), ("xgb_600_5depth", create_clf("xgb", {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03})), #("xgb_600_6depth", create_clf("xgb", {"max_depth": 6, "n_estimators": 600, "learning_rate": 0.02})), #("xgb_600_7depth", create_clf("xgb", {"max_depth": 7, "n_estimators": 600, "learning_rate": 0.01})), #("xgb_600_8depth", create_clf("xgb", {"max_depth": 8, "n_estimators": 600, "learning_rate": 0.01})), #("lgc_1c_scale", create_clf("lgc", {"C": 1.0, "scaler": scaler})), #("lgc_1c", create_clf("lgc", {"C": 1.0})), #("lgc_1c_l1", create_clf("lgc", {"C": 1.0, "penalty": "l1"})), #("lgc_3c_scale", create_clf("lgc", {"C": 3.0, "scaler": scaler})), #("lgc_3c", create_clf("lgc", {"C": 3.0})), ("lgc_3c_l1", create_clf("lgc", {"C": 3.0, "penalty": "l1"})), #("lgc_5c_scale", create_clf("lgc", {"C": 5.0, "scaler": scaler})), #("lgc_5c", create_clf("lgc", 
{"C": 5.0})), ] X = X.values blend_train, blend_test = train_blend(X, y, X_test, clf_list, 5) print 'blend_train.shape=%s' % (str(blend_train.shape)) print 'blend_test.shape=%s' % (str(blend_test.shape)) cols = [cname for cname, clf in clf_list] cols = ['enrollment_id'] + cols blend_train_ids = np.hstack((np.matrix(tr_ids).T, blend_train)) blend_test_ids = np.hstack((np.matrix(tt_ids).T, blend_test)) dump_data(blend_train_ids, cols, "new_blend_train.csv") dump_data(blend_test_ids, cols, "new_blend_test.csv") blender = create_clf('lgc', {"C": 1.0, "penalty": "l1"}) auc = cv_loop(blend_train, y, blender) print 'AUC (LGC blend): %f' % auc blender = create_clf('ext', {"max_depth": 10, "criterion": "entropy", "n_estimator": 100}) auc = cv_loop(blend_train, y, blender) print 'AUC (EXT blend): %f' % auc blender = create_clf('xgb', {'max_depth': 2, "n_estimators": 150, "learning_rate": 0.05}) auc = cv_loop(blend_train, y, blender) print "AUC (XGB blend {d: %d, n: %d}): %f" % (2, 150, auc) blender = create_clf('xgb', {'max_depth': 3, "n_estimators": 200, "learning_rate": 0.05}) auc = cv_loop(blend_train, y, blender) print 'AUC (XGB blend {d: %d, n: %d}): %f' % (3, 200, auc) blender = create_clf('xgb', {'max_depth': 3, "n_estimators": 100, "learning_rate": 0.1}) blender = blender.fit(blend_train, y) preds = blender.predict_proba(blend_test)[:,1] write_submission(tt_ids, preds, "new_blend_submission.csv") combined_train = np.hstack((X, blend_train)) combined_test = np.hstack((X_test, blend_test)) blender = create_clf('xgb', {'max_depth': 5, "n_estimators": 600, "learning_rate": 0.03}) blender = blender.fit(combined_train, y) preds = blender.predict_proba(combined_test)[:,1] write_submission(tt_ids, preds, "new_combined_blend_submission.csv") elif task == 'lgc': print 'Try logistic regression ..' 
clf = create_clf("lgc", {"C": 3, "scaler": scaler, "penalty": "l1"}) auc = cv_loop(X, y, clf, 5) print 'AUC (all): %f' % auc elif task == "ext": print 'Try ExtraTreeClassifier' #clf = create_clf("ext", {"max_depth": 10}) # 0.86261 #clf = create_clf("ext", {"max_depth": 20}) # 0.862636 #clf = create_clf("ext", {"max_depth": 30}) # 0.860944 #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 10}) # 0.862610 #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20}) # 0.862564 clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 100}) # 0.861795 #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 2000}) # 0.862695 #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 30, "n_estimators": 2000}) # 0.860 auc = cv_loop(X, y, clf, 5) print 'AUC (all): %f' % auc elif task == 'rfc': print 'Try RFC ..' #clf = create_clf('rfc', {'max_depth': 5}) # 0.859583 #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10}) # 0.863369 #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 200}) # 0.863285 # clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 100}) # 0.863207 #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "max_features": None, "n_estimators": 200}) # 0.863341 clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 40, "max_features": None, "n_estimators": 10000, "min_samples_split": 100}) # 0.863291 auc = cv_loop(X, y, clf, 5) print 'AUC (all): %f' % auc elif task == 'knn': clf = create_clf('knn', {"n_neighbors": 800, "p": 2, "scaler": scaler}) auc = cv_loop(X, y, clf, 5) print 'AUC (all): %f' % auc elif task == "gbt": paras = json.load(open('paras/gbt.json', 'r')) clf = create_clf("gbt", paras) clf = clf.fit(X, y) preds = clf.predict_proba(X_test)[:,1] write_submission(tt_ids, preds, "gbt_submission.csv") elif task == "xgb": #clf = create_clf('xgb', {"max_depth": 2, "n_estimators": 1500, 
"learning_rate": 0.03}) # 0.860279 #clf = create_clf('xgb', {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03}) # public: 0.8891443712867697; clf = create_clf('xgb', {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03}) # public: auc = cv_loop(X, y, clf, 5) print "AUC (all): %f" % auc #sys.exit() clf = clf.fit(X, y) preds = clf.predict_proba(X_test)[:,1] write_submission(tt_ids, preds, 'xgb_new_submission.csv') elif task == "deep": clf = create_clf('deep', {"neuro_num": 512, "nb_epoch": 20, "scaler": scaler, "optimizer": "adadelta"}) auc = cv_loop(X, y, clf, 5) print 'AUC (all): %f' % auc sys.exit(0) clf = clf.fit(X, y) preds = clf.predict_proba(X_test)[:,1] write_submission(tt_ids, preds, 'deep_submission.csv') elif task == 'semi': clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 100}) # 0.861795 train_semi(X, y, X_test, clf, 5) elif task == 'gbc': from sklearn.ensemble import GradientBoostingClassifier clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1, min_samples_split=50, min_samples_leaf=50, max_depth=10, subsample=0.6, max_features='log2', verbose=1) auc = cv_loop(X, y, clf, 5) print 'AUC (all): %f' % auc elif task == 'label': from sklearn.semi_supervised import LabelPropagation from sklearn.semi_supervised import LabelSpreading label_prop_model = LabelSpreading() all_X = np.vstack((X, X_test)) tm, tn = X_test.shape unlabeles = [-1] * tm ys = [list(y)] ys.append(unlabeles) labels = np.concatenate(ys) print 'ALL shape=%s' % (str(all_X.shape)) print 'ALL y shape=%s' % (str(labels)) label_prop_model.fit(all_X, labels)
4. 6. 6. 4. 2. 3. 1. 6. 6. 1. 3. 1. 1. 1. 6. 6. 5. -1. 1. 1. -1. 1. 6. -1. 3. 6. 1. 4. 4. 6. 4. 6. 4. 1. -1. 6. 1. 2. 6. 4. -1. 2. 6. 2. -1. -1. 6. 4. -1. 1. 6. 4. 4. 6. 6. 6. -1. -1. 1. 3. -1. 6. 2. -1. 1. 4. -1. 6. 1. 4. 3. 3. 4. 1. 6. -1. 4. 4. 1. 1. 6. 6. -1. 4. 4. 4. 3. 2. 6. -1. 1. 6. 4. 4. 4. 5. 6. -1. -1. 5. 2. 6. 1. 6. 3. 2. 6. 3. 3. 1. 2. 5. 2. -1. -1. 1. 6. 6. -1. 6. 6. 6. 4. 6. -1. 2. 3. 2. 5. 4. 4. 6. 4. -1. 4. 2. 6. 1. 1. 2. -1. 5. 2. 4. 3. -1. 6. 2. 5. 2. 2. 5. 5. 4. 2. 1. -1. 1.] (500, 100) (500,) """ from sklearn.semi_supervised import LabelSpreading label_propagation_model = LabelSpreading() label_propagation_model.fit(X, y) # make predictions for first twenty samples (some will be known, some unknown) for i in range(20): print 'y: ', y[i], '\t', 'y_hat: ', label_propagation_model.predict(X[i].reshape(1,-1)) """ y: 6.0 y_hat: [6.] y: 6.0 y_hat: [6.] y: 2.0 y_hat: [2.] y: 1.0 y_hat: [1.] y: -1.0 y_hat: [6.] * y: 2.0 y_hat: [2.] y: 6.0 y_hat: [6.] y: 4.0 y_hat: [4.] y: 3.0 y_hat: [3.] y: 5.0 y_hat: [5.]
# Python 2 script fragment: augment the labeled training set with (initially
# unlabeled) test rows, spread labels onto them, then train an ExtraTrees
# classifier on the propagated labels. Relies on `train`, `test`,
# `train_label`, `se`, and `errFn` defined elsewhere in the file.
rate = 1
train_comb = train
# NOTE(review): the original comment said "select 10% of test data", but with
# rate = 1 this samples test.shape[0]/1 == ALL test rows (Py2 integer
# division) — confirm the intended sampling rate.
selected_test = test.sample(test.shape[0]/rate,replace=False,random_state=20422438)
train_comb = train_comb.append(selected_test)
train_comb_label = train_label
# Sentinel -1 marks the appended test rows as unlabeled for LabelSpreading.
train_comb_unlabeled = pd.DataFrame(np.array([-1]*(test.shape[0]/rate)))
train_comb_label = np.array(train_comb_label.append(train_comb_unlabeled))
# Flatten to the 1-D label vector sklearn expects.
train_comb_label = train_comb_label.reshape(len(train_comb_label))
a_level = 1
# alpha=1: transduced labels fully replace the initial labels while spreading.
label_prop_model = LabelSpreading(kernel="knn",alpha=a_level)
label_prop_model.fit(train_comb, train_comb_label)
pred_y = label_prop_model.transduction_
# Restore the known training labels; keep spread labels only for test rows.
pred_y[:train.shape[0]] = train_label
X_train, X_test, y_train, y_test = train_test_split(label_prop_model.X_, pred_y, test_size=0.10, random_state=20422438)
model_erf = se.ExtraTreesClassifier(random_state=20422438,n_jobs=-1,n_estimators=1000)
model_erf.fit(X_train,y_train)
model_erf_pred = model_erf.predict(X_test)
# errFn is a project-local error metric — presumably misclassification rate;
# verify against its definition.
model_erf_error = errFn(model_erf_pred,y_test)
print a_level
# Scikit-learns LabelSpreading method for semi-supervised learning import numpy as np from sklearn import datasets from sklearn.semi_supervised import LabelSpreading label_propagation_model = LabelSpreading() iris = datasets.load_iris() ''' Unlabeled entries in y It is important to assign an identifier to unlabeled points along with the labeled data when training the model with the fit method. The identifier that this implementation uses is the integer value -1. ''' # generate boolean matrix where less than 30% are 'True' rand_numgen = np.random.RandomState(42) random_unlabeled_points = rand_numgen.rand(len(iris['target'])) < 0.3 # create the unlabelled data in the labels (setting to -1) full_labels = iris['target'] cutdown_labels = np.copy(iris['target']) cutdown_labels[random_unlabeled_points] = -1 print full_labels ''' [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2