def source_to_target_label_prop(self, train_feat_space='embeds',
                                kernel_param=None):
    if kernel_param is None:  # avoid a mutable default argument
        kernel_param = {'type': 'rbf', 'gamma': 20}
    print('-----------------------------------------------------------------------')
    print('Propagating labels from source to target in {0} space'.format(train_feat_space))
    if train_feat_space == 'encoded':
        if not hasattr(self, 'source_encoded_reps'):
            self.dim_red_autoencode()
        concat_embs = np.concatenate(
            (self.source_encoded_reps, self.target_encoded_reps))
    elif train_feat_space == 'embeds':
        concat_embs = np.concatenate(
            (self.source_embds_vec, self.target_embds_vec))
    elif train_feat_space == 'embeds_tsne':
        if self.tsne_computed == 0:
            self.compute_tsne()
        feat_cols = []
        for idx in range(self.n_tsne_components):
            feat_cols.append('embeds_tsne_' + str(idx))
        # .as_matrix() was removed from pandas; .values is the modern equivalent
        source_data_feats = self.source_data[feat_cols].values
        target_data_feats = self.target_data[feat_cols].values
        concat_embs = np.concatenate((source_data_feats, target_data_feats))
    else:
        raise NotImplementedError  # NotImplemented is a value, not an exception
    # unlabeled target points are marked with -1, the scikit-learn convention
    unknown_labels = np.ones_like(self.target_labels) * -1
    label_prop_train_labels = np.concatenate((self.source_labels, unknown_labels))
    # pass the kernel parameters through instead of silently ignoring them
    lp_model = LabelSpreading(kernel=kernel_param['type'], gamma=kernel_param['gamma'])
    lp_model.fit(concat_embs, label_prop_train_labels)
    transduction_labels = lp_model.transduction_
    label_distributions = lp_model.label_distributions_
    print(label_distributions[0:10, :])
    self.source_data[train_feat_space + 'Space_prop_pred'] = \
        transduction_labels[:self.n_source]
    self.target_data[train_feat_space + 'Space_prop_pred'] = \
        transduction_labels[self.n_source:]
    # self.source_data[train_feat_space+'label_prop_groups'] = label_distributions[:self.n_source]
    # self.target_data[train_feat_space + 'label_prop_groups'] = label_distributions[self.n_source:]
    # self.embds_space_grouping.append(train_feat_space + 'label_prop_groups')
    # self.embds_space_classifiers.append(train_feat_space+'Space_prop_pred')
    if self.inter_save:
        print('Saving propagated labels')
        self.save_perforamance(self.serving_dir, suffix=self.save_suffix)
    # the original called .format() on print()'s return value (None); fixed
    print('Completed source to target label propagation in {0} space'.format(train_feat_space))
    print('-----------------------------------------------------------------------')
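# For reference, the core pattern above reduces to the following self-contained
# sketch on synthetic data (all names below are illustrative, not part of the
# class above): concatenate source and target features, mark the target rows
# with -1, fit, and read back the transduced target labels.
import numpy as np
from sklearn.semi_supervised import LabelSpreading

rng = np.random.RandomState(0)
source_X = rng.normal(0.0, 1.0, size=(100, 8))   # labeled source embeddings
target_X = rng.normal(0.5, 1.0, size=(40, 8))    # unlabeled target embeddings
source_y = rng.randint(0, 3, size=100)           # known source labels

X = np.concatenate((source_X, target_X))
y = np.concatenate((source_y, -np.ones(len(target_X), dtype=int)))

lp = LabelSpreading(kernel='rbf', gamma=20)
lp.fit(X, y)
target_pred = lp.transduction_[len(source_X):]   # propagated target labels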
def test_LabelSpreading_rbf(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))  # one distinct color per curve
    # train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelSpreading(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # configure the figure
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading rbf kernel")
    plt.show()
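# The test_LabelSpreading_* helpers in this file all unpack (x, y,
# unlabeled_indices) from *data, but the generator is not shown. A minimal
# sketch of one, assuming the digits dataset (the helper name and setup are
# illustrative, not from the original code):
import numpy as np
from sklearn import datasets

def load_semi_supervised_digits(unlabeled_fraction=0.7, seed=0):
    digits = datasets.load_digits()
    rng = np.random.RandomState(seed)
    x, y = digits.data, digits.target
    n_unlabeled = int(unlabeled_fraction * len(y))
    # indices of the samples whose labels will be hidden with -1
    unlabeled_indices = rng.choice(len(y), size=n_unlabeled, replace=False)
    return x, y, unlabeled_indices

# e.g. test_LabelSpreading_rbf(*load_semi_supervised_digits())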
def label_spread(self, X_train, y_train, gamma=20, max_iter=30):
    """
    Train a LabelSpreading model from scikit-learn.

    Parameters
    ----------
    X_train: Scaled training data
    y_train: Class labels, with -1 for unlabeled samples
    gamma: Parameter for the rbf kernel (default matches scikit-learn's 20;
        the original default of None would fail inside the rbf kernel)
    max_iter: Maximum number of iterations allowed (scikit-learn's default is 30)

    Returns
    -------
    Predicted labels and class probabilities
    """
    # Label spreading model
    model = LabelSpreading(kernel='rbf', gamma=gamma, max_iter=max_iter, n_jobs=-1)
    # Fit the training set
    model.fit(X_train, y_train)
    # Predict the labels of the unlabeled data points
    predicted_labels = model.transduction_
    # Predict probability
    predicted_proba = model.predict_proba(X_train)
    return predicted_labels, predicted_proba
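# A minimal sketch of how label_spread might be driven, assuming features have
# already been scaled and -1 marks unlabeled rows (the instance name `clf_obj`
# and the data here are illustrative):
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, random_state=0)
X = StandardScaler().fit_transform(X)
y_train = np.copy(y)
y_train[50:] = -1  # pretend the last 150 samples are unlabeled

labels, proba = clf_obj.label_spread(X, y_train, gamma=20, max_iter=30)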
def semiLabelSpreading(feature_extractor, generator, val_generator, kernel,
                       neighbors, gamma, alpha):
    semi = LabelSpreading(kernel=kernel, n_neighbors=neighbors, gamma=gamma,
                          alpha=alpha, tol=0.001, max_iter=1000000)
    features = feature_extractor.predict_generator(
        generator,
        steps=int(np.ceil(generator.samples / generator.batch_size)),  # steps must be an int
        verbose=1)
    classes = generator.classes
    # filenames starting with 'N' are the unlabeled samples
    for i in range(0, generator.samples):
        if generator.filenames[i][0] == 'N':
            classes[i] = -1
    semi.fit(features, classes)
    val_features = feature_extractor.predict_generator(
        val_generator,
        steps=int(np.ceil(val_generator.samples / val_generator.batch_size)),
        verbose=1)
    predicted_classes = semi.predict(val_features)
    return predicted_classes
class ModelLabelSpreading:
    def __init__(self):
        np.random.seed(1102)
        # n_neighbors is ignored with the rbf kernel; kept for easy switching
        self.model = LabelSpreading(
            kernel="rbf",
            n_jobs=int(np.max([multiprocessing.cpu_count() - 2, 1])),
            alpha=0.2,
            n_neighbors=10,
            max_iter=15)
        self.name = "LABEL-SPREADING"
        self.scaler = MinMaxScaler()

    def fit(self, X, y, Xu=None):
        np.random.seed(1102)
        self.Xl = X
        self.yl = y
        # self.Xu = Xu

    def predict(self, X):
        np.random.seed(1102)
        self.Xt = X
        X = self.scaler.fit_transform(np.vstack((self.Xl, self.Xt)))
        y = np.append(self.yl, np.repeat(-1, self.Xt.shape[0]))
        # y = np.append(y, np.repeat(-1, self.Xt.shape[0]))
        y = np.int64(y)
        assert X.shape[0] == len(y)
        self.model.fit(X, y)
        return np.array(self.model.label_distributions_)[(-self.Xt.shape[0]):, :]
def label(filenames, train_path='../data/train_molecules_30.mat'):
    """
    Label data with the provided filenames.

    :param filenames: List of filenames containing data to label.
    :return: Newly labeled and conglomerate datasets
    """
    unlabeled = [scipy.io.loadmat(fname) for fname in filenames]
    unlabeled_X = np.vstack([data['X'] for data in unlabeled])
    X, Y = load_data(train_path, shape=(-1, 30, 30, 30))

    num_unlabeled = unlabeled_X.shape[0]
    unlabeled_Y = np.zeros(num_unlabeled) - 1  # -1 marks unlabeled samples
    unlabeled_Y = unlabeled_Y.reshape((-1, 1))
    Y = Y.reshape((-1, 1))
    Y_all = np.vstack((Y, unlabeled_Y))
    X_all = np.vstack((X, unlabeled_X))
    X_all = X_all.reshape((-1, 27000))

    label_prop_model = LabelSpreading()
    label_prop_model.fit(X_all, Y_all.ravel())  # scikit-learn expects 1-d labels
    Y_all = label_prop_model.transduction_
    # the unlabeled rows were stacked last, so take the final num_unlabeled
    # entries; the original sliced [num_unlabeled:], which is only correct
    # when the labeled and unlabeled sets happen to be the same size
    unlabeled_Y = Y_all[-num_unlabeled:]
    return (unlabeled_X, unlabeled_Y), (X_all, Y_all)
def predict_ssl(self, x_sup, y_sup, x_unsup, y_unsup, x_test, y_test):
    ls_model = LabelSpreading(kernel='knn', n_neighbors=5)
    indices = np.arange(self.train_size)
    unlabeled_indices = indices[x_sup.shape[0]:]
    y_sup_unsup = np.concatenate([y_sup, y_unsup])
    y_sup_unsup_train = np.copy(y_sup_unsup)
    y_sup_unsup_train[unlabeled_indices] = -1
    x_fit = np.concatenate([x_sup, x_unsup], axis=0)
    h_fit = self.model_e.predict(x_fit)
    h_fit = np.reshape(h_fit, (h_fit.shape[0], h_fit.shape[1] * h_fit.shape[2]))
    ls_model.fit(h_fit, y_sup_unsup_train)
    y_unsup_pred = ls_model.transduction_[unlabeled_indices]
    # print("LabelSpread Accuracy is ", accuracy_score(y_unsup, y_unsup_pred))
    h_test = self.model_e.predict(x_test)
    h_test = np.reshape(h_test, (h_test.shape[0], h_test.shape[1] * h_test.shape[2]))
    # SVM trained on the transduced labels
    clf_svc = svm.SVC(kernel='linear')
    y_fit_true = ls_model.transduction_
    clf_svc.fit(h_fit, y_fit_true)
    acc_svm = accuracy_score(y_test, clf_svc.predict(h_test))
    clf_svc = svm.LinearSVC()
    clf_svc.fit(h_fit, y_fit_true)
    acc_svm_linear = accuracy_score(y_test, clf_svc.predict(h_test))
    print('acc_svm is ', max(acc_svm, acc_svm_linear))
def testLabelPropagation():
    from sklearn.semi_supervised import LabelSpreading
    from sklearn import preprocessing

    label_enc = preprocessing.LabelEncoder()
    label_prop_model = LabelSpreading(kernel='knn')
    train_iter = getDocumentIterator1("published = 0 and is_test = 1")
    validation_iter = getDocumentIterator1("published = 1 and is_test = 1")
    semantic_model = gensim_tests.SemanticModel.load('gensim/full_corpus_300000')

    all_profiles, labels = [], []
    propagation_labels = []
    for doc in train_iter:
        all_profiles.append(semantic_model.inferProfile(doc.rawtext))
        labels.append(doc.learned_category[0])
        propagation_labels.append(doc.learned_category[0])
    label_enc.fit(propagation_labels)
    propagation_labels = label_enc.transform(propagation_labels).tolist()

    for doc in validation_iter:
        all_profiles.append(semantic_model.inferProfile(doc.rawtext))
        labels.append(doc.learned_category[0])
        propagation_labels.append(-1)  # validation docs are treated as unlabeled

    print(propagation_labels)
    print("Fitting")
    label_prop_model.fit(all_profiles, propagation_labels)
    output_labels = label_prop_model.transduction_
    for propagated, orig in zip(label_enc.inverse_transform(output_labels), labels):
        print(propagated, orig)
def test_LabelSpreading_knn(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
              (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
              (0.5, 0.3, 0.2))  # one distinct color per curve
    # train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelSpreading(max_iter=100, n_neighbors=K, alpha=alpha, kernel='knn')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # configure the figure
    ax.set_xlabel(r"k")
    ax.set_ylabel("score")
    ax.legend(loc='best')
    ax.set_title("LabelSpreading knn kernel")
    plt.show()
def soft_clamping(kernel, xTrain, yTrain, MI=10000, k=3, g=0.6, a=0.1):
    # alpha < 1 lets the model revise the given labels ("soft clamping")
    spread = LabelSpreading(kernel=kernel, n_neighbors=k, gamma=g, alpha=a,
                            max_iter=MI, n_jobs=-1)
    spread.fit(xTrain, yTrain)
    predY = spread.predict_proba(xTrain)
    norm_Y = normalize(yTrain, predY)
    labels = []
    for i in norm_Y:
        if i[0] > i[1]:
            labels.append(benign)
        elif i[0] < i[1]:
            labels.append(malware)
    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, labels, yExpect, day_one)
    results = ['SC', kernel, k, g, a, lm_to_b, lb_to_m, tp, tn, fp, fn,
               pred_day1, missed_day1]
    file_name = 'SC_CMN_5per_' + str(rate) + '.csv'
    write_csv(file_name, results)
def label_spreading(X_train, y_train, Xunlabelled, X_test, y_test):
    # pca = randomized_PCA(X_train)
    # X_train, X_test, y_train, y_test = cross_validation.train_test_split(tr_images, tr_labels, test_size=0.3)
    # X = pca.transform(X)
    # val_images = pca.transform(val_images)
    X_train = X_train[:, :]
    y_train = y_train[:]
    Xunlabelled = Xunlabelled[:10000, :]  # cap the unlabeled pool at 10k rows
    X_both = np.vstack((X_train, Xunlabelled))
    y_both = np.append(y_train, -np.ones((Xunlabelled.shape[0],)))
    label_prop_model = LabelSpreading(max_iter=100)
    # random_unlabeled_points = np.where(np.random.random_integers(0, 1, size=len(y_train)))
    # labels = np.copy(y_train)
    # labels[random_unlabeled_points] = -1
    label_prop_model.fit(np.copy(X_both), np.copy(y_both))
    y_pred = label_prop_model.predict(np.copy(X_both))
    print(y_pred)
class LabelSpreadingModel(SupervisedW2VModel):
    def fit_with_test(self, test_data):
        xs, ys = [], []
        self.ans_mapping = []
        for ans, cvs in self.context_vectors.items():
            xs.extend(cvs)
            if ans not in self.ans_mapping:
                y = len(self.ans_mapping)
                self.ans_mapping.append(ans)
            else:
                y = self.ans_mapping.index(ans)
            ys.extend(y for _ in cvs)
        for ctx in test_data:
            xs.append(self.cv(ctx))
            ys.append(-1)  # unlabeled
        self.ls_clf = LabelSpreading(kernel='knn', n_neighbors=11)
        self.ls_clf.fit(xs, ys)

    def __call__(self, x, ans=None, with_confidence=False):
        v = self.cv(x)
        probs = self.ls_clf.predict_proba([v])[0]
        pred = probs.argmax()
        m_ans = self.ans_mapping[pred]
        # TODO - get confidence as difference between probs[pred] and next
        return (m_ans, 0.0) if with_confidence else m_ans
def doLabelSpreading(self, X, y, **kwargs):
    label_spread_model = LabelSpreading(**kwargs)
    if self.verbose > 2:
        print("X, y shapes: ", X.shape, y.shape)
        print(" y hist: ", np.histogram(y))
    label_spread_model.fit(X, y)
    if self.verbose > 2:
        print("ls_predict:", np.histogram(label_spread_model.predict(X)))
    return label_spread_model.predict_proba(X)
def test_LabelSpreading(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    clf = LabelSpreading(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(x, y_train)
    predicted_labels = clf.transduction_[unlabeled_indices]
    true_labels = y[unlabeled_indices]
    print("Accuracy: %f" % metrics.accuracy_score(true_labels, predicted_labels))
def semi_supervised():
    features, labels = separate_cols_with_unknown(gtd)
    features = process_nontext(features)
    features = convertDType(features)
    # model = LabelPropagation(kernel="knn")  # constructed but never used
    model2 = LabelSpreading(kernel="knn")
    model2.fit(features, labels)
    preds = cross_val_predict(model2, features, labels, cv=5)
    print('5 fold cross val accuracy of model: %0.2f ' % accuracy_score(labels, preds))
def LabelSpreadingWrapper(X_train, y_train, X_test):
    clf = LabelSpreading(kernel='knn', n_neighbors=10, n_jobs=-1,
                         max_iter=1000, alpha=0.1)
    newlabels = np.concatenate((np.array(y_train), -np.ones(len(X_test))))
    clf.fit(np.concatenate((X_train, X_test)), newlabels)
    return clf.transduction_[-len(X_test):]
def test_LabelSpreading(*data):
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    # gamma is ignored by the knn kernel, so it is not passed here
    clf = LabelSpreading(max_iter=1000, kernel='knn')
    clf.fit(X, y_train)
    true_labels = y[unlabeled_indices]
    predicted_labels = clf.transduction_[unlabeled_indices]
    print('Accuracy : %f' % clf.score(X[unlabeled_indices], true_labels))
    print('Accuracy : %f' % metrics.accuracy_score(true_labels, predicted_labels))
def label_spreading(self, X_train, y, X_test):
    clf = LabelSpreading()
    X = np.concatenate((X_train.todense(), X_test.todense()), axis=0)
    # X now contains the test rows as well, so y must be padded with -1
    # (unlabeled) entries to match: fit() requires len(y) == len(X).
    # (Skip this padding if the caller already passes y covering both sets.)
    y = np.concatenate((y, -np.ones(X_test.shape[0])))
    print("X shape now ", X.shape)
    print("Y shape now ", y.shape)
    clf.fit(X, y)
    final_labels = clf.predict(X_test)
    label_prob = clf.predict_proba(X_test)
    print(compare_labels_probabilities().compare(label_prob, final_labels))
    return final_labels, clf
def knn(X, labels):
    # Learn with LabelSpreading
    label_spread = LabelSpreading(kernel='knn', alpha=0.6, max_iter=100)
    label_spread.fit(X, labels)
    # Return the transduced output labels
    output_labels = label_spread.transduction_
    return output_labels
def propagate_labels(features, labels):
    label_prop_model = LabelSpreading(kernel=construct_graph, n_jobs=-1)
    label_prop_model.fit(features, labels)
    logger.debug(label_prop_model.classes_)
    # preds = label_prop_model.predict(features)
    preds = label_prop_model.predict_proba(features)
    return preds
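# construct_graph above is an external helper. scikit-learn documents that the
# kernel argument may be a callable taking two arrays and returning an affinity
# matrix of shape (len(X), len(Y)); a plausible sketch of such a callable
# (an assumption, not the original implementation):
from sklearn.metrics.pairwise import rbf_kernel

def construct_graph_sketch(X, Y):
    # any symmetric, non-negative affinity works; here a plain RBF kernel
    return rbf_kernel(X, Y, gamma=20)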
def apply_notl(trainX, trainY, testX, testY, window, source_pos, target_pos):
    ########################
    ### SEMI-SUPERVISED ####
    ########################
    # Label Propagation
    label_prop_model = LabelPropagation(kernel='knn')
    label_prop_model.fit(trainX, trainY)
    Y_Pred = label_prop_model.predict(testX)
    acc_ss_propagation, acc_ss_propagation_INFO = check_accuracy(testY, Y_Pred)

    # Label Spreading
    label_prop_models_spr = LabelSpreading(kernel='knn')
    label_prop_models_spr.fit(trainX, trainY)
    Y_Pred = label_prop_models_spr.predict(testX)
    acc_ss_spreading, acc_ss_spreading_INFO = check_accuracy(testY, Y_Pred)

    ########################
    #### WITHOUT TL ########
    ########################
    # LogisticRegression
    modelLR = LogisticRegression()
    modelLR.fit(trainX, trainY)
    predLR = modelLR.predict(testX)
    accLR, acc_LR_INFO = check_accuracy(testY, predLR)

    # DecisionTreeClassifier
    modelDT = tree.DecisionTreeClassifier()
    modelDT.fit(trainX, trainY)
    predDT = modelDT.predict(testX)
    accDT, acc_DT_INFO = check_accuracy(testY, predDT)

    # BernoulliNB
    modelNB = BernoulliNB()
    modelNB.fit(trainX, trainY)
    predND = modelNB.predict(testX)
    accNB, acc_NB_INFO = check_accuracy(testY, predND)

    return pd.DataFrame([{
        'window': window,
        'source_position': source_pos,
        'target_position': target_pos,
        'acc_SS_propagation': acc_ss_propagation,
        'acc_SS_propagation_INFO': acc_ss_propagation_INFO,
        'acc_SS_spreading': acc_ss_spreading,
        'acc_SS_spreading_INFO': acc_ss_spreading_INFO,
        'acc_LR': accLR,
        'acc_LR_INFO': str(acc_LR_INFO),
        'acc_DT': accDT,
        'acc_DT_INFO': str(acc_DT_INFO),
        'acc_NB': accNB,
        'acc_NB_INFO': str(acc_NB_INFO)
    }])
def _semi_supervised_learning(self, data_matrix, target):
    n_classes = len(set(target))
    # If there are too few distinct values (anything less than -1 plus at
    # least two real classes), bail out and return the original target:
    # one cannot meaningfully spread the information of only one class.
    if n_classes > 2:
        semi_supervised_estimator = LabelSpreading(kernel='knn',
                                                   n_neighbors=self.n_neighbors)
        semi_supervised_estimator.fit(data_matrix, target)
        predicted_target = semi_supervised_estimator.predict(data_matrix)
        predicted_target = self._clamp(target, predicted_target)
        return predicted_target
    else:
        return target
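# _clamp is defined elsewhere in the class. A plausible sketch of its contract,
# assuming it restores the given labels and keeps predictions only for the
# unlabeled (-1) rows (hypothetical helper, not the original code):
import numpy as np

def _clamp_sketch(target, predicted_target):
    target = np.asarray(target)
    predicted = np.asarray(predicted_target).copy()
    known = target != -1
    predicted[known] = target[known]  # never overwrite a provided label
    return predicted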
def LabelPropagation(support, support_ys, query):
    # NB: this function name shadows sklearn.semi_supervised.LabelPropagation,
    # even though it actually uses LabelSpreading internally.
    alpha = 0.3
    k_neighbours = 38
    all_embeddings = np.concatenate((support, query), axis=0)
    # X = all_embeddings.cpu().detach().numpy()
    labels = np.full(all_embeddings.shape[0], -1.)
    labels[:support.shape[0]] = support_ys
    label_propagation = LabelSpreading(kernel='knn', alpha=alpha,
                                       n_neighbors=k_neighbours, tol=0.000001)
    label_propagation.fit(all_embeddings, labels)
    predicted_labels = label_propagation.transduction_
    query_prop = predicted_labels[support.shape[0]:]
    return query_prop
class SemiSupervised(BaselineModel):
    """
    LabelSpreading Implementation
    """

    def fit(self):
        # Need to concatenate labeled and unlabeled data;
        # unlabeled data labels are set to -1
        X = np.concatenate(
            (self.val_primitive_matrix, self.train_primitive_matrix))
        val_labels = (self.val_ground + 1) / 2.
        train_labels = -1. * np.ones(np.shape(self.train_primitive_matrix)[0])
        y = np.concatenate((val_labels, train_labels))

        self.model = LabelSpreading(kernel='knn')
        self.model.fit(X, y)
def computeSimilarities2(vect, matrix_values, numLine, numRan=10):
    """
    Build the model with the semi-supervised approach LabelSpreading.

    Args:
        vect: the answer vector (value 0 for false links and 1 for true links)
        matrix_values: descriptor matrix, i.e. all the probabilities of all IR models
        numLine: number of pairs of artefacts
        numRan: number of random subsamples to aggregate over

    Returns:
        preds: probability that a pair of artefacts is linked
    """
    allPrediction = []
    model = LabelSpreading()
    # compute multiple (numRan) random subsamples of matrix_values
    for i in range(0, numRan):
        subVect, subMatrix_values = computeRandom(vect, matrix_values, numLine)
        # fit the prediction function on each random subsample
        computeModel = model.fit(subMatrix_values, subVect)
        print("new predicted function computed")
        # predict every pair of artefacts with this subsample's model
        preds0 = computeModel.predict_proba(matrix_values)
        allPrediction.append(preds0[:, 1])
    # aggregate by majority vote ("vote majoritaire")
    preds = vote(allPrediction, len(vect), numRan)
    print(preds)
    return preds
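# computeRandom() and vote() are external helpers. A plausible sketch of the
# majority-vote aggregation (an assumption about the intended behavior, not
# the original code):
import numpy as np

def vote_sketch(all_predictions, n_pairs, n_runs):
    # all_predictions: n_runs arrays of per-pair link probabilities
    stacked = np.vstack(all_predictions)   # shape (n_runs, n_pairs)
    votes = stacked > 0.5                  # each run votes link / no link
    return votes.mean(axis=0)              # fraction of runs voting "link"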
def semi_supervised_learning(data_matrix, target):
    if -1 in list(target):
        # if -1 is present in target, do label spreading
        from sklearn.semi_supervised import LabelSpreading
        label_prop_model = LabelSpreading(kernel='knn', n_neighbors=5)
        label_prop_model.fit(data_matrix, target)
        pred_target = label_prop_model.predict(data_matrix)
        # clamp: keep the given label wherever one was provided
        extended_target = []
        for pred_label, label in zip(pred_target, target):
            if label != -1 and pred_label != label:
                extended_target.append(label)
            else:
                extended_target.append(pred_label)
    else:
        extended_target = target
    return np.array(extended_target)
def test_label_spreading_algorithms():
    """
    Compare scikit-learn's algorithm with our algorithm.
    """
    x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
    # scikit-learn takes different input than our algorithm
    y_sklearn = np.array([1, 2, -1, -1])
    y_custom = np.array([[1, 0], [0, 1], [0, 0], [0, 0]])

    # scikit-learn's algorithm
    alpha = 0.2
    max_iter = 30
    tol = 1e-3
    label_spreading = LabelSpreadingSKLearn(kernel="rbf", max_iter=max_iter,
                                            alpha=alpha, tol=tol)
    model = label_spreading.fit(x, y_sklearn)
    expected = model.predict(x)

    # our algorithm
    w = distance_matrix(x, measure=rbf_distance)
    ls = LabelSpreadingCustom(alpha=alpha, max_iter=max_iter, tol=tol)
    ls = ls.fit(w, y_custom)
    actual = ls.predict(y_custom)
    actual = np.array(actual) + 1  # class indices are 0-based; labels start at 1

    assert_array_equal(actual, expected)
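# For context, the custom implementation under test presumably follows the
# label spreading iteration of Zhou et al. (2004): F <- alpha * S @ F +
# (1 - alpha) * Y, with S the symmetrically normalized affinity matrix.
# A minimal sketch under that assumption:
import numpy as np

def label_spreading_iterate(W, Y, alpha=0.2, max_iter=30, tol=1e-3):
    # W: (n, n) affinity matrix; Y: (n, n_classes) one-hot labels,
    # with all-zero rows for unlabeled samples
    d = W.sum(axis=1)
    D_inv_sqrt = np.diag(1.0 / np.sqrt(np.maximum(d, 1e-12)))
    S = D_inv_sqrt @ W @ D_inv_sqrt
    F = Y.astype(float).copy()
    for _ in range(max_iter):
        F_next = alpha * (S @ F) + (1 - alpha) * Y
        if np.abs(F_next - F).sum() < tol:
            F = F_next
            break
        F = F_next
    return F.argmax(axis=1)  # predicted class index per sample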
def augment_instances(self, X_train, y_train):
    if self.args.num_unlabeled == 0:
        return X_train, y_train

    X_unlabeled = self.dataset.X_train_unlabeled
    y_unlabeled = self.dataset.y_train_unlabeled
    X_unlabeled = X_unlabeled.values
    y_unlabeled = y_unlabeled.values

    X_train_text = X_train[:, self.args.text_col]
    self.fit_text(X_train_text, y_train)
    X_train_rep = self.transform_text(X_train_text)
    X_train_rep = self.augment_features(X_train_rep, X_train)

    chunk_size = 1000
    num_instances = X_unlabeled.shape[0]
    num_cols = y_train.shape[1]
    for row in tqdm(range(0, self.args.num_unlabeled, chunk_size),
                    desc='spreading labels in rows',
                    total=int(self.args.num_unlabeled / chunk_size)):
        end_row = row + chunk_size
        end_row = np.minimum(end_row, num_instances)
        for col in tqdm(range(num_cols), desc='spreading labels in cols',
                        leave=False):
            X_unlabeled_rep = self.transform_text(
                X_unlabeled[row:end_row, self.args.text_col])
            X_unlabeled_rep = self.augment_features(
                X_unlabeled_rep, X_unlabeled[row:end_row, :])
            X_spread = np.append(X_train_rep, X_unlabeled_rep, axis=0)
            y_spread = np.append(y_train[:, col],
                                 y_unlabeled[row:end_row, col], axis=0)
            labeling = LabelSpreading()
            labeling.fit(X_spread, y_spread)
            y_unlabeled[row:end_row, col] = labeling.predict(X_unlabeled_rep)

    X_train = np.append(X_train, X_unlabeled[:row + chunk_size], axis=0)
    y_train = np.append(y_train, y_unlabeled[:row + chunk_size], axis=0)
    return X_train, y_train
def runLabelSpreading(data, assignment):
    lp_model = LabelSpreading(kernel='knn', n_neighbors=10)
    labels = [-1] * len(data)
    for x, y in assignment:
        labels[x - 1] = y
    labels = np.array(labels)
    lp_model.fit(data, labels)
    pred = lp_model.transduction_
    result = []
    d = {}
    for i in range(6000, len(pred)):
        c = d.setdefault(int(pred[i]), 0)
        d[int(pred[i])] = c + 1
        result.append([i + 1, int(pred[i])])
    print(d)
    return result
class LabelSpreadingImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
def objective(self, x):
    """
    Objective function for hyper-parameter selection/evaluation.

    Parameters
    ----------
    x : hyper-parameter under test - gamma

    Returns
    -------
    float
        a measure directly proportional to entropy
    """
    model = LabelSpreading(kernel=self.kernel, alpha=self.alpha, gamma=x)
    model.fit(self.x, self.y)
    label_prob = model.label_distributions_
    # average label entropy plus an L2 penalty that discourages large gamma
    return get_average_label_entropy(label_prob) + self.learning_rate * x ** 2
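# A sketch of how this objective might drive gamma selection, e.g. a simple
# log-spaced grid search (`selector` stands in for an instance of the class
# above; scipy.optimize.minimize_scalar would work equally well):
import numpy as np

gammas = np.logspace(-2, 2, num=20)
scores = [selector.objective(g) for g in gammas]
best_gamma = gammas[int(np.argmin(scores))]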
def train_model(nodes, datasets):
    y = np.array(range(len(nodes)))
    nodes = list(nodes)
    vectorizer = DictVectorizer(sparse=True)
    for i, dataset in enumerate(datasets):
        g = compute_dataset(dataset)
        nodes.extend(g.classes)
        sys.stdout.write('\r')
        sys.stdout.write(str(i + 1))
        sys.stdout.flush()
    X = vectorizer.fit_transform([dict(node.concept_vector) for node in nodes])
    # pad y with -1 (unlabeled) for the newly appended nodes; the original
    # added a list to an ndarray, which broadcasts instead of concatenating
    y = np.concatenate([y, -np.ones(len(nodes) - len(y), dtype=int)])
    model = LabelSpreading()
    model.fit(X, y)
    model.vectorizer = vectorizer
    return model
def propagation(model, uids, labeled_ids):
    X, y1, y2 = [], [], []
    pool = []
    for uid in labeled_ids:
        X.append(model.docvecs[uid])
        y1.append(1)
    for uid in uids:
        if uid not in labeled_ids:
            X.append(model.docvecs[uid])
            y2.append(-1)
    # note: newer scikit-learn requires alpha strictly inside (0, 1)
    label_prop_model = LabelSpreading(kernel='knn', alpha=1.0)
    y2 = np.array(y2)
    y2[0:(len(y1) - 1)] = 0  # mark a block of the unlabeled pool as negatives
    print(len(y1) + len(y2))
    for i in range(5):
        np.random.shuffle(y2)
        label_prop_model.fit(X, y1 + y2.tolist())
        pool.append(label_prop_model.transduction_)
    pickle.dump(pool, open('data/propagation.pick', 'wb'))
    pool = pickle.load(open('data/propagation.pick', 'rb'))
    pool = np.array(pool)
    for column in pool.T:
        print(column)
"""
(truncated output: the transduced label array, plus the data shapes
(500, 100) for X and (500,) for y)
"""
from sklearn.semi_supervised import LabelSpreading

label_propagation_model = LabelSpreading()
label_propagation_model.fit(X, y)

# make predictions for the first twenty samples (some will be known, some unknown)
for i in range(20):
    print('y: ', y[i], '\t', 'y_hat: ',
          label_propagation_model.predict(X[i].reshape(1, -1)))
"""
y:  6.0    y_hat: [6.]
y:  6.0    y_hat: [6.]
y:  2.0    y_hat: [2.]
y:  1.0    y_hat: [1.]
y: -1.0    y_hat: [6.]  *
y:  2.0    y_hat: [2.]
y:  6.0    y_hat: [6.]
y:  4.0    y_hat: [4.]
y:  3.0    y_hat: [3.]
y:  5.0    y_hat: [5.]
y:  6.0    y_hat: [6.]
"""
def main():
    usage = "usage prog [options] arg"
    parser = OptionParser(usage=usage)
    parser.add_option("-t", "--task", dest="task", help="the task name")
    parser.add_option("-o", "--output", dest="output", help="the output file")
    (options, remainder) = parser.parse_args()

    train_paths = [
        "../data/train_simple_feature.csv",
        "../data/train_plus_feature.csv",
        "../data/train_azure_plus_feature.csv",
        #"../data/train_azure_feature.csv",
        #"../data/train_module_feature.csv",
        #"../data/train_course_feature.csv",
        #"./blend_train.csv"
    ]
    label_path = "../data/truth_train.csv"
    test_paths = [
        "../data/test_simple_feature.csv",
        "../data/test_plus_feature.csv",
        "../data/test_azure_plus_feature.csv",
        #"../data/test_azure_feature.csv",
        #"../data/test_module_feature.csv",
        #"../data/test_course_feature.csv",
        #"./blend_test.csv"
    ]
    train = merge_features(train_paths, label_path)
    train = train.drop(['user_drop_ratio'], axis=1)
    #train['user_drop_ratio'] = (train['user_drop_ratio'] + 8.0 / train['user_courses']) / (1.0 + 10.0 / train['user_courses'])
    y = encode_labels(train.dropout.values)
    train = train.drop('dropout', axis=1)
    tr_ids = train.enrollment_id.values
    X = train.drop('enrollment_id', axis=1)
    m, n = X.shape
    print('train.shape=%s' % (str(X.shape)))

    test = merge_features(test_paths)
    test = test.drop(['user_drop_ratio'], axis=1)
    #test['user_drop_ratio'] = (test['user_drop_ratio'] + 8.0 / test['user_courses']) / (1.0 + 10.0 / test['user_courses'])
    tt_ids = test.enrollment_id.values
    X_test = test.drop('enrollment_id', axis=1)
    print('test.shape=%s' % (str(X_test.shape)))

    scaler = StandardScaler().fit(np.vstack((X, X_test)))

    task = options.task
    if not task:
        task = "blend"
    if task == 'blend':
        clf_list = [
            #("knn_p2_10", create_clf('knn', {"n_neighbors": 10, "p": 2})),
            #("knn_p2_10_scaler", create_clf('knn', {"n_neighbors": 10, "p": 2, "scaler": scaler})),
            #("knn_p2_100", create_clf('knn', {"n_neighbors": 100, "p": 2})),
            #("knn_p2_100_scaler", create_clf('knn', {"n_neighbors": 100, "p": 2, "scaler": scaler})),
            #("knn_p2_500", create_clf('knn', {"n_neighbors": 500, "p": 2})),
            #("knn_p2_500_scaler", create_clf('knn', {"n_neighbors": 500, "p": 2, "scaler": scaler})),
            #("knn_p2_100", create_clf('knn', {"n_neighbors": 100, "p": 2})),
            #("knn_p2_800", create_clf('knn', {"n_neighbors": 800, "p": 2})),
            #("knn_p1_10", create_clf('knn', {"n_neighbors": 10, "p": 1})),
            #("knn_p1_10_scaler", create_clf('knn', {"n_neighbors": 10, "p": 1, "scaler": scaler})),
            #("knn_p1_100", create_clf('knn', {"n_neighbors": 100, "p": 1})),
            #("knn_p1_100_scaler", create_clf('knn', {"n_neighbors": 100, "p": 1, "scaler": scaler})),
            #("knn_p1_500", create_clf('knn', {"n_neighbors": 500, "p": 1})),
            #("knn_p1_500_scaler", create_clf('knn', {"n_neighbors": 500, "p": 1, "scaler": scaler})),
            #("knn_p1_800", create_clf('knn', {"n_neighbors": 800, "p": 1})),
            #("knn_p1_800_scaler", create_clf('knn', {"n_neighbors": 800, "p": 1, "scaler": scaler})),
            #("extra_gini_10depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 10})),
            #("extra_entropy_10depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 10})),
            #("extra_gini_20depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 20})),
            #("extra_entropy_20depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 20})),
            ("extra_gini_30depth", create_clf("ext", {"criterion": "gini", "n_estimators": 200, "max_depth": 30})),
            ("extra_entropy_30depth", create_clf("ext", {"criterion": "entropy", "n_estimators": 200, "max_depth": 30})),
            #("rfc_gini_3depth", create_clf("rfc", {"criterion": "gini", "max_depth": 3, "n_estimators": 200})),
            #("rfc_entropy_3depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 3, "n_estimators": 200})),
            ("rfc_gini_5depth", create_clf("rfc", {"criterion": "gini", "max_depth": 5, "n_estimators": 200})),
            ("rfc_entropy_5depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 5, "n_estimators": 200})),
            ("rfc_gini_6depth", create_clf("rfc", {"criterion": "gini", "max_depth": 6, "n_estimators": 200})),
            ("rfc_entropy_6depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 6, "n_estimators": 200})),
            ("rfc_gini_8depth", create_clf("rfc", {"criterion": "gini", "max_depth": 8, "n_estimators": 200})),
            ("rfc_entropy_8depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 8, "n_estimators": 200})),
            ("rfc_gini_10depth", create_clf("rfc", {"criterion": "gini", "max_depth": 10, "n_estimators": 200})),
            ("rfc_entropy_10depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 200})),
            ("rfc_gini_12depth", create_clf("rfc", {"criterion": "gini", "max_depth": 12, "n_estimators": 200})),
            ("rfc_entropy_12depth", create_clf("rfc", {"criterion": "entropy", "max_depth": 12, "n_estimators": 200})),
            #("xgb_1500_2depth", create_clf("xgb", {"max_depth": 2, "n_estimators": 1500, "learning_rate": 0.03})),
            #("xgb_600_3depth", create_clf("xgb", {"max_depth": 3, "n_estimators": 600, "learning_rate": 0.03})),
            #("xgb_600_4depth", create_clf("xgb", {"max_depth": 4, "n_estimators": 600, "learning_rate": 0.03})),
            ("xgb_600_5depth", create_clf("xgb", {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03})),
            #("xgb_600_6depth", create_clf("xgb", {"max_depth": 6, "n_estimators": 600, "learning_rate": 0.02})),
            #("xgb_600_7depth", create_clf("xgb", {"max_depth": 7, "n_estimators": 600, "learning_rate": 0.01})),
            #("xgb_600_8depth", create_clf("xgb", {"max_depth": 8, "n_estimators": 600, "learning_rate": 0.01})),
            #("lgc_1c_scale", create_clf("lgc", {"C": 1.0, "scaler": scaler})),
            #("lgc_1c", create_clf("lgc", {"C": 1.0})),
            #("lgc_1c_l1", create_clf("lgc", {"C": 1.0, "penalty": "l1"})),
            #("lgc_3c_scale", create_clf("lgc", {"C": 3.0, "scaler": scaler})),
            #("lgc_3c", create_clf("lgc", {"C": 3.0})),
            ("lgc_3c_l1", create_clf("lgc", {"C": 3.0, "penalty": "l1"})),
            #("lgc_5c_scale", create_clf("lgc", {"C": 5.0, "scaler": scaler})),
            #("lgc_5c", create_clf("lgc", {"C": 5.0})),
        ]
        X = X.values
        blend_train, blend_test = train_blend(X, y, X_test, clf_list, 5)
        print('blend_train.shape=%s' % (str(blend_train.shape)))
        print('blend_test.shape=%s' % (str(blend_test.shape)))
        cols = [cname for cname, clf in clf_list]
        cols = ['enrollment_id'] + cols
        blend_train_ids = np.hstack((np.matrix(tr_ids).T, blend_train))
        blend_test_ids = np.hstack((np.matrix(tt_ids).T, blend_test))
        dump_data(blend_train_ids, cols, "new_blend_train.csv")
        dump_data(blend_test_ids, cols, "new_blend_test.csv")

        blender = create_clf('lgc', {"C": 1.0, "penalty": "l1"})
        auc = cv_loop(blend_train, y, blender)
        print('AUC (LGC blend): %f' % auc)
        # the original passed "n_estimator"; create_clf presumably expects "n_estimators"
        blender = create_clf('ext', {"max_depth": 10, "criterion": "entropy", "n_estimators": 100})
        auc = cv_loop(blend_train, y, blender)
        print('AUC (EXT blend): %f' % auc)
        blender = create_clf('xgb', {'max_depth': 2, "n_estimators": 150, "learning_rate": 0.05})
        auc = cv_loop(blend_train, y, blender)
        print("AUC (XGB blend {d: %d, n: %d}): %f" % (2, 150, auc))
        blender = create_clf('xgb', {'max_depth': 3, "n_estimators": 200, "learning_rate": 0.05})
        auc = cv_loop(blend_train, y, blender)
        print('AUC (XGB blend {d: %d, n: %d}): %f' % (3, 200, auc))

        blender = create_clf('xgb', {'max_depth': 3, "n_estimators": 100, "learning_rate": 0.1})
        blender = blender.fit(blend_train, y)
        preds = blender.predict_proba(blend_test)[:, 1]
        write_submission(tt_ids, preds, "new_blend_submission.csv")

        combined_train = np.hstack((X, blend_train))
        combined_test = np.hstack((X_test, blend_test))
        blender = create_clf('xgb', {'max_depth': 5, "n_estimators": 600, "learning_rate": 0.03})
        blender = blender.fit(combined_train, y)
        preds = blender.predict_proba(combined_test)[:, 1]
        write_submission(tt_ids, preds, "new_combined_blend_submission.csv")
    elif task == 'lgc':
        print('Try logistic regression ..')
        clf = create_clf("lgc", {"C": 3, "scaler": scaler, "penalty": "l1"})
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)
    elif task == "ext":
        print('Try ExtraTreeClassifier')
        #clf = create_clf("ext", {"max_depth": 10})  # 0.86261
        #clf = create_clf("ext", {"max_depth": 20})  # 0.862636
        #clf = create_clf("ext", {"max_depth": 30})  # 0.860944
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 10})  # 0.862610
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20})  # 0.862564
        clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 100})  # 0.861795
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 2000})  # 0.862695
        #clf = create_clf("ext", {"criterion": "entropy", "max_depth": 30, "n_estimators": 2000})  # 0.860
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)
    elif task == 'rfc':
        print('Try RFC ..')
        #clf = create_clf('rfc', {'max_depth': 5})  # 0.859583
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10})  # 0.863369
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 200})  # 0.863285
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "n_estimators": 100})  # 0.863207
        #clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 10, "max_features": None, "n_estimators": 200})  # 0.863341
        clf = create_clf("rfc", {"criterion": "entropy", "max_depth": 40, "max_features": None, "n_estimators": 10000, "min_samples_split": 100})  # 0.863291
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)
    elif task == 'knn':
        clf = create_clf('knn', {"n_neighbors": 800, "p": 2, "scaler": scaler})
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)
    elif task == "gbt":
        paras = json.load(open('paras/gbt.json', 'r'))
        clf = create_clf("gbt", paras)
        clf = clf.fit(X, y)
        preds = clf.predict_proba(X_test)[:, 1]
        write_submission(tt_ids, preds, "gbt_submission.csv")
    elif task == "xgb":
        #clf = create_clf('xgb', {"max_depth": 2, "n_estimators": 1500, "learning_rate": 0.03})  # 0.860279
        #clf = create_clf('xgb', {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03})  # public: 0.8891443712867697
        clf = create_clf('xgb', {"max_depth": 5, "n_estimators": 600, "learning_rate": 0.03})  # public:
        auc = cv_loop(X, y, clf, 5)
        print("AUC (all): %f" % auc)
        #sys.exit()
        clf = clf.fit(X, y)
        preds = clf.predict_proba(X_test)[:, 1]
        write_submission(tt_ids, preds, 'xgb_new_submission.csv')
    elif task == "deep":
        clf = create_clf('deep', {"neuro_num": 512, "nb_epoch": 20, "scaler": scaler, "optimizer": "adadelta"})
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)
        sys.exit(0)
        clf = clf.fit(X, y)
        preds = clf.predict_proba(X_test)[:, 1]
        write_submission(tt_ids, preds, 'deep_submission.csv')
    elif task == 'semi':
        clf = create_clf("ext", {"criterion": "entropy", "max_depth": 20, "n_estimators": 100})  # 0.861795
        train_semi(X, y, X_test, clf, 5)
    elif task == 'gbc':
        from sklearn.ensemble import GradientBoostingClassifier
        clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1,
                                         min_samples_split=50, min_samples_leaf=50,
                                         max_depth=10, subsample=0.6,
                                         max_features='log2', verbose=1)
        auc = cv_loop(X, y, clf, 5)
        print('AUC (all): %f' % auc)
    elif task == 'label':
        from sklearn.semi_supervised import LabelPropagation
        from sklearn.semi_supervised import LabelSpreading
        label_prop_model = LabelSpreading()
        all_X = np.vstack((X, X_test))
        tm, tn = X_test.shape
        unlabeled = [-1] * tm  # the test rows are treated as unlabeled
        ys = [list(y)]
        ys.append(unlabeled)
        labels = np.concatenate(ys)
        print('ALL shape=%s' % (str(all_X.shape)))
        print('ALL y shape=%s' % (str(labels.shape)))  # the original printed the labels themselves
        label_prop_model.fit(all_X, labels)
'''
(truncated output: the tail of the full iris label array, all entries 0/1/2)
'''
print(cutdown_labels)
'''
[ 0  0  0  0 -1 -1 -1  0  0  0 -1  0  0 -1 -1 -1  0  0  0 -1  0 -1 -1  0
  0  0 -1  0  0 -1  0 -1 -1  0  0  0  0 -1  0  0 -1  0 -1  0 -1  0  0  0
  0 -1  1  1  1  1  1  1 -1 -1 -1  1  1 -1  1  1 -1  1 -1  1 -1  1  1 -1
 -1  1  1  1  1 -1  1 -1  1  1  1 -1  1  1  1  1  1  1 -1  1  1  1  1  1
  1  1 -1 -1 -1  2  2  2  2 -1  2  2 -1 -1 -1 -1  2  2  2  2  2 -1  2  2
  2  2  2 -1 -1  2  2  2 -1  2  2 -1 -1  2  2  2  2  2  2  2  2 -1  2  2
 -1 -1  2  2 -1 -1]
'''

# fit LabelSpreading model
label_propagation_model.fit(iris['data'], cutdown_labels)

# quick test (predict() expects a 2-d array, hence the reshape)
print('y: ', full_labels[-1])
print('y_hat: ', label_propagation_model.predict(iris['data'][-1].reshape(1, -1)))
'''
y:  2
y_hat:  [2]
'''

# overall accuracy
correct = 0.0
for i in range(len(iris['data'])):
    if label_propagation_model.predict(iris['data'][i].reshape(1, -1))[0] == full_labels[i]:
        correct += 1
print('accuracy: ', correct / len(iris['data']))
rate = 1
train_comb = train
# select a 1/rate fraction of the test data (integer division so sample() gets an int)
selected_test = test.sample(test.shape[0] // rate, replace=False,
                            random_state=20422438)
train_comb = train_comb.append(selected_test)  # pd.concat in newer pandas
train_comb_label = train_label
train_comb_unlabeled = pd.DataFrame(np.array([-1] * (test.shape[0] // rate)))
train_comb_label = np.array(train_comb_label.append(train_comb_unlabeled))
train_comb_label = train_comb_label.reshape(len(train_comb_label))

a_level = 1
label_prop_model = LabelSpreading(kernel="knn", alpha=a_level)
label_prop_model.fit(train_comb, train_comb_label)
pred_y = label_prop_model.transduction_
pred_y[:train.shape[0]] = train_label  # restore the known training labels

X_train, X_test, y_train, y_test = train_test_split(
    label_prop_model.X_, pred_y, test_size=0.10, random_state=20422438)
model_erf = se.ExtraTreesClassifier(random_state=20422438, n_jobs=-1,
                                    n_estimators=1000)
model_erf.fit(X_train, y_train)
model_erf_pred = model_erf.predict(X_test)
model_erf_error = errFn(model_erf_pred, y_test)
print(a_level)
print(model_erf_error)