def label_spread(self, X_train, y_train, gamma=20, max_iter=30):
    """
    Train a Label Spreading model from scikit-learn.

    Parameters
    ----------
    X_train: Scaled training data
    y_train: Class labels (-1 marks unlabeled samples)
    gamma: Parameter for the rbf kernel
    max_iter: Maximum number of iterations allowed

    Returns
    -------
    Predicted labels and class probabilities
    """
    # Label spreading model (defaults mirror scikit-learn's gamma=20, max_iter=30)
    model = LabelSpreading(kernel='rbf', gamma=gamma, max_iter=max_iter, n_jobs=-1)
    # Fit the training set
    model.fit(X_train, y_train)
    # Transduced labels of the unlabeled data points
    predicted_labels = model.transduction_
    # Predicted class probabilities
    predicted_proba = model.predict_proba(X_train)
    return predicted_labels, predicted_proba
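# Minimal usage sketch for the call wrapped above (hypothetical data; the wrapper
# class itself is not shown here): unlabeled rows must be marked with -1 in y.
import numpy as np
from sklearn.semi_supervised import LabelSpreading

rng = np.random.RandomState(0)
X_demo = rng.rand(100, 5)                   # already-scaled features
y_demo = rng.randint(0, 2, size=100)
y_demo[50:] = -1                            # treat the second half as unlabeled

demo_model = LabelSpreading(kernel='rbf', gamma=20, max_iter=30, n_jobs=-1)
demo_model.fit(X_demo, y_demo)
print(demo_model.transduction_[:10])        # inferred labels for all points
print(demo_model.predict_proba(X_demo)[:3]) # class probabilities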
class LabelSpreadingModel(SupervisedW2VModel):
    def fit_with_test(self, test_data):
        xs, ys = [], []
        self.ans_mapping = []
        for ans, cvs in self.context_vectors.items():
            xs.extend(cvs)
            if ans not in self.ans_mapping:
                y = len(self.ans_mapping)
                self.ans_mapping.append(ans)
            else:
                y = self.ans_mapping.index(ans)
            ys.extend(y for _ in cvs)
        for ctx in test_data:
            xs.append(self.cv(ctx))
            ys.append(-1)  # unlabeled
        self.ls_clf = LabelSpreading(kernel='knn', n_neighbors=11)
        self.ls_clf.fit(xs, ys)

    def __call__(self, x, ans=None, with_confidence=False):
        v = self.cv(x)
        probs = self.ls_clf.predict_proba([v])[0]
        pred = probs.argmax()
        m_ans = self.ans_mapping[pred]
        # TODO - get confidence as the difference between probs[pred] and the next-highest probability
        return (m_ans, 0.0) if with_confidence else m_ans
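# Hedged sketch of the confidence measure the TODO above describes: the margin
# between the best and the second-best class probability. `prediction_margin`
# is a hypothetical helper, not part of the original model; it assumes `probs`
# is a 1-D vector from predict_proba with at least two classes.
import numpy as np

def prediction_margin(probs):
    """Return (index of best class, margin over the runner-up probability)."""
    order = np.argsort(probs)          # ascending order of probabilities
    best, second = order[-1], order[-2]
    return best, float(probs[best] - probs[second])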
def soft_clamping(kernel, xTrain, yTrain, MI=10000, k=3, g=0.6, a=0.1):
    # Soft clamping: with 0 < alpha < 1, labeled points keep part of their initial
    # information while also absorbing some from their neighbours.
    # `normalize`, `stats`, `write_csv` and the names `benign`, `malware`,
    # `yExpect`, `day_one`, `rate` are module-level helpers/globals.
    spread = LabelSpreading(kernel=kernel, n_neighbors=k, gamma=g, alpha=a,
                            max_iter=MI, n_jobs=-1)
    spread.fit(xTrain, yTrain)
    predY = spread.predict_proba(xTrain)
    norm_Y = normalize(yTrain, predY)

    # Map the two probability columns onto benign/malware labels (ties are skipped).
    labels = []
    for i in norm_Y:
        if i[0] > i[1]:
            labels.append(benign)
        elif i[0] < i[1]:
            labels.append(malware)

    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, labels, yExpect, day_one)
    results = ['SC', kernel, k, g, a, lm_to_b, lb_to_m, tp, tn, fp, fn,
               pred_day1, missed_day1]
    file_name = 'SC_CMN_5per_' + str(rate) + '.csv'
    write_csv(file_name, results)
def doLabelSpreading(self, X, y, **kwargs):
    label_spread_model = LabelSpreading(**kwargs)
    if self.verbose > 2:
        print("X, y shapes: ", X.shape, y.shape)
        print(" y hist: ", np.histogram(y))
    label_spread_model.fit(X, y)
    if self.verbose > 2:
        print("ls_predict:", np.histogram(label_spread_model.predict(X)))
    return label_spread_model.predict_proba(X)
def label_spreading(self, X_train, y, X_test):
    clf = LabelSpreading()
    # Stack labeled and unlabeled (test) rows; y must cover both parts,
    # with -1 marking the unlabeled test portion.
    X = np.concatenate((X_train.todense(), X_test.todense()), axis=0)
    print("X shape now ", X.shape)
    print("Y shape now ", y.shape)
    clf.fit(X, y)
    final_labels = clf.predict(X_test)
    label_prob = clf.predict_proba(X_test)
    print(compare_labels_probabilities().compare(label_prob, final_labels))
    return final_labels, clf
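# A minimal sketch (hypothetical caller and helper name) of how the combined label
# vector for the function above could be built: known training labels followed by
# -1 for every unlabeled test row.
import numpy as np

def build_semi_supervised_targets(y_train, n_test):
    """Concatenate known labels with -1 markers for the unlabeled test rows."""
    return np.concatenate([np.asarray(y_train), -np.ones(n_test, dtype=int)])

# y = build_semi_supervised_targets(y_train, X_test.shape[0])
# final_labels, clf = model.label_spreading(X_train, y, X_test)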
def propagate_labels(features, labels):
    # `construct_graph` is a callable kernel: scikit-learn calls it with two sample
    # arrays and expects the affinity (graph weight) matrix between them.
    label_prop_model = LabelSpreading(kernel=construct_graph, n_jobs=-1)
    label_prop_model.fit(features, labels)
    logger.debug(label_prop_model.classes_)
    # preds = label_prop_model.predict(features)
    preds = label_prop_model.predict_proba(features)
    return preds
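# Hedged sketch of what a callable kernel such as `construct_graph` could look like.
# This is an illustrative RBF affinity, not the project's actual graph construction:
# the callable receives two sample arrays and returns the affinity matrix.
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

def example_graph_kernel(X, Y):
    """Return the (len(X), len(Y)) affinity matrix used to build the spreading graph."""
    return rbf_kernel(X, Y, gamma=0.5)

# label_prop_model = LabelSpreading(kernel=example_graph_kernel, n_jobs=-1)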
class LabelSpreadingImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
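# Usage sketch, assuming `Op` is bound to sklearn.semi_supervised.LabelSpreading
# (the snippet above does not show that binding, so treat it as an assumption).
import numpy as np
from sklearn.semi_supervised import LabelSpreading as Op

X_impl = np.random.RandomState(1).rand(30, 4)
y_impl = np.array([0, 1, -1] * 10)       # -1 marks unlabeled rows
impl = LabelSpreadingImpl(kernel='knn', n_neighbors=7)
impl.fit(X_impl, y_impl)
print(impl.predict_proba(X_impl).shape)  # (30, 2)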
class LPLearner(Learner):
    def __init__(self, K, seed):
        # TODO: fixing the random state does not seem to remove the randomness
        self.lp = LabelSpreading()
        self.K = K

    def fit(self, X, y):
        self.lp.fit(X, y)

    def predict_proba(self, X):
        prob = self.lp.predict_proba(X)
        seen_classes = self.lp.classes_
        # adjust_prob (from the base Learner) presumably maps the probabilities over
        # the classes seen during fitting onto the full set of K classes.
        prob = self.adjust_prob(prob, seen_classes)
        return prob

    def predict(self, X):
        prob = self.predict_proba(X)
        return prob.argmax(1)
def MyLabelSpreading(option, neighbor):
    CONFIG = GetConfig(option)
    [word2idx, vocabulary, X, y, X_train, X_test, y_train, y_test,
     inds_train, inds_test, inds_all] = joblib.load(CONFIG['RAW_DATA'])
    doc2vec = joblib.load(CONFIG['TENSOR_EMBEDDING'])

    # Propagation: build a 1-D target vector, the convention LabelSpreading expects.
    # Training rows (first `step` entries) keep their labels, test rows are marked -1.
    classes = np.unique(y)
    n_samples = y.shape[0]
    labels = np.full(n_samples, -1, dtype=int)
    step = y_train.shape[0]
    labels[:step] = y_train.astype(int)

    label_prop_model = LabelSpreading(kernel='knn', n_neighbors=neighbor)
    # label_prop_model = LabelSpreading(kernel='rbf', n_neighbors=args.neighbor,
    #                                   gamma=20, alpha=0.2, max_iter=30, tol=0.001)
    label_prop_model.fit(doc2vec, labels)
    pred_probability = label_prop_model.predict_proba(doc2vec)
    pred_class = classes[np.argmax(pred_probability, axis=1)].ravel()

    accuracy = accuracy_score(y_test, pred_class[inds_test])
    prf = precision_recall_fscore_support(y_test, pred_class[inds_test],
                                          average='binary')
    print('Accuracy:%f' % accuracy)
    print('Precision:%f' % prf[0])
    print('Recall:%f' % prf[1])
    print('Fscore:%f' % prf[2])
    return accuracy, prf[0], prf[1], prf[2]
# Train model and print statistics (use 'knn' as kernel)
from sklearn.semi_supervised import LabelSpreading

model = LabelSpreading(kernel='knn', n_neighbors=10, max_iter=1000).fit(X_train, Y_train)
print("Percentage of correct predictions = {}".format(round(100 * model.score(X_test, Y_test), 2)))

pred = model.predict(X_test) == Y_test
print("Correct: {}".format(np.count_nonzero(pred)), "/",
      "Incorrect: {}".format(np.count_nonzero(~pred)))

# Collect predictions, true labels and rounded class probabilities into one table
Z1 = model.predict(X_test).reshape(Y_test.size, 1)
Z2 = np.asarray(Y_test).reshape(Y_test.size, 1)
Z3 = np.around(model.predict_proba(X_test), decimals=2)
data = np.concatenate((Z1, Z2, Z3), axis=1)
outcome = pd.DataFrame(data, columns=["Predicted Label", "Actual Label",
                                      "Prob. Label = 0.0", "Prob. Label = 1.0"])
indicesToKeep = outcome["Predicted Label"] != outcome["Actual Label"]
print("False predictions with associated class probabilities:\n{}".format(outcome[indicesToKeep]))

# Plot predictions
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 10))
plt.xticks(fontsize=12)
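# Hypothetical continuation of the plot set up above (the original snippet stops
# after the figure setup): scatter the first two test features coloured by the
# predicted label. The feature indices and labels are illustrative assumptions.
plt.scatter(np.asarray(X_test)[:, 0], np.asarray(X_test)[:, 1],
            c=model.predict(X_test), cmap='coolwarm', s=20)
plt.xlabel('feature 0', fontsize=12)
plt.ylabel('feature 1', fontsize=12)
plt.title('LabelSpreading predictions on the test set')
plt.show()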
def __call__(self, *args, **kwargs):
    """
    Augment the labels

    Inputs:
        tr_percs: percentage of splitting between labeled and unlabeled observations
        algs: methods to perform the label propagation
        max_iter: parameter for 'gtg': number of iterations
    """
    tr_percs = kwargs.pop('tr_percs', [0.02, 0.05, 0.1])
    algs = kwargs.pop('algs', ['gtg', 'svm', 'labels_only'])
    max_iter = kwargs.pop('max_iter', 25)

    if not osp.exists(self.label_dir):
        os.makedirs(self.label_dir)

    with open(osp.join(self.label_dir, 'test_labels.txt'), 'w') as dst:
        loader = prepare_loader(
            osp.join(self.splitting_dir, 'test.txt'),
            img_root=self.dset['src'],
            stats=self.dset['stats'],
            batch_size=1,
            shuffle=False,
        )
        for _, label, path in loader:
            dst.write(path[0] + ',' + str(label.item()) + '\n')

    for net_name in self.net_names:
        with open(osp.join(self.feat_dir, 'train', net_name + '.pickle'), 'rb') as pkl:
            net_name, labels, features, fnames = pickle.load(pkl)
            labels = labels.ravel()
            # uncomment to debug code
            # labels = labels[:5000]
            # features = features[:5000]
            # fnames = fnames[:5000]

        for tr_perc in tr_percs:
            labeled, unlabeled = equiclass_mapping(labels, tr_perc)
            for alg in algs:
                print(net_name + ' - ' + str(self.dset['nr_classes']) + ' classes')

                # generate alg label file name
                alg_path = osp.join(self.label_dir, alg, net_name,
                                    'labels_{}.txt'.format(tr_perc))

                if self.hard_labels:
                    alg_labels = np.full(labels.shape[0], -1)
                    alg_labels[labeled] = labels[labeled]
                else:
                    alg_labels = np.zeros((len(labels), self.dset['nr_classes']))
                    alg_labels[labeled, labels[labeled].ravel().astype(int)] = 1.0

                if alg == 'gtg':
                    # predict labels with gtg
                    if 'W' not in locals():
                        W = gtg.sim_mat(features, verbose=True)

                    ps = init_rand_probability(labels, labeled, unlabeled)
                    res = gtg.gtg(W, ps, max_iter=max_iter, labels=labels,
                                  U=unlabeled, L=labeled)

                    if self.hard_labels:
                        alg_labels[unlabeled] = res[unlabeled].argmax(axis=1)
                    else:
                        alg_labels[unlabeled] = res[unlabeled]
                elif alg == 'svm':
                    # predict labels with a linear SVM
                    lin_svm = svm.LinearSVC()
                    if self.hard_labels:
                        lin_svm.fit(features[labeled, :], labels[labeled])
                        svm_labels = lin_svm.predict(features[unlabeled]).astype(int)
                    else:
                        cv = min(np.unique(labels[labeled],
                                           return_counts=True)[1].min(), 3)
                        clf = CalibratedClassifierCV(lin_svm, cv=cv)
                        clf.fit(features[labeled, :], labels[labeled])
                        svm_labels = clf.predict_proba(features[unlabeled])
                    alg_labels[unlabeled] = svm_labels
                elif alg == 'label_propagation':
                    # predict labels with a label propagation model
                    label_propagation = LabelPropagation(kernel='rbf', gamma=0.05,
                                                         max_iter=4000)
                    labels[unlabeled] = -1  # mark the unlabeled split (mutates `labels` in place)
                    label_propagation.fit(features, labels)

                    if self.hard_labels:
                        label_propagation_labels = label_propagation.predict(
                            features[unlabeled]).astype(int)
                    else:
                        label_propagation_labels = label_propagation.predict_proba(
                            features[unlabeled])
                    alg_labels[unlabeled] = label_propagation_labels
                elif alg == 'label_spreading':
                    # predict labels with a label spreading model
                    label_spreading = LabelSpreading(kernel='rbf', gamma=0.05)
                    labels[unlabeled] = -1  # mark the unlabeled split (mutates `labels` in place)
                    label_spreading.fit(features, labels)

                    if self.hard_labels:
                        label_spreading_labels = label_spreading.predict(
                            features[unlabeled]).astype(int)
                    else:
                        label_spreading_labels = label_spreading.predict_proba(
                            features[unlabeled])
                    alg_labels[unlabeled] = label_spreading_labels
                elif alg == 'harmonic':
                    if 'W' not in locals():
                        W = gtg.sim_mat(features, verbose=True)
                    soft_labels, hard_labels = harmonic_function(W, labels,
                                                                 labeled, unlabeled)
                    if self.hard_labels:
                        label_harmonic = hard_labels
                    else:
                        label_harmonic = soft_labels
                    alg_labels[unlabeled] = label_harmonic
                elif alg == 'labels_only':
                    # generate labeled only file
                    alg_labels = alg_labels[labeled]

                    if not osp.exists(osp.dirname(alg_path)):
                        os.makedirs(osp.dirname(alg_path))

                    if (self.hard_labels and (alg_labels == -1).sum() > 0) or \
                            (not self.hard_labels and (alg_labels.sum(axis=1) == 0.).sum() > 0):
                        raise ValueError('There are unlabeled observations; check the \''
                                         + alg + '\' algorithm.')

                    create_relabeled_file([fnames[i] for i in labeled],
                                          alg_path, alg_labels, sep=',')
                    break
                else:
                    raise ValueError('algorithm \'' + alg + '\' not recognized.')

                if not osp.exists(osp.dirname(alg_path)):
                    os.makedirs(osp.dirname(alg_path))

                if (self.hard_labels and (alg_labels == -1).sum() > 0) or \
                        (not self.hard_labels and (alg_labels.sum(axis=1) == 0.).sum() > 0):
                    raise ValueError('There are unlabeled observations; check the \''
                                     + alg + '\' algorithm.')

                create_relabeled_file(fnames, alg_path, alg_labels, sep=',')

        if 'W' in locals():
            del W
def baseline_labelspreading_new(data_path, bad_sample_num, good_sample_num,
                                reject_sample_num, random_state_for_each_epoch,
                                classifier, resampling_model):
    """
    Reject-inference baseline: infer labels for the rejected samples with
    LabelSpreading, then train the supervised classifier on the augmented set.

    :return: accuracy, precision, recall, F1, log loss, Cohen's kappa,
             Brier score, K-S value and AUC on the hold-out test set
    """
    '''Data input'''
    warnings.filterwarnings("ignore")

    raw_data_train = pd.read_csv(data_path, index_col='ID')

    data_bad = raw_data_train[raw_data_train['label'] == 1]
    # print(data_bad.shape)
    data_good = raw_data_train[raw_data_train['label'] == 0]
    data_reject = raw_data_train[raw_data_train['label'] == -1]

    data_bad_sampling = data_bad.sample(n=bad_sample_num,
                                        random_state=random_state_for_each_epoch)
    data_good_sampling = data_good.sample(n=good_sample_num,
                                          random_state=random_state_for_each_epoch)
    data_train = pd.concat([data_bad_sampling, data_good_sampling], axis=0)
    # print("All Data Size:" + str(data_train.shape))

    feature_name = list(data_train.columns.values)
    # print(feature_name)

    s = 0
    np.random.seed(s)
    sampler = np.random.permutation(len(data_train.values))
    data_train_randomized = data_train.take(sampler)

    y = data_train_randomized['label'].values
    X = data_train_randomized.drop(['label'], axis=1).values

    '''Split train/test data sets'''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                        random_state=123)

    data_reject_sampling = data_reject.sample(n=reject_sample_num,
                                              random_state=random_state_for_each_epoch)
    X_reject = data_reject_sampling.drop(['label'], axis=1).values
    y_reject = data_reject_sampling['label'].values

    X_train_and_reject = np.r_[X_train, X_reject]
    y_train_and_reject = np.r_[y_train, y_reject]

    '''Semi-supervised learning: label the rejected samples'''
    ls_semi = LabelSpreading(kernel='rbf', gamma=5, alpha=0.5, max_iter=100,
                             tol=0.001, n_jobs=-1)
    # ls_semi = LabelSpreading(kernel='knn', n_neighbors=20, alpha=0.5, max_iter=100, tol=0.1, n_jobs=-1)
    # ls_semi = LabelSpreading(kernel='rbf', gamma=10, alpha=0.7, max_iter=500, tol=0.001, n_jobs=-1)
    # ls_semi = LabelSpreading(kernel='knn', n_neighbors=5, alpha=0.7, max_iter=400, tol=0.1, n_jobs=-1)
    ls_semi.fit(X_train_and_reject, y_train_and_reject)
    y_reject_proba = ls_semi.predict_proba(X_reject)
    y_reject_predict = ls_semi.predict(X_reject)
    # y_proba = np.nan_to_num(y_proba)  # y_proba sometimes contains NaN values
    # print(np.isnan(y_proba).sum())

    y_train_and_reject_1 = np.r_[y_train, y_reject_predict]
    # print(y_train_and_reject_1.sum())

    '''Supervised learning on the augmented training set'''
    classifier.fit(X_train_and_reject, y_train_and_reject_1)
    y_proba = classifier.predict_proba(X_test)
    y_predict = classifier.predict(X_test)
    # y_predict = y_proba[:, 1].copy()
    # y_predict[y_predict >= 0.2] = 1
    # y_predict[y_predict < 0.2] = 0

    '''AUC and ROC curve'''
    fpr, tpr, _ = roc_curve(y_test, y_proba[:, 1])
    auc_result = auc(fpr, tpr)
    # print("AUC Score:" + str(auc_result))

    '''Accuracy'''
    accuracy_result = accuracy_score(y_test, y_predict)
    # print("Accuracy Score:" + str(accuracy_result))

    '''Precision'''
    precision_result = precision_score(y_test, y_predict)
    # print("Precision Score:" + str(precision_result))

    '''Recall'''
    recall_result = recall_score(y_test, y_predict)
    # print("Recall Score:" + str(recall_result))

    '''F1'''
    f1_result = f1_score(y_test, y_predict)
    # print("F1 Score:" + str(f1_result))

    '''Log loss'''
    log_loss_result = log_loss(y_test, y_proba[:, 1])
    # print("logloss Score:" + str(log_loss_result))

    '''Cohen-Kappa'''
    cohen_kappa_result = cohen_kappa_score(y_test, y_predict)
    # print("Cohen-Kappa Score:" + str(cohen_kappa_result))

    '''Brier score'''
    brier_result = brier_score_loss(y_test, y_proba[:, 1])
    # print("brier Score:" + str(brier_result))

    '''K-S value'''
    ks_result = max(tpr - fpr)

    '''Plot ROC curve'''
    # plt.figure()
    # lw = 2
    # plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.4f)' % auc_result)
    # plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    # plt.xlim([0.0, 1.0])
    # plt.ylim([0.0, 1.05])
    # plt.xlabel('False Positive Rate')
    # plt.ylabel('True Positive Rate')
    # plt.title('Receiver operating characteristic')
    # plt.legend(loc="lower right")
    # plt.show()

    '''Classification report'''
    # target_names = ['class 0', 'class 1']
    # print(classification_report(y_test, y_predict, target_names=target_names))

    '''Confusion matrix'''
    # cnf_matrix = confusion_matrix(y_test, y_predict)
    # np.set_printoptions(precision=2)
    # plt.figure()
    # plot_confusion_matrix(cnf_matrix, classes=[0, 1], title='Confusion matrix, without normalization')
    # plt.figure()
    # plot_confusion_matrix(cnf_matrix, classes=[0, 1], normalize=True, title='Normalized confusion matrix')
    # plt.show()

    # print("Accuracy Score:" + str(accuracy_result) + " Precision Score:" + str(precision_result) +
    #       " Recall Score:" + str(recall_result) + " F1 Score:" + str(f1_result) +
    #       " logloss Score:" + str(log_loss_result) + " Cohen-Kappa Score:" + str(cohen_kappa_result) +
    #       " brier Score:" + str(brier_result) + " AUC Score:" + str(auc_result))

    return (accuracy_result, precision_result, recall_result, f1_result,
            log_loss_result, cohen_kappa_result, brier_result, ks_result,
            auc_result)
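# Hedged usage sketch for the baseline above; the CSV path, sample sizes and the
# LogisticRegression classifier are illustrative assumptions, not the project's setup.
from sklearn.linear_model import LogisticRegression

metrics = baseline_labelspreading_new(
    data_path='credit_data.csv',           # hypothetical file with a 'label' column (1 / 0 / -1)
    bad_sample_num=500,
    good_sample_num=2000,
    reject_sample_num=1000,
    random_state_for_each_epoch=42,
    classifier=LogisticRegression(max_iter=1000),
    resampling_model=None,                  # not used by this baseline
)
accuracy, precision, recall, f1, logloss, kappa, brier, ks, auc_score = metrics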
train_dataset = train.values
X = train_dataset[:, 2:]
y = train_dataset[:, 1].astype('int')

test_dataset = test.values
X_test = test_dataset[:, 2:]
print(type(X_test))
print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape)

# df is assumed to already hold SK_ID_CURR for the concatenated train and test rows;
# keep only that column and append the new feature columns below.
df = pd.DataFrame({"SK_ID_CURR": df['SK_ID_CURR']})

kernels = ['knn']  # 'rbf' - taking too much time on knn
for kernel in kernels:
    print('LabelSpreading kernel****************', kernel)
    ls = LabelSpreading(kernel=kernel)
    print('fitting****************')
    ls.fit(X, y)
    print('predicting on train****************')
    ls_X_prediction = ls.predict_proba(X)[:, 1]
    print('predicting on test****************')
    ls_X_test_prediction = ls.predict_proba(X_test)[:, 1]
    tr_te_concatenated = np.concatenate([ls_X_prediction, ls_X_test_prediction])
    df['label_spreading_' + kernel + '_kernel'] = tr_te_concatenated

print('final tr_te shape', df.shape)
df.to_csv('label_spreading_tr_te.csv', index=False)
print(df.head())