def testing_predictions(self, test_data, model, num_pcs, gamma=False, max_iter=1000000, mean=False):
    """Return P(class==1) for each sample in ``test_data``.

    Parameters
    ----------
    test_data : array-like
        Raw test features; projected via ``self.pca`` before prediction.
    model : fitted LabelPropagation
        Used directly when ``mean`` is False; ignored otherwise.
    num_pcs : int
        Number of principal components to keep.
    gamma : float
        RBF kernel width for the per-seed models (only used when ``mean`` is True).
    max_iter : int
        Iteration cap for the per-seed models.
    mean : bool
        When True, refit one LabelPropagation model per seed in ``self.seeds``
        and return the per-sample mean probability across those runs.

    Returns
    -------
    np.ndarray of positive-class probabilities, one per test sample.
    """
    pca_data = self.principal_components(test_data, self.pca, num_pcs)
    if not mean:
        # Single pre-fitted model: just extract the positive-class column.
        return np.array([p[1] for p in model.predict_proba(pca_data)])
    train_pca_data = self.principal_components(self.X, self.pca, num_pcs)
    # BUG FIX: the original used the empty string "" as an accumulator
    # sentinel and later evaluated ``predicted_probs == ""`` on a numpy
    # array, which relies on deprecated elementwise-comparison fallback
    # behavior. A plain list accumulator avoids that entirely.
    runs = []
    for seed in self.seeds:
        np.random.seed(seed)
        seeded_model = LabelPropagation(kernel='rbf', gamma=gamma, max_iter=max_iter)
        seeded_model.fit(train_pca_data, self.Y)
        runs.append(np.array([p[1] for p in seeded_model.predict_proba(pca_data)]))
    # get mean of each run:
    return np.mean(np.vstack(runs), axis=0)
def hard_clamping(kernel, k, xTrain, yTrain, MI=10000, g=0.6):
    """Run hard-clamped label propagation and append its stats to a CSV.

    Parameters
    ----------
    kernel : str       -- LabelPropagation kernel ('knn' or 'rbf').
    k : int            -- n_neighbors for the knn kernel.
    xTrain, yTrain     -- (partially labeled) training data.
    MI : int           -- max_iter for the propagation model.
    g : float          -- gamma for the rbf kernel.

    Relies on module-level names: ``benign``, ``malware``, ``yExpect``,
    ``day_one``, ``rate``, ``normalize``, ``stats`` and ``write_csv``.
    """
    prop = LabelPropagation(kernel=kernel, n_neighbors=k, gamma=g, max_iter=MI, n_jobs=-1)
    prop.fit(xTrain, yTrain)
    predY = prop.predict_proba(xTrain)
    norm_Y = normalize(yTrain, predY)
    labels = []
    for i in norm_Y:
        # BUG FIX: the original appended nothing on an exact probability tie
        # (i[0] == i[1]), silently desynchronizing ``labels`` from ``yTrain``
        # before the call to stats(). Ties now fall back to benign.
        if i[0] >= i[1]:
            labels.append(benign)
        else:
            labels.append(malware)
    lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1, missed_day1 = stats(
        yTrain, labels, yExpect, day_one)
    results = [
        'HC', kernel, k, g, lm_to_b, lb_to_m, tp, tn, fp, fn, pred_day1,
        missed_day1
    ]
    file_name = 'HC_CMN_5per_' + str(rate) + '.csv'
    write_csv(file_name, results)
def doLabelPropagation(self, X, y, **kwargs):
    """Fit a LabelPropagation model on (X, y) and return its class
    probabilities for X.

    Extra keyword arguments are forwarded to the LabelPropagation
    constructor; diagnostic histograms are printed when self.verbose > 2.
    """
    model = LabelPropagation(**kwargs)
    chatty = self.verbose > 2
    if chatty:
        print("X, y shapes: ", X.shape, y.shape)
        print(" y hist: ", np.histogram(y))
    model.fit(X, y)
    if chatty:
        print("lp_predict:", np.histogram(model.predict(X)))
    return model.predict_proba(X)
def label_propagation(self, X_train, y, X_test):
    """Fit LabelPropagation on the densified union of train and test
    features, then predict labels for X_test.

    Returns (predicted labels for X_test, the fitted classifier).
    """
    model = LabelPropagation()
    print("X_train Shape :", X_train.shape, type(X_train))
    print("X_test shape : ", X_test.shape, type(X_test))
    print("y shape : ", y.shape)
    # Sparse inputs are densified so they can be stacked row-wise.
    stacked = np.concatenate((X_train.todense(), X_test.todense()), axis=0)
    print("X shape now ", stacked.shape)
    print("Y shape now ", y.shape)
    model.fit(stacked, y)
    predicted = model.predict(X_test)
    probabilities = model.predict_proba(X_test)
    print(compare_labels_probabilities().compare(probabilities, predicted))
    return predicted, model
class _LabelPropagationImpl:
    """Thin adapter that forwards fit/predict/predict_proba to a wrapped
    ``Op`` estimator built from the given hyperparameters."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is forwarded only when provided."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate prediction to the wrapped model."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate probability prediction to the wrapped model."""
        return self._wrapped_model.predict_proba(X)
def fit(self, X, y, method='self-training', treshold=0.7):
    """Semi-supervised LDA fit. Unlabeled samples are marked with -1 in ``y``.

    method:
        'supervised'        -- train only on the labeled subset.
        'self-training'     -- iteratively pseudo-label samples whose top
                               class probability exceeds ``treshold`` until
                               none remain or self.max_iter rounds have run.
        'label-propagation' -- propagate labels with a knn LabelPropagation
                               model, then fit LDA on the labeled subset.

    Raises ValueError for an unknown method. Sets self.classifier,
    self.means_, self.covariance_ (and self.propagated_labels when using
    label propagation).
    """
    # Most probable class index if confident enough, else -1 (still unlabeled).
    getLabel = lambda p: np.where(p > treshold)[0][0] if np.any(p > treshold) else -1
    yp = copy(y)
    mask = np.ones(len(y), dtype=bool)  # mask of labeled data
    mask[np.where(yp == -1)[0]] = False  # mark unlabeled entries False
    lda = LinearDiscriminantAnalysis(solver='svd', store_covariance=True, n_components=10)
    # If there are no unlabeled data, fall back to plain supervised training.
    if len(np.where(yp == -1)[0]) == 0:
        method = 'supervised'
    if method == 'supervised':
        lda.fit(X[mask, :], yp[mask])  # train with all labeled data
    elif method == 'self-training':
        counter = 0
        while True:
            lda.fit(X[mask, :], yp[mask])
            if len(yp[~mask]) == 0 or counter == self.max_iter:
                break
            probs = lda.predict_proba(X[~mask])
            yp[~mask] = np.fromiter([getLabel(p) for p in probs], probs.dtype)
            counter += 1
            mask = np.ones(len(y), dtype=bool)
            mask[np.where(yp == -1)[0]] = False
    elif method == 'label-propagation':
        # NOTE(review): ``alpha`` was removed from LabelPropagation in newer
        # scikit-learn releases -- confirm the pinned version still accepts it.
        label_prop_model = LabelPropagation(kernel='knn', n_neighbors=10, alpha=0.9)
        label_prop_model.fit(X, yp)
        probs = label_prop_model.predict_proba(X[~mask])
        yp[~mask] = np.fromiter([getLabel(p) for p in probs], probs.dtype)
        self.propagated_labels = yp
        lda.fit(X[mask, :], yp[mask])
    else:
        # BUG FIX: the original did ``raise('...')`` which raises
        # "TypeError: exceptions must derive from BaseException" in
        # Python 3 instead of signalling the bad argument.
        raise ValueError('No valid method was given!')
    self.classifier, self.means_, self.covariance_ = lda, lda.means_, lda.covariance_
def evaluate_model(self, X, Y, gamma, seed, max_iter=100000):
    """Train an RBF LabelPropagation model on a stratified 80/20 split and
    score it on the labeled portion of the held-out 20%.

    Returns (accuracy, precision, roc_auc, confusion_matrix).
    """
    # set random seed:
    np.random.seed(seed)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, stratify=Y, test_size=0.20, random_state=seed)
    lp_model = LabelPropagation(kernel='rbf', gamma=gamma, max_iter=max_iter)
    lp_model.fit(X_train, Y_train)
    # test model on validation data
    predicted_labels = lp_model.predict(X_test)
    predicted_prob = lp_model.predict_proba(X_test)
    # Only rows with a known true label (0/1) can be scored; the rest of the
    # test split is unlabeled (-1) and is skipped.
    known = [i for i, true in enumerate(Y_test) if true in [0, 1]]
    labeled_prob = [predicted_prob[i][1] for i in known]
    labels = [predicted_labels[i] for i in known]
    true_labels = [Y_test[i] for i in known]
    # evaluation
    accuracy = metrics.accuracy_score(true_labels, labels)
    precision = metrics.precision_score(true_labels, labels)
    auc = metrics.roc_auc_score(true_labels, labeled_prob)
    conf = metrics.confusion_matrix(true_labels, labels)
    return accuracy, precision, auc, conf
# Build train/test feature matrices and emit one label-propagation score
# column per kernel into label_propagation_tr_te.csv.
y = train_dataset[:, 1]
y = y.astype('int')
test_dataset = test.values
X_test = test_dataset[:, 2:]
print(type(X_test))
print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape)
# In[5]:
# Keep only the ID column; per-kernel scores are appended below.
df = pd.DataFrame({"SK_ID_CURR": df['SK_ID_CURR']})
kernels = [
    'knn',
]  #, 'rbf'] - taking too much time on knn - so only one model
for kernel in kernels:
    print('label_propagation begins on kernel****************', kernel)
    lp = LabelPropagation(kernel=kernel)
    print('fitting****************')
    lp_train = lp.fit(X, y)
    print('predicting on train****************')
    lp_X_prediction = lp.predict_proba(X)[:, 1]
    print('predicting on test****************')
    lp_X_test_prediction = lp.predict_proba(X_test)[:, 1]
    tr_te_concatenated = np.concatenate(
        [lp_X_prediction, lp_X_test_prediction])
    # BUG FIX: the original referenced the misspelled name ``kenel``,
    # which raised NameError at runtime before any column was written.
    df['label_propagation_' + kernel + '_kernel'] = tr_te_concatenated
print('final tr_te shape', df.shape)
df.to_csv('label_propagation_tr_te.csv', index=False)
print(df.head())
# Label propagation on pre-normalized features (LP is presumably an alias for
# sklearn's LabelPropagation -- confirm at the import site). gamma=70 sets the
# RBF kernel width. X_norm, Ytrain and X_test come from earlier cells.
lspr = LP(gamma = 70)
lspr.fit(X_norm,Ytrain)
# In[15]:
print('nofClasses: ',lspr.classes_)
# In[16]:
pred = lspr.predict(X_norm)
# Count samples with a positive predicted label (assumes labels are numeric
# with 0/negative meaning "no class" -- TODO confirm against the label scheme).
notN = [1 for i in pred if i>0.0]
print(sum(notN))
# In[12]:
Y_pred = lspr.predict_proba(X_test)
# In[13]:
print(Y_pred.shape)
# In[ ]:
# Ensure every class id in 0..112 appears in the training set: for any class
# missing from unique_y_train, move one example of it from the test set into
# training. (add_X / add_Y are initialized before this chunk -- note the
# sampled test rows are NOT removed from X_test/Y_test.)
for i in range(113):
    if i in unique_y_train:
        continue
    idx = (Y_test == i).nonzero()[0][0]
    add_X.append(X_test[idx])
    add_Y.append(Y_test[idx])
if len(add_X) != 0:
    X_train = np.r_[X_train, np.array(add_X)]
    Y_train = np.r_[Y_train, np.array(add_Y)]
print('train unique Y:{} test uniuqe Y:{}'.format(len(np.unique(Y_train)),
                                                  len(np.unique(Y_test))))
classifier = LabelPropagation(kernel='rbf', n_jobs=50, max_iter=200,
                              gamma=0.25)
# Y_train[int(len(Y_train)*0.8):] = -1
print('-' * 15)
print(X_train.shape, Y_train.shape, len(np.unique(Y_train)))
classifier.fit(X_train, Y_train)
# --- testing
y_prob = classifier.predict_proba(X_test)
# --- report
# One-hot encode the true test labels so metric_report can compare them
# against the per-class probability matrix.
one_hot_Y = np.zeros((X_test.shape[0], len(np.unique(Y))))
one_hot_Y[np.arange(X_test.shape[0]), Y_test] = 1
test_metrics = metric_report(one_hot_Y, y_prob)
print(test_metrics)
collect_report(METHOD_NAME, args.data_ratio, test_metrics['pr'])
def __call__(self, *args, **kwargs):
    """
    Augment the labels
    Inputs:
        tr_percs: percentage of splitting between labeled and unlabeled observations
        algs: methods to perform the label propagation
        max_iter: parameter for 'gtg': number of iterations
    """
    # NOTE(review): this method was reconstructed from a whitespace-collapsed
    # source; indentation (notably the final ``del W``) should be checked
    # against the original file.
    tr_percs = kwargs.pop('tr_percs', [0.02, 0.05, 0.1])
    algs = kwargs.pop('algs', ['gtg', 'svm', 'labels_only'])
    max_iter = kwargs.pop('max_iter', 25)
    if not osp.exists(self.label_dir):
        os.makedirs(self.label_dir)
    # Dump the ground-truth test labels once, as "<path>,<label>" lines.
    with open(osp.join(self.label_dir, 'test_labels.txt'), 'w') as dst:
        loader = prepare_loader(
            osp.join(self.splitting_dir, 'test.txt'),
            img_root=self.dset['src'],
            stats=self.dset['stats'],
            batch_size=1,
            shuffle=False,
        )
        for _, label, path in loader:
            dst.write(osp.join(path[0] + ',' + str(label.item()) + '\n'))
    for net_name in self.net_names:
        # Load features previously extracted by this network.
        with open(osp.join(self.feat_dir, 'train', net_name + '.pickle'),
                  'rb') as pkl:
            net_name, labels, features, fnames = pickle.load(pkl)
        labels = labels.ravel()
        # uncomment to debug code
        # labels = labels[:5000]
        # features = features[:5000]
        # fnames = fnames[:5000]
        for tr_perc in tr_percs:
            # Class-balanced split of indices into labeled / unlabeled sets.
            labeled, unlabeled = equiclass_mapping(labels, tr_perc)
            for alg in algs:
                print(net_name + ' - ' + str(self.dset['nr_classes']) + ' classes')
                # generate alg label file name
                alg_path = osp.join(self.label_dir, alg, net_name,
                                    'labels_{}.txt'.format(tr_perc))
                # Seed labels: -1 (hard) or an all-zero one-hot row (soft)
                # marks an observation as unlabeled.
                if self.hard_labels:
                    alg_labels = np.full(labels.shape[0], -1)
                    alg_labels[labeled] = labels[labeled]
                else:
                    alg_labels = np.zeros(
                        (len(labels), self.dset['nr_classes']))
                    alg_labels[labeled, labels[labeled].ravel().astype(int)] = 1.0
                if alg == 'gtg':
                    # predict labels with gtg
                    # W (the similarity matrix) is computed lazily and reused
                    # across algorithms via this locals() check.
                    if 'W' not in locals():
                        W = gtg.sim_mat(features, verbose=True)
                    ps = init_rand_probability(labels, labeled, unlabeled)
                    res = gtg.gtg(W, ps, max_iter=max_iter, labels=labels,
                                  U=unlabeled, L=labeled)
                    if self.hard_labels:
                        alg_labels[unlabeled] = res[unlabeled].argmax(axis=1)
                    else:
                        alg_labels[unlabeled] = res[unlabeled]
                elif alg == 'svm':
                    # predict labels with a linear SVM
                    lin_svm = svm.LinearSVC()
                    if self.hard_labels:
                        lin_svm.fit(features[labeled, :], labels[labeled])
                        svm_labels = lin_svm.predict(
                            features[unlabeled]).astype(int)
                    else:
                        # LinearSVC has no predict_proba; calibrate it to get
                        # probabilities (cv capped by the rarest class count).
                        cv = min(
                            np.unique(labels[labeled],
                                      return_counts=True)[1].min(), 3)
                        clf = CalibratedClassifierCV(lin_svm, cv=cv)
                        clf.fit(features[labeled, :], labels[labeled])
                        svm_labels = clf.predict_proba(features[unlabeled])
                    alg_labels[unlabeled] = svm_labels
                elif alg == 'label_propagation':
                    # predict labels with a label propagation model
                    label_propagation = LabelPropagation(kernel='rbf',
                                                         gamma=0.05,
                                                         max_iter=4000)
                    # NOTE(review): this mutates ``labels`` in place; later
                    # algorithms in the same pass see the -1 entries.
                    labels[unlabeled] = -1
                    label_propagation.fit(features, labels)
                    if self.hard_labels:
                        label_propagation_labels = label_propagation.predict(
                            features[unlabeled]).astype(int)
                    else:
                        label_propagation_labels = label_propagation.predict_proba(
                            features[unlabeled])
                    alg_labels[unlabeled] = label_propagation_labels
                elif alg == 'label_spreading':
                    # predict labels with a label propagation model
                    label_spreading = LabelSpreading(kernel='rbf', gamma=0.05)
                    labels[unlabeled] = -1
                    label_spreading.fit(features, labels)
                    if self.hard_labels:
                        label_spreading_labels = label_spreading.predict(
                            features[unlabeled]).astype(int)
                    else:
                        label_spreading_labels = label_spreading.predict_proba(
                            features[unlabeled])
                    alg_labels[unlabeled] = label_spreading_labels
                elif alg == 'harmonic':
                    if 'W' not in locals():
                        W = gtg.sim_mat(features, verbose=True)
                    soft_labels, hard_labels = harmonic_function(
                        W, labels, labeled, unlabeled)
                    if self.hard_labels:
                        label_harmonic = hard_labels
                    else:
                        label_harmonic = soft_labels
                    alg_labels[unlabeled] = label_harmonic
                elif alg == 'labels_only':
                    # generate labeled only file
                    alg_labels = alg_labels[labeled]
                    if not osp.exists(osp.dirname(alg_path)):
                        os.makedirs(osp.dirname(alg_path))
                    # Sanity check: every retained row must carry a label.
                    if (self.hard_labels and (alg_labels == -1).sum() > 0) or \
                            (not self.hard_labels and
                             (alg_labels.sum(axis=1) == 0.).sum() > 0):
                        raise ValueError(
                            'There is some unlabeled observation, check \'' +
                            alg + '\' algorithm,')
                    create_relabeled_file([fnames[i] for i in labeled],
                                          alg_path, alg_labels, sep=',')
                    # NOTE(review): this ``break`` exits the algs loop, so any
                    # algorithm listed after 'labels_only' is skipped -- confirm
                    # that is intended.
                    break
                else:
                    raise ValueError('algorithm \'' + alg + '\' not recognized.')
                if not osp.exists(osp.dirname(alg_path)):
                    os.makedirs(osp.dirname(alg_path))
                # Same sanity check for the propagation-based algorithms.
                if (self.hard_labels and (alg_labels == -1).sum() > 0) or\
                        (not self.hard_labels and
                         (alg_labels.sum(axis=1) == 0.).sum() > 0):
                    raise ValueError('There is some unlabeled observation,'
                                     'check \'' + alg + '\' algorithm,')
                create_relabeled_file(fnames, alg_path, alg_labels, sep=',')
        # Drop the cached similarity matrix before the next network's
        # features are loaded (it is feature-dependent).
        if 'W' in locals():
            del W
from sklearn.semi_supervised import LabelPropagation
import numpy as np
from random import sample

# Load GPS points, skipping the header row, and keep columns 1-2 only
# (presumably lon/lat -- verify against the CSV schema).
# NOTE(review): the row slice also drops the LAST data row
# (X.shape[0] - 1); confirm that is intended and not an off-by-one.
X = np.genfromtxt('/Users/dgy/Desktop/385project/grouped_GPS.csv', delimiter=',')
X = X[1:(X.shape[0] - 1), 1:3]
rows = X.shape[0]
# 500 randomly chosen seed points, labeled as 400x class 0 then 100x class 1
# (np.repeat([0,0,0,0,1], 100) yields 400 zeros followed by 100 ones).
seedsX = X[sample(range(0, rows), 500), ]
seedsY = np.repeat([0, 0, 0, 0, 1], 100)
lp = LabelPropagation(gamma=10000)
lp.fit(seedsX, seedsY)
# Probability of class 1 for every point in X.
Y = lp.predict_proba(X)
p = Y[0:rows, 1]
# Diagnostics (values computed but not printed or stored).
(np.where(p > 0.5))[0].shape
np.count_nonzero(np.isnan(p))
np.savetxt("utility.txt", Y, delimiter=" ", fmt="%s")
# NOTE(review): the lines down to ``return np.array(res)`` are the tail of a
# function whose definition starts before this chunk (presumably hex2bin,
# building one bit-row ``p`` per input record). Indentation here is
# reconstructed and must be checked against the full file.
            p.append(0)
        for i in range(1, len(arr)):
            p.extend(dp[arr[i]])
        res.append(p)
    return np.array(res)


data = hex2bin(path)
# First column is the label; the remaining columns are features.
X = data[:, 1:]
Y = data[:, 0]
# NOTE(review): test_size=0.7 sends 70% of the data to the TEST split --
# confirm this is intended and not swapped.
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.7)
rng = np.random.RandomState(42)
# Mark ~40% of the training labels as unlabeled (-1) to exercise
# semi-supervised label propagation.
unlabeled_point = rng.rand(len(train_Y)) < 0.4
train_Y[unlabeled_point] = -1
clf = LabelPropagation(n_jobs=8, gamma=0.6)
clf.fit(train_X, train_Y)
prob = clf.predict_proba(test_X)
# Score on the positive-class probability and plot the ROC curve.
score = roc_auc_score(test_Y, prob[:, 1])
score = round(score, 2)
fpr, tpr, _ = roc_curve(test_Y, prob[:, 1])
plt.plot(fpr, tpr, color='r')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.text(0.5, 0.4, "AUC=" + str(score), fontsize=15)
plt.show()
print(score)