def MetricLearning(Data, mapping, remapping, Truth): #build metric learing
    # Learn an LMNN metric on the color features, supervised by ground-truth
    # labels.  NOTE(review): Python 2 syntax (`print >> sys.stderr`); the
    # `mapping` parameter is unused; the shape-feature branch is disabled
    # (kept below as a dead string literal).
    Truth_Label = list(set(Truth.values()))  # distinct class labels
    X_color = np.array([data[0] for data in Data])  # color feature per sample
    #print >> sys.stderr, "X_color concat X_shape", X_color.shape
    #X_color = np.array([data[0] for data in Data])
    # class index of each sample, resolved through the remapping table
    Y_color = np.array([Truth_Label.index(Truth[remapping[i]]) for i, data in enumerate(Data)])
    print >> sys.stderr, "X_color", X_color.shape, Y_color.shape
    s = time.time()
    lmnn_color = LMNN(k=5, min_iter=0, max_iter=400, learn_rate=1e-6)
    lmnn_color.fit(X_color, Y_color, verbose=True)
    print >> sys.stderr, time.time() - s, "color learning done"
    # Disabled shape-feature branch (kept verbatim as in the original):
    '''X_shape = np.array([data[1] for data in Data]) Y_shape = np.array([Truth_Label.index(Truth[remapping[i]]) for i, data in enumerate(Data)]) print >> sys.stderr, "X_shape", X_shape.shape, Y_shape.shape s = time.time() lmnn_shape = LMNN(k=20, min_iter=0, max_iter=1, learn_rate=1e-6) lmnn_shape.fit(X_shape, Y_shape, verbose=True) print >> sys.stderr, time.time() - s, "shape learning done" return lmnn_color, lmnn_shape'''
    return lmnn_color
def constructSimilartyMatrixLMNN(self,ks):
    # Learn an LMNN metric (ks target neighbours) on the PCA-reduced training
    # vectors, project train/test/all splits into the learnt space, visualise
    # the full set with TSNE, build a sparse k-NN weight matrix from pairwise
    # distances, and run label propagation.
    # NOTE(review): Python 2 print statements; `pp` (PDF pages) and
    # `randomState` come from the enclosing scope -- confirm they exist.
    print 'now doing LMNN for k= ',ks
    self.y_train=self.y_train.reshape(-1,)
    lmnn=LMNN(k=ks, learn_rate=1e-7,max_iter=3000)
    lmnn.fit(self.trainVectorsPCA, self.y_train, verbose=False)
    self.L_lmnn = lmnn.transformer()
    # persist the learnt linear map for later reuse
    name='lmnn/LMNN transformer matrix with dataset shape '+str(self.trainVectorsPCA.shape)
    np.save(name,self.L_lmnn)
    print 'L.shape is ',self.L_lmnn.shape,'\n\n'
    # Input data transformed to the metric space by X*L.T
    self.transformedTrainLMNN=copy(lmnn.transform(self.trainVectorsPCA))
    self.transformedTestLMNN=copy(lmnn.transform(self.testVectorsPCA))
    self.transformedAllLMNN=copy(lmnn.transform(self.allDataPCA))
    #we compute the pairwise distance on this now
    projectedDigits = TSNE(random_state=randomState).fit_transform(self.transformedAllLMNN)
    plt.scatter(projectedDigits[:,0],projectedDigits[:,1],c=self.labels)
    plt.title('LMNN Transformed ALL set projected to 2 Dimensions by TSNE with k='+str(ks))
    plt.savefig(pp,format='pdf')
    self.pwdis=copy(pairwise_distances(self.transformedAllLMNN,metric='euclidean'))
    self.D=np.zeros(self.pwdis.shape)  # diagonal degree matrix
    for i in range(0,self.pwdis.shape[0]):
        l1=self.pwdis[i].tolist()
        #print 'l1 is ',l1,'\n\n'
        # indices sorted by ascending distance from sample i
        allnearestNeighbours=sorted(range(len(l1)),key=lambda i : l1[i])
        #now set the all the weights except for k+1 to 0
        self.pwdis[i,allnearestNeighbours[ks:]]=0
        self.D[i,i]=sum(self.pwdis[i])
    print 'accuracy for LMNN for k= ',ks,'\n'
    self.labelPropogation()
def test_toy_ex_lmnn(X, y, loss):
    """Test that the loss give the right result on a toy example"""
    # NOTE(review): Python 2 (`xrange`); exercises metric-learn's private
    # LMNN API from an older release (explicit impostor bookkeeping).
    L = np.array([[1]])  # trivial 1-D linear map
    lmnn = LMNN(k=1, regularization=0.5)

    k = lmnn.k
    reg = lmnn.regularization

    X, y = lmnn._prepare_inputs(X, y, dtype=float, ensure_min_samples=2)
    num_pts, num_dims = X.shape
    unique_labels, label_inds = np.unique(y, return_inverse=True)
    lmnn.labels_ = np.arange(len(unique_labels))
    lmnn.transformer_ = np.eye(num_dims)

    target_neighbors = lmnn._select_targets(X, label_inds)
    impostors = lmnn._find_impostors(target_neighbors[:, -1], X, label_inds)

    # sum outer products
    dfG = _sum_outer_products(X, target_neighbors.flatten(),
                              np.repeat(np.arange(X.shape[0]), k))
    df = np.zeros_like(dfG)

    # storage
    a1 = [None]*k
    a2 = [None]*k
    for nn_idx in xrange(k):
        a1[nn_idx] = np.array([])
        a2[nn_idx] = np.array([])

    # assert that the loss equals the one computed by hand
    assert lmnn._loss_grad(X, L.reshape(-1, X.shape[1]), dfG, impostors, 1, k,
                           reg, target_neighbors, df, a1, a2)[1] == loss
def baseline_model(X_train,y_train,X_test,y_test):
    """L1-SVC feature selection -> LMNN metric learning -> 4-NN prediction.

    Returns the predicted labels for the test split.
    """
    # A sparse (L1-penalised) linear SVC doubles as the feature selector.
    selector = LinearSVC(C=1, penalty="l1", dual=False)
    train_sel = selector.fit_transform(X_train, y_train)
    test_sel = selector.transform(X_test)
    # Learn a Mahalanobis metric on the reduced features, then project
    # both splits into the learnt space.
    metric = LMNN(k=4, min_iter=50, max_iter=1000, learn_rate=1e-7)
    metric.fit(train_sel, y_train)
    train_proj = metric.transform(train_sel)
    test_proj = metric.transform(test_sel)
    # Classify in the transformed space with a 4-NN vote.
    knn = KNeighborsClassifier(n_neighbors=4)
    knn.fit(train_proj, y_train)
    return knn.predict(test_proj)
def test_lmnn(self):
    """fit()+transform() must give the same result as fit_transform()."""
    settings = dict(k=5, learn_rate=1e-6, verbose=False)
    first = LMNN(**settings)
    first.fit(self.X, self.y)
    via_transform = first.transform(self.X)
    second = LMNN(**settings)
    via_fit_transform = second.fit_transform(self.X, self.y)
    assert_array_almost_equal(via_transform, via_fit_transform)
class KNNClassifier(BaseEstimator, ClassifierMixin):
    """k-NN classifier that learns an LMNN metric and classifies in the
    transformed space, where plain euclidean distance suffices."""

    def __init__(self, k=1):
        self.k = k
        self.distanceEstimator = LMNN(k=k)

    def fit(self, X, y):
        #TODO msati3: Ideally, LMNN should expose fit_transform.
        estimator = self.distanceEstimator
        estimator.fit(X, y)
        self.modelData = estimator.transform(X)
        self.modelLabels = y
        return self

    def transform(self, X):
        return self.distanceEstimator.transform(X)

    def predict(self, D):
        # Pre-transform the queries so squared-euclidean distance is the
        # learnt metric.
        queries = self.transform(D)
        dists = distance.cdist(queries, self.modelData, 'sqeuclidean')
        # k smallest distances per query (unordered partial sort).
        nearest = bn.argpartsort(dists, self.k)[:, :self.k]
        votes = self.modelLabels[nearest]
        return stats.mode(votes, axis=1)[0]

    def score(self, X, y, fNormalize=True):
        return accuracy_score(self.predict(X), y, fNormalize)
def test_loss_grad_lbfgs(self):
    """Test gradient of loss function

    Assert that the gradient is almost equal to its finite differences
    approximation.
    """
    # NOTE(review): Python 2 (`xrange`); uses metric-learn's private LMNN
    # API from an older release (explicit impostor bookkeeping).
    rng = np.random.RandomState(42)
    X, y = make_classification(random_state=rng)
    # random linear map with 1..n_features output rows
    L = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1])
    lmnn = LMNN()

    k = lmnn.k
    reg = lmnn.regularization

    X, y = lmnn._prepare_inputs(X, y, dtype=float, ensure_min_samples=2)
    num_pts, num_dims = X.shape
    unique_labels, label_inds = np.unique(y, return_inverse=True)
    lmnn.labels_ = np.arange(len(unique_labels))
    lmnn.transformer_ = np.eye(num_dims)

    target_neighbors = lmnn._select_targets(X, label_inds)
    impostors = lmnn._find_impostors(target_neighbors[:, -1], X, label_inds)

    # sum outer products
    dfG = _sum_outer_products(X, target_neighbors.flatten(),
                              np.repeat(np.arange(X.shape[0]), k))
    df = np.zeros_like(dfG)

    # storage
    a1 = [None]*k
    a2 = [None]*k
    for nn_idx in xrange(k):
        a1[nn_idx] = np.array([])
        a2[nn_idx] = np.array([])

    # initialize L
    def loss_grad(flat_L):
        # fresh copies of the mutable state so each evaluation is independent
        return lmnn._loss_grad(X, flat_L.reshape(-1, X.shape[1]), dfG,
                               impostors, 1, k, reg, target_neighbors,
                               df.copy(), list(a1), list(a2))

    def fun(x):
        return loss_grad(x)[1]

    def grad(x):
        return loss_grad(x)[0].ravel()

    # compute relative error
    epsilon = np.sqrt(np.finfo(float).eps)
    rel_diff = (check_grad(fun, grad, L.ravel()) /
                np.linalg.norm(approx_fprime(L.ravel(), fun, epsilon)))
    np.testing.assert_almost_equal(rel_diff, 0., decimal=5)
def baseline_model(X_train, y_train, X_test, y_test):
    """L1-based feature selection -> LMNN metric learning -> 4-NN prediction.

    Returns the predicted labels for the test split.
    """
    from sklearn.feature_selection import SelectFromModel

    # dimension reduction
    # FIX: calling .fit_transform()/.transform() directly on LinearSVC relied
    # on the estimator-as-selector API that scikit-learn removed in 0.19.
    # SelectFromModel wraps the same L1-penalised SVC and (for L1 models)
    # defaults to the same near-zero coefficient threshold.
    feature_selection = SelectFromModel(LinearSVC(C=1, penalty="l1", dual=False))
    X_train_reduced = feature_selection.fit_transform(X_train, y_train)
    X_test_reduced = feature_selection.transform(X_test)
    # metrics learning: project both splits into the learnt Mahalanobis space
    ml = LMNN(k=4, min_iter=50, max_iter=1000, learn_rate=1e-7)
    ml.fit(X_train_reduced, y_train)
    X_train_new = ml.transform(X_train_reduced)
    X_test_new = ml.transform(X_test_reduced)
    # classify in the transformed space
    neigh = KNeighborsClassifier(n_neighbors=4)
    neigh.fit(X_train_new, y_train)
    predicted = neigh.predict(X_test_new)
    #pickle.dump(ml, open('dist_metrics', 'w'))
    return predicted
def test_loss_grad_lbfgs(self):
    """Test gradient of loss function

    Assert that the gradient is almost equal to its finite differences
    approximation.
    """
    rng = np.random.RandomState(42)
    X, y = make_classification(random_state=rng)
    # Random linear map with between 1 and n_features output rows.
    L = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1])
    lmnn = LMNN()
    k, reg = lmnn.k, lmnn.regularization

    X, y = lmnn._prepare_inputs(X, y, dtype=float, ensure_min_samples=2)
    _, n_components = X.shape
    unique_labels, label_inds = np.unique(y, return_inverse=True)
    lmnn.labels_ = np.arange(len(unique_labels))
    lmnn.components_ = np.eye(n_components)

    target_neighbors = lmnn._select_targets(X, label_inds)

    # Precompute the fixed part of the gradient (sum of outer products).
    dfG = _sum_outer_products(X, target_neighbors.flatten(),
                              np.repeat(np.arange(X.shape[0]), k))

    def loss_grad(flat_L):
        reshaped = flat_L.reshape(-1, X.shape[1])
        return lmnn._loss_grad(X, reshaped, dfG, k, reg, target_neighbors,
                               label_inds)

    def fun(x):
        return loss_grad(x)[1]

    def grad(x):
        return loss_grad(x)[0].ravel()

    # Relative error between analytic and finite-difference gradients.
    epsilon = np.sqrt(np.finfo(float).eps)
    numeric_norm = np.linalg.norm(approx_fprime(L.ravel(), fun, epsilon))
    rel_diff = check_grad(fun, grad, L.ravel()) / numeric_norm
    np.testing.assert_almost_equal(rel_diff, 0., decimal=5)
def lmnn_fit(X_train, Y_train, X_test, Y_test, color_map):
    """Fit LMNN on the training split and transform both splits.

    For 2-D inputs, also saves scatter plots of the transformed points.
    Returns (transformed_train, transformed_test).
    """
    model = LMNN(init='pca', k=3, learn_rate=5e-4, max_iter=500000,
                 regularization=0.2)
    model.fit(X_train, Y_train)

    def _plot_if_2d(points, labels, fname, original_dim):
        # Only 2-D inputs are visualised, as in the original behaviour.
        if original_dim == 2:
            plt.figure()
            plt.scatter(points[:, 0], points[:, 1], c=color_map[labels], s=2)
            plt.savefig(fname, dpi=300)

    train_transformed = model.transform(X_train)
    _plot_if_2d(train_transformed, Y_train, "after_lmnn_transform_train.png",
                X_train.shape[1])
    test_transformed = model.transform(X_test)
    _plot_if_2d(test_transformed, Y_test, "after_lmnn_transform_test.png",
                X_test.shape[1])
    return (train_transformed, test_transformed)
# 距离测度学习的目的即为了衡量样本之间的相近程度,而这也正是模式识别的核心问题之一。 # 大量的机器学习方法,比如K近邻、支持向量机、径向基函数网络等分类方法以及K-means聚类方法,还有一些基于图的方法,其性能好坏都主要有样本之间的相似度量方法的选择决定。 # large margin nearest neighbor from metric_learn import LMNN import numpy as np X = np.array([[0., 0., 1.], [0., 0., 2.], [1., 0., 0.], [2., 0., 0.], [2., 2., 2.], [2., 5., 4.]]) Y = np.array([1, 1, 2, 2, 0, 0]) lmnn = LMNN()
def exeLMNN(self, knum=5):
    """Fit LMNN (k=knum) on self.X / self.y and return the transformed data."""
    transformer = LMNN(k=knum)
    return transformer.fit_transform(self.X, self.y)
print('Data Preparation Done', '\n') #print(FSTrainData.max(axis=0) - FSTrainData.min(axis=0)) #print(len(FSTrainData[0])) #print(len(FSTestData[0])) #print(len(FSTestData)) #print(len(TestData)) #print(TrainData) #print(type(TrainData)) #print(TrainLabels) #print(type(TrainLabels)) if Method == 'LMNN': print("Method: LMNN", '\n') lmnn = LMNN(k=3, learn_rate=1e-6, verbose=False) x = lmnn.fit(FSTrainData, TrainLabels) TFSTestData = x.transform(FSTestData) print('Transformation Done', '\n') elif Method == 'COV': print("Method: COV", '\n') cov = Covariance().fit(FSTrainData) TFSTestData = cov.transform(FSTestData) print('Transformation Done', '\n') elif Method == 'ITML': print("Method: ITML", '\n') itml = ITML_Supervised(num_constraints=200, A0=None) x = itml.fit(FSTrainData, TrainLabels) TFSTestData = x.transform(FSTestData)
def main(params):
    """Learn a PSD (Mahalanobis) matrix on the configured dataset with the
    configured algorithm and save it to the results directory.

    `params` behaves like a configparser section (get/getint/getfloat/
    getboolean).
    """
    initialize_results_dir(params.get('results_dir'))
    backup_params(params, params.get('results_dir'))

    print('>>> loading data...')
    X_train, y_train, X_test, y_test = LoaderFactory().create(
        name=params.get('dataset'),
        root=params.get('dataset_dir'),
        random=True,
        seed=params.getint('split_seed'))()
    print('<<< data loaded')

    print('>>> computing psd matrix...')
    if params.get('algorithm') == 'identity':
        # baseline: plain Euclidean metric
        psd_matrix = np.identity(X_train.shape[1], dtype=X_train.dtype)
    elif params.get('algorithm') == 'nca':
        nca = NCA(init='auto', verbose=True,
                  random_state=params.getint('algorithm_seed'))
        nca.fit(X_train, y_train)
        psd_matrix = nca.get_mahalanobis_matrix()
    elif params.get('algorithm') == 'lmnn':
        lmnn = LMNN(init='auto', verbose=True,
                    random_state=params.getint('algorithm_seed'))
        lmnn.fit(X_train, y_train)
        psd_matrix = lmnn.get_mahalanobis_matrix()
    elif params.get('algorithm') == 'itml':
        itml = ITML_Supervised(verbose=True,
                               random_state=params.getint('algorithm_seed'))
        itml.fit(X_train, y_train)
        psd_matrix = itml.get_mahalanobis_matrix()
    elif params.get('algorithm') == 'lfda':
        # LFDA takes no verbose/random_state here
        lfda = LFDA()
        lfda.fit(X_train, y_train)
        psd_matrix = lfda.get_mahalanobis_matrix()
    elif params.get('algorithm') == 'arml':
        # project-local triple-based metric learner
        learner = TripleLearner(
            optimizer=params.get('optimizer'),
            optimizer_params={
                'lr': params.getfloat('lr'),
                'momentum': params.getfloat('momentum'),
                'weight_decay': params.getfloat('weight_decay'),
            },
            criterion=params.get('criterion'),
            criterion_params={'calibration': params.getfloat('calibration')},
            n_epochs=params.getint('n_epochs'),
            batch_size=params.getint('batch_size'),
            random_initialization=params.getboolean('random_initialization',
                                                    fallback=False),
            update_triple=params.getboolean('update_triple', fallback=False),
            device=params.get('device'),
            seed=params.getint('learner_seed'))
        psd_matrix = learner(X_train, y_train,
                             n_candidate_mins=params.getint('n_candidate_mins',
                                                            fallback=1))
    else:
        raise Exception('unsupported algorithm')
    print('<<< psd matrix got')

    np.savetxt(os.path.join(params.get('results_dir'), 'psd_matrix.txt'),
               psd_matrix)
# from modshogun import LMNN as shogun_LMNN # from modshogun import RealFeatures, MulticlassLabels # import numpy as np from metric_learn import LMNN from sklearn.datasets import load_iris iris_data = load_iris() X = iris_data['data'] Y = iris_data['target'] lmnn = LMNN(k=5, learn_rate=1e-6) lmnn.fit(X, Y, verbose=False)
# Vectorise the newsgroup documents, densify the sparse matrix element by
# element, learn an LMNN transformer on the dense features, and save it.
# NOTE(review): Python 2 print statements; the element-wise densify loop is
# O(n*d) -- vectors.toarray() would produce the same array in one call; the
# ITML instance is created but never used (its fit is commented out).
vectors = vectorizer.transform(data)
print 'vectorizer is ' ,vectors[0].todense()
itml=ITML()
arr2=copy(vectors.todense())
arr=np.zeros((vectors.shape[0],vectors.shape[1]))
for i in range(0,vectors.shape[0]):
    for j in range(0,vectors.shape[1]):
        arr[i,j]=arr2[i,j]
print 'arr .shape is ',arr.shape
target=newsgroups_train.target
lab=[]
for i in target:
    lab.append(i)
lab=np.asarray(lab)
print 'lab is ',(lab)
print 'target is ',type(arr)
#C=itml.prepare_constraints(target,vectors.shape[0],200)
#itml.fit(arr,C,verbose=False)
lmnn = LMNN(k=20, learn_rate=1e-3,use_pca=True)
lmnn.fit(arr,target,verbose=False)
print 'Now doing LMNN'
l=lmnn.transformer()
np.save('LMNN transformer',l)
from modshogun import LMNN as shogun_LMNN
from modshogun import RealFeatures, MulticlassLabels
import numpy as np

from metric_learn import LMNN
from sklearn.datasets import load_iris

# Fit metric-learn's LMNN (5 target neighbours) on the iris dataset.
# NOTE(review): the shogun imports (shogun_LMNN, RealFeatures,
# MulticlassLabels) are unused in this snippet.
iris_data = load_iris()
X = iris_data['data']
Y = iris_data['target']

lmnn = LMNN(k=5, learn_rate=1e-6)
lmnn.fit(X, Y, verbose=False)
def main():
    """Metric-learning demo: train PCA, PCA-LDA, LMNN and GB-LMNN on the
    UCI segment data, report 1-NN train/test errors, and plot the 3-D
    embeddings of each method."""
    print(
        "************************************************************************************"
    )
    print(
        "*************************** Metric Learning Demo ***********************************"
    )
    print(
        "************************************************************************************"
    )

    # Load variables
    print("Loading data")
    # NOTE(review): unpacking loadmat(...).values() relies on dict insertion
    # order matching the .mat variable order -- fragile; confirm.
    _, _, _, xTe, xTr, xVa, yTr, yTe, yVa = loadmat(
        'data/segment.mat').values()
    xTe, xTr, xVa = xTe.T, xTr.T, xVa.T
    # labels to zero-based ints
    yTr, yTe, yVa = yTr.flatten().astype(int) - 1, yTe.flatten().astype(
        int) - 1, yVa.flatten().astype(int) - 1

    print("Training pca...")
    L0 = pca(xTr.T, whiten=True)[0].T
    print("Training pca-lda...")
    pca_lda = Pipeline([('pca', PCA(n_components=5, whiten=True)),
                        ('lda', LinearDiscriminantAnalysis(n_components=3))])
    pca_lda.fit(xTr, yTr)
    pca_eigen_vals = np.diag(1 / np.sqrt(pca_lda[0].explained_variance_))
    # collapse the two pipeline stages into a single 3 x d linear map
    pcalda_mat = pca_lda[1].scalings_[:, :3].T @ pca_eigen_vals @ pca_lda[
        0].components_
    print("Training lmnn...")
    lmnn = LMNN(init='pca', k=7, learn_rate=1e-6, verbose=False,
                n_components=3, max_iter=1000)
    lmnn.fit(xTr, yTr)

    print('Learning nonlinear metric with GB-LMNN ... ')
    # L = pcalda_mat
    L = loadmat('data/lmnn2_L.mat')['L']  # Load the matlab matrix
    embed = gb_lmnn(xTr, yTr, 3, L, n_trees=200, verbose=True, xval=xVa,
                    yval=yVa)

    # ################################ k-NN evaluation ###################################
    print("\nEvaluation:")
    k = 1
    raw_tr_err, raw_te_err = knn_error_score(L0[0:3], xTr, yTr, xTe, yTe, k)
    print(
        '1-NN Error for raw (high dimensional) input is, Training: {:.2f}%, Testing {:.2f}%'
        .format(100 * raw_tr_err, 100 * raw_te_err))
    # NOTE(review): the "raw" and "PCA" scores use the identical projection
    # L0[0:3] -- confirm the raw case was meant to use the untransformed data.
    pca_tr_err, pca_te_err = knn_error_score(L0[0:3], xTr, yTr, xTe, yTe, k)
    print('1-NN Error for PCA in 3d is, Training: {:.2f}%, Testing {:.2f}%'.
          format(100 * pca_tr_err, 100 * pca_te_err))
    lda_tr_err, lda_te_err = knn_error_score(pcalda_mat, xTr, yTr, xTe, yTe, k)
    print(
        '1-NN Error for PCA-LDA input is, Training: {:.2f}%, Testing {:.2f}%'.
        format(100 * lda_tr_err, 100 * lda_te_err))
    lmnn_tr_err, lmnn_te_err = knn_error_score(lmnn.components_[0:3], xTr, yTr,
                                               xTe, yTe, k)
    print('1-NN Error for LMNN is, Training: {:.2f}%, Testing {:.2f}%'.format(
        100 * lmnn_tr_err, 100 * lmnn_te_err))
    # GB-LMNN embeds nonlinearly, so pass pre-embedded points and no matrix.
    gb_tr_err, gb_te_err = knn_error_score([], embed.transform(xTr), yTr,
                                           embed.transform(xTe), yTe, 1)
    print(
        '1-NN Error for GB-LMNN input is, Training: {:.2f}%, Testing {:.2f}%'.
        format(100 * gb_tr_err, 100 * gb_te_err))

    # ################################ 3-D Plot ###################################
    print("\nPlotting figures")
    fig = plt.figure(figsize=(12, 8))
    ax1 = fig.add_subplot(2, 2, 1, projection='3d')
    ax1.set_title("PCA Train Error: {:.2f}, Test Error: {:.2f}".format(
        100 * pca_tr_err, 100 * pca_te_err))
    pts_to_plt = xTr @ L0[0:3].T
    for l in np.unique(yTr):
        mask = np.squeeze(yTr == l)
        ax1.scatter(pts_to_plt[mask, 0], pts_to_plt[mask, 1],
                    pts_to_plt[mask, 2], label=l)
    plt.legend()

    ax2 = fig.add_subplot(2, 2, 2, projection='3d')
    ax2.set_title("PCA-LDA Train Error: {:.2f}, Test Error: {:.2f}".format(
        100 * lda_tr_err, 100 * lda_te_err))
    pts_to_plt = xTr @ pcalda_mat.T
    for l in np.unique(yTr):
        mask = np.squeeze(yTr == l)
        ax2.scatter(pts_to_plt[mask, 0], pts_to_plt[mask, 1],
                    pts_to_plt[mask, 2], label=l)
    plt.legend()

    ax3 = fig.add_subplot(2, 2, 3, projection='3d')
    ax3.set_title("LMNN Train Error: {:.2f}, Test Error: {:.2f}".format(
        100 * lmnn_tr_err, 100 * lmnn_te_err))
    pts_to_plt = lmnn.transform(xTr)
    for l in np.unique(yTr):
        mask = np.squeeze(yTr == l)
        ax3.scatter(pts_to_plt[mask, 0], pts_to_plt[mask, 1],
                    pts_to_plt[mask, 2], label=l)
    plt.legend()

    ax4 = fig.add_subplot(2, 2, 4, projection='3d')
    ax4.set_title("GB-LMNN Train Error: {:.2f}, Test Error: {:.2f}".format(
        100 * gb_tr_err, 100 * gb_te_err))
    pts_to_plt = embed.transform(xTr)
    for l in np.unique(yTr):
        mask = np.squeeze(yTr == l)
        ax4.scatter(pts_to_plt[mask, 0], pts_to_plt[mask, 1],
                    pts_to_plt[mask, 2], label=l)
    plt.legend()
    plt.show()
# Fetch validation embeddings, move train/val embeddings and labels to GPU,
# optionally learn an LMNN transform on the training embeddings, and compare
# k-NN retrieval before/after applying it.
# NOTE(review): fragment -- `tembeds`, `tlabels`, `n`, `get_k` and
# `run_simulation` are defined above this chunk; assumes CUDA is available.
vembeds, _, vlabels, vacc = model.get_embeddings_logits(
    model.val_dataset, model.val_indices, batch_size=256, num_workers=16)
tembeds = torch.from_numpy(tembeds).cuda()
tlabels = torch.from_numpy(tlabels).cuda()
vembeds = torch.from_numpy(vembeds).cuda()
vlabels = torch.from_numpy(vlabels).cuda()
# run LMNN
if n > 1:
    lmnn = LMNN(k=get_k(tlabels.cpu().numpy()), learn_rate=1e-4, verbose=True,
                max_iter=5000)
    lmnn.fit(tembeds.cpu().numpy(), tlabels.cpu().numpy())
    # learnt linear map, transposed for right-multiplication below
    W_cuda = torch.from_numpy(lmnn.components_.T).cuda().float()
#top1 knn
top1_knn_before, top3_knn_before = run_simulation(
    tembeds, tlabels, vembeds, vlabels)
if n > 1:
    # transform into LMNN found space
    tembeds = torch.matmul(tembeds, W_cuda)
    vembeds = torch.matmul(vembeds, W_cuda)
    # top1 lmnn
    top1_lmnn_before, top3_lmnn_before = run_simulation(
        tembeds, tlabels, vembeds, vlabels)
# Score a fitted logistic regression on train/test, print 4x4 confusion
# counts (predicted x actual), then fit metric learners on (x, y).
# NOTE(review): Python 2 print statements; `x`/`y` are reassigned by the
# commented-out lines below in the original -- here `x` still holds the last
# confusion matrix when passed to nearest_neighbors/ml.fit; confirm intent.
out1 = logreg.predict(train)
out2 = logreg.predict(test)

print("Training set score: %f" % logreg.score(train, train_out))
x = [[0, 0, 0, 0] for i in range(4)]
for key, val in enumerate(out1):
    x[int(val)][int(train_out[key])] += 1
print x

print("Test set score: %f" % logreg.score(test, test_out))
x = [[0, 0, 0, 0] for i in range(4)]
for key, val in enumerate(out2):
    x[int(val)][int(test_out[key])] += 1
print x

mls = [
    LMNN(),
    # ITML_Supervised(num_constraints=200),
    # SDML_Supervised(num_constraints=200),
    # LSML_Supervised(num_constraints=200),
]
# x = train
# y = train_out
print "KNN and Logreg without Coordinate Transfrom"
nearest_neighbors(x,y)
print "KNN and Logreg with LMNN transform"
for ax_num, ml in enumerate(mls, start=3):
    print "Fitting"
    ml.fit(x, y)
# data = pd.read_csv("/Users/kueen/Downloads/trajectory dataset/animal movement/vertebrate taxa/Movement syndromes across vertebrate taxa (data from Abrahms et al. 2017)-gps.csv") # id = data["individual-local-identifier"].values # id = list(set(id)) # for i in range(len(id)): # tmp = data[data["individual-local-identifier"]==id[i]] # tmp.to_csv("/Users/kueen/Downloads/trajectory dataset/animal movement/vertebrate taxa/" # + id[i] + ".csv") # data = pd.read_csv("/Users/kueen/Downloads/trajectory dataset/animal movement/vertebrate taxa/Movement syndromes across vertebrate taxa (data from Abrahms et al. 2017)-gps.csv" # , usecols=["timestamp", "location-long", "location-lat", "individual-local-identifier"]) # data = data[data["comments"]=="LI06_LH364"] # coors = data[["location-long", "location-lat"]].values # plt.plot(coors[:,0],coors[:,1]) # plt.show() # file_list = gci("/Users/kueen/Downloads/trajectory dataset/animal movement/vertebrate taxa/") # file_list = [x for x in file_list # if x.split("/")[7].split(" ")[0] == "jackal" or # x.split("/")[7].split(" ")[0] == "elephant" or # x.split("/")[7].split(" ")[0] == "springbok"] # for i in range(len(file_list)): # data = pd.read_csv(file_list[i], usecols=["timestamp", "location-long", "location-lat"]) # data = data.dropna(axis=0, how="any") # coors = data[["location-long", "location-lat"]].values # if file_list[i].split("/")[7].split(" ")[0] == "jackal": # plt.plot(coors[:,0],coors[:,1],color="blue") # elif file_list[i].split("/")[7].split(" ")[0] == "elephant": # plt.plot(coors[:,0],coors[:,1],color="red") mat = np.loadtxt("truck_sim") lmnn = LMNN(k=2, min_iter=100, learn_rate=1e-6) label = np.zeros(mat.shape[0]) print(label)
def test_lmnn(self):
    """The learnt metric must equal L^T L for the fitted transformer L."""
    model = LMNN(k=5, learn_rate=1e-6, verbose=False)
    model.fit(self.X, self.y)
    learnt = model.transformer_
    expected_metric = learnt.T.dot(learnt)
    assert_array_almost_equal(expected_metric, model.metric())
Result_of_Upper = np.zeros([len(datasets), 2]) #LMNN和非度量学习 Result_of_acc_ave = np.zeros([len(datasets) * 2, len(classifiers)]) Result_of_acc_std = np.zeros([len(datasets) * 2, len(classifiers)]) for i in range(len(datasets)): print(datasets[i]) new_path = os.path.join('.\data', datasets[i]) Data_Origi, DataLabel, n_samples, n_attr, n_class = PF.Load_Data(new_path) #归一化处理 scaler = MinMaxScaler() scaler.fit(Data_Origi) Data_Origi = scaler.transform(Data_Origi) for l in range(2): if l == 0: #度量学习 lmnn = LMNN(k=5, learn_rate=1e-6) lmnn.fit(Data_Origi, DataLabel) Data_trans = lmnn.transform(Data_Origi) else: Data_trans = Data_Origi #同质化融合 Dis_Matrix = PF.Calcu_Dis(Data_trans) CompareMatrix = PF.CompareNoiseLabel(Dis_Matrix, DataLabel) Cluster_Checked = PF.Affinity_propagatio_Modify(CompareMatrix) lap_ratio = PF.Count(Cluster_Checked, set_vlaue, n_samples) Result_of_Upper[i, l] = 1 - lap_ratio for j in range(len(classifiers)): print(classifiers[j]) clf = classifiers[j] scores = cross_val_score(clf, Data_trans, DataLabel, cv=cv)
# K-fold cross-validation: compare k-NN accuracy with and without an LMNN
# transform of the training features.
# NOTE(review): fragment -- kfold, knn_k, fold_cnt, ac_list and ac_list_orig
# are initialised above this chunk.
for train_index, valid_index in kfold.split(total_train_features, total_train_labels):
    train_features = total_train_features[train_index]
    train_labels = total_train_labels[train_index]
    valid_features = total_train_features[valid_index]
    valid_labels = total_train_labels[valid_index]
    fold_cnt += 1
    print("k:", knn_k)
    print("fold:", fold_cnt)
    print("train features shape:", train_features.shape)
    print("train labels shape:", train_labels.shape)
    print("valid features shape:", valid_features.shape)
    print("valid labels shape:", valid_labels.shape)
    lmnn = LMNN(k=5)
    transformed_features = lmnn.fit_transform(train_features, train_labels)
    # k-NN in the learnt space vs. k-NN on the raw features
    neigh = KNeighborsClassifier(n_neighbors=knn_k)
    neigh.fit(transformed_features, train_labels)
    neigh_orig = KNeighborsClassifier(n_neighbors=knn_k)
    neigh_orig.fit(train_features, train_labels)
    predict = neigh.predict(lmnn.transform(valid_features))
    predict_orig = neigh_orig.predict(valid_features)
    accuracy = metrics.accuracy_score(valid_labels, predict)
    accuracy_orig = metrics.accuracy_score(valid_labels, predict_orig)
    print("accuracy after metric learning:{}".format(accuracy))
    print("accuracy before metric learning:{}".format(accuracy_orig))
    ac_list.append(accuracy)
    ac_list_orig.append(accuracy_orig)
final_train_accuracy = np.mean(ac_list)
# Evaluate the current Mahalanobis matrix M on the query/gallery split,
# then learn a new M with LMNN on the PCA-reduced training data and
# re-evaluate with the same protocol.
rank_accuracies, mAP = evaluate_metric(X_query_pca, camId_query, y_query,
                                       X_gallery_pca, camId_gallery, y_gallery,
                                       metric ='mahalanobis', parameters = M)
rank_accuracies_l_2.append(rank_accuracies)
mAP_l_2.append(mAP)
metric_l_2.append('Learnt Mahalanobis (Red. Set)')

# In[24]:

from metric_learn import LMNN

lmnn = LMNN(k=3, learn_rate=1e-6, max_iter=50)
lmnn.fit(X_train_pca, y_train)
M = lmnn.metric()  # learnt Mahalanobis matrix (older metric-learn API)
print ('Metric learnt')
rank_accuracies, mAP = evaluate_metric(X_query_pca, camId_query, y_query,
                                       X_gallery_pca, camId_gallery, y_gallery,
                                       metric ='mahalanobis', parameters = M)
rank_accuracies_l_2.append(rank_accuracies)
def __init__(self, k=1):
    """Store the neighbour count and create the backing LMNN estimator."""
    self.k = k
    self.distanceEstimator = LMNN(k=self.k)
def __init__(self, n_neighbors=3):
    """k-NN estimator that additionally owns an LMNN metric learner."""
    super(GeoLMNN, self).__init__(n_neighbors=n_neighbors)
    # NOTE(review): n_neighbors is passed positionally; depending on the
    # metric-learn version, LMNN's first constructor parameter may be `init`
    # rather than the neighbour count -- confirm this binds as intended
    # (k=n_neighbors / n_neighbors=n_neighbors would be explicit).
    self.lmnn = LMNN(n_neighbors)
p.scatter(prototype[:, 0], prototype[:, 1], s=60,
          c=_tango_color(prototype_label), marker='.')
p.axis('equal')

# Load the segmentation dataset: first CSV field is the class label, the
# remaining fields are numeric features.
y = []
x = []
with open('segmentation.data') as f:
    for line in f:
        v = line.split(',')
        y.append(v[0])
        x.append(v[1:])
x = np.asarray(x, dtype='float64')
y = np.asarray(y)

# LMNN-transform the data and plot the first two transformed dimensions.
lmnn = LMNN(k=5, learn_rate=1e-6)
lmnn.fit(x, y)
x_t = lmnn.transform(x)
p1 = plt.subplot(231)
p1.scatter(x_t[:, 0], x_t[:, 1], c=_to_tango_colors(y, 0))
p1.axis('equal')
p1.set_title('LMNN')

# GLVQ
glvq = GlvqModel()
glvq.fit(x, y)
p2 = plt.subplot(232)
p2.set_title('GLVQ')
plot(PCA().fit_transform(x), y, glvq.predict(x), glvq.w_, glvq.c_w_, p2)
def test_lmnn(self):
    """components_^T components_ must equal the learnt Mahalanobis matrix."""
    model = LMNN(k=5, learn_rate=1e-6, verbose=False)
    model.fit(self.X, self.y)
    components = model.components_
    assert_array_almost_equal(components.T.dot(components),
                              model.get_mahalanobis_matrix())
# Shuffle the data, truncate to MAX samples, set up the classifiers being
# compared, and start 10-fold cross-validation.
# NOTE(review): Python 2 print statement; old sklearn KFold(n, n_folds=...)
# signature; the fragment's loop body continues beyond this chunk.
index = np.random.permutation(len(x))
x = x[index]
y = y[index]
# x -= x.min(1).reshape(-1, 1)
# x /= x.max(1).reshape(-1, 1)

# truncate
x = x[:MAX, :]
y = y[:MAX]
print 'MAX={}'.format(MAX)
sys.stdout.flush()

# training
svm = NuSVC(kernel='linear')  # linear, poly, rbf, NuSVC
lmnn = LMNN(k=5, learn_rate=1e-7, max_iter=400)
gnb = GaussianNB()
mnb = MultinomialNB(alpha=0.0)
bnb = BernoulliNB(alpha=0.0)

# per-fold accuracy records for each classifier
svmrec = []
lmnnrec = []
gnbrec = []
mnbrec = []
bnbrec = []
for train_index, test_index in KFold(len(x), n_folds=10, shuffle=True):
    train_x, test_x = x[train_index], x[test_index]
    train_y, test_y = y[train_index], y[test_index]
    gnb.fit(train_x, train_y)
def test_lmnn(self):
    """L^T L from the fitted transformer must equal the Mahalanobis matrix."""
    model = LMNN(k=5, learn_rate=1e-6, verbose=False)
    model.fit(self.X, self.y)
    linear_map = model.transformer_
    assert_array_almost_equal(linear_map.T.dot(linear_map),
                              model.get_mahalanobis_matrix())
# Estimator/dataset-builder pairs grouped by supervision type, plus parallel
# id lists (estimator class names) used as parametrisation ids.
quadruplets_learners = [(LSML(), build_quadruplets)]
ids_quadruplets_learners = list(
    map(lambda x: x.__class__.__name__,
        [learner for (learner, _) in quadruplets_learners]))

pairs_learners = [
    (ITML(), build_pairs),
    (MMC(max_iter=2), build_pairs),  # max_iter=2 for faster
    (SDML(), build_pairs),
]
ids_pairs_learners = list(
    map(lambda x: x.__class__.__name__,
        [learner for (learner, _) in pairs_learners]))

classifiers = [(Covariance(), build_classification),
               (LFDA(), build_classification),
               (LMNN(), build_classification),
               (NCA(), build_classification),
               (RCA(), build_classification),
               (ITML_Supervised(max_iter=5), build_classification),
               (LSML_Supervised(), build_classification),
               (MMC_Supervised(max_iter=5), build_classification),
               (RCA_Supervised(num_chunks=10), build_classification),
               (SDML_Supervised(), build_classification)]
ids_classifiers = list(
    map(lambda x: x.__class__.__name__,
        [learner for (learner, _) in classifiers]))

regressors = [(MLKR(), build_regression)]
ids_regressors = list(
    map(lambda x: x.__class__.__name__,
        [learner for (learner, _) in regressors]))
def lmnn_apply(x_input, y_input, k_input):
    """Discretise y via classify_y, then return the LMNN-transformed x."""
    labels = classify_y(y_input)
    model = LMNN(k=k_input, learn_rate=1e-6)
    return model.fit_transform(x_input, labels)
class MLPipe:
    """Grid-searched sklearn Pipeline wrapper (Titanic models).

    Holds one shared Pipeline template plus a per-model hyper-parameter
    grid; fitted GridSearchCV results are cached to disk through the
    save_model/load_model helpers defined elsewhere in this file.
    """

    # Pipeline template: scaling -> RF-driven feature selection ->
    # optional metric learning -> classifier. Every stage is overridden
    # by the per-model grids below.
    pipe = Pipeline([('scaling', StandardScaler()),
                     ('feature_selection',
                      SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42),
                                      threshold='median')),
                     ('metric_learning', None),
                     ('classifier', SVC())])
    # Cache-file pattern; '{}' is filled with the model name in __init__.
    save_path = './titanic/pipe_{}.bin'
    # One GridSearchCV parameter grid (list of sub-grids) per model key.
    feature_selection_param_grid = {
        'SVC': [
            {
                'scaling': [StandardScaler(), None],
                'metric_learning': [None, LMNN()],
                'feature_selection': [SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42),
                                                      threshold='median'), None],
                # 'feature_selection__estimator': [RandomForestClassifier(n_estimators=100, random_state=42), SVC(C=1000), KNeighborsClassifier()],
                'classifier': [SVC()],
                'classifier__kernel': ['rbf'],
                'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
                'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100]
            },
            {
                'scaling': [StandardScaler(), None],
                'metric_learning': [None, LMNN()],
                'feature_selection': [SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42),
                                                      threshold='median'), None],
                'classifier': [SVC()],
                'classifier__kernel': ['linear'],
                'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
            }
        ],
        'rfc': [
            {
                #'scaling': [StandardScaler(), None],
                'scaling': [None],
                'metric_learning': [None, LMNN()],
                'feature_selection': [SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42),
                                                      threshold='median'), None],
                'classifier': [RandomForestClassifier()],
                'classifier__n_estimators': [10, 25, 50, 75, 100],
                'classifier__max_depth': [None, 5, 10, 25],
                'classifier__min_samples_split': [5, 10, 15]
            }
        ],
        'knn': [
            {
                'scaling': [StandardScaler(), MinMaxScaler(), None],
                'metric_learning': [None, LMNN(), ITML_Supervised(num_constraints=200)],
                'feature_selection': [
                    SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42),
                                    threshold='median'), None],
                'classifier': [KNeighborsClassifier()],
                'classifier__n_neighbors': [2, 3, 4, 5],
                'classifier__algorithm': ['auto', 'ball_tree', 'kd_tree']
            }
        ],
        'dt': [
            {
                'scaling': [StandardScaler(), MinMaxScaler(), None],
                'metric_learning': [None],
                'feature_selection': [
                    SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42),
                                    threshold='median'), None],
                'classifier': [DecisionTreeClassifier()],
                'classifier__criterion': ['gini', 'entropy'],
                'classifier__max_features': ['auto', 'sqrt', 'log2'],
                'classifier__max_depth': [None, 5, 10, 15]
            }
        ],
        'gbc': [
            {
                'scaling': [StandardScaler(), None],
                'metric_learning': [None, LMNN()],
                'feature_selection': [
                    SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42),
                                    threshold='median'), None],
                'classifier': [GradientBoostingClassifier()],
                'classifier__loss': ['deviance'],
                'classifier__learning_rate': [0.1, 1, 10],
                'classifier__n_estimators': [10, 50, 100],
                'classifier__criterion': ['friedman_mse'],
                'classifier__max_features': ['auto'],
                'classifier__max_depth': [None, 2, 5, 10, 15, 25],
                'classifier__min_samples_split': [5, 10, 15]
            }
        ],
        'xgb': [
            {
                'scaling': [StandardScaler()],#, None],
                'metric_learning': [None],#, LMNN(), ITML_Supervised(num_constraints=200)],
                'feature_selection': [None],
                'classifier': [xgb.XGBClassifier()],
                'classifier__n_estimators': [500, 1000, 2000],
                'classifier__max_depth': [4, 6, 8, 10],
                'classifier__min_child_weight': [1, 2, 3],
                'classifier__gamma': [0.4, 0.6, 0.8, 0.9, 1],
                'classifier__subsample': [0.4, 0.6, 0.8, 1.0],
                'classifier__colsample_bytree': [0.4, 0.6, 0.8, 1.0]
            }
        ]
    }

    def __init__(self, model_name: str):
        """Select the parameter grid and cache paths for *model_name*.

        Raises KeyError if model_name is not a key of
        feature_selection_param_grid ('SVC', 'rfc', 'knn', 'dt', 'gbc', 'xgb').
        """
        print('model_name: %s' % model_name)
        self._model_name = model_name
        self._param_grid = MLPipe.feature_selection_param_grid[model_name]
        self._save_path = MLPipe.save_path.format(model_name)
        self._save_best_path = self._save_path + '-best'
        self._model = None
        self._pipe = MLPipe.pipe

    def fit_model(self, train_X: list, train_y: list):
        """Load a cached GridSearchCV, or run one and cache it; keep the best estimator."""
        model = load_model(self._save_path)
        if not model:
            # create model, if not loading file
            grid_search = GridSearchCV(self._pipe, self._param_grid, cv=5, n_jobs=-1)
            grid_search.fit(train_X, train_y)
            save_model(self._save_path, grid_search)
            model = grid_search
        print('best score: {:.2f}'.format(model.best_score_))
        print('best estimator \n{}'.format(model.best_estimator_))
        self._model = model.best_estimator_

    def predict(self, test_X: list) -> list:
        """Predict with the selected best estimator, casting labels to int."""
        test_y = self._model.predict(test_X).astype(int)
        return test_y

    def get_model(self) -> dict:
        # NOTE(review): despite the `-> dict` hint this returns the fitted
        # best estimator object — confirm before relying on the annotation.
        return self._model

    def save_best_model(self):
        """Persist the current best estimator to the '-best' cache path."""
        save_model(self._save_best_path, self._model)

    def load_best_model(self):
        """Restore the best estimator from the '-best' cache path."""
        self._model = load_model(self._save_best_path)

    def get_cv_failure_data(self, train_X: list, train_y: list):
        """Return indices of training rows misclassified during a 5-fold CV pass.

        NOTE(review): this re-fits self._model in place per fold, mutating
        the stored estimator as a side effect.
        """
        ret_index = np.array([])
        evaluate_model = self._model
        kf = KFold(n_splits=5)
        for train_index, test_index in kf.split(train_X):
            evaluate_model.fit(train_X[train_index], train_y[train_index])
            evaluate_y = evaluate_model.predict(train_X[test_index])
            correct_eval_y = train_y[test_index]  # computed but unused below
            ret_index = np.concatenate((ret_index,
                                        np.array(test_index)[evaluate_y != train_y[test_index]]))
        return list(ret_index.astype(int))
loader = DataLoader(dataset=trainset, batch_size=args.batch_size, shuffle=False,
                    num_workers=args.num_workers, pin_memory=True)

# Embed the whole training set with the encoder, gradients disabled.
embs = []
labels = []
use_cuda = torch.cuda.is_available()
with torch.no_grad():
    for i, batch in tqdm(enumerate(loader, 1), total=len(loader), desc='embedding'):
        data, label = batch[0], batch[1]
        if use_cuda:
            data = data.cuda()
        embs.append(model.encoder(data))
        labels.append(label)
embs = torch.cat(embs).cpu().numpy()
labels = torch.cat(labels).numpy()

# Learn an LMNN metric on the embeddings and persist it.
lmnn = LMNN(verbose=True)
print('fitting data....')
lmnn.fit(embs, labels)
print('fitting data finished.')
directory = 'checkpoints/lmnn/'
if not osp.exists(directory):
    os.makedirs(directory)
joblib.dump(lmnn, osp.join(directory, '%s.pkl' % args.filename))
# --- body of an enclosing `for c in ...` loop that begins above this chunk ---
    # Accumulate a 100-dim centroid over this concept's in-vocabulary seeds,
    # stacking their embeddings into the shared training data/target arrays.
    concept_info[c]['vector_centroid'] = np.zeros(100)
    for item in concept_json[c]:
        if item in model.vocab:
            tar = list(concept_json.keys()).index(c)  # class id = concept's key position
            data['data'] = np.row_stack((data['data'], model[item]))
            data['target'] = np.append(data['target'], tar)
            concept_info[c]['seeds_invocab'].append(item)
            concept_info[c]['vector_centroid'] += model[item]
    # Mean of the accumulated seed vectors.
    # NOTE(review): divides by len(seeds_invocab) — ZeroDivisionError if a
    # concept has no in-vocabulary seed; confirm inputs guarantee at least one.
    concept_info[c]['vector_centroid'] /= len(concept_info[c]['seeds_invocab'])

# After the concept loop: fit an LMNN metric on the stacked embeddings.
X = data['data']
Y = data['target']
print("first training")
print("================================================")
# NOTE(review): the positional `None` before k=5 looks suspicious — in most
# metric_learn versions LMNN's first parameter is k, which would conflict with
# the k=5 keyword; verify against the installed library version.
lmnn = LMNN(None, k=5, learn_rate=1e-6)
lmnn.fit(X, Y)
print("train finish")
print("================================================")
# for word in worddict:
#     score = {}
#     for c in concept_info:
#         flag = False
#         sim1 = cosine_similarity([model[word], concept_info[c]['vector_centroid']])[0, 1]
#         sim2 = 0
#         for i in range(len(concept_info[c]['seeds_invocab'])):
#             if word == concept_info[c]['seeds_invocab'][i]:
#                 flag = True
#                 continue
#             dis = lmnn.score_pairs([[model[word], model[concept_info[c]['seeds_invocab'][i]]]])[0]
# (tail of a compute_k_mean(...) call that opens above this chunk)
gallery_labels)
# Compute PCA result
print("\n-----PCA------")
# Project query/gallery features onto the top M=500 principal components
# learned from the training features, then cluster.
pca = PCA(original_train_features, M=500)
pca.fit()
pca_query_features = pca.project(query_features)
pca_gallery_features = pca.project(gallery_features)
compute_k_mean(num_of_clusters, pca_query_features, pca_gallery_features,
               gallery_labels)
# Compute LMNN (Large Margin Nearest Neighbour) Learning
print("\n-----LMNN------")
# Learn the metric on raw (non-PCA) training features, then evaluate
# clustering in the transformed space.
lmnn = LMNN(k=5, max_iter=20, use_pca=False, convergence_tol=1e-6,
            learn_rate=1e-6, verbose=True)
lmnn.fit(original_train_features, original_train_labels)
transformed_query_features = lmnn.transform(query_features)
transformed_gallery_features = lmnn.transform(gallery_features)
compute_k_mean(num_of_clusters, transformed_query_features,
               transformed_gallery_features, gallery_labels)
# Compute PCA_LMNN Learning
print("\n-----PCA_LMNN-----")
# NOTE(review): this LMNN(...) call continues beyond the end of this chunk.
lmnn = LMNN(k=5, max_iter=20, use_pca=False, convergence_tol=1e-6, learn_rate=1e-6,
def test_lmnn(self):
    """LMNN must pass scikit-learn's estimator-API compliance checks."""
    estimator = LMNN()
    check_estimator(estimator)
#Predict on transformed dataset using KNN with euclidean distance pred = knn.knn(testX_lmnn.tolist()[0], X_lmnn.tolist(), y, k=k, edit=False) #print(X_lmnn) #print(testX_lmnn) return pred #%% #train LMNN model freeman_hist = np.array(freeman_hist) freeman_labels = np.array(freeman_labels) print('hist') start = timeit.default_timer() lmnn_hist = LMNN(k=5, learn_rate=1e-6).fit(freeman_hist, freeman_labels) end = timeit.default_timer() print('hist train time') print(end - start) ''' print('dist') start = timeit.default_timer() lmnn_dist = LMNN(k=5, learn_rate=1e-6).fit(edit_dist, freeman_labels) end = timeit.default_timer() print('edit train time') print(end-start) ''' #%% #Test full iteration pred_label_edit = [] print('edit lmnn')
################################################ #%% print("here") import numpy as np X_train = np.array(X_train) Y_train = np.array(Y_train) X_test = np.array(X_test) Y_test = np.array(Y_test) ## tuning here ... scores = [] #for i in range(1,5): #print("current k is ",i) lmnn2 = LMNN(k=5, learn_rate=1e-6) #.fit(X_train,Y_train) print("here2") print(lmnn2) lmnn2 = lmnn2.fit(X_train, Y_train) print("hi") X_train2 = lmnn2.transform(X_train) X_test2 = lmnn2.transform(X_test) kn2 = KNeighborsClassifier(n_neighbors=40).fit(X_train2, Y_train) predict = kn2.predict(X_test2) lmnn_acc = accuracy_score(Y_test, predict) print("lmnn accuracy is ", lmnn_acc) #scores.append(lmnn_acc) #print("the scores are ",scores) #k=np.argmax(scores)+1 #%%using kernal pca
def lmnn(x_train, y_train, x_test):
    """Fit LMNN on the training split and return the transformed test split."""
    # Local renamed so it no longer shadows this function's own name.
    metric = LMNN(max_iter=50, k=9, verbose=True)
    print("It is")
    metric.fit(x_train, y_train)
    print("done")
    return metric.transform(x_test)
# #### Joint: Post-processing (Clustering) ################ mode = np.load('posterior_mode_multiply.npz')['joint_posterior_mode'] mode = mode[renewed, :] print(mode.shape) from abcpy.statistics import Identity stat = Identity(degree=4, cross=True) mode = stat.statistics([[mode[i, :]] for i in range(mode.shape[0])]) print(mode.shape) label = [1 for i in range(16)] + [2 for i in range(16) ] + [3 for i in range(16)] print(label) from metric_learn import LMNN metric = LMNN(init='auto', k=6, min_iter=10000, max_iter=50000, convergence_tol=1e-6, learn_rate=1e-10, regularization=.5, n_components=2) metric.fit(mode, label) L = metric.components_ np.savez('L_all_3_cross_parameters.npz', L=L) L = np.load('L_all_3_cross_parameters.npz')['L'] mode_lmnn = mode.dot(L.T) print(mode_lmnn.shape) import pylab as plt plt.figure() plt.plot(mode_lmnn[:16, 0], mode_lmnn[:16, 1],
def test_lmnn(self):
    """Legacy-API check: transformer() L must satisfy metric() == L.T @ L."""
    learner = LMNN(k=5, learn_rate=1e-6, verbose=False)
    learner.fit(self.X, self.y)
    components = learner.transformer()
    assert_array_almost_equal(components.T.dot(components), learner.metric())