def test_pipeline_equivalency():
    """Check that the LMNN pipeline agrees with a manual LMNN + kNN chain.

    The same parameters are used to fit a standalone
    ``LargeMarginNearestNeighbor`` and the pipeline returned by
    ``make_lmnn_pipeline``. The learned components must be (almost) equal
    and the test-set scores must be identical.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        iris_data, iris_target, test_size=0.3)

    # init='identity' is used to ensure reproducibility of both fits
    k = 3
    shared_params = dict(n_neighbors=3, max_iter=10, init='identity',
                         random_state=42)

    standalone = LargeMarginNearestNeighbor(**shared_params)
    standalone.fit(X_train, y_train)

    pipeline = make_lmnn_pipeline(**shared_params)
    pipeline.fit(X_train, y_train)

    # The pipeline's `lmnn` step must have learned the same transformation
    assert_array_almost_equal(standalone.components_,
                              pipeline.named_steps.lmnn.components_)

    # Manual chain: kNN fitted on the standalone LMNN embedding
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(standalone.transform(X_train), y_train)
    manual_score = classifier.score(standalone.transform(X_test), y_test)

    pipeline_score = pipeline.score(X_test, y_test)
    assert manual_score == pipeline_score
def test_callback():
    """Check validation and invocation of the ``callback`` parameter.

    A non-callable ``callback`` must make ``fit`` raise ``ValueError``.
    A callable one must actually be invoked during fitting, which is
    detected through the text it prints to stdout.
    """
    # A string is not callable, so fitting must be rejected
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, callback='my_cb')
    assert_raise_message(ValueError, '`callback` is not callable.',
                         lmnn.fit, iris_data, iris_target)

    max_iter = 10

    def my_cb(transformation, n_iter):
        rem_iter = max_iter - n_iter
        print('{} iterations remaining...'.format(rem_iter))

    # Capture stdout to assert that my_cb is called. Everything that runs
    # while stdout is redirected lives inside the try block, so stdout is
    # restored even if constructing the estimator fails.
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        lmnn = LargeMarginNearestNeighbor(n_neighbors=3, callback=my_cb,
                                          max_iter=max_iter, verbose=1)
        lmnn.fit(iris_data, iris_target)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    # check output
    assert '{} iterations remaining...'.format(max_iter - 1) in out
def test_singleton_class():
    """Check how fitting copes with classes that have a single sample.

    With one singleton class (and two regular ones) fitting is simply run.
    With only one non-singleton class left, ``fit`` must raise ``ValueError``.
    """
    X, y = iris_data, iris_target

    # Case 1: one singleton class
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3,
                                              stratify=y)
    singleton_class = 1
    members = np.where(y_tr == singleton_class)[0]
    # Move every sample of the class to label 2, then restore a single one
    y_tr[members] = 2
    y_tr[members[0]] = singleton_class

    estimator = LargeMarginNearestNeighbor(n_neighbors=3, max_iter=30)
    estimator.fit(X_tr, y_tr)

    # Case 2: one non-singleton class
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3,
                                              stratify=y)
    for label in (1, 2):
        members = np.where(y_tr == label)[0]
        # Merge the class into label 0 and keep exactly one sample of it
        y_tr[members] = 0
        y_tr[members[0]] = label

    estimator = LargeMarginNearestNeighbor(n_neighbors=3, max_iter=30)
    expected_msg = ('LargeMarginNearestNeighbor needs at least 2 '
                    'non-singleton classes, got 1.')
    assert_raise_message(ValueError, expected_msg, estimator.fit, X_tr, y_tr)
def test_verbose():
    """Check the stdout output of ``fit`` with and without ``verbose``.

    With ``verbose=1`` the progress messages must appear on stdout; with
    the default settings nothing may be printed at all.
    """

    def captured_fit_output(**params):
        """Fit on iris with ``params`` and return everything sent to stdout.

        The estimator is built inside the ``try`` block so that stdout is
        restored even if construction (and not only ``fit``) fails.
        """
        old_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            lmnn = LargeMarginNearestNeighbor(n_neighbors=3, **params)
            lmnn.fit(iris_data, iris_target)
        finally:
            out = sys.stdout.getvalue()
            sys.stdout.close()
            sys.stdout = old_stdout
        return out

    # assert there is proper output when verbose = 1
    out = captured_fit_output(verbose=1)
    assert "[LargeMarginNearestNeighbor]" in out
    assert "Finding principal components" in out
    assert "Finding the target neighbors" in out
    assert "Computing static part of the gradient" in out
    assert "Training took" in out

    # assert by default there is no output (verbose=0)
    out = captured_fit_output()
    assert out == ''
def test_n_components():
    """Check validation of ``n_components`` against ``init`` and the data.

    Two inconsistent settings must make ``fit`` raise ``ValueError`` with a
    specific message; a setting smaller than the data dimensionality must
    fit without error.
    """
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]
    n_features = X.shape[1]
    init = np.random.rand(n_features - 1, 3)

    # n_components = X.shape[1] != transformation.shape[0]
    n_components = n_features
    mismatch_msg = ('The preferred embedding dimensionality '
                    '`n_components` ({}) does not match '
                    'the output dimensionality of the given '
                    'linear transformation `init` ({})!'
                    ).format(n_components, init.shape[0])
    estimator = LargeMarginNearestNeighbor(init=init,
                                           n_components=n_components)
    assert_raise_message(ValueError, mismatch_msg, estimator.fit, X, y)

    # n_components > X.shape[1]
    n_components = n_features + 2
    too_large_msg = ('The preferred embedding dimensionality '
                     '`n_components` ({}) cannot be greater '
                     'than the given data dimensionality ({})!'
                     ).format(n_components, n_features)
    estimator = LargeMarginNearestNeighbor(init=init,
                                           n_components=n_components)
    assert_raise_message(ValueError, too_large_msg, estimator.fit, X, y)

    # n_components < X.shape[1]
    estimator = LargeMarginNearestNeighbor(n_components=2, init='identity')
    estimator.fit(X, y)
def test_max_impostors():
    """Fit with ``max_impostors=1`` for both ``impostor_store`` settings.

    The test only requires that fitting completes for 'list' and 'sparse'.
    """
    for store in ('list', 'sparse'):
        estimator = LargeMarginNearestNeighbor(n_neighbors=3,
                                               max_impostors=1,
                                               impostor_store=store)
        estimator.fit(iris_data, iris_target)
def test_store_opt_result():
    """Check that ``store_opt_result=True`` exposes ``opt_result_``.

    The stored ``opt_result_.x`` must hold as many entries as a square
    transformation over the input features.
    """
    X, y = iris_data, iris_target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    estimator = LargeMarginNearestNeighbor(n_neighbors=3, max_iter=5,
                                           store_opt_result=True)
    estimator.fit(X_train, y_train)

    n_features = X.shape[1]
    flat_solution = estimator.opt_result_.x
    assert flat_solution.size == n_features ** 2
def test_neighbors_params():
    """Check that ``neighbors_params`` influences the learned solution.

    Fitting with a brute-force Hamming neighbor search must produce
    components that differ from those of a fit with default settings.
    """
    from scipy.spatial.distance import hamming

    custom_search = {'algorithm': 'brute', 'metric': hamming}
    with_hamming = LargeMarginNearestNeighbor(n_neighbors=3,
                                              neighbors_params=custom_search)
    with_hamming.fit(iris_data, iris_target)
    components_hamming = with_hamming.components_

    with_defaults = LargeMarginNearestNeighbor(n_neighbors=3)
    with_defaults.fit(iris_data, iris_target)
    components_default = with_defaults.components_

    assert not np.allclose(components_hamming, components_default)
def test_same_lmnn_parallel():
    """Check that refitting with ``n_jobs=3`` gives the same components."""
    X, y = datasets.make_classification(n_samples=30, n_features=5,
                                        n_redundant=0, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    estimator = LargeMarginNearestNeighbor(n_neighbors=3)

    # First fit with the default n_jobs
    estimator.fit(X_train, y_train)
    components_default = estimator.components_

    # Same estimator, same data, now with n_jobs=3
    estimator.set_params(n_jobs=3)
    estimator.fit(X_train, y_train)
    components_parallel = estimator.components_

    assert_array_almost_equal(components_default, components_parallel)
def test_impostor_store():
    """Check that both ``impostor_store`` settings give the same solution."""
    k = 3

    def fitted_components(store):
        # Fit from the identity initialisation with the given storage mode
        estimator = LargeMarginNearestNeighbor(n_neighbors=k,
                                               init='identity',
                                               impostor_store=store)
        estimator.fit(iris_data, iris_target)
        return estimator.components_

    components_list = fitted_components('list')
    components_sparse = fitted_components('sparse')

    assert_array_almost_equal(
        components_list, components_sparse,
        err_msg='Toggling `impostor_store` results in '
                'a different solution.')
def test_neighbors_iris():
    """Sanity checks for LMNN followed by kNN on the iris dataset.

    With ``n_neighbors=1`` a kNN classifier fitted on the transformed
    training data must reproduce the training labels exactly. After
    refitting with ``n_neighbors=9`` the training accuracy in the newly
    learned embedding must exceed 0.95.
    """
    lmnn = LargeMarginNearestNeighbor(n_neighbors=1)
    lmnn.fit(iris_data, iris_target)
    knn = KNeighborsClassifier(n_neighbors=lmnn.n_neighbors_)
    LX = lmnn.transform(iris_data)
    knn.fit(LX, iris_target)
    y_pred = knn.predict(LX)

    assert_array_equal(y_pred, iris_target)

    lmnn.set_params(n_neighbors=9)
    lmnn.fit(iris_data, iris_target)
    knn = KNeighborsClassifier(n_neighbors=lmnn.n_neighbors_)
    # Re-embed the data with the refitted transformation; reusing the
    # embedding from the first fit would leave the second fit untested.
    LX = lmnn.transform(iris_data)
    knn.fit(LX, iris_target)

    assert knn.score(LX, iris_target) > 0.95
def test_warm_start_validation():
    """Check that a warm-started refit rejects a changed input dimension.

    After fitting on 5-feature data with ``warm_start=True``, fitting on
    4-feature data must raise ``ValueError`` with the expected message.
    """
    X, y = datasets.make_classification(n_samples=30, n_features=5,
                                        n_classes=4, n_redundant=0,
                                        n_informative=5, random_state=0)

    estimator = LargeMarginNearestNeighbor(warm_start=True, max_iter=5)
    estimator.fit(X, y)

    # Same problem setup, but with one feature fewer
    X_less_features, y = datasets.make_classification(
        n_samples=30, n_features=4, n_classes=4, n_redundant=0,
        n_informative=4, random_state=0)

    expected_msg = ('The new inputs dimensionality ({}) does not '
                    'match the input dimensionality of the '
                    'previously learned transformation ({}).'
                    ).format(X_less_features.shape[1],
                             estimator.components_.shape[1])
    assert_raise_message(ValueError, expected_msg,
                         estimator.fit, X_less_features, y)
def test_random_state():
    """Check that ``random_state`` controls impostor sampling.

    ``max_impostors`` is set low to force sampling. Two fits with the same
    ``random_state`` must yield the same transformation, while a different
    ``random_state`` must yield a different one.
    """
    X, y = iris_data, iris_target

    # init='identity' is used to ensure reproducibility
    base_params = dict(n_neighbors=3, max_impostors=5, random_state=1,
                       max_iter=10, init='identity')

    first = LargeMarginNearestNeighbor(**base_params)
    first.fit(X, y)
    transformation_1 = first.components_

    second = LargeMarginNearestNeighbor(**base_params)
    second.fit(X, y)
    transformation_2 = second.components_

    # This assertion fails on 32bit systems if init='pca'
    assert_allclose(transformation_1, transformation_2)

    # Only the seed differs for the third fit
    reseeded_params = dict(base_params, random_state=2)
    third = LargeMarginNearestNeighbor(**reseeded_params)
    third.fit(X, y)
    transformation_3 = third.components_

    assert not np.allclose(transformation_2, transformation_3)
def test_neighbors_digits():
    """Sanity check on the digits dataset with uint8 versus float input.

    The 'brute' algorithm has been observed to fail if the input dtype is
    uint8 due to overflow in distance calculations. The transformer is
    fitted once on the uint8 training data; the kNN score obtained from
    uint8 inputs must equal the score obtained from float casts of them.
    """
    X = digits_data.astype('uint8')
    y = digits_target

    # First 80% of the samples for training, the remainder for testing
    n_samples, _ = X.shape
    split_at = int(n_samples * 0.8)
    train_idx = np.arange(0, split_at)
    test_idx = np.arange(split_at, n_samples)
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    k = 1
    lmnn = LargeMarginNearestNeighbor(n_neighbors=k, max_iter=30)
    lmnn.fit(X_train, y_train)

    knn = KNeighborsClassifier(n_neighbors=k)

    # Score with the uint8 inputs
    knn.fit(lmnn.transform(X_train), y_train)
    score_uint8 = knn.score(lmnn.transform(X_test), y_test)

    # Score with the same inputs cast to float
    knn.fit(lmnn.transform(X_train.astype(float)), y_train)
    score_float = knn.score(lmnn.transform(X_test.astype(float)), y_test)

    assert score_uint8 == score_float
def test_warm_start_effectiveness():
    """Check that warm starting continues from the previous solution.

    A 1-iteration second fit on the same data should give almost the same
    result with warm starting, and a quite different result without it.
    The change is measured as the sum of absolute differences between the
    components before and after the extra iteration.
    """
    X, y = datasets.make_classification(n_samples=30, n_features=5,
                                        n_redundant=0, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    n_iter = 10

    # Warm-started estimator: full fit, then one additional iteration
    lmnn_warm = LargeMarginNearestNeighbor(n_neighbors=3, warm_start=True,
                                           max_iter=n_iter, random_state=0)
    lmnn_warm.fit(X_train, y_train)
    transformation_warm = lmnn_warm.components_
    lmnn_warm.max_iter = 1
    lmnn_warm.fit(X_train, y_train)
    transformation_warm_plus_one = lmnn_warm.components_

    # Cold-started estimator: full fit, then a fresh 1-iteration fit
    lmnn_cold = LargeMarginNearestNeighbor(n_neighbors=3, warm_start=False,
                                           max_iter=n_iter, random_state=0)
    lmnn_cold.fit(X_train, y_train)
    transformation_cold = lmnn_cold.components_
    lmnn_cold.max_iter = 1
    lmnn_cold.fit(X_train, y_train)
    transformation_cold_plus_one = lmnn_cold.components_

    diff_warm = np.sum(
        np.abs(transformation_warm_plus_one - transformation_warm))
    diff_cold = np.sum(
        np.abs(transformation_cold_plus_one - transformation_cold))

    # Plain assert statements, like the other tests in this file
    assert diff_warm < 2.0, (
        "Transformer changed significantly after one iteration even "
        "though it was warm-started.")

    assert diff_cold > diff_warm, (
        "Cold-started transformer changed less significantly than "
        "warm-started transformer after one iteration.")
def test_init_transformation():
    """Check the supported ``init`` options and the validation of ``init``.

    'identity', 'pca' and a user-supplied square array must fit without
    error. Arrays whose shape is inconsistent with the data or with
    ``n_components`` must make ``fit`` raise ``ValueError``.
    """
    X, y = datasets.make_classification(n_samples=30, n_features=5,
                                        n_redundant=0, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    n_features = X.shape[1]

    # Initialize with identity
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, init='identity')
    lmnn.fit(X_train, y_train)

    # Initialize with PCA
    lmnn_pca = LargeMarginNearestNeighbor(n_neighbors=3, init='pca')
    lmnn_pca.fit(X_train, y_train)

    # Initialize with a transformation given by the user
    init = np.random.rand(n_features, n_features)
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, init=init)
    lmnn.fit(X_train, y_train)

    # init.shape[1] must match X.shape[1]
    init = np.random.rand(n_features, n_features + 1)
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, init=init)
    input_dim_msg = ('The input dimensionality ({}) of the given '
                     'linear transformation `init` must match the '
                     'dimensionality of the given inputs `X` ({}).'
                     ).format(init.shape[1], n_features)
    assert_raise_message(ValueError, input_dim_msg,
                         lmnn.fit, X_train, y_train)

    # init.shape[0] must be <= init.shape[1]
    init = np.random.rand(n_features + 1, n_features)
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, init=init)
    output_dim_msg = ('The output dimensionality ({}) of the given '
                      'linear transformation `init` cannot be '
                      'greater than its input dimensionality ({}).'
                      ).format(init.shape[0], init.shape[1])
    assert_raise_message(ValueError, output_dim_msg,
                         lmnn.fit, X_train, y_train)

    # init.shape[0] must match n_components
    init = np.random.rand(n_features, n_features)
    n_components = n_features - 2
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, init=init,
                                      n_components=n_components)
    n_components_msg = ('The preferred embedding dimensionality '
                        '`n_components` ({}) does not match '
                        'the output dimensionality of the given '
                        'linear transformation `init` ({})!'
                        ).format(n_components, init.shape[0])
    assert_raise_message(ValueError, n_components_msg,
                         lmnn.fit, X_train, y_train)
    \xi_{ijl} \geq 0
    \mathbf{M} \succeq 0

For this coursework, the PyLMNN package is used to compute LMNN for metric
learning: https://pypi.org/project/PyLMNN/
"""
# NOTE(review): the opening of the string above is outside this excerpt; it
# contains backslash sequences such as \xi, so confirm it is a raw string.

# Requires the PyLMNN package (pip install pylmnn)
from pylmnn import LargeMarginNearestNeighbor as LMNN

# Set up the hyperparameters
k_train, n_components, max_iter = 5, 25, 1000

# Instantiate the metric learner
lmnn = LMNN(n_neighbors=k_train, max_iter=max_iter, n_components= n_components)

# Train the metric learner; the value returned by fit() is what is used
# below to transform the test data
lmnn_original = lmnn.fit(original_train_list, Y_train)
lmnn_test = lmnn_original.transform(original_test_list)
# Transpose the transformed test data: from here on each query is read as a
# column (see the lmnn_test[:, query_index] access below)
lmnn_test= lmnn_test.T
print(lmnn_test.shape)

# rank_k holds the integers 1 .. lmnn_test.shape[1] - 1
rank_k = []
for i in range(1,lmnn_test.shape[1]):
    rank_k.append(i)

# Initialise mAP and accuracy scores
avg_prec = 0
rank1_prec = []
rank10_prec = []

# Visit every column of the transposed test matrix as a query.
# NOTE(review): only the first statement of the loop body is visible in this
# excerpt; the body presumably continues beyond it.
for query_index in range(0,lmnn_test.shape[1]):
    query_image = lmnn_test[:, query_index]
# Load the comma-separated train and test tables. The last column of each
# table is used as the label, the columns before it as features.
csv = np.genfromtxt("data/numerical_train.csv", delimiter=',')
csv_test = np.genfromtxt("data/numerical_test.csv", delimiter=',')

n, d = csv.shape
X_train = csv[:, :d - 1]
y_train = csv[:, -1]
# The test features are sliced with the column count of the training table
X_test = csv_test[:, :d - 1]
y_test = csv_test[:, -1]

# Hyperparameters: n_components = d - 1 keeps the full feature dimensionality
k_train, n_components, max_iter = 7, d - 1, 180
lmnn = LMNN(n_neighbors=k_train, max_iter=max_iter, n_components=n_components)
print('learning the metric...')

# Train the metric learner
lmnn.fit(X_train, y_train)
X_train_transformed = lmnn.transform(X_train)
X_test_transformed = lmnn.transform(X_test)

# Persist the transformed data, the labels and the fitted model. Each file is
# opened in a `with` block so its handle is flushed and closed right after
# the dump instead of being left open.
for _path, _payload in (
        ("data/numerical_train_transformed.pkl", X_train_transformed),
        ("data/numerical_train_labels.pkl", y_train),
        ("data/numerical_test_transformed.pkl", X_test_transformed),
        ("data/numerical_test_labels.pkl", y_test),
        ("data/lmnn.pkl", lmnn),
):
    with open(_path, 'wb') as _fh:
        pickle.dump(_payload, _fh)
print('done!')
# Accumulators for the sweep over k below. In the visible part of the loop
# only acc1, T and T1 are filled.
# NOTE(review): acc2-acc4 and T2-T4 are presumably filled by loop code that
# lies beyond this excerpt - confirm.
acc1 = []
acc2 = []
acc3 = []
acc4 = []
T = []
T1 = []
T2 = []
T3 = []
T4 = []

# For every k: fit LMNN with k neighbours, transform both splits, then score
# a kNN classifier (same k) on the transformed data.
for k in [9, 11, 12, 13, 14, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29]:
    print('Running K={} ... ... '.format(k))
    t0 = time.time()
    # n_components = x.shape[1] requests as many components as x has columns
    lmnn = LMNN(n_neighbors=k, max_iter=200, n_components=x.shape[1])
    lmnn.fit(x_train, y_train)
    x_train_ = lmnn.transform(x_train)
    x_test_ = lmnn.transform(x_test)
    t1 = time.time()
    # T: wall-clock time of the LMNN fit plus both transforms
    T.append(t1 - t0)
    print('LMNN Cost:', t1 - t0)
    # Distance-weighted kNN with brute-force cosine distances on the
    # LMNN-transformed features
    knn = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='cosine', algorithm='brute')
    knn.fit(x_train_, y_train)
    lmnn_acc = knn.score(x_test_, y_test)
    acc1.append(lmnn_acc)
    t2 = time.time()
    # T1: wall-clock time of the kNN fit plus scoring
    T1.append(t2 - t1)