def test_pipeline_equivalency():
    """A pipelined LMNN must behave exactly like the bare estimator.

    Fits a standalone LargeMarginNearestNeighbor and an equivalent
    pipeline, then checks that both the learned transformation and the
    downstream k-NN accuracy coincide.
    """
    X, y = iris_data, iris_target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Use init='identity' to ensure reproducibility: both fits start from
    # the same point and follow identical optimisation paths.
    lmnn_params = dict(n_neighbors=3, max_iter=10, init='identity',
                       random_state=42)
    n_neighbors = 3

    standalone = LargeMarginNearestNeighbor(**lmnn_params)
    standalone.fit(X_train, y_train)

    pipeline = make_lmnn_pipeline(**lmnn_params)
    pipeline.fit(X_train, y_train)

    # The learned linear transformations must match element-wise.
    learned_by_pipe = pipeline.named_steps.lmnn.components_
    assert_array_almost_equal(standalone.components_, learned_by_pipe)

    # ... and so must the resulting classification scores.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(standalone.transform(X_train), y_train)
    score_standalone = knn.score(standalone.transform(X_test), y_test)
    score_pipeline = pipeline.score(X_test, y_test)
    assert (score_standalone == score_pipeline)
def test_neighbors_digits():
    """Sanity check on the digits dataset.

    The 'brute' algorithm has been observed to fail if the input dtype is
    uint8 due to overflow in distance calculations, so the score obtained
    on uint8 data must equal the one obtained on float data.
    """
    X = digits_data.astype('uint8')
    y = digits_target

    # Deterministic 80/20 split by sample position.
    n_samples = X.shape[0]
    cut = int(n_samples * 0.8)
    train_idx = np.arange(0, cut)
    test_idx = np.arange(cut, n_samples)
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    k = 1
    lmnn = LargeMarginNearestNeighbor(n_neighbors=k, max_iter=30)
    lmnn.fit(X_train, y_train)

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(lmnn.transform(X_train), y_train)
    score_uint8 = knn.score(lmnn.transform(X_test), y_test)

    # Refit on the float version of the same data; the score must not change.
    knn.fit(lmnn.transform(X_train.astype(float)), y_train)
    score_float = knn.score(lmnn.transform(X_test.astype(float)), y_test)
    assert (score_uint8 == score_float)
def test_neighbors_iris():
    """Sanity checks on the iris dataset.

    First verifies that a 1-NN classifier in the LMNN-learned space
    perfectly recovers the training labels, then refits with a larger
    neighbourhood (k=9) and checks the score stays above 0.95.
    """
    lmnn = LargeMarginNearestNeighbor(n_neighbors=1)
    lmnn.fit(iris_data, iris_target)

    knn = KNeighborsClassifier(n_neighbors=lmnn.n_neighbors_)
    LX = lmnn.transform(iris_data)
    knn.fit(LX, iris_target)
    y_pred = knn.predict(LX)
    assert_array_equal(y_pred, iris_target)

    lmnn.set_params(n_neighbors=9)
    lmnn.fit(iris_data, iris_target)

    # BUG FIX: recompute the embedding with the newly fitted transformation.
    # Previously the stale LX from the n_neighbors=1 fit was reused here, so
    # this assertion never exercised the refitted model at all.
    LX = lmnn.transform(iris_data)
    knn = KNeighborsClassifier(n_neighbors=lmnn.n_neighbors_)
    knn.fit(LX, iris_target)
    assert (knn.score(LX, iris_target) > 0.95)
# Load the numeric train/test CSVs, learn an LMNN metric on the training
# split, and persist the transformed data plus the fitted model to disk.
csv = np.genfromtxt("data/numerical_train.csv", delimiter=',')
csv_test = np.genfromtxt("data/numerical_test.csv", delimiter=',')

n, d = csv.shape
# The last column holds the label; everything before it is a feature.
X_train = csv[:, :d - 1]
y_train = csv[:, -1]
X_test = csv_test[:, :d - 1]
y_test = csv_test[:, -1]

k_train, n_components, max_iter = 7, d - 1, 180
lmnn = LMNN(n_neighbors=k_train, max_iter=max_iter, n_components=n_components)

print('learning the metric...')
# Train the metric learner
lmnn.fit(X_train, y_train)

X_train_transformed = lmnn.transform(X_train)
X_test_transformed = lmnn.transform(X_test)


def _dump(obj, path):
    """Pickle *obj* to *path*, closing the file handle deterministically."""
    # BUG FIX: the original passed bare open(..., 'wb') handles straight to
    # pickle.dump and never closed them, relying on GC to flush the files.
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


_dump(X_train_transformed, "data/numerical_train_transformed.pkl")
_dump(y_train, "data/numerical_train_labels.pkl")
_dump(X_test_transformed, "data/numerical_test_transformed.pkl")
_dump(y_test, "data/numerical_test_labels.pkl")
_dump(lmnn, "data/lmnn.pkl")
print('done!')
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

from pylmnn import LargeMarginNearestNeighbor as LMNN

# Iris example: learn an LMNN metric, then classify with k-NN in the
# transformed space.
X, y = load_iris(return_X_y=True)

# Hold out 70% of the data for testing, stratified so class proportions
# are preserved in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.7, stratify=y, random_state=42)

# Hyperparameters: neighbourhood sizes for training/evaluation, output
# dimensionality (full), and the optimiser's iteration budget.
k_train, k_test, n_components, max_iter = 3, 3, X.shape[1], 180

# Learn the metric on the training split.
lmnn = LMNN(n_neighbors=k_train, max_iter=max_iter, n_components=n_components)
lmnn.fit(X_train, y_train)

# Fit a k-NN classifier in the learned space.
knn = KNeighborsClassifier(n_neighbors=k_test)
knn.fit(lmnn.transform(X_train), y_train)

# Test accuracy after applying the learned transformation.
lmnn_acc = knn.score(lmnn.transform(X_test), y_test)
print('LMNN accuracy on test set of {} points: {:.4f}'.format(X_test.shape[0], lmnn_acc))
# Sweep the neighbourhood size k, timing the LMNN fit/transform and the
# cosine k-NN evaluation separately and recording accuracies and timings.
# NOTE(review): this chunk appends to acc1 below, but only acc2..acc4 are
# initialised here — acc1 is presumably created earlier in the file; confirm.
acc2 = []
acc3 = []
acc4 = []
T = []
T1 = []
T2 = []
T3 = []
T4 = []
for k in [9, 11, 12, 13, 14, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29]:
    print('Running K={} ... ... '.format(k))
    t0 = time.time()
    # Learn the metric at full input dimensionality and project both splits.
    lmnn = LMNN(n_neighbors=k, max_iter=200, n_components=x.shape[1])
    lmnn.fit(x_train, y_train)
    x_train_ = lmnn.transform(x_train)
    x_test_ = lmnn.transform(x_test)
    t1 = time.time()
    T.append(t1 - t0)  # wall-clock time spent in LMNN fit + transform
    print('LMNN Cost:', t1 - t0)
    # Brute-force cosine-distance k-NN on the transformed data.
    knn = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='cosine', algorithm='brute')
    knn.fit(x_train_, y_train)
    lmnn_acc = knn.score(x_test_, y_test)
    acc1.append(lmnn_acc)
    t2 = time.time()
    T1.append(t2 - t1)  # wall-clock time spent in k-NN fit + score
    print('cosine Cost:', t2 - t1, '|accuracy:', lmnn_acc)