def generate_train_test(N, task):
    # Generates a training and a test set with the same data distribution
    # from two possibilities: easy dataset with low class overlap, or hard
    # dataset with high class overlap
    #
    # Input:
    # N    - Number of samples per class
    # task - String, either 'easy' or 'hard'

    if task == 'easy':
        mu1 = [0, 0]
        mu2 = [4, 2]
        sigma1 = [[1, 0], [0, 1]]
        sigma2 = [[1, -1], [-1, 3]]

    if task == 'hard':
        mu1 = [0, 0]
        mu2 = [1, 1]
        sigma1 = [[3, 0], [0, 2]]
        sigma2 = [[2, 0], [0, 3]]

    trainX, trainY = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)
    testX, testY = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)

    return trainX, trainY, testX, testY
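# Usage sketch for the function above (assumes seg.generate_gaussian_data and
# util.scatter_data from the practical's helper modules; the sample size of
# 100 is an arbitrary choice for illustration):
def demo_generate_train_test():
    trainX, trainY, testX, testY = generate_train_test(100, 'easy')
    util.scatter_data(trainX, trainY, 0, 1)  # classes should barely overlap
    trainX, trainY, testX, testY = generate_train_test(100, 'hard')
    util.scatter_data(trainX, trainY, 0, 1)  # classes should overlap heavily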
def generate_train_test(N, task):
    # Generates a training and a test set with the same data distribution
    # from two possibilities: easy dataset with low class overlap, or hard
    # dataset with high class overlap
    #
    # Input:
    # N    - Number of samples per class
    # task - String, either 'easy' or 'hard'

    if task == 'easy':
        #-------------------------------------------------------------------#
        # TODO: modify these values to create an easy train/test dataset pair
        #-------------------------------------------------------------------#
        pass

    if task == 'hard':
        #-------------------------------------------------------------------#
        # TODO: modify these values to create a difficult train/test dataset pair
        #-------------------------------------------------------------------#
        pass

    trainX, trainY = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)
    testX, testY = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)

    return trainX, trainY, testX, testY
def distance_classification_test():
    train_data, train_labels = seg.generate_gaussian_data(2)
    test_data, test_labels = seg.generate_gaussian_data(1)

    # Distances from every test sample to every training sample
    D = scipy.spatial.distance.cdist(test_data, train_data, metric='euclidean')

    # Assign each test sample the label of its nearest training sample
    # (kept separate from test_labels so the true labels are not overwritten)
    predicted_labels = train_labels[np.argmin(D, axis=1)]
    print(predicted_labels)
def nn_classifier_test_samples():
    train_data, train_labels = seg.generate_gaussian_data(20)
    test_data, test_labels = seg.generate_gaussian_data(10)

    predicted_labels = seg.nn_classifier(train_data, train_labels, test_data)
    # predicted_labels = predicted_labels.astype(bool)
    # test_labels = test_labels.astype(bool)
    err = util.classification_error(test_labels, predicted_labels)

    print('True labels:\n{}'.format(test_labels))
    print('Predicted labels:\n{}'.format(predicted_labels))
    print('Error:\n{}'.format(err))
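# A minimal sketch of what seg.nn_classifier could look like, based on the
# cdist + argmin pattern used throughout this section; the actual
# implementation lives in segmentation.py and may differ.
def nn_classifier_sketch(train_data, train_labels, test_data):
    # Distance from every test sample to every training sample
    D = scipy.spatial.distance.cdist(test_data, train_data, metric='euclidean')
    # Label of the nearest training sample wins
    return train_labels[np.argmin(D, axis=1)]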
def covariance_matrix_test():
    N = 100
    mu1 = [0, 0]
    mu2 = [0, 0]
    sigma1 = [[3, 1], [1, 1]]
    sigma2 = [[3, 1], [1, 1]]
    X, Y = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)
def distance_classification_test():
    #------------------------------------------------------------------#
    # TODO: Use the provided code to generate training and testing data
    # Classify the points in test_data, based on their distances d to the points in train_data
    train_data, train_labels = seg.generate_gaussian_data(2)
    test_data, test_labels = seg.generate_gaussian_data(1)

    d = scipy.spatial.distance.cdist(test_data, train_data, metric='euclidean')
    min_index = np.argmin(d, axis=1)  # index of the nearest training sample
    print(d)

    predicted_labels = np.zeros([test_data.shape[0], 1])
    for i in range(predicted_labels.shape[0]):
        predicted_labels[i] = train_labels[min_index[i]]

    return predicted_labels
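# Vectorized equivalent of the loop above, offered as a sketch: NumPy fancy
# indexing picks the label of the nearest training sample for every test
# point in a single step, with no explicit Python loop.
def distance_classification_vectorized():
    train_data, train_labels = seg.generate_gaussian_data(2)
    test_data, _ = seg.generate_gaussian_data(1)
    d = scipy.spatial.distance.cdist(test_data, train_data, metric='euclidean')
    return train_labels[np.argmin(d, axis=1)]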
def distance_test():
    #------------------------------------------------------------------#
    # TODO: Generate a Gaussian dataset, with 100 samples per class, and compute the distances.
    # Use plt.imshow() to visualize the distance matrix as an image.
    X, Y = seg.generate_gaussian_data(100)  # Generates 100 samples per Gaussian class
    D = scipy.spatial.distance.cdist(X, X, metric='euclidean')
    plt.imshow(D)
def distance_classification_test():
    #------------------------------------------------------------------#
    # Use the provided code to generate training and testing data
    # Classify the points in test_data, based on their distances d to the points in train_data
    train_data, train_labels = seg.generate_gaussian_data(2)
    test_data, test_labels = seg.generate_gaussian_data(1)
    # Alternative toy data for debugging:
    # train_data = np.array([[1, 1], [0, 0], [0, 1], [1, 0]])
    # train_labels = np.array([[0], [1], [0], [1]])

    util.scatter_data(train_data, train_labels, 0, 1)
    util.scatter_data(test_data, test_labels, 0, 1)

    D = scipy.spatial.distance.cdist(test_data, train_data, metric='euclidean')
    min_index = np.argmin(D, axis=1)
    newlabels = train_labels[min_index]

    print(test_data)
    print(newlabels)
    util.scatter_data(test_data, newlabels, 0, 1)
def small_samples_distance_test():
    #------------------------------------------------------------------#
    # TODO: Generate a small sample Gaussian dataset X,
    # create dataset C as per the instructions,
    # and calculate and plot the distances between the datasets.
    X, Y = seg.generate_gaussian_data(2)
    C = np.array([[0, 0], [1, 1]])
    D = scipy.spatial.distance.cdist(X, C, metric='euclidean')
    plt.imshow(D)

    return X, Y, C, D
def distance_test():
    #------------------------------------------------------------------#
    # Generate a Gaussian dataset, with 100 samples per class, and compute the distances.
    # Use plt.imshow() to visualize the distance matrix as an image.
    X, Y = seg.generate_gaussian_data(100)
    X = np.round(X * 3)  # Stretch and round the numbers
    D = scipy.spatial.distance.cdist(X, X, metric='euclidean')
    plt.imshow(D)
    #------------------------------------------------------------------#
def test_mypca():
    # Generates some toy data in 2D, computes PCA, and plots both datasets
    N = 100
    mu1 = [0, 0]
    mu2 = [2, 0]
    sigma1 = [[2, 1], [1, 1]]
    sigma2 = [[2, 1], [1, 1]]
    XG, YG = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)

    fig = plt.figure(figsize=(15, 6))
    ax1 = fig.add_subplot(121)
    util.scatter_data(XG, YG, ax=ax1)
    sigma = np.cov(XG, rowvar=False)
    w, v = np.linalg.eig(sigma)
    ax1.plot([0, v[0, 0]], [0, v[1, 0]], c='g', linewidth=3, label='Eigenvector1')
    ax1.plot([0, v[0, 1]], [0, v[1, 1]], c='k', linewidth=3, label='Eigenvector2')
    ax1.set_title('Original data')
    ax_settings(ax1)

    ax2 = fig.add_subplot(122)
    X_pca, v, w, fraction_variance = seg.mypca(XG)
    util.scatter_data(X_pca, YG, ax=ax2)
    sigma_pca = np.cov(X_pca, rowvar=False)  # renamed so it no longer shadows sigma2 above
    w2, v2 = np.linalg.eig(sigma_pca)
    ax2.plot([0, v2[0, 0]], [0, v2[1, 0]], c='g', linewidth=3, label='Eigenvector1')
    ax2.plot([0, v2[0, 1]], [0, v2[1, 1]], c='k', linewidth=3, label='Eigenvector2')
    ax2.set_title('My PCA')
    ax_settings(ax2)

    handles, labels = ax2.get_legend_handles_labels()
    plt.figlegend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.05),
                  bbox_transform=plt.gcf().transFigure, ncol=4)
    print(fraction_variance)
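# A minimal sketch of what seg.mypca might compute, inferred from its use above
# and the PCA steps spelled out later in this section (center, covariance,
# eigendecomposition, sort, rotate); the course's own implementation may differ.
def mypca_sketch(X):
    X = X - np.mean(X, axis=0)                # center the data
    cov = np.cov(X, rowvar=False)             # feature covariance matrix
    w, v = np.linalg.eig(cov)                 # eigenvalues and eigenvectors
    ix = np.argsort(w)[::-1]                  # sort by decreasing eigenvalue
    w, v = w[ix], v[:, ix]
    X_pca = X.dot(v)                          # rotate onto the eigenvectors
    fraction_variance = np.cumsum(w) / np.sum(w)
    return X_pca, v, w, fraction_variance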
def covariance_matrix_test():
    N = 100
    mu1 = [0, 0]
    mu2 = [0, 0]
    sigma1 = [[3, 1], [1, 1]]
    sigma2 = [[3, 1], [1, 1]]
    X, Y = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)

    sigma = np.cov(X.T)      # sample covariance; rows of X.T are the features
    mu = np.mean(X, axis=0)  # sample mean per feature

    return X, Y, sigma
def small_samples_distance_test():
    #------------------------------------------------------------------#
    # Generate a small sample Gaussian dataset X,
    # create dataset C as per the instructions,
    # and calculate and plot the distances between the datasets.
    C = np.array([[0, 0], [1, 1]])
    X, Y = seg.generate_gaussian_data(2)  # Generates 2 samples per Gaussian class
    X = np.round(X * 3)  # Stretch and round the numbers
    D = scipy.spatial.distance.cdist(X, C, metric='euclidean')
    plt.imshow(D)
    #------------------------------------------------------------------#
    return X, Y, C, D
def initialize_cluster_centers(N=100, num_clusters=2):
    # Generate N samples per Gaussian class
    X, Y = seg.generate_gaussian_data(N)

    # Select num_clusters rows from X at random and store them in w_initial
    # (np.random.choice replaces the original hardcoded consecutive-rows
    # selection, which only ever sampled from the first 100 rows)
    idx = np.random.choice(X.shape[0], num_clusters, replace=False)
    w_initial = X[idx, :].copy()

    ax1 = util.scatter_data(X, Y)
    ax1.scatter(w_initial[:, 0], w_initial[:, 1], c='y')
    plt.show()

    return X, w_initial
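# Why the centers are initialized at all: a single k-means iteration sketch
# built on the cdist pattern from this section (plain NumPy; not the course's
# own k-means implementation).
def kmeans_step(X, w):
    # Assign every sample to its nearest center
    D = scipy.spatial.distance.cdist(X, w, metric='euclidean')
    assignment = np.argmin(D, axis=1)
    # Move each center to the mean of the samples assigned to it
    for k in range(w.shape[0]):
        if np.any(assignment == k):
            w[k, :] = X[assignment == k].mean(axis=0)
    return w, assignment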
def distance_classification_test():
    #------------------------------------------------------------------#
    # TODO: Use the provided code to generate training and testing data
    # Classify the points in test_data, based on their distances d to the points in train_data
    train_data, trainlabels = seg.generate_gaussian_data(2)
    test_data, testlabels = seg.generate_gaussian_data(1)

    # Distances between test and training samples
    D = scipy.spatial.distance.cdist(test_data, train_data, metric='euclidean')
    min_index = np.argmin(D, axis=1)
    min_dist = np.zeros((len(min_index), 1))
    for i in range(len(min_index)):
        min_dist[i, 0] = D.item((i, min_index[i]))

    # Sort the training samples ("cluster centers") by their first feature
    sorted_order = np.argsort(train_data[:, 0], axis=0)

    # Relabel: the index of each nearest training sample is replaced by its
    # rank in the sorted order, and the result is returned in predicted_labels
    predicted_labels = np.empty(min_index.shape)
    predicted_labels[:] = np.nan
    for i in np.arange(len(sorted_order)):
        predicted_labels[min_index == sorted_order[i]] = i

    return predicted_labels
def covariance_matrix_test():
    N = 100
    mu1 = [0, 0]
    mu2 = [0, 0]
    sigma1 = [[3, 1], [1, 1]]
    sigma2 = [[3, 1], [1, 1]]
    X, Y = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)
    #------------------------------------------------------------------#
    # TODO: Calculate the mean and covariance matrix of the data,
    # and compare them to the parameters you used as input.
    matrix_mean = np.mean(X, axis=0)      # per-feature mean, comparable to mu1/mu2
    matrix_cov = np.cov(X, rowvar=False)  # 2x2 covariance, comparable to sigma1/sigma2
    print("Mean: {}".format(matrix_mean))
    print("Covariance matrix: {}".format(matrix_cov))

    return X, Y, matrix_cov
def covariance_matrix_test():
    N = 100
    mu1 = [0, 0]
    mu2 = [0, 0]
    sigma1 = [[3, 1], [1, 1]]
    sigma2 = [[3, 1], [1, 1]]
    X, Y = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)
    #------------------------------------------------------------------#
    # Calculate the mean and covariance matrix of the data,
    # and compare them to the parameters you used as input.
    mn = np.mean(X, axis=0)  # mean per feature, not over all entries
    print(mn)
    co = np.cov(X, rowvar=False)
    print(co)
    print(co.shape)

    return X, Y, co
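# Sanity-check sketch for the test above: because both classes share
# mu = [0, 0] and sigma = [[3, 1], [1, 1]], the sample statistics should
# approach those input parameters; the tolerance of 0.5 is an arbitrary
# allowance for sampling noise at N = 100.
X, Y, co = covariance_matrix_test()
print(np.allclose(co, [[3, 1], [1, 1]], atol=0.5))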
def small_samples_distance_test():
    #------------------------------------------------------------------#
    # TODO: Generate a small sample Gaussian dataset X,
    # create dataset C as per the instructions,
    # and calculate and plot the distances between the datasets.
    X, Y = seg.generate_gaussian_data(100)
    C = np.array([[0, 0], [1, 1]])
    D1 = scipy.spatial.distance.cdist(X, C, metric='euclidean')  # distances between X and C
    D2 = scipy.spatial.distance.cdist(C, X, metric='euclidean')

    fig = plt.figure(figsize=(10, 10))
    ax1 = fig.add_subplot(121)
    ax1.imshow(D1)
    ax2 = fig.add_subplot(122)
    ax2.imshow(D2)  # swapping the inputs transposes (mirrors) the distance matrix
    #------------------------------------------------------------------#
    return X, Y, C, D1
def distance_test():
    X, Y = seg.generate_gaussian_data()
    D = scipy.spatial.distance.cdist(X, X, metric='euclidean')
    ax = plt.imshow(D)
def logistic_regression():
    # dataset preparation
    num_training_samples = 300
    num_validation_samples = 100

    # here we reuse the function from the segmentation practicals
    m1 = [2, 3]
    m2 = [-0, -4]
    s1 = [[8, 7], [7, 8]]
    s2 = [[8, 6], [6, 8]]

    [trainingX, trainingY] = seg.generate_gaussian_data(num_training_samples, m1, m2, s1, s2)
    r, c = trainingX.shape
    print('Training sample shape: {}'.format(trainingX.shape))

    # we need a validation set to monitor for overfitting
    [validationX, validationY] = seg.generate_gaussian_data(num_validation_samples, m1, m2, s1, s2)
    r_val, c_val = validationX.shape
    print('Validation sample shape: {}'.format(validationX.shape))

    validationXones = util.addones(validationX)

    # train a logistic regression model:
    # the learning rate for the gradient descent method
    # (the same as in intensity-based registration)
    mu = 0.001

    # we are actually using stochastic gradient descent
    batch_size = 30

    # initialize the parameters of the model with small random values,
    # we need one parameter for each feature and a bias
    Theta = 0.02 * np.random.rand(c + 1, 1)

    # number of gradient descent iterations
    num_iterations = 300

    # variables to keep the loss and gradient at every iteration
    # (needed for visualization)
    iters = np.arange(num_iterations)
    loss = np.full(iters.shape, np.nan)
    validation_loss = np.full(iters.shape, np.nan)

    # Create base figure
    fig = plt.figure(figsize=(15, 8))
    ax1 = fig.add_subplot(121)
    im1, Xh_ones, num_range_points = util.plot_lr(trainingX, trainingY, Theta, ax1)
    seg_util.scatter_data(trainingX, trainingY, ax=ax1)
    ax1.grid()
    ax1.set_xlabel('x_1')
    ax1.set_ylabel('x_2')
    ax1.legend()
    ax1.set_title('Training set')
    text_str1 = '{:.4f}; {:.4f}; {:.4f}'.format(0, 0, 0)
    txt1 = ax1.text(0.3, 0.95, text_str1,
                    bbox={'facecolor': 'white', 'alpha': 1, 'pad': 10},
                    transform=ax1.transAxes)

    ax2 = fig.add_subplot(122)
    ax2.set_xlabel('Iteration')
    ax2.set_ylabel('Loss (average per sample)')
    ax2.set_title('mu = ' + str(mu))
    h1, = ax2.plot(iters, loss, linewidth=2, label='Training loss')
    h2, = ax2.plot(iters, validation_loss, linewidth=2, label='Validation loss')
    ax2.set_ylim(0, 0.7)
    ax2.set_xlim(0, num_iterations)
    ax2.grid()
    ax1.legend()
    text_str2 = 'iter.: {}, loss: {:.3f}, val. loss: {:.3f}'.format(0, 0, 0)
    txt2 = ax2.text(0.3, 0.95, text_str2,
                    bbox={'facecolor': 'white', 'alpha': 1, 'pad': 10},
                    transform=ax2.transAxes)

    # iterate
    for k in np.arange(num_iterations):
        # pick a batch at random
        idx = np.random.randint(r, size=batch_size)

        # the loss function for this particular batch
        loss_fun = lambda Theta: cad.lr_nll(util.addones(trainingX[idx, :]), trainingY[idx], Theta)

        # gradient descent:
        # here we reuse the code for numerical computation of the gradient
        # of a function
        Theta = Theta - mu * reg.ngradient(loss_fun, Theta)

        # compute the loss for the current model parameters for the
        # training and validation sets
        # note that the loss is divided by the number of samples so
        # it is comparable for different numbers of samples
        loss[k] = loss_fun(Theta) / batch_size
        validation_loss[k] = cad.lr_nll(validationXones, validationY, Theta) / r_val

        # update the visualization
        ph = cad.sigmoid(Xh_ones.dot(Theta)) > 0.5
        decision_map = ph.reshape(num_range_points, num_range_points)
        decision_map_trns = np.flipud(decision_map)
        im1.set_data(decision_map_trns)
        text_str1 = '{:.4f}; {:.4f}; {:.4f}'.format(Theta[0, 0], Theta[1, 0], Theta[2, 0])
        txt1.set_text(text_str1)

        h1.set_ydata(loss)
        h2.set_ydata(validation_loss)
        text_str2 = 'iter.={}, loss={:.3f}, val. loss={:.3f}'.format(k, loss[k], validation_loss[k])
        txt2.set_text(text_str2)

        display(fig)
        clear_output(wait=True)
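# For reference, minimal sketches of the two pieces the training loop above
# relies on; the course's cad module provides the actual implementations, so
# treat these as assumptions about their behavior rather than the real code.
def sigmoid_sketch(a):
    # logistic function mapping activations to (0, 1)
    return 1.0 / (1.0 + np.exp(-a))

def lr_nll_sketch(X_ones, y, Theta):
    # negative log-likelihood of logistic regression; the small epsilon
    # guards against log(0)
    p = sigmoid_sketch(X_ones.dot(Theta))
    eps = 1e-10
    return -np.sum(y * np.log(p + eps) + (1 - y) * np.log(1 - p + eps))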
import segmentation as seg
import matplotlib.pyplot as plt
import cad
from IPython.display import display, clear_output, HTML
import numpy as np
import segmentation_util as util  # assumed: the helper module providing addones()

num_training_samples = 300
num_validation_samples = 100

# here we reuse the function from the segmentation practicals
m1 = [2, 3]
m2 = [-0, -4]
s1 = [[8, 7], [7, 8]]
s2 = [[8, 6], [6, 8]]

[trainingX, trainingY] = seg.generate_gaussian_data(num_training_samples, m1, m2, s1, s2)
r, c = trainingX.shape
print('Training sample shape: {}'.format(trainingX.shape))

# we need a validation set to monitor for overfitting
[validationX, validationY] = seg.generate_gaussian_data(num_validation_samples, m1, m2, s1, s2)
r_val, c_val = validationX.shape
print('Validation sample shape: {}'.format(validationX.shape))

validationXones = util.addones(validationX)

# train a logistic regression model:
# the learning rate for the gradient descent method
# (the same as in intensity-based registration)
def learning_curve():
    # Load training and test data
    train_data, train_labels = seg.generate_gaussian_data(1000)
    test_data, test_labels = seg.generate_gaussian_data(1000)
    [train_data, test_data] = seg.normalize_data(train_data, test_data)

    # Define parameters
    train_sizes = np.array([1, 3, 10, 30, 100, 300])
    k = 1
    num_iter = 3  # How often to repeat the experiment

    # Store errors
    test_error = np.empty([len(train_sizes), num_iter])
    test_error[:] = np.nan
    test_dice = np.empty([len(train_sizes), num_iter])
    test_dice[:] = np.nan

    #------------------------------------------------------------------#
    # TODO: Store errors for training data
    #------------------------------------------------------------------#

    ## Train and test with different values
    for i in np.arange(len(train_sizes)):
        for j in np.arange(num_iter):
            print('train_size = {}, iter = {}'.format(train_sizes[i], j))

            # Subsample training set
            ix = np.random.randint(len(train_data), size=train_sizes[i])
            subset_train_data = train_data[ix, :]
            subset_train_labels = train_labels[ix, :]

            # Train classifier
            neigh = KNeighborsClassifier(n_neighbors=k)
            neigh.fit(subset_train_data, subset_train_labels.ravel())

            # Evaluate
            predicted_test_labels = neigh.predict(test_data)
            test_labels = test_labels.astype(bool)
            predicted_test_labels = predicted_test_labels.astype(bool)
            test_error[i, j] = util.classification_error(test_labels, predicted_test_labels)
            test_dice[i, j] = util.dice_overlap(test_labels, predicted_test_labels)

            #------------------------------------------------------------------#
            # TODO: Predict training labels and evaluate
            #------------------------------------------------------------------#

    ## Display results
    fig = plt.figure(figsize=(8, 8))
    ax1 = fig.add_subplot(111)
    x = np.log(train_sizes)
    y_test = np.mean(test_error, 1)
    yerr_test = np.std(test_error, 1)
    p1 = ax1.errorbar(x, y_test, yerr=yerr_test, label='Test error')

    #------------------------------------------------------------------#
    # TODO: Plot training size
    #------------------------------------------------------------------#

    ax1.set_xlabel('Number of training samples (k)')
    ax1.set_ylabel('error')
    ticks = list(x)
    ax1.set_xticks(ticks)
    tick_lbls = [str(i) for i in train_sizes]
    ax1.set_xticklabels(tick_lbls)
    ax1.grid()
    ax1.legend()
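# Standalone sketch for the TODOs in learning_curve(): the training error is
# obtained by re-predicting the subset the classifier was fitted on. With
# k = 1 this error is (near) zero by construction, which is exactly the
# train/test gap the learning curve is meant to expose. The sample size of 30
# is an arbitrary choice for illustration.
def training_error_sketch():
    train_data, train_labels = seg.generate_gaussian_data(30)
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(train_data, train_labels.ravel())
    predicted = neigh.predict(train_data).astype(bool)
    return util.classification_error(train_labels.astype(bool), predicted)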
import numpy as np
import segmentation_util as util
import matplotlib.pyplot as plt
import segmentation as seg
from scipy import ndimage, stats
import scipy
from sklearn.neighbors import KNeighborsClassifier
import timeit
from IPython.display import display, clear_output

N = 100
mu1 = [0, 0]
mu2 = [2, 0]
sigma1 = [[2, 1], [1, 1]]
sigma2 = [[2, 1], [1, 1]]
XG, YG = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)

X_pca, v, w, fraction_variance = seg.mypca(XG)

X = XG
X = X - np.mean(X, axis=0)

#------------------------------------------------------------------#
# TODO: Calculate covariance matrix of X, find eigenvalues and eigenvectors,
# sort them, and rotate X using the eigenvectors
cov_matrix = np.cov(X, rowvar=False)
w, v = np.linalg.eig(cov_matrix)
ix = np.argsort(w)[::-1]  # sort the eigenvalues (not the covariance matrix) in descending order
w = w[ix]
v = v[:, ix]
X_rotated = X.dot(v)      # rotate X onto the sorted eigenvectors
print(w)
print(v)
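# Check sketch: the manual rotation above should agree with seg.mypca up to
# the sign of each eigenvector (eigenvectors are only defined up to sign),
# assuming mypca sorts its components in the same descending order.
print(np.allclose(np.abs(X_rotated), np.abs(X_pca)))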
def small_samples_distance_test():
    X, Y = seg.generate_gaussian_data(50)
    C = np.array([[0, 0], [1, 1]])
    D = scipy.spatial.distance.cdist(X, C, metric='euclidean')

    return X, Y, C, D
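# Quick check sketch: swapping the two inputs of cdist transposes the distance
# matrix, which is the mirroring observed in the two-panel variant above.
X, Y, C, D = small_samples_distance_test()
D_swapped = scipy.spatial.distance.cdist(C, X, metric='euclidean')
print(np.allclose(D, D_swapped.T))  # True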