def knn(k, train_data, train_labels, valid_data):
    """
    Uses the supplied training inputs and labels to make predictions for
    validation data using the K-nearest neighbours algorithm.

    Note: N_TRAIN is the number of training examples,
          N_VALID is the number of validation examples,
          M is the number of features per example.

    :param k: The number of neighbours to use for classification
        of a validation example.
    :param train_data: N_TRAIN x M array of training data.
    :param train_labels: N_TRAIN x 1 vector of training labels
        corresponding to the examples in train_data (must be binary).
    :param valid_data: N_VALID x M array of data to
        predict classes for validation data.
    :return: N_VALID x 1 integer vector of predicted labels (0 or 1)
        for the validation data.
    """
    # l2_distance receives the transposed inputs; its result is used below as
    # an N_VALID x N_TRAIN matrix (one row per validation example).
    dist = l2_distance(valid_data.T, train_data.T)

    # Column indices of the k smallest distances in every row.
    nearest = np.argsort(dist, axis=1)[:, :k]

    # Flatten the labels so that fancy indexing yields an N_VALID x k array.
    train_labels = train_labels.reshape(-1)
    valid_labels = train_labels[nearest]

    # Note this only works for binary labels: the mean of the neighbours'
    # labels is the fraction voting for class 1, and a tie goes to class 1.
    # The builtin int is used because the np.int alias was removed from NumPy.
    valid_labels = (np.mean(valid_labels, axis=1) >= 0.5).astype(int)
    return valid_labels.reshape(-1, 1)
def run_knn(k, train_data, train_labels, valid_data):
    """Uses the supplied training inputs and labels to make
    predictions for validation data using the K-nearest neighbours
    algorithm.

    Note: N_TRAIN is the number of training examples,
          N_VALID is the number of validation examples,
          and M is the number of features per example.

    Inputs:
        k:            The number of neighbours to use for classification
                      of a validation example. Must be at least 1.
        train_data:   The N_TRAIN x M array of training data.
        train_labels: The N_TRAIN x 1 vector of training labels
                      corresponding to the examples in train_data
                      (must be binary).
        valid_data:   The N_VALID x M array of data to
                      predict classes for.

    Outputs:
        valid_labels: The N_VALID x 1 integer vector of predicted labels
                      (0 or 1) for the validation data.

    Raises:
        ValueError: If k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    # l2_distance receives the transposed inputs; its result is used below as
    # an N_VALID x N_TRAIN matrix (one row per validation example).
    dist = l2_distance(valid_data.T, train_data.T)

    # Partition every row at once instead of growing arrays with np.append
    # and reshaping to fixed (50, 200) / (50, k) sizes, so any number of
    # validation and training examples is accepted.
    # The pivot is clamped to the last valid column so that k == N_TRAIN
    # (every training example votes) no longer raises.
    kth = min(k, dist.shape[1] - 1)
    nearest = np.argpartition(dist, kth, axis=1)[:, :k]

    # Flatten the labels so that fancy indexing yields an N_VALID x k array.
    train_labels = train_labels.reshape(-1)
    valid_labels = train_labels[nearest]

    # Note this only works for binary labels: the mean is the fraction of
    # neighbours voting for class 1, and a tie goes to class 1.
    # The builtin int is used because the np.int alias was removed from NumPy.
    valid_labels = (np.mean(valid_labels, axis=1) >= 0.5).astype(int)
    return valid_labels.reshape(-1, 1)
def run_knn(k, train_data, train_labels, valid_data):
    """Uses the supplied training inputs and labels to make
    predictions for validation data using the K-nearest neighbours
    algorithm.

    The shapes of the inputs, the flattened training labels and the
    predictions are printed to standard output as a debugging aid.

    Note: N_TRAIN is the number of training examples,
          N_VALID is the number of validation examples,
          and M is the number of features per example.

    Inputs:
        k:            The number of neighbours to use for classification
                      of a validation example.
        train_data:   The N_TRAIN x M array of training data.
        train_labels: The N_TRAIN x 1 vector of training labels
                      corresponding to the examples in train_data
                      (must be binary).
        valid_data:   The N_VALID x M array of data to
                      predict classes for.

    Outputs:
        valid_labels: The N_VALID x 1 integer vector of predicted labels
                      (0 or 1) for the validation data.
    """
    # Debug output kept from the original. Single-argument print(...) calls
    # behave the same on Python 2 and Python 3, whereas the former print
    # statements were a syntax error on Python 3.
    print(train_data.shape)
    print(train_labels.shape)
    print(valid_data.shape)

    # l2_distance receives the transposed inputs; its result is used below as
    # an N_VALID x N_TRAIN matrix (one row per validation example).
    dist = l2_distance(valid_data.T, train_data.T)

    # Column indices of the k smallest distances in every row.
    nearest = np.argsort(dist, axis=1)[:, :k]

    # Flatten the labels so that fancy indexing yields an N_VALID x k array.
    train_labels = train_labels.reshape(-1)
    valid_labels = train_labels[nearest]

    # Note this only works for binary labels: the mean is the fraction of
    # neighbours voting for class 1, and a tie goes to class 1.
    # The builtin int is used because the np.int alias was removed from NumPy.
    valid_labels = (np.mean(valid_labels, axis=1) >= 0.5).astype(int)
    valid_labels = valid_labels.reshape(-1, 1)

    print(train_labels)
    print(valid_labels)
    return valid_labels
def run_knn(k, train_data, train_labels, valid_data):
    """Uses the supplied training inputs and labels to make
    predictions for validation data using the K-nearest neighbours
    algorithm.

    Note: N_TRAIN is the number of training examples,
          N_VALID is the number of validation examples,
          and M is the number of features per example.

    Inputs:
        k:            The number of neighbours to use for classification
                      of a validation example.
        train_data:   The N_TRAIN x M array of training data.
        train_labels: The N_TRAIN x 1 vector of training labels
                      corresponding to the examples in train_data
                      (must be binary).
        valid_data:   The N_VALID x M array of data to
                      predict classes for.

    Outputs:
        valid_labels: The N_VALID x 1 integer vector of predicted labels
                      (0 or 1) for the validation data.
    """
    # Call l2_distance to compute the distance between valid data and train
    # data; the result is used below as an N_VALID x N_TRAIN matrix.
    dist = l2_distance(valid_data.T, train_data.T)

    # Sort each row by distance and keep the first k column indices. This is
    # the same selection the earlier transpose / slice / transpose sequence
    # produced, expressed as a single column slice.
    nearest = np.argsort(dist, axis=1)[:, :k]

    # Flatten the labels so that fancy indexing yields an N_VALID x k array.
    train_labels = train_labels.reshape(-1)
    valid_labels = train_labels[nearest]

    # Note this only works for binary labels: the mean is the fraction of
    # neighbours voting for class 1, and a tie goes to class 1.
    # The builtin int is used because the np.int alias was removed from NumPy.
    valid_labels = (np.mean(valid_labels, axis=1) >= 0.5).astype(int)
    return valid_labels.reshape(-1, 1)
def run_knn(k, train_data, train_labels, valid_data):
    """Classify validation examples by a majority vote among their
    K nearest training examples.

    Note: N_TRAIN is the number of training examples,
          N_VALID is the number of validation examples,
          and M is the number of features per example.

    Inputs:
        k:            The number of neighbours that vote on each
                      validation example.
        train_data:   The N_TRAIN x M array of training data.
        train_labels: The N_TRAIN x 1 vector of training labels
                      corresponding to the examples in train_data.
                      The vote is counted with np.bincount, so labels
                      have to be non-negative integers.
        valid_data:   The N_VALID x M array of data to
                      predict classes for.

    Outputs:
        valid_labels: The N_VALID x 1 integer vector holding the most
                      frequent neighbour label for each validation
                      example. When several labels are equally frequent
                      the smallest one is returned.
    """
    # l2_distance receives the transposed inputs; its result is used below as
    # an N_VALID x N_TRAIN matrix (one row per validation example).
    distances = l2_distance(valid_data.T, train_data.T)
    neighbour_idx = np.argsort(distances, axis=1)[:, :k]

    # Row i holds the labels of the k training examples closest to
    # validation example i.
    flat_labels = train_labels.reshape(-1)
    neighbour_labels = flat_labels[neighbour_idx]

    # bincount tallies the votes per label; argmax picks the winner and
    # resolves ties in favour of the first (smallest) label.
    winners = [np.bincount(votes).argmax() for votes in neighbour_labels]
    return np.array(winners, dtype=int).reshape(len(winners), 1)
def run_knn(k, train_data, train_labels, valid_data):
    """Uses the supplied training inputs and labels to make
    predictions for validation data using the K-nearest neighbours
    algorithm.

    Note: N_TRAIN is the number of training examples,
          N_VALID is the number of validation examples,
          and M is the number of features per example.

    Inputs:
        k:            The number of neighbours to use for classification
                      of a validation example. Must be at least 1.
        train_data:   The N_TRAIN x M array of training data.
        train_labels: The N_TRAIN x 1 vector of training labels
                      corresponding to the examples in train_data
                      (must be binary).
        valid_data:   The N_VALID x M array of data to
                      predict classes for.

    Outputs:
        valid_labels: The N_VALID x 1 vector of predicted labels for
                      the validation data. For k == 1 this is the label
                      of the single nearest training example, returned
                      with the dtype of train_labels; for k > 1 it is an
                      integer 0/1 majority vote.

    Raises:
        ValueError: If k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    # N_VALID x N_TRAIN matrix: rows are validation examples, columns hold
    # the l2 distance to each training example.
    distance = l2_distance(valid_data.T, train_data.T)

    # Turn the labels into a flat vector so they can be indexed by
    # training-example position.
    train_labels = train_labels.reshape(-1)

    if k == 1:
        # A single neighbour needs no vote: return the label of the closest
        # training example for every validation example.
        nearest = np.argmin(distance, axis=1)
        return train_labels[nearest].reshape(-1, 1)

    # Indices of the k smallest distances in every row, computed for all
    # rows at once. The pivot is clamped to the last valid column so that
    # k == N_TRAIN (every training example votes) no longer raises.
    kth = min(k, distance.shape[1] - 1)
    nearest = np.argpartition(distance, kth, axis=1)[:, :k]

    # N_VALID x k array with the labels of the k closest training examples.
    neighbour_labels = train_labels[nearest]

    # Note this only works for binary labels: the mean is the fraction of
    # neighbours voting for class 1, and a tie goes to class 1.
    # The builtin int is used because the np.int alias was removed from NumPy.
    valid_labels = (np.mean(neighbour_labels, axis=1) >= 0.5).astype(int)
    return valid_labels.reshape(-1, 1)