def main():
    data_file = 'seeds_dataset_shuffled.txt'
    instances, labels = read_data(data_file)

    # We want the labels to run from 0 through K-1, not 1 through K.
    labels = [i - 1 for i in labels]

    print 'Read %d instances and %d labels from file %s.' \
        % (len(instances), len(labels), data_file)

    if len(instances) != len(labels):
        raise Exception('Expected equal number of instances and labels.')
    else:
        n = len(instances)

    # Find the number of clusters by counting the unique elements in labels.
    K = num_unique_labels(labels)
    print 'Found %d unique labels.' % K

    # k-nearest-neighbor classification for various k.
    k_range = range(1, 31)

    # Cross-validation errors for the different values of k.
    cv_error = []

    # 10-fold cross-validation.
    num_folds = 10
    for k in k_range:
        total_error = 0.0

        # Create and process all folds.
        for fold_id in range(num_folds):
            # Separate the indices into training indices and
            # test indices for this particular fold.
            fold_train_indices, fold_test_indices = \
                get_fold_indices(n, num_folds, fold_id)

            # TASK 3.4.1
            # Get the training data and labels and create a k-NN classifier.
            train_data = [instances[i] for i in fold_train_indices]
            train_labels = [labels[i] for i in fold_train_indices]
            classifier = lambda x: \
                nn_classifier(x, train_data, train_labels, k, K)

            # TASK 3.4.2
            # Get the test data and labels and evaluate the classifier's error.
            test_data = [instances[i] for i in fold_test_indices]
            test_labels = [labels[i] for i in fold_test_indices]
            fold_error = classification_error(classifier, test_data, test_labels)

            total_error += fold_error

        cv_error.append(total_error / num_folds)

    # Print the values of k and the corresponding cross-validation errors.
    for i in range(len(k_range)):
        print k_range[i], cv_error[i]
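
# The helpers used above (read_data, num_unique_labels, get_fold_indices,
# nn_classifier, classification_error) are defined elsewhere in the code and
# are not shown in this section. Purely as an illustration of what the
# cross-validation loop expects from them, here is a minimal sketch of how
# such helpers could look -- assuming folds assigned by index modulo
# num_folds, Euclidean distance, and a simple 0/1 error rate -- not
# necessarily how they are implemented in the accompanying module.

def get_fold_indices_sketch(n, num_folds, fold_id):
    # Index i goes to fold (i % num_folds); that fold is the test set and
    # all remaining indices form the training set.
    test_indices = [i for i in range(n) if i % num_folds == fold_id]
    train_indices = [i for i in range(n) if i % num_folds != fold_id]
    return train_indices, test_indices

def nn_classifier_sketch(point, train_data, train_labels, k, K):
    # Squared Euclidean distance from the query point to every training instance.
    distances = [sum((a - b) ** 2 for a, b in zip(point, x)) for x in train_data]
    # Indices of the k nearest training instances.
    nearest = sorted(range(len(train_data)), key=lambda i: distances[i])[:k]
    # Majority vote over the K classes; ties resolve to the smallest label.
    votes = [0] * K
    for i in nearest:
        votes[train_labels[i]] += 1
    return votes.index(max(votes))

def classification_error_sketch(classifier, test_data, test_labels):
    # Fraction of test points whose predicted label differs from the true label.
    errors = sum(1 for x, y in zip(test_data, test_labels) if classifier(x) != y)
    return float(errors) / len(test_data)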