コード例 #1
0
def main():
    """Cross-validate a k-nearest-neighbor classifier on the seeds data set.

    Reads instances and labels from 'seeds_dataset_shuffled.txt' through
    read_data(), shifts every label down by one, and then, for each k in
    1..30, averages the error returned by classification_error() over 10
    folds.  Finally prints one "k error" line per value of k.

    The helpers read_data, num_unique_labels, get_fold_indices,
    nn_classifier and classification_error are not defined in this
    function and must be provided by the surrounding module.

    Raises:
        Exception: if the number of instances and labels differ.
    """
    data_file = 'seeds_dataset_shuffled.txt'
    instances, labels = read_data(data_file)

    # want labels run from 0 through K-1
    # not 1 through K
    labels = [label - 1 for label in labels]

    # Every print below is a single parenthesized expression, so it emits
    # the same text under the Python 2 print statement and under the
    # Python 3 print function.
    print('Read %d instances and %d labels from file %s.'
          % (len(instances), len(labels), data_file))

    # Guard clause: stop before any fold is built from mismatched data.
    if len(instances) != len(labels):
        raise Exception('Expected equal number of instances and labels.')
    n = len(instances)

    # Find number of clusters by finding out how many unique elements are
    # there in labels.
    K = num_unique_labels(labels)
    print('Found %d unique labels.' % K)

    # k-nearest neighbor classification for various k
    k_range = range(1, 31)

    # cross-validation error for each k, in the same order as k_range
    cv_error = []

    # 10-fold cross-validation
    num_folds = 10

    for k in k_range:

        total_error = 0.0

        # create and process all folds
        for fold_id in range(num_folds):

            # separate indices into training indices
            # and test indices for this particular fold
            fold_train_indices, fold_test_indices = \
                get_fold_indices(n, num_folds, fold_id)

            # TASK 3.4.1
            # get the training data and labels
            # and create a k-NN classifier
            train_data = [instances[x] for x in fold_train_indices]
            train_labels = [labels[x] for x in fold_train_indices]

            # The closure is consumed inside this same iteration, so it
            # always sees the current fold's training set and current k.
            def classifier(p):
                return nn_classifier(p, train_data, train_labels, k, K)

            # TASK 3.4.2
            # get the test data and labels
            # and evaluate the classifier's error
            test_data = [instances[x] for x in fold_test_indices]
            test_label = [labels[x] for x in fold_test_indices]
            fold_error = classification_error(classifier,
                                              test_data, test_label)

            total_error += fold_error

        # total_error starts as a float, so this is true division
        cv_error.append(total_error / num_folds)

    # print the values for k and the corresponding cross-validation errors
    for k, error in zip(k_range, cv_error):
        print('%s %s' % (k, error))
コード例 #2
0
def main():
    """Report 10-fold cross-validation error of k-NN for k = 1..30.

    Loads the data from 'seeds_dataset_shuffled.txt' with read_data(),
    subtracts one from every label, and for each k builds a classifier
    around nn_classifier() on each fold's training split, scoring it on
    the fold's test split with classification_error().  The per-fold
    errors are averaged and printed as one "k error" line per k.

    read_data, num_unique_labels, get_fold_indices, nn_classifier and
    classification_error are referenced but not defined here; they must
    come from the surrounding module.

    Raises:
        Exception: if the number of instances and labels differ.
    """
    data_file = 'seeds_dataset_shuffled.txt'
    instances, labels = read_data(data_file)

    # want labels run from 0 through K-1
    # not 1 through K
    labels = [label - 1 for label in labels]

    # Single-argument print(...) calls produce identical output whether
    # print is the Python 2 statement or the Python 3 function.
    print('Read %d instances and %d labels from file %s.'
          % (len(instances), len(labels), data_file))

    # Fail early; nothing below makes sense for mismatched inputs.
    if len(instances) != len(labels):
        raise Exception('Expected equal number of instances and labels.')
    n = len(instances)

    # Find number of clusters by finding out how many unique elements are
    # there in labels.
    K = num_unique_labels(labels)
    print('Found %d unique labels.' % K)

    # k-nearest neighbor classification for various k
    k_range = range(1, 31)

    # cross-validation error for each k, aligned with k_range
    cv_error = []

    # 10-fold cross-validation
    num_folds = 10

    for k in k_range:

        total_error = 0.0

        # create and process all folds
        for fold_id in range(num_folds):

            # separate indices into training indices
            # and test indices for this particular fold
            fold_train_indices, fold_test_indices = \
                get_fold_indices(n, num_folds, fold_id)

            # get the training data and labels
            # and create a k-NN classifier
            train_data = [instances[i] for i in fold_train_indices]
            train_labels = [labels[i] for i in fold_train_indices]

            # Used only within this iteration, so the captured fold data
            # and k are always the current ones.
            def classifier(x):
                return nn_classifier(x, train_data, train_labels, k, K)

            # get the test data and labels
            # and evaluate the classifier's error
            test_data = [instances[i] for i in fold_test_indices]
            test_label = [labels[i] for i in fold_test_indices]
            fold_error = classification_error(classifier, test_data,
                                              test_label)

            total_error += fold_error

        # total_error is a float accumulator, so the mean is a float
        cv_error.append(total_error / num_folds)

    # print the values for k and the corresponding cross-validation errors
    for k, error in zip(k_range, cv_error):
        print('%s %s' % (k, error))