Example #1
from scipy.spatial.distance import correlation
from sklearn.feature_extraction import DictVectorizer
import kcluster

def cluster(vals, k):
    # Vectorize a list of feature dicts into a dense numeric matrix
    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform(vals)
    K, C = kcluster.kcluster(X, correlation, k)
    return K, C, dv
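A minimal usage sketch (hypothetical data; assumes kcluster.kcluster returns a label array K and the cluster centers C, as the silhouette example further down suggests):

rows = [{'speed': 1.2, 'moves': 30},
        {'speed': 0.4, 'moves': 7},
        {'speed': 1.1, 'moves': 28}]
K, C, dv = cluster(rows, 2)
print(K)                  # cluster label for each row
print(dv.feature_names_)  # column order DictVectorizer used for X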
Example #2
def k_fold(k,
           binned_data_set,
           validate_data,
           bin_lengths,
           db,
           shuffle,
           type,
           knn,
           debug_file,
           output_file,
           reduction_func=None):
    debug_file.write('STARTING K-FOLD\n')
    output_file.write('STARTING K-FOLD\n')

    if reduction_func:
        debug_file.write('RUNNING WITH ' + reduction_func + '\n')
        output_file.write('RUNNING WITH ' + reduction_func + '\n')

    # Mean squared error from each of the k iterations of a regression dataset
    mse_results = []
    # 0-1 loss results from each of the k iterations of a classification dataset
    loss_results = []
    attr_headers = db.get_attr()
    class_list = db.get_classifiers()
    # For each bin in our data
    for bin_number in range(k):
        print("K FOLD ITERATION: ", bin_number)
        output_file.write('K FOLD ITERATION ' + str(bin_number) + '\n')
        debug_file.write('K FOLD ITERATION ' + str(bin_number) + '\n')

        test_data = []
        training_data = deepcopy(binned_data_set)
        row_idx = 0

        # Move rows whose bin number matches the current fold from the
        # training copy into test_data, stopping once every row belonging
        # to that bin has been collected
        while len(test_data) < bin_lengths[bin_number]:
            if training_data[row_idx][0] == bin_number:
                test_data.append(training_data.pop(row_idx).copy()[1:])
                row_idx -= 1
            row_idx += 1

        # Strip the bin number from each training row; the classifier does
        # not understand bin numbers, so keep only the raw data row
        for row_idx2 in range(len(training_data)):
            training_data[row_idx2].pop(0)
            training_data[row_idx2] = training_data[row_idx2][0]

        if shuffle:
            training_data = process_data.shuffle_all(training_data, .1)

        debug_file.write('TRAINING DATA: \n')
        for row in training_data:
            debug_file.write(str(row) + '\n')
        print('FULL TRAINING DATA LENGTH: ', len(training_data))
        # Check which reduction_func we are using
        if reduction_func == 'edited_nn':
            training_data = knn.edited_knn(training_data, validate_data)
            debug_file.write('\n\n REDUCED TRAINING DATA: \n')
            for row in training_data:
                debug_file.write(str(row) + '\n')
        elif reduction_func == 'condensed_nn':
            training_data = knn.condensed_nn(training_data)
            debug_file.write('\n\n REDUCED TRAINING DATA: \n')
            for row in training_data:
                debug_file.write(str(row) + '\n')
        elif reduction_func == 'k_means':
            if type == 'classification':
                edited_data = knn.edited_knn(training_data, validate_data)
                print("Finished enn.")
                print("Making ", len(edited_data), " clusters.")
                kc = kcluster(len(edited_data), 10, training_data,
                              db.get_classifier_attr_cols(), 'k-means')
            else:
                # sqrt(n) heuristic; cast to int for a valid cluster count
                num_clusters = int(math.sqrt(len(training_data)))
                kc = kcluster(num_clusters, 10, training_data,
                              db.get_classifier_attr_cols(), 'k-means')
            training_data = kc.get_centroids()
            # Filter out empty centroids; building a new list avoids the
            # index-skipping bug of deleting while enumerating
            training_data = [point for point in training_data if len(point) > 0]

        elif reduction_func == 'k_medoids':
            if type == 'classification':
                edited_data = knn.edited_knn(training_data, validate_data)
                print("Finished enn.")
                print("Making ", len(edited_data), " clusters.")
                kc = kcluster(len(edited_data), 10, training_data,
                              db.get_classifier_attr_cols(), 'k-medoids')
            else:
                # sqrt(n) heuristic; cast to int for a valid cluster count
                num_clusters = int(math.sqrt(len(training_data)))
                kc = kcluster(num_clusters, 100, training_data,
                              db.get_classifier_attr_cols(), 'k-medoids')
            medoid_idxs = kc.get_medoids()
            new_training_data = []
            for idx in medoid_idxs:
                new_training_data.append(training_data[idx])
            training_data = new_training_data

        print('CONDENSED TRAINING DATA LENGTH: ', len(training_data))

        current_loss_results = []  # 0-1 loss of each prediction this iteration
        squared_errors = []  # Squared error of each regression prediction

        debug_file.write('\n\nTEST DATA: \n')
        for row in test_data:
            debug_file.write(str(row) + '\n')
        # For each row (sample) in our test_data, run knn to predict its class
        for test_row in test_data:
            debug_file.write('RUNNING K-NN ON TEST POINT ' + str(test_row) +
                             '\n')
            # Guess class with knn
            predicted = knn.k_nearest_neighbors(training_data, test_row[0])
            debug_file.write('PREDICTED CLASS: ' + str(predicted) + '\n')
            if type == 'classification':
                if predicted == test_row[0][db.get_classifier_col()]:
                    current_loss_results.append(0)
                else:
                    current_loss_results.append(1)

            elif type == 'regression':
                error = float(test_row[0][db.get_classifier_col()]) - predicted
                squared_errors.append(error ** 2)

        # Compute average 0-1 loss and mean absolute error for this iteration
        if type == 'classification':
            loss = sum(current_loss_results) / len(current_loss_results)
            output_file.write('CALCULATED LOSS: ' + str(loss) + '\n')
            debug_file.write('CALCULATED LOSS: ' + str(loss) + '\n')
            loss_results.append(loss)
        elif type == 'regression':
            mse = sum(squared_errors) / len(squared_errors)
            output_file.write('CALCULATED MSE: ' + str(mse) + '\n')
            debug_file.write('CALCULATED MSE: ' + str(mse) + '\n')
            mse_results.append(mse)

    print("0-1 LOSS RESULTS: ", loss_results)
    print("MsE RESULTS: ", mse_results)
    # Return the correct loss function results
    if type == 'classification':
        return loss_results
    else:
        return mse_results
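The fold-splitting pattern above is easier to see in isolation. A minimal standalone sketch of the same idea (hypothetical helper, independent of the db/knn classes used above), where each row carries its bin number in position 0:

def split_fold(binned_rows, bin_number):
    # Rows whose first element matches bin_number form the test fold;
    # everything else (with the bin tag stripped) is training data
    test_fold = [row[1:] for row in binned_rows if row[0] == bin_number]
    training = [row[1:] for row in binned_rows if row[0] != bin_number]
    return training, test_fold

rows = [[0, 'a'], [1, 'b'], [0, 'c'], [2, 'd']]
train, test = split_fold(rows, 0)  # test gets the rows tagged with bin 0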
Example #3
from scipy.spatial.distance import correlation
from sklearn.metrics import silhouette_score
import kcluster

def cluster(X, k):
    K, C = kcluster.kcluster(X, correlation, k)
    print('silhouette_score: %.5f' %
          silhouette_score(X, K, metric=correlation))
    return K, C
Example #4
#!/usr/bin/env python
# encoding: utf-8

import pandas as pd
from scipy.spatial.distance import correlation
from sklearn.preprocessing import scale
import kcluster

# Standardize each column to zero mean and unit variance before clustering
df = pd.read_csv('speed_move_count.csv', index_col='uid')
X = scale(df.values)
K, C = kcluster.kcluster(X, correlation, 3)
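The number of clusters is hard-coded to 3 here. One way to choose it instead is to compare silhouette scores over a range of candidates, as in this sketch (assumes, like the other examples, that kcluster.kcluster returns the label array first):

from sklearn.metrics import silhouette_score

best_k, best_score = None, -1.0
for k in range(2, 8):
    labels, _ = kcluster.kcluster(X, correlation, k)
    score = silhouette_score(X, labels, metric=correlation)
    if score > best_score:
        best_k, best_score = k, score
print('best k = %d (silhouette %.3f)' % (best_k, best_score))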
Example #5
def main():
    pm = path_manager()
    selected_dbs = select_db(pm.find_folders(pm.get_databases_dir()))

    for database in selected_dbs:
        # NOTE OUTPUT WILL WRITE TO A FILE, AS DEFINED BELOW:
        # MAKE SURE TO CREATE THIS DIRECTORY BEFORE YOU RUN, AND YOU CAN
        # SHOW THE FILE THAT'S CREATED IN THE VIDEO FOR OUTPUT
        filename = "../output/kmedoids/" + database + "_output.txt"
        output_file = open(filename, "w+")

        db = prepare_db(database, pm)
        k_nn = knn(5, db.get_dataset_type(), db.get_classifier_col(),
                   db.get_classifier_attr_cols())
        is_classification = db.get_dataset_type() == 'classification'
        classes = db.get_class_list() if is_classification else []
        class_count = len(classes) if is_classification else 1
        X = process_data.shuffle_all(db.get_data(), 1)
        y = np.array(db.get_data())[:, db.get_classifier_col()]

        # RUN K-MEDOIDS ------------------------------------------------------------
        print("RUNNING K-MEDOIDS")
        kc = kcluster(10, 10, db.get_data(), db.get_classifier_attr_cols(),
                      'k-medoids')
        indices = kc.get_medoids()
        centers = [db.get_data()[i] for i in indices]
        rbf = RBF(len(centers), class_count, output_file, 25)
        rbf.fit(X, centers, y, db.get_dataset_type(), classes)
        print("INITIAL WEIGHTS: ", rbf.weights)
        output_file.write("INITIAL WEIGHTS: \n")
        output_file.write(str(rbf.weights) + "\n")
        print("CENTERS: ", centers)
        output_file.write("FINAL WEIGHTS: \n")
        output_file.write(str(rbf.weights) + "\n")
        output_file.write("FINAL TESTS: \n")
        rbf.test(X, db.get_dataset_type(), y, centers, classes)
        print("FINALS WEIGHTS:")
        print(rbf.weights)
        # ----------------------------------------------------------------------------

        # BEGIN classification FFNN
        if db.get_dataset_type() == 'classification':

            # BEGIN preprocessing
            process_data.FFNN_encoding(db)

            # (1) The first (input) layer has one node per attribute.
            # (2) Hidden layers have an arbitrary number of nodes.
            # (3) The output layer has one node per possible classification.

            layer_sizes = [len(db.get_attr()), 10,
                           len(db.get_class_list())]  # (1), (2), (3)

            # This number is arbitrary.
            # NOTICE: Tune this per dataset
            learning_rate = .5

            ffnn = FFNN(layer_sizes, db.get_dataset_type(), database,
                        db.get_data(), learning_rate)

        # BEGIN regression FFNN
        elif db.get_dataset_type() == 'regression':

            process_data.FFNN_encoding(db)

            # (1) First layer (input layer) has one node per attribute.
            # (2) Hidden layers have an arbitrary number of nodes.
            # (3) Output layer has a single node emitting one real number.
            layer_sizes = [len(db.get_attr()) - 1, 5, 5, 1]

            learning_rate = .0001

            ffnn = FFNN(layer_sizes, db.get_dataset_type(), database,
                        db.get_data(), learning_rate)

        else:
            print('Database type invalid. Type = ' + db.get_dataset_type())
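A toy illustration of the layer-sizing rules in the comments above (hypothetical attribute and class counts, not taken from any dataset on this page):

n_attrs, n_classes = 8, 3
classification_layers = [n_attrs, 10, n_classes]  # one output node per class
regression_layers = [n_attrs, 5, 5, 1]            # single real-valued output
print(classification_layers, regression_layers)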