from scipy.spatial.distance import correlation
from sklearn.feature_extraction import DictVectorizer

import kcluster


def cluster(vals, k):
    # Vectorize the dict-based feature records into a dense matrix, then
    # cluster it into k groups using correlation distance.
    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform(vals)
    K, C = kcluster.kcluster(X, correlation, k)
    return K, C, dv
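
# A minimal usage sketch for cluster() above. The feature dicts and k=2 are
# illustrative only, not data from this repo.
if __name__ == '__main__':
    vals = [
        {'speed': 1.2, 'move_count': 30},
        {'speed': 0.4, 'move_count': 5},
        {'speed': 1.1, 'move_count': 28},
        {'speed': 0.5, 'move_count': 7},
    ]
    K, C, dv = cluster(vals, k=2)
    print(K)                  # one cluster label per input record
    print(dv.feature_names_)  # column order DictVectorizer chose for X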
import math
from copy import deepcopy


def k_fold(k, binned_data_set, validate_data, bin_lengths, db, shuffle, type,
           knn, debug_file, output_file, reduction_func=None):
    debug_file.write('STARTING K-FOLD\n')
    output_file.write('STARTING K-FOLD\n')
    if reduction_func:
        debug_file.write('RUNNING WITH ' + reduction_func + '\n')
        output_file.write('RUNNING WITH ' + reduction_func + '\n')

    # List to store the mean squared error from each of the k iterations of a
    # regression dataset
    mse_results = []
    # List to store the 0-1 loss results from each of the k iterations of a
    # classification dataset
    loss_results = []

    attr_headers = db.get_attr()
    class_list = db.get_classifiers()

    # For each bin in our data
    for bin_number in range(k):
        print("K FOLD ITERATION: ", bin_number)
        output_file.write('K FOLD ITERATION ' + str(bin_number) + '\n')
        debug_file.write('K FOLD ITERATION ' + str(bin_number) + '\n')

        test_data = []
        training_data = deepcopy(binned_data_set)
        row_idx = 0

        # Move rows from the main data set into our test_data subset until it
        # is the length of the bin we are holding out. This ensures we stop
        # after finding all of the rows that match the bin we want to use.
        while len(test_data) < bin_lengths[bin_number]:
            if training_data[row_idx][0] == bin_number:
                test_data.append(training_data.pop(row_idx).copy()[1:])
                row_idx -= 1
            row_idx += 1

        # Remove the bin numbers from our training data; the classifier does
        # not support them. Each row is [bin_number, [features...]], so pop
        # the bin number and unwrap the remaining single-element list.
        for row_idx2 in range(len(training_data)):
            training_data[row_idx2].pop(0)
            training_data[row_idx2] = training_data[row_idx2][0]

        if shuffle:
            training_data = process_data.shuffle_all(training_data, .1)

        debug_file.write('TRAINING DATA: \n')
        for row in training_data:
            debug_file.write(str(row) + '\n')
        print('FULL TRAINING DATA LENGTH: ', len(training_data))

        # Check which reduction_func we are using
        if reduction_func == 'edited_nn':
            training_data = knn.edited_knn(training_data, validate_data)
            debug_file.write('\n\n REDUCED TRAINING DATA: \n')
            for row in training_data:
                debug_file.write(str(row) + '\n')
        elif reduction_func == 'condensed_nn':
            training_data = knn.condensed_nn(training_data)
            debug_file.write('\n\n REDUCED TRAINING DATA: \n')
            for row in training_data:
                debug_file.write(str(row) + '\n')
        elif reduction_func == 'k_means':
            if type == 'classification':
                # Use the size of the edited-nn set to choose the cluster count
                edited_data = knn.edited_knn(training_data, validate_data)
                print("Finished enn.")
                print("Making ", len(edited_data), " clusters.")
                kc = kcluster(len(edited_data), 10, training_data,
                              db.get_classifier_attr_cols(), 'k-means')
            else:
                # The cluster count must be an integer
                num_clusters = int(math.sqrt(len(training_data)))
                kc = kcluster(num_clusters, 10, training_data,
                              db.get_classifier_attr_cols(), 'k-means')
            training_data = kc.get_centroids()
            # Drop any empty centroids
            training_data = [point for point in training_data if len(point) > 0]
        elif reduction_func == 'k_medoids':
            if type == 'classification':
                edited_data = knn.edited_knn(training_data, validate_data)
                print("Finished enn.")
                print("Making ", len(edited_data), " clusters.")
                kc = kcluster(len(edited_data), 10, training_data,
                              db.get_classifier_attr_cols(), 'k-medoids')
            else:
                num_clusters = int(math.sqrt(len(training_data)))
                kc = kcluster(num_clusters, 100, training_data,
                              db.get_classifier_attr_cols(), 'k-medoids')
            # Replace the training data with the medoid rows
            medoid_idxs = kc.get_medoids()
            training_data = [training_data[idx] for idx in medoid_idxs]

        print('CONDENSED TRAINING DATA LENGTH: ', len(training_data))

        current_loss_results = []  # 0-1 loss result for each test point
        squared_errors = []  # Squared error of each regression prediction
        debug_file.write('\n\nTEST DATA: \n')
        for row in test_data:
            debug_file.write(str(row) + '\n')

        # For each row (sample) in our test_data, run knn to predict its class
        for test_row in test_data:
            debug_file.write('RUNNING K-NN ON TEST POINT ' + str(test_row) + '\n')
            # Guess the class with knn
            predicted = knn.k_nearest_neighbors(training_data, test_row[0])
            debug_file.write('PREDICTED CLASS: ' + str(predicted) + '\n')

            if type == 'classification':
                if predicted == test_row[0][db.get_classifier_col()]:
                    current_loss_results.append(0)
                else:
                    current_loss_results.append(1)
            elif type == 'regression':
                squared_errors.append(
                    pow((float(test_row[0][db.get_classifier_col()]) - predicted), 2))

        # Compute the average 0-1 loss or mean squared error for this iteration
        if type == 'classification':
            loss = sum(current_loss_results) / len(current_loss_results)
            output_file.write('CALCULATED LOSS: ' + str(loss) + '\n')
            debug_file.write('CALCULATED LOSS: ' + str(loss) + '\n')
            loss_results.append(loss)
        elif type == 'regression':
            mse = sum(squared_errors) / len(squared_errors)
            output_file.write('CALCULATED MSE: ' + str(mse) + '\n')
            debug_file.write('CALCULATED MSE: ' + str(mse) + '\n')
            mse_results.append(mse)

    print("0-1 LOSS RESULTS: ", loss_results)
    print("MSE RESULTS: ", mse_results)

    # Return the results for the loss function that matches the dataset type
    if type == 'classification':
        return loss_results
    else:
        return mse_results
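
# A hedged sketch of how a caller might summarize what k_fold() returns: one
# value per fold (0-1 loss for classification, MSE for regression). The sample
# numbers here are illustrative, not results from this repo.
import statistics

fold_results = [0.12, 0.08, 0.15, 0.10, 0.11]  # e.g., 0-1 loss from k=5 folds
print('mean: %.4f, stdev: %.4f over %d folds'
      % (statistics.mean(fold_results), statistics.stdev(fold_results),
         len(fold_results)))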
from scipy.spatial.distance import correlation
from sklearn.metrics import silhouette_score

import kcluster


def cluster(X, k):
    K, C = kcluster.kcluster(X, correlation, k)
    # Report clustering quality as the mean silhouette coefficient
    print('silhouette_score: %.5f' % silhouette_score(X, K, metric=correlation))
    return K, C
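
# A hedged sketch of one way to use the silhouette score printed above: scan a
# range of cluster counts and keep the best-scoring result. It assumes
# kcluster.kcluster returns (labels, centroids) as in cluster(); the k range
# and the helper name best_k are illustrative, not repo API.
def best_k(X, k_range=range(2, 7)):
    best = None
    for k in k_range:
        K, C = kcluster.kcluster(X, correlation, k)
        score = silhouette_score(X, K, metric=correlation)
        if best is None or score > best[0]:
            best = (score, k, K, C)
    return best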
#!/usr/bin/env python
# encoding: utf-8
import pandas as pd
from scipy.spatial.distance import correlation
from sklearn.preprocessing import scale

import kcluster

df = pd.read_csv('speed_move_count.csv', index_col='uid')
X = scale(df.values)
K, C = kcluster.kcluster(X, correlation, 3)
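
# A follow-up sketch, assuming K holds one cluster label per row of df (as the
# call above suggests). It attaches the labels back to the DataFrame so each
# uid's cluster can be inspected; the 'cluster' column name is illustrative.
df['cluster'] = K
print(df.groupby('cluster').size())   # cluster sizes
print(df.groupby('cluster').mean())   # mean feature values per cluster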
import numpy as np

# path_manager, select_db, prepare_db, knn, process_data, kcluster, RBF, and
# FFNN come from this repo's modules.


def main():
    pm = path_manager()
    selected_dbs = select_db(pm.find_folders(pm.get_databases_dir()))

    for database in selected_dbs:
        # NOTE: OUTPUT WILL WRITE TO A FILE, AS DEFINED BELOW.
        # MAKE SURE TO CREATE THIS DIRECTORY BEFORE YOU RUN, AND YOU CAN
        # SHOW THE FILE THAT'S CREATED IN THE VIDEO FOR OUTPUT
        filename = "../output/kmedoids/" + database + "_output.txt"
        output_file = open(filename, "w+")

        db = prepare_db(database, pm)
        k_nn = knn(5, db.get_dataset_type(), db.get_classifier_col(),
                   db.get_classifier_attr_cols())

        classes = db.get_class_list() if db.get_dataset_type() == 'classification' else []
        class_count = len(classes) if db.get_dataset_type() == 'classification' else 1

        X = process_data.shuffle_all(db.get_data(), 1)
        y = np.array(db.get_data())[:, db.get_classifier_col()]

        # RUN K-MEDOIDS ------------------------------------------------------
        print("RUNNING K-MEDOIDS")
        kc = kcluster(10, 10, db.get_data(), db.get_classifier_attr_cols(),
                      'k-medoids')
        indices = kc.get_medoids()
        centers = [db.get_data()[i] for i in indices]

        rbf = RBF(len(centers), class_count, output_file, 25)
        rbf.fit(X, centers, y, db.get_dataset_type(), classes)

        print("INITIAL WEIGHTS: ", rbf.weights)
        output_file.write("INITIAL WEIGHTS: \n")
        output_file.write(str(rbf.weights) + "\n")
        print("CENTERS: ", centers)

        output_file.write("FINAL WEIGHTS: \n")
        output_file.write(str(rbf.weights) + "\n")
        output_file.write("FINAL TESTS: \n")
        rbf.test(X, db.get_dataset_type(), y, centers, classes)

        print("FINAL WEIGHTS:")
        print(rbf.weights)
        # ----------------------------------------------------------------------

        # BEGIN classification FFNN
        if db.get_dataset_type() == 'classification':
            # BEGIN preprocessing
            process_data.FFNN_encoding(db)

            # (1) First layer (input layer) has 1 node per attribute.
            # (2) Hidden layers have an arbitrary number of nodes.
            # (3) Output layer has 1 node per possible classification.
            layer_sizes = [len(db.get_attr()), 10, len(db.get_class_list())]  # (3)

            # This number is arbitrary.
            # NOTICE: Tune this per dataset
            learning_rate = .5
            ffnn = FFNN(layer_sizes, db.get_dataset_type(), database,
                        db.get_data(), learning_rate)

        # BEGIN regression FFNN
        elif db.get_dataset_type() == 'regression':
            process_data.FFNN_encoding(db)

            # (1) First layer (input layer) has 1 node per attribute.
            # (2) Hidden layers have an arbitrary number of nodes.
            # (3) Output layer has 1 node, just some real number.
            layer_sizes = [len(db.get_attr()) - 1, 5, 5, 1]
            learning_rate = .0001
            ffnn = FFNN(layer_sizes, db.get_dataset_type(), database,
                        db.get_data(), learning_rate)

        else:
            print('Database type invalid. Type = ' + db.get_dataset_type())
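
# A minimal sketch of the Gaussian basis activation an RBF network like the one
# trained in main() typically computes for each center. The spread sigma and
# the helper name rbf_activations are assumptions, not this repo's RBF API.
def rbf_activations(x, centers, sigma=1.0):
    # One Gaussian per center: exp(-||x - c||^2 / (2 * sigma^2))
    x = np.asarray(x, dtype=float)
    return np.array([
        np.exp(-np.linalg.norm(x - np.asarray(c, dtype=float)) ** 2
               / (2 * sigma ** 2))
        for c in centers
    ])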