def evaluate_model(model, model_type, num_of_classes, candidate_feature_set, data_set):
    '''This method uses the given feature subset to cluster the given data and
    scores performance using an LDA-like objective function.'''
    # Convert the candidate_feature_set representation from a 0/1 mask over
    # f_1, ..., f_d to the list of indices where f_i = 1
    # (for example, [1 0 0 1 0] -> [0 3])
    candidate_feature_set = \
        [idx for idx in xrange(len(candidate_feature_set))
         if candidate_feature_set[idx] == 1]
    if model_type == "Kmeans":
        model = KMeans(num_of_classes)
    elif model_type == "HAC":
        model = HAC(num_of_classes)
    model.cluster(data_set[:, candidate_feature_set])
    return model.calculate_performance()
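
# Illustrative usage sketch (not part of the original module): evaluate_model
# expects the candidate feature set as a 0/1 mask over the data's columns.
# The toy data and the choice of 3 clusters below are assumptions for the example.
def _example_evaluate_model_usage():
    import numpy as np
    toy_data = np.random.rand(50, 5)   # hypothetical (n x d) dataset
    mask = [1, 0, 0, 1, 0]             # keep columns 0 and 3
    # The model argument is rebuilt inside evaluate_model from model_type,
    # so passing None is fine here.
    return evaluate_model(None, "Kmeans", 3, mask, toy_data)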
def perform_SFS_feature_selection(model, model_type, num_of_classes, data_set):
    # Candidate feature indices that have not been chosen yet
    feature_set = [i for i in xrange(data_set.shape[1])]
    chosen_features = []
    chosen_clusters = []
    base_performance = float("-inf")
    # While there are still features to choose from...
    while len(feature_set) > 0:
        # Initialize performance metrics for this round
        best_performance = float("-inf")
        best_clusters = []
        # Try each feature that hasn't been chosen yet and train the model
        for feature in feature_set:
            chosen_features.append(feature)
            # Train model
            if model_type == "Kmeans":
                model = KMeans(num_of_classes)
            elif model_type == "HAC":
                model = HAC(num_of_classes)
            # Cluster using only the currently chosen feature columns
            clusters = model.cluster(data_set[:, chosen_features])
            # Calculate performance via the LDA-like objective function
            current_performance = model.calculate_performance()
            # If this combination of features beats the best performance seen
            # this round, take note...
            if current_performance > best_performance:
                best_performance = current_performance
                best_feature = feature
                best_clusters = clusters
            chosen_features.remove(feature)
        # If the best performance this round beats the best performance we've
        # seen overall, add the winning feature to the chosen set
        if best_performance > base_performance:
            base_performance = best_performance
            feature_set.remove(best_feature)
            chosen_features.append(best_feature)
            chosen_clusters = best_clusters
        else:
            break
    return chosen_features, chosen_clusters
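
# Illustrative usage sketch (hypothetical, not part of the original module):
# SFS greedily adds one feature per round for as long as the LDA-like objective
# keeps improving. Assumes a toy dataset whose columns are all candidate
# features, as the function above expects.
def _example_sfs_usage():
    import numpy as np
    toy_data = np.random.rand(60, 4)   # hypothetical (n x d) dataset
    features, clusters = perform_SFS_feature_selection(None, "Kmeans", 3, toy_data)
    print("SFS picked feature indices: %s" % str(features))
    return features, clusters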
data_instances = []
data_file = open(test[0])
print "Running with %s" % test[0]
for line in data_file:
    line_split = line.split(',')
    data_instances.append(map(float, line_split))
data_instances = np.array(data_instances)

# Re-cluster with k-means and HAC using the feature subsets previously
# selected by SFS and GA
kmeans_model = KMeans(test[1])
hac_model = HAC(test[1])

# Glass dataset
if "glass" in test[0]:
    kmeans_sfs_glass = np.array([1, 3])
    kmeans_model.cluster(data_instances[:, kmeans_sfs_glass])
    print "Kmeans SFS glass performance = %f" % kmeans_model.calculate_performance()

    kmeans_ga_glass = np.array([0, 1, 2, 3, 4, 5, 6])
    kmeans_model = KMeans(test[1])
    kmeans_model.cluster(data_instances[:, kmeans_ga_glass])
    print "Kmeans GA glass performance = %f" % kmeans_model.calculate_performance()

    hac_sfs_glass = np.array([0])
    hac_model.cluster(data_instances[:, hac_sfs_glass])
    print "HAC SFS glass performance = %f" % hac_model.calculate_performance()
# Iris dataset
elif "iris" in test[0]:
    kmeans_sfs_iris = np.array([1])
    kmeans_model = KMeans(test[1])
        centered_data = self.audio_data - mean_of_audio_data
        covariance_matrix = np.cov(centered_data, rowvar=False)
        # eigh returns eigenvalues in ascending order and eigenvectors as columns
        eigen_values, eigen_vectors = la.eigh(covariance_matrix)
        eigen_values_sorted = sorted(eigen_values, reverse=True)
        # Keep every eigenvector whose eigenvalue is at least as large as the
        # k-th largest eigenvalue
        indices_of_top_eigen_vectors = np.where(
            eigen_values >= eigen_values_sorted[
                self.number_of_principal_components - 1])[0]
        # Eigenvectors are the columns of eigen_vectors, so select columns and
        # transpose so that each row is a principal component
        top_eigen_vectors = eigen_vectors[:, indices_of_top_eigen_vectors].T
        return top_eigen_vectors

    def project_data_along_principal_components(self, eigen_vectors):
        # Rows of eigen_vectors are principal components, so project with the
        # transpose: (n x d) . (d x k) -> (n x k)
        projected_data = np.dot(self.audio_data, eigen_vectors.T)
        return projected_data


if __name__ == '__main__':
    pca = PCA()
    eigen_vectors = pca.compute_principal_components()
    projected_data = pca.project_data_along_principal_components(eigen_vectors)
    cluster_counts = range(2, 11)
    k_means = KMeans()
    losses = k_means.cluster(cluster_counts, projected_data)
    k_means.plot_objective_function(cluster_counts, losses)
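
# Cross-check sketch (hypothetical helper, not part of the original class): the
# same top-k selection can be done with np.argsort, which avoids over-selecting
# when eigenvalues are tied. Assumes `data` is an (n x d) NumPy array.
def _top_k_principal_components(data, k):
    import numpy as np
    centered = data - data.mean(axis=0)
    covariance = np.cov(centered, rowvar=False)
    values, vectors = np.linalg.eigh(covariance)   # ascending eigenvalues, column eigenvectors
    order = np.argsort(values)[::-1][:k]           # indices of the k largest eigenvalues
    return vectors[:, order].T                     # rows are principal components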
    def train(self, training_data, do_clustering=False, num_of_means=-1):
        if do_clustering:
            # Run k-means on the training set; the centroids become the
            # centers of the hidden nodes
            kmeans_model = KMeans(num_of_means)
            kmeans_model.cluster(training_data)
            centroids = kmeans_model.get_centroids()
            # Determine the spread for each hidden node:
            # spread = 2 * (average distance to the other centers)
            for cent_1_idx, cent_1_val in enumerate(centroids):
                total_distance = 0.0
                for cent_2_idx, cent_2_val in enumerate(centroids):
                    if cent_1_idx != cent_2_idx:
                        # Calculate distance between two centroids
                        total_distance += sqrt(
                            sum((np.array(cent_1_val[:-1]) -
                                 np.array(cent_2_val[:-1]))**2))
                # Spread defined above, centroid = hidden node center
                self.hidden_nodes.append(
                    (cent_1_val[:-1],
                     2 * (total_distance / (len(centroids) - 1))))
        else:
            # A random 10% subset of the training data serves as the hidden
            # node centers
            indices = []
            while len(indices) < (training_data.shape[0] * 0.1):
                random_idx = np.random.randint(0, training_data.shape[0])
                if random_idx not in indices:
                    indices.append(random_idx)
            random_centers_for_hidden_nodes = training_data[indices]
            # Determine the spread for each hidden node:
            # spread = 2 * (average distance to the other centers)
            for inst_1_idx, inst_1_val in enumerate(random_centers_for_hidden_nodes):
                total_distance = 0.0
                for inst_2_idx, inst_2_val in enumerate(random_centers_for_hidden_nodes):
                    if inst_1_idx != inst_2_idx:
                        # Calculate distance between two instances
                        total_distance += sqrt(
                            sum((inst_1_val[:-1] - inst_2_val[:-1])**2))
                # Spread defined above, instance = hidden node center
                self.hidden_nodes.append(
                    (inst_1_val[:-1],
                     2 * (total_distance / (len(indices) - 1))))
            print("Chose random training instances to serve as hidden node centers")

        # Initial weights for gradient descent
        self.weights = \
            np.array([np.random.randint(-100, 100)
                      for weight in range(len(self.hidden_nodes))])
        print("Initialized weight vector")

        # Learn weights that determine each hidden node's influence on the output
        print("Started gradient descent")
        done = False
        while not done:
            # Batch updating of weights; accumulate per-instance updates in new_weights
            new_weights = np.array([0.0 for i in range(len(self.weights))])
            for instance in training_data:
                # Determine Gaussian outputs of the hidden nodes
                gaussian_outputs = []
                for node in self.hidden_nodes:
                    # Radial basis function: exp(-||x - c||^2 / (2 * sigma^2))
                    gaussian_outputs.append(
                        exp((-1 / float(2 * (node[1]**2))) *
                            sum((instance[:-1] - node[0])**2)))
                # Determine the error gradient (one component per weight)
                gradient = []
                for gaussian_output in gaussian_outputs:
                    if self.learner_type == "REGRESSION":
                        gradient.append(
                            2 * (np.dot(self.weights, gaussian_outputs) -
                                 instance[-1]) * gaussian_output)
                    else:
                        # Logistic activation of the weighted hidden-node outputs
                        activation_score = 1 / (
                            1 + exp(-np.dot(self.weights, gaussian_outputs)))
                        gradient.append(
                            (activation_score - instance[-1]) *
                            (activation_score * (1 - activation_score)) *
                            gaussian_output)
                # Accumulate this instance's weight update
                new_weights += (self.weights -
                                (self.learning_rate * np.array(gradient)))
            # Average the accumulated updates over the training set
            new_weights = new_weights / training_data.shape[0]
            if abs(sum(self.weights - new_weights)) < 0.01:
                done = True
            if not done:
                print("Weights were updated by %f last iteration" %
                      abs(sum(self.weights - new_weights)))
            self.weights = new_weights
        print("Found weights: %s" % str(self.weights))
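
    # Illustrative sketch of a single hidden-node activation, matching the
    # Gaussian radial basis function used in train() above:
    #     phi(x) = exp(-||x - c||^2 / (2 * sigma^2))
    # (Hypothetical helper shown for clarity, not part of the original class;
    # reuses the module's np and exp imports.)
    def _rbf_activation_sketch(self, instance_features, center, spread):
        squared_distance = float(np.sum(
            (np.asarray(instance_features) - np.asarray(center)) ** 2))
        return exp(-squared_distance / (2.0 * spread ** 2))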
learner_type = "CLASSIFICATION" else: learner_type = "REGRESSION" # 10 fold cross validation fold_size = data_instances.shape[0] / 10 data_indices = [idx for idx in xrange(data_instances.shape[0])] for num_of_means in xrange(1, 50): total_performance = 0.0 for holdout_fold_idx in xrange(10): # try some num of means kmeans_model = KMeans(num_of_means) # run k means on training data to find centroids clusters = kmeans_model.cluster( \ data_instances[ \ np.array( \ np.setdiff1d(data_indices, data_indices[ \ fold_size * holdout_fold_idx : \ fold_size * holdout_fold_idx + fold_size]))]) centroids = kmeans_model.get_centroids() for cluster_idx in xrange(len(clusters)): ave_label = 0.0 for instance in clusters[cluster_idx]: ave_label += instance[-1] if len(clusters[cluster_idx]) > 0: ave_label = ave_label / len(clusters[cluster_idx]) if learner_type == "CLASSIFICATION": ave_label = int(round(ave_label)) centroids[cluster_idx].append(ave_label) # for classification, vote to determine centroid classification # for regression, average to find centroid estimate
import numpy as np

from k_means import KMeans
from hac import HAC

'''This program reads in the test data and runs SFS and GA feature selection
using k-means and HAC clustering'''

# Datasets to test: (path, number of classes)
tests = [('data_sets/original/glass_data.txt', 7),
         ('data_sets/original/iris_data.txt', 3)]
         #('data_sets/original/spam_data.txt', 2)]

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print "Running with %s" % test[0]
    for line in data_file:
        line_split = line.split(',')
        data_instances.append(map(float, line_split))
    data_instances = np.array(data_instances)

    # Run GA feature selection using k-means
    kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])
    chosen_features = \
        perform_GA_feature_selection(kmeans_model, "Kmeans", test[1],
                                     data_instances)
    # Convert the returned 0/1 mask into the list of selected column indices
    feature_set = \
        [idx for idx in xrange(len(chosen_features[0]))
         if chosen_features[0][idx] == 1]
    print "Chosen features for K-means GA: %s" % str(chosen_features)
    kmeans_model = KMeans(test[1])
    chosen_clusters = kmeans_model.cluster(data_instances[:, feature_set])
    print "Clusters: %s" % str(chosen_clusters)
    #chosen_features = perform_GA_feature_selection(hac_model, "HAC", test[1], data_instances)
    #print "Chosen features for HAC GA: %s" % str(chosen_features)
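
# Loading sketch (hypothetical helper, not used above): the data files appear
# to be purely numeric, comma-separated rows, so np.loadtxt can read one in a
# single call. Kept separate so the original parsing loop above is unchanged.
def _load_instances(path):
    return np.loadtxt(path, delimiter=',')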