def evaluate_model(model, model_type, num_of_classes, candidate_feature_set, data_set):
    '''Clusters the given data using only the candidate feature subset and
    scores the result with an LDA-like objective function.'''
    # Convert candidate_feature_set from a bit string f_1, ..., f_d to the
    # list of indices where f_i = 1 (for example, [1 0 0 1 0] -> [0, 3])
    candidate_feature_set = \
        [idx for idx, bit in enumerate(candidate_feature_set) if bit == 1]
    if model_type == "Kmeans":
        model = KMeans(num_of_classes)
    elif model_type == "HAC":
        model = HAC(num_of_classes)
    model.cluster(data_set[:,candidate_feature_set])
    return model.calculate_performance() 
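

# The exact LDA-like objective lives in each model's calculate_performance();
# purely as an illustration, the sketch below shows one common LDA-like score
# for a clustering: total between-cluster scatter over total within-cluster
# scatter (larger = better separated, tighter clusters). The helper name and
# its assumption that clusters are plain numpy arrays are hypothetical, not
# the models' actual API.
def lda_like_score(clusters):
    '''clusters: list of 2-D numpy arrays, one array of instances per cluster.'''
    overall_mean = np.mean(np.vstack(clusters), axis=0)
    within = 0.0
    between = 0.0
    for cluster in clusters:
        if len(cluster) == 0:
            continue
        cluster_mean = np.mean(cluster, axis=0)
        within += np.sum((cluster - cluster_mean) ** 2)
        between += len(cluster) * np.sum((cluster_mean - overall_mean) ** 2)
    return between / within if within > 0 else float("inf")
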
def perform_SFS_feature_selection(model, model_type, num_of_classes, data_set):
    # List of candidate feature indices that have not been chosen yet
    feature_set = [i for i in xrange(data_set.shape[1])]
    chosen_features = []
    chosen_clusters = []
    base_performance = float("-inf")
    # while there are still features to choose from...
    while len(feature_set) > 0:
        # initialize performance metrics
        best_performance = float("-inf")
        best_clusters = []
        #print "best performance = %f" % best_performance
        # Pick a feature that hasn't been chosen yet and train the model
        for feature in feature_set:
            chosen_features.append(feature)
            # Train model
            if model_type == "Kmeans":
                model = KMeans(num_of_classes)
            elif model_type == "HAC":
                model = HAC(num_of_classes)
            #print "Modeling with %s" % chosen_features
            clusters = model.cluster(data_set[:, chosen_features])
            # Calculate performance via LDA-like objective function
            current_performance = model.calculate_performance()
            #print "model performance = %f" % current_performance
            # if this combo of features beats the best performance so far
            # take note...
            if current_performance > best_performance:
                best_performance = current_performance
                best_feature = feature
                best_clusters = clusters
                #print "best performance updated to %f" % best_performance
            chosen_features.remove(feature)
        # If best noted performance beats the best performance we've seen
        # so far, add to chosen features
        if best_performance > base_performance:
            base_performance = best_performance
            feature_set.remove(best_feature)
            chosen_features.append(best_feature)
            chosen_clusters = best_clusters
            #print "base performance = %f" % base_performance
        else:
            #print "best performance = %f" % base_performance
            break
    return chosen_features, chosen_clusters

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print "Running with %s" % test[0]
    for line in data_file:
        line_split = line.split(',')
        data_instances.append(map(float, line_split))
    data_instances = np.array(data_instances)

    # Evaluate the feature subsets previously selected by SFS and GA
    # using k-means and HAC
    kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])

    # Glass dataset
    if "glass" in test[0]:
        kmeans_sfs_glass = np.array([1, 3])
        kmeans_model.cluster(data_instances[:, kmeans_sfs_glass])
        print "Kmeans SFS glass performance = %f" % kmeans_model.calculate_performance()

        kmeans_ga_glass = np.array([0, 1, 2, 3, 4, 5, 6])
        kmeans_model = KMeans(test[1])
        kmeans_model.cluster(data_instances[:, kmeans_ga_glass])
        print "Kmeans GA glass performance = %f" % kmeans_model.calculate_performance()

        hac_sfs_glass = np.array([0])
        hac_model.cluster(data_instances[:, hac_sfs_glass])
        print "HAC SFS glass performance = %f" % hac_model.calculate_performance()

    # Iris dataset
    elif "iris" in test[0]:
        kmeans_sfs_iris = np.array([1])
        kmeans_model = KMeans(test[1])
Example #4
        centered_data = self.audio_data - mean_of_audio_data
        covariance_matrix = np.cov(centered_data, rowvar=False)
        eigen_values, eigen_vectors = la.eigh(covariance_matrix)
        print("vals:::", eigen_values)
        print("vectors::", eigen_vectors, eigen_vectors.shape)
        eigen_values_sorted = sorted(eigen_values, reverse=True)
        print("vals:::", eigen_values_sorted)
        # eigh returns eigenvectors as columns, so select the columns whose
        # eigenvalues rank in the top k, then transpose so that each row of
        # the result is one principal component
        indices_of_top_eigen_vectors = np.where(
            eigen_values >= eigen_values_sorted[
                self.number_of_principal_components - 1])[0]
        top_eigen_vectors = eigen_vectors[:, indices_of_top_eigen_vectors].T
        print("top_eigen_vectors", top_eigen_vectors)
        return top_eigen_vectors

    def project_data_along_principal_components(self, eigen_vectors):
        projected_data = np.dot(self.audio_data, eigen_vectors.T)
        return projected_data


if __name__ == '__main__':
    pca = PCA()
    eigen_vectors = pca.compute_principal_components()
    projected_data = pca.project_data_along_principal_components(eigen_vectors)
    clusters = range(2, 11)
    k_means = KMeans()

    #k_means(2,projected_data)

    losses = k_means.cluster(clusters, projected_data)
    k_means.plot_objective_function(clusters, losses)
Example #5
    def train(self, training_data, do_clustering=False, num_of_means=-1):

        if do_clustering:
            # Run k-means on the training set, the centroids become the middle of the hidden nodes
            kmeans_model = KMeans(num_of_means)
            kmeans_model.cluster(training_data)
            centroids = kmeans_model.get_centroids()
            # Determine the spread (Gaussian width) for each hidden node
            for cent_1_idx, cent_1_val in enumerate(centroids):
                total_distance = 0.0
                for cent_2_idx, cent_2_val in enumerate(centroids):
                    if cent_1_idx != cent_2_idx:
                        # Calculate distance between two centroids
                        total_distance += sqrt(
                            sum((np.array(cent_1_val[:-1]) -
                                 np.array(cent_2_val[:-1]))**2))
                # Spread = 2 * (average distance to the other centroids);
                # the centroid, minus its trailing label column, is the
                # hidden node center, matching how it is used below
                self.hidden_nodes.append(
                    (cent_1_val[:-1],
                     2 * (total_distance / (len(centroids) - 1))))
        else:
            # Randomized 10% subset of training data will serve as hidden node centers
            random_centers_for_hidden_nodes = []
            indices = []
            while len(indices) < (training_data.shape[0] * 0.1):
                random_idx = np.random.randint(0, training_data.shape[0])
                if not random_idx in indices:
                    indices.append(random_idx)
            random_centers_for_hidden_nodes = training_data[indices]

            # Determine the spread (Gaussian width) for each hidden node
            for inst_1_idx, inst_1_val in enumerate(
                    random_centers_for_hidden_nodes):
                total_distance = 0.0
                for inst_2_idx, inst_2_val in enumerate(
                        random_centers_for_hidden_nodes):
                    if inst_1_idx != inst_2_idx:
                        # Calculate distance between two instances
                        total_distance += sqrt(
                            sum((inst_1_val[:-1] - inst_2_val[:-1])**2))
                # Spread = 2 * (average distance to the other centers);
                # instance = hidden node center
                self.hidden_nodes.append(
                    (inst_1_val[:-1],
                     2 * (total_distance / (len(indices) - 1))))
            print(
                "Chose random training instances to serve as hidden node centers"
            )

        # Initial weights for gradient descent
        self.weights = \
            np.array([np.random.randint(-100, 100) for weight in range(len(self.hidden_nodes))])
        print("Initialized weight vector")

        # Learn weights to determine hidden node influence on output
        done = False
        while not done:
            print("Started gradient descent")
            # Batch updating of weights, store individual updates in new_weights
            new_weights = np.array([0.0 for i in range(len(self.weights))])
            for instance in training_data:
                # Determine Gaussian outputs
                gaussian_outputs = []
                for node in self.hidden_nodes:
                    # Gaussian radial basis function:
                    # exp(-||x - c||^2 / (2 * sigma^2))
                    gaussian_outputs.append(
                        exp((-1 / float(2 * (node[1]**2))) *
                            sum((instance[:-1] - node[0])**2)))

                # Determine error gradient (implies a vector)
                gradient = []
                for gaussian_output in gaussian_outputs:
                    if self.learner_type == "REGRESSION":
                        gradient.append(
                            2 * (np.dot(self.weights, gaussian_outputs) -
                                 instance[-1]) * gaussian_output)
                    else:
                        # logistic (sigmoid) activation of the weighted sum
                        activation_score = 1 / (
                            1 + exp(-np.dot(self.weights, gaussian_outputs)))
                        gradient.append(
                            (activation_score - instance[-1]) *
                            (activation_score *
                             (1 - activation_score)) * gaussian_output)

                # Calculate weight update
                new_weights += (self.weights -
                                (self.learning_rate * np.array(gradient)))
            new_weights = (new_weights / training_data.shape[0])
            if (abs(sum(self.weights - new_weights))) < 0.01:
                done = True
            if not done:
                print("Weights were updated by %f last iteration" %
                      abs(sum(self.weights - new_weights)))
            self.weights = new_weights
        print("Found weights: %s" % str(self.weights))
Example #6
        learner_type = "CLASSIFICATION"
    else:
        learner_type = "REGRESSION"

    # 10 fold cross validation
    fold_size = data_instances.shape[0] // 10  # integer fold size
    data_indices = [idx for idx in xrange(data_instances.shape[0])]
    for num_of_means in xrange(1, 50):
        total_performance = 0.0
        for holdout_fold_idx in xrange(10):
            # try some num of means
            kmeans_model = KMeans(num_of_means)
            # run k means on training data to find centroids
            clusters = kmeans_model.cluster( \
                data_instances[ \
                    np.array( \
                        np.setdiff1d(data_indices, data_indices[ \
                                fold_size * holdout_fold_idx : \
                                fold_size * holdout_fold_idx + fold_size]))])
            centroids = kmeans_model.get_centroids()
            for cluster_idx in xrange(len(clusters)):
                ave_label = 0.0
                for instance in clusters[cluster_idx]:
                    ave_label += instance[-1]
                if len(clusters[cluster_idx]) > 0:
                    ave_label = ave_label / len(clusters[cluster_idx])
                if learner_type == "CLASSIFICATION":
                    ave_label = int(round(ave_label))
                centroids[cluster_idx].append(ave_label)

            # For classification, rounding the average label amounts to a
            # majority vote for the centroid's class; for regression, the
            # average label is the centroid's estimate.
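
            # Assumed continuation (not part of the original listing): score
            # the holdout fold by assigning each instance to its nearest
            # labeled centroid and comparing that centroid's label with the
            # instance's true label.
            holdout_instances = data_instances[
                data_indices[fold_size * holdout_fold_idx:
                             fold_size * holdout_fold_idx + fold_size]]
            for instance in holdout_instances:
                # the clustering above ran on full rows, so compare the full
                # instance to the centroid minus its appended label
                distances = [np.sqrt(np.sum((instance - np.array(centroid[:-1]))**2))
                             for centroid in centroids]
                predicted_label = centroids[np.argmin(distances)][-1]
                if learner_type == "CLASSIFICATION":
                    # accumulate fraction of holdout instances labeled correctly
                    total_performance += \
                        float(predicted_label == instance[-1]) / len(holdout_instances)
                else:
                    # accumulate negative mean squared error for regression
                    total_performance -= \
                        ((predicted_label - instance[-1])**2) / len(holdout_instances)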
Example #7
'''This program reads in the test data and runs SFS and GA feature selection using k-means and HAC clustering.'''
import numpy as np

from k_means import KMeans
from hac import HAC
# Datasets to test
tests = [('data_sets/original/glass_data.txt', 7), \
         ('data_sets/original/iris_data.txt', 3)]
#('data_sets/original/spam_data.txt', 2)]

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print "Running with %s" % test[0]
    for line in data_file:
        line_split = line.split(',')
        data_instances.append(map(float, line_split))
    data_instances = np.array(data_instances)

    # Run GA using k-means
    kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])
    chosen_features = \
        perform_GA_feature_selection(kmeans_model, "Kmeans", test[1], data_instances)
    feature_set = \
        [idx for idx in xrange(len(chosen_features[0])) if chosen_features[0][idx] == 1]
    print "Chosen features for K-means GA: %s" % str(chosen_features)
    kmeans_model = KMeans(test[1])
    chosen_clusters = kmeans_model.cluster(data_instances[:, feature_set])
    print "Clusters: %s" % str(chosen_clusters)
    #chosen_features = perform_GA_feature_selection(hac_model, "HAC", test[1], data_instances)
    #print "Chosen features for HAC GA: %s" % str(chosen_features)