# Example #1
def squared_clustering_errors(inputs, k):
    """Total squared clustering error of *inputs* when split into k clusters.

    Trains a fresh KMeans instance on the inputs, then sums the squared
    distance from each input to the mean of the cluster it is assigned to.
    """
    clusterer = KMeans(k)
    clusterer.train(inputs)
    means = clusterer.means

    total = 0
    for point in inputs:
        cluster = clusterer.classify(point)
        total += squared_distance(point, means[cluster])
    return total
# Example #2
def post_kmeantrain(array: str, featurename: str, orderfeature: str):
    """Train K-Means on a JSON dataset and return labeled, sorted centroids.

    Args:
        array: JSON string holding the training data (read via pd.read_json).
        featurename: comma-separated column names to use as features.
        orderfeature: column by which the resulting centroids are sorted.

    Returns:
        JSON string (orient="index") mapping labels 'a', 'b', ... to the
        centroid coordinates, in ascending order of *orderfeature*.
    """
    data = pd.read_json(array)
    columnnames = featurename.split(',')
    num_examples = data.shape[0]
    # Feature matrix: one row per example, one column per selected feature.
    # (The original wrapped columnnames in an identity comprehension; passing
    # the list directly is equivalent.)
    x_train = data[columnnames].values.reshape(
        (num_examples, len(columnnames)))

    # K-Means parameters.
    num_clusters = 4     # number of clusters to split the training data into
    max_iterations = 50  # maximum number of training iterations

    # Project-local KMeans: constructor takes (data, k); train() returns
    # (centroids, closest_centroids_ids) -- TODO confirm against k_means module.
    k_means = KMeans(x_train, num_clusters)
    centroids, _closest_centroids_ids = k_means.train(max_iterations)

    data_frame = pd.DataFrame(centroids, columns=columnnames)
    dfsort = data_frame.sort_values(by=[orderfeature])
    # Label centroids 'a', 'b', 'c', ... in sorted order (97 == ord('a')).
    labels = [chr(i) for i in range(97, 97 + len(centroids))]
    dfsort['L'] = pd.Series(labels, index=dfsort.index)
    dfreturn = dfsort.set_index('L', drop=True)
    return dfreturn.to_json(orient="index")
# Cluster meetup attendee locations into three groups and print the centers.
import random
from k_means import KMeans
from meetup_data import attendees_locations_tuples

random.seed(0)  # deterministic cluster initialization

num_clusters = 3
clusterer = KMeans(num_clusters)
clusterer.train(attendees_locations_tuples)
print(clusterer.means)
# Color-quantize an image: cluster all of its pixels into 5 groups.
from k_means import KMeans
from matplotlib import image as mat_image
from matplotlib import pyplot

image_path = "/Users/rileylittlefield/Desktop/squirtle.png"
squirtle_image = mat_image.imread(image_path)

# Flatten the row-major pixel grid into a single training list.
all_pixels = []
for scanline in squirtle_image:
    for px in scanline:
        all_pixels.append(px)

clusterer = KMeans(5)
clusterer.train(all_pixels)


def recolor(pixel):
    """Return the mean color of the K-Means cluster *pixel* belongs to."""
    return clusterer.means[clusterer.classify(pixel)]


# Rebuild the image with every pixel replaced by its cluster's mean color.
new_squirtle_image = [
    [recolor(px) for px in scanline]
    for scanline in squirtle_image
]

pyplot.imshow(new_squirtle_image)
pyplot.axis('off')  # hide axis ticks and frame
pyplot.show()
#!/usr/bin/env python3

import sys
sys.path.append('code')

import numpy as np

from k_means import KMeans

# Pokemon height/weight pairs, one row per Pokemon: [height, weight].
data = np.array([
    [0.4, 6.0],    # Pikachu
    [0.7, 6.9],    # Bulbasaur
    [0.6, 8.5],    # Charmander
    [0.5, 9.0],    # Squirtle
    [1.2, 36.0],   # Slowpoke
    [1.6, 78.5],   # Slowbro
    [1.1, 90.0],   # Seel
    [1.7, 120.0],  # Dewgong
    [2.2, 210.0],  # Dragonite
    [1.7, 55.4],   # Articuno
    [1.6, 52.6],   # Zapdos
    [2.0, 60.0],   # Moltres
])
if __name__ == "__main__":
    k_means = KMeans(2)
    k_means.train(data)
    k_means.report()
# Example #6
plt.legend()

plt.subplot(1, 2, 2)
plt.scatter(data[x_axis][:], data[y_axis][:])
plt.title('label unknown')
plt.show()

num_examples = data.shape[0]
x_train = data[[x_axis, y_axis]].values.reshape(num_examples, 2)

#指定好训练所需的参数
num_clusters = 3
max_iteritions = 50

k_means = KMeans(x_train, num_clusters)
centroids, closest_centroids_ids = k_means.train(max_iteritions)
# 对比结果
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
for iris_type in iris_types:
    plt.scatter(data[x_axis][data['class'] == iris_type],
                data[y_axis][data['class'] == iris_type],
                label=iris_type)
plt.title('label known')
plt.legend()

plt.subplot(1, 2, 2)
for centroid_id, centroid in enumerate(centroids):
    current_examples_index = (closest_centroids_ids == centroid_id).flatten()
    plt.scatter(data[x_axis][current_examples_index],
                data[y_axis][current_examples_index],
# Example #7
from k_means import KMeans
import numpy
import pickle

# NOTE(review): pickle.load executes arbitrary code for untrusted input --
# only load files produced by this project.
# FIX: use a context manager so the file handle is closed deterministically
# (the original left it open).
with open("dat2.dat", "rb") as f:
    dic = pickle.load(f)

# For each entry, sum its 300-dimensional vectors; skip malformed ones.
lst = []
for each in dic["vectors"]:
    temp = [vec for vec in each["vectors"] if len(vec) == 300]
    s = numpy.sum(temp, axis=0)
    # numpy.sum over an empty list yields the scalar 0.0, which has no len().
    # FIX: catch only that TypeError instead of the original bare `except:`,
    # which also swallowed KeyboardInterrupt and hid real bugs.
    try:
        if len(s) == 300:
            lst.append(s.tolist())
    except TypeError:
        pass

word_vectors = lst
num_clusters = 2
k_means = KMeans(num_clusters, word_vectors)
k_means.train()
print(k_means.get_cluster(lst[0]))
    def run_sfs(self, num_clusters=2):
        """Sequential forward selection of features for K-Means clustering.

        Greedily grows a feature set: on each pass, every not-yet-chosen
        column of ``self.data`` is tried together with the already-chosen
        columns, a KMeans model is trained and evaluated, and the
        best-scoring column is committed.  The loop stops when no candidate
        improves on the best score of the previous pass.

        Side effects:
            Sets ``self.base_performance`` to the best score found and
            ``self.chosen_model`` to the corresponding trained model.

        Args:
            num_clusters: number of clusters for each candidate KMeans model.

        Returns:
            (chosen_features, chosen_data_set): the selected column labels
            and a DataFrame containing just those columns.
        """
        chosen_features = []  # column labels chosen so far
        chosen_data_set = pd.DataFrame()

        self.base_performance = -2
        best_performance = -1
        # BUG FIX: the original initialized `best_features` (plural) but read
        # `best_feature` and `best_data` below, both of which were only bound
        # inside the improvement branch -- a pass where no candidate scored
        # above -1 raised NameError.  Initialize the names actually used.
        best_feature = None
        best_data = pd.DataFrame()
        best_model = KMeans(self.data, num_clusters)
        self.chosen_model = best_model

        while True:
            # Column index at which the candidate feature is appended.
            num_chosen_features = len(chosen_features)

            for column in self.data:
                # Skip features already committed on earlier passes.
                if column in chosen_features:
                    continue

                # Tentatively add this column to the chosen set.
                chosen_features.append(column)
                chosen_data_set[num_chosen_features] = self.data[column]

                model = KMeans(chosen_data_set, num_clusters)
                model.train()
                current_performance = model.evaluate()

                print('for features ', chosen_features, 'best perf',
                      best_performance, ' vs. current_perf',
                      current_performance)

                if current_performance > best_performance:
                    best_performance = current_performance
                    best_model = model
                    best_feature = column
                    best_data = pd.DataFrame(chosen_data_set)

                # Undo the tentative addition before trying the next column.
                column_to_drop = len(chosen_features) - 1
                chosen_data_set.drop(chosen_data_set.columns[column_to_drop],
                                     axis=1,
                                     inplace=True)
                chosen_features.pop()

            if best_performance > self.base_performance:
                # Commit this pass's best candidate and keep searching.
                self.base_performance = best_performance
                chosen_features.append(best_feature)
                chosen_data_set = best_data
                self.chosen_model = best_model
            else:
                break

        return chosen_features, chosen_data_set