def squared_clustering_errors(inputs, k):
    """Total squared reconstruction error of clustering ``inputs`` into ``k`` groups.

    Trains a fresh KMeans model on ``inputs`` and sums, over every point,
    the squared distance to the mean of the cluster that point is
    assigned to.
    """
    model = KMeans(k)
    model.train(inputs)
    cluster_means = model.means
    return sum(
        squared_distance(point, cluster_means[model.classify(point)])
        for point in inputs
    )
def post_kmeantrain(array: str, featurename: str, orderfeature: str):
    """Train K-Means on selected features of a JSON dataset and return the
    centroids, sorted by one feature, as a JSON string.

    Args:
        array: JSON document (pandas-readable) holding the training data.
        featurename: comma-separated column names to use as features,
            e.g. "DFA,violmax,maxpeaksqt".
        orderfeature: column name by which the resulting centroids are sorted.

    Returns:
        JSON string (orient="index") mapping labels 'a', 'b', 'c', ... to the
        coordinates of the sorted centroids.
    """
    data = pd.read_json(array)
    columnnames = featurename.split(',')
    num_examples = data.shape[0]

    # Feature matrix: one row per example, one column per requested feature.
    x_train = data[columnnames].values.reshape((num_examples, len(columnnames)))

    # K-Means parameters.
    num_clusters = 4    # number of clusters to split the training dataset into
    max_iterations = 50  # maximum number of training iterations

    k_means = KMeans(x_train, num_clusters)
    (centroids, closest_centroids_ids) = k_means.train(max_iterations)

    data_frame = pd.DataFrame(centroids, columns=columnnames)
    # BUG FIX: this sort had been commented out, leaving `dfsort` undefined
    # and the function raising NameError at runtime.
    dfsort = data_frame.sort_values(by=[orderfeature])

    # Label the sorted centroids 'a', 'b', 'c', ... and use that as the index.
    labels = [chr(i) for i in range(97, 97 + len(centroids))]
    dfsort['L'] = pd.Series(labels, index=dfsort.index)
    dfreturn = dfsort.set_index('L', drop=True)
    return dfreturn.to_json(orient="index")
"""Cluster meetup attendee locations into three groups and print the means."""
import random

from k_means import KMeans
from meetup_data import attendees_locations_tuples

# Fixed seed so the random centroid initialisation is reproducible.
random.seed(0)

model = KMeans(3)
model.train(attendees_locations_tuples)
print(model.means)
"""Posterize an image by replacing each pixel with its K-Means cluster mean."""
from k_means import KMeans
from matplotlib import image as mat_image
from matplotlib import pyplot

path_to_png_file = "/Users/rileylittlefield/Desktop/squirtle.png"
squirtle_image = mat_image.imread(path_to_png_file)

# Flatten the 2-D pixel grid into a flat list of samples for clustering.
squirtle_pixels = [pixel for row in squirtle_image for pixel in row]

clusterer = KMeans(5)
clusterer.train(squirtle_pixels)


def recolor(pixel):
    """Return the mean colour of the cluster this pixel belongs to."""
    return clusterer.means[clusterer.classify(pixel)]


# Rebuild the image row by row with every pixel snapped to its cluster mean.
new_squirtle_image = [
    [recolor(pixel) for pixel in row]
    for row in squirtle_image
]

pyplot.imshow(new_squirtle_image)
pyplot.axis('off')
pyplot.show()
#!/usr/bin/env python3
"""Run 2-cluster K-Means on Pokemon height/weight data and report the result."""
import sys

sys.path.append('code')

import numpy as np

from k_means import KMeans

# Pokemon height (m) / weight (kg) pairs.
data = np.array([
    [0.4, 6.0],    # Pikachu
    [0.7, 6.9],    # Bulbasaur
    [0.6, 8.5],    # Charmander
    [0.5, 9.0],    # Squirtle
    [1.2, 36.0],   # Slowpoke
    [1.6, 78.5],   # Slowbro
    [1.1, 90.0],   # Seel
    [1.7, 120.0],  # Dewgong
    [2.2, 210.0],  # Dragonite
    [1.7, 55.4],   # Articuno
    [1.6, 52.6],   # Zapdos
    [2.0, 60.0],   # Moltres
])

if __name__ == "__main__":
    model = KMeans(2)
    model.train(data)
    model.report()
# NOTE(review): truncated fragment — it begins mid-script (orphan plt.legend()
# continuing plotting code not visible here) and ends mid-call (the final
# plt.scatter(...) is cut off after its second argument). Left byte-identical;
# recover the full original file before editing this section.
plt.legend() plt.subplot(1, 2, 2) plt.scatter(data[x_axis][:], data[y_axis][:]) plt.title('label unknown') plt.show() num_examples = data.shape[0] x_train = data[[x_axis, y_axis]].values.reshape(num_examples, 2) #指定好训练所需的参数 num_clusters = 3 max_iteritions = 50 k_means = KMeans(x_train, num_clusters) centroids, closest_centroids_ids = k_means.train(max_iteritions) # 对比结果 plt.figure(figsize=(12, 5)) plt.subplot(1, 2, 1) for iris_type in iris_types: plt.scatter(data[x_axis][data['class'] == iris_type], data[y_axis][data['class'] == iris_type], label=iris_type) plt.title('label known') plt.legend() plt.subplot(1, 2, 2) for centroid_id, centroid in enumerate(centroids): current_examples_index = (closest_centroids_ids == centroid_id).flatten() plt.scatter(data[x_axis][current_examples_index], data[y_axis][current_examples_index],
"""Cluster pickled word vectors (one summed 300-dim vector per entry)."""
from k_means import KMeans
import numpy
import pickle

# NOTE(review): unpickling untrusted data is unsafe — dat2.dat must come
# from a trusted source.
# FIX: use a context manager so the file handle is closed (was
# pickle.load(open(...)) which leaked the handle).
with open("dat2.dat", "rb") as handle:
    dic = pickle.load(handle)
# pickle.dump(dic, open("dat2.dat","wb"), protocol=2)

word_vectors = []
for each in dic["vectors"]:
    # Keep only well-formed 300-dimensional vectors for this entry.
    vectors = [vec for vec in each["vectors"] if len(vec) == 300]
    if not vectors:
        # numpy.sum over an empty list yields a scalar 0.0 (no len());
        # the original hid this behind a bare `except: pass`. Skip
        # explicitly instead of swallowing all exceptions.
        continue
    summed = numpy.sum(vectors, axis=0)
    if len(summed) == 300:
        word_vectors.append(summed.tolist())

num_clusters = 2
k_means = KMeans(num_clusters, word_vectors)
k_means.train()
print(k_means.get_cluster(word_vectors[0]))
def run_sfs(self, num_clusters=2):
    """Sequential forward selection (SFS) over the columns of ``self.data``.

    Greedily adds one feature (column) at a time: each round fits a K-Means
    model on every candidate feature set (chosen features + one new column),
    keeps the column giving the best ``evaluate()`` score, and stops when no
    addition improves on the previously accepted score.

    Args:
        num_clusters: number of clusters for the K-Means model fitted at
            each candidate feature set.

    Returns:
        Tuple ``(chosen_features, chosen_data_set)``: the selected column
        labels and the DataFrame restricted to those columns. Side effects:
        stores the best fitted model on ``self.chosen_model`` and its score
        on ``self.base_performance``.
    """
    chosen_features = []  # column labels corresponding to chosen features
    chosen_data_set = pd.DataFrame()
    self.base_performance = -2  # score of the accepted feature set so far
    current_performance = -1
    best_performance = -1
    # BUG FIX: was `best_features = -1`, a typo'd initializer that was never
    # read (the loop assigns `best_feature`). Initialise the names actually
    # used below so they are always bound even if no candidate improves.
    best_feature = None
    best_data = pd.DataFrame()
    #best_model = TestModel(self.data, clusters)  # used only for testing
    best_model = KMeans(self.data, num_clusters)
    self.chosen_model = best_model

    while True:
        # Next free column index in the candidate data set.
        num_chosen_features = len(chosen_features)

        # Try adding each not-yet-chosen feature in turn.
        for column in self.data:
            # Skip features already chosen as best in earlier rounds.
            if column in chosen_features:
                continue

            # Tentatively add this feature's column to the candidate set.
            chosen_features.append(column)
            feature_vector = self.data[column]
            chosen_data_set[num_chosen_features] = feature_vector

            # Fit and score a model on the candidate feature set.
            #model = TestModel(chosen_data_set, 2)  # TESTING ONLY
            model = KMeans(chosen_data_set, num_clusters)
            model.train()
            current_performance = model.evaluate()
            print('for features ', chosen_features, 'best perf', best_performance, \
                  ' vs. current_perf', current_performance)

            if current_performance > best_performance:
                best_performance = current_performance
                best_model = model
                best_feature = column
                best_data = pd.DataFrame(chosen_data_set)

            # Undo the tentative addition before trying the next feature.
            column_to_drop = len(chosen_features) - 1
            chosen_data_set.drop(chosen_data_set.columns[column_to_drop],
                                 axis=1, inplace=True)
            chosen_features.pop()

        if best_performance > self.base_performance:
            # Best candidate improved on the accepted set: commit it.
            self.base_performance = best_performance
            chosen_features.append(best_feature)
            chosen_data_set = best_data
            self.chosen_model = best_model
        else:
            # No candidate improved the score: selection is finished.
            break

    return chosen_features, chosen_data_set