def test_diff_true2(self):
    """diff() must report False when the stored mean differs from the old one.

    NOTE(review): despite the ``true2`` in the name, this asserts False —
    consistent with test_diff_false, where clearly-different means yield
    False; the method name looks stale.
    """
    previous_mean = np.array([[1, 1], [1, 1]])
    updated_mean = np.array([[0, 0], [0, 0]])
    instance = kmeans.Kmeans()
    instance.mean = updated_mean
    result = instance.diff(previous_mean)
    self.assertIs(result, False)
def test_calc_mean(self):
    """calc_mean() must average the feature rows belonging to each label."""
    instance = kmeans.Kmeans()
    instance.features = np.array([[10, 20], [14, 24], [100, 120], [110, 120]])
    instance.label = [0, 0, 1, 1]
    result = instance.calc_mean()
    # Label 0 rows average to [12, 22]; label 1 rows to [105, 120].
    expected = np.array([[12, 22], [105, 120]])
    self.assertTrue((result == expected).all())
def test_diff_false(self):
    """diff() must report False when one entry of the mean has changed."""
    previous_mean = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    # Identical except for the centre entry (5 -> 7).
    updated_mean = np.array([[1, 2, 3], [4, 7, 6], [7, 8, 9]])
    instance = kmeans.Kmeans()
    instance.mean = updated_mean
    result = instance.diff(previous_mean)
    self.assertIs(result, False)
def test_diff_false2(self):
    """diff() must report True when the means differ only negligibly.

    NOTE(review): the name says ``false2`` but the assertion is True — the
    means differ by ~9e-8, below the convergence threshold; the method name
    looks stale.
    """
    previous_mean = np.array([[0.00000001, 2], [4, 5]])
    updated_mean = np.array([[0.0000001, 2], [4, 5]])
    instance = kmeans.Kmeans()
    instance.mean = updated_mean
    result = instance.diff(previous_mean)
    self.assertIs(result, True)
def kmeans_operation(self):
    """Run k-means and keep the representative warp with the largest weight.

    Stores the result in ``self.warp_index`` (0 if no weight exceeds 0) and
    prints it.
    """
    instance = kmeans.Kmeans()
    self.rep_warp, self.weights = instance.RunKmeans(self.X, self.Clusters)
    self.warp_index = 0
    best_weight = 0
    # Track the running maximum weight and its associated warp.
    for idx, warp in enumerate(self.rep_warp):
        if self.weights[idx] > best_weight:
            best_weight = self.weights[idx]
            self.warp_index = warp
    print(self.warp_index)
def test_experiment1(self):
    """Train on the optdigits data with k=10 and require >70% test accuracy."""
    km = kmeans.Kmeans(k=10,
                       trainfile='optdigits.train',
                       testfile='optdigits.test')
    km.train()
    accuracy, confusion = km.evaluate(data=km.test_data)
    km.report(test_accuracy=accuracy, confusion_matrix=confusion)
    assert accuracy > 0.70
def Exercise1(self):
    """Cluster a small hand-made 2-D dataset into 3 groups and plot both views."""
    points = [[1.9, 7.3], [3.4, 7.5], [2.5, 6.8], [1.5, 6.5], [3.5, 6.4],
              [2.2, 5.8], [3.4, 5.2], [3.6, 4], [5, 3.2], [4.5, 2.4],
              [6, 2.6], [1.9, 3], [1, 2.7], [1.9, 2.4], [0.8, 2],
              [1.6, 1.8], [1, 1]]
    # create new column for classification (initialised to -1)
    labelled = np.asarray(_matrix.AddColumn(_matrix.Copy(points), -1))
    raw_x, raw_y = _matrix.DivideXY(labelled)
    plot.Kmeans(raw_x, np.int_(raw_y), [[]], 'Plot pure data')
    clustered, centroids = kmeans.Kmeans(points, 3)
    new_x, new_y = _matrix.DivideXY(clustered)
    plot.Kmeans(new_x, np.int_(new_y), centroids, 'Plot Kmeans')
def kmean_segementation(expression, get_clustering_features, recognition_model):
    """Segment an expression's strokes into symbols via k-means clustering.

    Tries every cluster count k from n down to n//4 + 1, scores each candidate
    segmentation with ``objective_function`` over the recognizer's per-symbol
    confidences, and stores the best-scoring symbol list on
    ``expression.symbols``.

    Args:
        expression: object with a ``strokes`` mapping and a writable
            ``symbols`` attribute.
        get_clustering_features: callable producing per-stroke feature rows.
        recognition_model: classifier exposing ``classify_conf`` and
            ``int_to_class``.
    """
    # Materialise the stroke sequence once; the original rebuilt
    # ``list(strokes)`` for every stroke of every cluster of every k.
    strokes = list(expression.strokes.values())
    cluster_features = get_clustering_features(strokes)
    n = len(strokes)
    confidences = {}  # cache: cluster name -> (Symbol, confidence)
    max_confidence = 0
    max_segmentation = []
    for k in range(n, n // 4, -1):
        # NOTE: the project's own segmenter is bypassed in favour of
        # sklearn's KMeans; the original left a dead, unused
        # ``segmenter = kmeans.Kmeans(k)`` here (removed).
        assignments = KMeans(k).fit_predict(cluster_features)
        # Group stroke indices by their assigned cluster id.
        clusters = {}
        for i, index in enumerate(assignments):
            clusters.setdefault(index, []).append(i)
        cluster_confidences = []
        cluster_symbols = []
        for cluster in clusters.values():
            name = ''.join(map(str, cluster))
            s, c = confidences.get(name, (None, None))
            if c is None:
                # TODO: needs to be changed for the max probability score
                symbol_strokes = [strokes[idx] for idx in cluster]
                s = Symbol(name, symbol_strokes)
                feature_vector = digitClassifier.features([s.to_digit()])
                c, classification = recognition_model.classify_conf(
                    feature_vector)
                confidences[name] = (s, c)
                s.classification = recognition_model.int_to_class[
                    classification] if classification < len(
                        recognition_model.int_to_class) else 'junk'
                if s.classification == ',':
                    s.classification = 'COMMA'
                elif s.classification == 'junk':
                    # Penalise junk classifications.
                    c /= 10
            s.weight = c
            cluster_confidences.append(c)
            cluster_symbols.append(s)
        # Typo fix: original local was ``segmentation_confifdence``.
        segmentation_confidence = objective_function(cluster_confidences)
        if segmentation_confidence > max_confidence:
            max_confidence = segmentation_confidence
            max_segmentation = cluster_symbols
    expression.symbols = max_segmentation
def dataset_test(self, path, datatype="float"):
    """Load a ';'-separated CSV, cluster it into 3 groups, and plot both views.

    Args:
        path: CSV file to read (also used in the plot titles).
        datatype: element type passed through to the CSV reader.
    """
    print(">>>>>> " + path + " <<<<<<")
    matrix = _matrix.ReadCsv(path, ";", datatype)
    # create new column for classification (initialised to -1)
    labelled = np.asarray(_matrix.AddColumn(_matrix.Copy(matrix), -1))
    raw_x, raw_y = _matrix.DivideXY(labelled)
    plot.Kmeans(raw_x, np.int_(raw_y), [[]], path + ' - Plot pure data')
    clustered, centroids = kmeans.Kmeans(matrix, 3)
    new_x, new_y = _matrix.DivideXY(clustered)
    plot.Kmeans(new_x, np.int_(new_y), centroids, path + ' - Plot Kmeans')
    print("\n--------------------------------------------------")
def Exercise3(self):
    """Project the iris data to 2-D with PCA, cluster into 3 groups, plot both."""
    iris = datasets.load_iris()
    features = iris.data
    # reduce dimensions with Pca
    pca = PCA(n_components=2)
    pca.fit(features)
    projected = pca.transform(features)
    # create new column for classification (initialised to -1)
    labelled = np.asarray(_matrix.AddColumn(_matrix.Copy(projected), -1))
    raw_x, raw_y = _matrix.DivideXY(labelled)
    plot.Kmeans(raw_x, np.int_(raw_y), [[]], 'Iris - Plot pure data')
    clustered, centroids = kmeans.Kmeans(projected, 3)
    new_x, new_y = _matrix.DivideXY(clustered)
    plot.Kmeans(new_x, np.int_(new_y), centroids, 'Iris - Plot Kmeans')
def main():
    """Cluster the frogs dataset repeatedly and report average metrics.

    Runs the model ``tests`` times, resetting between runs, then prints the
    mean iteration count, mean average distance, and mean wall time.
    """
    # Dataset
    full_path = '../data/' + 'frogs.csv'
    data = utils.csv_to_arrays(full_path)
    # Parameters
    k = 3
    mode = 'hammerly'
    iterations = 1000
    tests = 10
    model = kmeans.Kmeans(data, k, mode, iterations)
    # Book-keeping accumulators
    total_iterations = 0
    total_dist = 0
    total_time = 0
    print("-----" + mode + "-----")
    for _ in range(tests):
        start = time.time()
        iters, clusters = model.cluster()
        elapsed = time.time() - start
        error = model.average_error(clusters)
        total_time += elapsed
        #print ("Stopped after " + str(iters) +
        #       " iterations, with average distance of: " + str(error))
        model.reset()
        total_dist += error
        total_iterations += iters
    print("average iterations: " + str(total_iterations / tests))
    print("average distance: " + str(total_dist / tests))
    print("average time: " + str(total_time / tests))
def do_my_Kmeans(self, args, initCentroid=None):
    """Fit the hand-written k-means on ``self.X`` and report timing/accuracy.

    Args:
        args: argument namespace forwarded to ``mykm.Kmeans`` (must also
            provide ``save_path`` for the SSE chart).
        initCentroid: optional initial centroids forwarded to the model.

    Returns:
        The fitted ``mykm.Kmeans`` instance.
    """
    km = mykm.Kmeans(args, initCentroid=initCentroid)
    print("fitting the model (my own k-means) ...")
    start_time = time.time()
    # Bug fix: time.clock() was removed in Python 3.8; process_time() is the
    # documented replacement for CPU-time measurement.
    start_clock = time.process_time()
    km.fit(self.X)
    print("--- time: %s seconds\t clock: %s seconds ---" %
          (time.time() - start_time, time.process_time() - start_clock))
    km.plot_sse_chart('my_kmeans_sse', args.save_path)
    print("acc: ", self.accuracy(self.Y, km.labels_))
    print("===== K-means cluster center (my own k-means) =====")
    print(km.cluster_centers_)
    print("===================================================\n\n\n\n")
    return km
def main():
    """Cluster pre-trained doc2vec abstract vectors into 10 groups and save.

    Loads a saved gensim Doc2Vec model, extracts its document vectors, fits
    the project's Kmeans wrapper, persists the fitted object, and prints the
    elapsed time.
    """
    t1 = time.monotonic()
    model = gensim.models.doc2vec.Doc2Vec.load(
        '../obj/doc2vec/abstracts_etd_doc2vec_5000_docs')
    doc_vectors, keys = extract_mapped_doc2vecs(model)
    km_obj = kmeans.Kmeans(doc_list=keys, n_clusters=10, init='k-means++',
                           n_init=3, n_jobs=5, random_state=42, verbose=1,
                           algorithm='full')
    km_obj.fit(doc_vectors)
    km_obj.save('abstracts_etd_doc2vec_5000_docs_kmeans.sav')
    # Bug fix: timedelta's first positional argument is *days*, so the
    # original inflated the elapsed time by 86400x; pass seconds explicitly.
    print("Time taken {}s".format(timedelta(seconds=time.monotonic() - t1)))
def main(self):
    """Run k-means and DB Scan on a synthetic dataset, then on every UCI
    dataset in ``self.alldataset``.

    For the synthetic data the average silhouette coefficient is printed for
    both algorithms; for the UCI datasets, purity is printed instead.
    """
    # create synthetic dataset
    synthesizer = sd.SyntheticData(5, 5, 10, 6)
    pts = synthesizer.point_assignments  # synthetic data pts
    # TODO: might attempt to plot clusters if time permits.
    #print(pts.head(30))

    # k-means setup for synthetic dataset
    print('Running k-means with synthetic dataset...')
    kmeans = km.Kmeans(5, pts, False, True)
    kmeans_clusters = kmeans.getClusters(3)  # max of 3 iterations
    print('Finished k-means with synthetic dataset')
    print('Calculating average silhouette coefficient...')
    kmeans_silhouette = assessment.calculate_silhouette(kmeans_clusters)
    print('Average k-means silhouette coefficient = {0}\n'.format(
        kmeans_silhouette))

    # DB Scan setup for synthetic dataset
    print('Running DB Scan with synthetic dataset...')
    epsilon = 2
    minPts = 2
    dbScan = db.DbScan(pts, False, True)
    df, dex = dbScan.getDataframe()
    m = dbScan.createPairwiseDistanceMatrix(df, dex)
    lessThanEpsilonDict = dbScan.createLessThanEpsilonDict(df, m, epsilon)
    dbScan_clusters = dbScan.clusterAssignment(df, lessThanEpsilonDict,
                                               minPts)
    print('Finished DB Scan with synthetic dataset')
    print('Calculating average silhouette coefficient...')
    dbScan_silhouette = assessment.calculate_silhouette(dbScan_clusters)
    print('Average DB Scan silhouette coefficient = {0}\n'.format(
        dbScan_silhouette))

    # k-means and DB scan for classification datasets from UCI repository
    for dataset in self.alldataset:  # for each dataset call each algorithm
        print('current dataset ::: {0} \n'.format(dataset))
        data = self.alldataset.get(dataset)
        # presumably a bool flag per dataset name — TODO confirm schema
        isClassification = self.IsClassificationDict.get(dataset)
        print('Running k-means on dataset ::: {0}... \n'.format(dataset))
        k = 5
        kmeans = km.Kmeans(k, data, isClassification, False)
        kmeans_clusters = kmeans.getClusters(3)  # max of 3 iterations
        print('Finished kmeans for {0} dataset\n'.format(dataset))
        print('Calculating purity...')
        kmeans_purity = assessment.calculate_purity(kmeans_clusters)
        print('k-means purity on dataset ::: {0} = {1} \n'.format(
            dataset, kmeans_purity))
        print('Running DB Scan on dataset ::: {0}... \n'.format(dataset))
        epsilon = 2
        minPts = 2
        dbScan = db.DbScan(data, isClassification, False)
        df, dex = dbScan.getDataframe()
        m = dbScan.createPairwiseDistanceMatrix(df, dex)
        lessThanEpsilonDict = dbScan.createLessThanEpsilonDict(
            df, m, epsilon)
        dbScan_clusters = dbScan.clusterAssignment(df, lessThanEpsilonDict,
                                                   minPts)
        print('Finished DB Scan for {0} dataset\n'.format(dataset))
        print('Calculating purity...')
        dbScan_purity = assessment.calculate_purity(dbScan_clusters)
        print('DB Scan purity on dataset ::: {0} = {1} \n'.format(
            dataset, dbScan_purity))
"""Cluster daily indoor-temperature means from the tagged sensor export."""
import random
import sys
import math

import numpy as np  # bug fix: np was used below but never imported
import pandas as pd

import kmeans

ginf = pd.read_csv('./tagged_data_01_2017.csv', delimiter=';')
cols = list(ginf.columns)
# Coerce every non-time column to float, treating missing readings as 0.
ginf[cols[1:]] = ginf[cols[1:]].fillna(0).astype('float32')
# ginf.info()
ginf.set_index('time', inplace=True)
ginf.index = pd.to_datetime(ginf.index, format='%d/%m/%y %H:%M')
data_ind = ginf.filter(regex="^.*indoor...*$")
# Daily means of the indoor columns, ignoring NaNs and values >= 100
# (presumably a sensor-error sentinel — TODO confirm).
data_ind_means = data_ind.groupby(
    data_ind.index.day).agg(lambda x: np.nanmean(x[x < 100]))
# Bug fix: DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is
# the documented replacement.
data_ind_means_numpy = data_ind_means.to_numpy()
data_ind_means_num = np.reshape(data_ind_means_numpy, -1)
df = pd.DataFrame(data_ind_means_num)
df.info()

# K-Means Configuration
GINF_CONFIG = {'diff_cols': [0], 'diff_labels': ['Indoor_Air_Temp']}
config = kmeans.KmeansConfig(GINF_CONFIG['diff_cols'],
                             GINF_CONFIG['diff_labels'], 0)
kmeansObj = kmeans.Kmeans(config, 4, df)
kmeansObj.cluster()
ydata = data[i][:, j] / np.max(data[i][:, j]) popt = scipy.optimize.fmin(lssq, (1.0, 1), disp=0) params[i, j, 2:4] = np.array(popt) with open('../data/params.pkl', 'wb') as f: pickle.dump(params, f) with open('../data/churns.pkl', 'wb') as f: pickle.dump(churns, f) else: with open('../data/params.pkl', 'rb') as f: params = pickle.load(f) if not os.path.exists('../data/labels.pkl'): kmeans.Kmeans(input=params, range_n_clusters=[2, 3, 4, 5], output='../data/labels.pkl') #kmeans.cluster4() with open('../data/labels.pkl', 'rb') as f: labellist = pickle.load(f) centerlist = pickle.load(f) for dim in xrange(len(labellist)): labels = labellist[dim] centers = centerlist[dim] unique = np.unique(labels) colorshape = ['ro', 'gv', 'bs', 'k*', 'm8', 'y.'] #draw model parameter plots xs = [[] for i in range(len(unique))] ys = [[] for i in range(len(unique))]
from __future__ import division import numpy as np import pandas as pd import random import sys import math import kmeans ginf = pd.read_csv('./tagged_data_01_2017.csv', delimiter=';') cols = list(ginf.columns) ginf[cols[1:]] = ginf[cols[1:]].fillna(0).astype('float32') ginf.info() # K-Means Configuration GINF_CONFIG = { 'diff_cols': [1, 2, 4, 5, 6], 'diff_labels': [ 'Indoor_Air_Temp_1', 'Indoor_Air_Temp_2', 'Outdoor_Air_Temp_1', 'Supply_Water_Temp_1', 'Return_Water_Temp_1' ] } config = kmeans.KmeansConfig(GINF_CONFIG['diff_cols'], GINF_CONFIG['diff_labels'], 1) kmeansObj = kmeans.Kmeans(config, 10, ginf) kmeansObj.cluster()
# Download each result image, run k-means on it, and collect the dominant
# colours in results_set. Failures for one image are logged and skipped.
count = 0
results_set = []
for item in items:
    image_string = ""
    image_url = item["link"]
    try:
        # with urllib.request.urlopen(image_url) as url:
        # Some hosts reject the default urllib UA, hence the spoofed header.
        req = Request(image_url, headers={'User-Agent': 'Mozilla/5.0'})
        image_string = urlopen(req).read()
        image_file = BytesIO(image_string)
        img = Image.open(image_file)
        filename = "%s.%s" % (count, img.format)
        img.save(filename)
        k = kmeans.Kmeans()
        results = k.run(img)
        print("Results are:")
        print(results)
        k.saveCentroidColours(str(count), img.format)
        for result in results:
            print(kmeans.rgb_to_hex(result))
            results_set.append(result)
            print("image", count, "has color:", result)
    except Exception:
        # Bug fix: the original bare ``except:`` also swallowed SystemExit
        # and KeyboardInterrupt; Exception keeps the best-effort behaviour
        # while letting the process be interrupted.
        print("Unexpected error:", sys.exc_info()[0])
    count += 1
"""Generate noisy points around random centroids, cluster them, and plot."""
import numpy as np
import kmeans
import matplotlib.pyplot as plt

# Generate random data around centroids_initial
dimensions = 2
amount_of_clusters = 4
amount_of_data_in_cluster = 50

centroids_initial = np.empty([amount_of_clusters, dimensions])
points = np.empty([amount_of_clusters * amount_of_data_in_cluster, dimensions])
for cluster_idx in range(amount_of_clusters):
    # Place the cluster centre uniformly in [-10, 10)^d, then scatter its
    # points with uniform noise in [-1, 1)^d around it.
    centroids_initial[cluster_idx, :] = np.random.uniform(
        low=-10, high=10, size=(dimensions, ))
    start = cluster_idx * amount_of_data_in_cluster
    stop = start + amount_of_data_in_cluster
    points[start:stop, :] = np.random.uniform(
        low=-1, high=1,
        size=(amount_of_data_in_cluster,
              dimensions)) + centroids_initial[cluster_idx, :]

centroids, gama, iters = kmeans.Kmeans(points, amount_of_clusters, 10, 1000,
                                       10)

plt.scatter(points[:, 0], points[:, 1], s=1)
plt.scatter(centroids_initial[:, 0], centroids_initial[:, 1], s=10)
plt.scatter(centroids[:, 0], centroids[:, 1], s=30, c="red")
plt.show()