import scipy.io
import sklearn.cluster
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the 3d projection on older matplotlib)


def clust(elect_coords, n_clusts, iters, init_clusts):
    # Load resultant coordinates from the Hough circles transform
    # coords = scipy.io.loadmat(elect_coords)
    # dat = coords.get('elect_coords')
    dat = elect_coords

    # Configure and fit k-means
    cluster = sklearn.cluster.KMeans(n_clusters=n_clusts, init='k-means++',
                                     max_iter=iters, n_init=init_clusts,
                                     verbose=0)
    cluster.fit(dat)

    # Grab a vector for plotting each dimension
    x = list(cluster.cluster_centers_[:, 0])
    y = list(cluster.cluster_centers_[:, 1])
    z = list(cluster.cluster_centers_[:, 2])

    # Save labels and centers for downstream analysis in MATLAB format
    scipy.io.savemat('k_labels.mat', {'labels': cluster.labels_})
    scipy.io.savemat('k_coords.mat', {'coords': cluster.cluster_centers_})

    # Plot the k-means cluster centers in 3D
    # (ax.scatter replaces the unbound Axes3D.scatter3D call, and plt.show()
    # takes no figure argument)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(x, y, z, s=64)
    plt.show()

    return cluster.cluster_centers_, cluster.labels_
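# A minimal usage sketch for clust() on synthetic data ('coords' stands in
# for real Hough-transform output, which the function would normally receive):
import numpy as np

coords = np.random.rand(500, 3) * 100        # 500 fake 3-D electrode candidates
centers, labels = clust(coords, n_clusts=8, iters=300, init_clusts=10)
print(centers.shape)                          # (8, 3): one center per cluster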
def get_reduce_cluster_train(train_x, test_x, train_y, test_y, name, k):
    clusters = {'kmeans': get_kmeans(k), 'exmax': get_exmax(k)}
    reducers = {
        'pca': get_pca(),
        'ica': get_ica(),
        'randproj': get_randproj(),
        'kernel': get_kernel()
    }
    results = []
    for cluster_name, cluster in clusters.items():
        for reduction_name, reducer in reducers.items():
            one_hot = OneHotEncoder()

            # Train: reduce dimensions, then re-express each sample as its
            # cluster membership
            reduced_train = reducer.fit_transform(train_x)
            if cluster_name == 'exmax':
                # Expectation-maximization yields soft assignments
                cluster.fit(reduced_train)
                transformed_train = cluster.predict_proba(reduced_train)
            else:
                # k-means yields hard labels, so one-hot encode them
                transformed_train = cluster.fit_predict(reduced_train)
                transformed_train = one_hot.fit_transform(
                    transformed_train.reshape(-1, 1)).toarray()
            nn = MLPClassifier(hidden_layer_sizes=[256] * 3,
                               learning_rate_init=1e-2,
                               early_stopping=True,
                               max_iter=10000)
            nn.fit(transformed_train, train_y)
            train_acc = nn.score(transformed_train, train_y)

            # Test: apply the fitted reducer, clusterer, and encoder
            reduced_test = reducer.transform(test_x)
            if cluster_name == 'exmax':
                transformed_test = cluster.predict_proba(reduced_test)
            else:
                transformed_test = cluster.predict(reduced_test)
                transformed_test = one_hot.transform(
                    transformed_test.reshape(-1, 1)).toarray()
            test_acc = nn.score(transformed_test, test_y)
            results.append({
                'name': f'{name}-{reduction_name}-{cluster_name}',
                'train_acc': train_acc,
                'test_acc': test_acc
            })
    df = pd.DataFrame.from_records(results,
                                   columns=['name', 'train_acc', 'test_acc'])
    print(df)
    df.to_csv(outputs_path / f'reduce-train-cluster-{name}.csv')
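# The factory helpers used above are defined elsewhere; a plausible minimal
# sketch follows (the exact constructor arguments are assumptions, not the
# original code):
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA, FastICA, KernelPCA
from sklearn.random_projection import GaussianRandomProjection


def get_kmeans(k):
    return KMeans(n_clusters=k)


def get_exmax(k):
    # Gaussian mixture fit by expectation-maximization; exposes predict_proba
    return GaussianMixture(n_components=k)


def get_pca():
    return PCA(n_components=0.95)   # keep 95% of the variance (assumed)


def get_ica():
    return FastICA()


def get_randproj():
    return GaussianRandomProjection(n_components=64)  # assumed size


def get_kernel():
    return KernelPCA(kernel='rbf', n_components=64)   # assumed settings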
def exercise_1_b_1():
    # EXERCISES
    # Ex.1.b.1: Choosing the number of clusters in Agglomerative clustering
    # Please make sure that "shopping-data.csv" is stored in the same folder
    # as this notebook. The file contains customers' shopping data; suppose
    # our task is to segment customers based on their shopping patterns.
    customer_data = pd.read_csv('shopping-data.csv')

    # Before we start, let's explore this dataset a little
    ## Shape of the dataset
    print(customer_data.shape)
    ## Print the first 5 data items
    customer_data.head()

    # We suspect that the last two columns could be used for clustering
    # Extract the last 2 columns
    data = customer_data.iloc[:, 3:5].values

    # Use a dendrogram to visualize hierarchical clustering for this dataset
    plt.figure(figsize=(10, 7))
    plt.title("Customer Dendrograms")
    dend = shc.dendrogram(shc.linkage(data, method='ward'))

    # Now, let's use the dendrogram to successfully apply Agglomerative clustering
    # QUESTION: Based on the dendrogram above, what would be the appropriate
    # number of clusters?

    # Compute Agglomerative Clustering
    ### YOUR CODE HERE (fill in the "None")
    # Hint: define an Agglomerative Clustering object
    # (the pooling_func argument has been removed from scikit-learn, so it is
    # omitted here)
    cluster = AgglomerativeClustering(n_clusters=2, affinity="euclidean",
                                      memory=None, connectivity=None,
                                      compute_full_tree='auto',
                                      linkage='ward')
    # Hint: compute Agglomerative Clustering for our dataset. The "cluster"
    # variable must have a labels_ attribute afterwards.
    cluster.fit(data)
    ### END OF YOUR CODE

    # Visualization
    plt.figure(figsize=(10, 7))
    plt.scatter(data[:, 0], data[:, 1], c=cluster.labels_, cmap='rainbow')
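# One way to answer the QUESTION programmatically: cut the same 'ward'
# linkage at a distance threshold and count the resulting flat clusters.
# A minimal sketch (the threshold value is an assumption chosen to
# illustrate the API, not part of the exercise):
from scipy.cluster.hierarchy import fcluster, linkage

Z = linkage(data, method='ward')
flat = fcluster(Z, t=200, criterion='distance')  # cut the dendrogram at height 200
print(len(set(flat)), "clusters at this threshold")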
def kmeans(self):
    # Bundle the configured hyperparameters and fit k-means on self.X.
    # Note: precompute_distances and n_jobs are deprecated (and later removed)
    # in recent scikit-learn releases.
    kmeans_params = {
        "n_clusters": self.n_clusters,
        "init": self.init_centers,
        "n_init": self.n_init,
        "max_iter": self.max_iter,
        "tol": self.tol,
        "precompute_distances": self.precompute_distances,
        "verbose": self.verbose,
        "random_state": self.random_state,
        "copy_x": self.copy_x,
        "n_jobs": self.n_jobs,
        "algorithm": self.algorithm
    }
    cluster = KMeans(**kmeans_params)
    cluster.fit(self.X)
    return cluster
import sklearn.cluster
import sklearn.preprocessing
import pandas as pd
import numpy as np
from matplotlib import pyplot

# Collect the last 21 days of flow counts for each of the 2000 shops
# (renamed from 'input' to avoid shadowing the builtin)
data = np.empty((0, 21))
stander = sklearn.preprocessing.MaxAbsScaler()
for i in range(1, 2001):
    file = 'flow_per_shop/' + str(i) + '.csv'
    info = pd.read_csv(file)
    ts = info['count'].values
    ts = ts[-21:]
    # MaxAbsScaler expects a 2-D array; scale the series as a single column
    ts = stander.fit_transform(ts.reshape(-1, 1)).ravel()
    data = np.vstack((data, ts))
print(data)

# Eight clusters, matching the plotting loop below
# (the original had n_clusters=1, which would leave only label 0)
cluster = sklearn.cluster.KMeans(n_clusters=8)
af = cluster.fit(data)
# print(af.cluster_centers_)
labels = af.labels_
np.savetxt('labels.csv', labels, fmt='%d')

# Plot up to 200 series from each cluster
for i in range(8):
    indice = np.where(labels == i)[0]
    pyplot.figure()
    for item in indice[:200]:
        pyplot.plot(data[item, :])
pyplot.show()
# Requires: import numpy as np, import sklearn.cluster; COLOR_BG is a
# module-level constant defined elsewhere.
def create_sanogram(elements_set, img, error_func, replace_color=None, n_colors=5):
    grid_size = elements_set.block_px

    # block coordinates (// for integer division under Python 3)
    h, w = img.shape[:2]
    blocks = []
    for iy in range(h // grid_size):
        for ix in range(w // grid_size):
            by, bx = iy * grid_size, ix * grid_size
            ey, ex = min(h, by + grid_size), min(w, bx + grid_size)
            if (ey - by < grid_size) or (ex - bx < grid_size):
                continue
            patch = img[by:ey, bx:ex]
            blocks.append((iy, ix, by, bx, ey, ex, patch))
    # grid dimensions in blocks (iy, ix hold the last loop indices)
    bh, bw = iy + 1, ix + 1

    # init labels unassigned.
    labels = np.ndarray((bh, bw), dtype=np.int32)
    labels[:, :] = -1

    # find the best-matching element for each patch
    for iy, ix, by, bx, ey, ex, patch in blocks:
        errors = [error_func(patch, elem) for elem in elements_set.elements]
        labels[iy, ix] = np.argmin(errors)

    # determine the new color
    if replace_color == 'direct':
        # use the mean color of the target patch directly.
        color_map = np.ndarray((bh, bw, 3), dtype=img.dtype)
        for iy, ix, by, bx, ey, ex, patch in blocks:
            label = labels[iy, ix]
            if not elements_set.elements[label].is_background:
                mean_color = patch[elements_set.elements[label].shape].mean(axis=0)
                color_map[iy, ix] = mean_color
    elif replace_color == 'representative':
        # find <n_colors> representative colors from the input image and use
        # the nearest one for each patch.
        colors = img.reshape((-1, 3))
        cluster = sklearn.cluster.KMeans(n_clusters=n_colors)
        cluster.fit(colors)
        # assign colors (KMeans.predict expects a 2-D array of samples)
        color_map = np.ndarray((bh, bw, 3), dtype=img.dtype)
        for iy, ix, by, bx, ey, ex, patch in blocks:
            label = labels[iy, ix]
            if not elements_set.elements[label].is_background:
                mean_color = patch[elements_set.elements[label].shape].mean(axis=0)
                representative_index = cluster.predict(mean_color.reshape(1, -1))[0]
                color_map[iy, ix] = cluster.cluster_centers_[representative_index]
    elif replace_color is None:
        # color is associated with the patch shape according to elements_set.
        color_map = None
    else:
        color_map = None
        print('unknown replace_color=%s' % replace_color)

    # apply labels
    res_img = np.zeros_like(img) + COLOR_BG
    for iy, ix, by, bx, ey, ex, patch in blocks:
        label = labels[iy, ix]
        if label >= 0:
            if color_map is None:
                res_img[by:ey, bx:ex] = elements_set.elements[label].patch
            else:
                res_img[by:ey, bx:ex][elements_set.elements[label].shape] = color_map[iy, ix]
                res_img[by:ey, bx:ex][~elements_set.elements[label].shape] = elements_set.background_color
    return res_img
# Split into train and test sets. The opening of this call was cut off in the
# original; 'features' and 'labels' (and the label split names) are assumed,
# while features_train/features_test are what the rest of the snippet uses.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.30, random_state=0)

# %%
# fit the model (precompute_distances and n_jobs were removed from KMeans in
# recent scikit-learn releases, so they are omitted here)
cluster = sklearn.cluster.KMeans(n_clusters=8,
                                 init='k-means++',
                                 n_init=10,
                                 max_iter=300,
                                 tol=0.0001,
                                 verbose=0,
                                 random_state=None,
                                 copy_x=True)
cluster.fit(features_train)

# %%
# Predict test features
result = cluster.predict(features_test)

# %%
result

# %%
# Perform a plot of the clusters
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# %%
# Use Principal Component Analysis (PCA) to reduce the dimensions
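# %%
# A minimal sketch of the announced PCA step, assuming features_test and the
# predicted cluster labels ('result') from above: project the test features
# to 2-D and color each point by its assigned cluster.
pca = PCA(n_components=2)
reduced = pca.fit_transform(features_test)
plt.scatter(reduced[:, 0], reduced[:, 1], c=result, cmap='viridis', s=10)
plt.title('KMeans clusters in the first two principal components')
plt.show()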
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

tr_aug_features = np.load('./data/voc12/features/train_aug_features.npy',
                          allow_pickle=True)
tr_features = np.load('./data/voc12/features/train_features.npy',
                      allow_pickle=True)
val_features = np.load('./data/voc12/features/val_features.npy',
                       allow_pickle=True)
features = tr_aug_features.tolist() + tr_features.tolist() + val_features.tolist()
df = pd.DataFrame.from_records(features)
df.drop_duplicates('img_name', inplace=True)

# Flatten each per-image feature into a plain vector
df['feature'] = df['feature'].apply(lambda x: x[0].reshape(-1).tolist())
X = np.array(df['feature'].values.tolist())

# Renamed from 'cluster' to avoid shadowing the sklearn.cluster module;
# n_jobs was removed from KMeans in recent scikit-learn releases.
kmeans = KMeans(n_clusters=20).fit(X)

# One-hot encode each image's cluster assignment
label_d = dict()
category_size = len(set(kmeans.labels_))
for img_name, label in zip(df['img_name'].values, kmeans.labels_):
    cluster_label = np.zeros(category_size)
    cluster_label[label] = 1
    label_d[img_name] = cluster_label
np.save('./data/voc12/cls_kmeans_labels.npy', label_d)
with open('./data/voc12/category_size.txt', mode='a') as f:
    f.write('%s %s\n' % ('kmeans_id', category_size))
import sklearn.cluster

pd_airports_notna = pd_airports.dropna()

n_clusters = 10
# precompute_distances and n_jobs were removed from KMeans in recent
# scikit-learn releases, so they are omitted here.
cluster = sklearn.cluster.KMeans(n_clusters=n_clusters,
                                 init='k-means++',
                                 n_init=10,
                                 max_iter=3000,
                                 tol=0.0001,
                                 verbose=0,
                                 random_state=None,
                                 copy_x=True)
cluster.fit(pd_airports_notna[["LONGITUDE", "LATITUDE"]])
cluster.cluster_centers_

from matplotlib.lines import Line2D

fig = plt.figure(figsize=(25, 20))
# Renamed from 'map'/'log' to avoid shadowing the builtin and for clarity
usa = usa_map()
lon, lat = (pd_flights_short_ori_airport['ORI_LONGITUDE'],
            pd_flights_short_ori_airport['ORI_LATITUDE'])
lon, lat = usa(lon, lat)
count_ori = pd_flights_short_ori_airport['COUNT_ORI_AIRPORT']
for x, y, c in zip(lon, lat, count_ori):
    usa.scatter(x, y, s=c / 1200, c='green')
cen_lon, cen_lat = usa(cluster.cluster_centers_[:, 0],
                       cluster.cluster_centers_[:, 1])
import matplotlib.pyplot as plt
from sklearn import datasets

# Load data
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Step 1: Model (renamed from 'cluster' to avoid shadowing the module)
from sklearn import cluster
kmeans = cluster.KMeans(n_clusters=2)

# Step 2: Training
kmeans.fit(X)

# Step 3: Evaluation
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)

# Mean shift clustering
from sklearn.cluster import MeanShift
ms = MeanShift()
ms.fit(iris.data)

# Agglomerative clustering
from sklearn.cluster import AgglomerativeClustering
groups = AgglomerativeClustering(n_clusters=2)
groups.fit_predict(iris.data)

# Project to 2-D with PCA and plot the mean-shift labels
from sklearn.decomposition import PCA
pca = PCA(n_components=2).fit(iris.data)
pca_2d = pca.transform(iris.data)
for i in range(pca_2d.shape[0]):
    if ms.labels_[i] == 1:
        c1 = plt.scatter(pca_2d[i, 0], pca_2d[i, 1], c='r', marker='+')
    elif ms.labels_[i] == 0:
        c2 = plt.scatter(pca_2d[i, 0], pca_2d[i, 1], c='g', marker='o')
# plt.title('Mean shift finds 2 clusters')
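# A small follow-up sketch (not in the original): compare each clustering
# against the true iris species with the adjusted Rand index.
from sklearn.metrics import adjusted_rand_score

for name, labels in [('kmeans', kmeans.labels_),
                     ('meanshift', ms.labels_),
                     ('agglomerative', groups.labels_)]:
    print(name, adjusted_rand_score(y, labels))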
# rbDetection, Labels, cluster, image_list and concated_images are defined
# earlier in the surrounding script.
img_data = rbDetection(img.data)
w, h = original_shape = tuple(img_data.shape)
if concated_images is None:
    concated_images = np.reshape(img_data, (w * h, -1))
else:
    concated_images = np.r_[concated_images,
                            np.reshape(img_data, (w * h, -1))]

start_time = time.time()
# If the clusterer has been fitted already (labels_ exists), update it
# incrementally; otherwise do a full fit. An explicit hasattr check replaces
# the original bare try/except. Note that partial_fit requires an incremental
# estimator such as MiniBatchKMeans (plain KMeans has no partial_fit).
if hasattr(cluster, 'labels_'):
    cluster.partial_fit(concated_images)
else:
    cluster.fit(concated_images)
print(concated_images.shape[0], time.time() - start_time)

# Get the labels (cloud = 1, no cloud = 0)
labels = Labels(cluster.labels_ * (-1) + 1)

# Split the labels into the original image parts
splitted_labels = labels.splitUp(indices_or_sections=len(image_list))
for key, splitted_label in enumerate(splitted_labels):
    targets = None
    mini_images = None
    infos = None
    # Reshape the labels into the width and height of the image
    label = splitted_label.reshape((w, h), replace=False)
    start_time = time.time()
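# For reference, a minimal self-contained sketch of the incremental pattern
# used above, with MiniBatchKMeans standing in for the 'cluster' object
# (an assumption; the original estimator is configured elsewhere):
import numpy as np
from sklearn.cluster import MiniBatchKMeans

mbk = MiniBatchKMeans(n_clusters=2, random_state=0, n_init=3)
for batch in np.array_split(np.random.rand(10000, 3), 10):
    if hasattr(mbk, 'labels_'):
        mbk.partial_fit(batch)   # incremental update on each new batch
    else:
        mbk.fit(batch)           # first batch: full fit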
for n in n_clusters:
    clust = cluster.KMeans(n_clusters=n).fit(data)
    pred = clust.predict(data)
    centers = clust.cluster_centers_
    score = silhouette_score(data, pred)
    print("The silhouette_score for {} clusters is {}".format(n, score))
# The silhouette score measures the separation between clusters: it should be
# maximized, while data points stay close together within a cluster.

#################################
# Requires the KElbowVisualizer from yellowbrick (pip install yellowbrick)
model = cluster.KMeans()
from yellowbrick.cluster import KElbowVisualizer
kelb_graph = KElbowVisualizer(model, k=(1, 8))
kelb_graph.fit(data)
kelb_graph.poof()  # renamed to show() in newer yellowbrick releases

##################################
clust_range = range(1, 10)
clust_err = []
for num_clust in clust_range:
    # Use a new name here: reassigning 'cluster' would shadow the
    # sklearn.cluster module and break the next loop iteration
    km = cluster.KMeans(n_clusters=num_clust)
    km.fit(data)
    clust_err.append(km.inertia_)
cluster_df = pd.DataFrame({
    "Num_cluster": clust_range,
    "Cluster_err": clust_err
})
cluster_df[0:10]
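# A short sketch plotting the elbow curve from cluster_df built above; the
# bend in inertia versus k suggests a reasonable cluster count.
import matplotlib.pyplot as plt

plt.plot(cluster_df["Num_cluster"], cluster_df["Cluster_err"], marker='o')
plt.xlabel("Number of clusters k")
plt.ylabel("Within-cluster sum of squares (inertia)")
plt.title("Elbow method")
plt.show()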
print "Max. no. of cluster : ", max_cluster Initial_Label = [] max_choromosome_length = (max_cluster) * len(Idata[0]) print "Max. length of chromosome : ", max_choromosome_length CH = input("enter No. of chromosome : ") T = int(input("Enter no. of generation- ")) K = [] for i in range(1, CH + 1): counter += 1 pop = [] n = randint(2, max_cluster) K.insert(i, n) print "no. of cluster : ", n cluster = KMeans(n_clusters=n) cluster.fit(Idata) label = cluster.predict(Idata) centers = cluster.cluster_centers_ a = centers.tolist() for j in range(len(a)): for k in range(len(Idata[0])): pop.append(a[j][k]) if not max_choromosome_length - len(pop) == 0: extra_zero = max_choromosome_length - len(pop) pop.extend(0 for x in range(extra_zero)) x.insert(i, pop) ss = silhouette_score(Idata, label) pbm = cal_pbm_index(n, Idata, centers, label) sil_sco.insert(i, ss) PBM.insert(i, pbm) Initial_Label.insert(i, label.tolist())