def compute_clusters(self, n_ones_clusters=1000, n_zeros_clusters=1000):
    """
    Compute cluster centers using a MiniBatch K-means algorithm.

    Also compute weights for each centroid, where the weight is equivalent
    to the number of points assigned to that centroid.
    """
    ones_kmeans = cluster.MiniBatchKMeans(n_clusters=n_ones_clusters)
    zeros_kmeans = cluster.MiniBatchKMeans(n_clusters=n_zeros_clusters)

    ones_idx = np.where(self.targets == 1)
    zeros_idx = np.where(self.targets == 0)

    normalized_training, normalized_targets, normalized_tests = self.get_normalized_production_set()

    ones_labels = ones_kmeans.fit_predict(normalized_training[ones_idx])
    zeros_labels = zeros_kmeans.fit_predict(normalized_training[zeros_idx])

    # Count how many points were assigned to each centroid.
    ones_weights = np.zeros(n_ones_clusters)
    zeros_weights = np.zeros(n_zeros_clusters)
    for label in ones_labels:
        ones_weights[label] += 1
    for label in zeros_labels:
        zeros_weights[label] += 1

    np.savetxt("%s/data/ones_cluster_centers_n%d.dat" % (self.cwd, n_ones_clusters),
               ones_kmeans.cluster_centers_)
    np.savetxt("%s/data/ones_weights_n%d.dat" % (self.cwd, n_ones_clusters),
               ones_weights)
    np.savetxt("%s/data/zeros_cluster_centers_n%d.dat" % (self.cwd, n_zeros_clusters),
               zeros_kmeans.cluster_centers_)
    np.savetxt("%s/data/zeros_weights_n%d.dat" % (self.cwd, n_zeros_clusters),
               zeros_weights)

def make_folds(X, y, target_size, method='random'):
    n_Y = y.shape[0]
    n_folds = int(n_Y / target_size) + int(target_size > n_Y)

    if method == 'random':
        fold_assignment = np.random.permutation(n_Y) % n_folds

    elif method == 'cluster':
        # Thanks scikit
        print('Clustering [sklearn.cluster] inputs')
        clusterer = skcluster.MiniBatchKMeans(n_clusters=n_folds, batch_size=1000)
        fold_assignment = clusterer.fit_predict(X)

    elif method == 'rcluster':
        print('Clustering [sklearn.cluster] inputs')
        clusters = skcluster.MiniBatchKMeans(n_clusters=n_folds, batch_size=1000,
                                             compute_labels=True).fit(X)
        Xcluster = clusters.cluster_centers_

        print('Interpolating probability')
        n_X = X.shape[0]
        assign_prob = np.zeros((n_folds, n_X))
        tris = Delaunay(Xcluster)
        base_labels = clusters.labels_
        for i in range(n_folds):
            indicator = np.zeros(n_folds)
            indicator[i] = 1.
            row = interp.LinearNDInterpolator(tris, indicator, fill_value=-1)(X)
            row[row < 0] = base_labels[row < 0] == i
            assign_prob[i] = row

        # now use these as selection probabilities
        assign_prob = np.cumsum(assign_prob, axis=0)
        rvec = np.random.random(n_X)
        fold_assignment = np.sum(rvec[np.newaxis, :] < assign_prob, axis=0)

        # verify fold assignment?
        # pl.scatter(X[:, 0], X[:, 1], c=fold_assignment)
        # pl.show()
        # exit()

    else:
        raise NameError('Unrecognised fold method:' + method)

    fold_inds = np.unique(fold_assignment)
    folds = Folds(n_folds, [], [], [])  # might contain lists in the multitask case
    where = lambda y, v: y[np.where(v)[0]]
    for f in fold_inds:
        folds.X.append(where(X, fold_assignment == f))
        folds.Y.append(where(y, fold_assignment == f))
        folds.flat_y.append(where(y, fold_assignment == f))
    return folds

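# A minimal usage sketch for make_folds() above (not from the original source).
# It assumes the module-level names the snippet relies on are in scope:
# skcluster for sklearn.cluster, Delaunay from scipy.spatial, interp for
# scipy.interpolate, and a Folds container whose fields are assumed here to be
# (n_folds, X, Y, flat_y).
import numpy as np

X_demo = np.random.rand(5000, 2)
y_demo = np.random.rand(5000, 1)
folds = make_folds(X_demo, y_demo, target_size=1000, method='cluster')
print(folds.n_folds, [fx.shape[0] for fx in folds.X])
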
def mini_cv(df):
    df1 = df[['pickup_x', 'pickup_y']].rename(columns={
        'pickup_x': 'x',
        'pickup_y': 'y'
    })
    df2 = df[['dropoff_x', 'dropoff_y']].rename(columns={
        'dropoff_x': 'x',
        'dropoff_y': 'y'
    })
    df3 = pd.concat([df1, df2])
    x = df3[['x', 'y']].as_matrix()

    nlist = list(range(3, 61))
    hyperparams = {
        'n_clusters': nlist,
        'init': ['k-means++', 'random'],
        'batch_size': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    }
    l1 = list(ParameterGrid(hyperparams))
    l2 = []
    for i in l1:
        gc.enable()
        gc.collect()
        model = cluster.MiniBatchKMeans(**i)
        y_pre = model.fit_predict(x)
        name = str(i)
        # plt.figure(figsize=(12, 12))
        # plt.title(name)
        # plt.scatter(x[:, 0], x[:, 1], c=y_pre)
        # plt.show()
        chs = metrics.calinski_harabaz_score(x, y_pre)
        l2.append((chs, i))
        print('Score for this fit is', chs)
    return max(l2)

def kmeans(X, k, max_iter=16, init='kmc2'):
    X = X.astype(np.float32)
    np.random.seed(123)

    # if k is huge, initialize centers with cartesian product of centroids
    # in two subspaces
    if init == 'subspaces':
        sqrt_k = int(np.sqrt(k) + .5)
        if sqrt_k ** 2 != k:
            raise ValueError("K must be a square number if init='subspaces'")

        _, D = X.shape
        # use integer division so the column slices stay valid indices
        centroids0, _ = kmeans(X[:, :D // 2], sqrt_k, max_iter=2)
        centroids1, _ = kmeans(X[:, D // 2:], sqrt_k, max_iter=2)
        seeds = np.empty((k, D), dtype=np.float32)
        for i in range(sqrt_k):
            for j in range(sqrt_k):
                row = i * sqrt_k + j
                seeds[row, :D // 2] = centroids0[i]
                seeds[row, D // 2:] = centroids1[j]
    elif init == 'kmc2':
        seeds = kmc2.kmc2(X, k).astype(np.float32)
    else:
        raise ValueError("init parameter must be one of {'kmc2', 'subspaces'}")

    estimator = cluster.MiniBatchKMeans(k, init=seeds, max_iter=max_iter).fit(X)
    return estimator.cluster_centers_, estimator.labels_

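# A minimal usage sketch for kmeans() above (not part of the original source).
# It assumes the snippet's own imports (numpy as np, sklearn.cluster as cluster,
# kmc2) are available; the recursive calls used by init='subspaces' fall back to
# kmc2 seeding, and k must be a perfect square for that path.
import numpy as np

X_demo = np.random.rand(2000, 32).astype(np.float32)
centroids, labels = kmeans(X_demo, 16, max_iter=8, init='subspaces')
print(centroids.shape)  # (16, 32)
print(labels.shape)     # (2000,)
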
def compute_codebook(D, code_size, nfeatures, fold_i=None, features='sift'):
    if features == 'sift':
        features = ''  # do not change filename for basic sift
    elif features == 'dense_sift':
        features = 'dense_sift_'

    if fold_i is not None:
        code_name = "codebooks/" + str(code_size) + "_" + features + str(
            nfeatures) + "_fold_" + str(fold_i) + ".dat"
    else:
        code_name = "codebooks/" + str(code_size) + "_" + features + str(
            nfeatures) + ".dat"

    if not os.path.isfile(code_name):
        print 'Computing kmeans with ' + str(code_size) + ' centroids'
        init = time.time()
        codebook = cluster.MiniBatchKMeans(n_clusters=code_size,
                                           verbose=False,
                                           batch_size=code_size * 20,
                                           compute_labels=False,
                                           reassignment_ratio=10**-4)
        codebook.fit(D)
        cPickle.dump(codebook, open(code_name, "wb"))
        end = time.time()
        print 'Done in ' + str(end - init) + ' secs.'
    else:
        codebook = cPickle.load(open(code_name, "r"))
    return codebook

def k_means(n_clusters, samples):
    """
    Run k-means clustering on vertex coordinates.

    Parameters:
    - - - - -
    n_clusters : int
        number of clusters to generate
    samples : array
        Euclidean-space coordinates of vertices
    """
    # Run Mini-Batch K-Means
    k_means = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                      init='k-means++',
                                      max_iter=1000,
                                      batch_size=10000,
                                      verbose=False,
                                      compute_labels=True,
                                      max_no_improvement=100,
                                      n_init=5,
                                      reassignment_ratio=0.1)
    k_means.fit(samples)

    labels = k_means.labels_.copy()
    labels = labels.astype(np.int32) + 1

    return labels

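# A minimal usage sketch for k_means() above (not from the original source):
# cluster hypothetical 3-D vertex coordinates into 10 parcels. Note that the
# returned labels are shifted to start at 1 rather than 0.
import numpy as np

coords = np.random.rand(5000, 3)     # hypothetical vertex coordinates
parcel_labels = k_means(10, coords)
print(np.unique(parcel_labels))      # values 1..10
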
def create_clusters_batch(self, models):
    all_purity = {'MiniBatchKMeans': [], 'AgglomerativeClustering': []}
    two_means = cluster.MiniBatchKMeans(init='k-means++',
                                        n_clusters=len(self.categories))
    average_linkage = cluster.AgglomerativeClustering(linkage="average",
                                                      affinity="cosine",
                                                      n_clusters=len(self.categories))
    clustering_algorithms = (('MiniBatchKMeans', two_means),
                             ('AgglomerativeClustering', average_linkage))

    for name, algorithm in clustering_algorithms:
        print(name)
        for m in models:
            self.model = m
            labels, embeddings, colors, _, cats = self.get_embeddings_and_labels()
            algorithm.fit(embeddings)
            if hasattr(algorithm, 'labels_'):
                cluster_labels = algorithm.labels_.astype(np.int)
            else:
                cluster_labels = algorithm.predict(embeddings)
            purity = self.purity_score(np.array(cats), np.array(cluster_labels))
            all_purity[name].append(purity)
            print(round(purity, 3))

    print("Average Purity for Kmeans: {} for Agg: {}".format(
        (sum(all_purity['MiniBatchKMeans']) /
         len(all_purity['MiniBatchKMeans'])),
        (sum(all_purity['AgglomerativeClustering']) /
         len(all_purity['AgglomerativeClustering']))))

def prepare_input(self):
    targets = []
    classifier = cluster.MiniBatchKMeans(n_clusters=self.n_bins_default,
                                         batch_size=BATCH_SIZE,
                                         compute_labels=False)
    for images, labels in tqdm(
            self.eval_batches(),
            total=len(self.eval_loader),
            desc="MutualInfo: quantizing input data. Stage 1"):
        images = images.flatten(start_dim=1)
        classifier.partial_fit(images, labels)
        targets.append(labels)
    targets = torch.cat(targets, dim=0)
    self.accuracy_estimator = AccuracyFromMutualInfo(
        n_classes=len(targets.unique()))
    self.quantized['target'] = targets.numpy()

    centroids_predicted = []
    for images, _ in tqdm(
            self.eval_batches(),
            total=len(self.eval_loader),
            desc="MutualInfo: quantizing input data. Stage 2"):
        images = images.flatten(start_dim=1)
        centroids_predicted.append(classifier.predict(images))
    self.quantized['input'] = np.hstack(centroids_predicted)

def minibatchkmeans(self):
    minibatch_kmeans = cluster.MiniBatchKMeans(n_clusters=self.n_clusters,
                                               init='k-means++',
                                               batch_size=50)
    minibatch_kmeans.fit(self.data)
    # print minibatch_kmeans.labels_
    # print self.labels
    return self.report(self.labels, minibatch_kmeans.labels_), minibatch_kmeans.labels_

def clusterKmeans(self, file, numClus, pca=False):
    print("Clustering...")
    x = self.loadAndPCA(file, pca)
    self.numClusters = numClus

    # Check nltk clustering with cosine distance
    clusterer = clus.MiniBatchKMeans(numClus,
                                     verbose=True,
                                     batch_size=5000,
                                     max_no_improvement=1000,
                                     compute_labels=True,
                                     reassignment_ratio=0.001)
    # clusterer = clus.KMeans(n_clusters=numClus, n_jobs=-1, verbose=1)
    scores = clusterer.fit_transform(x)
    print("Clustering done.")

    counts = Counter(clusterer.labels_)
    # Add counts
    for i in range(0, len(counts)):
        self.clusSizes.append(counts[i])
    print("Clustering output: ")
    print(self.clusSizes)

    # TODO: Check the outcome of clustering from different
    # embedding sizes and with/without PCA
    return clusterer.labels_, scores

def _load_cluster(self, cluster_file):
    from mpl_toolkits.basemap import Basemap
    from sklearn import cluster

    bm_param, km_param = np.load(cluster_file)
    self.m = Basemap(resolution='h', **bm_param)
    self.km = cluster.MiniBatchKMeans(n_clusters=km_param.shape[0])
    self.km.cluster_centers_ = km_param

def __init__(self, n_clusters=50, pca_n_components=20, kmpca_n_components=3,
             kernel_n_components=30):
    self.counter = text.CountVectorizer(stop_words='english',
                                        ngram_range=(1, 2),
                                        min_df=30,
                                        binary=True,
                                        lowercase=True)
    self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                      n_init=10,
                                      batch_size=10000,
                                      verbose=1)
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5,
                                                     n_jobs=4)
    self.X_names = [
        'Title_CounterX', 'Title_ClusterdX', 'Title_KmX', 'Title_PCAX',
        'Title_PCAClusterdX', 'Title_RbfX', 'Title_TreeX'
    ]
    self.linear_feature_selector = None

def __init__(self, n_clusters=50, pca_n_components=30, kmpca_n_components=3,
             kernel_n_components=30):
    ## use (min_df = 30, max_df = 0.5) to generate a lot of features - more choice for feature selection
    ## use (min_df = 0.001, max_df = 0.05) to generate fewer features - better clustering
    self.counter = text.CountVectorizer(stop_words='english',
                                        charset='utf-8',
                                        charset_error='ignore',
                                        ngram_range=(1, 1),
                                        min_df=0.001,
                                        max_df=0.05,
                                        binary=True,
                                        lowercase=True)
    self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                      n_init=10,
                                      batch_size=10000,
                                      verbose=1)
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5,
                                                     n_jobs=4)
    self.X_names = [
        'Desc_CounterX', 'Desc_ClusterdX', 'Desc_KmX', 'Desc_PCAX',
        'Desc_PCAClusterdX', 'Desc_RbfX', 'Desc_TreeX'
    ]
    self.linear_feature_selector = None

def build_codebook(self, k):
    return cluster.MiniBatchKMeans(n_clusters=k,
                                   verbose=False,
                                   batch_size=k * 20,
                                   compute_labels=False,
                                   reassignment_ratio=10**-4,
                                   random_state=42)

def cluster(self, data) -> List[int]:
    c = cluster.MiniBatchKMeans(n_clusters=self.NumVisualWords,
                                init='k-means++',
                                init_size=self.NumVisualWords * 3,
                                max_iter=100).fit(data)
    self.WordCenters = c.cluster_centers_
    return c.labels_

def minibatch_kmeans(n_clusters: int, name: str = "minibatch_kmeans",
                     **kwargs) -> ClusterOperation:
    """Returns a ClusterOperation using the mini-batch k-means algorithm.

    Parameters
    ----------
    n_clusters : int
        number of clusters to create
    name : str
        name of this operation, default `minibatch_kmeans`
    kwargs :
        keyword arguments to pass to the sklearn.cluster.MiniBatchKMeans class

    Returns
    -------
    ClusterOperation
        Operation with MiniBatchKMeans algorithm

    Example
    -------
    >>> op = minibatch_kmeans(n_clusters=10)
    """
    model = skcluster.MiniBatchKMeans(n_clusters=n_clusters, **kwargs)
    return ClusterOperation(model=model, name=name)

def ClusterTrain(self, component=2, model='Agglomerative'):
    """Use a clustering method to divide the samples into categories
    unsupervisedly. Different models can be used:

    1. Spectral Clustering
    2. Agglomerative Clustering
    3. MiniBatch KMeans

    Parameters
    ----------
    component: int, the number of clusters to form.
    model: string, the clustering model to use ('spectral', 'Agglomerative'
        or 'MiniBatch').
    """
    print '-' * 49 + '\n' + 'Clustering\n' + '-' * 49
    clusterlist = {
        'spectral':
        cluster.SpectralClustering(n_clusters=component,
                                   eigen_solver='arpack',
                                   affinity="nearest_neighbors",
                                   random_state=0),
        'Agglomerative':
        cluster.AgglomerativeClustering(n_clusters=component,
                                        linkage='ward'),  # nice
        'MiniBatch':
        cluster.MiniBatchKMeans(n_clusters=component)
    }
    MyCluster = clusterlist[model]
    return MyCluster.fit_predict(self.Feature)

def main(args):
    print("Reading Data ...")
    ann_file = json.load(open(os.path.join(args.root, args.file_list), 'r'))
    data = []
    for _i, _a in enumerate(tqdm(ann_file['annotations'])):
        _, _, w, h = _a['bbox']
        data.append([w / 1920, h / 1920])
    data = np.array(data)

    if args.engine.startswith("sklearn"):
        if args.engine == "sklearn":
            km = cluster.KMeans(n_clusters=args.num_clusters,
                                tol=args.tol,
                                verbose=True)
        elif args.engine == "sklearn-mini":
            km = cluster.MiniBatchKMeans(n_clusters=args.num_clusters,
                                         tol=args.tol,
                                         verbose=True)
        km.fit(data)
        result = km.cluster_centers_
        # distance = km.inertia_ / data.shape[0]
        distance = avg_iou(data, result)
    else:
        result = k_means(data, args.num_clusters, args.tol)
        distance = avg_iou(data, result)

    write_anchors_to_file(result, distance, args.output)

def BoW_hardAssignment(k, D, Train_descriptors):
    # compute the codebook
    print 'Computing kmeans with ' + str(k) + ' centroids'
    init = time.time()
    codebook = cluster.MiniBatchKMeans(n_clusters=k,
                                       verbose=False,
                                       batch_size=k * 20,
                                       compute_labels=False,
                                       reassignment_ratio=10**-4,
                                       random_state=42)
    codebook.fit(D)
    end = time.time()
    print 'Done in ' + str(end - init) + ' secs.'

    # get train visual word encoding
    print 'Getting Train BoVW representation'
    init = time.time()
    visual_words = np.zeros((len(Train_descriptors), k), dtype=np.float32)
    for i in xrange(len(Train_descriptors)):
        words = codebook.predict(Train_descriptors[i])
        visual_words[i, :] = np.bincount(words, minlength=k)
    end = time.time()
    print 'Done in ' + str(end - init) + ' secs.'

    return words, visual_words, codebook

def cluster(file_list, output, n_clusters=None, max_files=None):
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    from mpl_toolkits.basemap import Basemap
    import numpy as np

    if n_clusters is None:
        n_clusters = 100

    # Parse the coordinates
    parser = CoordParser()
    c = np.array([parser(l) for l in open(file_list, 'r')])

    # Create the basemap parameters
    bnd = 0
    basemap_params = dict(projection='merc',
                          llcrnrlat=np.min(c[:, 0]) - bnd,
                          urcrnrlat=np.max(c[:, 0]) + bnd,
                          llcrnrlon=np.min(c[:, 1]) - bnd,
                          urcrnrlon=np.max(c[:, 1]) + bnd)

    # Select a subset of the coordinates to cluster
    if max_files is None:
        max_files = 100000
    np.random.shuffle(c)
    c = c[:max_files]

    # Project the coordinates into x, y coordinates
    m = Basemap(**basemap_params)
    x, y = m(c[:, 1], c[:, 0])

    from sklearn import cluster
    km = cluster.MiniBatchKMeans(n_clusters=n_clusters).fit(
        np.concatenate((x[:, None], y[:, None]), axis=1))
    np.save(output, (basemap_params, km.cluster_centers_))

def cluster_model(newdata, data, model_name, input_param):
    ds = data
    params = input_param

    if str.lower(model_name) == 'kmeans':
        cluster_obj = cluster.KMeans(n_clusters=params['n_clusters'])
    if str.lower(model_name) == str.lower('MiniBatchKMeans'):
        cluster_obj = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    if str.lower(model_name) == str.lower('SpectralClustering'):
        cluster_obj = cluster.SpectralClustering(n_clusters=params['n_clusters'])
    if str.lower(model_name) == str.lower('MeanShift'):
        cluster_obj = cluster.MeanShift(bandwidth=params['bandwidth'])
    if str.lower(model_name) == str.lower('DBSCAN'):
        cluster_obj = cluster.DBSCAN(eps=params['eps'])
    if str.lower(model_name) == str.lower('AffinityPropagation'):
        cluster_obj = cluster.AffinityPropagation(damping=params['damping'],
                                                  preference=params['preference'])
        cluster_obj.fit(ds)
    if str.lower(model_name) == str.lower('Birch'):
        cluster_obj = cluster.Birch(n_clusters=input_param['n_clusters'])
    if str.lower(model_name) == str.lower('GaussianMixture'):
        cluster_obj = mixture.GaussianMixture(n_components=params['n_clusters'],
                                              covariance_type='full')
        cluster_obj.fit(ds)

    if str.lower(model_name) in ['affinitypropagation', 'gaussianmixture']:
        model_result = cluster_obj.predict(ds)
    else:
        model_result = cluster_obj.fit_predict(ds)

    newdata[model_name] = pd.DataFrame(model_result)
    return newdata

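# A minimal usage sketch for cluster_model() above (not from the original
# source). Which keys input_param needs depends on the algorithm: 'n_clusters'
# for KMeans, MiniBatchKMeans, SpectralClustering, Birch and GaussianMixture;
# 'bandwidth' for MeanShift; 'eps' for DBSCAN; 'damping' and 'preference' for
# AffinityPropagation. A plain dict is used for newdata here; the original
# caller's container type is not shown in the snippet.
import numpy as np

data_demo = np.random.rand(500, 4)   # hypothetical feature matrix
results = {}
results = cluster_model(results, data_demo, 'MiniBatchKMeans', {'n_clusters': 3})
results = cluster_model(results, data_demo, 'DBSCAN', {'eps': 0.5})
print(results['MiniBatchKMeans'].head())
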
def _initialize_parameters(self, X, random_state):
    """Initialize the model parameters.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    random_state : RandomState
        A random number generator instance.
    """
    n_samples, _ = X.shape

    if self.init_params == 'kmeans':
        resp = np.zeros((n_samples, self.n_components))
        label = cluster.MiniBatchKMeans(
            n_clusters=self.n_components,
            n_init=1,
            random_state=random_state).fit(X).labels_
        resp[np.arange(n_samples), label] = 1
    elif self.init_params == 'random':
        resp = random_state.rand(n_samples, self.n_components)
        resp /= resp.sum(axis=1)[:, np.newaxis]
    else:
        raise ValueError("Unimplemented initialization method '%s'"
                         % self.init_params)

    self._initialize(X, resp)

def update_data(self, attrname, old, new):
    # store the models here
    models = [
        cluster.MiniBatchKMeans(n_clusters=self.k_means_slider.value),
        cluster.DBSCAN(eps=self.DBSCAN_slider.value),
        cluster.Birch(n_clusters=self.birch_slider.value),
        cluster.MeanShift(bandwidth=self.bandwidth, bin_seeding=True)
    ]
    # AgglomerativeClustering
    assert len(models) == 4

    for model in models:
        model.fit(self.X)

    for i in range(4):
        # check each fitted model (not the loop variable left over from above)
        if hasattr(models[i], 'labels_'):
            y_pred = models[i].labels_.astype(np.int)
        else:
            y_pred = models[i].predict(self.X)
        self.colors[i] = [Spectral6[f % 6] for f in y_pred]
        self.source[i].data['colors'] = self.colors[i]

def definition_clusters(subset):
    # Important -> normalize the data set that we use
    normalized_set = preprocessing.normalize(subset, norm='l2')

    print("-------- Defining the clusters...")

    k_means = cl.KMeans(init='k-means++', n_clusters=5, n_init=100)
    two_means = cl.MiniBatchKMeans(n_clusters=5, init='k-means++')

    # estimate bandwidth for mean shift
    bandwidth = cl.estimate_bandwidth(normalized_set, quantile=0.3)
    ms = cl.MeanShift(bandwidth=bandwidth)

    # connectivity matrix for structured Ward
    # connectivity = kneighbors_graph(normalized_set, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    # connectivity = 0.5 * (connectivity + connectivity.T)
    ward = cl.AgglomerativeClustering(n_clusters=100, linkage='ward')
    average = cl.AgglomerativeClustering(n_clusters=100, linkage='average')

    # Use these for small case studies
    # n_jobs=-1 so it runs in parallel
    # spectral = cl.SpectralClustering(n_clusters=3, affinity="nearest_neighbors", n_jobs=-1, n_neighbors=3)
    # dbscan = cl.DBSCAN(eps=0.3)

    # Add them to a list
    clustering_algorithms = (('K-Means', k_means),
                             ('MeanShift', ms),
                             ('MiniBatchMeans', two_means),
                             ('AgglomerativeWard', ward),
                             ('AgglomerativeAverage', average))
    return clustering_algorithms

def plotTripCluster(data, numClusters):
    '''
    Cluster all 1.4 million trips into a set of stereotypical template trips
    (e.g. 80) and then look at the distribution of this "bag of trips" and how
    it changes over time.
    '''
    tripAttributes = np.array(data.loc[:, [
        'src lat [km]', 'src long [km]', 'dst lat [km]', 'dst long [km]',
        'duration [min]'
    ]])
    meanTripAttr = tripAttributes.mean(axis=0)
    stdTripAttr = tripAttributes.std(axis=0)
    tripAttributes = stats.zscore(tripAttributes, axis=0)

    TripKmeansModel = cluster.MiniBatchKMeans(n_clusters=numClusters,
                                              batch_size=120000,
                                              n_init=100,
                                              random_state=1)
    clusterInds = TripKmeansModel.fit_predict(tripAttributes)

    clusterTotalCounts, _ = np.histogram(clusterInds, bins=numClusters)
    sortedClusterInds = np.flipud(np.argsort(clusterTotalCounts))

    plt.figure(figsize=(12, 4))
    plt.title('Cluster Histogram of all trip')
    plt.bar(range(1, numClusters + 1), clusterTotalCounts[sortedClusterInds])
    plt.ylabel('Frequency [counts]')
    plt.xlabel('Cluster index (sorted by cluster frequency)')
    plt.xlim(0, numClusters + 1)
    plt.savefig('Figures/cluster-histogram-trip.png')

    return meanTripAttr, stdTripAttr

def select_cluster_algorithm(algorithm, no_clusters):
    if algorithm == 'SpectralClustering':
        return cluster.SpectralClustering(n_clusters=no_clusters)
    elif algorithm == 'MiniBatchKMeans':
        return cluster.MiniBatchKMeans(n_clusters=no_clusters)
    elif algorithm == 'AgglomerativeClustering':
        return cluster.AgglomerativeClustering(n_clusters=no_clusters)

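# A minimal usage sketch for select_cluster_algorithm() above (not from the
# original source), assuming the snippet's own `from sklearn import cluster`
# import is in scope. Any algorithm name other than the three handled strings
# falls through and the function returns None.
import numpy as np

X_demo = np.random.rand(300, 2)
estimator = select_cluster_algorithm('MiniBatchKMeans', no_clusters=4)
labels = estimator.fit_predict(X_demo)
print(labels[:10])
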
def test_basic(self, single_chunk_blobs):
    X, y = single_chunk_blobs
    a = cluster.PartialMiniBatchKMeans(n_clusters=3, random_state=0)
    b = cluster_.MiniBatchKMeans(n_clusters=3, random_state=0)
    a.fit(X)
    b.partial_fit(X)
    assert_estimator_equal(a, b, exclude=['random_state_'])

def clustering_K_means(pontos):
    from sklearn import datasets
    import matplotlib.pyplot as plt
    from sklearn import cluster

    y_kmeans = []
    # print(len(pontos), len(y), type(pontos), type(y_kmeans))

    # cluster into 2 groups: one will be the keyword group and the other
    # will be the non-keyword group
    kmeans = cluster.MiniBatchKMeans(n_clusters=2, batch_size=10)
    y_kmeans = kmeans.fit_predict(pontos)

    for i in range(0, len(pontos)):
        if y_kmeans[i] == 0:
            print('\033[31m' + '0' + '\033[0;0m', pontos[i], y_kmeans[i],
                  listPhrase[i].phrase, "\n")
    for i in range(0, len(pontos)):
        if y_kmeans[i] == 1:
            print('\033[32m' + '1' + '\033[0;0m', pontos[i], y_kmeans[i],
                  listPhrase[i].phrase, "\n")

    # draw the points on the plot
    # the colours are defined by the value of y (group) and
    # the circles have an outline (edgecolor)
    plt.scatter(pontos[:, 0], pontos[:, 1], marker='o', c=y_kmeans, s=25,
                edgecolor='k')
    plt.show()

def kmeans(X, k, max_iter=16, init='kmc2'):
    X = X.astype(np.float32)

    # if k is huge, initialize centers with cartesian product of centroids
    # in two subspaces
    sqrt_k = int(np.sqrt(k) + .5)
    if k > 256 and sqrt_k**2 == k and init == 'subspaces':
        print "kmeans: clustering in subspaces first; k, sqrt(k) =" \
            " {}, {}".format(k, sqrt_k)
        _, D = X.shape
        centroids0, _ = kmeans(X[:, :D / 2], sqrt_k, max_iter=1)
        centroids1, _ = kmeans(X[:, D / 2:], sqrt_k, max_iter=1)
        seeds = np.empty((k, D), dtype=np.float32)
        for i in range(sqrt_k):
            for j in range(sqrt_k):
                row = i * sqrt_k + j
                seeds[row, :D / 2] = centroids0[i]
                seeds[row, D / 2:] = centroids1[j]
    elif init == 'kmc2':
        seeds = kmc2.kmc2(X, k).astype(np.float32)
    else:
        raise ValueError("init parameter must be one of {'kmc2', 'subspaces'}")

    estimator = cluster.MiniBatchKMeans(k, init=seeds, max_iter=max_iter).fit(X)
    return estimator.cluster_centers_, estimator.labels_

def K_means(coords, hyper_params={}):
    params = {'n_clusters': 2}  # default values
    params.update(hyper_params)

    clustering_obj = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    clustering_obj.fit(coords)
    y_pred = clustering_obj.labels_.astype(np.int)
    return y_pred

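# A minimal usage sketch for K_means() above (not from the original source),
# assuming the snippet's own imports (numpy as np, sklearn.cluster as cluster)
# are in scope and a NumPy version where np.int is still available, since the
# snippet casts the labels with it.
import numpy as np

coords_demo = np.random.rand(1000, 2)    # hypothetical 2-D coordinates
labels = K_means(coords_demo, hyper_params={'n_clusters': 5})
print(np.bincount(labels))               # cluster sizes
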