def test_minibatch_reassign():
    # Run one mini-batch step with a large reassignment_ratio on a fitted
    # model: all the centers should be reassigned, so the model should no
    # longer be good
    for data in (X, X_csr):
        model = MiniBatchKMeans(n_clusters=n_clusters, batch_size=1,
                                random_state=42)
        model.fit(data)
        centers_before = model.cluster_centers_.copy()

        # The squared norms are always taken from the dense X, also when the
        # sparse X_csr is the data being clustered
        squared_norms = (X ** 2).sum(axis=1)
        center_buffer = np.zeros(X.shape[1], np.double)

        # Capture stdout: verbose=True is only there to smoke test the
        # display code
        saved_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            _mini_batch_step(data, squared_norms,
                             model.cluster_centers_, model.counts_,
                             center_buffer, False,
                             random_reassign=True, random_state=42,
                             reassignment_ratio=1, verbose=True)
        finally:
            sys.stdout = saved_stdout

        # Check that all the centers have moved
        squared_shifts = (
            (centers_before - model.cluster_centers_) ** 2).sum(axis=1)
        assert_greater(squared_shifts.min(), .2)
def test_minibatch_reassign():
    sample_weight = np.ones(X.shape[0], dtype=X.dtype)

    def reassign_step(model, data, **step_options):
        # One mini-batch step over the whole of `data` with random
        # reassignment switched on. The model's own cluster_centers_ and
        # counts_ arrays are handed to the step directly.
        _mini_batch_step(data, sample_weight, (X ** 2).sum(axis=1),
                         model.cluster_centers_, model.counts_,
                         np.zeros(X.shape[1], np.double), False,
                         distances=np.zeros(X.shape[0]),
                         random_reassign=True, random_state=42,
                         **step_options)

    # Large reassignment_ratio: as a result all the centers should be
    # reassigned and the model should no longer be good
    for data in (X, X_csr):
        model = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100,
                                random_state=42)
        model.fit(data)
        score_before = model.score(data)

        # Capture stdout: verbose=True is only there to smoke test the
        # display code
        saved_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            reassign_step(model, data, reassignment_ratio=1, verbose=True)
        finally:
            sys.stdout = saved_stdout

        assert_greater(score_before, model.score(data))

    # Give a perfect initialization, with a small reassignment_ratio,
    # no center should be reassigned
    for data in (X, X_csr):
        model = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100,
                                init=centers.copy(), random_state=42,
                                n_init=1)
        model.fit(data)
        # NOTE(review): no copy is taken here, so clusters_before looks like
        # the very same array object as model.cluster_centers_ and the
        # assertion below would compare that array with itself. Confirm
        # whether a snapshot (.copy()) and a looser tolerance were intended.
        clusters_before = model.cluster_centers_
        reassign_step(model, data, reassignment_ratio=1e-15)
        assert_array_almost_equal(clusters_before, model.cluster_centers_)
def test_minibatch_reassign():
    def reassign_step(model, data, **step_options):
        # One mini-batch step over the whole of `data` with random
        # reassignment switched on. The model's own cluster_centers_ and
        # counts_ arrays are handed to the step directly.
        _mini_batch_step(
            data,
            (X ** 2).sum(axis=1),
            model.cluster_centers_,
            model.counts_,
            np.zeros(X.shape[1], np.double),
            False,
            distances=np.zeros(X.shape[0]),
            random_reassign=True,
            random_state=42,
            **step_options
        )

    # Large reassignment_ratio: as a result all the centers should be
    # reassigned and the model should no longer be good
    for data in (X, X_csr):
        model = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100,
                                random_state=42)
        model.fit(data)
        score_before = model.score(data)

        # Capture stdout: verbose=True is only there to smoke test the
        # display code
        saved_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            reassign_step(model, data, reassignment_ratio=1, verbose=True)
        finally:
            sys.stdout = saved_stdout

        assert_greater(score_before, model.score(data))

    # Give a perfect initialization, with a small reassignment_ratio,
    # no center should be reassigned
    for data in (X, X_csr):
        model = MiniBatchKMeans(
            n_clusters=n_clusters, batch_size=100, init=centers.copy(),
            random_state=42, n_init=1
        )
        model.fit(data)
        # NOTE(review): no copy is taken here, so clusters_before looks like
        # the very same array object as model.cluster_centers_ and the
        # assertion below would compare that array with itself. Confirm
        # whether a snapshot (.copy()) and a looser tolerance were intended.
        clusters_before = model.cluster_centers_
        reassign_step(model, data, reassignment_ratio=1e-15)
        assert_array_almost_equal(clusters_before, model.cluster_centers_)
def test_minibatch_reassign():
    def reassign_step(model, data, **step_options):
        # One mini-batch step over the whole of `data` with random
        # reassignment switched on. The model's own cluster_centers_ and
        # counts_ arrays are handed to the step directly.
        # NOTE(review): the distances buffer has n_clusters entries although
        # the full data set is passed as the batch -- confirm the length
        # _mini_batch_step expects.
        _mini_batch_step(data, (X ** 2).sum(axis=1),
                         model.cluster_centers_, model.counts_,
                         np.zeros(X.shape[1], np.double), False,
                         distances=np.zeros(n_clusters),
                         random_reassign=True, random_state=42,
                         **step_options)

    # Large reassignment_ratio: as a result all the centers should be
    # reassigned and the model should no longer be good
    for data in (X, X_csr):
        model = MiniBatchKMeans(n_clusters=n_clusters, batch_size=1,
                                random_state=42)
        model.fit(data)
        centers_before = model.cluster_centers_.copy()

        # Capture stdout: verbose=True is only there to smoke test the
        # display code
        saved_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            reassign_step(model, data, reassignment_ratio=1, verbose=True)
        finally:
            sys.stdout = saved_stdout

        # Check that all the centers have moved
        squared_shifts = (
            (centers_before - model.cluster_centers_) ** 2).sum(axis=1)
        assert_greater(squared_shifts.min(), .2)

    # Give a perfect initialization, with a small reassignment_ratio,
    # no center should be reassigned
    for data in (X, X_csr):
        model = MiniBatchKMeans(n_clusters=n_clusters, batch_size=1,
                                init=centers.copy(), random_state=42,
                                n_init=1)
        model.fit(data)
        # NOTE(review): this snapshot is never compared with anything; the
        # loop only checks that the step runs without raising. Confirm
        # whether an assertion on the centers is missing here.
        centers_before = model.cluster_centers_.copy()
        reassign_step(model, data, reassignment_ratio=1e-15)
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)
    start_centers = centers + rng.normal(size=centers.shape)
    n_centers = start_centers.shape[0]
    n_features = centers.shape[1]
    batch_len = 10

    # independent working state for the dense and the sparse run
    dense_centers = start_centers.copy()
    sparse_centers = start_centers.copy()
    dense_counts = np.zeros(n_centers, dtype=np.int32)
    sparse_counts = np.zeros(n_centers, dtype=np.int32)
    dense_norms = (X ** 2).sum(axis=1)
    sparse_norms = row_norms(X_csr, squared=True)
    dense_buffer = np.zeros(n_features, dtype=np.double)
    sparse_buffer = np.zeros(n_features, dtype=np.double)

    def step_and_check(batch, batch_norms, moving_centers, batch_counts,
                       center_buffer):
        # Run one update on the minibatch and verify its own bookkeeping
        inertia_before, reported_diff = _mini_batch_step(
            batch, batch_norms, moving_centers, batch_counts, center_buffer,
            1, None, random_reassign=False)
        assert_greater(inertia_before, 0.0)

        # the inertia on the same batch must have decreased after the update
        batch_labels, inertia_after = _labels_inertia(
            batch, batch_norms, moving_centers)
        assert_greater(inertia_after, 0.0)
        assert_less(inertia_after, inertia_before)

        # the incrementally computed difference must match the final
        # observed value
        observed_diff = np.sum((moving_centers - start_centers) ** 2)
        assert_almost_equal(reported_diff, observed_diff)
        return batch_labels, inertia_before, inertia_after, reported_diff

    # step 1: dense minibatch update on a small minibatch
    labels, old_inertia, new_inertia, incremental_diff = step_and_check(
        X[:batch_len], dense_norms[:batch_len], dense_centers, dense_counts,
        dense_buffer)

    # step 2: sparse minibatch update on the same rows
    (labels_csr, old_inertia_csr, new_inertia_csr,
     incremental_diff_csr) = step_and_check(
        X_csr[:batch_len], sparse_norms[:batch_len], sparse_centers,
        sparse_counts, sparse_buffer)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(dense_centers, sparse_centers)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)
    start_centers = centers + rng.normal(size=centers.shape)
    n_centers = start_centers.shape[0]
    n_features = centers.shape[1]
    batch_len = 10

    # independent working state for the dense and the sparse run
    dense_centers = start_centers.copy()
    sparse_centers = start_centers.copy()
    dense_weight_sums = np.zeros(n_centers, dtype=np.double)
    sparse_weight_sums = np.zeros(n_centers, dtype=np.double)
    dense_norms = (X ** 2).sum(axis=1)
    sparse_norms = row_norms(X_csr, squared=True)
    dense_buffer = np.zeros(n_features, dtype=np.double)
    sparse_buffer = np.zeros(n_features, dtype=np.double)

    # extract a small minibatch; both runs share the same unit weights
    dense_batch = X[:batch_len]
    sparse_batch = X_csr[:batch_len]
    batch_weights = np.ones(dense_batch.shape[0], dtype=np.double)

    def step_and_check(batch, batch_norms, moving_centers, weight_sums,
                       center_buffer):
        # Run one update on the minibatch and verify its own bookkeeping
        inertia_before, reported_diff = _mini_batch_step(
            batch, batch_weights, batch_norms, moving_centers, weight_sums,
            center_buffer, 1, None, random_reassign=False)
        assert_greater(inertia_before, 0.0)

        # the inertia on the same batch must have decreased after the update
        batch_labels, inertia_after = _labels_inertia(
            batch, batch_weights, batch_norms, moving_centers)
        assert_greater(inertia_after, 0.0)
        assert_less(inertia_after, inertia_before)

        # the incrementally computed difference must match the final
        # observed value
        observed_diff = np.sum((moving_centers - start_centers) ** 2)
        assert_almost_equal(reported_diff, observed_diff)
        return batch_labels, inertia_before, inertia_after, reported_diff

    # step 1: dense minibatch update
    labels, old_inertia, new_inertia, incremental_diff = step_and_check(
        dense_batch, dense_norms[:batch_len], dense_centers,
        dense_weight_sums, dense_buffer)

    # step 2: sparse minibatch update
    (labels_csr, old_inertia_csr, new_inertia_csr,
     incremental_diff_csr) = step_and_check(
        sparse_batch, sparse_norms[:batch_len], sparse_centers,
        sparse_weight_sums, sparse_buffer)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(dense_centers, sparse_centers)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
def test_minibatch_update_consistency():
    """Check that dense and sparse minibatch update give the same results"""
    rng = np.random.RandomState(42)
    start_centers = centers + rng.normal(size=centers.shape)
    n_centers = start_centers.shape[0]
    n_features = centers.shape[1]
    batch_len = 10

    # independent working state for the dense and the sparse run
    dense_centers = start_centers.copy()
    sparse_centers = start_centers.copy()
    dense_counts = np.zeros(n_centers, dtype=np.int32)
    sparse_counts = np.zeros(n_centers, dtype=np.int32)
    dense_norms = (X ** 2).sum(axis=1)
    sparse_norms = csr_row_norm_l2(X_csr, squared=True)
    dense_buffer = np.zeros(n_features, dtype=np.double)
    sparse_buffer = np.zeros(n_features, dtype=np.double)

    def step_and_check(batch, batch_norms, moving_centers, batch_counts,
                       center_buffer):
        # Run one update on the minibatch and verify its own bookkeeping
        inertia_before, reported_diff = _mini_batch_step(
            batch, batch_norms, moving_centers, batch_counts, center_buffer,
            1)
        assert_true(inertia_before > 0.0)

        # the inertia on the same batch must have decreased after the update
        batch_labels, inertia_after = _labels_inertia(
            batch, batch_norms, moving_centers)
        assert_true(inertia_after > 0.0)
        assert_true(inertia_after < inertia_before)

        # the incrementally computed difference must match the final
        # observed value
        observed_diff = np.sum((moving_centers - start_centers) ** 2)
        assert_almost_equal(reported_diff, observed_diff)
        return batch_labels, inertia_before, inertia_after, reported_diff

    # step 1: dense minibatch update on a small minibatch
    labels, old_inertia, new_inertia, incremental_diff = step_and_check(
        X[:batch_len], dense_norms[:batch_len], dense_centers, dense_counts,
        dense_buffer)

    # step 2: sparse minibatch update on the same rows
    (labels_csr, old_inertia_csr, new_inertia_csr,
     incremental_diff_csr) = step_and_check(
        X_csr[:batch_len], sparse_norms[:batch_len], sparse_centers,
        sparse_counts, sparse_buffer)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(dense_centers, sparse_centers)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
def fit(self, X, y=None):
    """Compute the centroids on X by chunking it into mini-batches.

    Several candidate initialisations are scored on a common random
    validation subset, the best one is kept, and the centers are then
    refined one randomly drawn mini-batch at a time until the convergence
    check asks to stop or the iteration budget is used up.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster.

    y : Ignored

    Returns
    -------
    self
        The estimator, with ``cluster_centers_``, ``counts_``,
        ``init_size_`` and ``n_iter_`` set, plus ``labels_`` and
        ``inertia_`` when ``compute_labels`` is true.

    Raises
    ------
    ValueError
        If X has fewer samples than ``n_clusters``.
    """
    random_state = check_random_state(self.random_state)
    X = check_array(X, accept_sparse="csr", order='C',
                    dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape
    if n_samples < self.n_clusters:
        raise ValueError("Number of samples smaller than number "
                         "of clusters.")

    n_init = self.n_init
    if hasattr(self.init, '__array__'):
        # explicit centers were given: normalise them to X's dtype (this
        # overwrites self.init) and fall back to a single init
        self.init = np.ascontiguousarray(self.init, dtype=X.dtype)
        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in MiniBatchKMeans instead of '
                'n_init=%d'
                % self.n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    x_squared_norms = k_means_.row_norms(X, squared=True)

    if self.tol > 0.0:
        tol = k_means_._tolerance(X, self.tol)

        # using tol-based early stopping needs the allocation of a
        # dedicated buffer which can be expensive for high dim data:
        # hence we allocate it outside of the main loop
        old_center_buffer = np.zeros(n_features, dtype=X.dtype)
    else:
        tol = 0.0
        # no need for the center buffer if tol-based early stopping is
        # disabled
        old_center_buffer = np.zeros(0, dtype=X.dtype)

    # per-sample buffer reused by every step of the main loop
    distances = np.zeros(self.batch_size, dtype=X.dtype)
    n_batches = int(np.ceil(float(n_samples) / self.batch_size))
    n_iter = int(self.max_iter * n_batches)

    # size of the validation subset: defaults to three batches, capped at
    # the number of samples
    init_size = self.init_size
    if init_size is None:
        init_size = 3 * self.batch_size
    if init_size > n_samples:
        init_size = n_samples
    self.init_size_ = init_size

    # indices are drawn with randint, so a sample may appear more than once
    validation_indices = random_state.randint(0, n_samples, init_size)
    X_valid = X[validation_indices]
    x_squared_norms_valid = x_squared_norms[validation_indices]

    # perform several inits with random sub-sets
    best_inertia = None
    for init_idx in range(n_init):
        if self.verbose:
            print("Init %d/%d with method: %s"
                  % (init_idx + 1, n_init, self.init))
        # fresh counts for every candidate initialisation
        counts = np.zeros(self.n_clusters, dtype=np.int32)

        # TODO: once the `k_means` function works with sparse input we
        # should refactor the following init to use it instead.

        # Initialize the centers using only a fraction of the data as we
        # expect n_samples to be very large when using MiniBatchKMeans
        cluster_centers = k_means_._init_centroids(
            X, self.n_clusters, self.init,
            random_state=random_state,
            x_squared_norms=x_squared_norms,
            init_size=init_size)

        # Run one mini-batch step on the validation subset for this
        # candidate; its return values are not used in this loop
        batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
            X_valid, x_squared_norms[validation_indices],
            cluster_centers, counts, old_center_buffer, False,
            distances=None, verbose=self.verbose)

        # Keep only the best cluster centers across independent inits on
        # the common validation set
        _, inertia = k_means_._labels_inertia(X_valid,
                                              x_squared_norms_valid,
                                              cluster_centers)
        if self.verbose:
            print("Inertia for init %d/%d: %f"
                  % (init_idx + 1, n_init, inertia))
        if best_inertia is None or inertia < best_inertia:
            self.cluster_centers_ = cluster_centers
            self.counts_ = counts
            best_inertia = inertia

    # Empty context to be used inplace by the convergence check routine
    convergence_context = {}

    # Perform the iterative optimization until the final convergence
    # criterion
    for iteration_idx in range(n_iter):
        # Sample a minibatch from the full dataset
        minibatch_indices = random_state.randint(
            0, n_samples, self.batch_size)

        # Perform the actual update step on the minibatch data
        batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
            X[minibatch_indices], x_squared_norms[minibatch_indices],
            self.cluster_centers_, self.counts_,
            old_center_buffer, tol > 0.0, distances=distances,
            # Here we randomly choose whether to perform
            # random reassignment: the choice is done as a function
            # of the iteration index, and the minimum number of
            # counts, in order to force this reassignment to happen
            # every once in a while
            random_reassign=((iteration_idx + 1)
                             % (10 + self.counts_.min()) == 0),
            random_state=random_state,
            reassignment_ratio=self.reassignment_ratio,
            verbose=self.verbose)

        # Monitor convergence and do early stopping if necessary
        if k_means_._mini_batch_convergence(
                self, iteration_idx, n_iter, tol, n_samples,
                centers_squared_diff, batch_inertia, convergence_context,
                verbose=self.verbose):
            break

    # NOTE(review): if n_iter is 0 (e.g. max_iter == 0) the loop above never
    # binds iteration_idx and this line would raise -- confirm max_iter is
    # validated upstream.
    self.n_iter_ = iteration_idx + 1

    if self.compute_labels:
        self.labels_, self.inertia_ = self._labels_inertia_minibatch(X)

    return self
def mbkmean(self, options, n_clusters, n_init, batch_size, n_iter, n_samples,
            labels_true, k_means, X):
    """Cluster X with MiniBatchKMeans in the mode selected by ``options``.

    ``options[2]`` picks the mode and ``options[3]`` refines the ``'-o'``
    mode:

    * ``'-s'``  : manual centroid initialisation, then rounds of
      ``partial_fit`` with a display thread; after each round the next
      iteration count is requested through ``self.input_num``.
    * ``'-pp'`` : ``n_iter - 1`` ``partial_fit`` calls, updating the display
      thread after each one.
    * ``'-p'``  : ``n_iter`` ``partial_fit`` calls, no display.
    * ``'-n'``  : a single ``fit`` on the whole of X.
    * ``None``  : ``n_iter - 1`` ``partial_fit`` calls on X with explicit
      ``minibatch_indices``.
    * ``'-o'``  : ``partial_fit`` loop with a convergence check, with
      (``options[3] == '-pp'``) or without (``'-p'``) the display thread.

    Parameters
    ----------
    options : sequence
        Option flags; only indices 2 and 3 are read here.
    n_clusters : int
        Number of clusters for the estimator.
    n_init : int
        Number of candidate initialisations tried in the ``'-s'`` and
        ``'-o' '-pp'`` modes. The estimator itself is always built with
        ``n_init=10``.
    batch_size : int
        Size of each sampled mini-batch.
    n_iter : int
        Number of iterations; also passed as ``max_no_improvement``. The
        ``'-o'`` mode overrides it with ``100 * n_batches``.
    n_samples : int
        Exclusive upper bound for the sampled row indices (presumably
        ``X.shape[0]`` -- verify against the caller).
    labels_true, k_means
        Forwarded unchanged to ``afficheur``.
    X : array
        Data to cluster.

    Returns
    -------
    (mbk, t_mini_batch) or None
        The estimator and the elapsed time measured with ``time.time()``.
        Nothing is returned explicitly when no mode matches, when the
        ``'-s'`` loop is left because the input is not an int, or when an
        ``'-o'`` loop ends without the convergence check firing.
    """
    # NOTE(review): every mode below is wrapped in ``try/except IndexError:
    # pass``. That covers a short ``options`` sequence, but it also swallows
    # any IndexError raised inside the branch body (e.g. while indexing X).
    # NOTE(review): the nesting of this function was reconstructed from a
    # whitespace-collapsed source; the placement of the ``print`` calls and
    # of the statements following the inner ``for`` loops should be
    # confirmed.

    #to do with online MBK_mean
    #Compute clustering with MiniBatchKMeans
    mbk = cluster.MiniBatchKMeans(init=self.init, n_clusters=n_clusters,
                                  batch_size=batch_size, n_init=10,
                                  max_no_improvement=n_iter, verbose=0)
    #INIT THREADs
    # the display thread is only created up front for the '-pp' variants
    try:
        if options[2] == '-pp' or options[3] == '-pp':
            thread_1 = afficheur('starting threads', labels_true, mbk,
                                 k_means, X, n_clusters)
            thread_1.start()
    except IndexError:
        pass
    try:
        if options[2] == '-s':
            #init state
            # n_batches, max_iter and tol are assigned but not read again in
            # this branch
            n_batches = int(np.ceil(float(n_samples) / batch_size))
            max_iter = 100
            tol = 0
            _, n_features = X.shape
            old_center_buffer = np.zeros(n_features, dtype=X.dtype)
            random_state = check_random_state(None)
            # validation subset: three batches, capped at n_samples
            init_size = 3 * batch_size
            if init_size > n_samples:
                init_size = n_samples
            validation_indices = random_state.randint(
                0, n_samples, init_size)
            X_valid = X[validation_indices]
            x_squared_norms = row_norms(X, squared=True)
            x_squared_norms_valid = x_squared_norms[validation_indices]
            # a single counts array is shared by all candidate inits
            counts = np.zeros(n_clusters, dtype=np.int32)
            best_inertia = None
            cluster_centers = None
            # keep the candidate centers with the lowest inertia on the
            # validation subset
            for init_idx in range(n_init):
                cluster_centers = cluster._init_centroids(
                    X, n_clusters, self.init,
                    random_state=random_state,
                    x_squared_norms=x_squared_norms,
                    init_size=init_size)
                batch_inertia, centers_squared_diff = cluster._mini_batch_step(
                    X_valid, x_squared_norms[validation_indices],
                    cluster_centers, counts, old_center_buffer, False,
                    distances=None, verbose=False)
                _, inertia = cluster._labels_inertia(
                    X_valid, x_squared_norms_valid, cluster_centers)
                if best_inertia is None or inertia < best_inertia:
                    mbk.cluster_centers_ = cluster_centers
                    mbk.counts_ = counts
                    best_inertia = inertia
            print('best inertia %d' % best_inertia)
            # interactive rounds: a new display thread per round
            while (True):
                thread_1 = afficheur('starting threads', labels_true, mbk,
                                     k_means, X, n_clusters)
                thread_1.start()
                t0 = time.time()
                for iteration_idx in range(n_iter):
                    minibatch_indices = random_state.randint(
                        0, n_samples, batch_size)
                    mbk = mbk.partial_fit(X[minibatch_indices])
                    thread_1.update(mbk)
                t_mini_batch = time.time() - t0
                thread_1.stop()
                thread_1.join()
                # ask for the size of the next round; "stop" ends the session
                n_iter = self.input_num("Iterations suivante : ")
                if n_iter == "stop":
                    return mbk, t_mini_batch
                    # NOTE(review): unreachable after the return above
                    break
                if isinstance(n_iter, int) == False:
                    print('error integer is required !!! type %s'
                          % type(n_iter))
                    break
    except IndexError:
        pass
    try:
        if options[2] == '-pp':
            random_state = check_random_state(None)
            t0 = time.time()
            # Sample a minibatch from the full dataset
            # NOTE(review): runs n_iter - 1 iterations, unlike the '-p' mode
            for iteration_idx in range(n_iter - 1):
                minibatch_indices = random_state.randint(
                    0, n_samples, batch_size)
                mbk = mbk.partial_fit(X[minibatch_indices])
                thread_1.update(mbk)
            t_mini_batch = time.time() - t0
            thread_1.stop()
            thread_1.join()
            return mbk, t_mini_batch
    except IndexError:
        pass
    try:
        if options[2] == '-p':
            random_state = check_random_state(None)
            t0 = time.time()
            for iteration_idx in range(n_iter):
                minibatch_indices = random_state.randint(
                    0, n_samples, batch_size)
                mbk = mbk.partial_fit(X[minibatch_indices])
            t_mini_batch = time.time() - t0
            return mbk, t_mini_batch
    except IndexError:
        pass
    try:
        if options[2] == '-n':
            t0 = time.time()
            mbk = mbk.fit(X)
            t_mini_batch = time.time() - t0
            return mbk, t_mini_batch
    except IndexError:
        pass
    try:
        # NOTE(review): this only matches when options really holds None at
        # index 2; a shorter options sequence raises IndexError and skips the
        # branch. ``== None`` is used rather than ``is None``.
        if options[2] == None:
            random_state = check_random_state(None)
            # Sample a minibatch from the full dataset
            t0 = time.time()
            for iteration_idx in range(n_iter - 1):
                # NOTE(review): uses self.batch_size, not the batch_size
                # argument, and passes minibatch_indices to partial_fit as a
                # keyword -- confirm the estimator in use accepts it
                minibatch_indices = random_state.randint(
                    0, n_samples, self.batch_size)
                mbk = mbk.partial_fit(X, minibatch_indices=minibatch_indices)
            t_mini_batch = time.time() - t0
            return mbk, t_mini_batch
    except IndexError:
        pass
    try:
        if options[2] == '-o':
            # the n_iter argument is replaced by 100 passes over the data
            n_batches = int(np.ceil(float(n_samples) / batch_size))
            max_iter = 100
            n_iter = int(max_iter * n_batches)
            tol = 0
            _, n_features = X.shape
            old_center_buffer = np.zeros(n_features, dtype=X.dtype)
            try:
                # print('self.max_iter %d , n_batches %d '%(n_iter,n_batches))
                if options[3] == '-pp':
                    #init state
                    random_state = check_random_state(None)
                    init_size = 3 * batch_size
                    if init_size > n_samples:
                        init_size = n_samples
                    validation_indices = random_state.randint(
                        0, n_samples, init_size)
                    X_valid = X[validation_indices]
                    x_squared_norms = row_norms(X, squared=True)
                    x_squared_norms_valid = x_squared_norms[
                        validation_indices]
                    counts = np.zeros(n_clusters, dtype=np.int32)
                    best_inertia = None
                    cluster_centers = None
                    #Random init with minimum inertia
                    for init_idx in range(n_init):
                        cluster_centers = cluster._init_centroids(
                            X, n_clusters, self.init,
                            random_state=random_state,
                            x_squared_norms=x_squared_norms,
                            init_size=init_size)
                        batch_inertia, centers_squared_diff = cluster._mini_batch_step(
                            X_valid, x_squared_norms[validation_indices],
                            cluster_centers, counts, old_center_buffer,
                            False, distances=None, verbose=False)
                        _, inertia = cluster._labels_inertia(
                            X_valid, x_squared_norms_valid, cluster_centers)
                        if best_inertia is None or inertia < best_inertia:
                            mbk.cluster_centers_ = cluster_centers
                            mbk.counts_ = counts
                            best_inertia = inertia
                    print('best inertia %d' % best_inertia)
                    convergence_context = {}
                    # seed the attributes read by the convergence check with
                    # the values from the last init step
                    mbk.batch_inertia = batch_inertia
                    mbk.centers_squared_diff = centers_squared_diff
                    t0 = time.time()
                    for iteration_idx in range(n_iter):
                        minibatch_indices = random_state.randint(
                            0, n_samples, batch_size)
                        mbk = mbk.partial_fit(X[minibatch_indices])
                        # tol is fed back into self._tolerance on every
                        # iteration
                        tol = self._tolerance(X, tol)
                        thread_1.update(mbk)
                        # Monitor convergence and do early stopping if necessary
                        if cluster._mini_batch_convergence(
                                mbk, iteration_idx, n_iter, tol, n_samples,
                                mbk.centers_squared_diff, mbk.batch_inertia,
                                convergence_context, verbose=mbk.verbose):
                            t_mini_batch = time.time() - t0
                            thread_1.stop()
                            thread_1.join()
                            return mbk, t_mini_batch
                            # NOTE(review): unreachable after the return above
                            break
                elif options[3] == '-p':
                    random_state = check_random_state(None)
                    convergence_context = {}
                    t0 = time.time()
                    for iteration_idx in range(n_iter):
                        minibatch_indices = random_state.randint(
                            0, n_samples, batch_size)
                        mbk = mbk.partial_fit(X[minibatch_indices])
                        tol = self._tolerance(X, tol)
                        # Monitor convergence and do early stopping if necessary
                        # NOTE(review): this branch reads
                        # mbk.centers_squared_diff and mbk.batch_inertia
                        # without assigning them here -- confirm partial_fit
                        # sets them on the estimator in use
                        if cluster._mini_batch_convergence(
                                mbk, iteration_idx, n_iter, tol, n_samples,
                                mbk.centers_squared_diff, mbk.batch_inertia,
                                convergence_context, verbose=False):
                            t_mini_batch = time.time() - t0
                            return mbk, t_mini_batch
                            # NOTE(review): unreachable after the return above
                            break
            except IndexError:
                pass
    except IndexError:
        pass