def test_2d_cdist(metric, kw, seed, u_shape, u_chunks, v_shape, v_chunks):
    np.random.seed(seed)

    a_u = 2 * np.random.random(u_shape) - 1
    a_v = 2 * np.random.random(v_shape) - 1

    d_u = da.from_array(a_u, chunks=u_chunks)
    d_v = da.from_array(a_v, chunks=v_chunks)

    if metric == "mahalanobis":
        if "VI" not in kw:
            kw["VI"] = 2 * np.random.random(2 * u_shape[-1:]) - 1
        elif kw["VI"] is None:
            kw.pop("VI")
    elif metric == "seuclidean":
        if "V" not in kw:
            kw["V"] = 2 * np.random.random(u_shape[-1:]) - 1
        elif kw["V"] is None:
            kw.pop("V")
    elif metric == "wminkowski":
        kw["w"] = np.random.random(u_shape[-1:])

    a_r = spdist.cdist(a_u, a_v, metric, **kw)
    d_r = dask_distance.cdist(d_u, d_v, metric, **kw)

    assert d_r.shape == a_r.shape
    assert np.allclose(np.array(d_r)[()], a_r, equal_nan=True)
def test_2d_bool_cdist(metric, seed, u_shape, u_chunks, v_shape, v_chunks):
    np.random.seed(seed)

    a_u = np.random.randint(0, 2, u_shape, dtype=bool)
    a_v = np.random.randint(0, 2, v_shape, dtype=bool)

    d_u = da.from_array(a_u, chunks=u_chunks)
    d_v = da.from_array(a_v, chunks=v_chunks)

    a_r = spdist.cdist(a_u, a_v, metric)
    d_r = dask_distance.cdist(d_u, d_v, metric)

    assert d_r.shape == a_r.shape
    assert np.allclose(np.array(d_r)[()], a_r, equal_nan=True)
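# Illustrative sketch (assumption): the two tests above take their arguments
# from a pytest parametrization layer that is not shown here. The grid below
# is hypothetical, not the project's actual parameter set.
import pytest


@pytest.mark.parametrize("seed", [0, 137])
@pytest.mark.parametrize("metric", ["dice", "jaccard", "hamming"])
@pytest.mark.parametrize(
    "u_shape, u_chunks, v_shape, v_chunks",
    [((10, 4), (5, 4), (8, 4), (4, 4))],
)
def test_2d_bool_cdist_example(metric, seed, u_shape, u_chunks, v_shape, v_chunks):
    # Reuses the boolean cdist test above with concrete parameters.
    test_2d_bool_cdist(metric, seed, u_shape, u_chunks, v_shape, v_chunks)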
def _looping_solution_ids(self, X, idx_ref, dist_func, d_nearest, n_used_ref,
                          mu_x, sigma_x, j, i):
    """Iterate over the different solution_ids, updating the running
    statistics for candidate ``j``.

    Args:
        X (np.ndarray): The dataset.
        idx_ref (np.ndarray): The random indices to be tested.
        dist_func (callable): The distance function.
        d_nearest (np.ndarray): The nearest points to the centers.
        n_used_ref (int): The number of reference points used so far.
        mu_x (np.ndarray): The running mean.
        sigma_x (np.ndarray): The confidence interval.
        j (int): The solution id currently being evaluated.
        i (int): The index of the center currently being searched for.

    Returns:
        sigma_x[j] (float): The updated confidence interval for candidate j.
        mu_x[j] (float): The updated running mean for candidate j.
    """
    if isinstance(X, da.Array):
        d = dask_distance.cdist(X[idx_ref, :], X[j, :].reshape(1, -1),
                                metric=dist_func).squeeze()
        d = d.compute()
    else:
        d = cdist(X[idx_ref, :], X[j, :].reshape(1, -1),
                  metric=dist_func).squeeze()

    if i == 0:
        td = d.sum()
        var = sigma_x[j] ** 2 * n_used_ref
        n_used_ref, mu_x[j], var = self._update(n_used_ref, mu_x[j], var, d)
        var, var_sample = self._finalize(n_used_ref, var)
        sigma_x[j] = np.sqrt(var)
    else:
        tmp_delta = d - d_nearest[idx_ref]
        g = np.where(tmp_delta > 0, 0, tmp_delta)
        td = np.sum(g)
        mu_x[j] = ((n_used_ref * mu_x[j]) + td) / (n_used_ref + self.batchsize)
        sigma_x[j] = np.std(g)

    return sigma_x[j], mu_x[j]
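# Illustrative sketch (assumption): the else-branch above folds each batch of
# distance gains into a streaming mean, mu_new = (n * mu + sum(batch)) / (n + b).
# The helper below is hypothetical and only demonstrates that update rule.
def _running_mean_example():
    import numpy as np

    def running_mean_update(mu_prev, n_used, batch_values):
        batch_values = np.asarray(batch_values, dtype=float)
        return (n_used * mu_prev + batch_values.sum()) / (n_used + batch_values.size)

    # Mean of [1, 2, 3, 4] built from two batches of size 2.
    mu = running_mean_update(0.0, 0, [1, 2])   # 1.5 after the first batch
    mu = running_mean_update(mu, 2, [3, 4])    # 2.5 after the second batch
    assert np.isclose(mu, 2.5)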
def __call__(self, data: Data, centroids: Centroids) -> IntLabels:
    """Find closest centroids

    @param data: observations in rows
    @param centroids: centroids in rows
    @return: vector of labels of centroids closest to points
    """
    if data.shape[1] != centroids.shape[1]:
        msg = ("Dimensionality of data and centroids must be equal. "
               + f"Was {data.shape[1]} and {centroids.shape[1]}")
        logging.error(msg)
        raise ValueError(msg)

    if self.allow_dask and (data.shape[0] > 10000 or data.shape[1] > 1000):
        X1 = da.from_array(data)
        X2 = da.from_array(centroids)
        distances = ddst.cdist(X1, X2, self.distance_metric)
        labels = da.argmin(distances, axis=1).compute()
    else:
        distances = dst.cdist(data, centroids, self.distance_metric)
        labels = np.argmin(distances, axis=1)

    return labels
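# Illustrative sketch (assumption): the same nearest-centroid labelling path
# written against dask_distance.cdist directly; shapes and chunking are
# hypothetical, not tied to the class above.
def _labelling_example():
    import numpy as np
    import dask.array as da
    import dask_distance

    data = np.random.random((1000, 5))      # 1000 observations, 5 features
    centroids = np.random.random((3, 5))    # 3 centroids in the same space

    X1 = da.from_array(data, chunks=(250, 5))
    X2 = da.from_array(centroids)

    # Distance from every observation to every centroid, nearest centroid per row.
    distances = dask_distance.cdist(X1, X2, "euclidean")
    labels = da.argmin(distances, axis=1).compute()
    assert labels.shape == (1000,)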
def maxmin(nsamples, NPS, var_importance):  # pylint: disable=unused-argument
    # This function returns the most diverse set of parameters weighted with variable importance.
    selected_indices = []
    selected_set = []

    if len(selected_set) > nsamples:
        print("Already selected set, no need for MaxMin!")
        return selected_set, selected_indices

    if len(selected_set) == 0:
        selected_set = [NPS[0, :]]
        selected_indices.append(0)

    prtime_start = time.time()
    test_time = [time.time()]

    while len(selected_set) < nsamples:
        # selected_set_array = np.array(selected_set)
        last_entry = selected_set[-1].reshape(1, -1)
        dtemp = dask_distance.cdist(last_entry, NPS, metric='euclidean')
        d = np.asarray(dtemp)

        if len(selected_set) > 1:
            dmat = np.vstack([dmat, d])
        else:
            dmat = d

        new_ind = np.argmax(np.amin(dmat, axis=0))
        print(new_ind)
        selected_indices.append(new_ind)
        selected_set.append(NPS[new_ind, :])

        if len(selected_set) % 10 == 0:
            print("\n\n found landmark %i out of %i" % (len(selected_set), nsamples))
            print("size of the space is :", np.amax(dmat),
                  " and size of the Voronoi cell is: ",
                  np.amax(np.amin(dmat, axis=0)))
            test_time.append(time.time())
            print("CPU clock time to find the past 10 landmarks:",
                  test_time[-1] - test_time[-2])

    print("Total execution time of MaxMin: ", time.time() - prtime_start)
    return selected_set, selected_indices
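# Illustrative sketch (assumption): calling maxmin on a plain NumPy parameter
# matrix; the sizes are hypothetical, and var_importance is currently unused
# by the function above.
def _maxmin_example():
    import numpy as np

    NPS = np.random.random((500, 8))   # 500 candidate parameter sets, 8 dims
    var_importance = np.ones(8)        # ignored by maxmin for now

    # Pick the 20 most mutually distant parameter sets (farthest-point sampling).
    landmarks, landmark_indices = maxmin(20, NPS, var_importance)
    assert len(landmarks) == 20 and len(landmark_indices) == 20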
def _swap_bandit(self, X, centers, dist_func, max_iter, tol, verbose):
    """BANDIT SWAP - improve medoids after initialization

    Recast as a stochastic estimation problem
    Run time O(nlogn)
    https://arxiv.org/pdf/2006.06856.pdf

    Args:
        X (np.ndarray): The dataset.
        centers (np.ndarray): The center medoids of the different clusters.
        dist_func (callable): The distance function.
        max_iter (int): Max number of times to check for a better medoid.
        tol (float): Tolerance denoting minimal acceptable amount of
            improvement, controls early stopping.
        verbose (bool): Whether or not to print out updates.

    Returns:
        centers (np.ndarray): The updated center medoids.
    """
    done = False
    n_samples = X.shape[0]
    n_clusters = len(centers)
    current_iteration = 1
    Tih_min = float("inf")
    delta = 1.0 / (1e3 * n_samples)  # p 5 'Algorithmic details'

    while not done and (current_iteration < max_iter):
        # initialize mu and sigma
        mu_x = np.zeros((n_samples, n_clusters))
        sigma_x = np.zeros((n_samples, n_clusters))

        done = True  # let's be optimistic we won't find a swap

        if isinstance(X, da.Array):
            d = dask_distance.cdist(X, X[centers, :], metric=dist_func)
            d = d.compute()
        else:
            d = cdist(X, X[centers, :], metric=dist_func)

        # cache nearest (D) and second nearest (E) distances to medoids
        tmp = np.partition(d, 1)
        D = tmp[:, 0]
        E = tmp[:, 1]

        unselected_ids = np.arange(n_samples)
        unselected_ids = np.delete(unselected_ids, centers)

        # this needs to be the product of k x unselected_ids
        swap_pairs = np.array(
            list(product(unselected_ids, range(n_clusters))), dtype="int")

        n_used_ref = 0
        while (n_used_ref < n_samples) and (swap_pairs.shape[0] > 1):
            # sample a batch from S_ref (for init, S_ref = X)
            idx_ref = np.random.choice(unselected_ids,
                                       size=self.batchsize,
                                       replace=True)

            ci_scale = math.sqrt(
                (2 * math.log(1.0 / delta)) / (n_used_ref + self.batchsize))

            # This updates the running mean and confidence interval for each
            # tuple in swap_pairs.
            np.apply_along_axis(
                lambda a_swap: self._swap_pairs(
                    X,
                    d,
                    a_swap,
                    dist_func,
                    sorted(idx_ref),
                    n_used_ref,
                    mu_x,
                    sigma_x,
                    D,
                    E,
                    Tih_min,
                    "h",
                ),
                1,
                swap_pairs,
            )

            # downselect mu and sigma to match candidate pairs
            flat_indices = np.ravel_multi_index(
                (swap_pairs[:, 0], swap_pairs[:, 1]), (n_samples, n_clusters))
            tmp_mu = mu_x.flatten()[flat_indices]
            tmp_sigma = sigma_x.flatten()[flat_indices]
            C_x = ci_scale * tmp_sigma

            # Remove pts that cannot be a solution - in terms of potential reward
            ucb = tmp_mu + C_x
            idx = np.argmin(ucb)
            ucb_best = ucb.min()

            # check if LCB of target is <= UCB of current best
            lcb_target = tmp_mu - C_x
            tmp_ids = np.where(lcb_target <= ucb_best)[0]
            swap_pairs = swap_pairs[tmp_ids]
            print("\tremaining candidates - ", tmp_ids.shape[0])  # , tmp_ids

            n_used_ref = n_used_ref + self.batchsize

        #
        # with reduced number of candidates - run PAM swap
        # TODO - unify full swaps - like was done with search_singles
        #
        print(
            f"Entering swap with {swap_pairs.shape[0]} candidates...pts used = {n_used_ref}"
        )
        done = True  # let's be optimistic we won't find a swap

        # Checking to see if there are better center points
        Tih = np.apply_along_axis(
            lambda a_swap: self._swap_pairs(
                np.array(X),
                d,
                a_swap,
                dist_func,
                sorted(idx_ref),
                n_used_ref,
                mu_x,
                sigma_x,
                D,
                E,
                Tih_min,
                "i",
            ),
            1,
            swap_pairs,
        )
        idx = np.argmin(Tih)
        Tih_min = Tih[idx]
        h_swap = swap_pairs[idx][0]
        i_swap = swap_pairs[idx][1]

        if Tih_min < 0 and abs(Tih_min) > tol:
            if verbose:
                print("\tSwapped - ", centers[i_swap], h_swap, Tih_min)
            done = False  # sorry we found a swap
            centers[i_swap] = h_swap
            print("Centers after swap - ", centers)
        else:
            # our best swap would degrade the clustering (min Tih > 0)
            done = True
            print("\tNO Swap - ", i_swap, h_swap, Tih_min)

        current_iteration = current_iteration + 1

    return centers
def _swap_pairs(
    self,
    X,
    d,
    a_swap,
    dist_func,
    idx_ref,
    n_used_ref,
    mu_x,
    sigma_x,
    D,
    E,
    Tih_min,
    h_i,
):
    """Checking to see if there are any better center points.

    Args:
        X (np.ndarray): The dataset.
        d (np.ndarray): The distance matrix.
        a_swap (tuple): Candidate swap as a combination of dataset index and
            cluster index, e.g. [[0,0],[0,1],[0,2],[1,0]...]
        dist_func (callable): The distance function.
        idx_ref (np.ndarray): The random indices to be tested.
        n_used_ref (int): Number of used reference points.
        mu_x (np.ndarray): The running mean.
        sigma_x (np.ndarray): The confidence interval.
        D (np.ndarray): Nearest distance to a medoid.
        E (np.ndarray): Second nearest distance to a medoid.
        Tih_min (float): The sum of values of the best medoid.
        h_i (str): Whether to update the running mean and confidence
            interval ("h") or to compute the swap gain of the candidate ("i").

    Returns:
        mu_x (np.ndarray): The running mean (when h_i == "h").
        sigma_x (np.ndarray): The confidence interval (when h_i == "h").
        Tih (float): The swap gain of the candidate medoid (when h_i == "i").
    """
    h = a_swap[0]
    i = a_swap[1]

    d_ji = d[:, i]

    if h_i == "h":
        if isinstance(X, da.Array):
            d_jh = dask_distance.cdist(X[idx_ref, :],
                                       X[h, :].reshape(1, -1),
                                       metric=dist_func).squeeze()
            d_jh = d_jh.compute()
        else:
            d_jh = cdist(X[idx_ref, :], X[h, :].reshape(1, -1),
                         metric=dist_func).squeeze()

        K_jih = np.zeros(self.batchsize)

        diff_ji = d_ji[idx_ref] - D[idx_ref]
        idx = np.where(diff_ji > 0)
        diff_jh = d_jh - D[idx_ref]
        K_jih[idx] = np.minimum(diff_jh[idx], 0)

        idx = np.where(diff_ji == 0)
        K_jih[idx] = np.minimum(d_jh[idx], E[idx]) - D[idx]

        # base-line update of mu and sigma
        mu_x[h, i] = ((n_used_ref * mu_x[h, i]) +
                      np.sum(K_jih)) / (n_used_ref + self.batchsize)
        sigma_x[h, i] = np.std(K_jih)

        return mu_x, sigma_x

    if h_i == "i":
        if isinstance(X, da.Array):
            d_jh = dask_distance.cdist(X, X[h, :].reshape(1, -1),
                                       metric=dist_func).squeeze()
            d_jh = d_jh.compute()
        else:
            d_jh = cdist(X, X[h, :].reshape(1, -1),
                         metric=dist_func).squeeze()

        # calculate K_jih
        K_jih = np.zeros_like(D)

        # if d_ji > Dj: Kjih = min(d(j, h) - Dj, 0)
        diff_ji = d_ji - D
        idx = np.where(diff_ji > 0)
        diff_jh = d_jh - D
        K_jih[idx] = np.minimum(diff_jh[idx], 0)

        # if d_ji == Dj: Kjih = min(d(j, h), Ej) - Dj
        idx = np.where(diff_ji == 0)
        K_jih[idx] = np.minimum(d_jh[idx], E[idx]) - D[idx]

        Tih = np.sum(K_jih)
        return Tih
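# Illustrative sketch (assumption): a tiny worked example of the swap-gain
# accumulation in the h_i == "i" branch above, with hand-picked distances.
def _swap_gain_example():
    import numpy as np

    d_ji = np.array([2.0, 1.0, 3.0, 1.5, 0.5])   # distance to medoid i
    d_jh = np.array([1.0, 2.0, 0.5, 3.0, 0.4])   # distance to swap candidate h
    D = np.array([1.0, 1.0, 1.5, 1.5, 0.5])      # nearest-medoid distance
    E = np.array([2.0, 2.5, 3.0, 2.0, 1.0])      # second-nearest distance

    K_jih = np.zeros_like(D)

    # Points not served by medoid i (d_ji > D): improve only if h is closer than D.
    idx = np.where(d_ji - D > 0)
    K_jih[idx] = np.minimum(d_jh[idx] - D[idx], 0)

    # Points served by medoid i (d_ji == D): fall back to min(d_jh, E) after the swap.
    idx = np.where(d_ji - D == 0)
    K_jih[idx] = np.minimum(d_jh[idx], E[idx]) - D[idx]

    Tih = K_jih.sum()   # negative means the swap improves the clustering
    return Tih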
def _find_medoids(self, X, n_clusters, dist_func, centers, verbose,
                  n_samples, delta, i):
    """Finding all of the medoids.

    Args:
        X (np.ndarray): The dataset.
        n_clusters (int): The number of clusters.
        dist_func (callable): The distance function.
        centers (np.ndarray): The centers of the different clusters.
        verbose (bool): Print out updates.
        n_samples (int): The number of samples in the dataset.
        delta (float): The threshold determining whether or not a value is
            going to be part of a cluster.
        i (int): The index of the center.

    Returns:
        centers[i] (int): The center found for cluster ``i``.
    """
    mu_x = np.zeros((n_samples))
    sigma_x = np.zeros((n_samples))

    d_nearest = np.partition(self.D, 0)[:, 0]

    # available candidates - S_tar - we draw samples from this population
    unselected_ids = np.arange(n_samples)
    unselected_ids = np.delete(unselected_ids, centers[0:i])

    # solution candidates - S_solution
    solution_ids = np.copy(unselected_ids)

    n_used_ref = 0
    while (n_used_ref < n_samples) and (solution_ids.shape[0] > 1):
        # sample a batch from S_ref (for init, S_ref = X)
        idx_ref = np.random.choice(unselected_ids,
                                   size=self.batchsize,
                                   replace=True)

        ci_scale = math.sqrt(
            (2 * math.log(1.0 / delta)) / (n_used_ref + self.batchsize))

        # This finds the distance of all points in idx_ref to all other
        # points in the dataset.
        lmbda = np.vectorize(
            lambda j: self._looping_solution_ids(
                X,
                sorted(idx_ref),
                dist_func,
                d_nearest,
                n_used_ref,
                mu_x,
                sigma_x,
                j,
                i,
            ),
            otypes="O",
        )
        lmbda(solution_ids)

        # Remove pts that are unlikely to be a solution
        C_x = ci_scale * sigma_x
        ucb = mu_x + C_x

        # check if LCB of target is <= UCB of current best
        lcb_target = mu_x - C_x
        ucb_best = ucb.min()
        solution_ids = np.where(lcb_target <= ucb_best)[0]

        # clean up any center idx that crept in...
        for ic in centers:
            if ic in solution_ids:
                solution_ids = np.delete(solution_ids, int(ic))

        n_used_ref = n_used_ref + self.batchsize

    # finish search over the remaining candidates
    if verbose:
        print(f"Final eval with candidates = {solution_ids.shape[0]}")  # , {solution_ids}

    if solution_ids.shape[0] == 1:
        # save the single sample as a medoid
        centers[i] = solution_ids  # probably a type error
        if isinstance(X, da.Array):
            d = dask_distance.cdist(X, X[centers[i], :].reshape(1, -1),
                                    metric=dist_func).squeeze()
            d = d.compute()
        else:
            d = cdist(X, X[centers[i], :].reshape(1, -1),
                      metric=dist_func).squeeze()
        d_best = np.copy(d).reshape(-1, 1)
    else:
        # this is fastPam build - with far fewer pts to evaluate
        tmp_arr = np.zeros((n_samples))

        # This creates an array of the sum of distances from the centers.
        lambda_singles = np.vectorize(
            lambda j: self._bandit_search_singles(X, dist_func, d_nearest,
                                                  tmp_arr, j, i),
            otypes="O",
        )
        tmp_arr = lambda_singles(solution_ids)
        idx = np.argmin(tmp_arr)
        centers[i] = solution_ids[idx]

        if isinstance(X, da.Array):
            d_best = (dask_distance.cdist(
                X, X[centers[i], :].reshape(1, -1),
                metric=dist_func).squeeze().reshape(-1, 1))
            d_best = d_best.compute()
        else:
            d_best = (cdist(X, X[centers[i], :].reshape(1, -1),
                            metric=dist_func).squeeze().reshape(-1, 1))

    if i == 0:
        self.D = d_best
    else:
        self.D = np.concatenate((self.D, d_best), axis=1)

    print("\t updated centers - ", centers)

    return centers[i]