def test_2d_cdist(metric, kw, seed, u_shape, u_chunks, v_shape, v_chunks):
    np.random.seed(seed)

    a_u = 2 * np.random.random(u_shape) - 1
    a_v = 2 * np.random.random(v_shape) - 1

    d_u = da.from_array(a_u, chunks=u_chunks)
    d_v = da.from_array(a_v, chunks=v_chunks)

    if metric == "mahalanobis":
        if "VI" not in kw:
            kw["VI"] = 2 * np.random.random(2 * u_shape[-1:]) - 1
        elif kw["VI"] is None:
            kw.pop("VI")
    elif metric == "seuclidean":
        if "V" not in kw:
            kw["V"] = 2 * np.random.random(u_shape[-1:]) - 1
        elif kw["V"] is None:
            kw.pop("V")
    elif metric == "wminkowski":
        kw["w"] = np.random.random(u_shape[-1:])

    a_r = spdist.cdist(a_u, a_v, metric, **kw)
    d_r = dask_distance.cdist(d_u, d_v, metric, **kw)

    assert d_r.shape == a_r.shape
    assert np.allclose(np.array(d_r)[()], a_r, equal_nan=True)


def test_2d_bool_cdist(metric, seed, u_shape, u_chunks, v_shape, v_chunks):
    np.random.seed(seed)

    a_u = np.random.randint(0, 2, u_shape, dtype=bool)
    a_v = np.random.randint(0, 2, v_shape, dtype=bool)

    d_u = da.from_array(a_u, chunks=u_chunks)
    d_v = da.from_array(a_v, chunks=v_chunks)

    a_r = spdist.cdist(a_u, a_v, metric)
    d_r = dask_distance.cdist(d_u, d_v, metric)

    assert d_r.shape == a_r.shape
    assert np.allclose(np.array(d_r)[()], a_r, equal_nan=True)
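# --- Usage sketch (added for illustration) --------------------------------------
# The fixtures metric, kw, seed, u_shape, u_chunks, v_shape and v_chunks above are
# supplied by the test harness. Below is a minimal, self-contained sketch of how
# such a test could be parametrized; the decorator values and the function name
# test_2d_cdist_sketch are illustrative assumptions, not the project's actual setup.
import numpy as np
import pytest
import dask.array as da
import scipy.spatial.distance as spdist

import dask_distance


@pytest.mark.parametrize("metric", ["euclidean", "cityblock", "cosine"])
@pytest.mark.parametrize("seed", [0, 137])
def test_2d_cdist_sketch(metric, seed):
    np.random.seed(seed)

    # two small 2-D point sets with different chunking along the rows
    a_u = 2 * np.random.random((10, 3)) - 1
    a_v = 2 * np.random.random((7, 3)) - 1
    d_u = da.from_array(a_u, chunks=(5, 3))
    d_v = da.from_array(a_v, chunks=(7, 3))

    a_r = spdist.cdist(a_u, a_v, metric)
    d_r = dask_distance.cdist(d_u, d_v, metric)

    assert d_r.shape == a_r.shape
    assert np.allclose(np.asarray(d_r), a_r)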
Example #3
    def _looping_solution_ids(self, X, idx_ref, dist_func, d_nearest,
                              n_used_ref, mu_x, sigma_x, j, i):
        """Iterating through all of the different solution_ids

        Args:
            X (np.ndarray): The Dataset.
            idx_ref (np.ndarray): The random indices to be tested.
            dist_func (callable): The distance function.
            d_nearest (np.ndarray): The nearest points to the centers.
            n_used_ref (int): The number of used references
            mu_x (np.ndarray): The running mean.
            sigma_x (np.ndarray): The confidence interval.
            j (int): The solution ids
            i (int): The index of the center currently trying to be found.

        Returns:
            mu_x (np.ndarray): The running mean.
            sigma_x (np.ndarray): The confidence interval.
        """
        if isinstance(X, da.Array):
            d = dask_distance.cdist(X[idx_ref, :],
                                    X[j, :].reshape(1, -1),
                                    metric=dist_func).squeeze()
            d = d.compute()
        else:
            d = cdist(X[idx_ref, :], X[j, :].reshape(1, -1),
                      metric=dist_func).squeeze()

        if i == 0:
            # first center: update the running mean/variance of distances to candidate j
            var = sigma_x[j]**2 * n_used_ref
            n_used_ref, mu_x[j], var = self._update(n_used_ref, mu_x[j], var,
                                                    d)
            var, var_sample = self._finalize(n_used_ref, var)
            sigma_x[j] = np.sqrt(var)
        else:
            # later centers: only points that would move closer to candidate j than to
            # their current nearest center contribute (non-positive deltas)
            tmp_delta = d - d_nearest[idx_ref]
            g = np.where(tmp_delta > 0, 0, tmp_delta)
            td = np.sum(g)
            mu_x[j] = (
                (n_used_ref * mu_x[j]) + td) / (n_used_ref + self.batchsize)
            sigma_x[j] = np.std(g)

        return sigma_x[j], mu_x[j]
Example #4
    def __call__(self, data: Data, centroids: Centroids) -> IntLabels:
        """Find closest centroids

        @param data: observations in rows
        @param centroids: centroids in rows
        @return: vector of labels of centroids closest to points
        """
        if data.shape[1] != centroids.shape[1]:
            msg = ("Dimensionality of data and centroids must be equal. " +
                   f"Was {data.shape[1]} and {centroids.shape[1]}")
            logging.error(msg)
            raise ValueError(msg)

        # use dask-backed distances only for inputs large enough to benefit from chunking
        if self.allow_dask and (data.shape[0] > 10000 or data.shape[1] > 1000):
            X1 = da.from_array(data)
            X2 = da.from_array(centroids)
            distances = ddst.cdist(X1, X2, self.distance_metric)
            labels = da.argmin(distances, axis=1).compute()
        else:
            distances = dst.cdist(data, centroids, self.distance_metric)
            labels = np.argmin(distances, axis=1)
        return labels
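# --- Usage sketch (added for illustration) --------------------------------------
# The same nearest-centroid assignment can be written as a standalone function.
# closest_centroid_labels and its arguments below are illustrative, not part of the
# original class; the dask branch mirrors the code above.
import numpy as np
import dask.array as da
import scipy.spatial.distance as dst

import dask_distance as ddst


def closest_centroid_labels(data, centroids, metric="euclidean", use_dask=True):
    """Return, for each row of data, the index of its nearest centroid."""
    if use_dask:
        distances = ddst.cdist(da.from_array(data), da.from_array(centroids), metric)
        return da.argmin(distances, axis=1).compute()
    distances = dst.cdist(data, centroids, metric)
    return np.argmin(distances, axis=1)


data = np.random.random((20000, 8))
centroids = np.random.random((5, 8))
labels = closest_centroid_labels(data, centroids)
print(labels.shape, labels[:10])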
Example #5
def maxmin(nsamples, NPS, var_importance):  # pylint: disable=unused-argument
    # Greedy MaxMin (farthest-point) selection of the most diverse set of parameter
    # vectors from NPS; var_importance is currently unused (hence the pylint disable).
    selected_indices = []
    selected_set = []
    if len(selected_set) > nsamples:
        print("Already selected set, no need for minMaX!")
        return selected_set, selected_indices

    if len(selected_set) == 0:
        selected_set = [NPS[0, :]]
        selected_indices.append(0)

    prtime_start = time.time()
    test_time = [time.time()]
    while len(selected_set) < nsamples:
        # selected_set_array = np.array(selected_set)
        last_entry = selected_set[-1].reshape(1, -1)
        dtemp = dask_distance.cdist(last_entry, NPS, metric='euclidean')
        d = np.asarray(dtemp)
        if len(selected_set) > 1:
            dmat = np.vstack([dmat, d])
        else:
            dmat = d
        new_ind = np.argmax(np.amin(dmat, axis=0))
        print(new_ind)
        selected_indices.append(new_ind)
        selected_set.append(NPS[new_ind, :])
        if len(selected_set) % 10 == 0:
            print("\n\n found landmark %i out of %i" %
                  (len(selected_set), nsamples))
            print("size of the space is :", np.amax(dmat),
                  " and size of the Voronoi cell is: ",
                  np.amax(np.amin(dmat, axis=0)))
            test_time.append(time.time())
            print("CPU clock time to find the past 10 landmarks:",
                  test_time[-1] - test_time[-2])
    print(("Total execution time of MaxMin: ", time.time() - prtime_start))
    return selected_set, selected_indices
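# --- Usage sketch (added for illustration) --------------------------------------
# Minimal call of maxmin on a random candidate matrix; the shapes, sample count and
# the None passed for the unused var_importance argument are illustrative only.
import numpy as np

NPS = np.random.random((1000, 6))  # one candidate parameter vector per row
selected_set, selected_indices = maxmin(nsamples=25, NPS=NPS, var_importance=None)
print(len(selected_set), selected_indices[:10])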
Example #6
    def _swap_bandit(self, X, centers, dist_func, max_iter, tol, verbose):
        """BANDIT SWAP - improve medoids after initialization
           Recast as a stochastic estimation problem
           Run time O(nlogn)
           https://arxiv.org/pdf/2006.06856.pdf

        Args:
            X (np.ndarray): The dataset.
            centers (np.ndarray): The center medoids of the different clusters
            dist_func (callable): The distance function
            max_iter (int): Max number of times to check for a better medoid.
            tol (float): Tolerance denoting minimal acceptable amount of improvement, controls early stopping.
            verbose (bool): Determining whether or not to print out updates

        Returns:
            centers (np.ndarray): The updated center medoids
        """
        done = False
        n_samples = X.shape[0]
        n_clusters = len(centers)
        current_iteration = 1
        Tih_min = float("inf")

        delta = 1.0 / (1e3 * n_samples)  # p 5 'Algorithmic details'

        while not done and (current_iteration < max_iter):
            # initialize mu and sigma
            mu_x = np.zeros((n_samples, n_clusters))
            sigma_x = np.zeros((n_samples, n_clusters))

            done = True  # let's be optimistic we won't find a swap

            if isinstance(X, da.Array):
                d = dask_distance.cdist(X, X[centers, :], metric=dist_func)
                d = d.compute()
            else:
                d = cdist(X, X[centers, :], metric=dist_func)

            # cache nearest (D) and second nearest (E) distances to medoids
            tmp = np.partition(d, 1)
            D = tmp[:, 0]
            E = tmp[:, 1]

            unselected_ids = np.arange(n_samples)
            unselected_ids = np.delete(unselected_ids, centers)

            # this needs to be the product of k x unselected_ids
            swap_pairs = np.array(list(
                product(unselected_ids, range(n_clusters))),
                                  dtype="int")

            n_used_ref = 0
            while (n_used_ref < n_samples) and (swap_pairs.shape[0] > 1):
                # sample a batch from S_ref (for init, S_ref = X)
                idx_ref = np.random.choice(unselected_ids,
                                           size=self.batchsize,
                                           replace=True)

                ci_scale = math.sqrt((2 * math.log(1.0 / delta)) /
                                     (n_used_ref + self.batchsize))
                # This updates the running mean and confidence interval for each tuple in swap pairs
                np.apply_along_axis(
                    lambda a_swap: self._swap_pairs(
                        X,
                        d,
                        a_swap,
                        dist_func,
                        sorted(idx_ref),
                        n_used_ref,
                        mu_x,
                        sigma_x,
                        D,
                        E,
                        Tih_min,
                        "h",
                    ),
                    1,
                    swap_pairs,
                )

                # downselect mu and sigma to match candidate pairs
                flat_indices = np.ravel_multi_index(
                    (swap_pairs[:, 0], swap_pairs[:, 1]),
                    (n_samples, n_clusters))
                tmp_mu = mu_x.flatten()[flat_indices]
                tmp_sigma = sigma_x.flatten()[flat_indices]
                C_x = ci_scale * tmp_sigma

                # Remove pts that cannot be a solution - in terms of potential reward
                ucb = tmp_mu + C_x
                idx = np.argmin(ucb)
                ucb_best = ucb.min()

                # check if LCB of target is <= UCB of current best
                lcb_target = tmp_mu - C_x

                tmp_ids = np.where(lcb_target <= ucb_best)[0]
                swap_pairs = swap_pairs[tmp_ids]
                print("\tremaining candidates - ",
                      tmp_ids.shape[0])  # , tmp_ids)

                n_used_ref = n_used_ref + self.batchsize
            #
            # with reduced number of candidates - run PAM swap
            # TODO - unify full swaps - like was done with search_singles
            #
            print(
                f"Entering swap with {swap_pairs.shape[0]} candidates...pts used = {n_used_ref}"
            )

            done = True  # let's be optimistic we won't find a swap
            # Checking to see if there are better center points
            Tih = np.apply_along_axis(
                lambda a_swap: self._swap_pairs(
                    np.array(X),
                    d,
                    a_swap,
                    dist_func,
                    sorted(idx_ref),
                    n_used_ref,
                    mu_x,
                    sigma_x,
                    D,
                    E,
                    Tih_min,
                    "i",
                ),
                1,
                swap_pairs,
            )

            idx = np.argmin(Tih)
            Tih_min = Tih[idx]
            h_swap = swap_pairs[idx][0]
            i_swap = swap_pairs[idx][1]

            if Tih_min < 0 and abs(Tih_min) > tol:
                if verbose:
                    print("\tSwapped - ", centers[i_swap], h_swap, Tih_min)
                done = False  # sorry we found a swap
                centers[i_swap] = h_swap
                print("Centers after swap - ", centers)
            else:
                # our best swap would degrade the clustering (min Tih > 0)
                done = True
                print("\tNO Swap - ", i_swap, h_swap, Tih_min)
            current_iteration = current_iteration + 1
        return centers
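# --- Illustration (added) --------------------------------------------------------
# The candidate-elimination step inside _swap_bandit keeps only swaps whose lower
# confidence bound does not exceed the best upper confidence bound. A toy sketch
# with made-up running statistics (all numbers below are illustrative):
import math
import numpy as np

tmp_mu = np.array([0.5, -0.2, 0.1, -0.4, 0.3])    # running means of 5 candidate swaps
tmp_sigma = np.array([0.2, 0.3, 0.1, 0.2, 0.4])   # running standard deviations
delta, n_used_ref, batchsize = 1e-4, 100, 50

ci_scale = math.sqrt((2 * math.log(1.0 / delta)) / (n_used_ref + batchsize))
C_x = ci_scale * tmp_sigma

ucb = tmp_mu + C_x
lcb = tmp_mu - C_x
ucb_best = ucb.min()

keep = np.where(lcb <= ucb_best)[0]  # surviving candidate indices
print(keep)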
Example #7
    def _swap_pairs(
        self,
        X,
        d,
        a_swap,
        dist_func,
        idx_ref,
        n_used_ref,
        mu_x,
        sigma_x,
        D,
        E,
        Tih_min,
        h_i,
    ):
        """Checking to see if there are any better center points.

        Args:
            X (np.ndarray): The Dataset.
            d (np.ndarray): distance matrix
            a_swap (tuple): Tuple of clusters as a combination of cluster index and dataset index. E.g. [[0,0],[0,1],[0,2],[1,0]...]
            dist_func (callable): distance function
            idx_ref (np.ndarray): The random indices to be tested.
            n_used_ref (int): Number of used reference points
            mu_x (np.ndarray): The Running mean.
            sigma_x (np.ndarray): The confidence interval.
            D (np.ndarray): Nearest distance to medoid
            E (np.ndarray): Second nearest distance to medoid
            Tih_min (float): The sum of values of the best medoid.
            h_i (str): Determining whether or not to find the updated mean and confidence interval or best medoid

        Returns:
            mu_x (np.ndarray): The Running mean.
            sigma_x (np.ndarray): The confidence interval.
            Tih (float): The best medoid.
        """
        h = a_swap[0]
        i = a_swap[1]
        d_ji = d[:, i]

        if h_i == "h":
            if isinstance(X, da.Array):
                d_jh = dask_distance.cdist(X[idx_ref, :],
                                           X[h, :].reshape(1, -1),
                                           metric=dist_func).squeeze()
                d_jh = d_jh.compute()
            else:
                d_jh = cdist(X[idx_ref, :],
                             X[h, :].reshape(1, -1),
                             metric=dist_func).squeeze()
            K_jih = np.zeros(self.batchsize)
            # restrict the cached nearest/second-nearest distances to the reference
            # batch so every array below is indexed by position within the batch
            D_ref = D[idx_ref]
            E_ref = E[idx_ref]
            diff_ji = d_ji[idx_ref] - D_ref
            idx = np.where(diff_ji > 0)

            diff_jh = d_jh - D_ref
            K_jih[idx] = np.minimum(diff_jh[idx], 0)

            idx = np.where(diff_ji == 0)
            K_jih[idx] = np.minimum(d_jh[idx], E_ref[idx]) - D_ref[idx]

            # base-line update of mu and sigma
            mu_x[h, i] = ((n_used_ref * mu_x[h, i]) +
                          np.sum(K_jih)) / (n_used_ref + self.batchsize)
            sigma_x[h, i] = np.std(K_jih)

            return mu_x, sigma_x

        if h_i == "i":
            if isinstance(X, da.Array):
                d_jh = dask_distance.cdist(X,
                                           X[h, :].reshape(1, -1),
                                           metric=dist_func).squeeze()
                d_jh = d_jh.compute()
            else:
                d_jh = cdist(X, X[h, :].reshape(1, -1),
                             metric=dist_func).squeeze()

            # calculate K_jih
            K_jih = np.zeros_like(D)
            # if d_ji > D:
            #    Kjih = min(d(j, h) − Dj, 0)
            diff_ji = d_ji - D
            idx = np.where(diff_ji > 0)

            # K_jih[idx] = min(diff_jh[idx], 0)
            diff_jh = d_jh - D
            K_jih[idx] = np.minimum(diff_jh[idx], 0)

            # if d_ji = Dj:
            #    Kjih = min(d(j, h), Ej) − Dj
            idx = np.where(diff_ji == 0)
            K_jih[idx] = np.minimum(d_jh[idx], E[idx]) - D[idx]

            Tih = np.sum(K_jih)

            return Tih
Example #8
    def _find_medoids(self, X, n_clusters, dist_func, centers, verbose,
                      n_samples, delta, i):
        """Finding all of the medoids

        Args:
            X (np.ndarray): The Dataset.
            n_clusters (int): The number of clusters.
            dist_func (callable): The distance function.
            centers (np.ndarray): The centers of the different clusters
            verbose (bool): Print out updates
            n_samples (int): The number of samples in the dataset.
            delta (float): The threshold determining whether or not a value is going to be a part of a cluster.
            i (int): The index of the center

        Returns:
            centers (np.ndarray): The list of centers for the different clusters.
        """
        mu_x = np.zeros((n_samples))
        sigma_x = np.zeros((n_samples))
        d_nearest = np.partition(self.D, 0)[:, 0]

        # available candidates - S_tar - we draw samples from this population
        unselected_ids = np.arange(n_samples)
        unselected_ids = np.delete(unselected_ids, centers[0:i])
        # solution candidates - S_solution
        solution_ids = np.copy(unselected_ids)
        n_used_ref = 0
        while (n_used_ref < n_samples) and (solution_ids.shape[0] > 1):
            # sample a batch from S_ref (for init, S_ref = X)
            idx_ref = np.random.choice(unselected_ids,
                                       size=self.batchsize,
                                       replace=True)
            ci_scale = math.sqrt(
                (2 * math.log(1.0 / delta)) / (n_used_ref + self.batchsize))
            # This finds the distance of all points in idx_ref to all other points in the dataset.
            lmbda = np.vectorize(
                lambda j: self._looping_solution_ids(
                    X,
                    sorted(idx_ref),
                    dist_func,
                    d_nearest,
                    n_used_ref,
                    mu_x,
                    sigma_x,
                    j,
                    i,
                ),
                otypes="O",
            )
            lmbda(solution_ids)

            # Remove pts that are unlikely to be a solution
            C_x = ci_scale * sigma_x
            ucb = mu_x + C_x

            # check if LCB of target is <= UCB of current best
            lcb_target = mu_x - C_x
            ucb_best = ucb.min()
            solution_ids = np.where(lcb_target <= ucb_best)[0]

            # clean up any center idx that crept in...
            for ic in centers:
                if ic in solution_ids:
                    solution_ids = solution_ids[solution_ids != ic]  # remove by value, not position

            n_used_ref = n_used_ref + self.batchsize

        # finish search over the remaining candidates
        if verbose:
            print(f"Final eval with candidates = {solution_ids.shape[0]}"
                  )  # , {solution_ids}")
        if solution_ids.shape[0] == 1:
            # save the single remaining sample as the medoid
            centers[i] = int(solution_ids[0])
            if isinstance(X, da.Array):
                d = dask_distance.cdist(X,
                                        X[centers[i], :].reshape(1, -1),
                                        metric=dist_func).squeeze()
                d = d.compute()
            else:
                d = cdist(X, X[centers[i], :].reshape(1, -1),
                          metric=dist_func).squeeze()
            d_best = np.copy(d).reshape(-1, 1)
        else:  # this is fastPam build - with far fewer pts to evaluate
            tmp_arr = np.zeros((n_samples))
            # This creates an array of the sum of distances from the centers.
            lambda_singles = np.vectorize(
                lambda j: self._bandit_search_singles(X, dist_func, d_nearest,
                                                      tmp_arr, j, i),
                otypes="O",
            )
            tmp_arr = lambda_singles(solution_ids)
            idx = np.argmin(tmp_arr)
            centers[i] = solution_ids[idx]
            if isinstance(X, da.Array):
                d_best = (dask_distance.cdist(
                    X, X[centers[i], :].reshape(1, -1),
                    metric=dist_func).squeeze().reshape(-1, 1))
                d_best = d_best.compute()
            else:
                d_best = (cdist(X,
                                X[centers[i], :].reshape(1, -1),
                                metric=dist_func).squeeze().reshape(-1, 1))
        if i == 0:
            self.D = d_best
        else:
            self.D = np.concatenate((self.D, d_best), axis=1)
        print("\t updated centers - ", centers)

        return centers[i]