Example #1
import numpy as np
from scipy.stats import norm
from sklearn.neighbors import NearestNeighbors


def prepare_velocity_grid_data(
    X_emb,
    xy_grid_nums,
    density=None,
    smooth=None,
    n_neighbors=None,
):
    """Lay a regular grid over the embedding and weight each grid point by
    the Gaussian-kernel proximity of the observed points."""

    n_obs, n_dim = X_emb.shape
    density = 1 if density is None else density
    smooth = 0.5 if smooth is None else smooth

    grs, scale = [], 0
    for dim_i in range(n_dim):
        # pad each dimension's range by roughly 1% before laying out the grid
        m, M = np.min(X_emb[:, dim_i]), np.max(X_emb[:, dim_i])
        m = m - 0.01 * np.abs(M - m)
        M = M + 0.01 * np.abs(M - m)
        gr = np.linspace(m, M, int(xy_grid_nums[dim_i] * density))
        scale += gr[1] - gr[0]
        grs.append(gr)

    # kernel bandwidth: mean grid spacing, scaled by the smoothing factor
    scale = scale / n_dim * smooth

    meshes_tuple = np.meshgrid(*grs)
    X_grid = np.vstack([i.flat for i in meshes_tuple]).T

    # estimate grid velocities
    if n_neighbors is None:
        # heuristic: roughly one neighbour per 50 observations, at least 10
        n_neighbors = max(10, n_obs // 50)

    if n_obs > 200000 and n_dim > 2:
        # approximate search scales much better on very large, higher-dimensional data
        from pynndescent import NNDescent

        nn = NNDescent(X_emb,
                       metric="euclidean",
                       n_neighbors=n_neighbors,
                       n_jobs=-1,
                       random_state=19491001)
        neighs, dists = nn.query(X_grid, k=n_neighbors)
    else:
        alg = "ball_tree" if n_dim > 10 else "kd_tree"
        nn = NearestNeighbors(n_neighbors=n_neighbors,
                              n_jobs=-1,
                              algorithm=alg)
        nn.fit(X_emb)
        dists, neighs = nn.kneighbors(X_grid)

    # Gaussian kernel weights: nearby observations contribute more mass to a grid point
    weight = norm.pdf(x=dists, scale=scale)
    p_mass = weight.sum(1)

    return X_grid, p_mass, neighs, weight
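
A minimal usage sketch (hypothetical data, assuming the imports added above): build the grid for a small random 2-D embedding and inspect the output shapes.

rng = np.random.default_rng(0)
X_emb = rng.normal(size=(500, 2))  # 500 observations in a 2-D embedding
X_grid, p_mass, neighs, weight = prepare_velocity_grid_data(X_emb, xy_grid_nums=(50, 50))
print(X_grid.shape)  # (2500, 2): one row per grid point
print(p_mass.shape)  # (2500,): total kernel mass at each grid point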
Example #2
import pickle
import random

import numpy as np
from annoy import AnnoyIndex
from datasketch import MinHash, MinHashLSHForest
from pynndescent import NNDescent
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors


class KNNSearch:
    def __init__(self, features, kwargs):
        # `kwargs` is a plain dict of options; the keys that are required
        # depend on the algorithm named by kwargs["algorithm"]
        self.org_features = features
        if kwargs["normalize"]:
            self.features = preprocessing.normalize(features, norm='l2')
        else:
            self.features = features

        self.kwargs = kwargs
        self.predictor = None

    def fit(self):
        if self.kwargs['algorithm'] == 'datasketch':
            self.__datasketch_fit()
        elif self.kwargs['algorithm'] == 'annoy':
            self.__annoy_fit()
        elif self.kwargs['algorithm'] == 'exact':
            self.__exhaustive_fit()
        elif self.kwargs['algorithm'] == 'falconn':
            self.__falconn_fit()
        elif self.kwargs['algorithm'] == 'descent':
            self.__descent_fit()
        elif self.kwargs['algorithm'] == 'random':
            self.__random_fit()
        else:
            raise NotImplementedError("Algorithm=[{}] not yet implemented".format(
                self.kwargs['algorithm']))

    def predict(self, input, k):
        if self.kwargs['algorithm'] == 'datasketch':
            return self.__datasketch_predict(input, k)
        elif self.kwargs['algorithm'] == 'annoy':
            return self.__annoy_predict(input, k)
        elif self.kwargs['algorithm'] == 'exact':
            return self.__exhaustive_predict(input, k)
        elif self.kwargs['algorithm'] == 'falconn':
            return self.__falconn_predict(input, k)
        elif self.kwargs['algorithm'] == 'descent':
            return self.__descent_predict(input, k)
        elif self.kwargs['algorithm'] == 'random':
            return self.__random_predict(input, k)
        else:
            raise NotImplementedError("Algorithm=[{}] not yet implemented".format(
                self.kwargs['algorithm']))

    def __datasketch_fit(self):
        if self.kwargs['create']:
            # Create a list of MinHash objects
            min_hash_obj_list = []
            forest = MinHashLSHForest(num_perm=self.kwargs['num_perm'])
            for i in range(len(self.features)):
                min_hash_obj_list.append(
                    MinHash(num_perm=self.kwargs['num_perm']))
                # MinHash.update expects bytes-like tokens, so each feature
                # row must be an iterable of encoded items
                for d in self.features[i]:
                    min_hash_obj_list[i].update(d)
                forest.add(i, min_hash_obj_list[i])
            # IMPORTANT: must call index() otherwise the keys won't be searchable
            forest.index()
            with open(self.kwargs['file_path'], "wb") as f:
                pickle.dump(forest, f)
                pickle.dump(min_hash_obj_list, f)
            self.predictor = [forest, min_hash_obj_list]
        else:
            with open(self.kwargs['file_path'], "rb") as f:
                forest = pickle.load(f)
                min_hash_obj_list = pickle.load(f)
                self.predictor = [forest, min_hash_obj_list]

    def __datasketch_predict(self, input, k):
        forest, min_hash_obj_list = self.predictor
        if isinstance(input, int):
            return forest.query(min_hash_obj_list[input], k)
        else:
            min_hash_obj = MinHash(num_perm=self.kwargs['num_perm'])
            for d in input:
                min_hash_obj.update(d)
            return forest.query(min_hash_obj, k)

    def __annoy_fit(self):
        if self.kwargs['create']:
            indexer = AnnoyIndex(self.features.shape[1], self.kwargs['metric'])
            for i, f in enumerate(self.features):
                indexer.add_item(i, f)
            indexer.build(self.kwargs['num_trees'])
            indexer.save(self.kwargs['file_path'])
            self.predictor = indexer
        else:
            indexer = AnnoyIndex(self.features.shape[1], self.kwargs['metric'])
            indexer.load(self.kwargs['file_path'])
            self.predictor = indexer

    def __annoy_predict(self, input, k):
        annoy_forest = self.predictor
        if isinstance(input, int):
            return annoy_forest.get_nns_by_item(input,
                                                k,
                                                search_k=-1,
                                                include_distances=False)
        else:
            return annoy_forest.get_nns_by_vector(input,
                                                  k,
                                                  search_k=-1,
                                                  include_distances=False)

    def __exhaustive_fit(self):
        self.predictor = NearestNeighbors(algorithm='ball_tree')
        self.predictor.fit(self.features)

    def __exhaustive_predict(self, input, k):
        if isinstance(input, int):
            return self.predictor.kneighbors(self.features[input].reshape(
                1, -1),
                                             n_neighbors=k,
                                             return_distance=False)[0]
        else:
            return self.predictor.kneighbors(input.reshape(1, -1),
                                             n_neighbors=k,
                                             return_distance=False)[0]

    def __falconn_fit(self):
        """
        Initializes locality-sensitive hashing with FALCONN to find nearest neighbors in training data.
        """

        import falconn

        dimension = self.features.shape[1]
        nb_tables = self.kwargs['nb_tables']
        number_bits = self.kwargs['number_bits']

        # LSH parameters
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = dimension
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = nb_tables
        params_cp.num_rotations = 2  # for dense set it to 1; for sparse data set it to 2
        params_cp.seed = 5721840
        # we want to use all the available threads to set up
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

        # we build number_bits-bit hashes so that each table has
        # 2^number_bits bins; a rule of thumb is to have the number
        # of bins be the same order of magnitude as the number of data points
        falconn.compute_number_of_hash_functions(number_bits, params_cp)
        self._falconn_table = falconn.LSHIndex(params_cp)
        self._falconn_query_object = None
        self._FALCONN_NB_TABLES = nb_tables

        # Center the dataset and the queries: this improves the performance of LSH quite a bit.
        self.center = np.mean(self.features, axis=0)
        self.features -= self.center

        # add features to falconn table
        self._falconn_table.setup(self.features)

    def __falconn_predict(self, input, k):

        # Normalize input if you care about the cosine similarity
        if isinstance(input, int):
            input = self.features[input]
        else:
            if self.kwargs['normalize']:
                input /= np.linalg.norm(input)
                # Center the input and the queries: this improves the performance of LSH quite a bit.
                input -= self.center

        # Construct the FALCONN query object lazily: it must be created only
        # after the table has been populated via setup()
        if self._falconn_query_object is None:
            self._falconn_query_object = self._falconn_table.construct_query_object()
            self._falconn_query_object.set_num_probes(self._FALCONN_NB_TABLES)

        query_res = self._falconn_query_object.find_k_nearest_neighbors(
            input, k)
        return query_res

    def __descent_fit(self):
        self.predictor = NNDescent(data=self.features,
                                   metric=self.kwargs['metric'])

    def __descent_predict(self, input, k):
        # NNDescent.query expects a 2-D array of query points
        input = np.expand_dims(input, axis=0)
        index = self.predictor
        # query() returns (indices, distances); return the neighbour indices
        # for the single query point
        return index.query(input, k)[0][0]

    def __random_fit(self):
        # the random baseline needs no index
        pass

    def __random_predict(self, input, k):
        # baseline: return k indices drawn uniformly at random (with replacement)
        return [random.randint(0, len(self.features) - 1) for _ in range(k)]
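
A minimal usage sketch for the exact backend (hypothetical option values; only the keys read by __init__ and the 'exact' code path are supplied):

features = np.random.rand(100, 16)
searcher = KNNSearch(features, {"normalize": True, "algorithm": "exact"})
searcher.fit()
print(searcher.predict(0, k=5))  # indices of the 5 rows closest to row 0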
Example #3
    def build_knn_index(self, data, min_n_neighbors=20, rho=0.5):
        """
        Build a KNN index for the given data set. There will be two KNN indices if the SNN distance is used.

        :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number
                     of dimensions (features).
        :param min_n_neighbors: minimum number of nearest neighbors to use for the `NN-descent` method.
        :param rho: `rho` parameter used by the `NN-descent` method.

        :return: A list with one or two KNN indices.
        """
        # Add one extra neighbor because querying points that are part of the KNN index returns
        # the queried point itself in the neighbor set; it is removed from the query result later
        if self.shared_nearest_neighbors:
            k = max(1 + self.n_neighbors_snn, min_n_neighbors)
        else:
            k = max(1 + self.n_neighbors, min_n_neighbors)

        # KNN index based on the primary distance metric
        if self.approx_nearest_neighbors:
            params = {
                'metric': self.metric,
                'metric_kwds': self.metric_kwargs,
                'n_neighbors': k,
                'rho': rho,
                'random_state': self.seed_rng,
                'n_jobs': self.n_jobs
            }
            index_knn_primary = NNDescent(data, **params)
        else:
            # Exact KNN graph
            index_knn_primary = NearestNeighbors(
                n_neighbors=k,
                algorithm='brute',
                metric=self.metric,
                metric_params=self.metric_kwargs,
                n_jobs=self.n_jobs
            )
            index_knn_primary.fit(data)

        if self.shared_nearest_neighbors:
            # Construct a second KNN index that uses the shared nearest neighbor distance
            data_neighbors, _ = remove_self_neighbors(
                *self.query_wrapper_(data, index_knn_primary, self.n_neighbors_snn + 1)
            )
            if self.approx_nearest_neighbors:
                params = {
                    'metric': distance_SNN,
                    'n_neighbors': max(1 + self.n_neighbors, min_n_neighbors),
                    'rho': rho,
                    'random_state': self.seed_rng,
                    'n_jobs': self.n_jobs
                }
                index_knn_secondary = NNDescent(data_neighbors, **params)
            else:
                index_knn_secondary = NearestNeighbors(
                    n_neighbors=(1 + self.n_neighbors),
                    algorithm='brute',
                    metric=distance_SNN,
                    n_jobs=self.n_jobs
                )
                index_knn_secondary.fit(data_neighbors)

            index_knn = [index_knn_primary, index_knn_secondary]
        else:
            index_knn = [index_knn_primary]

        return index_knn
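
`remove_self_neighbors` and `query_wrapper_` are defined elsewhere in the source. A minimal sketch of what such a self-neighbor filter typically does, assuming the wrapper returns (indices, distances) arrays in which column 0 of each row is the queried point itself:

import numpy as np

def remove_self_neighbors(indices, distances):
    # Hypothetical helper: when an index is queried with its own points,
    # the nearest "neighbor" is the point itself, so drop column 0
    return indices[:, 1:], distances[:, 1:]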
Example #4
import logging
from typing import Any, Tuple

import numpy as np
from pynndescent import NNDescent
from scipy import sparse
from sklearn.neighbors import NearestNeighbors as _NearestNeighbors

# NOTE: `knn_balance` and `jensen_shannon_distance` are project-specific
# helpers assumed to be importable from the surrounding package.


class NearestNeighbors:
    """Greedy algorithm to balance a K-nearest neighbour graph.
    It has an API similar to scikit-learn.

    Parameters
    ----------
    k : int  (default=50)
        the number of neighbours in the final graph
    sight_k : int  (default=100)
        the number of neighbours in the initialization graph.
        It corresponds to the farthest neighbour that a sample is allowed to connect to
        when no closer neighbours are available. If sight_k is reached, the matrix is filled
        with the sample itself
    maxl : int  (default=200)
        max degree of connectivity allowed. Avoids the presence of hubs in the graph; it is the
        maximum number of connections a node may receive before the node is blocked
    mode : str  (default="distance")
        decides which kind of output: "distance" or "connectivity"
    metric : str  (default="euclidean")
        the distance metric used for the initial KNN search
    minkowski_p : int  (default=20)
        the `p` parameter passed to scikit-learn for the Minkowski metric
    n_jobs : int  (default=-1)
        parallelization of the standard KNN search performed at initialization
    """
    def __init__(self,
                 k: int = 50,
                 sight_k: int = 100,
                 maxl: int = 200,
                 mode: str = "distance",
                 metric: str = "euclidean",
                 minkowski_p: int = 20,
                 n_jobs: int = -1) -> None:
        # input parameters
        self.k = k
        self.sight_k = sight_k
        self.maxl = maxl
        self.mode = mode
        self.metric = metric
        self.minkowski_p = minkowski_p
        self.n_jobs = n_jobs

        # NN graphs
        self.data = None
        self._nn = None  # raw KNN
        self.bknn = None  # balanced KNN
        self.dist = None  # balanced KNN distances
        self.dsi = None  # balanced KNN neighbor index
        self.l = None  # balanced KNN degree of connectivity
        self.mknn = None  # mutual KNN based on bknn
        self.rnn = None  # radius NN based on mknn

    @property
    def n_samples(self) -> int:
        return self.data.shape[0]

    def fit(self, data: np.ndarray, sight_k: int = None) -> Any:
        """Fits the model
        data: np.ndarray (samples, features)
            the data to build the graph on
        sight_k: int
            the farthest point that a node is allowed to connect to when its closest neighbours are not allowed
        """
        self.data = data
        if sight_k is not None:
            self.sight_k = sight_k
        logging.debug(
            f"First search the {self.sight_k} nearest neighbours for {self.n_samples} samples"
        )
        np.random.seed(13)
        if self.metric == "correlation":
            self._nn = _NearestNeighbors(n_neighbors=self.sight_k + 1,
                                         metric=self.metric,
                                         p=self.minkowski_p,
                                         n_jobs=self.n_jobs,
                                         algorithm="brute")
            self._nn.fit(self.data)
        elif self.metric == "js":
            self._nn = NNDescent(data=self.data,
                                 metric=jensen_shannon_distance)
        else:
            self._nn = _NearestNeighbors(n_neighbors=self.sight_k + 1,
                                         metric=self.metric,
                                         p=self.minkowski_p,
                                         n_jobs=self.n_jobs,
                                         leaf_size=30)
            self._nn.fit(self.data)

        # call this to calculate bknn
        self.kneighbors_graph(mode='distance')
        return self

    def kneighbors(self,
                   X: np.ndarray = None,
                   maxl: int = None,
                   mode: str = "distance"
                   ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Finds the K-neighbors of a point.
        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : array-like, shape (n_query, n_features)
            The query point or points.
            If not provided, neighbors of each indexed point are returned.
            In this case, the query point is not considered its own neighbor.
        maxl: int
            max degree of connectivity allowed
        mode : "distance" or "connectivity"
            Decides the kind of output

        Returns
        -------
        dist_new : np.ndarray (samples, k+1)
            distances to the NN
        dsi_new : np.ndarray (samples, k+1)
            indexes of the NN, the first column is the sample itself
        l : np.ndarray (samples)
            l[i] is the number of connections from other samples to sample i

        NOTE:
        The first column (0) corresponds to the sample itself; the nearest neighbour is in the second column (1)
        """
        if self._nn is None:
            raise ValueError("must fit() before generating kneighbors graphs")
        if X is not None:
            self.data = X
        if maxl is not None:
            self.maxl = maxl
        if mode == "distance":
            if self.metric == "js":
                self.dsi, self.dist = self._nn.query(self.data,
                                                     k=self.sight_k + 1)
            else:
                self.dist, self.dsi = self._nn.kneighbors(self.data,
                                                          return_distance=True)
        else:
            if self.metric == "js":
                self.dsi, _ = self._nn.query(self.data, k=self.sight_k + 1)
            else:
                self.dsi = self._nn.kneighbors(self.data,
                                               return_distance=False)
            self.dist = np.ones_like(self.dsi, dtype='float64')
            self.dist[:, 0] = 0
        logging.debug(
            f"Using the initialization network to find a {self.k}-NN "
            f"graph with maximum connectivity of {self.maxl}")
        self.dist, self.dsi, self.l = knn_balance(self.dsi,
                                                  self.dist,
                                                  maxl=self.maxl,
                                                  k=self.k)
        return self.dist, self.dsi, self.l

    def kneighbors_graph(self,
                         X: np.ndarray = None,
                         maxl: int = None,
                         mode: str = "distance") -> sparse.csr_matrix:
        """Retrun the K-neighbors graph as a sparse csr matrix
            Parameters
            ----------
            X : array-like, shape (n_query, n_features),
                The query point or points.
                If not provided, neighbors of each indexed point are returned.
                In this case, the query point is not considered its own neighbor.
            maxl: int
                max degree of connectivity allowed
            mode : "distance" or "connectivity"
                Decides the kind of output
            Returns
            -------
            neighbor_graph : scipy.sparse.csr_matrix
                The values are either distances or connectivity dependig of the mode parameter
            NOTE: The diagonal will be zero even though the value 0 is actually stored
        """
        dist_new, dsi_new, _ = self.kneighbors(X=X, maxl=maxl, mode=mode)
        logging.debug("Returning sparse matrix")
        self.bknn = sparse.csr_matrix(
            (np.ravel(dist_new), np.ravel(dsi_new),
             np.arange(0, dist_new.shape[0] * dist_new.shape[1] + 1,
                       dist_new.shape[1])), (self.n_samples, self.n_samples))
        self.bknn.eliminate_zeros()
        return self.bknn

    def mnn_graph(self):
        """get mutual nearest neighbor graph from bknn"""
        if self.mknn is None:
            if self.bknn is None:
                raise ValueError(
                    'must fit() before generating kneighbors graphs')
            # element-wise minimum between bknn and bknn.T, so non-mutual value will be 0
            self.mknn = self.bknn.minimum(self.bknn.transpose())
        return self.mknn

    def rnn_graph(self):
        """get the radius-NN graph from mknn; returns a sparse matrix of the mutual similarities inside the 90th-percentile radius"""
        # Convert distances to similarities
        if self.mknn is None:
            self.mnn_graph()
        mknn_sim = self.mknn.copy()
        bknn_sim = self.bknn.copy()
        max_d = self.bknn.data.max()
        bknn_sim.data = (max_d - bknn_sim.data) / max_d
        mknn_sim.data = (max_d - mknn_sim.data) / max_d
        mknn_sim = mknn_sim.tocoo()
        mknn_sim.setdiag(0)

        # Compute the effective resolution
        d = 1 - bknn_sim.data
        radius = np.percentile(d, 90)
        logging.info(f"  90th percentile radius: {radius:.02}")
        inside = mknn_sim.data > 1 - radius
        self.rnn = sparse.coo_matrix(
            (mknn_sim.data[inside],
             (mknn_sim.row[inside], mknn_sim.col[inside])),
            shape=mknn_sim.shape)
        return self.rnn