Example #1
0
def find_edges(input, test, K):
    """Index ``input`` with a kNN backend and list neighbour edges for ``test``.

    The backend is selected by the module-level ``kNN_type``:
    1 = scikit-learn ball tree, 2 = SciPy KD-tree, 3 = n2 HNSW index,
    4 = pysparnn cluster index.  Each row of ``test`` is queried for
    ``K + 1`` neighbours and every hit whose position differs from the
    query's own row number becomes an edge.

    Args:
        input: Matrix of points to index.  Must provide ``todense()`` when
            ``kNN_type <= 3``.
        test: Matrix of query points, same requirement as ``input``.
        K: Neighbours wanted per query; ``K + 1`` are requested
            (presumably so a self-match can be discarded -- confirm with
            callers).

    Returns:
        List of ``(query_row, neighbour_row)`` tuples of ints.

    Raises:
        NotImplementedError: If ``kNN_type`` is not one of the backends
            handled while building the index.
    """
    print(f"building kNN classifier ... ", end=" ")
    started_at = time.time()

    # Backends 1-3 are handed dense matrices.
    if kNN_type <= 3:
        input, test = input.todense(), test.todense()

    n_hits = K + 1

    def _build_index():
        # Backend imports stay lazy so only the selected library is needed.
        if kNN_type == 1:
            from sklearn.neighbors import NearestNeighbors
            return NearestNeighbors(n_neighbors=n_hits,
                                    algorithm='ball_tree').fit(input)
        if kNN_type == 2:
            from scipy import spatial
            return spatial.KDTree(input)
        if kNN_type == 3:
            from n2 import HnswIndex
            # distance_type in ['angular', 'L2']
            hnsw = HnswIndex(input.shape[1], distance_type)
            for row in tqdm(range(input.shape[0])):
                hnsw.add_data(input[row, :])
            hnsw.build(n_threads=10)
            return hnsw
        if kNN_type == 4:
            import pysparnn.cluster_index as ci
            return ci.MultiClusterIndex(input, range(input.shape[0]))
        raise NotImplementedError

    def _lookup(index):
        if kNN_type == 1:
            _, found = index.kneighbors(test)
            return found
        if kNN_type == 2:
            _, found = index.query(test, k=n_hits)
            return found
        if kNN_type == 3:
            return [index.search_by_vector(test[row, :], k=n_hits)
                    for row in tqdm(range(test.shape[0]))]
        # Only backend 4 can reach this point (others raised while building).
        return index.search(test, k=n_hits, k_clusters=100,
                            return_distance=False)

    tree = _build_index()
    print(f"time={time.time()-started_at:.3f}s")

    print("finding indices ... ", end=" ")
    indices = _lookup(tree)
    print(f"time={time.time()-started_at:.3f}s")

    # Keep every (query, hit) pair except those pointing at the same position.
    edge_list = [
        (row, hit)
        for row, hits in enumerate(indices)
        for hit in map(int, hits)
        if row != hit
    ]
    print(f"done! .... time={time.time()-started_at:.3f}s")
    return edge_list
class NNF():
    """Nearest-neighbour field (NNF) from target patches to source patches.

    The search backend is chosen by the module-level ``_NN_algorithm``
    setting.  ``"FLANN"``, ``"Sklearn"`` and ``"FAISS"`` index the
    flattened, weighted source patches; ``"PatchMatch"`` delegates the
    whole search to ``pm.PatchMatch``.
    """

    def __init__(self,
                 image,
                 target_mask,
                 source_mask,
                 patch_size=(11, 11),
                 patch_weight=None,
                 num_neighbors=1):
        """Build the search structure over the source patches of ``image``.

        Args:
            image: Array of shape ``(height, width, channels)``.
            target_mask: Mask selecting the pixels that will be queried.
            source_mask: Mask selecting the pixels whose patches are indexed.
            patch_size: ``(rows, cols)`` of a patch.  NOTE(review): the index
                arithmetic pads both axes by ``patch_size[0] // 2``, so it
                presumably expects square, odd-sized patches -- confirm.
            patch_weight: Optional per-pixel patch weights shaped like
                ``patch_size``; all ones when omitted.
            num_neighbors: Number of neighbours returned per target patch.
        """
        _, im_w, im_ch = image.shape

        # Previously the attribute was only set when ``patch_weight`` was
        # None, so passing explicit weights crashed with AttributeError as
        # soon as the weights were used.
        if patch_weight is None:
            patch_weight = np.ones(patch_size, dtype=_im_dtype)
        self.patch_weight = np.asarray(patch_weight, dtype=_im_dtype)

        self.patch_size = patch_size
        self.num_neighb = num_neighbors

        print("Build NNF index: ", end=" ")
        start = time.time()

        if _NN_algorithm != "PatchMatch":
            # presumably flat (row-major) pixel indices of the masked
            # pixels, given the divmod by im_w below -- verify against
            # op.masked_indices
            self.source_ind = op.masked_indices(source_mask)
            self.target_ind = op.masked_indices(target_mask)

            # convert array indices to patch indices
            pad = patch_size[0] // 2
            ind_y, ind_x = np.divmod(self.source_ind, im_w)
            self.source_ind = (ind_x - pad) + (ind_y - pad) * (im_w - 2 * pad)

            # One flattened patch per row, scaled by sqrt(weight); the
            # weights are repeated once per channel to match the flattening.
            source_patches = extract_patches_2d(
                image, patch_size=patch_size)[self.source_ind]
            source_point_cloud = (
                source_patches.reshape((self.source_ind.size, -1))
                * np.repeat(np.sqrt(self.patch_weight), im_ch))

            # need this because of FLANN bug (?) with memory release
            self.target_point_cloud = np.zeros(
                (self.target_ind.size, source_point_cloud.shape[-1]),
                dtype=_im_dtype)

        if _NN_algorithm == "FLANN":
            self.nn = flann.FLANN()
            self.nn.build_index(source_point_cloud,
                                algorithm="kdtree",
                                trees=1)  #, log_level = "info")
        elif _NN_algorithm == "Sklearn":
            self.nn = NearestNeighbors(
                n_neighbors=num_neighbors,
                algorithm='kd_tree',
                metric='minkowski',
                n_jobs=-1)  #,metric_params={'w':self.patch_weight})
            self.nn.fit(X=source_point_cloud)
        elif _NN_algorithm == "FAISS":
            self.nn = faiss.IndexHNSWFlat(source_point_cloud.shape[1], 50)
            self.nn.add(source_point_cloud)
        elif _NN_algorithm == "PatchMatch":
            self.nn = pm.PatchMatch(target_mask,
                                    source_mask,
                                    patch_size=patch_size,
                                    lambdas=np.ones_like(image,
                                                         dtype=_im_dtype))

        print('%f sec' % (time.time() - start))

    def calculate_nnf(self, image, init_guess=None):
        """Query the index with the target patches of ``image``.

        Args:
            image: Array of shape ``(height, width, channels)``.
            init_guess: Forwarded to ``PatchMatch.find_nnf``; unused by the
                other backends.

        Returns:
            ``(ind_nnf, dist_nnf)``, both reshaped to
            ``(-1, num_neighbors)``.  For the index-based backends the
            arrays have one row per pixel; only rows listed in
            ``self.target_ind`` are filled, the remaining rows stay zero.
        """
        im_h, im_w, im_ch = image.shape

        print("Query NNF index: ", end=" ")
        start = time.time()

        if _NN_algorithm != "PatchMatch":
            ind_nnf = np.zeros((im_h * im_w, self.num_neighb), dtype='int32')
            dist_nnf = np.zeros((im_h * im_w, self.num_neighb))

            # convert array indices to patch indices
            pad = self.patch_size[0] // 2
            ind_y, ind_x = np.divmod(self.target_ind, im_w)
            ind = (ind_x - pad) + (ind_y - pad) * (im_w - 2 * pad)

            # need this because of FLANN bug (?) with memory release
            np.copyto(self.target_point_cloud,
                      extract_patches_2d(
                          image, patch_size=self.patch_size)[ind].reshape(
                              (self.target_ind.size, -1)),
                      casting='same_kind',
                      where=True)
            self.target_point_cloud *= np.repeat(np.sqrt(self.patch_weight),
                                                 im_ch)

        # note that "ind" are patch indices, not array indices
        if _NN_algorithm == "FLANN":
            ind, dist = self.nn.nn_index(self.target_point_cloud,
                                         self.num_neighb)
        elif _NN_algorithm == "Sklearn":
            dist, ind = self.nn.kneighbors(X=self.target_point_cloud,
                                           return_distance=True)
        elif _NN_algorithm == "FAISS":
            dist, ind = self.nn.search(self.target_point_cloud,
                                       self.num_neighb)

        if _NN_algorithm != "PatchMatch":
            ind = ind.reshape((ind.shape[0], self.num_neighb))
            dist = dist.reshape((dist.shape[0], self.num_neighb))

            # convert patch indices to array indices
            ind = self.source_ind[ind.ravel()]
            ind_y, ind_x = np.divmod(ind, im_w - 2 * pad)
            ind = (ind_x + pad) + (ind_y + pad) * im_w

            ind = np.reshape(ind, (-1, self.num_neighb))
            # A single assignment fills every neighbour column; the original
            # repeated this same statement once per neighbour.
            ind_nnf[self.target_ind, :] = ind
            dist_nnf[self.target_ind, :] = dist
        elif _NN_algorithm == "PatchMatch":
            ind_nnf, dist_nnf = self.nn.find_nnf(image, init_guess=init_guess)

        print('%f sec' % (time.time() - start))

        ind_nnf = ind_nnf.reshape((-1, self.num_neighb))
        dist_nnf = dist_nnf.reshape((-1, self.num_neighb))

        return ind_nnf, dist_nnf
Example #3
0
def find_edges(input, test, K):
    """Index ``input`` with a kNN backend and list neighbour edges for ``test``.

    The backend is selected by the module-level ``kNN_type``:
    1 = scikit-learn ball tree, 2 = SciPy KD-tree, 3 = n2 HNSW index,
    4 = pysparnn cluster index, 5 = nmslib HNSW over sparse vectors
    (``l2_sparse`` space; thread count taken from the module-level
    ``num_threads``).  Each row of ``test`` is queried for ``K + 1``
    neighbours and every hit whose position differs from the query's own
    row number becomes an edge.

    Args:
        input: Matrix of points to index.  Must provide ``todense()`` for
            backends 1 and 2.
        test: Matrix of query points, same requirement as ``input``.
        K: Neighbours wanted per query; ``K + 1`` are requested
            (presumably so a self-match can be discarded -- confirm with
            callers).

    Returns:
        List of ``(query_row, neighbour_row)`` tuples of ints.

    Raises:
        NotImplementedError: If ``kNN_type`` is not a supported backend.
        AssertionError: If a query did not yield exactly ``K + 1`` hits.
    """
    print("\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    # Only the scikit-learn and SciPy trees are given dense matrices.
    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1,
                                algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        # distance_type in ['angular', 'L2']
        tree = HnswIndex(input.shape[1], distance_type)
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        tree = ci.MultiClusterIndex(input, range(input.shape[0]))
    elif kNN_type == 5:
        import nmslib
        M, efC = 30, 100
        index_time_params = {'M': M, 'indexThreadQty': num_threads,
                             'efConstruction': efC, 'post': 0}

        # 'cosinesimil_sparse' is the alternative sparse space, see
        # https://github.com/nmslib/nmslib/blob/master/manual/spaces.md
        space_name = 'l2_sparse'
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name,
                           data_type=data_type)
        tree.addDataPointBatch(input)

        tree.createIndex(index_time_params, print_progress=True)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params, end=" ")
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100,
                              return_distance=False)
    elif kNN_type == 5:
        indices_ = tree.knnQueryBatch(test, k=K+1, num_threads=num_threads)
        # Each batch result is indexed at 0 to keep only the neighbour ids.
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError

    print(f"time={time.time()-st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        # The message now reports the bound that is actually checked (K+1);
        # it used to print K, which made failures confusing to read.
        assert len(per) == K+1, \
            f"index1={index1} len(per)={len(per)} != K+1={K + 1}"
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"\tget edges done! .... time={time.time()-st_time:.3f}s")
    return edge_list
Example #4
0
class CustomKMeans(object):
    """K-means whose label-assignment step uses a pluggable NN search.

    Supported ``method`` values:

    * ``'kmeans'`` / ``'kdtree'`` -- ``pyflann.FLANN`` with the matching
      parameter set from ``self.params``.
    * ``'exact'`` -- scikit-learn ``NearestNeighbors`` refitted on the
      centers at every iteration.
    * ``'exact-gpu'`` -- a FAISS flat L2 index moved to GPU ``gpu_idx``.

    After ``fit`` the results are available as ``centers_``, ``labels_``,
    ``sqdist_`` (squared distance of each sample to its center) and
    ``stats_`` (per-iteration cost and timings).
    """

    def __init__(
            self,
            n_centers: int = 2,
            method: str = 'kmeans',
            max_iter: int = 30,
            max_iter_no_progress: int = 10,
            tol_progress: float = 1e-3,
            random_state: Optional[int] = 17,
            nn_km_branching: int = 32,  # branching for k-means tree
            nn_km_iter: int = 20,  # number of iterations per k-means step
            nn_kd_trees: int = 32,  # number of randomized trees to use
            nn_checks: int = 75,  # number of leaves to check in the search
            nn_autotune: float = -1,  # auto-tuning of nn parameters
            apply_fix: bool = False,
            save_log: bool = True,
            gpu_idx: int = 0,
            verbose: bool = False):
        """Store the hyperparameters and create the search backend.

        Args:
            n_centers: Number of cluster centers.
            method: One of ``'kmeans'``, ``'kdtree'``, ``'exact'``,
                ``'exact-gpu'``.
            max_iter: Upper bound on the number of iterations.
            max_iter_no_progress: Number of consecutive iterations without
                sufficient improvement after which fitting stops.
            tol_progress: Minimum decrease of the cost that counts as
                progress.
            random_state: Seed passed to ``np.random.seed`` in ``fit``.
            nn_km_branching: FLANN ``branching`` (``'kmeans'`` method).
            nn_km_iter: FLANN ``iterations`` (``'kmeans'`` method).
            nn_kd_trees: FLANN ``trees`` (``'kdtree'`` method).
            nn_checks: FLANN ``checks``; also used as ``leaf_size`` of the
                scikit-learn tree for ``'exact'``.
            nn_autotune: FLANN ``target_precision``.
            apply_fix: For the FLANN methods, verify every label change
                against the previously assigned center before accepting it.
            save_log: Write the session log to disk at the end of ``fit``.
            gpu_idx: GPU device index for ``'exact-gpu'``.
            verbose: Print the per-iteration log line.

        NOTE: the annotations use the builtin ``float``; the former
        ``np.float`` alias was removed in NumPy 1.24 and broke the class
        definition itself.
        """
        assert method in {'kmeans', 'kdtree', 'exact', 'exact-gpu'}

        # FLANN hyperparameters
        # The whole list of FLANN parameters is available here:
        # https://github.com/mariusmuja/flann/blob/master/src/cpp/flann/flann.h
        self.params = {
            'kdtree': {
                'algorithm': 'kdtree',
                'num_neighbors': 1,
                'trees': nn_kd_trees,
                'checks': nn_checks,
                'target_precision': nn_autotune
            },
            'kmeans': {
                'algorithm': 'kmeans',
                'num_neighbors': 1,
                'branching': nn_km_branching,
                'iterations': nn_km_iter,
                'checks': nn_checks,
                'target_precision': nn_autotune
            },
            'exact': {
                'algorithm': 'exact'
            },
            'exact-gpu': {
                'algorithm': 'exact-gpu'
            }
        }

        # GPU parameters
        self.gpu = None
        self.gpu_idx = gpu_idx

        # Nearest neighbors search method set up
        if method in {'kmeans', 'kdtree'}:
            self.nn_search = pyflann.FLANN()
        elif method == 'exact':
            self.nn_search = NearestNeighbors(n_neighbors=1,
                                              algorithm='kd_tree',
                                              leaf_size=nn_checks,
                                              metric='minkowski',
                                              p=2,
                                              n_jobs=1)
        else:
            # 'exact-gpu': the index itself is rebuilt in every iteration.
            self.nn_search = None
            self.gpu = faiss.StandardGpuResources()

        self.n_centers = n_centers
        self.method = method
        self.max_iter = max_iter
        self.max_iter_no_progress = max_iter_no_progress
        self.tol_progress = tol_progress
        self.random_state = random_state
        self.verbose = verbose

        self._dim = None
        self._n_samples = None
        self.centers_ = None
        self.labels_ = None
        self.sqdist_ = None
        self.stats_ = None
        # Identifier built from the center count and the active parameter
        # set; also used as the log file name.
        self.session_id = f'n_centers_{self.n_centers}-' + \
            '-'.join(
                [f'{param}_{val}' for param, val
                 in self.params[self.method].items()]
            )
        self.apply_fix = apply_fix
        self.log_ = []
        self.save_log = save_log

    def __reset_stats(self):
        """Start a fresh statistics record for a new ``fit`` run."""
        self.stats_ = {
            'measure': [],  # the cost function
            'evaluation': [],  # time to evaluate labels
            'assignment': [],  # time to re-assign labels
            'n_centers': self.n_centers,
            'apply_fix': self.apply_fix,
            **self.params[self.method]
        }

    def __update_stats(self, measure: float, time_eval: float,
                       time_assign: float):
        """Append one iteration's cost and timings to ``stats_``."""
        self.stats_['measure'].append(measure)
        self.stats_['evaluation'].append(time_eval)
        self.stats_['assignment'].append(time_assign)

    def fit(self, data: np.ndarray) -> None:
        """Cluster ``data`` (cast to float32) into ``n_centers`` groups.

        Centers are initialised uniformly at random between the global
        minimum and maximum of ``data``.  Iteration stops after
        ``max_iter`` rounds, or once ``max_iter_no_progress`` consecutive
        rounds improved the mean squared distance by less than
        ``tol_progress``.  When ``save_log`` is set, the session log is
        written under ``../output/logs/CustomKMeans/``.

        Args:
            data: 2-D array, one sample per row.
        """
        data = data.astype(np.float32)
        self._n_samples, self._dim = data.shape

        np.random.seed(self.random_state)

        # Initialize centers (we use uniform random initialization)
        self.centers_ = np.random.uniform(
            np.min(data),
            np.max(data),
            size=self.n_centers * self._dim
        ).reshape(self.n_centers, self._dim).astype(data.dtype)

        # Start fitting
        self.__reset_stats()
        self.log_.append(
            f'Session id: {self.session_id} '
            f'at {datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")}')
        progress = self.max_iter_no_progress
        for it in range(self.max_iter):

            tic_it = time()

            # Evaluate labels and squared distances
            if self.method in {'kmeans', 'kdtree'}:
                labels_, sqdist = self.nn_search.nn(self.centers_, data,
                                                    **self.params[self.method])
                if self.apply_fix and it > 0:
                    # Always adopt the fresh distances.  They used to be
                    # stored only when some label changed, which left
                    # ``sqdist_`` (and the cost) stale relative to the
                    # moved centers on iterations without reassignment.
                    self.sqdist_ = sqdist
                    reassigned = np.where(labels_ != self.labels_)[0]
                    if len(reassigned):
                        # Squared distance of each reassigned sample to
                        # the center it was assigned to before.
                        sqdist_checked = np.linalg.norm(
                            data[reassigned, :] -
                            self.centers_[self.labels_[reassigned], :],
                            axis=1)**2
                        # Accept a change only if the new center is not
                        # farther away than the old one.
                        correct = sqdist[reassigned] <= sqdist_checked.ravel()
                        incorrect = ~correct
                        to_assign = reassigned[correct]
                        to_leave = reassigned[incorrect]
                        if len(to_assign):
                            self.labels_[to_assign] = labels_[to_assign]
                        if len(to_leave):
                            self.sqdist_[to_leave] = sqdist_checked[incorrect]
                else:
                    self.labels_, self.sqdist_ = labels_, sqdist
            elif self.method == 'exact':
                self.nn_search.fit(self.centers_)
                self.sqdist_, self.labels_ = self.nn_search.kneighbors(
                    X=data, return_distance=True)
                # The returned distances are squared here before storing.
                self.sqdist_, self.labels_ = \
                    self.sqdist_.ravel()**2, self.labels_.ravel()
            else:
                index_flat = faiss.IndexFlatL2(self._dim)
                self.nn_search = faiss.index_cpu_to_gpu(
                    self.gpu, self.gpu_idx, index_flat)
                self.nn_search.add(self.centers_)
                self.sqdist_, self.labels_ = self.nn_search.search(data, 1)
                self.sqdist_, self.labels_ = \
                    self.sqdist_.ravel(), self.labels_.ravel()

            toc = time()
            t1 = toc - tic_it

            # Update centers
            tic = time()
            for label in range(self.n_centers):
                idx = np.where(self.labels_ == label)[0]
                if len(idx):
                    self.centers_[label] = data[idx].mean(axis=0)
                else:
                    # If the cluster is empty, move its center anyway
                    self.centers_[label] = self.centers_.mean(axis=0)
            toc_it = time()
            t2 = toc_it - tic

            # Print progress
            self.log_.append(f'\t--> iteration {it} '
                             f'has been finished -- {toc_it - tic_it:.3e}s')
            if self.verbose:
                print(self.log_[-1])

            # Check convergence (cost = mean squared distance to center)
            p = np.bincount(self.labels_, weights=self.sqdist_).sum()
            p /= self._n_samples
            self.__update_stats(p, t1, t2)
            if it >= 1 and self.stats_['measure'][-2] - \
                    self.stats_['measure'][-1] < self.tol_progress:
                progress -= 1
                if not progress:
                    print(f'\nIteration {it}: '
                          f'no progress during '
                          f'last {self.max_iter_no_progress} iterations')
                    break
            else:
                progress = self.max_iter_no_progress

        # Save the log
        self.log_.append(self.time_report())
        if self.save_log:
            path = Path(f'../output/logs/'
                        f'{self.__class__.__name__}/').resolve()
            path.mkdir(parents=True, exist_ok=True)
            with open(f'{str(path)}/{self.session_id}.txt', 'w') as file:
                for line in self.log_:
                    file.write(f'{line}\n')

    def fit_predict(self, data: np.ndarray) -> np.ndarray:
        """Run ``fit`` on ``data`` and return the resulting ``labels_``."""
        self.fit(data)

        return self.labels_

    def time_report(self) -> str:
        """Return mean and std of the per-iteration timings as text.

        Returns:
            A multi-line summary, or ``'No statistics are available.'``
            when ``fit`` has not been run yet.
        """
        if self.stats_ is None:
            rep = 'No statistics are available.'
        else:
            t_eval = np.asarray(self.stats_['evaluation'])
            t_assign = np.asarray(self.stats_['assignment'])
            rep = f'\nEvaluation time per iteration:\n' \
                f'\tAVG. = {t_eval.mean()}s\n' \
                f'\tSTD. = {t_eval.std()}s\n' \
                f'Assignment time per iteration:\n' \
                f'\tAVG. = {t_assign.mean()}s\n\tSTD. = {t_assign.std()}s'
        return rep
Example #5
0
def find_edges(input, test, K, cluster_ids, query_ids):
    """Index ``input`` with a kNN backend and list id-mapped neighbour edges.

    The backend is selected by the module-level ``kNN_type``:
    1 = scikit-learn ball tree, 2 = SciPy KD-tree, 3 = n2 HNSW index,
    4 = pysparnn cluster index, 5 = nmslib HNSW over sparse vectors
    (``cosinesimil_sparse`` space).  Hits are translated to external ids
    through ``query_ids`` (row of ``test``) and ``cluster_ids`` (row of
    ``input``).

    Args:
        input: Matrix of points to index.  Must provide ``todense()`` for
            backends 1 and 2.
        test: Matrix of query points, same requirement as ``input``.
        K: Neighbours wanted per query.  Backends 1-4 request ``K + 1``
            hits while backend 5 requests ``K``.  NOTE(review): confirm
            this difference is intended.
        cluster_ids: Lookup from a row position in ``input`` to the id
            emitted as the edge's second element.
        query_ids: Lookup from a row position in ``test`` to the id
            emitted as the edge's first element.

    Returns:
        List of ``(query_id, cluster_id)`` tuples.

    Raises:
        NotImplementedError: If ``kNN_type`` is not a supported backend.
    """
    print("\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    # Only the scikit-learn and SciPy trees are given dense matrices.
    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1,
                                algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        # distance_type in ['angular', 'L2']
        tree = HnswIndex(input.shape[1], distance_type)
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        tree = ci.MultiClusterIndex(input, range(input.shape[0]))
    elif kNN_type == 5:
        import nmslib
        # num_threads is reused by the query step of this backend below.
        M, efC, num_threads = 30, 100, 10
        index_time_params = {'M': M, 'indexThreadQty': num_threads,
                             'efConstruction': efC, 'post': 0}
        space_name = 'cosinesimil_sparse'
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name,
                           data_type=data_type)

        print(f"type(input) = {type(input)} type(test)={type(test)}", end=" ")

        tree.addDataPointBatch(input)

        tree.createIndex(index_time_params)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params)
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100,
                              return_distance=False)
    elif kNN_type == 5:
        indices_ = tree.knnQueryBatch(test, k=K, num_threads=num_threads)
        # Each batch result is indexed at 0 to keep only the neighbour ids.
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError

    print(f"time={time.time()-st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            # NOTE(review): positions in ``test`` and ``input`` are compared
            # directly, which only identifies a self-match when both
            # matrices share row order -- confirm with callers.
            if index1 != index2:
                # Fix: the parameter is ``cluster_ids``; the original used
                # the undefined name ``center_ids`` and raised NameError.
                edge_list.append((query_ids[index1], cluster_ids[index2]))
    print(f"\tdone! .... time={time.time()-st_time:.3f}s")
    return edge_list