def find_edges(input, test, K):
    """Index ``input`` with a kNN backend and list neighbour edges for ``test``.

    The backend is chosen by the module-level ``kNN_type``:
    1 = scikit-learn ball tree, 2 = SciPy KD-tree, 3 = n2 HNSW,
    4 = pysparnn cluster index.  Every backend is asked for ``K + 1``
    neighbours per query row.

    Args:
        input: matrix whose rows are indexed; ``.todense()`` is called on it
            whenever ``kNN_type <= 3``.
        test: matrix whose rows are queried (densified under the same rule).
        K: number of neighbours wanted per row.

    Returns:
        A list of ``(query_row, neighbour_row)`` tuples; pairs whose two
        indices are equal are skipped.

    Raises:
        NotImplementedError: if ``kNN_type`` is not one of the backends above.
    """
    print(f"building kNN classifier ... ", end=" ")
    started = time.time()

    # Densify before dispatching, exactly for kNN_type <= 3.
    if kNN_type <= 3:
        input, test = input.todense(), test.todense()

    # ---- build the index -------------------------------------------------
    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        knn_index = NearestNeighbors(n_neighbors=K + 1,
                                     algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        knn_index = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        # distance_type in ['angular', 'L2']
        knn_index = HnswIndex(input.shape[1], distance_type)
        for row in tqdm(range(input.shape[0])):
            knn_index.add_data(input[row, :])
        knn_index.build(n_threads=10)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        knn_index = ci.MultiClusterIndex(input, range(input.shape[0]))
    else:
        raise NotImplementedError
    print(f"time={time.time() - started:.3f}s")

    # ---- query the index -------------------------------------------------
    print("finding indices ... ", end=" ")
    if kNN_type == 1:
        _, neighbours = knn_index.kneighbors(test)
    elif kNN_type == 2:
        _, neighbours = knn_index.query(test, k=K + 1)
    elif kNN_type == 3:
        neighbours = [
            knn_index.search_by_vector(test[row, :], k=K + 1)
            for row in tqdm(range(test.shape[0]))
        ]
    else:
        neighbours = knn_index.search(test, k=K + 1, k_clusters=100,
                                      return_distance=False)
    print(f"time={time.time() - started:.3f}s")

    # ---- flatten neighbour lists into edges, dropping equal-index pairs ---
    edge_list = [
        (query_row, neighbour)
        for query_row, hits in enumerate(neighbours)
        for neighbour in map(int, hits)
        if neighbour != query_row
    ]
    print(f"done! .... time={time.time() - started:.3f}s")
    return edge_list
class NNF():
    """Nearest-neighbour field between target and source patches of an image.

    The search backend is selected by the module-level ``_NN_algorithm``
    string: ``"FLANN"``, ``"Sklearn"``, ``"FAISS"`` or ``"PatchMatch"``.
    For the first three, source patches are flattened into a point cloud and
    indexed at construction time; ``calculate_nnf`` then queries that index
    with the target patches of a (possibly updated) image.
    """

    def __init__(self, image, target_mask, source_mask, patch_size=(11, 11),
                 patch_weight=None, num_neighbors=1):
        """Build the search structure for the configured backend.

        Args:
            image: array with shape ``(height, width, channels)``.
            target_mask: mask of the pixels to be matched.
            source_mask: mask of the pixels that may be matched against.
            patch_size: ``(rows, cols)`` of a patch.
            patch_weight: optional per-pixel patch weights; defaults to all
                ones of shape ``patch_size``.
            num_neighbors: neighbours returned per target patch.
        """
        _, im_w, im_ch = image.shape
        # A caller-supplied weight used to be dropped, leaving
        # ``self.patch_weight`` undefined; store it in both cases.
        if patch_weight is None:
            self.patch_weight = np.ones(patch_size, dtype=_im_dtype)
        else:
            self.patch_weight = np.asarray(patch_weight)
        self.patch_size = patch_size
        self.num_neighb = num_neighbors

        print("Build NNF index: ", end=" ")
        start = time.time()

        if _NN_algorithm == "PatchMatch":
            self.nn = pm.PatchMatch(target_mask, source_mask,
                                    patch_size=patch_size,
                                    lambdas=np.ones_like(image,
                                                         dtype=_im_dtype))
        else:
            # The indices are treated as flat row-major pixel positions
            # (y * im_w + x) by the divmod below.
            # NOTE(review): presumably op.masked_indices returns exactly
            # that for the set pixels of a mask -- confirm.
            self.source_ind = op.masked_indices(source_mask)
            self.target_ind = op.masked_indices(target_mask)

            # convert array indices to patch indices
            # NOTE(review): only patch_size[0] is used for both axes, so the
            # arithmetic appears to assume square, odd-sized patches.
            pad = patch_size[0] // 2
            ind_y, ind_x = np.divmod(self.source_ind, im_w)
            self.source_ind = (ind_x - pad) + (ind_y - pad) * (im_w - 2 * pad)

            # One row per source patch, scaled by sqrt(weight) with each
            # weight repeated once per channel.
            patches = extract_patches_2d(image, patch_size=patch_size)
            source_point_cloud = (
                patches[self.source_ind].reshape((self.source_ind.size, -1))
                * np.repeat(np.sqrt(self.patch_weight), im_ch)
            )

            # need this because of FLANN bug (?) with memory release
            self.target_point_cloud = np.zeros(
                (self.target_ind.size, source_point_cloud.shape[-1]),
                dtype=_im_dtype)

            if _NN_algorithm == "FLANN":
                self.nn = flann.FLANN()
                self.nn.build_index(source_point_cloud, algorithm="kdtree",
                                    trees=1)
            elif _NN_algorithm == "Sklearn":
                self.nn = NearestNeighbors(n_neighbors=num_neighbors,
                                           algorithm='kd_tree',
                                           metric='minkowski', n_jobs=-1)
                self.nn.fit(X=source_point_cloud)
            elif _NN_algorithm == "FAISS":
                self.nn = faiss.IndexHNSWFlat(source_point_cloud.shape[1], 50)
                self.nn.add(source_point_cloud)

        print('%f sec' % (time.time() - start))

    def calculate_nnf(self, image, init_guess=None):
        """Query the index with the target patches of ``image``.

        Args:
            image: array with shape ``(height, width, channels)``.
            init_guess: forwarded to the PatchMatch backend only.

        Returns:
            ``(ind_nnf, dist_nnf)``, both reshaped to
            ``(-1, num_neighbors)``.  For the index-based backends the rows
            cover every pixel of the image; rows outside the target set stay
            zero.

        Raises:
            ValueError: if ``_NN_algorithm`` names no supported backend.
        """
        im_h, im_w, im_ch = image.shape
        print("Query NNF index: ", end=" ")
        start = time.time()

        if _NN_algorithm == "PatchMatch":
            ind_nnf, dist_nnf = self.nn.find_nnf(image, init_guess=init_guess)
        else:
            ind_nnf = np.zeros((im_h * im_w, self.num_neighb), dtype='int32')
            dist_nnf = np.zeros((im_h * im_w, self.num_neighb))

            # convert array indices to patch indices
            pad = self.patch_size[0] // 2
            ind_y, ind_x = np.divmod(self.target_ind, im_w)
            target_patch_ind = (ind_x - pad) + (ind_y - pad) * (im_w - 2 * pad)

            # need this because of FLANN bug (?) with memory release:
            # refill the preallocated buffer instead of allocating anew.
            np.copyto(
                self.target_point_cloud,
                extract_patches_2d(image, patch_size=self.patch_size)[
                    target_patch_ind].reshape((self.target_ind.size, -1)),
                casting='same_kind', where=True)
            self.target_point_cloud *= np.repeat(np.sqrt(self.patch_weight),
                                                 im_ch)

            # note that the returned "nn_ind" index rows of the source point
            # cloud, not pixels of the image
            if _NN_algorithm == "FLANN":
                nn_ind, dist = self.nn.nn_index(self.target_point_cloud,
                                                self.num_neighb)
            elif _NN_algorithm == "Sklearn":
                dist, nn_ind = self.nn.kneighbors(X=self.target_point_cloud,
                                                  return_distance=True)
            elif _NN_algorithm == "FAISS":
                dist, nn_ind = self.nn.search(self.target_point_cloud,
                                              self.num_neighb)
            else:
                raise ValueError(
                    f"unsupported _NN_algorithm: {_NN_algorithm!r}")

            nn_ind = nn_ind.reshape((nn_ind.shape[0], self.num_neighb))
            dist = dist.reshape((dist.shape[0], self.num_neighb))

            # convert patch indices to array indices
            src_patch = self.source_ind[nn_ind.ravel()]
            src_y, src_x = np.divmod(src_patch, im_w - 2 * pad)
            src_flat = (src_x + pad) + (src_y + pad) * im_w

            # A single assignment fills every neighbour column at once.
            ind_nnf[self.target_ind, :] = np.reshape(src_flat,
                                                     (-1, self.num_neighb))
            dist_nnf[self.target_ind, :] = dist

        print('%f sec' % (time.time() - start))
        ind_nnf = ind_nnf.reshape((-1, self.num_neighb))
        dist_nnf = dist_nnf.reshape((-1, self.num_neighb))
        return ind_nnf, dist_nnf
def find_edges(input, test, K):
    """Index ``input`` with a kNN backend and list neighbour edges for ``test``.

    The backend is chosen by the module-level ``kNN_type``:
    1 = scikit-learn ball tree, 2 = SciPy KD-tree, 3 = n2 HNSW,
    4 = pysparnn cluster index, 5 = nmslib HNSW over sparse vectors.
    Every backend is asked for ``K + 1`` neighbours per query row.  The
    nmslib backend reads the module-level ``num_threads``; the n2 backend
    reads the module-level ``distance_type``.

    Args:
        input: matrix whose rows are indexed; ``.todense()`` is called on it
            for backends 1 and 2.
        test: matrix whose rows are queried (densified under the same rule).
        K: number of neighbours wanted per row.

    Returns:
        A list of ``(query_row, neighbour_row)`` tuples; pairs whose two
        indices are equal are skipped.

    Raises:
        NotImplementedError: if ``kNN_type`` is not one of the backends above.
        AssertionError: if a query row did not receive exactly ``K + 1``
            neighbours.
    """
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    # ---- build the index -------------------------------------------------
    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1,
                                algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        # distance_type in ['angular', 'L2']
        tree = HnswIndex(input.shape[1], distance_type)
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        tree = ci.MultiClusterIndex(input, range(input.shape[0]))
    elif kNN_type == 5:
        import nmslib
        M, efC = 30, 100
        index_time_params = {'M': M, 'indexThreadQty': num_threads,
                             'efConstruction': efC, 'post': 0}
        # https://github.com/nmslib/nmslib/blob/master/manual/spaces.md
        space_names = ['l2_sparse', 'cosinesimil_sparse']
        space_name = space_names[0]
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name,
                           data_type=data_type)
        tree.addDataPointBatch(input)
        tree.createIndex(index_time_params, print_progress=True)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params, end=" ")
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    # ---- query the index -------------------------------------------------
    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = [tree.search_by_vector(test[i, :], k=K + 1)
                   for i in tqdm(range(test.shape[0]))]
    elif kNN_type == 4:
        indices = tree.search(test, k=K + 1, k_clusters=100,
                              return_distance=False)
    elif kNN_type == 5:
        # Each batch result is indexed at [0] to keep only its first element
        # (the neighbour ids).
        indices = [hit[0] for hit in
                   tree.knnQueryBatch(test, k=K + 1, num_threads=num_threads)]
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    # ---- flatten neighbour lists into edges ------------------------------
    edge_list = []
    for index1, per in enumerate(indices):
        # The message now reports the count that is actually required
        # (K + 1); it used to print K.
        assert len(per) == K + 1, \
            f"index1={index1} len(per)={len(per)} != K+1={K + 1}"
        for index2 in map(int, per):
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"\tget edges done! .... time={time.time()-st_time:.3f}s")
    return edge_list
class CustomKMeans(object):
    """Lloyd-style k-means whose assignment step uses a pluggable NN search.

    ``method`` selects the search used to assign samples to centers:
    ``'kmeans'`` / ``'kdtree'`` (pyflann), ``'exact'`` (scikit-learn
    ``NearestNeighbors``) or ``'exact-gpu'`` (faiss flat L2 index moved to
    a GPU).  After ``fit`` the results are available as ``centers_``,
    ``labels_``, ``sqdist_`` and ``stats_``.
    """

    def __init__(
            self,
            n_centers: int = 2,
            method: str = 'kmeans',
            max_iter: int = 30,
            max_iter_no_progress: int = 10,
            tol_progress: float = 1e-3,
            random_state: Optional[int] = 17,
            nn_km_branching: int = 32,  # branching for k-means tree
            nn_km_iter: int = 20,  # number of iterations per k-means step
            nn_kd_trees: int = 32,  # number of randomized trees to use
            nn_checks: int = 75,  # number of leaves to check in the search
            nn_autotune: float = -1,  # auto-tuning of nn parameters
            apply_fix: bool = False,
            save_log: bool = True,
            gpu_idx: int = 0,
            verbose: bool = False):
        """Configure the clustering run.

        Args:
            n_centers: number of cluster centers.
            method: one of ``'kmeans'``, ``'kdtree'``, ``'exact'``,
                ``'exact-gpu'``.
            max_iter: upper bound on iterations.
            max_iter_no_progress: consecutive low-progress iterations
                tolerated before stopping.
            tol_progress: minimum decrease of the cost counted as progress.
            random_state: seed passed to ``np.random.seed`` in ``fit``.
            nn_km_branching, nn_km_iter, nn_kd_trees, nn_checks,
            nn_autotune: FLANN parameters (``nn_checks`` is also used as
                ``leaf_size`` for the ``'exact'`` method).
            apply_fix: verify label changes reported by FLANN against the
                previous assignment before accepting them.
            save_log: write the session log to disk at the end of ``fit``.
            gpu_idx: device index used by the ``'exact-gpu'`` method.
            verbose: print the per-iteration log line.

        The annotations use the builtin ``float``: the ``np.float`` alias is
        no longer available in current NumPy releases and would fail when
        the class is defined.
        """
        assert method in {'kmeans', 'kdtree', 'exact', 'exact-gpu'}

        # FLANN hyperparameters
        # The whole list of FLANN parameters is available here:
        # https://github.com/mariusmuja/flann/blob/master/src/cpp/flann/flann.h
        self.params = {
            'kdtree': {
                'algorithm': 'kdtree',
                'num_neighbors': 1,
                'trees': nn_kd_trees,
                'checks': nn_checks,
                'target_precision': nn_autotune
            },
            'kmeans': {
                'algorithm': 'kmeans',
                'num_neighbors': 1,
                'branching': nn_km_branching,
                'iterations': nn_km_iter,
                'checks': nn_checks,
                'target_precision': nn_autotune
            },
            'exact': {
                'algorithm': 'exact'
            },
            'exact-gpu': {
                'algorithm': 'exact-gpu'
            }
        }

        # GPU parameters
        self.gpu = None
        self.gpu_idx = gpu_idx

        # Nearest neighbors search method set up
        if method in {'kmeans', 'kdtree'}:
            self.nn_search = pyflann.FLANN()
        elif method == 'exact':
            self.nn_search = NearestNeighbors(n_neighbors=1,
                                              algorithm='kd_tree',
                                              leaf_size=nn_checks,
                                              metric='minkowski',
                                              p=2,
                                              n_jobs=1)
        else:
            # 'exact-gpu': the index itself is (re)built inside fit().
            self.nn_search = None
            self.gpu = faiss.StandardGpuResources()

        self.n_centers = n_centers
        self.method = method
        self.max_iter = max_iter
        self.max_iter_no_progress = max_iter_no_progress
        self.tol_progress = tol_progress
        self.random_state = random_state
        self.verbose = verbose

        self._dim = None
        self._n_samples = None
        self.centers_ = None
        self.labels_ = None
        self.sqdist_ = None
        self.stats_ = None

        # Also used as the log file name in fit().
        self.session_id = f'n_centers_{self.n_centers}-' + \
                          '-'.join(
                              [f'{param}_{val}'
                               for param, val
                               in self.params[self.method].items()]
                          )
        self.apply_fix = apply_fix
        self.log_ = []
        self.save_log = save_log

    def __reset_stats(self):
        """Start a fresh ``stats_`` record for a new ``fit`` run."""
        self.stats_ = {
            'measure': [],  # the cost function
            'evaluation': [],  # time to evaluate labels
            'assignment': [],  # time to re-assign labels
            'n_centers': self.n_centers,
            'apply_fix': self.apply_fix,
            **self.params[self.method]
        }

    def __update_stats(self, measure: float, time_eval: float,
                       time_assign: float):
        """Append one iteration's cost and timings to ``stats_``."""
        self.stats_['measure'].append(measure)
        self.stats_['evaluation'].append(time_eval)
        self.stats_['assignment'].append(time_assign)

    def fit(self, data: np.ndarray) -> None:
        """Cluster ``data`` (cast to float32, shape ``(n_samples, dim)``).

        Iterates label evaluation and center updates until ``max_iter`` is
        reached or the cost fails to decrease by at least ``tol_progress``
        for ``max_iter_no_progress`` consecutive iterations.  When
        ``save_log`` is set, the log is written under
        ``../output/logs/<class name>/``.
        """
        data = data.astype(np.float32)
        self._n_samples, self._dim = data.shape
        np.random.seed(self.random_state)

        # Initialize centers (we use uniform random initialization)
        self.centers_ = np.random.uniform(
            np.min(data), np.max(data),
            size=self.n_centers * self._dim
        ).reshape(self.n_centers, self._dim).astype(data.dtype)

        # Start fitting
        self.__reset_stats()
        self.log_.append(
            f'Session id: {self.session_id} '
            f'at {datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")}')
        progress = self.max_iter_no_progress

        for it in range(self.max_iter):
            tic_it = time()

            # Evaluate labels and squared distances
            if self.method in {'kmeans', 'kdtree'}:
                labels_, sqdist = self.nn_search.nn(
                    self.centers_, data, **self.params[self.method])
                if self.apply_fix and it > 0:
                    reassigned = np.where(labels_ != self.labels_)[0]
                    if len(reassigned):
                        # Squared distance of every re-assigned sample to
                        # the center it was attached to before.
                        sqdist_checked = np.linalg.norm(
                            data[reassigned, :]
                            - self.centers_[self.labels_[reassigned], :],
                            axis=1) ** 2
                        correct = sqdist[reassigned] <= sqdist_checked.ravel()
                        incorrect = ~correct
                        to_assign = reassigned[correct]
                        to_leave = reassigned[incorrect]
                        if len(to_assign):
                            self.labels_[to_assign] = labels_[to_assign]
                        if len(to_leave):
                            # Rejected moves keep the old label, so record
                            # the distance to that old center.
                            sqdist[to_leave] = sqdist_checked[incorrect]
                    # Always take the distances of this iteration; otherwise
                    # the convergence measure is computed from stale values
                    # whenever no label change is accepted.
                    self.sqdist_ = sqdist
                else:
                    self.labels_, self.sqdist_ = labels_, sqdist
            elif self.method == 'exact':
                self.nn_search.fit(self.centers_)
                self.sqdist_, self.labels_ = self.nn_search.kneighbors(
                    X=data, return_distance=True)
                # The returned distances are squared here so that sqdist_
                # holds squared values like in the other branches.
                self.sqdist_, self.labels_ = \
                    self.sqdist_.ravel() ** 2, self.labels_.ravel()
            else:
                index_flat = faiss.IndexFlatL2(self._dim)
                self.nn_search = faiss.index_cpu_to_gpu(
                    self.gpu, self.gpu_idx, index_flat)
                self.nn_search.add(self.centers_)
                self.sqdist_, self.labels_ = self.nn_search.search(data, 1)
                self.sqdist_, self.labels_ = \
                    self.sqdist_.ravel(), self.labels_.ravel()
            toc = time()
            t1 = toc - tic_it

            # Update centers
            tic = time()
            for label in range(self.n_centers):
                idx = np.where(self.labels_ == label)[0]
                if len(idx):
                    self.centers_[label] = data[idx].mean(axis=0)
                else:
                    # If the cluster is empty, move its center anyway
                    self.centers_[label] = self.centers_.mean(axis=0)
            toc_it = time()
            t2 = toc_it - tic

            # Print progress
            self.log_.append(f'\t--> iteration {it} '
                             f'has been finished -- {toc_it - tic_it:.3e}s')
            if self.verbose:
                print(self.log_[-1])

            # Check convergence
            p = np.bincount(self.labels_, weights=self.sqdist_).sum()
            p /= self._n_samples
            self.__update_stats(p, t1, t2)
            if it >= 1 and self.stats_['measure'][-2] - \
                    self.stats_['measure'][-1] < self.tol_progress:
                progress -= 1
                if not progress:
                    print(f'\nIteration {it}: '
                          f'no progress during '
                          f'last {self.max_iter_no_progress} iterations')
                    break
            else:
                # Enough improvement: restart the patience counter.
                progress = self.max_iter_no_progress

        # Save the log
        self.log_.append(self.time_report())
        if self.save_log:
            path = Path(f'../output/logs/'
                        f'{self.__class__.__name__}/').resolve()
            path.mkdir(parents=True, exist_ok=True)
            with open(f'{str(path)}/{self.session_id}.txt', 'w') as file:
                for line in self.log_:
                    file.write(f'{line}\n')

    def fit_predict(self, data: np.ndarray) -> np.ndarray:
        """Run ``fit`` on ``data`` and return the resulting ``labels_``."""
        self.fit(data)
        return self.labels_

    def time_report(self) -> str:
        """Return mean/std of the per-iteration timings as readable text."""
        if self.stats_ is None:
            rep = 'No statistics are available.'
        else:
            t_eval = np.asarray(self.stats_['evaluation'])
            t_assign = np.asarray(self.stats_['assignment'])
            rep = f'\nEvaluation time per iteration:\n' \
                  f'\tAVG. = {t_eval.mean()}s\n' \
                  f'\tSTD. = {t_eval.std()}s\n' \
                  f'Assignment time per iteration:\n' \
                  f'\tAVG. = {t_assign.mean()}s\n\tSTD. = {t_assign.std()}s'
        return rep
def find_edges(input, test, K, cluster_ids, query_ids):
    """Index ``input`` with a kNN backend and list id-level edges for ``test``.

    The backend is chosen by the module-level ``kNN_type``:
    1 = scikit-learn ball tree, 2 = SciPy KD-tree, 3 = n2 HNSW,
    4 = pysparnn cluster index, 5 = nmslib HNSW (cosine, sparse vectors).

    Args:
        input: matrix whose rows are indexed; ``.todense()`` is called on it
            for backends 1 and 2.
        test: matrix whose rows are queried (densified under the same rule).
        K: number of neighbours wanted per row.
        cluster_ids: indexable mapping a row position of ``input`` to the id
            reported as the edge's second element.
        query_ids: indexable mapping a row position of ``test`` to the id
            reported as the edge's first element.

    Returns:
        A list of ``(query_id, cluster_id)`` tuples.

    Raises:
        NotImplementedError: if ``kNN_type`` is not one of the backends above.
    """
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    # ---- build the index -------------------------------------------------
    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1,
                                algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        # distance_type in ['angular', 'L2']
        tree = HnswIndex(input.shape[1], distance_type)
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        tree = ci.MultiClusterIndex(input, range(input.shape[0]))
    elif kNN_type == 5:
        import nmslib
        M, efC, num_threads = 30, 100, 10
        index_time_params = {'M': M, 'indexThreadQty': num_threads,
                             'efConstruction': efC, 'post': 0}
        space_name = 'cosinesimil_sparse'
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name,
                           data_type=data_type)
        print(f"type(input) = {type(input)} type(test)={type(test)}", end=" ")
        tree.addDataPointBatch(input)
        tree.createIndex(index_time_params)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params)
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    # ---- query the index -------------------------------------------------
    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = [tree.search_by_vector(test[i, :], k=K + 1)
                   for i in tqdm(range(test.shape[0]))]
    elif kNN_type == 4:
        indices = tree.search(test, k=K + 1, k_clusters=100,
                              return_distance=False)
    elif kNN_type == 5:
        # NOTE(review): this backend asks for K neighbours while the others
        # ask for K + 1 -- kept as is; confirm which one is intended.
        # Each batch result is indexed at [0] to keep only its first element
        # (the neighbour ids).
        indices = [hit[0] for hit in
                   tree.knnQueryBatch(test, k=K, num_threads=num_threads)]
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    # ---- translate row positions into ids --------------------------------
    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in map(int, per):
            # NOTE(review): index1 is a row of ``test`` and index2 a row of
            # ``input``; skipping equal positions is kept from the original
            # but only makes sense if both share one numbering -- confirm.
            if index1 != index2:
                # The parameter is ``cluster_ids``; the previous reference
                # to an undefined ``center_ids`` raised NameError here.
                edge_list.append((query_ids[index1], cluster_ids[index2]))
    print(f"\tdone! .... time={time.time()-st_time:.3f}s")
    return edge_list