import ctypes
from functools import partial
from multiprocessing import Pool, RawArray, cpu_count

import numpy as np
from scipy.sparse import issparse, lil_matrix

# Package-local helpers; module paths assume a hub_toolbox-style layout.
from hub_toolbox import io
from hub_toolbox.htlogging import ConsoleLogging

# NOTE: _ls_load_shared_data, _ls_calculate_r, and _ls_calculate_sec_dist
# are module-level multiprocessing workers defined elsewhere in this module.


def local_scaling(D: np.ndarray, k: int = 7, metric: str = 'distance',
                  test_ind: np.ndarray = None, n_jobs: int = 1):
    """Transform a distance matrix with Local Scaling.

    Transforms the given distance matrix into a new one using local scaling
    [1]_ with the given `k`-th nearest neighbor. Two types of local scaling
    methods are implemented: the original one and NICDM. Both reduce hubness
    in distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``.

    test_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Rescale all distances.
        - ndarray : Hold out points indexed in this array as test set.

    n_jobs : int, optional (default: 1)
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs.

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if n_jobs == -1:
        n_jobs = cpu_count()
    if metric == 'similarity':
        kth = n - k
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
        if sparse and n_jobs != 1:
            log.warning("Parallel processing not implemented for sparse "
                        "matrices. Using single process instead.")
            n_jobs = 1
    else:  # metric == 'distance'
        kth = k - 1
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")
    D = np.copy(D)

    if test_ind is None:
        train_ind = slice(0, n)  # take all
    else:
        train_ind = np.setdiff1d(np.arange(n), test_ind)

    if sparse:
        r = np.zeros(n)
        for i in range(n):
            di = D[i, train_ind].toarray().ravel()
            di[i] = exclude
            r[i] = np.partition(di, kth=kth)[kth]
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        np.fill_diagonal(D, exclude)
        if n_jobs > 1:
            r_ctype = RawArray(ctypes.c_double, n)
            r = np.frombuffer(r_ctype, dtype=np.float64)
            with Pool(processes=n_jobs,
                      initializer=_ls_load_shared_data,
                      initargs=(D, train_ind, r, r_ctype)) as pool:
                for _ in pool.imap(func=partial(_ls_calculate_r, kth=kth),
                                   iterable=range(n)):
                    pass  # results handled within func
        else:
            r = np.partition(D[:, train_ind], kth=kth)[:, kth]

    if sparse or n_jobs == 1:
        if not sparse:  # sparse branch above already created D_ls
            D_ls = np.zeros_like(D)
        for i in range(n):
            # vectorized inner loop: calc only triu part
            tmp = np.empty(n - i)
            tmp[0] = self_tmp_value
            if metric == 'similarity':
                if sparse and nnz[i] <= k:  # Don't rescale if there are
                    tmp[1:] = np.nan        # too few neighbors in row
                else:
                    # densify the row slice before elementwise operations
                    di = D[i, i+1:].toarray().ravel() if sparse else D[i, i+1:]
                    tmp[1:] = np.exp(-1 * di**2 / (r[i] * r[i+1:]))
            else:
                tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            D_ls[i, i:] = tmp
        # copy triu to tril -> symmetric matrix (diag=zeros)
        # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
        D_ls += D_ls.T
    else:
        D_ls_ctype = RawArray(ctypes.c_double, D.size)
        D_ls = np.frombuffer(D_ls_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_ls_load_shared_data,
                  initargs=(D, train_ind, r, r_ctype,
                            D_ls, D_ls_ctype)) as pool:
            for _ in pool.imap(func=partial(_ls_calculate_sec_dist, n=n,
                                            metric=metric,
                                            self_tmp_value=self_tmp_value),
                               iterable=range(n)):
                pass  # results handled within func
        # triu is copied to tril within func

    if sparse:
        for i, nz in enumerate(nnz):
            if nz <= k:  # too few neighbors
                D_ls[i, :] = D[i, :]
        return D_ls.tocsr()
    else:
        np.fill_diagonal(D_ls, self_value)
        return D_ls
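# ----------------------------------------------------------------------
# Illustrative usage sketch (not part of the public API): rescale a small
# Euclidean distance matrix with `local_scaling` and check the result
# against the closed-form transform of the dense 'distance' code path,
#     D_ls[i, j] = 1 - exp(-D[i, j]**2 / (r[i] * r[j])),
# where r[i] is the distance from point i to its k-th nearest neighbor.
# The random data, `k`, and the scipy pdist/squareform helpers are example
# assumptions, not fixed by this module.
def _demo_local_scaling():  # pragma: no cover
    from scipy.spatial.distance import pdist, squareform
    rng = np.random.RandomState(0)
    X = rng.rand(50, 5)                    # 50 points in 5 dimensions
    D = squareform(pdist(X))               # 50 x 50 Euclidean distances
    D_ls = local_scaling(D, k=7, metric='distance')
    assert np.allclose(D_ls, D_ls.T)       # symmetric secondary distances
    assert np.allclose(np.diag(D_ls), 0.)  # self-distances are zero
    # Reference recomputation of the same transform in plain NumPy:
    D_ref = squareform(pdist(X))
    np.fill_diagonal(D_ref, np.inf)        # exclude self-distances
    r = np.partition(D_ref, kth=6)[:, 6]   # k-th NN distance (k=7)
    D_ref = 1. - np.exp(-D_ref**2 / (r[:, None] * r[None, :]))
    np.fill_diagonal(D_ref, 0.)
    assert np.allclose(D_ls, D_ref)
    return D_ls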
def local_scaling_sample(D: np.ndarray, k: int = 7, metric: str = 'distance',
                         train_ind: np.ndarray = None,
                         test_ind: np.ndarray = None):
    """Transform a distance matrix with Local Scaling.

    --- DRAFT version ---

    Transforms the given distance matrix into a new one using local scaling
    [1]_ with the given `k`-th nearest neighbor. Two types of local scaling
    methods are implemented: the original one and NICDM. Both reduce hubness
    in distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x s`` distance (similarity) matrix to the ``s`` training
        samples, or the full symmetric ``n x n`` matrix if `train_ind`
        is None.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``.

    train_ind : ndarray, optional
        If given, use only these data points as neighbors for rescaling.

    test_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Rescale all distances.
        - ndarray : Hold out points indexed in this array as test set.

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_sample_shape_fits(D, train_ind)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if metric == 'similarity':
        if train_ind is not None:
            raise NotImplementedError
        kth = n - k
        exclude = -np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
    else:  # metric == 'distance'
        kth = k - 1
        exclude = np.inf
        self_value = 0
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")
    D = np.copy(D)

    if test_ind is None:
        train_set_ind = slice(0, n)  # take all
        n_ind = range(n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_ind)
        n_ind = test_ind
    # Exclude self distances
    if train_ind is not None:
        for j, sample in enumerate(train_ind):
            D[sample, j] = exclude

    r = np.zeros(n)
    for i in range(n):
        if train_ind is None:
            if sparse:
                di = D[i, train_set_ind].toarray().ravel()
            else:
                di = D[i, train_set_ind]
        else:
            di = D[i, :]  # all columns are training in this case
        r[i] = np.partition(di, kth=kth)[kth]

    if sparse:
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        D_ls = np.zeros_like(D)

    # Scaling factors for the columns of D: all points, if no training
    # sample is given; otherwise only the training points.
    r_cols = r if train_ind is None else r[train_ind]
    if metric == 'similarity':
        for i in n_ind:
            if sparse and nnz[i] <= k:  # Don't rescale if there are too few
                D_ls[i, :] = D[i, :]    # neighbors in the current row
            else:
                # densify the row before elementwise operations
                di = D[i, :].toarray().ravel() if sparse else D[i, :]
                D_ls[i, :] = np.exp(-1 * di**2 / (r[i] * r_cols))
    else:
        for i in n_ind:
            D_ls[i, :] = 1 - np.exp(-1 * D[i, :]**2 / (r[i] * r_cols))

    if test_ind is None:
        if sparse:
            return D_ls.tocsr()
        else:
            np.fill_diagonal(D_ls, self_value)
            return D_ls
    else:
        # Ensure correct self distances
        if train_ind is not None:
            for j, sample in enumerate(train_ind):
                D_ls[sample, j] = self_value
        return D_ls[test_ind]
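# ----------------------------------------------------------------------
# Illustrative usage sketch for the sample-based draft (assumptions: the
# data and split are made up, and `io.check_sample_shape_fits` accepts an
# ``n x s`` matrix whose columns correspond to `train_ind`). Distances to
# the training points only are rescaled, and the rows of the held-out test
# points are returned.
def _demo_local_scaling_sample():  # pragma: no cover
    from scipy.spatial.distance import cdist
    rng = np.random.RandomState(0)
    X = rng.rand(60, 5)
    train_ind = np.arange(40)      # first 40 points: training set
    test_ind = np.arange(40, 60)   # remaining 20 points: test set
    D = cdist(X, X[train_ind])     # 60 x 40 distances to training data
    D_ls = local_scaling_sample(D, k=7, metric='distance',
                                train_ind=train_ind, test_ind=test_ind)
    # D_ls holds the rescaled 20 x 40 test-to-training distances.
    assert D_ls.shape == (test_ind.size, train_ind.size)
    return D_ls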
def test_error(self):
    log = ConsoleLogging()
    log.error("Error")
    return self