def mutual_proximity_gaussi_sample(D: np.ndarray, idx: np.ndarray, metric: str = 'distance', test_set_ind: np.ndarray = None, verbose: int = 0): """Transform a distance matrix with Mutual Proximity (empiric distribution). NOTE: this docstring does not yet fully reflect the properties of this proof-of-concept function! Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix using the empiric data distribution (EXACT, rather SLOW). The resulting secondary distance/similarity matrix should show lower hubness. Parameters ---------- D : ndarray The ``n x s`` distance or similarity matrix, where ``n`` and ``s`` are the dataset and sample size, respectively. idx : ndarray The index array that determines, to which data points the columns in `D` correspond. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. test_set_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. verbose : int, optional (default: 0) Increasing level of output (progress report). Returns ------- D_mp : ndarray Secondary distance MP empiric matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. """ # Initialization and checking input log = ConsoleLogging() io.check_sample_shape_fits(D, idx) io.check_valid_metric_parameter(metric) n = D.shape[0] s = D.shape[1] j = np.ones(n, int) j *= (n + 1) # illegal indices will throw index out of bounds error j[idx] = np.arange(s) if metric == 'similarity': self_value = 1 else: # metric == 'distance': self_value = 0 exclude_value = np.nan if test_set_ind is None: n_ind = range(n) else: n_ind = test_set_ind # Start MP D = D.copy() if issparse(D): raise NotImplementedError # ensure correct self distances (NOT done for sparse matrices!) for j, sample in enumerate(idx): D[sample, j] = exclude_value # Calculate mean and std per row, w/o self values (nan) mu = np.nanmean(D, 1) sd = np.nanstd(D, 1, ddof=0) # Avoid downstream div/0 errors sd[sd == 0] = 1e-7 # set self dist/sim back to self_value to avoid scipy warnings for j, i in enumerate(idx): D[i, j] = self_value # # MP Gaussi # D_mp = np.zeros_like(D) # for sample, i in enumerate(n_ind): # if verbose and ((i + 1) % 1000 == 0 or i + 1 == n): # log.message("MP_gaussi: {} of {}.".format(i + 1, n), flush=True) # j = slice(0, s) # # if metric == 'similarity': # p1 = norm.cdf(D[i, j], mu[i], sd[i]) # p2 = norm.cdf(D[i, j], mu[idx], sd[idx]) # D_mp[i, j] = (p1 * p2).ravel() # else: # # Survival function: sf(.) := 1 - cdf(.) # p1 = norm.sf(D[i, j], mu[i], sd[i]) # p2 = norm.sf(D[i, j], mu[idx], sd[idx]) # D_mp[i, j] = (1 - p1 * p2).ravel() # # # Ensure correct self distances # for j, sample in enumerate(idx): # D_mp[sample, j] = self_value # if test_set_ind is None: # return D_mp # else: # return D_mp[test_set_ind] return mu, sd
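# Hypothetical usage sketch for the proof-of-concept function above (the
# helper name `_example_mp_gaussi_sample` and the toy data are illustrative
# assumptions, not part of the library). It shows how an ``n x s`` sample
# distance matrix is built and passed in; note that, as implemented above,
# the function returns the per-row Gaussian parameters ``(mu, sd)`` rather
# than a secondary distance matrix.
def _example_mp_gaussi_sample():
    import numpy as np
    rng = np.random.RandomState(0)
    X = rng.rand(100, 5)                            # 100 toy points in 5-D
    idx = rng.choice(100, size=20, replace=False)   # columns = sampled points
    # n x s Euclidean distances between all points and the sample
    D = np.sqrt(((X[:, np.newaxis, :] - X[idx, :]) ** 2).sum(-1))
    mu, sd = mutual_proximity_gaussi_sample(D, idx, metric='distance')
    return mu, sd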
def test_warning(self):
    log = ConsoleLogging()
    log.warning("Warning")
def test_error(self):
    log = ConsoleLogging()
    log.error("Error")
def test_console_logging_has_all_methods(self):
    log = ConsoleLogging()
    has_all_attributes = hasattr(log, 'message') and \
        hasattr(log, 'warning') and hasattr(log, 'error')
    self.assertTrue(has_all_attributes)
def test_message(self):
    log = ConsoleLogging()
    log.message("Message")
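# Minimal, illustrative sketch of the logger exercised by the tests above
# (assumes only that ConsoleLogging is importable as in the rest of this
# code base; the helper name is made up for the example).
def _example_console_logging():
    from hub_toolbox.htlogging import ConsoleLogging
    log = ConsoleLogging()
    log.message("Progress reports use message().")
    log.warning("Recoverable problems use warning().")
    log.error("Failures use error().")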
def _mutual_proximity_empiric_full(D: np.ndarray, metric: str = 'distance', test_set_ind: np.ndarray = None, min_nnz: int = 0, verbose: int = 0, n_jobs=None): """Transform a distance matrix with Mutual Proximity (empiric distribution). Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix using the empiric data distribution (EXACT, rather SLOW). The resulting secondary distance/similarity matrix should show lower hubness. Parameters ---------- D : ndarray or csr_matrix - ndarray: The ``n x n`` symmetric distance or similarity matrix. - csr_matrix: The ``n x n`` symmetric similarity matrix. NOTE: In case of sparse ``D`, zeros are interpreted as missing values and ignored during calculations. Thus, results may differ from using a dense version. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. NOTE: In case of sparse `D`, only 'similarity' is supported. test_set_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. min_nnz : int, optional, default: 0 Calculate MP between two objects `i` and `j`, iff at least ``min_nnz`` values are present in both row ``i`` and ``j``. Otherwise, return the original distance/similarity. NOTE: Currently only implemented for MP empiric w/ sparse sim matrices verbose : int, optional (default: 0) Increasing level of output (progress report). Returns ------- D_mp : ndarray Secondary distance MP empiric matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. """ # Initialization n = D.shape[0] log = ConsoleLogging() # Check input io.check_distance_matrix_shape(D) io.check_valid_metric_parameter(metric) if metric == 'similarity': self_value = 1 exclude_value = np.inf else: # metric == 'distance': self_value = 0 exclude_value = -np.inf if issparse(D): raise ValueError("MP sparse only supports similarity matrices.") if test_set_ind is None: pass # TODO implement #train_set_ind = slice(0, n) elif not np.all(~test_set_ind): raise NotImplementedError("MP empiric does not yet support train/" "test splits.") #train_set_ind = np.setdiff1d(np.arange(n), test_set_ind) if issparse(D): return _mutual_proximity_empiric_sparse(D, test_set_ind, min_nnz, verbose, log, n_jobs) # Start MP D = D.copy() # ensure correct self distances (NOT done for sparse matrices!) np.fill_diagonal(D, exclude_value) D_mp = np.zeros_like(D) # Calculate MP empiric for i in range(n - 1): if verbose and ((i + 1) % 1000 == 0 or i == n - 2): log.message("MP_empiric: {} of {}.".format(i + 1, n - 1), flush=True) # Calculate only triu part of matrix j_idx = i + 1 dI = D[i, :][np.newaxis, :] dJ = D[j_idx:n, :] d = D[j_idx:n, i][:, np.newaxis] if metric == 'similarity': D_mp[i, j_idx:] = np.sum((dI <= d) & (dJ <= d), 1) / n #(n - 2) else: # metric == 'distance': D_mp[i, j_idx:] = 1 - (np.sum( (dI > d) & (dJ > d), 1) / n) #(n - 2)) # Mirror, so that matrix is symmetric D_mp += D_mp.T np.fill_diagonal(D_mp, self_value) return D_mp
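# Naive O(n^3) reference sketch of the rescaling performed above (the helper
# name and toy data are illustrative assumptions). For a distance matrix,
# MP empiric sets
#     D_mp[i, j] = 1 - |{k : D[i, k] > D[i, j]  and  D[j, k] > D[i, j]}| / n,
# which the vectorized triangular computation above should reproduce.
def _example_mp_empiric_naive(D=None):
    import numpy as np
    if D is None:
        rng = np.random.RandomState(0)
        X = rng.rand(30, 4)
        D = np.sqrt(((X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2).sum(-1))
    n = D.shape[0]
    D_mp = np.zeros_like(D)
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            d = D[i, j]
            # fraction of objects farther from both i and j than they are
            # from each other
            D_mp[i, j] = 1. - np.sum((D[i, :] > d) & (D[j, :] > d)) / n
    return D_mp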
def hubness(D:np.ndarray, k:int=5, metric='distance', verbose:int=0, n_jobs:int=1, random_state=None, shuffle_equal=True): """Compute hubness of a distance matrix. Hubness [1]_ is the skewness of the `k`-occurrence histogram (reverse nearest neighbor count, i.e. how often does a point occur in the `k`-nearest neighbor lists of other points). Parameters ---------- D : ndarray The ``n x n`` symmetric distance (similarity) matrix or an ``n x m`` partial distances matrix (e.g. for train/test splits, with test objects in rows, train objects in column) NOTE: Partial distance matrices MUST NOT contain self distances. k : int, optional (default: 5) Neighborhood size for `k`-occurrence. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix verbose : int, optional (default: 0) Increasing level of output (progress report). n_jobs : int, optional (default: 1) Number of parallel processes spawned for hubness calculation. Value 1 (default): One process (not using multiprocessing) Value (-1): As many processes as number of available CPUs. random_state : int, optional Seed the RNG for reproducible results. NOTE: Currently only compatible with `n_jobs`=1 shuffle_equal : bool, optional If true, shuffle neighbors with identical distances to avoid artifact hubness. NOTE: This is especially useful for secondary distance measures with a restricted number of possible values, e.g. SNN or MP empiric. Returns ------- S_k : float Hubness (skewness of `k`-occurrence distribution) D_k : ndarray `k`-nearest neighbor lists N_k : ndarray `k`-occurrence list References ---------- .. [1] Radovanović, M., Nanopoulos, A., & Ivanović, M. (2010). Hubs in Space : Popular Nearest Neighbors in High-Dimensional Data. Journal of Machine Learning Research, 11, 2487–2531. 
Retrieved from http://jmlr.csail.mit.edu/papers/volume11/radovanovic10a/ radovanovic10a.pdf """ # Don't use multiprocessing environment when using only one job if n_jobs == 1: return _hubness_no_multiprocessing(D=D, k=k, metric=metric, verbose=verbose, random_state=random_state, shuffle_equal=shuffle_equal) if random_state is not None: raise ValueError("Seeding the RNG is not compatible with using n_jobs > 1.") log = ConsoleLogging() io.check_is_nD_array(arr=D, n=2, arr_type='Distance') io.check_valid_metric_parameter(metric) n, m = D.shape if k >= m: k_old = k k = m - 1 log.warning("Reducing k from {} to {}, so that it is less than " "the total number of neighbors.".format(k_old, k)) if metric == 'distance': d_self = np.inf sort_order = 1 kth = np.arange(k) if metric == 'similarity': d_self = -np.inf sort_order = -1 kth = np.arange(m - k, m) if verbose: log.message("Hubness calculation (skewness of {}-occurrence)".format(k)) # Initialization D = D.copy() D_k = np.zeros((n, k), dtype=np.float64) if issparse(D): pass # correct self-distance must be ensured upstream for sparse else: if n == m: # Set self dist to inf np.fill_diagonal(D, d_self) else: pass # Partial distance matrices MUST NOT contain self distances # make non-finite (NaN, Inf) appear at the end of the sorted list D[~np.isfinite(D)] = d_self # Parallelization if n_jobs == -1: # take all cpus NUMBER_OF_PROCESSES = mp.cpu_count() # @UndefinedVariable else: NUMBER_OF_PROCESSES = n_jobs D_k_ctype = RawArray(ctypes.c_int32, n*k) D_k = np.frombuffer(D_k_ctype, dtype=np.int32).reshape((n, k)) with Pool(processes=NUMBER_OF_PROCESSES, initializer=_hubness_load_shared_data, initargs=(D, D_k, )) as pool: for _ in pool.imap( func=partial(_hubness_nearest_neighbors, n=n, m=m, d_self=d_self, metric=metric, kth=kth, sort_order=sort_order, log=log, verbose=verbose, shuffle_equal=shuffle_equal), #chunksize=int(1e2), iterable=range(n)): pass # results handled within func # k-occurrence N_k = np.bincount(D_k.astype(int).ravel(), minlength=m) # Hubness S_k = stats.skew(N_k) if verbose: log.message("Hubness calculation done.", flush=True) # return hubness, k-nearest neighbors, N occurence return S_k, D_k, N_k
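# Quick usage sketch for hubness() (the helper name and toy data are
# assumptions; a dense, symmetric Euclidean distance matrix is assumed).
# Skewness S_k well above 0 indicates pronounced hubness.
def _example_hubness():
    import numpy as np
    rng = np.random.RandomState(42)
    X = rng.rand(200, 50)                       # high-dimensional toy data
    diff = X[:, np.newaxis, :] - X[np.newaxis, :, :]
    D = np.sqrt((diff ** 2).sum(-1))            # 200 x 200 distance matrix
    S_k, D_k, N_k = hubness(D, k=5, metric='distance', n_jobs=1)
    # S_k: skewness of the 5-occurrence histogram
    # D_k: 5-nearest-neighbor lists, N_k: 5-occurrence counts per object
    return S_k, D_k, N_k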
def r_precision(S:np.ndarray, y:np.ndarray, metric:str='distance', average:str='weighted', return_y_pred:int=0, verbose:int=0, n_jobs:int=1) -> float: """ Calculate R-Precision (recall at R-th position). Parameters ---------- S : ndarray or CSR matrix Distance (similarity) matrix y : ndarray Target (ground truth) labels metric : 'distance' or 'similarity', optional, default: 'similarity' Define, whether `S` is a distance or similarity matrix. average : 'weighted', 'macro' or None, optional, default: 'weighted' Ignored. Weighted and macro precisions are returned. return_y_pred : int, optional, default: 0 If > 0, return the labels of the `return_y_pred` nearest neighbors verbose : int, optional, default: 0 Increasing level of output. n_jobs : int, optional, default: 1 Number of parallel processes to use. Returns ------- r_precision : dictionary with following keys: macro : float Macro R-Precision. weighted : float Weighted R-Precision. per_item : ndarray R-Precision at the object. relevant_items : ndarray Relevant items per class. y_true : ndarray Target labels (req. for weighting). y_pred : ndarray Labels of some k-nearest neighbors """ io.check_distance_matrix_shape(S) io.check_distance_matrix_shape_fits_labels(S, y) io.check_valid_metric_parameter(metric) log = ConsoleLogging() n, _ = S.shape S_is_sparse = issparse(S) if metric != 'similarity' or not S_is_sparse: raise NotImplementedError("Only sparse similarity matrices so far.") # Map labels to 0..n(labels)-1 le = LabelEncoder() # Add int.min for misclassifications incorr_orig = np.array([np.nan]).astype(int) le.fit(np.append(y, incorr_orig)) y = le.transform(y) incorrect = le.transform(incorr_orig) # Number of relevant items, i.e. number of each label relevant_items = np.bincount(y) - 1 # one less for self class # R-Precision for each item r_prec = np.zeros(n, dtype=np.float) # Classify each point in test set if verbose: log.message("Creating shared memory data.") n_random_pred = mp.Value(ctypes.c_int) n_random_pred.value = 0 if verbose and log: log.message("Spawning processes for prediction.") y_pred = np.zeros((n, return_y_pred), dtype=float) kwargs = {'y_pred' : return_y_pred, 'incorrect' : incorrect} with mp.Pool(processes=n_jobs, initializer=_load_shared_csr, initargs=(S, y, n_random_pred, relevant_items)) as pool: for i, r in enumerate( pool.imap( func=partial(_r_prec_worker, **kwargs), iterable=range(n), chunksize=int(1e2))): if verbose and ((i+1)%int(1e7 / 10**verbose) == 0 or i == n-1): log.message("Classification: {} of {} on {}.".format( i+1, n, mp.current_process().name), flush=True) try: r_prec[i] = r[0] y_pred[i, :] = r[1] except: r_prec[i] = r if i == n-1: pass pool.join() if verbose and log: log.message("Retrieving nearest neighbors.") # Work-around for new scikit-learn requirement of 1D arrays for LabelEncoder y_pred = np.asarray([le.inverse_transform(col) for col in y_pred.T.astype(int)]).T if verbose and log: log.message("Finishing.") if n_random_pred.value: log.warning(("{} queries were classified randomly, because all " "distances were non-finite numbers or there were no other " "objects in the same class.").format(n_random_pred.value)) return_dict = {'macro' : r_prec.mean(), 'weighted' : np.average(r_prec, weights=relevant_items[y]), 'per_item' : r_prec, 'relevant_items' : relevant_items, 'y_true' : y, 'y_pred' : y_pred} return return_dict
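# Usage sketch for r_precision() (the helper name and toy data are
# assumptions). It assumes a sparse similarity matrix with zeroed diagonal,
# integer class labels, and a NumPy version that still provides the
# deprecated ``np.float`` alias used in the function above.
def _example_r_precision():
    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.metrics.pairwise import cosine_similarity
    rng = np.random.RandomState(0)
    X = rng.rand(60, 10)
    y = rng.randint(0, 3, size=60)
    S = cosine_similarity(X)
    np.fill_diagonal(S, 0.)          # self similarities treated as missing
    res = r_precision(csr_matrix(S), y, metric='similarity', n_jobs=1)
    return res['weighted'], res['macro']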
import multiprocessing as mp
from multiprocessing import RawArray, Pool, cpu_count

import numpy as np
from scipy import stats
from scipy.sparse import issparse
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils.extmath import row_norms
from sklearn.utils.validation import check_random_state

from hub_toolbox import io
from hub_toolbox.htlogging import ConsoleLogging
from hub_toolbox.utils import SynchronizedCounter

__all__ = ['Hubness', 'hubness', 'hubness_from_vectors']

VALID_METRICS = ['euclidean', 'cosine', 'precomputed']
log = ConsoleLogging()


def _k_neighbors_initializer(X_=None, Y_=None, Dk_=None,
                             X_norm_=None, Y_norm_=None, counter_=None):
    global X, Y, Dk, X_norm, Y_norm, counter
    X = X_
    Y = Y_
    Dk = Dk_
    X_norm = X_norm_
    Y_norm = Y_norm_
    counter = counter_
    return


def _k_neighbors_parallel(i, kth, start, end, metric, verbose,
                          batch_size, n_batches):
def score(D:np.ndarray, target:np.ndarray, k=5, metric:str='distance', test_set_ind:np.ndarray=None, verbose:int=0, sample_idx=None, filter_self=True): """Perform `k`-nearest neighbor classification. Use the ``n x n`` symmetric distance matrix `D` and target class labels `target` to perform a `k`-NN experiment (leave-one-out cross-validation or evaluation of test set; see parameter `test_set_ind`). Ties are broken by the nearest neighbor. Parameters ---------- D : ndarray The ``n x n`` symmetric distance (similarity) matrix. target : ndarray (of dtype=int) The ``n x 1`` target class labels (ground truth). k : int or array_like (of dtype=int), optional (default: 5) Neighborhood size for `k`-NN classification. For each value in `k`, one `k`-NN experiment is performed. HINT: Providing more than one value for `k` is a cheap means to perform multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix test_sed_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Perform a LOO-CV experiment - ndarray : Hold out points indexed in this array as test set. Fit model to remaining data. Evaluate model on test set. verbose : int, optional (default: 0) Increasing level of output (progress report). sample_idx : ... TODO add description filter_self : bool, optional, default: True Remove self similarities from sparse ``D``. This assumes that the highest similarity per row is the self similarity. NOTE: Quadratic dense matrices are always filtered for self distances/similarities, even if `filter_self` is set t0 `False`. Returns ------- acc : ndarray (shape=(n_k x 1), dtype=float) Classification accuracy (`n_k`... number of items in parameter `k`) HINT: Refering to the above example... ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment. corr : ndarray (shape=(n_k x n), dtype=int) Raw vectors of correctly classified items HINT: ... ``corr[1, :]`` gives these items for the ``k=5`` experiment. cmat : ndarray (shape=(n_k x n_t x n_t), dtype=int) Confusion matrix (``n_t`` number of unique items in parameter target) HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of the ``k=20`` experiment. """ # Check input sanity log = ConsoleLogging() if sample_idx is None: io.check_distance_matrix_shape(D) else: io.check_sample_shape_fits(D, sample_idx) io.check_distance_matrix_shape_fits_labels(D, target) io.check_valid_metric_parameter(metric) if metric == 'distance': d_self = np.inf sort_order = 1 if metric == 'similarity': d_self = -np.inf sort_order = -1 # Copy, because data is changed D = D.copy() target = target.astype(int) D_is_sparse = issparse(D) if verbose: log.message("Start k-NN experiment.") # Handle LOO-CV vs. 
test set mode if test_set_ind is None: n = D.shape[0] test_set_ind = range(n) # dummy train_set_ind = n # dummy else: # number of points to be classified n = test_set_ind.size # Indices of training examples train_set_ind = np.setdiff1d(np.arange(n), test_set_ind) if sample_idx is not None: raise NotImplementedError("Sample k-NN does not support train/" "test splits at the moment.") # Number of k-NN parameters try: k_length = k.size except AttributeError as e: if isinstance(k, int): k = np.array([k]) k_length = k.size elif isinstance(k, list): k = np.array(k) k_length = k.size else: raise e acc = np.zeros((k_length, 1)) corr = np.zeros((k_length, D.shape[0])) cl = np.sort(np.unique(target)) if D_is_sparse: # Add a label for unknown class (object w/o nonzero sim to any others) cl = np.append(cl, cl.max()+1) n_classes = len(cl) + 1 else: n_classes = len(cl) cmat = np.zeros((k_length, n_classes, n_classes)) classes = target.copy() for idx, cur_class in enumerate(cl): # change labels to 0, 1, ..., len(cl)-1 classes[target == cur_class] = idx if sample_idx is not None: sample_classes = classes[sample_idx] j = np.ones(n, int) j *= (n+1) # illegal indices will throw index out of bounds error j[sample_idx] = np.arange(len(sample_idx)) for j, sample in enumerate(sample_idx): D[sample, j] = d_self cl = range(len(cl)) rnd_classif = np.zeros(k_length) # Classify each point in test set for i in test_set_ind: if verbose and ((i+1)%1000==0 or i+1==n): log.message("Prediction: {} of {}.".format(i+1, n), flush=True) seed_class = classes[i] if D_is_sparse: row = D.getrow(i) else: row = D[i, :] if sample_idx is None: row[i] = d_self # Sort points in training set according to distance # Randomize, in case there are several points of same distance # (this is especially relevant for SNN rescaling) if sample_idx is None: rp = train_set_ind else: rp = np.arange(len(sample_idx)) if D_is_sparse: nnz = row.nnz rp = np.random.permutation(nnz) d2 = row.data[rp] # Partition for each k value kth = nnz - k - 1 # sort the two highest similarities to end kth = np.append(kth, [nnz-2, nnz-1]) # Clip negative indices (nnz < k) np.clip(kth, a_min=0, a_max=nnz-1, out=kth) # Remove duplicate k values and sort kth = np.unique(kth) d2idx = np.argpartition(d2, kth=kth) d2idx = d2idx[~np.isnan(d2[d2idx])][::-1] idx = row.nonzero()[1][rp[d2idx]] idx = idx[1:] # rem self sim else: rp = np.random.permutation(rp) d2 = row[rp] d2idx = np.argsort(d2, axis=0)[::sort_order] d2idx = d2idx[~np.isnan(d2[d2idx])] # filter NaN values idx = rp[d2idx] # More than one k is useful for cheap multiple k-NN experiments at once for j in range(k_length): # Make sure no inf/-inf/nan values are used for classification if D_is_sparse: #print(row[0, idx[0:k[j]]].toarray()) finite_val = np.isfinite(row[0, idx[0:k[j]]].toarray().ravel()) #print(finite_val) else: finite_val = np.isfinite(row[idx[0:k[j]]]) # However, if no values are finite, classify randomly if finite_val.sum() == 0: idx = np.random.permutation(idx) finite_val = np.ones_like(finite_val) rnd_classif[j] += 1 if sample_idx is None: nn_class = classes[idx[0:k[j]]][finite_val] else: #finite_val = np.isfinite(sample_row[idx[0:k[j]]]) nn_class = sample_classes[idx[0:k[j]]][finite_val] cs = np.bincount(nn_class.astype(int)) if cs.size > 0: max_cs = np.where(cs == np.max(cs))[0] else: max_cs = np.array([len(cl) - 1]) # misclassification label # "tie": use nearest neighbor if len(max_cs) > 1: if seed_class == nn_class[0]: acc[j] += 1/n corr[j, i] = 1 cmat[j, seed_class, nn_class[0]] += 1 # majority vote 
else: if cl[max_cs[0]] == seed_class: acc[j] += 1/n corr[j, i] = 1 cmat[j, seed_class, cl[max_cs[0]]] += 1 if np.any(rnd_classif): for j_k, x in enumerate(rnd_classif): if x > 0: log.warning(("{} queries were classified randomly for k={}, because " "all distances were non-finite numbers.").format(int(x), k[j_k])) if verbose: log.message("Finished k-NN experiment.") return acc, corr, cmat
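# LOO-CV k-NN sketch for score() above (the helper name and toy data are
# assumptions; dense symmetric Euclidean distances). Passing k=[1, 5] runs
# two k-NN experiments in a single pass.
def _example_knn_score():
    import numpy as np
    rng = np.random.RandomState(0)
    X = rng.rand(80, 8)
    y = rng.randint(0, 3, size=80)
    diff = X[:, np.newaxis, :] - X[np.newaxis, :, :]
    D = np.sqrt((diff ** 2).sum(-1))
    acc, corr, cmat = score(D, y, k=[1, 5], metric='distance')
    return acc.ravel()               # accuracies for k=1 and k=5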
def predict(D:np.ndarray, target:np.ndarray, k=5, metric:str='distance', test_ind:np.ndarray=None, verbose:int=0, sample_idx=None, return_cmat=True): """Perform `k`-nearest neighbor classification. Use the ``n x n`` symmetric distance matrix `D` and target class labels `target` to perform a `k`-NN experiment (leave-one-out cross-validation or evaluation of test set; see parameter `test_ind`). Ties are broken by the nearest neighbor. Parameters ---------- D : ndarray The ``n x n`` symmetric distance (similarity) matrix. target : ndarray (of dtype=int) The ``n x 1`` target class labels (ground truth) or ``n x c`` in case of ``c`` binarized multilabels k : int or array_like (of dtype=int), optional (default: 5) Neighborhood size for `k`-NN classification. For each value in `k`, one `k`-NN experiment is performed. HINT: Providing more than one value for `k` is a cheap means to perform multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix test_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Perform a LOO-CV experiment - ndarray : Hold out points indexed in this array as test set. Fit model to remaining data. Evaluate model on test set. verbose : int, optional (default: 0) Increasing level of output (progress report). return_cmat : bool, optional, default: True If False, only return the predictions `y_pred`. Otherwise also return the confusion matrices. Returns ------- y_pred : ndarray (shape=(n_k, n, c), dtype=int) Predicted class labels (`n_k`... number of items in parameter `k`) HINT: Referring to the above example... ... ``y_pred[0]`` gives the predictions of the ``k=1`` experiment. cmat : ndarray (shape=(n_k x c x n_t x n_t), dtype=int) Confusion matrix (``n_t`` number of unique items in parameter target) HINT: ... ``cmat[2, 0, :, :]`` gives the confusion matrix of the first class in the ``k=20`` experiment in the following order: TN FP FN TP """ # Check input sanity log = ConsoleLogging() if sample_idx is None: io.check_distance_matrix_shape(D) else: io.check_sample_shape_fits(D, sample_idx) #io._check_distance_matrix_shape_fits_labels(D, target) io.check_valid_metric_parameter(metric) if metric == 'distance': d_self = np.inf sort_order = 1 if metric == 'similarity': d_self = -np.inf sort_order = -1 # Copy, because data is changed if not issparse(D): D = D.copy() target = target.astype(int) if target.ndim == 1: target = target[:, np.newaxis] if verbose: log.message("Start k-NN experiment.") # Handle LOO-CV vs. 
test set mode if test_ind is None: n = D.shape[0] test_set_ind = range(n) # dummy io.check_valid_metric_parameter(metric) train_set_ind = n # dummy else: # number of points to be classified n = test_set_ind.size # Indices of training examples train_set_ind = np.setdiff1d(np.arange(n), test_set_ind) if sample_idx is not None: raise NotImplementedError("Sample k-NN does not support train/" "test splits at the moment.") # Number of k-NN parameters try: k_length = k.size except AttributeError as e: if isinstance(k, int): k = np.array([k]) k_length = k.size elif isinstance(k, list): k = np.array(k) k_length = k.size else: raise e cl = np.sort(np.unique(target)) cmat = np.zeros((k_length, target.shape[1], len(cl), len(cl)), dtype=int) y_pred = np.zeros((k_length, *target.shape), dtype=int) classes = target.copy() for idx, cur_class in enumerate(np.array(cl).ravel()): # change labels to 0, 1, ..., len(cl)-1 classes[target == cur_class] = idx if sample_idx is not None: sample_classes = classes[sample_idx] j = np.ones(n, int) j *= (n+1) # illegal indices will throw index out of bounds error j[sample_idx] = np.arange(len(sample_idx)) for j, sample in enumerate(sample_idx): D[sample, j] = d_self cl = range(len(cl)) # Classify each point in test set for i in test_set_ind: if verbose and ((i+1)%1000==0 or i+1==n): log.message("Prediction: {} of {}.".format(i+1, n), flush=True) if issparse(D): row = D.getrow(i) #row = D.data ind = row.nonzero()[1] row = row.toarray().ravel() else: row = D[i, :] if sample_idx is None: row[i] = d_self # Sort points in training set according to distance # Randomize, in case there are several points of same distance # (this is especially relevant for SNN rescaling) if sample_idx is None: rp = train_set_ind else: if issparse(D): rp = ind else: rp = np.arange(len(sample_idx)) rp = np.random.permutation(rp) d2 = row[rp] d2idx = np.argsort(d2, axis=0)[::sort_order] d2idx = d2idx[~np.isnan(d2[d2idx])] # filter NaN values idx = rp[d2idx] # More than one k is useful for cheap multiple k-NN experiments at once for j in range(k_length): # Make sure no inf/-inf/nan values are used for classification finite_val = np.isfinite(row[idx[0:k[j]]]) # However, if no values are finite, classify randomly if finite_val.sum() == 0: idx = np.random.permutation(idx) finite_val = np.ones_like(finite_val) log.warning("Query was classified randomly, because all " "distances were non-finite numbers.") for l in range(target.shape[1]): l_classes = classes[:, l] if sample_idx is None: nn_class = l_classes[idx[0:k[j]]][finite_val] else: l_sample_classes = sample_classes[:, l] nn_class = l_sample_classes[idx[0:k[j]]][finite_val] cs = np.bincount(nn_class.astype(int)) max_cs = np.where(cs == np.max(cs))[0] seed_class = classes[i, l] # "tie": use nearest neighbor if len(max_cs) > 1: y_pred[j, i, l] = nn_class[0] cmat[j, l, seed_class, nn_class[0]] += 1 # majority vote else: y_pred[j, i, l] = cl[max_cs[0]] cmat[j, l, seed_class, cl[max_cs[0]]] += 1 if verbose: log.message("Finished k-NN experiment.") if return_cmat: return y_pred, cmat else: return y_pred
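# Prediction sketch for predict() above (the helper name and toy data are
# assumptions). Single-label targets are expanded to 2-D internally, so the
# predictions have shape (n_k, n, 1).
def _example_knn_predict():
    import numpy as np
    rng = np.random.RandomState(1)
    X = rng.rand(50, 6)
    y = rng.randint(0, 2, size=50)
    diff = X[:, np.newaxis, :] - X[np.newaxis, :, :]
    D = np.sqrt((diff ** 2).sum(-1))
    y_pred, cmat = predict(D, y, k=5, metric='distance')
    return y_pred[0, :, 0], cmat[0, 0]   # labels and 2x2 confusion for k=5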
def _mutual_proximity_empiric_sample(D: np.ndarray, idx: np.ndarray, metric: str = 'distance', test_set_ind: np.ndarray = None, verbose: int = 0, n_jobs=None): """Transform a distance matrix with Mutual Proximity (empiric distribution). NOTE: this docstring does not yet fully reflect the properties of this proof-of-concept function! Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix using the empiric data distribution (EXACT, rather SLOW). The resulting secondary distance/similarity matrix should show lower hubness. Parameters ---------- D : ndarray The ``n x s`` distance or similarity matrix, where ``n`` and ``s`` are the dataset and sample size, respectively. idx : ndarray The index array that determines, to which data points the columns in `D` correspond. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. test_sed_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. verbose : int, optional (default: 0) Increasing level of output (progress report). Returns ------- D_mp : ndarray Secondary distance MP empiric matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. """ # Initialization and checking input log = ConsoleLogging() io.check_sample_shape_fits(D, idx) io.check_valid_metric_parameter(metric) n = D.shape[0] s = D.shape[1] if metric == 'similarity': self_value = 1 exclude_value = np.inf else: # metric == 'distance': self_value = 0 exclude_value = -np.inf if issparse(D): raise ValueError("MP sparse only supports similarity matrices.") if test_set_ind is None: n_ind = range(n) #elif not np.all(~test_set_ind): else: n_ind = test_set_ind #raise NotImplementedError("MP empiric does not yet support train/" # "test splits.") #train_set_ind = np.setdiff1d(np.arange(n), test_set_ind) # Start MP D = D.copy() if issparse(D): raise NotImplementedError #return _mutual_proximity_empiric_sparse(D, test_set_ind, verbose, log) # ensure correct self distances (NOT done for sparse matrices!) for j, sample in enumerate(idx): D[sample, j] = exclude_value D_mp = np.zeros_like(D) * np.nan # Calculate MP empiric for i in n_ind: #range(n): if verbose and ((i + 1) % 1000 == 0 or i == n - 2): log.message("MP_empiric: {} of {}.".format(i + 1, n - 1), flush=True) dI = D[i, :][np.newaxis, :] # broadcasted afterwards dJ = D[idx, :] # fancy indexing, thus copy d = dI.T # D[i, :][:, np.newaxis] # both versions are equal # div by n n_pts = s # div by n-1, n-2 #n_pts = (np.isfinite(dI) & np.isfinite(dJ)).sum(1) if metric == 'similarity': D_mp[i, :] = np.sum((dI <= d) & (dJ <= d), 1) / n_pts else: # metric == 'distance': D_mp[i, :] = 1 - (np.sum((dI > d) & (dJ > d), 1) / n_pts) # Ensure correct self distances for j, sample in enumerate(idx): D_mp[sample, j] = self_value if test_set_ind is None: return D_mp else: return D_mp[test_set_ind]
def mutual_proximity_gammai(D: np.ndarray, metric: str = 'distance', min_nnz: int = 30, test_set_ind: np.ndarray = None, verbose: int = 0): """Transform a distance matrix with Mutual Proximity (indep. Gamma distr.). Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gammai variant assumes independent Gamma distributed distances (FAST). The resulting second. distance/similarity matrix should show lower hubness. Parameters ---------- D : ndarray or csr_matrix - ndarray: The ``n x n`` symmetric distance or similarity matrix. - csr_matrix: The ``n x n`` symmetric similarity matrix. NOTE: In case of sparse `D`, zeros are interpreted as missing values and ignored during calculations. Thus, results may differ from using a dense version. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. NOTE: In case of sparse `D`, only 'similarity' is supported. min_nnz : int, optional, default: 30 Calculate MP between two objects `i` and `j`, iff at least ``min_nnz`` values are present in both row ``i`` and ``j``. Otherwise, return the original similarity. Ignored, if `metric` is 'distance'. test_set_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. verbose : int, optional (default: 0) Increasing level of output (progress report). Returns ------- D_mp : ndarray Secondary distance MP gammai matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. """ # Initialization n = D.shape[0] log = ConsoleLogging() # Checking input io.check_distance_matrix_shape(D) io.check_valid_metric_parameter(metric) if metric == 'similarity': self_value = 1 else: # metric == 'distance': self_value = 0 if test_set_ind is None: train_set_ind = slice(0, n) else: train_set_ind = np.setdiff1d(np.arange(n), test_set_ind) # Start MP if verbose: log.message('Mutual proximity Gammai rescaling started.', flush=True) D = D.copy() if issparse(D): return _mutual_proximity_gammai_sparse(D, min_nnz, test_set_ind, verbose, log) np.fill_diagonal(D, np.nan) mu = np.nanmean(D[train_set_ind], 0) va = np.nanvar(D[train_set_ind], 0, ddof=1) # Avoid downstream div/0 errors va[va == 0] = 1e-7 A = (mu**2) / va B = va / mu D_mp = np.zeros_like(D) # MP gammai for i in range(n): if verbose and ((i + 1) % 1000 == 0 or i + 1 == n): log.message("MP_gammai: {} of {}".format(i + 1, n), flush=True) j_idx = slice(i + 1, n) if metric == 'similarity': p1 = _local_gamcdf(D[i, j_idx], A[i], B[i]) p2 = _local_gamcdf(D[j_idx, i], A[j_idx], B[j_idx]) D_mp[i, j_idx] = (p1 * p2).ravel() else: # distance p1 = 1 - _local_gamcdf(D[i, j_idx], A[i], B[i]) p2 = 1 - _local_gamcdf(D[j_idx, i], A[j_idx], B[j_idx]) D_mp[i, j_idx] = (1 - p1 * p2).ravel() # Mirroring the matrix D_mp += D_mp.T # set correct self dist/sim np.fill_diagonal(D_mp, self_value) return D_mp
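# Worked sketch of the moment matching used above (the helper name and toy
# data are assumptions; scipy.stats.gamma stands in for the _local_gamcdf
# helper). A Gamma distribution with shape A = mu**2/var and scale
# B = var/mu reproduces the observed mean and variance of a distance column.
def _example_gammai_moment_matching():
    import numpy as np
    from scipy.stats import gamma
    rng = np.random.RandomState(0)
    d = rng.gamma(shape=3., scale=2., size=10000)   # toy distance column
    mu, va = d.mean(), d.var(ddof=1)
    A, B = mu ** 2 / va, va / mu                    # recovers roughly (3, 2)
    p = gamma.cdf(d, a=A, scale=B)                  # P(D <= d) under the fit
    return A, B, p.mean()                           # p.mean() close to 0.5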
def mutual_proximity_gaussi( D: np.ndarray, metric: str = 'distance', sample_size: int = 0, min_nnz: int = 30, test_set_ind: np.ndarray = None, verbose: int = 0, idx: np.ndarray = None, ): """Transform distances with Mutual Proximity (indep. normal distributions). Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gaussi variant assumes independent normal distributions (FAST). The resulting second. distance/similarity matrix should show lower hubness. Parameters ---------- D : ndarray or csr_matrix - ndarray: The ``n x n`` symmetric distance or similarity matrix. - csr_matrix: The ``n x n`` symmetric similarity matrix. NOTE: In case of sparse `D`, zeros are interpreted as missing values and ignored during calculations. Thus, results may differ from using a dense version. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. NOTE: In case of sparse `D`, only 'similarity' is supported. sample_size : int, optional (default: 0) Define sample size from which Gauss parameters are estimated. Use all data when set to ``0``. Ignored in case of SampleMP (i.e. if provided `idx`). min_nnz : int, optional, default: 30 Calculate MP between two objects `i` and `j`, iff at least ``min_nnz`` values are present in both row ``i`` and ``j``. Otherwise, return the original similarity. Ignored, if `metric` is 'distance'. test_set_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. Ignored in case of SampleMP (i.e. if provided `idx`). verbose : int, optional (default: 0) Increasing level of output (progress report). idx : ndarray, optional (default: None) The index array that determines to which data points the columns in `D` correspond. Only required for SampleMP. Returns ------- D_mp : ndarray Secondary distance MP gaussi matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. 
""" # Initialization log = ConsoleLogging() # Checking input if idx is None: io.check_distance_matrix_shape(D) else: io.check_sample_shape_fits(D, idx) io.check_valid_metric_parameter(metric) n = D.shape[0] s = D.shape[1] if metric == 'similarity': self_value = 1 else: # metric == 'distance': self_value = 0 if test_set_ind is None: train_set_ind = slice(0, n) else: train_set_ind = np.setdiff1d(np.arange(n), test_set_ind) # Start MP Gaussi if verbose: log.message('Mutual Proximity Gaussi rescaling started.', flush=True) D = D.copy() if issparse(D): return _mutual_proximity_gaussi_sparse(D, sample_size, min_nnz, test_set_ind, verbose, log) # ignore self dist/sim for parameter estimation if idx is None: np.fill_diagonal(D, np.nan) else: for j, i in enumerate(idx): D[i, j] = np.nan # Calculate mean and std if idx is None: if sample_size == 0: mu = np.nanmean(D[train_set_ind], 0) sd = np.nanstd(D[train_set_ind], 0, ddof=0) else: samples = np.random.shuffle(train_set_ind)[0:sample_size] mu = np.nanmean(D[samples], 0) sd = np.nanstd(D[samples], 0, ddof=0) else: mu = np.nanmean(D, 1) sd = np.nanstd(D, 1, ddof=0) # Avoid downstream div/0 errors sd[sd == 0] = 1e-7 # set self dist/sim back to self_value to avoid scipy warnings if idx is None: np.fill_diagonal(D, self_value) else: for j, i in enumerate(idx): D[i, j] = self_value # MP Gaussi D_mp = np.zeros_like(D) for i in range(n): if verbose and ((i + 1) % 1000 == 0 or i + 1 == n): log.message("MP_gaussi: {} of {}.".format(i + 1, n), flush=True) if idx is None: j = slice(i + 1, n) j_mom = j else: j = slice(0, s) j_mom = idx[j] if metric == 'similarity': p1 = norm.cdf(D[i, j], mu[i], sd[i]) p2 = norm.cdf(D[i, j], mu[j_mom], sd[j_mom]) D_mp[i, j] = (p1 * p2).ravel() else: # sf(.) := 1 - cdf(.) p1 = norm.sf(D[i, j], mu[i], sd[i]) p2 = norm.sf(D[i, j], mu[j_mom], sd[j_mom]) D_mp[i, j] = (1 - p1 * p2).ravel() if idx is None: D_mp += D_mp.T np.fill_diagonal(D_mp, self_value) else: # Ensure correct self distances for j, sample in enumerate(idx): D_mp[sample, j] = self_value return D_mp
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance', test_ind:np.ndarray=None, n_jobs:int=1): """Transform a distance matrix with Local Scaling. Transforms the given distance matrix into new one using local scaling [1]_ with the given `k`-th nearest neighbor. There are two types of local scaling methods implemented. The original one and NICDM, both reduce hubness in distance spaces, similarly to Mutual Proximity. Parameters ---------- D : ndarray or csr_matrix The ``n x n`` symmetric distance (similarity) matrix. k : int, optional (default: 7) Neighborhood radius for local scaling. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. NOTE: self similarities in sparse `D_ls` are set to ``np.inf`` test_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. n_jobs : int, optional, default: 1 Number of processes for parallel computations. - `1`: Don't use multiprocessing. - `-1`: Use all CPUs Returns ------- D_ls : ndarray Secondary distance LocalScaling matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. """ log = ConsoleLogging() # Checking input io.check_distance_matrix_shape(D) io.check_valid_metric_parameter(metric) sparse = issparse(D) n = D.shape[0] if n_jobs == -1: n_jobs = cpu_count() if metric == 'similarity': kth = n - k exclude = -np.inf self_tmp_value = np.inf self_value = 1. log.warning("Similarity matrix support for LS is experimental.") if sparse and n_jobs != 1: log.warning("Parallel processing not implemented for sparse " "matrices. 
Using single process instead.") n_jobs = 1 else: # metric == 'distance': kth = k - 1 exclude = np.inf self_value = 0 self_tmp_value = self_value if sparse: log.error("Sparse distance matrices are not supported.") raise NotImplementedError( "Sparse distance matrices are not supported.") D = np.copy(D) if test_ind is None: train_ind = slice(0, n) #take all else: train_ind = np.setdiff1d(np.arange(n), test_ind) if sparse: r = np.zeros(n) for i in range(n): di = D[i, train_ind].toarray() di[i] = exclude r[i] = np.partition(di, kth=kth)[kth] D_ls = lil_matrix(D.shape) # Number of nonzero cells per row nnz = D.getnnz(axis=1) else: np.fill_diagonal(D, exclude) if n_jobs > 1: r_ctype = RawArray(ctypes.c_double, n) r = np.frombuffer(r_ctype, dtype=np.float64) with Pool(processes=n_jobs, initializer=_ls_load_shared_data, initargs=(D, train_ind, r, r_ctype)) as pool: for _ in pool.imap(func=partial(_ls_calculate_r, kth=kth), iterable=range(n)): pass # results handled within func else: r = np.partition(D[:, train_ind], kth=kth)[:, kth] if sparse or n_jobs == 1: D_ls = np.zeros_like(D) for i in range(n): # vectorized inner loop: calc only triu part tmp = np.empty(n-i) tmp[0] = self_tmp_value if metric == 'similarity': if sparse and nnz[i] <= k: # Don't rescale if there are tmp[1:] = np.nan # too few neighbors in row else: tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:])) else: tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:])) D_ls[i, i:] = tmp # copy triu to tril -> symmetric matrix (diag=zeros) # NOTE: does not affect self values, since inf+inf=inf and 0+0=0 D_ls += D_ls.T else: D_ls_ctype = RawArray(ctypes.c_double, D.size) D_ls = np.frombuffer(D_ls_ctype, dtype=np.float64).reshape(D.shape) with Pool(processes=n_jobs, initializer=_ls_load_shared_data, initargs=(D, train_ind, r, r_ctype, D_ls, D_ls_ctype)) as pool: for _ in pool.imap(func=partial(_ls_calculate_sec_dist, n=n, metric=metric, self_tmp_value=self_tmp_value), iterable=range(n)): pass # results handled within func # triu is copied to tril within func if sparse: for i, nz in enumerate(nnz): if nz <= k: # too few neighbors D_ls[i, :] = D[i, :] return D_ls.tocsr() else: np.fill_diagonal(D_ls, self_value) return D_ls
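# Usage sketch for local_scaling() above (the helper name and toy data are
# assumptions; dense symmetric Euclidean distances). The rescaling computes
#     D_ls[i, j] = 1 - exp(-D[i, j]**2 / (r_i * r_j)),
# where r_i is the distance of object i to its k-th nearest neighbor.
def _example_local_scaling():
    import numpy as np
    rng = np.random.RandomState(3)
    X = rng.rand(100, 20)
    diff = X[:, np.newaxis, :] - X[np.newaxis, :, :]
    D = np.sqrt((diff ** 2).sum(-1))
    D_ls = local_scaling(D, k=7, metric='distance', n_jobs=1)
    return D_ls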
def _hubness_no_multiprocessing(D:np.ndarray, k:int=5, metric='distance',
                                verbose:int=0, random_state=None,
                                shuffle_equal:bool=True):
    """ Hubness calculations without multiprocessing overhead. """
    log = ConsoleLogging()
    io.check_is_nD_array(arr=D, n=2, arr_type='Distance')
    io.check_valid_metric_parameter(metric)
    n, m = D.shape
    if k >= m:
        k_old = k
        k = m - 1
        log.warning("Reducing k from {} to {}, so that it is less than "
                    "the total number of neighbors.".format(k_old, k))
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
        kth = np.arange(k)
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1
        # Partition w.r.t. the number of columns (m), not rows (n), so that
        # partial (n x m) distance matrices are handled correctly.
        kth = np.arange(m - k, m)
    if verbose:
        log.message(
            "Hubness calculation (skewness of {}-occurrence)".format(k))
    D = D.copy()
    D_k = np.zeros((n, k), dtype=np.float64)
    rnd = np.random.RandomState(random_state)
    if issparse(D):
        pass  # correct self-distance must be ensured upstream for sparse
    else:
        if n == m:
            # Set self dist to inf
            np.fill_diagonal(D, d_self)
        else:
            pass  # a partial distance matrix should not contain self distances
        # make non-finite (NaN, Inf) appear at the end of the sorted list
        D[~np.isfinite(D)] = d_self
    for i in range(n):
        if verbose and ((i+1) % 10000 == 0 or i+1 == n):
            log.message("NN: {} of {}.".format(i+1, n), flush=True)
        if issparse(D):
            d = D[i, :].toarray().ravel()  # dense copy of one row
        else:  # normal ndarray
            d = D[i, :]
        if n == m:
            d[i] = d_self
        else:  # this does not hold for general dissimilarities
            if metric == 'distance':
                d[d == 0] = d_self
        d[~np.isfinite(d)] = d_self
        if shuffle_equal:
            # Randomize equal values in the distance matrix rows to avoid the
            # problem case if all numbers to sort are the same, which would
            # yield high hubness, even if there is none.
            rp = rnd.permutation(m)
            d2 = d[rp]
            d2idx = np.argpartition(d2, kth=kth)
            D_k[i, :] = rp[d2idx[kth]][::sort_order]
        else:
            d_idx = np.argpartition(d, kth=kth)
            D_k[i, :] = d_idx[kth][::sort_order]
    # k-occurrence
    N_k = np.bincount(D_k.astype(int).ravel(), minlength=m)
    # Hubness
    S_k = stats.skew(N_k)
    # return hubness, k-nearest neighbors, k-occurrence
    if verbose:
        log.message("Hubness calculation done.", flush=True)
    return S_k, D_k, N_k
def local_scaling_sample(D:np.ndarray, k:int=7, metric:str='distance', train_ind:np.ndarray=None, test_ind:np.ndarray=None): """Transform a distance matrix with Local Scaling. --- DRAFT version --- Transforms the given distance matrix into new one using local scaling [1]_ with the given `k`-th nearest neighbor. There are two types of local scaling methods implemented. The original one and NICDM, both reduce hubness in distance spaces, similarly to Mutual Proximity. Parameters ---------- D : ndarray or csr_matrix The ``n x n`` symmetric distance (similarity) matrix. k : int, optional (default: 7) Neighborhood radius for local scaling. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. NOTE: self similarities in sparse `D_ls` are set to ``np.inf`` train_ind : ndarray, optional If given, use only these data points as neighbors for rescaling. test_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. Returns ------- D_ls : ndarray Secondary distance LocalScaling matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. """ log = ConsoleLogging() # Checking input io.check_sample_shape_fits(D, train_ind) io.check_valid_metric_parameter(metric) sparse = issparse(D) n = D.shape[0] if metric == 'similarity': if train_ind is not None: raise NotImplementedError kth = n - k exclude = -np.inf self_value = 1. log.warning("Similarity matrix support for LS is experimental.") else: # metric == 'distance': kth = k - 1 exclude = np.inf self_value = 0 if sparse: log.error("Sparse distance matrices are not supported.") raise NotImplementedError( "Sparse distance matrices are not supported.") D = np.copy(D) if test_ind is None: train_set_ind = slice(0, n) #take all n_ind = range(n) else: train_set_ind = np.setdiff1d(np.arange(n), test_ind) n_ind = test_ind # Exclude self distances for j, sample in enumerate(train_ind): D[sample, j] = exclude r = np.zeros(n) for i in range(n): if train_ind is None: if sparse: di = D[i, train_set_ind].toarray() else: di = D[i, train_set_ind] else: di = D[i, :] # all columns are training in this case r[i] = np.partition(di, kth=kth)[kth] if sparse: D_ls = lil_matrix(D.shape) # Number of nonzero cells per row nnz = D.getnnz(axis=1) else: D_ls = np.zeros_like(D) if metric == 'similarity': for i in n_ind: if sparse and nnz[i] <= k: # Don't rescale if there are too few D_ls[i, :] = D[i, :] # neighbors in the current row else: D_ls[i, :] = np.exp(-1 * D[i, :]**2 / (r[i] * r[train_ind])) else: for i in n_ind: D_ls[i, :] = 1 - np.exp(-1 * D[i, :]**2 / (r[i] * r[train_ind])) if test_ind is None: if sparse: return D_ls.tocsr() else: np.fill_diagonal(D_ls, self_value) return D_ls else: # Ensure correct self distances for j, sample in enumerate(train_ind): D_ls[sample, j] = self_value return D_ls[test_ind]
def mp_dissim(X: np.ndarray, Y: np.ndarray = None, p: float = 2, n_bins: int = 0, bin_size: str = 'range', n_jobs: int = 1, verbose: int = 0): """ Calculate m_p dissimilarity. The data-dependent m_p dissimilarity measure considers the relative positions of objects x and y with respect to the rest of the data distribution in each dimension [1]_. Parameters ---------- X : ndarray Vector data (e.g. test set), shape (n_x, d) Y : ndarray, optional, default: None Vector data (e.g. training set), shape (n_y, d). Number of features ``d`` must be equal in `X` and `Y`. p : float, optional, default: 2 Parameter, similar to `p` in Minkowski norm n_bins : int, optional, default: 0 Number of bins for probability mass estimation bin_size : str, optional, default: 'range' Strategy for binning. May be one of: 'range' ... create bins with uniform range length 'mass' ... create bins with approx. uniform mass n_jobs : int, optional, default: 1 Parallel computation with multiple processes. verbose : int, optional, default: 0 Increasing level of output Returns ------- D : ndarray, shape (X.shape[0], Y.shape[0]) m_p dissimilarity matrix References ---------- .. [1] Aryal et al. (2017). Data-dependent dissimilarity measure: an effective alternative to geometric distance measures. Knowledge and Information Systems, Springer-Verlag London. """ # Some preparation n_x, d = X.shape # All-against-all in X, or X against Y? if Y is None: Y = X n_y, d_y = Y.shape # X and Y must have same dimensionality assert d == d_y if n_jobs == -1: n_jobs = cpu_count() n_bins = int(n_bins) if p == 0: log = ConsoleLogging() log.warning('Got mpDisSim parameter p=0. Changed to default ' 'value p=2 instead, in order to avoid zero division.') p = 2. # RawArrays have no locks. Must take EXTREME CARE!! R_bins = RawArray(ctypes.c_int32, d * n_bins * n_bins) R_bins_np = np.frombuffer(R_bins, dtype=np.int32).reshape( (d, n_bins, n_bins)) X_bins = RawArray(ctypes.c_int32, d * n_x) X_bins_np = np.frombuffer(X_bins, dtype=np.int32).reshape((d, n_x)) Y_bins = RawArray(ctypes.c_int32, d * n_y) Y_bins_np = np.frombuffer(Y_bins, dtype=np.int32).reshape((d, n_y)) mp = RawArray(ctypes.c_double, n_x * n_y) mp_np = np.frombuffer(mp).reshape((n_x, n_y)) global histograms, kth kth = np.arange(0, n_y)[0:n_y:int(n_y / n_bins)] if kth[-1] != n_y - 1: kth = np.append(kth, n_y - 1) if verbose: print("Creating bins for estimating probability data mass.") with Pool(processes=n_jobs, initializer=_mp_load_shared_Y, initargs=(Y, n_bins)) as pool: if 'mass'.startswith(bin_size): histograms = pool.map(func=_mp_calc_histograms, iterable=range(d)) elif 'range'.startswith(bin_size): histograms = pool.map(func=_mp_calc_histograms_n_bins, iterable=range(d)) else: raise ValueError("{}' is not a valid value for `bin_size`. " "Please use 'range' or 'mass'.".format(bin_size)) # The second pool needs `histograms` with Pool(processes=n_jobs, initializer=_mp_load_shared_data, initargs=(X, Y, p, n_bins, R_bins, R_bins_np, X_bins, X_bins_np, Y_bins, Y_bins_np, mp, mp_np)) as pool: pool.map(func=_mp_create_r_bins, iterable=range(d)) if verbose: print("Estimating probability data mass in all regions R_i(x,y).") pool.map(func=_mp_estimate_r, iterable=range(d)) if verbose: print("Calculating m_p dissimilarity for all pairs x, y.") pool.map(func=_mp_calc_mp_dissim, iterable=range(n_x)) if verbose: print("Done.") return mp_np
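# Naive reference sketch of the m_p dissimilarity estimated above (the
# helper name is an assumption; exact per-dimension region counts are used
# instead of the binned probability-mass estimate of mp_dissim). It
# illustrates the formula
#     m_p(x, y) = ( 1/d * sum_i ( |R_i(x, y)| / n )**p )**(1/p),
# where R_i(x, y) contains the points whose i-th feature lies between
# x_i and y_i (inclusive).
def _example_mp_dissim_naive(x, y, data, p=2.):
    import numpy as np
    n, d = data.shape
    lo, hi = np.minimum(x, y), np.maximum(x, y)
    mass = ((data >= lo) & (data <= hi)).sum(axis=0) / n  # |R_i| / n per dim
    return float(np.mean(mass ** p) ** (1. / p))
# e.g. _example_mp_dissim_naive(data[0], data[1], data) for a data matrix `data`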