def from_numpy(x, labels_matrix=None, undirected=True):
    G = Graph()
    if issparse(x):
        cx = x.tocoo()
        for i, j, v in zip(cx.row, cx.col, cx.data):
            if i == j:
                continue
            G[str(i)][str(j)] = {}
            if undirected:
                G[str(j)][str(i)] = {}
    else:
        raise Exception("Dense matrices not yet supported.")
    if labels_matrix is not None:
        if issparse(labels_matrix):
            cx = labels_matrix.tocoo()
            for i, j, v in zip(cx.row, cx.col, cx.data):
                # keep only about half of the labels (random subsampling)
                if random.random() > 0.5:
                    G.label[str(i)] = j
            # print(len(cx.row), len(cx.col))
            # exit()
        else:
            raise Exception("Dense matrices not yet supported.")
    # if undirected:
    #     G.make_undirected()
    # G.make_consistent()
    return G
def check_sample_shape_fits(D: np.ndarray, idx: np.ndarray):
    """ Check that number of columns in ``D`` equals the size of ``idx``. """
    if issparse(D) or issparse(idx):
        raise TypeError("Sparse matrices are not supported for SampleMP.")
    check_is_nD_array(D, 2, "Distance/similarity")
    check_is_nD_array(idx, 1, "Index")
    if D.shape[1] > D.shape[0]:
        raise ValueError("Number of samples is higher than number of points. "
                         "Must be less than or equal. In the latter case, "
                         "consider not using samples at all for efficiency. "
                         "Shape of `D`: {}.".format(D.shape))
    if D.shape[1] != idx.size:
        raise TypeError("Number of samples in index array does not match "
                        "the number of samples in the data matrix. "
                        "Size of `idx`: {}, Columns in `D`: {}.".format(
                            idx.size, D.shape[1]))
def to_netflux(flux):
    r"""Compute the netflux from the gross flux.

    Parameters
    ----------
    flux : (M, M) ndarray
        Matrix of flux values between pairs of states.

    Returns
    -------
    netflux : (M, M) ndarray
        Matrix of netflux values between pairs of states.

    Notes
    -----
    The netflux or effective current is defined as

    .. math:: f_{ij}^{+} = \max \{ f_{ij} - f_{ji}, 0 \}

    :math:`f_{ij}` is the flux for the transition from :math:`A` to :math:`B`.

    References
    ----------
    .. [1] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes.
        Multiscale Model Simul 7: 1192-1219 (2009)

    """
    if issparse(flux):
        return sparse.tpt.to_netflux(flux)
    elif isdense(flux):
        return dense.tpt.to_netflux(flux)
    else:
        raise _type_not_supported
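# Illustrative sketch (not the library backend above): the netflux definition
# f_ij^+ = max(f_ij - f_ji, 0) written directly in NumPy for a small dense
# gross-flux matrix. The dispatching function delegates this to its
# sparse/dense submodules.
import numpy as np

def _netflux_dense_sketch(flux):
    """Element-wise net flux of a dense gross-flux matrix."""
    return np.maximum(flux - flux.T, 0.0)

# Example: a 3-state gross flux; the net flux keeps only the positive direction.
# F = np.array([[0.0, 0.3, 0.1],
#               [0.1, 0.0, 0.4],
#               [0.0, 0.2, 0.0]])
# _netflux_dense_sketch(F)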
def _partial_hubness(k, d_self, log, sort_order, rows, submatrix, idx, n, verbose):
    """Parallel hubness calculation: Get k nearest neighbors for all points in 'rows'"""
    Dk = np.zeros((k, len(rows)), dtype=np.float32)
    for i, row in enumerate(submatrix):
        if verbose and ((rows[i]+1) % 10000 == 0 or rows[i]+1 == n):
            log.message("NN: {} of {}.".format(rows[i]+1, n), flush=True)
        if issparse(submatrix):
            d = row.toarray().ravel()  # dense copy of one row
        else:  # normal ndarray
            d = row
        d[rows[i]] = d_self
        d[~np.isfinite(d)] = d_self
        # randomize the distance matrix rows to avoid the problem case
        # if all numbers to sort are the same, which would yield high
        # hubness, even if there is none
        rp = np.random.permutation(n)
        d2 = d[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        Dk[:, i] = rp[d2idx[0:k]]
    return [rows, Dk]
def total_flux(F, A=None):
    r"""Compute the total flux, or turnover flux, that is produced by the
    flux sources and consumed by the flux sinks.

    Parameters
    ----------
    F : (M, M) ndarray
        Matrix of flux values between pairs of states.
    A : array_like (optional)
        List of integer state labels for set A (reactant)

    Returns
    -------
    F : float
        The total flux, or turnover flux, that is produced by the flux
        sources and consumed by the flux sinks

    References
    ----------
    .. [1] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes.
        Multiscale Model Simul 7: 1192-1219 (2009)

    """
    if issparse(F):
        return sparse.tpt.total_flux(F, A=A)
    elif isdense(F):
        return dense.tpt.total_flux(F, A=A)
    else:
        raise _type_not_supported
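# Illustrative sketch (not the library implementation): the total (turnover)
# flux is the flux leaving the source set A, i.e. the sum of f_ij over
# i in A and j outside of A.
import numpy as np

def _total_flux_sketch(F, A):
    """Sum of flux from states in A to states outside A (dense F)."""
    A = np.asarray(A)
    not_A = np.setdiff1d(np.arange(F.shape[0]), A)
    return F[np.ix_(A, not_A)].sum()

# Example usage:
# F = np.array([[0.0, 0.2, 0.1],
#               [0.0, 0.0, 0.3],
#               [0.0, 0.0, 0.0]])
# _total_flux_sketch(F, A=[0])  # 0.2 + 0.1 = 0.3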
def coarsegrain(F, sets):
    r"""Coarse-grains the flux to the given sets.

    Parameters
    ----------
    F : (n, n) ndarray or scipy.sparse matrix
        Matrix of flux values between pairs of states.
    sets : list of array-like of ints
        The sets of states onto which the flux is coarse-grained.

    Notes
    -----
    The coarse grained flux is defined as

    .. math:: fc_{I,J} = \sum_{i \in I, j \in J} f_{i,j}

    Note that if you coarse-grain a net flux, it does not necessarily have
    a net flux property anymore. If you want to make sure you get a net
    flux, use to_netflux(coarsegrain(F, sets)).

    References
    ----------
    .. [1] F. Noe, Ch. Schuette, E. Vanden-Eijnden, L. Reich and T. Weikl:
        Constructing the Full Ensemble of Folding Pathways from Short
        Off-Equilibrium Simulations.
        Proc. Natl. Acad. Sci. USA, 106, 19011-19016 (2009)

    """
    if issparse(F):
        return sparse.tpt.coarsegrain(F, sets)
    elif isdense(F):
        return dense.tpt.coarsegrain(F, sets)
    else:
        raise _type_not_supported
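# Illustrative sketch (not the library implementation): coarse-graining a
# dense flux matrix onto a partition `sets`, i.e. fc_IJ = sum over i in I,
# j in J of f_ij.
import numpy as np

def _coarsegrain_sketch(F, sets):
    """Sum flux between all pairs of the given state sets (dense F)."""
    nc = len(sets)
    Fc = np.zeros((nc, nc))
    for I, set_i in enumerate(sets):
        for J, set_j in enumerate(sets):
            Fc[I, J] = F[np.ix_(set_i, set_j)].sum()
    return Fc

# Example usage:
# F = np.arange(16, dtype=float).reshape(4, 4)
# _coarsegrain_sketch(F, sets=[[0, 1], [2, 3]])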
def __init__(self, D, classes, k, isSimilarityMatrix=False):
    """
    .. note:: Deprecated in hub-toolbox 2.3
              Class will be removed in hub-toolbox 3.0.
              Please use static functions instead.
    """
    print("DEPRECATED: Please use KnnClassification.score() instead.",
          file=sys.stderr)
    if issparse(D):
        self.D = D
    else:
        self.D = np.copy(D)
    self.classes = np.copy(classes)
    if type(k) is np.ndarray:
        self.k = np.copy(k)
    else:
        self.k = np.array([k])
    self.isSimilarityMatrix = isSimilarityMatrix
    if self.isSimilarityMatrix:
        self.self_value = -np.inf
        self.sort_order = -1
    else:
        self.self_value = np.inf
        self.sort_order = 1
    assert D.shape[0] == len(classes)
def _hubness_nearest_neighbors(i, n, m, d_self, metric, kth, sort_order,
                               log, verbose, shuffle_equal):
    # NOTE: `D` and `D_k` are not parameters; they are expected to be
    # shared arrays from the enclosing scope (e.g. set by a multiprocessing
    # initializer).
    if verbose and ((i+1) % 10000 == 0 or i+1 == n):
        log.message("NN: {} of {}.".format(i+1, n), flush=True)
    if issparse(D):
        d = D[i, :].toarray().ravel()  # dense copy of one row
    else:  # normal ndarray
        d = D[i, :]
    if n == m:
        d[i] = d_self
    else:  # this does not hold for general dissimilarities
        if metric == 'distance':
            d[d == 0] = d_self
    d[~np.isfinite(d)] = d_self
    if shuffle_equal:
        # Randomize equal values in the distance matrix rows to avoid the
        # problem case if all numbers to sort are the same, which would yield
        # high hubness, even if there is none.
        rp = np.random.permutation(m)
        d2 = d[rp]
        d2idx = np.argpartition(d2, kth=kth)
        D_k[i, :] = rp[d2idx[kth]][::sort_order]
    else:
        d_idx = np.argpartition(d, kth=kth)
        D_k[i, :] = d_idx[kth][::sort_order]
    return
def sparse_matrix_to_hdf(sparse_matrix, name_to_store, hdf_file_path):
    nonzero_indices = np.nonzero(sparse_matrix > 0)
    if len(nonzero_indices[0]) == 0:
        raise Exception("can't store empty sparse matrix!")
    if issparse(sparse_matrix):
        if sparse_matrix.__class__ is lil_matrix:
            nonzero_values = sparse_matrix.tocsr()[nonzero_indices].A1
        else:
            nonzero_values = lil_matrix(
                sparse_matrix).tocsr()[nonzero_indices].A1
    else:
        nonzero_values = np.array(sparse_matrix[nonzero_indices])
    # print(sparse_matrix.__class__, '=', name_to_store,
    #       sparse_matrix.shape, len(nonzero_values))
    matrix_dataframe = pd.DataFrame({
        "row_indexes": nonzero_indices[0],
        "col_indexes": nonzero_indices[1],
        "data": nonzero_values
    })
    matrix_shape_series = pd.Series(sparse_matrix.shape)
    matrix_dataframe.to_hdf(hdf_file_path, name_to_store)
    matrix_shape_series.to_hdf(hdf_file_path, "%s_shape" % name_to_store)
    del nonzero_indices, nonzero_values, matrix_dataframe, matrix_shape_series
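# Hypothetical counterpart (not part of the original source): reading such a
# matrix back from the HDF store written above. The key names ("<name>" and
# "<name>_shape") follow the writer; the helper name itself is made up for
# this sketch.
import pandas as pd
from scipy.sparse import coo_matrix

def hdf_to_sparse_matrix_sketch(name_stored, hdf_file_path):
    """Rebuild a COO matrix from the row/col/data frame stored by the writer."""
    df = pd.read_hdf(hdf_file_path, name_stored)
    shape = tuple(pd.read_hdf(hdf_file_path, "%s_shape" % name_stored))
    return coo_matrix((df["data"], (df["row_indexes"], df["col_indexes"])),
                      shape=shape)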
def _k_neighbors_precomputed_sparse(self, X, n_samples=None): ''' Find nearest neighbors in sparse distance matrix. Parameters ---------- X : sparse, shape = [n_test, n_indexed] Sparse distance matrix. Only non-zero elements may be considered neighbors. n_samples : int Number of sampled indexed objects, e.g. in approximate hubness reduction. If None, this is inferred from the first row of X. Returns ------- k_neighbors : ndarray Flattened array of neighbor indices. ''' assert issparse(X), f'Matrix is not sparse' X = X.tocsr() if n_samples is None: n_samples = X.indptr[1] - X.indptr[0] n_test, _ = X.shape # To allow different number of explicit entries per row, # we need to process the matrix row-by-row. if np.all(X.indptr[1:] - X.indptr[:-1] == n_samples)\ and not self.shuffle_equal: min_ind = np.argpartition(X.data.reshape(n_test, n_samples), kth=np.arange(self.k), axis=1)[:, :self.k] k_neighbors = X.indices[ min_ind.ravel() + np.repeat(X.indptr[:-1], repeats=self.k)] else: min_ind = np.empty((n_test,), dtype=object) k_neighbors = np.empty((n_test,), dtype=object) if self.shuffle_equal: for i in range(n_test): if self.verbose > 1 \ or self.verbose and (i % 1000 == 0 or i+1 == n_test): log.message(f"k neighbors (from sparse distances): " f"{i+1}/{n_test}.", flush=True) x = X.getrow(i) rp = self.random_state.permutation(x.nnz) d2 = x.data[rp] d2idx = np.argpartition(d2, kth=np.arange(self.k)) k_neighbors[i] = x.indices[rp[d2idx[:self.k]]] else: for i in range(n_test): if self.verbose > 1 \ or self.verbose and (i % 1000 == 0 or i+1 == n_test): log.message(f"k neighbors (from sparse distances): " f"{i+1}/{n_test}.", flush=True) x = X.getrow(i) min_ind = np.argpartition( x.data, kth=np.arange(self.k))[:self.k] k_neighbors[i] = x.indices[min_ind] k_neighbors = np.concatenate(k_neighbors) return k_neighbors
def pathways(F, A, B, fraction=1.0, maxiter=1000):
    r"""Decompose flux network into dominant reaction paths.

    Parameters
    ----------
    F : (M, M) scipy.sparse matrix
        The flux network (matrix of netflux values)
    A : array_like
        The set of starting states
    B : array_like
        The set of end states
    fraction : float, optional
        Fraction of total flux to assemble in pathway decomposition
    maxiter : int, optional
        Maximum number of pathways for decomposition

    Returns
    -------
    paths : list
        List of dominant reaction pathways
    capacities : list
        List of capacities corresponding to each reaction pathway in paths

    Notes
    -----
    The default value for fraction is 1.0, i.e. all dominant reaction
    pathways for the flux network are computed. For large networks the
    number of possible reaction paths can increase rapidly so that it
    becomes prohibitively expensive to compute all possible reaction paths.
    To prevent this from happening, maxiter sets the maximum number of
    reaction pathways that will be computed.

    For large flux networks it might be necessary to decrease fraction or
    to increase maxiter. It is advisable to begin with a small value for
    fraction and monitor the number of pathways returned when increasing
    the value of fraction.

    References
    ----------
    .. [1] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes.
        Multiscale Model Simul 7: 1192-1219 (2009)

    """
    if issparse(F):
        return sparse.pathways.pathways(F, A, B, fraction=fraction,
                                        maxiter=maxiter)
    elif isdense(F):
        return sparse.pathways.pathways(csr_matrix(F), A, B,
                                        fraction=fraction, maxiter=maxiter)
    else:
        raise _type_not_supported
def _k_neighbors_precomputed_sparse(self, X: csr_matrix, n_samples: int = None):
    """ Find nearest neighbors in sparse distance matrix.

    Parameters
    ----------
    X : sparse, shape = [n_test, n_indexed]
        Sparse distance matrix. Only non-zero elements
        may be considered neighbors.
    n_samples : int
        Number of sampled indexed objects, e.g.
        in approximate hubness reduction.
        If None, this is inferred from the first row of X.

    Returns
    -------
    k_neighbors : ndarray
        Flattened array of neighbor indices.
    """
    if not issparse(X):
        raise TypeError('Matrix X is not sparse')
    X = X.tocsr()
    if n_samples is None:
        n_samples = X.indptr[1] - X.indptr[0]
    n_test, _ = X.shape
    # To allow different number of explicit entries per row,
    # we need to process the matrix row-by-row.
    if np.all(X.indptr[1:] - X.indptr[:-1] == n_samples) and not self.shuffle_equal:
        min_ind = np.argpartition(X.data.reshape(n_test, n_samples),
                                  kth=np.arange(self.k), axis=1)[:, :self.k]
        k_neighbors = X.indices[
            min_ind.ravel() + np.repeat(X.indptr[:-1], repeats=self.k)]
    else:
        k_neighbors = np.empty((n_test,), dtype=object)
        if self.verbose:
            range_n_test = tqdm(range(n_test))
        else:
            range_n_test = range(n_test)
        if self.shuffle_equal:
            for i in range_n_test:
                x = X.getrow(i)
                rp = self._random_state.permutation(x.nnz)
                d2 = x.data[rp]
                d2idx = np.argpartition(d2, kth=np.arange(self.k))
                k_neighbors[i] = x.indices[rp[d2idx[:self.k]]]
        else:
            for i in range_n_test:
                x = X.getrow(i)
                min_ind = np.argpartition(x.data,
                                          kth=np.arange(self.k))[:self.k]
                k_neighbors[i] = x.indices[min_ind]
        k_neighbors = np.concatenate(k_neighbors)
    return k_neighbors
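# Minimal usage sketch (assumes a host object carrying the attributes used
# above: `k`, `shuffle_equal`, `verbose`, `_random_state`): finding the k=2
# nearest neighbors from a small sparse distance matrix in which only the
# explicitly stored entries are neighbor candidates.
import numpy as np
from scipy.sparse import csr_matrix
from types import SimpleNamespace

# host = SimpleNamespace(k=2, shuffle_equal=False, verbose=0,
#                        _random_state=np.random.RandomState(0))
# D = csr_matrix(np.array([[0.0, 0.3, 0.1, 0.7],
#                          [0.3, 0.0, 0.2, 0.5]]))
# _k_neighbors_precomputed_sparse(host, D)   # -> flattened neighbor indices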
def __init__(self, D, k: int = 7, scalingType='nicdm', isSimilarityMatrix=False):
    """
    .. note:: Deprecated in hub-toolbox 2.3
              Class will be removed in hub-toolbox 3.0.
              Please use static functions instead.
    """
    print("DEPRECATED: Please use LocalScaling.local_scaling() or "
          "LocalScaling.nicdm() instead.", file=sys.stderr)
    self.log = Logging.ConsoleLogging()
    self.D = np.copy(D)
    self.k = k
    self.scalingType = scalingType
    if isSimilarityMatrix:
        if scalingType == 'nicdm':
            if issparse(D):
                self.log.error("NICDM does not support sparse matrices.")
                raise NotImplementedError(
                    "NICDM does not support sparse matrices.")
            else:
                self.log.warning(
                    "NICDM does not support similarities. "
                    "Distances will be calculated as D=1-S/S.max and used "
                    "for NICDM scaling. Similarities are subsequently "
                    "obtained by the same procedure S=1-D/D.max")
        else:
            self.log.warning("Similarity-based LS support is experimental.")
    self.isSimilarityMatrix = isSimilarityMatrix
    if self.isSimilarityMatrix:
        self.sort_order = -1
        self.exclude = -np.inf
    else:
        self.sort_order = 1
        self.exclude = np.inf
    if issparse(D):
        if isSimilarityMatrix:
            self.log.warning("Sparse matrix support for LS is experimental.")
        else:
            self.log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")
def pathways(F, A, B, qplus, fraction=1.0, totalflux=None):
    r"""Pathway decomposition of the net flux.

    Parameters
    ----------
    F : (M, M) ndarray
        Matrix of flux values between pairs of states.
    A : array-like of ints
        A states (source, educt states)
    B : array-like of ints
        B states (sinks, product states)
    qplus : (M,) ndarray
        Forward committor
    fraction : float (optional)
        The fraction of the total flux for which pathways will be computed.
        When set larger than 1.0, will use 1.0. When set <= 0.0, no pathways
        will be computed and two empty lists will be returned.
        For example, when set to fraction = 0.9, the pathway decomposition
        will stop when 90% of the flux have been accumulated. This is very
        useful for large flux networks which often contain a few major and
        a lot of minor paths. In such networks, the algorithm would spend a
        very long time in the last few percent of pathways.

    Returns
    -------
    (paths, pathfluxes) : (list of int-arrays, double-array)
        paths in the order of decreasing flux. Each path is given as an
        int-array of state indexes, ordered by increasing forward committor
        values. The first index of each path will be a state in A, the last
        index a state in B. The corresponding figure in the pathfluxes-array
        is the flux carried by that path. The pathfluxes-array sums to the
        requested fraction of the total A->B flux.

    References
    ----------
    .. [1] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes.
        Multiscale Model Simul 7: 1192-1219 (2009)

    """
    # initialize decomposition object
    Fdense = F
    if issparse(F):
        import warnings
        warnings.warn("Sparse pathway decomposition is not implemented. "
                      "Using dense pathway implementation. Sorry, but this "
                      "might lead to poor performance or memory overflow.",
                      RuntimeWarning)
        Fdense = F.toarray()
    return dense.tpt.pathways(Fdense, A, B, qplus,
                              fraction=fraction, totalflux=totalflux)
def _write_instance(feats, class_label, f):
    if issparse(feats):
        feats = feats.toarray().reshape(-1,)
    try:
        feat_list = [str(i).replace(" ", "_") for i in feats]
    except ValueError:
        print("Please enter correct feature values")
    else:
        try:
            class_label = str(class_label)
        except ValueError:
            print("Please enter correct class label")
        else:
            f.write(",".join(feat_list) + "," + class_label + "\n")
def save(self, filename):
    # Store counts as a sparse matrix so they don't take up tonnes of space
    if issparse(self.counts):
        sparse_counts = self.counts
    else:
        sparse_counts = csr_matrix(self.counts)
    # Pickle requires binary mode file handles
    with open("%s-sparse-counts" % filename, "wb") as f:
        pickle.dump(sparse_counts, f, -1)
    numpy.save("%s-init.npy" % filename, self.initial_counts)
    with open("%s.params" % filename, "wb") as params_file:
        pickle.dump(
            {
                "laplace_smoothing": self.laplace_smoothing,
                "backoff_threshold": self.backoff_threshold,
            },
            params_file)
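# Hypothetical load counterpart (not in the original source), assuming the
# file layout written by save() above and the same attribute names.
import pickle
import numpy

def load_counts_sketch(filename):
    """Read back the sparse counts, initial counts and parameters."""
    with open("%s-sparse-counts" % filename, "rb") as f:
        sparse_counts = pickle.load(f)
    initial_counts = numpy.load("%s-init.npy" % filename)
    with open("%s.params" % filename, "rb") as params_file:
        params = pickle.load(params_file)
    return sparse_counts, initial_counts, params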
def is_multilabel(y):
    """ Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    out : bool,
        Return ``True``, if ``y`` is in a multilabel format, else ``False``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1])
    False
    >>> is_multilabel([[1], [0, 2], []])
    False
    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
    True
    >>> is_multilabel(np.array([[1], [0], [0]]))
    False
    >>> is_multilabel(np.array([[1, 0, 0]]))
    True
    """
    if hasattr(y, '__array__') or isinstance(y, Sequence):
        y = np.asarray(y)
    if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
        return False
    if issparse(y):
        if isinstance(y, (dok_matrix, lil_matrix)):
            y = y.tocsr()
        return (len(y.data) == 0 or np.unique(y.data).size == 1 and
                (y.dtype.kind in 'biu' or  # bool, int, uint
                 _is_integral_float(np.unique(y.data))))
    else:
        labels = np.unique(y)
        return len(labels) < 3 and (y.dtype.kind in 'biu' or  # bool, int, uint
                                    _is_integral_float(labels))
def cal_skewness(dist_mat, k, metric):
    """ This code is based on the 'hub-toolbox'.

    args:
        - dist_mat (ndarray) : Distance (similarity) matrix (n_query, n_target)
        - k (int) : Neighborhood size for `k`-occurence
        - metric ({'distance' or 'similarity'}) : whether dist_mat holds
          distances or similarities
    """
    if issparse(dist_mat):
        raise NotImplementedError()
    if metric == 'distance':
        self_val = np.inf
        sort_order = 1
    elif metric == 'similarity':
        self_val = -np.inf
        sort_order = -1
    else:
        raise ValueError('Invalid metric: {}'.format(metric))
    dist_mat = dist_mat.copy()
    n_query, n_target = dist_mat.shape
    kbest_idxs = np.zeros((k, n_query), dtype=np.float32)
    # np.fill_diagonal(dist_mat, self_val)
    dist_mat[~np.isfinite(dist_mat)] = self_val
    for i in range(n_query):
        dists = dist_mat[i, :]
        # dists[i] = self_val
        dists[~np.isfinite(dists)] = self_val
        # randomize equal values for avoiding high hubness (see original code)
        rand_idxs = np.random.permutation(n_target)
        dists2 = dists[rand_idxs]
        rank_dists2 = np.argsort(dists2, axis=0)[::sort_order]
        kbest_idxs[:, i] = rand_idxs[rank_dists2[0:k]]
    n_k = np.bincount(kbest_idxs.astype(int).ravel())
    skewness = stats.skew(n_k)
    return skewness
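# Minimal usage sketch (assumes numpy, scipy and the function above are
# importable): hubness as the skewness of the k-occurrence counts of a random
# Euclidean distance matrix. High-dimensional data typically yields larger
# skewness values.
import numpy as np
from scipy.spatial.distance import cdist

# rng = np.random.RandomState(0)
# X = rng.rand(200, 50)                    # 200 points in 50 dimensions
# D = cdist(X, X)                          # square distance matrix
# np.fill_diagonal(D, np.inf)              # exclude self-distances
# print(cal_skewness(D, k=10, metric='distance'))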
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance', test_set_ind:np.ndarray=None): """Transform a distance matrix with Local Scaling. Transforms the given distance matrix into new one using local scaling [1]_ with the given `k`-th nearest neighbor. There are two types of local scaling methods implemented. The original one and NICDM, both reduce hubness in distance spaces, similarly to Mutual Proximity. Parameters ---------- D : ndarray or csr_matrix The ``n x n`` symmetric distance (similarity) matrix. k : int, optional (default: 7) Neighborhood radius for local scaling. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. NOTE: self similarities in sparse `D_ls` are set to ``np.inf`` test_sed_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. Returns ------- D_ls : ndarray Secondary distance LocalScaling matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. """ log = Logging.ConsoleLogging() # Checking input IO._check_distance_matrix_shape(D) IO._check_valid_metric_parameter(metric) if metric == 'similarity': sort_order = -1 exclude = -np.inf self_tmp_value = np.inf self_value = 1. log.warning("Similarity matrix support for LS is experimental.") else: # metric == 'distance': sort_order = 1 exclude = np.inf self_value = 0 self_tmp_value = self_value if issparse(D): log.error("Sparse distance matrices are not supported.") raise NotImplementedError( "Sparse distance matrices are not supported.") D = np.copy(D) n = D.shape[0] if test_set_ind is None: train_set_ind = slice(0, n) #take all else: train_set_ind = np.setdiff1d(np.arange(n), test_set_ind) r = np.zeros(n) for i in range(n): if issparse(D): di = D[i, train_set_ind].toarray() else: di = D[i, train_set_ind] di[i] = exclude nn = np.argsort(di)[::sort_order] r[i] = di[nn[k-1]] #largest similarities or smallest distances if issparse(D): D_ls = lil_matrix(D.shape) else: D_ls = np.zeros_like(D) for i in range(n): # vectorized inner loop: calc only triu part tmp = np.empty(n-i) tmp[0] = self_tmp_value if metric == 'similarity': tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:])) else: tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:])) D_ls[i, i:] = tmp # copy triu to tril -> symmetric matrix (diag=zeros) # NOTE: does not affect self values, since inf+inf=inf and 0+0=0 D_ls += D_ls.T if issparse(D): return D_ls.tocsr() else: np.fill_diagonal(D_ls, self_value) return D_ls
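# Illustrative sketch (not the library code above): the core of the original
# Local Scaling transform, D_ls[i, j] = 1 - exp(-d_ij^2 / (r_i * r_j)), where
# r_i is the distance of point i to its k-th nearest neighbor.
import numpy as np

def _local_scaling_sketch(D, k=7):
    """Rescale a symmetric distance matrix with original Local Scaling."""
    D = np.asarray(D, dtype=float)
    D_no_self = D.copy()
    np.fill_diagonal(D_no_self, np.inf)                        # ignore self-distances
    r = np.partition(D_no_self, kth=k - 1, axis=1)[:, k - 1]   # k-th NN distance
    D_ls = 1.0 - np.exp(-D ** 2 / np.outer(r, r))
    np.fill_diagonal(D_ls, 0.0)                                # self-distance stays zero
    return D_ls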
def score(D:np.ndarray, target:np.ndarray, k=5, metric:str='distance', test_set_ind:np.ndarray=None, verbose:int=0): """Perform `k`-nearest neighbor classification. Use the ``n x n`` symmetric distance matrix `D` and target class labels `target` to perform a `k`-NN experiment (leave-one-out cross-validation or evaluation of test set; see parameter `test_set_ind`). Ties are broken by the nearest neighbor. Parameters ---------- D : ndarray The ``n x n`` symmetric distance (similarity) matrix. target : ndarray (of dtype=int) The ``n x 1`` target class labels (ground truth). k : int or array_like (of dtype=int), optional (default: 5) Neighborhood size for `k`-NN classification. For each value in `k`, one `k`-NN experiment is performed. HINT: Providing more than one value for `k` is a cheap means to perform multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix test_sed_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Perform a LOO-CV experiment - ndarray : Hold out points indexed in this array as test set. Fit model to remaining data. Evaluate model on test set. verbose : int, optional (default: 0) Increasing level of output (progress report). Returns ------- acc : ndarray (shape=(n_k x 1), dtype=float) Classification accuracy (`n_k`... number of items in parameter `k`) HINT: Refering to the above example... ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment. corr : ndarray (shape=(n_k x n), dtype=int) Raw vectors of correctly classified items HINT: ... ``corr[1, :]`` gives these items for the ``k=5`` experiment. cmat : ndarray (shape=(n_k x n_t x n_t), dtype=int) Confusion matrix (``n_t`` number of unique items in parameter target) HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of the ``k=20`` experiment. """ # Check input sanity log = Logging.ConsoleLogging() IO._check_distance_matrix_shape(D) IO._check_distance_matrix_shape_fits_labels(D, target) IO._check_valid_metric_parameter(metric) if metric == 'distance': d_self = np.inf sort_order = 1 if metric == 'similarity': d_self = -np.inf sort_order = -1 # Copy, because data is changed D = D.copy() target = target.astype(int) if verbose: log.message("Start k-NN experiment.") # Handle LOO-CV vs. 
test set mode if test_set_ind is None: n = D.shape[0] test_set_ind = range(n) # dummy train_set_ind = n # dummy else: # number of points to be classified n = test_set_ind.size # Indices of training examples train_set_ind = np.setdiff1d(np.arange(n), test_set_ind) # Number of k-NN parameters try: k_length = k.size except AttributeError as e: if isinstance(k, int): k = np.array([k]) k_length = k.size elif isinstance(k, list): k = np.array(k) k_length = k.size else: raise e acc = np.zeros((k_length, 1)) corr = np.zeros((k_length, D.shape[0])) cl = np.sort(np.unique(target)) cmat = np.zeros((k_length, len(cl), len(cl))) classes = target.copy() for idx, cur_class in enumerate(cl): # change labels to 0, 1, ..., len(cl)-1 classes[target == cur_class] = idx cl = range(len(cl)) # Classify each point in test set for i in test_set_ind: seed_class = classes[i] if issparse(D): row = D.getrow(i).toarray().ravel() else: row = D[i, :] row[i] = d_self # Sort points in training set according to distance # Randomize, in case there are several points of same distance # (this is especially relevant for SNN rescaling) rp = train_set_ind rp = np.random.permutation(rp) d2 = row[rp] d2idx = np.argsort(d2, axis=0)[::sort_order] idx = rp[d2idx] # More than one k is useful for cheap multiple k-NN experiments at once for j in range(k_length): nn_class = classes[idx[0:k[j]]] cs = np.bincount(nn_class.astype(int)) max_cs = np.where(cs == np.max(cs))[0] # "tie": use nearest neighbor if len(max_cs) > 1: if seed_class == nn_class[0]: acc[j] += 1/n corr[j, i] = 1 cmat[j, seed_class, nn_class[0]] += 1 # majority vote else: if cl[max_cs[0]] == seed_class: acc[j] += 1/n corr[j, i] = 1 cmat[j, seed_class, cl[max_cs[0]]] += 1 if verbose: log.message("Finished k-NN experiment.") return acc, corr, cmat
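# Illustrative sketch (not the function above): the voting rule used here for
# a single query -- majority vote among the labels of the k nearest neighbors,
# falling back to the single nearest neighbor when the vote is tied.
import numpy as np

def _vote_sketch(nn_labels):
    """Predict a class from the labels of the k nearest neighbors (ordered by distance)."""
    counts = np.bincount(nn_labels)
    winners = np.where(counts == counts.max())[0]
    if len(winners) > 1:          # tie: trust the closest neighbor
        return nn_labels[0]
    return winners[0]

# Example: labels of the 5 nearest neighbors, ordered by increasing distance.
# _vote_sketch(np.array([2, 1, 1, 2, 0]))  # tie between 1 and 2 -> returns 2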
def flux_matrix(T, pi, qminus, qplus, netflux=True):
    r"""Compute the TPT flux network for the reaction A-->B.

    Parameters
    ----------
    T : (M, M) ndarray
        transition matrix
    pi : (M,) ndarray
        Stationary distribution corresponding to T
    qminus : (M,) ndarray
        Backward committor
    qplus : (M,) ndarray
        Forward committor
    netflux : boolean
        True: net flux matrix will be computed
        False: gross flux matrix will be computed

    Returns
    -------
    flux : (M, M) ndarray
        Matrix of flux values between pairs of states.

    See also
    --------
    committor.forward_committor, committor.backward_committor

    Notes
    -----
    Computation of the flux network relies on transition path theory (TPT)
    [1]_. Here we use discrete transition path theory [2]_ in the transition
    matrix formulation [3]_. The central objects used in transition path
    theory are the forward and backward committor functions.

    The TPT (gross) flux is defined as

    .. math::
        f_{ij} = \left\{ \begin{array}{rl}
            \pi_i q_i^{(-)} p_{ij} q_j^{(+)} & i \neq j \\
            0                                & i = j
        \end{array} \right.

    The TPT net flux is then defined as

    .. math:: f_{ij} = \max\{f_{ij} - f_{ji}, 0\} \:\:\: \forall i,j.

    References
    ----------
    .. [1] W. E and E. Vanden-Eijnden.
        Towards a theory of transition paths.
        J. Stat. Phys. 123: 503-523 (2006)
    .. [2] P. Metzner, C. Schuette and E. Vanden-Eijnden.
        Transition Path Theory for Markov Jump Processes.
        Multiscale Model Simul 7: 1192-1219 (2009)
    .. [3] F. Noe, Ch. Schuette, E. Vanden-Eijnden, L. Reich and T. Weikl:
        Constructing the Full Ensemble of Folding Pathways from Short
        Off-Equilibrium Simulations.
        Proc. Natl. Acad. Sci. USA, 106, 19011-19016 (2009)

    """
    if issparse(T):
        return sparse.tpt.flux_matrix(T, pi, qminus, qplus, netflux=netflux)
    elif isdense(T):
        return dense.tpt.flux_matrix(T, pi, qminus, qplus, netflux=netflux)
    else:
        raise _type_not_supported
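# Illustrative sketch (not the dispatching function above): the dense TPT
# gross flux f_ij = pi_i * qminus_i * p_ij * qplus_j with a zero diagonal,
# written directly in NumPy.
import numpy as np

def _gross_flux_sketch(T, pi, qminus, qplus):
    """Dense TPT gross-flux matrix from committors and stationary distribution."""
    flux = (pi * qminus)[:, np.newaxis] * T * qplus[np.newaxis, :]
    np.fill_diagonal(flux, 0.0)   # no flux for i == j by definition
    return flux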
def _hubness_no_multiprocessing(D:np.ndarray, k:int=5, metric='distance', verbose:int=0, random_state=None, shuffle_equal:bool=True): """ Hubness calculations without multiprocessing overhead. """ log = ConsoleLogging() io.check_is_nD_array(arr=D, n=2, arr_type='Distance') io.check_valid_metric_parameter(metric) n, m = D.shape if k >= m: k_old = k k = m - 1 log.warning("Reducing k from {} to {}, so that it is less than " "the total number of neighbors.".format(k_old, k)) if metric == 'distance': d_self = np.inf sort_order = 1 kth = np.arange(k) if metric == 'similarity': d_self = -np.inf sort_order = -1 kth = np.arange(n - k, n) if verbose: log.message("Hubness calculation (skewness of {}-occurence)".format(k)) D = D.copy() D_k = np.zeros((n, k), dtype=np.float64) rnd = np.random.RandomState(random_state) if issparse(D): pass # correct self-distance must be ensured upstream for sparse else: if n == m: # Set self dist to inf np.fill_diagonal(D, d_self) else: pass # a partial distances matrix should not contain self distances # make non-finite (NaN, Inf) appear at the end of the sorted list D[~np.isfinite(D)] = d_self for i in range(n): if verbose and ((i+1)%10000==0 or i+1==n): log.message("NN: {} of {}.".format(i+1, n), flush=True) if issparse(D): d = D[i, :].toarray().ravel() # dense copy of one row else: # normal ndarray d = D[i, :] if n == m: d[i] = d_self else: # this does not hold for general dissimilarities if metric == 'distance': d[d==0] = d_self d[~np.isfinite(d)] = d_self if shuffle_equal: # Randomize equal values in the distance matrix rows to avoid the # problem case if all numbers to sort are the same, which would # yield high hubness, even if there is none. rp = rnd.permutation(m) d2 = d[rp] d2idx = np.argpartition(d2, kth=kth) D_k[i, :] = rp[d2idx[kth]][::sort_order] else: d_idx = np.argpartition(d, kth=kth) D_k[i, :] = d_idx[kth][::sort_order] # N-occurence N_k = np.bincount(D_k.astype(int).ravel(), minlength=m) # Hubness S_k = stats.skew(N_k) # return k-hubness, k-nearest neighbors, k-occurence if verbose: log.message("Hubness calculation done.", flush=True) return S_k, D_k, N_k
def hubness(D:np.ndarray, k:int=5, metric='distance', verbose:int=0): """Compute hubness of a distance matrix. Hubness [1]_ is the skewness of the `k`-occurrence histogram (reverse nearest neighbor count, i.e. how often does a point occur in the `k`-nearest neighbor lists of other points). Parameters ---------- D : ndarray The ``n x n`` symmetric distance (similarity) matrix. k : int, optional (default: 5) Neighborhood size for `k`-occurence. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix verbose : int, optional (default: 0) Increasing level of output (progress report). Returns ------- S_k : float Hubness (skewness of `k`-occurence distribution) D_k : ndarray `k`-nearest neighbor lists N_k : ndarray `k`-occurence list References ---------- .. [1] Radovanović, M., Nanopoulos, A., & Ivanović, M. (2010). Hubs in Space : Popular Nearest Neighbors in High-Dimensional Data. Journal of Machine Learning Research, 11, 2487–2531. Retrieved from http://jmlr.csail.mit.edu/papers/volume11/radovanovic10a/ radovanovic10a.pdf """ log = Logging.ConsoleLogging() IO._check_distance_matrix_shape(D) IO._check_valid_metric_parameter(metric) if metric == 'distance': d_self = np.inf sort_order = 1 if metric == 'similarity': d_self = -np.inf sort_order = -1 if verbose: log.message("Hubness calculation (skewness of {}-occurence)".format(k)) D = D.copy() D_k = np.zeros((k, D.shape[1]), dtype=np.float32) n = D.shape[0] if issparse(D): pass # correct self-distance must be ensured upstream for sparse else: # Set self dist to inf np.fill_diagonal(D, d_self) # make non-finite (NaN, Inf) appear at the end of the sorted list D[~np.isfinite(D)] = d_self for i in range(n): if verbose and ((i+1)%10000==0 or i+1==n): log.message("NN: {} of {}.".format(i+1, n), flush=True) if issparse(D): d = D[i, :].toarray().ravel() # dense copy of one row else: # normal ndarray d = D[i, :] d[i] = d_self d[~np.isfinite(d)] = d_self # Randomize equal values in the distance matrix rows to avoid the # problem case if all numbers to sort are the same, which would yield # high hubness, even if there is none. rp = np.random.permutation(n) d2 = d[rp] d2idx = np.argsort(d2, axis=0)[::sort_order] D_k[:, i] = rp[d2idx[0:k]] # N-occurence N_k = np.bincount(D_k.astype(int).ravel(), minlength=n) # Hubness S_k = stats.skew(N_k) # return k-hubness, k-nearest neighbors, k-occurence if verbose: log.message("Hubness calculation done.", flush=True) return S_k, D_k, N_k
def local_scaling_sample(D:np.ndarray, k:int=7, metric:str='distance', train_ind:np.ndarray=None, test_ind:np.ndarray=None): """Transform a distance matrix with Local Scaling. --- DRAFT version --- Transforms the given distance matrix into new one using local scaling [1]_ with the given `k`-th nearest neighbor. There are two types of local scaling methods implemented. The original one and NICDM, both reduce hubness in distance spaces, similarly to Mutual Proximity. Parameters ---------- D : ndarray or csr_matrix The ``n x n`` symmetric distance (similarity) matrix. k : int, optional (default: 7) Neighborhood radius for local scaling. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. NOTE: self similarities in sparse `D_ls` are set to ``np.inf`` train_ind : ndarray, optional If given, use only these data points as neighbors for rescaling. test_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. Returns ------- D_ls : ndarray Secondary distance LocalScaling matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. """ log = ConsoleLogging() # Checking input io.check_sample_shape_fits(D, train_ind) io.check_valid_metric_parameter(metric) sparse = issparse(D) n = D.shape[0] if metric == 'similarity': if train_ind is not None: raise NotImplementedError kth = n - k exclude = -np.inf self_value = 1. log.warning("Similarity matrix support for LS is experimental.") else: # metric == 'distance': kth = k - 1 exclude = np.inf self_value = 0 if sparse: log.error("Sparse distance matrices are not supported.") raise NotImplementedError( "Sparse distance matrices are not supported.") D = np.copy(D) if test_ind is None: train_set_ind = slice(0, n) #take all n_ind = range(n) else: train_set_ind = np.setdiff1d(np.arange(n), test_ind) n_ind = test_ind # Exclude self distances for j, sample in enumerate(train_ind): D[sample, j] = exclude r = np.zeros(n) for i in range(n): if train_ind is None: if sparse: di = D[i, train_set_ind].toarray() else: di = D[i, train_set_ind] else: di = D[i, :] # all columns are training in this case r[i] = np.partition(di, kth=kth)[kth] if sparse: D_ls = lil_matrix(D.shape) # Number of nonzero cells per row nnz = D.getnnz(axis=1) else: D_ls = np.zeros_like(D) if metric == 'similarity': for i in n_ind: if sparse and nnz[i] <= k: # Don't rescale if there are too few D_ls[i, :] = D[i, :] # neighbors in the current row else: D_ls[i, :] = np.exp(-1 * D[i, :]**2 / (r[i] * r[train_ind])) else: for i in n_ind: D_ls[i, :] = 1 - np.exp(-1 * D[i, :]**2 / (r[i] * r[train_ind])) if test_ind is None: if sparse: return D_ls.tocsr() else: np.fill_diagonal(D_ls, self_value) return D_ls else: # Ensure correct self distances for j, sample in enumerate(train_ind): D_ls[sample, j] = self_value return D_ls[test_ind]
def score(D:np.ndarray, target:np.ndarray, k=5, metric:str='distance', test_set_ind:np.ndarray=None, verbose:int=0, sample_idx=None, filter_self=True): """Perform `k`-nearest neighbor classification. Use the ``n x n`` symmetric distance matrix `D` and target class labels `target` to perform a `k`-NN experiment (leave-one-out cross-validation or evaluation of test set; see parameter `test_set_ind`). Ties are broken by the nearest neighbor. Parameters ---------- D : ndarray The ``n x n`` symmetric distance (similarity) matrix. target : ndarray (of dtype=int) The ``n x 1`` target class labels (ground truth). k : int or array_like (of dtype=int), optional (default: 5) Neighborhood size for `k`-NN classification. For each value in `k`, one `k`-NN experiment is performed. HINT: Providing more than one value for `k` is a cheap means to perform multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix test_sed_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Perform a LOO-CV experiment - ndarray : Hold out points indexed in this array as test set. Fit model to remaining data. Evaluate model on test set. verbose : int, optional (default: 0) Increasing level of output (progress report). sample_idx : ... TODO add description filter_self : bool, optional, default: True Remove self similarities from sparse ``D``. This assumes that the highest similarity per row is the self similarity. NOTE: Quadratic dense matrices are always filtered for self distances/similarities, even if `filter_self` is set t0 `False`. Returns ------- acc : ndarray (shape=(n_k x 1), dtype=float) Classification accuracy (`n_k`... number of items in parameter `k`) HINT: Refering to the above example... ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment. corr : ndarray (shape=(n_k x n), dtype=int) Raw vectors of correctly classified items HINT: ... ``corr[1, :]`` gives these items for the ``k=5`` experiment. cmat : ndarray (shape=(n_k x n_t x n_t), dtype=int) Confusion matrix (``n_t`` number of unique items in parameter target) HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of the ``k=20`` experiment. """ # Check input sanity log = ConsoleLogging() if sample_idx is None: io.check_distance_matrix_shape(D) else: io.check_sample_shape_fits(D, sample_idx) io.check_distance_matrix_shape_fits_labels(D, target) io.check_valid_metric_parameter(metric) if metric == 'distance': d_self = np.inf sort_order = 1 if metric == 'similarity': d_self = -np.inf sort_order = -1 # Copy, because data is changed D = D.copy() target = target.astype(int) D_is_sparse = issparse(D) if verbose: log.message("Start k-NN experiment.") # Handle LOO-CV vs. 
test set mode if test_set_ind is None: n = D.shape[0] test_set_ind = range(n) # dummy train_set_ind = n # dummy else: # number of points to be classified n = test_set_ind.size # Indices of training examples train_set_ind = np.setdiff1d(np.arange(n), test_set_ind) if sample_idx is not None: raise NotImplementedError("Sample k-NN does not support train/" "test splits at the moment.") # Number of k-NN parameters try: k_length = k.size except AttributeError as e: if isinstance(k, int): k = np.array([k]) k_length = k.size elif isinstance(k, list): k = np.array(k) k_length = k.size else: raise e acc = np.zeros((k_length, 1)) corr = np.zeros((k_length, D.shape[0])) cl = np.sort(np.unique(target)) if D_is_sparse: # Add a label for unknown class (object w/o nonzero sim to any others) cl = np.append(cl, cl.max()+1) n_classes = len(cl) + 1 else: n_classes = len(cl) cmat = np.zeros((k_length, n_classes, n_classes)) classes = target.copy() for idx, cur_class in enumerate(cl): # change labels to 0, 1, ..., len(cl)-1 classes[target == cur_class] = idx if sample_idx is not None: sample_classes = classes[sample_idx] j = np.ones(n, int) j *= (n+1) # illegal indices will throw index out of bounds error j[sample_idx] = np.arange(len(sample_idx)) for j, sample in enumerate(sample_idx): D[sample, j] = d_self cl = range(len(cl)) rnd_classif = np.zeros(k_length) # Classify each point in test set for i in test_set_ind: if verbose and ((i+1)%1000==0 or i+1==n): log.message("Prediction: {} of {}.".format(i+1, n), flush=True) seed_class = classes[i] if D_is_sparse: row = D.getrow(i) else: row = D[i, :] if sample_idx is None: row[i] = d_self # Sort points in training set according to distance # Randomize, in case there are several points of same distance # (this is especially relevant for SNN rescaling) if sample_idx is None: rp = train_set_ind else: rp = np.arange(len(sample_idx)) if D_is_sparse: nnz = row.nnz rp = np.random.permutation(nnz) d2 = row.data[rp] # Partition for each k value kth = nnz - k - 1 # sort the two highest similarities to end kth = np.append(kth, [nnz-2, nnz-1]) # Clip negative indices (nnz < k) np.clip(kth, a_min=0, a_max=nnz-1, out=kth) # Remove duplicate k values and sort kth = np.unique(kth) d2idx = np.argpartition(d2, kth=kth) d2idx = d2idx[~np.isnan(d2[d2idx])][::-1] idx = row.nonzero()[1][rp[d2idx]] idx = idx[1:] # rem self sim else: rp = np.random.permutation(rp) d2 = row[rp] d2idx = np.argsort(d2, axis=0)[::sort_order] d2idx = d2idx[~np.isnan(d2[d2idx])] # filter NaN values idx = rp[d2idx] # More than one k is useful for cheap multiple k-NN experiments at once for j in range(k_length): # Make sure no inf/-inf/nan values are used for classification if D_is_sparse: #print(row[0, idx[0:k[j]]].toarray()) finite_val = np.isfinite(row[0, idx[0:k[j]]].toarray().ravel()) #print(finite_val) else: finite_val = np.isfinite(row[idx[0:k[j]]]) # However, if no values are finite, classify randomly if finite_val.sum() == 0: idx = np.random.permutation(idx) finite_val = np.ones_like(finite_val) rnd_classif[j] += 1 if sample_idx is None: nn_class = classes[idx[0:k[j]]][finite_val] else: #finite_val = np.isfinite(sample_row[idx[0:k[j]]]) nn_class = sample_classes[idx[0:k[j]]][finite_val] cs = np.bincount(nn_class.astype(int)) if cs.size > 0: max_cs = np.where(cs == np.max(cs))[0] else: max_cs = np.array([len(cl) - 1]) # misclassification label # "tie": use nearest neighbor if len(max_cs) > 1: if seed_class == nn_class[0]: acc[j] += 1/n corr[j, i] = 1 cmat[j, seed_class, nn_class[0]] += 1 # majority vote 
else: if cl[max_cs[0]] == seed_class: acc[j] += 1/n corr[j, i] = 1 cmat[j, seed_class, cl[max_cs[0]]] += 1 if np.any(rnd_classif): for x in rnd_classif: log.warning(("{} queries were classified randomly, because all " "distances were non-finite numbers.").format(x)) if verbose: log.message("Finished k-NN experiment.") return acc, corr, cmat
def __init__(self, counts, mu=None, reversible=False, Tinit=None):
    """ Sets the count matrix used for sampling. Assumes that the prior
        (if desired) is included.

    Parameters
    ----------
    counts : ndarray (n, n)
        the posterior count matrix
    mu : ndarray (n)
        optional stationary distribution; if given, the sampled transition
        matrix will have this stationary distribution.
    reversible : boolean
        should sample a reversible transition matrix.
    Tinit : ndarray (n, n)
        optional start point for sampling algorithm.

    Example
    -------
    >>> C = np.array([[5, 2], [1, 10]])
    >>> sampler = ITransitionMatrixSampler(C)
    >>> T = sampler.sample(10**6)
    >>> print(T)

    """
    if issparse(counts):
        counts = counts.toarray()
    # the interface in stallone takes counts as doubles
    counts = counts.astype(np.float64)
    try:
        C = ndarray_to_stallone_array(counts)
        jpackage = stallone.mc.sampling
        # convert types to java
        if Tinit is not None:
            Tinit = ndarray_to_stallone_array(Tinit)
        if mu is not None:
            mu = ndarray_to_stallone_array(mu)
        if reversible:
            if mu:  # fixed pi
                if Tinit:
                    self.sampler = jpackage.TransitionMatrixSamplerRevFixPi(
                        C, Tinit, mu)
                else:
                    self.sampler = jpackage.TransitionMatrixSamplerRevFixPi(
                        C, mu)
            else:  # sample reversible matrix, with arbitrary pi
                if Tinit:
                    self.sampler = jpackage.TransitionMatrixSamplerRev(
                        C, Tinit)
                else:
                    self.sampler = jpackage.TransitionMatrixSamplerRev(C)
        else:  # sample non rev
            if Tinit:
                self.sampler = jpackage.TransitionMatrixSamplerNonrev(
                    C, Tinit)
            else:
                self.sampler = jpackage.TransitionMatrixSamplerNonrev(C)
    except JavaException as je:
        log = getLogger()
        log.exception("Error during creation of tmatrix sampling wrapper:"
                      " stack\n%s" % je.stacktrace())
        raise
def to_array(self, potential_sparse_array):
    if issparse(potential_sparse_array):
        return potential_sparse_array.toarray()
    else:
        return potential_sparse_array
def score(self, X: np.ndarray = None, y=None, has_self_distances: bool = False): """ Estimate hubness in a data set. Hubness is estimated from the distances between all objects in X to all objects in Y. If Y is None, all-against-all distances between the objects in X are used. If self.metric == 'precomputed', X must be a distance matrix. Parameters ---------- X: ndarray, shape (n_query, n_features) or (n_query, n_indexed) Array of query vectors, or distance, if self.metric == 'precomputed' y: ignored has_self_distances: bool, default = False Define, whether a precomputed distance matrix contains self distances, which need to be excluded. Returns ------- hubness_measure: float or dict Return the hubness measure as indicated by `return_value`. Additional hubness indices are provided as attributes (e.g. :func:`robinhood_index_`). if return_value is 'all', a dict of all hubness measures is returned. """ check_is_fitted(self, 'X_train_') if X is None: X_test = self.X_train_ else: X_test = X X_test = check_array(X_test, accept_sparse=True) X_train = self.X_train_ kth = np.arange(self.k) start = 0 end = self.k if self.metric == 'precomputed': if X is not None: raise ValueError( f'No X must be passed with metric=="precomputed".') n_test, n_train = X_test.shape if has_self_distances: kth = np.arange(self.k + 1) start = 1 end = self.k + 1 else: if X is None: # Self distances do occur in this case kth = np.arange(self.k + 1) start = 1 end = self.k + 1 n_test, m_test = X_test.shape n_train, m_train = X_train.shape if m_test != m_train: raise ValueError( f'Number of features do not match: X_train.shape={X_train.shape}, ' f'X_test.shape={X_test.shape}.') if self.metric == 'precomputed': if issparse(X_test): k_neighbors = self._k_neighbors_precomputed_sparse(X_test) else: k_neighbors = self._k_neighbors_precomputed( X_test, kth, start, end) else: if X is None: k_neighbors = self._k_neighbors() else: k_neighbors = self._k_neighbors(X_test=X_test) if self.store_k_neighbors: self.k_neighbors = k_neighbors # Negative indices can occur, when ANN does not find enough neighbors, # and must be removed mask = k_neighbors < 0 if np.any(mask): k_neighbors = k_neighbors[~mask] del mask k_occurrence = np.bincount(k_neighbors.astype(int).ravel(), minlength=n_train) if self.store_k_occurrence: self.k_occurrence = k_occurrence # traditional skewness measure self.k_skewness = stats.skew(k_occurrence) # new skewness measure (truncated normal distribution) self.k_skewness_truncnorm = self._calc_skewness_truncnorm(k_occurrence) # Gini index limiting = 'space' if k_occurrence.shape[0] > 10_000 else 'time' self.gini_index = self._calc_gini_index(k_occurrence, limiting) # Robin Hood index self.robinhood_index = self._calc_robinhood_index(k_occurrence) # Atkinson index self.atkinson_index = self._calc_atkinson_index(k_occurrence) # anti-hub occurrence self.antihubs, self.antihub_occurrence = \ self._calc_antihub_occurrence(k_occurrence) # hub occurrence self.hubs, self.hub_occurrence = \ self._calc_hub_occurrence(k=self.k, k_occurrence=k_occurrence, n_test=n_test, hub_size=self.hub_size) # Largest hub self.groupie_ratio = k_occurrence.max() / n_test / self.k # Dictionary of all hubness measures self.hubness_measures = { 'k_skewness': self.k_skewness, 'k_skewness_truncnorm': self.k_skewness_truncnorm, 'atkinson': self.atkinson_index, 'gini': self.gini_index, 'robinhood': self.robinhood_index, 'antihubs': self.antihubs, 'antihub_occurrence': self.antihub_occurrence, 'hubs': self.hubs, 'hub_occurrence': self.hub_occurrence, 
'groupie_ratio': self.groupie_ratio, } if hasattr(self, 'k_neighbors'): self.hubness_measures['k_neighbors'] = self.k_neighbors if hasattr(self, 'k_occurrence'): self.hubness_measures['k_occurrence'] = self.k_occurrence if self.return_value == 'all': return self.hubness_measures else: return self.hubness_measures[self.return_value]
def r_precision(S:np.ndarray, y:np.ndarray, metric:str='distance', average:str='weighted', return_y_pred:int=0, verbose:int=0, n_jobs:int=1) -> float: """ Calculate R-Precision (recall at R-th position). Parameters ---------- S : ndarray or CSR matrix Distance (similarity) matrix y : ndarray Target (ground truth) labels metric : 'distance' or 'similarity', optional, default: 'similarity' Define, whether `S` is a distance or similarity matrix. average : 'weighted', 'macro' or None, optional, default: 'weighted' Ignored. Weighted and macro precisions are returned. return_y_pred : int, optional, default: 0 If > 0, return the labels of the `return_y_pred` nearest neighbors verbose : int, optional, default: 0 Increasing level of output. n_jobs : int, optional, default: 1 Number of parallel processes to use. Returns ------- r_precision : dictionary with following keys: macro : float Macro R-Precision. weighted : float Weighted R-Precision. per_item : ndarray R-Precision at the object. relevant_items : ndarray Relevant items per class. y_true : ndarray Target labels (req. for weighting). y_pred : ndarray Labels of some k-nearest neighbors """ io.check_distance_matrix_shape(S) io.check_distance_matrix_shape_fits_labels(S, y) io.check_valid_metric_parameter(metric) log = ConsoleLogging() n, _ = S.shape S_is_sparse = issparse(S) if metric != 'similarity' or not S_is_sparse: raise NotImplementedError("Only sparse similarity matrices so far.") # Map labels to 0..n(labels)-1 le = LabelEncoder() # Add int.min for misclassifications incorr_orig = np.array([np.nan]).astype(int) le.fit(np.append(y, incorr_orig)) y = le.transform(y) incorrect = le.transform(incorr_orig) # Number of relevant items, i.e. number of each label relevant_items = np.bincount(y) - 1 # one less for self class # R-Precision for each item r_prec = np.zeros(n, dtype=np.float) # Classify each point in test set if verbose: log.message("Creating shared memory data.") n_random_pred = mp.Value(ctypes.c_int) n_random_pred.value = 0 if verbose and log: log.message("Spawning processes for prediction.") y_pred = np.zeros((n, return_y_pred), dtype=float) kwargs = {'y_pred' : return_y_pred, 'incorrect' : incorrect} with mp.Pool(processes=n_jobs, initializer=_load_shared_csr, initargs=(S, y, n_random_pred, relevant_items)) as pool: for i, r in enumerate( pool.imap( func=partial(_r_prec_worker, **kwargs), iterable=range(n), chunksize=int(1e2))): if verbose and ((i+1)%int(1e7 / 10**verbose) == 0 or i == n-1): log.message("Classification: {} of {} on {}.".format( i+1, n, mp.current_process().name), flush=True) try: r_prec[i] = r[0] y_pred[i, :] = r[1] except: r_prec[i] = r if i == n-1: pass pool.join() if verbose and log: log.message("Retrieving nearest neighbors.") # Work-around for new scikit-learn requirement of 1D arrays for LabelEncoder y_pred = np.asarray([le.inverse_transform(col) for col in y_pred.T.astype(int)]).T if verbose and log: log.message("Finishing.") if n_random_pred.value: log.warning(("{} queries were classified randomly, because all " "distances were non-finite numbers or there were no other " "objects in the same class.").format(n_random_pred.value)) return_dict = {'macro' : r_prec.mean(), 'weighted' : np.average(r_prec, weights=relevant_items[y]), 'per_item' : r_prec, 'relevant_items' : relevant_items, 'y_true' : y, 'y_pred' : y_pred} return return_dict
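# Illustrative sketch (not the parallel implementation above): R-precision for
# a single query is the fraction of its R nearest neighbors sharing the query
# label, where R is the number of relevant items (class size minus the query
# itself).
import numpy as np

def _r_precision_single_sketch(similarities, labels, query):
    """R-precision of one query from a dense similarity row."""
    R = (labels == labels[query]).sum() - 1      # relevant items, excl. self
    if R < 1:
        return 0.0
    s = similarities.astype(float).copy()
    s[query] = -np.inf                           # exclude self-similarity
    nn = np.argsort(s)[::-1][:R]                 # R most similar objects
    return (labels[nn] == labels[query]).mean()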
def hubness(D:np.ndarray, k:int=5, metric='distance', verbose:int=0, n_jobs:int=-1): """Compute hubness of a distance matrix. Hubness [1]_ is the skewness of the `k`-occurrence histogram (reverse nearest neighbor count, i.e. how often does a point occur in the `k`-nearest neighbor lists of other points). Parameters ---------- D : ndarray The ``n x n`` symmetric distance (similarity) matrix. k : int, optional (default: 5) Neighborhood size for `k`-occurence. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix verbose : int, optional (default: 0) Increasing level of output (progress report). n_jobs : int, optional (default: -1) Number of parallel processes spawned for hubness calculation. Default value (-1): number of available CPUs. Returns ------- S_k : float Hubness (skewness of `k`-occurence distribution) D_k : ndarray `k`-nearest neighbor lists N_k : ndarray `k`-occurence list References ---------- .. [1] Radovanović, M., Nanopoulos, A., & Ivanović, M. (2010). Hubs in Space : Popular Nearest Neighbors in High-Dimensional Data. Journal of Machine Learning Research, 11, 2487–2531. Retrieved from http://jmlr.csail.mit.edu/papers/volume11/radovanovic10a/ radovanovic10a.pdf """ log = Logging.ConsoleLogging() IO._check_distance_matrix_shape(D) IO._check_valid_metric_parameter(metric) if metric == 'distance': d_self = np.inf sort_order = 1 if metric == 'similarity': d_self = -np.inf sort_order = -1 if verbose: log.message("Hubness calculation (skewness of {}-occurence)".format(k)) # Initialization n = D.shape[0] D = D.copy() D_k = np.zeros((k, D.shape[1]), dtype=np.float32 ) if issparse(D): pass # correct self-distance must be ensured upstream for sparse else: # Set self dist to inf np.fill_diagonal(D, d_self) # make non-finite (NaN, Inf) appear at the end of the sorted list D[~np.isfinite(D)] = d_self # Parallelization if n_jobs == -1: # take all cpus NUMBER_OF_PROCESSES = mp.cpu_count() # @UndefinedVariable else: NUMBER_OF_PROCESSES = n_jobs tasks = [] batches = [] batch_size = n // NUMBER_OF_PROCESSES for i in range(NUMBER_OF_PROCESSES-1): batches.append( np.arange(i*batch_size, (i+1)*batch_size) ) batches.append( np.arange((NUMBER_OF_PROCESSES-1)*batch_size, n) ) for idx, batch in enumerate(batches): submatrix = D[batch[0]:batch[-1]+1] tasks.append((_partial_hubness, (k, d_self, log, sort_order, batch, submatrix, idx, n, verbose))) task_queue = mp.Queue() # @UndefinedVariable done_queue = mp.Queue() # @UndefinedVariable for task in tasks: task_queue.put(task) for i in range(NUMBER_OF_PROCESSES): # @UnusedVariable mp.Process(target=_worker, args=(task_queue, done_queue)).start() # @UndefinedVariable for i in range(len(tasks)): # @UnusedVariable rows, Dk_part = done_queue.get() D_k[:, rows[0]:rows[-1]+1] = Dk_part for i in range(NUMBER_OF_PROCESSES): # @UnusedVariable task_queue.put('STOP') # k-occurence N_k = np.bincount(D_k.astype(int).ravel()) # Hubness S_k = stats.skew(N_k) if verbose: log.message("Hubness calculation done.", flush=True) # return hubness, k-nearest neighbors, N occurence return S_k, D_k, N_k
def predict(D:np.ndarray, target:np.ndarray, k=5, metric:str='distance', test_ind:np.ndarray=None, verbose:int=0, sample_idx=None, return_cmat=True): """Perform `k`-nearest neighbor classification. Use the ``n x n`` symmetric distance matrix `D` and target class labels `target` to perform a `k`-NN experiment (leave-one-out cross-validation or evaluation of test set; see parameter `test_ind`). Ties are broken by the nearest neighbor. Parameters ---------- D : ndarray The ``n x n`` symmetric distance (similarity) matrix. target : ndarray (of dtype=int) The ``n x 1`` target class labels (ground truth) or ``n x c`` in case of ``c`` binarized multilabels k : int or array_like (of dtype=int), optional (default: 5) Neighborhood size for `k`-NN classification. For each value in `k`, one `k`-NN experiment is performed. HINT: Providing more than one value for `k` is a cheap means to perform multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix test_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Perform a LOO-CV experiment - ndarray : Hold out points indexed in this array as test set. Fit model to remaining data. Evaluate model on test set. verbose : int, optional (default: 0) Increasing level of output (progress report). return_cmat : bool, optional, default: True If False, only return the predictions `y_pred`. Otherwise also return the confusion matrices. Returns ------- y_pred : ndarray (shape=(n_k, n, c), dtype=int) Predicted class labels (`n_k`... number of items in parameter `k`) HINT: Referring to the above example... ... ``y_pred[0]`` gives the predictions of the ``k=1`` experiment. cmat : ndarray (shape=(n_k x c x n_t x n_t), dtype=int) Confusion matrix (``n_t`` number of unique items in parameter target) HINT: ... ``cmat[2, 0, :, :]`` gives the confusion matrix of the first class in the ``k=20`` experiment in the following order: TN FP FN TP """ # Check input sanity log = ConsoleLogging() if sample_idx is None: io.check_distance_matrix_shape(D) else: io.check_sample_shape_fits(D, sample_idx) #io._check_distance_matrix_shape_fits_labels(D, target) io.check_valid_metric_parameter(metric) if metric == 'distance': d_self = np.inf sort_order = 1 if metric == 'similarity': d_self = -np.inf sort_order = -1 # Copy, because data is changed if not issparse(D): D = D.copy() target = target.astype(int) if target.ndim == 1: target = target[:, np.newaxis] if verbose: log.message("Start k-NN experiment.") # Handle LOO-CV vs. 
test set mode if test_ind is None: n = D.shape[0] test_set_ind = range(n) # dummy train_set_ind = n # dummy else: # number of points to be classified n = test_ind.size test_set_ind = test_ind # Indices of training examples train_set_ind = np.setdiff1d(np.arange(D.shape[0]), test_ind) if sample_idx is not None: raise NotImplementedError("Sample k-NN does not support train/" "test splits at the moment.") # Number of k-NN parameters try: k_length = k.size except AttributeError as e: if isinstance(k, int): k = np.array([k]) k_length = k.size elif isinstance(k, list): k = np.array(k) k_length = k.size else: raise e cl = np.sort(np.unique(target)) cmat = np.zeros((k_length, target.shape[1], len(cl), len(cl)), dtype=int) y_pred = np.zeros((k_length, *target.shape), dtype=int) classes = target.copy() for idx, cur_class in enumerate(np.array(cl).ravel()): # change labels to 0, 1, ..., len(cl)-1 classes[target == cur_class] = idx if sample_idx is not None: sample_classes = classes[sample_idx] j = np.ones(n, int) j *= (n+1) # illegal indices will throw index out of bounds error j[sample_idx] = np.arange(len(sample_idx)) for j, sample in enumerate(sample_idx): D[sample, j] = d_self cl = range(len(cl)) # Classify each point in test set for i in test_set_ind: if verbose and ((i+1)%1000==0 or i+1==n): log.message("Prediction: {} of {}.".format(i+1, n), flush=True) if issparse(D): row = D.getrow(i) #row = D.data ind = row.nonzero()[1] row = row.toarray().ravel() else: row = D[i, :] if sample_idx is None: row[i] = d_self # Sort points in training set according to distance # Randomize, in case there are several points of same distance # (this is especially relevant for SNN rescaling) if sample_idx is None: rp = train_set_ind else: if issparse(D): rp = ind else: rp = np.arange(len(sample_idx)) rp = np.random.permutation(rp) d2 = row[rp] d2idx = np.argsort(d2, axis=0)[::sort_order] d2idx = d2idx[~np.isnan(d2[d2idx])] # filter NaN values idx = rp[d2idx] # More than one k is useful for cheap multiple k-NN experiments at once for j in range(k_length): # Make sure no inf/-inf/nan values are used for classification finite_val = np.isfinite(row[idx[0:k[j]]]) # However, if no values are finite, classify randomly if finite_val.sum() == 0: idx = np.random.permutation(idx) finite_val = np.ones_like(finite_val) log.warning("Query was classified randomly, because all " "distances were non-finite numbers.") for l in range(target.shape[1]): l_classes = classes[:, l] if sample_idx is None: nn_class = l_classes[idx[0:k[j]]][finite_val] else: l_sample_classes = sample_classes[:, l] nn_class = l_sample_classes[idx[0:k[j]]][finite_val] cs = np.bincount(nn_class.astype(int)) max_cs = np.where(cs == np.max(cs))[0] seed_class = classes[i, l] # "tie": use nearest neighbor if len(max_cs) > 1: y_pred[j, i, l] = nn_class[0] cmat[j, l, seed_class, nn_class[0]] += 1 # majority vote else: y_pred[j, i, l] = cl[max_cs[0]] cmat[j, l, seed_class, cl[max_cs[0]]] += 1 if verbose: log.message("Finished k-NN experiment.") if return_cmat: return y_pred, cmat else: return y_pred
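# Illustrative sketch of the leave-one-out vote that `predict` performs for a
# single k and a single label column; a compact re-implementation for clarity,
# not the library API. Ties are resolved by `argmax` (lowest class index) here,
# whereas the function above falls back to the single nearest neighbor.
import numpy as np
from scipy.spatial.distance import pdist, squareform

def loo_knn_predict(D, target, k=5):
    D = D.astype(float).copy()
    np.fill_diagonal(D, np.inf)               # leave-one-out: ignore self-distance
    y_pred = np.empty_like(target)
    for i in range(D.shape[0]):
        nn = np.argsort(D[i])[:k]             # k nearest remaining points
        y_pred[i] = np.bincount(target[nn]).argmax()
    return y_pred

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(20, 3), rng.randn(20, 3) + 3.])
y = np.repeat([0, 1], 20)
D = squareform(pdist(X))
accuracy = np.mean(loo_knn_predict(D, y, k=5) == y)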
def hubness(D:np.ndarray, k:int=5, metric='distance', verbose:int=0, n_jobs:int=1, random_state=None, shuffle_equal=True): """Compute hubness of a distance matrix. Hubness [1]_ is the skewness of the `k`-occurrence histogram (reverse nearest neighbor count, i.e. how often does a point occur in the `k`-nearest neighbor lists of other points). Parameters ---------- D : ndarray The ``n x n`` symmetric distance (similarity) matrix or an ``n x m`` partial distances matrix (e.g. for train/test splits, with test objects in rows, train objects in column) NOTE: Partial distance matrices MUST NOT contain self distances. k : int, optional (default: 5) Neighborhood size for `k`-occurrence. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix verbose : int, optional (default: 0) Increasing level of output (progress report). n_jobs : int, optional (default: 1) Number of parallel processes spawned for hubness calculation. Value 1 (default): One process (not using multiprocessing) Value (-1): As many processes as number of available CPUs. random_state : int, optional Seed the RNG for reproducible results. NOTE: Currently only compatible with `n_jobs`=1 shuffle_equal : bool, optional If true, shuffle neighbors with identical distances to avoid artifact hubness. NOTE: This is especially useful for secondary distance measures with a restricted number of possible values, e.g. SNN or MP empiric. Returns ------- S_k : float Hubness (skewness of `k`-occurrence distribution) D_k : ndarray `k`-nearest neighbor lists N_k : ndarray `k`-occurrence list References ---------- .. [1] Radovanović, M., Nanopoulos, A., & Ivanović, M. (2010). Hubs in Space : Popular Nearest Neighbors in High-Dimensional Data. Journal of Machine Learning Research, 11, 2487–2531. 
Retrieved from http://jmlr.csail.mit.edu/papers/volume11/radovanovic10a/ radovanovic10a.pdf """ # Don't use multiprocessing environment when using only one job if n_jobs == 1: return _hubness_no_multiprocessing(D=D, k=k, metric=metric, verbose=verbose, random_state=random_state, shuffle_equal=shuffle_equal) if random_state is not None: raise ValueError("Seeding the RNG is not compatible with using n_jobs > 1.") log = ConsoleLogging() io.check_is_nD_array(arr=D, n=2, arr_type='Distance') io.check_valid_metric_parameter(metric) n, m = D.shape if k >= m: k_old = k k = m - 1 log.warning("Reducing k from {} to {}, so that it is less than " "the total number of neighbors.".format(k_old, k)) if metric == 'distance': d_self = np.inf sort_order = 1 kth = np.arange(k) if metric == 'similarity': d_self = -np.inf sort_order = -1 kth = np.arange(m - k, m) if verbose: log.message("Hubness calculation (skewness of {}-occurrence)".format(k)) # Initialization D = D.copy() D_k = np.zeros((n, k), dtype=np.float64) if issparse(D): pass # correct self-distance must be ensured upstream for sparse else: if n == m: # Set self dist to inf np.fill_diagonal(D, d_self) else: pass # Partial distance matrices MUST NOT contain self distances # make non-finite (NaN, Inf) appear at the end of the sorted list D[~np.isfinite(D)] = d_self # Parallelization if n_jobs == -1: # take all cpus NUMBER_OF_PROCESSES = mp.cpu_count() # @UndefinedVariable else: NUMBER_OF_PROCESSES = n_jobs D_k_ctype = RawArray(ctypes.c_int32, n*k) D_k = np.frombuffer(D_k_ctype, dtype=np.int32).reshape((n, k)) with Pool(processes=NUMBER_OF_PROCESSES, initializer=_hubness_load_shared_data, initargs=(D, D_k, )) as pool: for _ in pool.imap( func=partial(_hubness_nearest_neighbors, n=n, m=m, d_self=d_self, metric=metric, kth=kth, sort_order=sort_order, log=log, verbose=verbose, shuffle_equal=shuffle_equal), #chunksize=int(1e2), iterable=range(n)): pass # results handled within func # k-occurrence N_k = np.bincount(D_k.astype(int).ravel(), minlength=m) # Hubness S_k = stats.skew(N_k) if verbose: log.message("Hubness calculation done.", flush=True) # return hubness, k-nearest neighbors, N occurence return S_k, D_k, N_k
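# Illustrative sketch of why `shuffle_equal` exists: when many distances are
# exactly tied (typical for secondary measures such as shared-nearest-neighbor
# counts), a stable argsort always prefers the lowest indices, which inflates
# the k-occurrence of those points and thus the measured hubness. All names
# and the toy data below are made up for this example.
import numpy as np

rng = np.random.RandomState(0)
D = rng.randint(0, 3, size=(200, 200)).astype(float)   # heavily tied "distances"
D = (D + D.T) / 2.
np.fill_diagonal(D, np.inf)

def k_occurrence(D, k=5, shuffle_equal=False):
    n = D.shape[0]
    N_k = np.zeros(n, dtype=int)
    for i in range(n):
        order = rng.permutation(n) if shuffle_equal else np.arange(n)
        nn = order[np.argsort(D[i, order], kind='stable')[:k]]
        N_k[nn] += 1
    return N_k

print(k_occurrence(D, shuffle_equal=False).max(),   # a few points dominate the lists
      k_occurrence(D, shuffle_equal=True).max())    # counts spread out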
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance', test_ind:np.ndarray=None, n_jobs:int=1): """Transform a distance matrix with Local Scaling. Transforms the given distance matrix into new one using local scaling [1]_ with the given `k`-th nearest neighbor. There are two types of local scaling methods implemented. The original one and NICDM, both reduce hubness in distance spaces, similarly to Mutual Proximity. Parameters ---------- D : ndarray or csr_matrix The ``n x n`` symmetric distance (similarity) matrix. k : int, optional (default: 7) Neighborhood radius for local scaling. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. NOTE: self similarities in sparse `D_ls` are set to ``np.inf`` test_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. n_jobs : int, optional, default: 1 Number of processes for parallel computations. - `1`: Don't use multiprocessing. - `-1`: Use all CPUs Returns ------- D_ls : ndarray Secondary distance LocalScaling matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. """ log = ConsoleLogging() # Checking input io.check_distance_matrix_shape(D) io.check_valid_metric_parameter(metric) sparse = issparse(D) n = D.shape[0] if n_jobs == -1: n_jobs = cpu_count() if metric == 'similarity': kth = n - k exclude = -np.inf self_tmp_value = np.inf self_value = 1. log.warning("Similarity matrix support for LS is experimental.") if sparse and n_jobs != 1: log.warning("Parallel processing not implemented for sparse " "matrices. 
Using single process instead.") n_jobs = 1 else: # metric == 'distance': kth = k - 1 exclude = np.inf self_value = 0 self_tmp_value = self_value if sparse: log.error("Sparse distance matrices are not supported.") raise NotImplementedError( "Sparse distance matrices are not supported.") D = np.copy(D) if test_ind is None: train_ind = slice(0, n) #take all else: train_ind = np.setdiff1d(np.arange(n), test_ind) if sparse: r = np.zeros(n) for i in range(n): di = D[i, train_ind].toarray() di[i] = exclude r[i] = np.partition(di, kth=kth)[kth] D_ls = lil_matrix(D.shape) # Number of nonzero cells per row nnz = D.getnnz(axis=1) else: np.fill_diagonal(D, exclude) if n_jobs > 1: r_ctype = RawArray(ctypes.c_double, n) r = np.frombuffer(r_ctype, dtype=np.float64) with Pool(processes=n_jobs, initializer=_ls_load_shared_data, initargs=(D, train_ind, r, r_ctype)) as pool: for _ in pool.imap(func=partial(_ls_calculate_r, kth=kth), iterable=range(n)): pass # results handled within func else: r = np.partition(D[:, train_ind], kth=kth)[:, kth] if sparse or n_jobs == 1: D_ls = np.zeros_like(D) for i in range(n): # vectorized inner loop: calc only triu part tmp = np.empty(n-i) tmp[0] = self_tmp_value if metric == 'similarity': if sparse and nnz[i] <= k: # Don't rescale if there are tmp[1:] = np.nan # too few neighbors in row else: tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:])) else: tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:])) D_ls[i, i:] = tmp # copy triu to tril -> symmetric matrix (diag=zeros) # NOTE: does not affect self values, since inf+inf=inf and 0+0=0 D_ls += D_ls.T else: D_ls_ctype = RawArray(ctypes.c_double, D.size) D_ls = np.frombuffer(D_ls_ctype, dtype=np.float64).reshape(D.shape) with Pool(processes=n_jobs, initializer=_ls_load_shared_data, initargs=(D, train_ind, r, r_ctype, D_ls, D_ls_ctype)) as pool: for _ in pool.imap(func=partial(_ls_calculate_sec_dist, n=n, metric=metric, self_tmp_value=self_tmp_value), iterable=range(n)): pass # results handled within func # triu is copied to tril within func if sparse: for i, nz in enumerate(nnz): if nz <= k: # too few neighbors D_ls[i, :] = D[i, :] return D_ls.tocsr() else: np.fill_diagonal(D_ls, self_value) return D_ls
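# Illustrative sketch of the core local-scaling transform computed above, for
# the dense distance case without parallelism: r_i is the distance to the k-th
# nearest neighbor of point i, and D_ls[i, j] = 1 - exp(-D[i, j]**2 / (r_i * r_j)).
# The helper name `local_scaling_dense` is made up for this example.
import numpy as np
from scipy.spatial.distance import pdist, squareform

def local_scaling_dense(D, k=7):
    D = D.astype(float).copy()
    np.fill_diagonal(D, np.inf)                        # exclude self-distances
    r = np.partition(D, kth=k-1, axis=1)[:, k-1]       # k-th NN distance per point
    D_ls = 1. - np.exp(-D**2 / (r[:, np.newaxis] * r[np.newaxis, :]))
    np.fill_diagonal(D_ls, 0.)                         # restore self-distances
    return D_ls

D = squareform(pdist(np.random.RandomState(0).rand(50, 20)))
D_ls = local_scaling_dense(D, k=7)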
def fit_transform(self, X, Y=None, has_self_distances=False): # Let's assume there are no self distances in X kth = np.arange(self.k) start = 0 end = self.k if self.metric == 'precomputed': if Y is not None: raise ValueError( "Y must be None when using precomputed distances.") n_test, n_train = X.shape if n_test == n_train and has_self_distances: kth = np.arange(self.k + 1) start = 1 end = self.k + 1 else: n_test, m_test = X.shape if Y is None: Y = X # Self distances do occur in this case kth = np.arange(self.k + 1) start = 1 end = self.k + 1 n_train, m_train = Y.shape assert m_test == m_train, f'Number of features does not match: {m_test} != {m_train}' if self.metric == 'precomputed': if issparse(X): k_neighbors = self._k_neighbors_precomputed_sparse(X) else: k_neighbors = self._k_neighbors_precomputed(X, kth, start, end) else: k_neighbors = self._k_neighbors( X, Y, kth=kth, n_test=n_test, start=start, end=end) if self.return_k_neighbors: self.k_neighbors_ = k_neighbors k_occurrence = np.bincount( k_neighbors.astype(int).ravel(), minlength=n_train) if self.return_k_occurrence: self.k_occurrence_ = k_occurrence # traditional skewness measure self.k_skewness_ = stats.skew(k_occurrence) # new skewness measure (truncated normal distribution) self.k_skewness_truncnorm_ = self._skewness_truncnorm(k_occurrence) # Gini index if k_occurrence.shape[0] > 10_000: limiting = 'space' else: limiting = 'time' self.gini_index_ = self._gini_index(k_occurrence, limiting) # Robin Hood index self.hood_index_ = self._hood_index(k_occurrence) # Atkinson index self.atkinson_index_ = self._atkinson_index(k_occurrence) # anti-hub occurrence self.antihubs_, self.antihub_occurrence_ = \ self._antihub_occurrence(k_occurrence) # hub occurrence self.hubs_, self.hub_occurrence_ = \ self._hub_occurrence(k=self.k, k_occurrence=k_occurrence, n_test=n_test, hub_size=self.hub_size) # Largest hub # TODO That should probably also be divided by k... self.groupie_ratio_ = k_occurrence.max() / n_test return self
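# Illustrative sketch of two of the inequality measures that fit_transform
# collects from the k-occurrence vector. The formulas below are the textbook
# Robin Hood (Hoover) index and Gini coefficient and may differ in detail from
# the estimator's private helpers (_hood_index, _gini_index), which are not
# shown here.
import numpy as np

def robin_hood_index(k_occurrence):
    # share of k-occurrences that would need redistribution for perfect equality
    return 0.5 * np.abs(k_occurrence - k_occurrence.mean()).sum() / k_occurrence.sum()

def gini_index(k_occurrence):
    x = np.sort(k_occurrence.astype(float))
    n = x.size
    return (2. * np.arange(1, n + 1) - n - 1).dot(x) / (n * x.sum())

k_occ = np.array([0, 0, 1, 1, 2, 2, 3, 30])   # one strong hub
print(robin_hood_index(k_occ), gini_index(k_occ))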
def nicdm_sample(D:np.ndarray, k:int=7, metric:str='distance', train_ind:np.ndarray=None, test_ind:np.ndarray=None): """Transform a distance matrix with local scaling variant NICDM. --- DRAFT version --- Transforms the given distance matrix into new one using NICDM [1]_ with the given neighborhood radius `k` (average). There are two types of local scaling methods implemented. The original one and the non-iterative contextual dissimilarity measure, both reduce hubness in distance spaces, similarly to Mutual Proximity. Parameters ---------- D : ndarray or csr_matrix The ``n x n`` symmetric distance (similarity) matrix. k : int, optional (default: 7) Neighborhood radius for local scaling. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. NOTE: self similarities in sparse `D_ls` are set to ``np.inf`` train_ind : ndarray, optional If given, use only these data points as neighbors for rescaling. test_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. Returns ------- D_nicdm : ndarray Secondary distance NICDM matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. """ # Checking input io.check_sample_shape_fits(D, train_ind) io.check_valid_metric_parameter(metric) if metric == 'similarity': raise NotImplementedError("NICDM does not support similarity matrices " "at the moment.") else: # metric == 'distance': D = np.copy(D) kth = np.arange(k) exclude = np.inf self_value = 0 if issparse(D): raise NotImplementedError( "Sparse distance matrices are not supported.") n = D.shape[0] if test_ind is None: n_ind = range(n) else: n_ind = test_ind # Exclude self distances for j, sample in enumerate(train_ind): D[sample, j] = exclude # Statistics r = np.partition(D, kth=kth, axis=1)[:, :k].mean(axis=1) r_geom = _local_geomean(r) #knn.ravel()) # Calculate secondary distances D_nicdm = np.zeros_like(D) for i in n_ind: # vectorized inner loop (using broadcasting) D_nicdm[i, :] = (r_geom * D[i, :]) / np.sqrt(r[i] * r[train_ind]) #D_nicdm[i, :] = ((r_geom**2) * D[i, :]) / (r[i] * r[train_ind]) # Ensure correct self distances and return sec. dist. matrix if test_ind is None: np.fill_diagonal(D_nicdm, self_value) return D_nicdm else: for j, sample in enumerate(train_ind): D_nicdm[sample, j] = self_value return D_nicdm[test_ind]
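# Illustrative sketch of the NICDM rescaling itself for the dense, full-matrix
# case (no train/test split): r_i is the mean distance from point i to its k
# nearest neighbors, r_geom the geometric mean of all r_i, and
# D_nicdm[i, j] = r_geom * D[i, j] / sqrt(r_i * r_j). The helper name
# `nicdm_dense` is made up for this example.
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.stats import gmean

def nicdm_dense(D, k=7):
    D = D.astype(float).copy()
    np.fill_diagonal(D, np.inf)                        # exclude self-distances
    r = np.partition(D, kth=np.arange(k), axis=1)[:, :k].mean(axis=1)
    r_geom = gmean(r)
    D_nicdm = r_geom * D / np.sqrt(r[:, np.newaxis] * r[np.newaxis, :])
    np.fill_diagonal(D_nicdm, 0.)                      # restore self-distances
    return D_nicdm

D = squareform(pdist(np.random.RandomState(0).rand(50, 20)))
D_nicdm = nicdm_dense(D, k=7)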
def simcond(self, xo, method='approx', i_unknown=None): """ Simulate values conditionally on observed known values Parameters ---------- x : vector timeseries including missing data. (missing data must be NaN if i_unknown is not given) Assumption: The covariance of x is equal to self and have the same sample period. method : string defining method used in the conditional simulation. Options are: 'approximate': Condition only on the closest points. Quite fast 'exact' : Exact simulation. Slow for large data sets, may not return any result due to near singularity of the covariance matrix. i_unknown : integers indices to spurious or missing data in x Returns ------- sample : ndarray a random sample of the missing values conditioned on the observed data. mu, sigma : ndarray mean and standard deviation, respectively, of the missing values conditioned on the observed data. Notes ----- SIMCOND generates the missing values from x conditioned on the observed values assuming x comes from a multivariate Gaussian distribution with zero expectation and Auto Covariance function R. See also -------- CovData1D.sim TimeSeries.reconstruct, rndnormnd Reference --------- Brodtkorb, P, Myrhaug, D, and Rue, H (2001) "Joint distribution of wave height and wave crest velocity from reconstructed data with application to ringing" Int. Journal of Offshore and Polar Engineering, Vol 11, No. 1, pp 23--32 Brodtkorb, P, Myrhaug, D, and Rue, H (1999) "Joint distribution of wave height and wave crest velocity from reconstructed data" in Proceedings of 9th ISOPE Conference, Vol III, pp 66-73 """ x = atleast_1d(xo).ravel() acf = self._get_acf() num_x = len(x) num_acf = len(acf) if i_unknown is not None: x[i_unknown] = nan i_unknown = flatnonzero(isnan(x)) num_unknown = len(i_unknown) mu1o = zeros((num_unknown, )) mu1o_std = zeros((num_unknown, )) sample = zeros((num_unknown, )) if num_unknown == 0: warnings.warn('No missing data, no point to continue.') return sample, mu1o, mu1o_std if num_unknown == num_x: warnings.warn('All data missing, returning sample from' + ' the apriori distribution.') mu1o_std = ones(num_unknown) * sqrt(acf[0]) return self.sim(ns=num_unknown, cases=1)[:, 1], mu1o, mu1o_std i_known = flatnonzero(1 - isnan(x)) if method.startswith('exac'): # exact but slow. 
It also may not return any result if num_acf > 0.3 * num_x: Sigma = toeplitz(hstack((acf, zeros(num_x - num_acf)))) else: acf[0] = acf[0] * 1.00001 Sigma = sptoeplitz(hstack((acf, zeros(num_x - num_acf)))) Soo, So1, S11 = self._split_cov(Sigma, i_known, i_unknown) if issparse(Sigma): So1 = So1.todense() S11 = S11.todense() S1o_Sooinv = spsolve(Soo + Soo.T, 2 * So1).T else: Sooinv_So1, _res, _rank, _s = lstsq(Soo + Soo.T, 2 * So1, cond=1e-4) S1o_Sooinv = Sooinv_So1.T mu1o = S1o_Sooinv.dot(x[i_known]) Sigma1o = S11 - S1o_Sooinv.dot(So1) if (diag(Sigma1o) < 0).any(): raise ValueError('Failed to converge to a solution') mu1o_std = sqrt(diag(Sigma1o)) sample[:] = rndnormnd(mu1o, Sigma1o, cases=1).ravel() elif method.startswith('appr'): # approximate: condition only on the closest points Nsig = min(2 * num_acf, num_x) Sigma = toeplitz(hstack((acf, zeros(Nsig - num_acf)))) overlap = int(Nsig / 4) # indices to the points used idx = r_[0:Nsig] + max(0, min(i_unknown[0] - overlap, num_x - Nsig)) mask_unknown = zeros(num_x, dtype=bool) # temporary storage of indices to missing points mask_unknown[i_unknown] = True t_unknown = where(mask_unknown[idx])[0] t_known = where(1 - mask_unknown[idx])[0] ns = len(t_unknown) # number of missing data in the interval num_restored = 0 # number of previously simulated points x2 = x.copy() while ns > 0: Soo, So1, S11 = self._split_cov(Sigma, t_known, t_unknown) if issparse(Soo): So1 = So1.todense() S11 = S11.todense() S1o_Sooinv = spsolve(Soo + Soo.T, 2 * So1).T else: Sooinv_So1, _res, _rank, _s = lstsq(Soo + Soo.T, 2 * So1, cond=1e-4) S1o_Sooinv = Sooinv_So1.T Sigma1o = S11 - S1o_Sooinv.dot(So1) if (diag(Sigma1o) < 0).any(): raise ValueError('Failed to converge to a solution') ix = slice(num_restored, num_restored + ns) # standard deviation of the expected surface mu1o_std[ix] = np.maximum(mu1o_std[ix], sqrt(diag(Sigma1o))) # expected surface conditioned on the closest known # observations from x mu1o[ix] = S1o_Sooinv.dot(x2[idx[t_known]]) # sample conditioned on the known observations from x mu1os = S1o_Sooinv.dot(x[idx[t_known]]) sample[ix] = rndnormnd(mu1os, Sigma1o, cases=1) if idx[-1] == num_x - 1: ns = 0 # no more points to simulate else: x2[idx[t_unknown]] = mu1o[ix] # expected surface x[idx[t_unknown]] = sample[ix] # sampled surface # removing indices to data which has been simulated mask_unknown[idx[:-overlap]] = False # data we want to simulate once more nw = sum(mask_unknown[idx[-overlap:]]) num_restored += ns - nw # update number of points simulated so far idx = self._update_window(idx, i_unknown, num_x, num_acf, overlap, nw, num_restored) # find new interval with missing data t_unknown = flatnonzero(mask_unknown[idx]) t_known = flatnonzero(1 - mask_unknown[idx]) ns = len(t_unknown) # number of missing data in the interval return sample, mu1o, mu1o_std
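# Illustrative sketch of the conditional-Gaussian step that simcond builds on:
# for a zero-mean Gaussian vector with covariance blocks Soo (known/known),
# S11 (unknown/unknown) and So1 (known/unknown), the unknown values given the
# observed x_o are Gaussian with mean S1o Soo^{-1} x_o and covariance
# S11 - S1o Soo^{-1} So1. The toy AR(1) autocovariance below is made up.
import numpy as np
from scipy.linalg import toeplitz, lstsq

acf = 0.8 ** np.arange(10)                   # toy AR(1) autocovariance
Sigma = toeplitz(acf)                        # 10 x 10 covariance matrix
i_unknown = np.array([3, 4])
i_known = np.setdiff1d(np.arange(10), i_unknown)
x = np.random.RandomState(0).multivariate_normal(np.zeros(10), Sigma)

Soo = Sigma[np.ix_(i_known, i_known)]
So1 = Sigma[np.ix_(i_known, i_unknown)]
S11 = Sigma[np.ix_(i_unknown, i_unknown)]
S1o_Sooinv = lstsq(Soo, So1)[0].T            # S1o Soo^{-1}
mu1o = S1o_Sooinv @ x[i_known]               # conditional mean of the missing values
Sigma1o = S11 - S1o_Sooinv @ So1             # conditional covariance
mu1o_std = np.sqrt(np.diag(Sigma1o))         # conditional standard deviation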
def __nearest_neighbors_search(pipe_to_exec, source_file_path, file_path): ''' runs "pipe_to_exec" nearest neighbors search estimator parameters: * source_file_path : hdf file in which input documents, queries and targets are stored * file_path: hdf filename where nns results will be stored ''' # print(linei.describe) d = hdf_to_sparse_matrix('documents', source_file_path) pipe_to_exec.fit(d, None) d_mean_time = pipe_to_exec.steps[0][1].fit_time print("fitted in %f s" % (d_mean_time)) del d q = hdf_to_sparse_matrix('queries', source_file_path) d_indices, qd_distances, q_mean_time = pipe_to_exec.transform(q) # print("mean retrieval time %f s"%(q_mean_time)) time_dataframe = pd.DataFrame({ 'documents_mean_time': [d_mean_time], 'queries_mean_time': [q_mean_time], }) ''' storing nearest neighbors search results ''' time_dataframe.to_hdf(file_path.replace('results.h5', 'time.h5'), 'time_dataframe') sparse_matrix_to_hdf(d_indices, 'retrieved_docs', file_path) sparse_matrix_to_hdf(lil_matrix(qd_distances), 'qd_distances', file_path) del q, d_mean_time, q_mean_time, qd_distances, time_dataframe ''' Evaluating results in terms of Precision, Recalls and MAP. ''' t = hdf_to_sparse_matrix('targets', source_file_path) retrieved_relevants = [] for q_index in range(d_indices.shape[0]): q_retrieved_relevants = np.cumsum(t[q_index, d_indices[q_index, :]].A, axis=1) retrieved_relevants.append(q_retrieved_relevants) retrieved_relevants = vstack(retrieved_relevants) ''' broadcasting ''' approachi_recalls = np.divide(retrieved_relevants, np.matrix(t.sum(axis=1))) ranking_sum = np.multiply( np.ones(retrieved_relevants.shape), np.matrix(range(1, retrieved_relevants.shape[1] + 1))) approachi_precisions = np.divide(retrieved_relevants, ranking_sum) average_precision = np.zeros((d_indices.shape[0], 1)) for q_index in range(d_indices.shape[0]): relevants_precision = np.multiply(approachi_precisions[q_index, :], t[q_index, d_indices[q_index, :]].A) average_precision[q_index, 0] = relevants_precision.mean(axis=1) # print(q_index,'.MAP =',average_precision[q_index,0]) # print(t.sum(axis=1)) # print(retrieved_relevants) del d_indices, retrieved_relevants # print("MAP=",average_precision.mean(),average_precision.std(),'precision.sum=',average_precision.sum()) # print("recalls.sum = ",approachi_recalls.sum(),'| mean = ',approachi_recalls.sum()/(approachi_recalls.shape[0]*approachi_recalls.shape[1])) for to_store, to_store_name in [(approachi_precisions, 'precisions'), (approachi_recalls, 'recalls'), (average_precision, 'average_precisions')]: if not issparse(to_store): to_store = csr_matrix(to_store) sparse_matrix_to_hdf( to_store, to_store_name, file_path.replace('results', 'results_evaluation')) del to_store
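# Illustrative sketch of the retrieval evaluation done at the end of the
# function above: given ranked document indices per query and a binary
# relevance matrix, precision@r and recall@r follow from the cumulative number
# of relevant documents. Average precision is computed here in its standard
# form (normalised by the number of relevant documents), which differs
# slightly from the per-rank mean used above. All names are made up.
import numpy as np

def ranked_retrieval_metrics(retrieved, relevance):
    # retrieved: (n_queries, n_ranks) document indices, best match first
    # relevance: (n_queries, n_docs) binary ground truth
    rel_at_rank = np.take_along_axis(relevance, retrieved, axis=1)
    cum_rel = np.cumsum(rel_at_rank, axis=1)
    ranks = np.arange(1, retrieved.shape[1] + 1)
    precisions = cum_rel / ranks
    recalls = cum_rel / relevance.sum(axis=1, keepdims=True)
    avg_prec = (precisions * rel_at_rank).sum(axis=1) \
        / np.maximum(rel_at_rank.sum(axis=1), 1)
    return precisions, recalls, avg_prec

retrieved = np.array([[2, 0, 1, 3]])
relevance = np.array([[1, 0, 1, 0]])
precisions, recalls, avg_prec = ranked_retrieval_metrics(retrieved, relevance)
# MAP over all queries would be avg_prec.mean()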