def __init__(self,
                 D: np.ndarray,
                 secondary_distance_type: str,
                 metric: str = 'distance',
                 classes: np.ndarray = None,
                 vectors: np.ndarray = None):
        """Initialize a hubness experiment"""

        io.check_distance_matrix_shape(D)
        io.check_valid_metric_parameter(metric)
        if secondary_distance_type not in SEC_DIST.keys():
            raise ValueError("Requested secondary distance type unknown.")
        if classes is not None:
            io.check_distance_matrix_shape_fits_labels(D, classes)
        if vectors is None:
            self.embedding_dim = None
        else:  # got vectors
            io.check_distance_matrix_shape_fits_vectors(D, vectors)
            self.embedding_dim = vectors.shape[1]
        self.original_distance = D
        self.secondary_distance_type = secondary_distance_type
        self.classes = classes
        self.vectors = vectors
        self.metric = metric
        self.n = D.shape[0]
        # Obtained later through functions:
        self.secondary_distance = None
        self.hubness = dict()
        self.anti_hubs = dict()
        self.max_hub_k_occurence = dict()
        self.knn_accuracy = dict()
        self.gk_index = None
Example #2
0
def nicdm(D:np.ndarray, k:int=7, metric:str='distance',
          test_ind:np.ndarray=None, n_jobs:int=1):
    """Transform a distance matrix with local scaling variant NICDM.

    Transforms the given distance matrix into new one using NICDM [1]_
    with the given neighborhood radius `k` (average). There are two types of
    local scaling methods implemented. The original one and the non-iterative
    contextual dissimilarity measure, both reduce hubness in distance spaces,
    similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance'}, optional (default: 'distance')
        Currently, only distance matrices are supported.

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_nicdm : ndarray
        Secondary distance NICDM matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    #log = logging.ConsoleLogging()
    # Checking input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    if metric == 'similarity':
        raise NotImplementedError("NICDM does not support similarity matrices "
                                  "at the moment.")
    else:
        kth = np.arange(k)
        exclude = np.inf
    if n_jobs == -1:
        n_jobs = cpu_count()
    D = np.copy(D)
    n = D.shape[0]

    if test_ind is None:
        train_ind = slice(0, n)
    else:
        train_ind = np.setdiff1d(np.arange(n), test_ind)

    np.fill_diagonal(D, exclude)
    if n_jobs > 1:
        r_ctype = RawArray(ctypes.c_double, n)
        r = np.frombuffer(r_ctype, dtype=np.float64)
        with Pool(processes=n_jobs,
                  initializer=_nicdm_load_shared_data,
                  initargs=(D, train_ind, r, r_ctype)) as pool:
            for i, knn in enumerate(pool.imap(
                func=partial(_nicdm_calculate_r, kth=kth, k=k),
                iterable=range(n))):
                pass # r is handled within func
            r_geom = _local_geomean(r)
        D_nicdm_ctype = RawArray(ctypes.c_double, D.size)
        D_nicdm = np.frombuffer(D_nicdm_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_nicdm_load_shared_data,
                  initargs=(D, train_ind, r, r_ctype, D_nicdm, D_nicdm_ctype)) as pool:
            for _ in pool.imap(
                func=partial(_nicdm_calculate_sec_dist, r_geom=r_geom, n=n, metric=metric),
                iterable=range(n)):
                pass # results handled within func
    else: # no multiprocessing
        knn = np.partition(D[:, train_ind], kth=kth, axis=1)[:, :k]
        r = knn.mean(axis=1)
        r_geom = _local_geomean(r)

        D_nicdm = np.zeros_like(D)
        for i in range(n):
            # vectorized inner loop for 100x speed-up (using broadcasting)
            #D_nicdm[i, i+1:] = ((r_geom**2) * D[i, i+1:]) / (r[i] * r[i+1:])
            D_nicdm[i, i+1:] = (r_geom * D[i, i+1:]) / np.sqrt(r[i] * r[i+1:])
        D_nicdm += D_nicdm.T

    return D_nicdm
Example #3
0
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance',
                  test_ind:np.ndarray=None, n_jobs:int=1):
    """Transform a distance matrix with Local Scaling.

    Transforms the given distance matrix into new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local
    scaling methods implemented. The original one and NICDM, both reduce
    hubness in distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if n_jobs == -1:
        n_jobs = cpu_count()
    if metric == 'similarity':
        kth = n - k
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
        if sparse and n_jobs != 1:
            log.warning("Parallel processing not implemented for sparse "
                        "matrices. Using single process instead.")
            n_jobs = 1
    else: # metric == 'distance':
        kth = k - 1
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")
    D = np.copy(D)

    if test_ind is None:
        train_ind = slice(0, n) #take all        
    else:
        train_ind = np.setdiff1d(np.arange(n), test_ind)
    if sparse:
        r = np.zeros(n)
        for i in range(n):
            di = D[i, train_ind].toarray()
            di[i] = exclude
            r[i] = np.partition(di, kth=kth)[kth]
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        np.fill_diagonal(D, exclude)
        if n_jobs > 1:
            r_ctype = RawArray(ctypes.c_double, n)
            r = np.frombuffer(r_ctype, dtype=np.float64)
            with Pool(processes=n_jobs,
                      initializer=_ls_load_shared_data,
                      initargs=(D, train_ind, r, r_ctype)) as pool:
                for _ in pool.imap(func=partial(_ls_calculate_r, kth=kth),
                                   iterable=range(n)):
                    pass # results handled within func
        else:
            r = np.partition(D[:, train_ind], kth=kth)[:, kth]

    if sparse or n_jobs == 1:
        D_ls = np.zeros_like(D)
        for i in range(n):
            # vectorized inner loop: calc only triu part
            tmp = np.empty(n-i)
            tmp[0] = self_tmp_value
            if metric == 'similarity':
                if sparse and nnz[i] <= k:  # Don't rescale if there are
                    tmp[1:] = np.nan        # too few neighbors in row
                else:
                    tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            else:
                tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            D_ls[i, i:] = tmp
        # copy triu to tril -> symmetric matrix (diag=zeros)
        # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
        D_ls += D_ls.T
    else:
        D_ls_ctype = RawArray(ctypes.c_double, D.size)
        D_ls = np.frombuffer(D_ls_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_ls_load_shared_data,
                  initargs=(D, train_ind, r, r_ctype, D_ls, D_ls_ctype)) as pool:
            for _ in pool.imap(func=partial(_ls_calculate_sec_dist,
                                  n=n, metric=metric,
                                  self_tmp_value=self_tmp_value),
                               iterable=range(n)):
                pass # results handled within func
        # triu is copied to tril within func
    if sparse:
        for i, nz in enumerate(nnz):
            if nz <= k: # too few neighbors
                D_ls[i, :] = D[i, :]
        return D_ls.tocsr()
    else:
        np.fill_diagonal(D_ls, self_value)
        return D_ls
Example #4
0
def centering(X: np.ndarray,
              metric: str = 'vector',
              test_set_mask: np.ndarray = None):
    """
    Perform  centering, i.e. shift the origin to the data centroid.

    Centering of vector data `X` with ``n`` objects in an ``m``-dimensional
    feature space.
    The mean of each feature is calculated and subtracted from each
    point [1]_. In distance based mode, it must be checked upstream, that
    the distance matrix is a gram matrix as described below!

    Parameters
    ----------
    X : ndarray
        - An ``(n x m)`` vector data matrix with ``n`` objects in an
          ``m``-dimensional feature space
        - An ``(n x n)`` distance matrix
          of form ``K = X(X.T)``, if `X` is an ``(n x m)`` matrix;
          and of form ``K = (X.T)X``, if `X` is an ``(m x n)`` matrix,
          where ``X.T`` denotes the transpose of `X`.

        NOTE: The type must be defined via parameter 'metric'!

    metric : {'vector', 'inner'}, optional (Default: 'vector')
        Define, whether `X` is vector data or a Gram matrix of inner product
        similarities as described above.

    test_set_mask : ndarray, optional (default: None)
        Hold back data as a test set and perform centering on the remaining 
        data (training set).

    Returns
    ------- 
    X_cent : ndarray

        Centered vectors with shape (n, m), if given vector data.

    K_cent : ndarray

        Centered inner product similarities with shape (n, n), if given Gram matrix.
        
    References
    ----------
    .. [1] Suzuki, I., Hara, K., Shimbo, M., Saerens, M., & Fukumizu, K. (2013). 
           Centering similarity measures to reduce hubs. In Proceedings of the 
           2013 Conference on Empirical Methods in Natural Language Processing 
           (pp 613–623). 
           Retrieved from https://www.aclweb.org/anthology/D/D13/D13-1058.pdf
    """
    # Kernel based centering requires inner product similarities, NOT distances.
    # Since the parameter was previously erroneously called 'distance',
    # this is kept for compatibility reasons.
    if metric in ('similarity', 'distance', 'inner', 'inner_product'):
        if test_set_mask is not None:
            raise NotImplementedError("Kernel based centering does not "
                                      "support train/test splits so far.")
        io.check_distance_matrix_shape(X)
        n = X.shape[0]
        H = np.identity(n) - np.ones((n, n)) / n
        # K = X.T.X must be provided upstream
        return H.dot(X).dot(H)
    elif metric == 'vector':
        n = X.shape[0]
        if test_set_mask is None:
            # center among all data
            return X - np.mean(X, axis=0)
        else:
            # center among training data
            train_ind = np.setdiff1d(np.arange(n), test_set_mask)
            return X - np.mean(X[train_ind], axis=0)
    else:
        raise ValueError("Parameter 'metric' must be 'inner' or 'vector'.")
def simhub(D: np.ndarray,
           y: np.ndarray,
           train_ind: np.ndarray = None,
           test_ind: np.ndarray = None,
           s: int = 50,
           return_distances: bool = True,
           vect_usage: int = 0):
    """Calculate dissimilarity based on hubness-aware SNN distances [1]_.

    Parameters
    ----------
    D : ndarray
        The ``n x s`` distance or similarity matrix, where ``n`` and ``s``
        are the dataset and sample size, respectively.

    y : ndarray or None
        Class labels. Required for supervised simhub (simhubIN + simhubPUR).
        If None, calculate unsupervised simhubIN as per equation (6) in [1]_.

    train_ind : ndarray, optional, default: None
        The index array that determines, to which data points the columns in
        `D` correspond. Not required, if `D` is a quadratic all-against-all
        distance matrix.

    test_ind : ndarray, optional, default: None
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    s : int, optional, default: 50
        Neighborhood size. Can be optimized as to minimize hubness.

    return_distances : bool, optional, default: True
        If True, return distances (1 - similarities).
        Otherwise return similarities.

    vect_usage : int, optional, default: 0
        If > 0, always use vectorization for the inner simhub loop.
        If < 0, always use nested loops.
        If == 0, this is dependent on data set size
        and vectorization is used if ``n >= 2000``.

    Returns
    -------
    D_shi : ndarray
        Secondary distance (simhubIN) matrix.

    References
    ----------
    .. [1] Tomašev, N., Mladenić, D.(2012).
           Hubness-aware shared neighbor distances for high-dimensional
           $$k$$ -nearest neighbor classification.
           Knowledge and Information Systems, 39(1), 89–122.
           http://doi.org/10.1007/s10115-012-0607-5
    """
    if train_ind is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, train_ind)
    # Assuming distances in D
    self_value = 0.
    sort_order = 1
    exclude = np.inf
    distance = D.copy()
    n, m = distance.shape
    if not 0 < s < m:
        raise ValueError("Neighbor hood size s, must be [1, {}-1], but "
                         "was {}.".format(m, s))
    if test_ind is None:
        n_ind = range(n)
    else:
        n_ind = test_ind
    # Exclude self distances
    if train_ind is None:
        np.fill_diagonal(distance, exclude)
    else:
        for j, sample in enumerate(train_ind):
            distance[sample, j] = exclude

    knn = np.zeros_like(distance, bool)

    # find nearest neighbors for each point
    for i in range(n):
        di = distance[i, :]
        # TODO change to np.partition for PERF
        nn = np.argsort(di)[::sort_order]
        knn[i, nn[:s]] = True
    del distance

    # Reverse nearest neighbor count
    N_s = knn[:m, :].sum(axis=0)

    if y is not None:
        # Set of class labels
        C = np.unique(y)

        # Class specific reverse nearest neighbors
        N_sc = np.zeros((C.size, m))
        for c_idx, c_val in enumerate(C):
            N_sc[c_idx, :] = np.sum(knn[:m, :] * (y == c_val).reshape(-1, 1),
                                    axis=0)
        assert np.alltrue(N_sc.sum(
            axis=0) == N_s), "N_s,c(x) don't sum up to N_s(x)"

        # Account for each point being the 0th nearest neighbor
        N_sc += 1
    # In any case: the same for N_s
    N_s += 1

    if y is not None:
        # non-homogeneity (inconsistency) in occurrence
        N_sc /= N_s
        HR_s = -np.sum(N_sc * np.log(N_sc), axis=0)

        # Information gain
        max_H_s = np.log(C.size)
        info_gain = max_H_s - HR_s
    else:  # set a dummy value for unsupervised mode
        info_gain = 1

    # "occurrence informativeness"
    I_n = np.log(m / N_s)

    # simhub calculation
    D_shi = np.zeros_like(D)
    if train_ind is None:
        train_ind = ...
    if vect_usage > 0 or (vect_usage == 0 and m < 2000):
        # using vectorization and broadcasting
        for i in n_ind:
            x = np.logical_and(knn[i, :], knn[train_ind, :])
            D_shi[i, :] = np.sum(x * I_n * info_gain, axis=1)
    else:  # use non-vectorized loops
        for i in n_ind:
            for j in range(m):
                x = np.logical_and(knn[i, :], knn[j, :])
                D_shi[i, j] = np.sum(x * I_n * info_gain)
    del knn
    # Normalization to [0, 1] range
    if y is None:
        D_shi /= (s * np.log(m))
    else:
        D_shi /= (s * np.log(m) * max_H_s)

    # Convert to distances
    if return_distances:
        D_shi *= -1
        D_shi += 1
    else:
        self_value = 1

    if test_ind is None:
        # Ensure correct self distances and return sec. dist. matrix
        np.fill_diagonal(D_shi, self_value)
        return D_shi
    else:
        # only return test-train-distances (there are no self distances here)
        return D_shi[test_ind]
def simhubIN(D: np.ndarray,
             train_ind: np.ndarray = None,
             test_ind: np.ndarray = None,
             s: int = 50,
             return_distances: bool = True,
             n_jobs: int = 1):
    """Calculate dissimilarity based on hubness-aware SNN distances [1]_.

    Parameters
    ----------
    D : ndarray
        The ``n x s`` distance, where ``n`` and ``s``
        are the dataset and sample size, respectively.

    train_ind : ndarray, optional, default: None
        The index array that determines, to which data points the columns in
        `D` correspond. Not required, if `D` is a quadratic all-against-all
        distance matrix.

    test_ind : ndarray, optional, default: None
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    s : int, optional, default: 50
        Neighborhood size. Can be optimized as to minimize hubness.

    return_distances : bool, optional, default: True
        If True, return distances (1 - similarities).
        Otherwise return similarities.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_shi : ndarray
        Secondary distance (simhubIN) matrix.

    References
    ----------
    .. [1] Tomašev, N., Mladenić, D., Tomasev, N., & Mladenić, D. (2012).
           Hubness-aware shared neighbor distances for high-dimensional
           $$k$$ -nearest neighbor classification.
           Knowledge and Information Systems, 39(1), 89–122.
           http://doi.org/10.1007/s10115-012-0607-5
    """
    if train_ind is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, train_ind)
    # Assuming distances in D
    self_value = 0.
    sort_order = 1
    exclude = np.inf
    distance = D.copy()
    n, m = distance.shape
    if test_ind is None:
        n_ind = range(n)
    else:
        n_ind = test_ind
    # Exclude self distances
    if train_ind is None:
        np.fill_diagonal(distance, exclude)
    else:
        for j, sample in enumerate(train_ind):
            distance[sample, j] = exclude

    if n_jobs == -1:
        n_jobs = cpu_count()
    if n_jobs > 1:
        knn_ctype = RawArray(ctypes.c_bool, D.size)
        knn = np.frombuffer(knn_ctype, dtype=bool).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_shi_init_knn,
                  initargs=(distance, knn)) as pool:
            for _ in pool.imap(func=partial(_shi_hood,
                                            s=s,
                                            sort_order=sort_order),
                               iterable=range(n)):
                pass
    else:
        knn = np.zeros_like(distance, bool)
        # find nearest neighbors for each point
        for i in range(n):
            di = distance[i, :]
            # TODO change to np.partition for PERF
            nn = np.argsort(di)[::sort_order]
            knn[i, nn[:s]] = True
    del distance

    # "Occurence informativeness"
    occ_inf_knn = knn[:m, :].copy()
    np.fill_diagonal(occ_inf_knn, True)
    N_s = occ_inf_knn.sum(axis=0)
    I_n = np.log(m / N_s)
    del occ_inf_knn

    # simhub calculation
    if train_ind is None:
        train_ind = ...
    if n_jobs > 1:
        D_shi_ctype = RawArray(ctypes.c_double, D.size)
        D_shi = np.frombuffer(D_shi_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_shi_init_simhub,
                  initargs=(knn, train_ind, I_n, D_shi)) as pool:
            if m < 2000:
                for _ in pool.imap(func=partial(_shi_simhub_vect, s=s),
                                   iterable=n_ind):
                    pass
            else:
                for _ in pool.imap(func=partial(_shi_simhub, s=s, m=m),
                                   iterable=n_ind):
                    pass
    else:
        D_shi = np.zeros_like(D)
        if m < 2000:  # using vectorization and broadcasting
            for i in n_ind:
                x = np.logical_and(knn[i, :], knn[train_ind, :])
                D_shi[i, :] = np.sum(x * I_n, axis=1)
        else:  # use non-vectorized loops
            for i in n_ind:
                for j in range(m):
                    x = np.logical_and(knn[i, :], knn[j, :])
                    D_shi[i, j] = np.sum(x * I_n)
    del knn
    # Normalization to [0, 1] range
    D_shi /= (s * np.log(m))

    # Convert to distances
    if return_distances:
        D_shi *= -1
        D_shi += 1
    else:
        self_value = 1

    if test_ind is None:
        # Ensure correct self distances and return sec. dist. matrix
        np.fill_diagonal(D_shi, self_value)
        return D_shi
    else:
        # only return test-train-distances (there are no self distances here)
        return D_shi[test_ind]
def shared_nearest_neighbors(D: np.ndarray,
                             k: int = 10,
                             metric='distance',
                             n_jobs: int = 1):
    """Transform distance matrix using shared nearest neighbors [1]_.

    SNN similarity is based on computing the overlap between the `k` nearest
    neighbors of two objects. SNN approaches try to symmetrize nearest neighbor
    relations using only rank and not distance information [2]_.

    Parameters
    ----------
    D : np.ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 10)
        Neighborhood radius: The `k` nearest neighbors are used to calculate SNN.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether the matrix `D` is a distance or similarity matrix

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_snn : ndarray
        Secondary distance SNN matrix

    References
    ---------- 
    .. [1] R. Jarvis and E. A. Patrick, “Clustering using a similarity measure
           based on shared near neighbors,” IEEE Transactions on Computers,
           vol. 22, pp. 1025–1034, 1973.

    .. [2] Flexer, A., & Schnitzer, D. (2013). Can Shared Nearest Neighbors
           Reduce Hubness in High-Dimensional Spaces? 2013 IEEE 13th
           International Conference on Data Mining Workshops, 460–467.
           http://doi.org/10.1109/ICDMW.2013.101
    """
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    n = D.shape[0]
    if metric == 'distance':
        self_value = 0.
        sort_order = 1
        exclude = np.inf
        kth = k
    if metric == 'similarity':
        self_value = 1.
        sort_order = -1
        exclude = -np.inf
        kth = n - k
    distance = D.copy()
    np.fill_diagonal(distance, exclude)

    if n_jobs == -1:
        n_jobs = cpu_count()
    if n_jobs > 1:
        knn_ctype = RawArray(ctypes.c_bool, D.size)
        knn = np.frombuffer(knn_ctype, dtype=bool).reshape(D.shape)
        D_snn_ctype = RawArray(ctypes.c_double, D.size)
        D_snn = np.frombuffer(D_snn_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_snn_init,
                  initargs=(distance, knn, D_snn)) as pool:
            for _ in pool.imap(func=partial(_snn_my_hood,
                                            k=k,
                                            kth=kth,
                                            sort_order=sort_order),
                               iterable=range(n)):
                pass
            for _ in pool.imap(func=partial(_snn_our_hood, k=k, metric=metric),
                               iterable=range(n)):
                pass
    else:
        knn = np.zeros_like(distance, bool)
        # find nearest neighbors for each point
        for i in range(n):
            di = distance[i, :]
            nn = np.argpartition(di, kth=kth)[::sort_order]
            knn[i, nn[0:k]] = True
        D_snn = np.zeros_like(distance)
        for i in range(n):
            knn_i = knn[i, :]
            j_idx = slice(i + 1, n)

            # using broadcasting
            Dij = np.sum(np.logical_and(knn_i, knn[j_idx, :]), 1)
            if metric == 'distance':
                D_snn[i, j_idx] = 1. - Dij / k
            else:  # metric == 'similarity':
                D_snn[i, j_idx] = Dij / k

    D_snn += D_snn.T
    np.fill_diagonal(D_snn, self_value)
    return D_snn
Example #8
0
 def test_check_shape(self):
     with self.assertRaises(TypeError):
         d = np.empty((2, 3))
         io.check_distance_matrix_shape(d)
def r_precision(S:np.ndarray, y:np.ndarray, metric:str='distance',
                average:str='weighted', return_y_pred:int=0,
                verbose:int=0, n_jobs:int=1) -> float:
    """ Calculate R-Precision (recall at R-th position).

    Parameters
    ----------
    S : ndarray or CSR matrix
        Distance (similarity) matrix

    y : ndarray
        Target (ground truth) labels

    metric : 'distance' or 'similarity', optional, default: 'similarity'
        Define, whether `S` is a distance or similarity matrix.

    average : 'weighted', 'macro' or None, optional, default: 'weighted'
        Ignored. Weighted and macro precisions are returned.

    return_y_pred : int, optional, default: 0
        If > 0, return the labels of the `return_y_pred` nearest neighbors

    verbose : int, optional, default: 0
        Increasing level of output.

    n_jobs : int, optional, default: 1
        Number of parallel processes to use.

    Returns
    -------
    r_precision : dictionary with following keys:
        macro : float
            Macro R-Precision.

        weighted : float
            Weighted R-Precision.

        per_item : ndarray
            R-Precision at the object.

        relevant_items : ndarray
            Relevant items per class.

        y_true : ndarray
            Target labels (req. for weighting).

        y_pred : ndarray
            Labels of some k-nearest neighbors
    """
    io.check_distance_matrix_shape(S)
    io.check_distance_matrix_shape_fits_labels(S, y)
    io.check_valid_metric_parameter(metric)
    log = ConsoleLogging()
    n, _ = S.shape
    S_is_sparse = issparse(S)
    if metric != 'similarity' or not S_is_sparse:
        raise NotImplementedError("Only sparse similarity matrices so far.")

    # Map labels to 0..n(labels)-1
    le = LabelEncoder()
    # Add int.min for misclassifications
    incorr_orig = np.array([np.nan]).astype(int)
    le.fit(np.append(y, incorr_orig))
    y = le.transform(y)
    incorrect = le.transform(incorr_orig)
    # Number of relevant items, i.e. number of each label
    relevant_items = np.bincount(y) - 1 # one less for self class
    # R-Precision for each item
    r_prec = np.zeros(n, dtype=np.float)
    
    # Classify each point in test set
    if verbose:
        log.message("Creating shared memory data.")
    n_random_pred = mp.Value(ctypes.c_int)
    n_random_pred.value = 0
    if verbose and log:
        log.message("Spawning processes for prediction.")
    y_pred = np.zeros((n, return_y_pred), dtype=float)
    kwargs = {'y_pred' : return_y_pred,
              'incorrect' : incorrect}
    with mp.Pool(processes=n_jobs, 
                 initializer=_load_shared_csr, 
                 initargs=(S, y, n_random_pred, relevant_items)) as pool:
        for i, r in enumerate(
            pool.imap(
                func=partial(_r_prec_worker, **kwargs),
                iterable=range(n), 
                chunksize=int(1e2))):
            if verbose and ((i+1)%int(1e7 / 10**verbose) == 0 or i == n-1):
                log.message("Classification: {} of {} on {}.".format(
                            i+1, n, mp.current_process().name), flush=True)
            try:
                r_prec[i] = r[0]
                y_pred[i, :] = r[1]
            except:
                r_prec[i] = r
            if i == n-1:
                pass
    pool.join()

    if verbose and log:
        log.message("Retrieving nearest neighbors.")
    # Work-around for new scikit-learn requirement of 1D arrays for LabelEncoder
    y_pred = np.asarray([le.inverse_transform(col) for col in y_pred.T.astype(int)]).T
    if verbose and log:
        log.message("Finishing.")
    if n_random_pred.value:
        log.warning(("{} queries were classified randomly, because all "
                     "distances were non-finite numbers or there were no other "
                     "objects in the same class.").format(n_random_pred.value))
    return_dict = {'macro' : r_prec.mean(),
                   'weighted' : np.average(r_prec, weights=relevant_items[y]),
                   'per_item' : r_prec,
                   'relevant_items' : relevant_items,
                   'y_true' : y,
                   'y_pred' : y_pred}
    return return_dict
def predict(D:np.ndarray, target:np.ndarray, k=5,
            metric:str='distance', test_ind:np.ndarray=None, verbose:int=0,
            sample_idx=None, return_cmat=True):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter `test_ind`).
    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth) or
        ``n x c`` in case of ``c`` binarized multilabels

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to perform
        multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit
          model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    return_cmat : bool, optional, default: True
        If False, only return the predictions `y_pred`.
        Otherwise also return the confusion matrices.

    Returns
    -------
    y_pred : ndarray (shape=(n_k, n, c), dtype=int)
        Predicted class labels (`n_k`... number of items in parameter `k`)
        
        HINT: Referring to the above example... 
        ... ``y_pred[0]`` gives the predictions of the ``k=1`` experiment.

    cmat : ndarray (shape=(n_k x c x n_t x n_t), dtype=int) 
        Confusion matrix (``n_t`` number of unique items in parameter target)

        HINT: ... ``cmat[2, 0, :, :]`` gives the confusion matrix of
        the first class in the ``k=20`` experiment in the following order:
            TN    FP
            FN    TP
    """

    # Check input sanity
    log = ConsoleLogging()
    if sample_idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, sample_idx)
    #io._check_distance_matrix_shape_fits_labels(D, target)
    io.check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1

    # Copy, because data is changed
    if not issparse(D):
        D = D.copy()
    target = target.astype(int)
    if target.ndim == 1:
        target = target[:, np.newaxis]
    if verbose:
        log.message("Start k-NN experiment.")
    # Handle LOO-CV vs. test set mode
    if test_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)    # dummy     io.check_valid_metric_parameter(metric)
        train_set_ind = n   # dummy
    else:
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
        if sample_idx is not None:
            raise NotImplementedError("Sample k-NN does not support train/"
                                      "test splits at the moment.")
    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError as e:
        if isinstance(k, int):
            k = np.array([k])
            k_length = k.size
        elif isinstance(k, list):
            k = np.array(k)
            k_length = k.size
        else:
            raise e

    cl = np.sort(np.unique(target))
    cmat = np.zeros((k_length, target.shape[1], len(cl), len(cl)), dtype=int)
    y_pred = np.zeros((k_length, *target.shape), dtype=int)

    classes = target.copy()
    for idx, cur_class in enumerate(np.array(cl).ravel()):
        # change labels to 0, 1, ..., len(cl)-1
        classes[target == cur_class] = idx
    if sample_idx is not None:
        sample_classes = classes[sample_idx]
        j = np.ones(n, int)
        j *= (n+1) # illegal indices will throw index out of bounds error
        j[sample_idx] = np.arange(len(sample_idx))
        for j, sample in enumerate(sample_idx):
            D[sample, j] = d_self
    cl = range(len(cl))

    # Classify each point in test set
    for i in test_set_ind:
        if verbose and ((i+1)%1000==0 or i+1==n):
            log.message("Prediction: {} of {}.".format(i+1, n), flush=True)

        if issparse(D):
            row = D.getrow(i)
            #row = D.data
            ind = row.nonzero()[1]
            row = row.toarray().ravel()
        else:
            row = D[i, :]
        if sample_idx is None:
            row[i] = d_self

        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        if sample_idx is None:
            rp = train_set_ind
        else:
            if issparse(D):
                rp = ind
            else:
                rp = np.arange(len(sample_idx))
        rp = np.random.permutation(rp)
        d2 = row[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        d2idx = d2idx[~np.isnan(d2[d2idx])] # filter NaN values
        idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            # Make sure no inf/-inf/nan values are used for classification
            finite_val = np.isfinite(row[idx[0:k[j]]])
            # However, if no values are finite, classify randomly
            if finite_val.sum() == 0:
                idx = np.random.permutation(idx)
                finite_val = np.ones_like(finite_val)
                log.warning("Query was classified randomly, because all "
                            "distances were non-finite numbers.")
            for l in range(target.shape[1]):
                l_classes = classes[:, l]
                if sample_idx is None:
                    nn_class = l_classes[idx[0:k[j]]][finite_val]
                else:
                    l_sample_classes = sample_classes[:, l]
                    nn_class = l_sample_classes[idx[0:k[j]]][finite_val]
                cs = np.bincount(nn_class.astype(int))
                max_cs = np.where(cs == np.max(cs))[0]
                seed_class = classes[i, l]
                # "tie": use nearest neighbor
                if len(max_cs) > 1:
                    y_pred[j, i, l] = nn_class[0]
                    cmat[j, l, seed_class, nn_class[0]] += 1
                # majority vote
                else:
                    y_pred[j, i, l] = cl[max_cs[0]]
                    cmat[j, l, seed_class, cl[max_cs[0]]] += 1

    if verbose:
        log.message("Finished k-NN experiment.")

    if return_cmat:
        return y_pred, cmat
    else:
        return y_pred
def score(D:np.ndarray, target:np.ndarray, k=5,
          metric:str='distance', test_set_ind:np.ndarray=None, verbose:int=0,
          sample_idx=None, filter_self=True):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter `test_set_ind`).
    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth).

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to perform
        multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    test_sed_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit
          model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    sample_idx : ...
        TODO add description

    filter_self : bool, optional, default: True
        Remove self similarities from sparse ``D``.
        This assumes that the highest similarity per row is the self
        similarity.
        
        NOTE: Quadratic dense matrices are always filtered for self
        distances/similarities, even if `filter_self` is set t0 `False`.
        
    Returns
    -------
    acc : ndarray (shape=(n_k x 1), dtype=float)
        Classification accuracy (`n_k`... number of items in parameter `k`)

        HINT: Refering to the above example... 
        ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment.
    corr : ndarray (shape=(n_k x n), dtype=int)
        Raw vectors of correctly classified items

        HINT: ... ``corr[1, :]`` gives these items for the ``k=5`` experiment.
    cmat : ndarray (shape=(n_k x n_t x n_t), dtype=int) 
        Confusion matrix (``n_t`` number of unique items in parameter target)

        HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of
        the ``k=20`` experiment.
    """

    # Check input sanity
    log = ConsoleLogging()
    if sample_idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, sample_idx)
    io.check_distance_matrix_shape_fits_labels(D, target)
    io.check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1

    # Copy, because data is changed
    D = D.copy()
    target = target.astype(int)
    D_is_sparse = issparse(D)

    if verbose:
        log.message("Start k-NN experiment.")
    # Handle LOO-CV vs. test set mode
    if test_set_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)    # dummy 
        train_set_ind = n   # dummy
    else:
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
        if sample_idx is not None:
            raise NotImplementedError("Sample k-NN does not support train/"
                                      "test splits at the moment.")
    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError as e:
        if isinstance(k, int):
            k = np.array([k])
            k_length = k.size
        elif isinstance(k, list):
            k = np.array(k)
            k_length = k.size
        else:
            raise e

    acc = np.zeros((k_length, 1))
    corr = np.zeros((k_length, D.shape[0]))

    cl = np.sort(np.unique(target))
    if D_is_sparse:
        # Add a label for unknown class (object w/o nonzero sim to any others)
        cl = np.append(cl, cl.max()+1)
        n_classes = len(cl) + 1
    else:
        n_classes = len(cl)
    cmat = np.zeros((k_length, n_classes, n_classes))

    classes = target.copy()
    for idx, cur_class in enumerate(cl):
        # change labels to 0, 1, ..., len(cl)-1
        classes[target == cur_class] = idx
    if sample_idx is not None:
        sample_classes = classes[sample_idx]
        j = np.ones(n, int)
        j *= (n+1) # illegal indices will throw index out of bounds error
        j[sample_idx] = np.arange(len(sample_idx))
        for j, sample in enumerate(sample_idx):
            D[sample, j] = d_self
    cl = range(len(cl))

    rnd_classif = np.zeros(k_length)
    # Classify each point in test set
    for i in test_set_ind:
        if verbose and ((i+1)%1000==0 or i+1==n):
            log.message("Prediction: {} of {}.".format(i+1, n), flush=True)

        seed_class = classes[i]

        if D_is_sparse:
            row = D.getrow(i)
        else:
            row = D[i, :]
            if sample_idx is None:
                row[i] = d_self

        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        if sample_idx is None:
            rp = train_set_ind
        else:
            rp = np.arange(len(sample_idx))
        if D_is_sparse:
            nnz = row.nnz
            rp = np.random.permutation(nnz)
            d2 = row.data[rp]
            # Partition for each k value
            kth = nnz - k - 1
            # sort the two highest similarities to end
            kth = np.append(kth, [nnz-2, nnz-1])
            # Clip negative indices (nnz < k)
            np.clip(kth, a_min=0, a_max=nnz-1, out=kth)
            # Remove duplicate k values and sort
            kth = np.unique(kth)
            d2idx = np.argpartition(d2, kth=kth)
            d2idx = d2idx[~np.isnan(d2[d2idx])][::-1]
            idx = row.nonzero()[1][rp[d2idx]]
            idx = idx[1:] # rem self sim
        else:
            rp = np.random.permutation(rp)
            d2 = row[rp]
            d2idx = np.argsort(d2, axis=0)[::sort_order]
            d2idx = d2idx[~np.isnan(d2[d2idx])] # filter NaN values
            idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            # Make sure no inf/-inf/nan values are used for classification
            if D_is_sparse:
                #print(row[0, idx[0:k[j]]].toarray())
                finite_val = np.isfinite(row[0, idx[0:k[j]]].toarray().ravel())
                #print(finite_val)
            else:
                finite_val = np.isfinite(row[idx[0:k[j]]])
            # However, if no values are finite, classify randomly
            if finite_val.sum() == 0:
                idx = np.random.permutation(idx)
                finite_val = np.ones_like(finite_val)
                rnd_classif[j] += 1
            if sample_idx is None:
                nn_class = classes[idx[0:k[j]]][finite_val]
            else:
                #finite_val = np.isfinite(sample_row[idx[0:k[j]]])
                nn_class = sample_classes[idx[0:k[j]]][finite_val]
            cs = np.bincount(nn_class.astype(int))
            if cs.size > 0:
                max_cs = np.where(cs == np.max(cs))[0]
            else:
                max_cs = np.array([len(cl) - 1]) # misclassification label

            # "tie": use nearest neighbor
            if len(max_cs) > 1:
                if seed_class == nn_class[0]:
                    acc[j] += 1/n 
                    corr[j, i] = 1
                cmat[j, seed_class, nn_class[0]] += 1
            # majority vote
            else:
                if cl[max_cs[0]] == seed_class:
                    acc[j] += 1/n
                    corr[j, i] = 1
                cmat[j, seed_class, cl[max_cs[0]]] += 1

    if np.any(rnd_classif):
        for x in rnd_classif:
            log.warning(("{} queries were classified randomly, because all "
                        "distances were non-finite numbers.").format(x))
    if verbose:
        log.message("Finished k-NN experiment.")

    return acc, corr, cmat
Example #12
0
def mutual_proximity_gammai(D: np.ndarray,
                            metric: str = 'distance',
                            min_nnz: int = 30,
                            test_set_ind: np.ndarray = None,
                            verbose: int = 0):
    """Transform a distance matrix with Mutual Proximity (indep. Gamma distr.).
    
    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gammai 
    variant assumes independent Gamma distributed distances (FAST).
    The resulting second. distance/similarity matrix should show lower hubness.

    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.
        
        NOTE: In case of sparse `D`, zeros are interpreted as missing values 
        and ignored during calculations. Thus, results may differ 
        from using a dense version.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: In case of sparse `D`, only 'similarity' is supported.

    min_nnz : int, optional, default: 30
        Calculate MP between two objects `i` and `j`, iff at least ``min_nnz``
        values are present in both row ``i`` and ``j``.
        Otherwise, return the original similarity.
        Ignored, if `metric` is 'distance'.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set. 
 
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gammai matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. 
           The Journal of Machine Learning Research, 13(1), 2871–2902.
    """
    # Initialization
    n = D.shape[0]
    log = ConsoleLogging()

    # Checking input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    if metric == 'similarity':
        self_value = 1
    else:  # metric == 'distance':
        self_value = 0
    if test_set_ind is None:
        train_set_ind = slice(0, n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    # Start MP
    if verbose:
        log.message('Mutual proximity Gammai rescaling started.', flush=True)
    D = D.copy()

    if issparse(D):
        return _mutual_proximity_gammai_sparse(D, min_nnz, test_set_ind,
                                               verbose, log)

    np.fill_diagonal(D, np.nan)

    mu = np.nanmean(D[train_set_ind], 0)
    va = np.nanvar(D[train_set_ind], 0, ddof=1)
    # Avoid downstream div/0 errors
    va[va == 0] = 1e-7
    A = (mu**2) / va
    B = va / mu

    D_mp = np.zeros_like(D)

    # MP gammai
    for i in range(n):
        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n):
            log.message("MP_gammai: {} of {}".format(i + 1, n), flush=True)
        j_idx = slice(i + 1, n)

        if metric == 'similarity':
            p1 = _local_gamcdf(D[i, j_idx], A[i], B[i])
            p2 = _local_gamcdf(D[j_idx, i], A[j_idx], B[j_idx])
            D_mp[i, j_idx] = (p1 * p2).ravel()
        else:  # distance
            p1 = 1 - _local_gamcdf(D[i, j_idx], A[i], B[i])
            p2 = 1 - _local_gamcdf(D[j_idx, i], A[j_idx], B[j_idx])
            D_mp[i, j_idx] = (1 - p1 * p2).ravel()

    # Mirroring the matrix
    D_mp += D_mp.T
    # set correct self dist/sim
    np.fill_diagonal(D_mp, self_value)

    return D_mp
Example #13
0
def mutual_proximity_gaussi(
    D: np.ndarray,
    metric: str = 'distance',
    sample_size: int = 0,
    min_nnz: int = 30,
    test_set_ind: np.ndarray = None,
    verbose: int = 0,
    idx: np.ndarray = None,
):
    """Transform distances with Mutual Proximity (indep. normal distributions).
    
    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gaussi 
    variant assumes independent normal distributions (FAST).
    The resulting second. distance/similarity matrix should show lower hubness.
    
    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.
        
        NOTE: In case of sparse `D`, zeros are interpreted as missing values 
        and ignored during calculations. Thus, results may differ 
        from using a dense version.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        
        NOTE: In case of sparse `D`, only 'similarity' is supported.
        
    sample_size : int, optional (default: 0)
        Define sample size from which Gauss parameters are estimated.
        Use all data when set to ``0``.
        Ignored in case of SampleMP (i.e. if provided `idx`).

    min_nnz : int, optional, default: 30
        Calculate MP between two objects `i` and `j`, iff at least ``min_nnz``
        values are present in both row ``i`` and ``j``.
        Otherwise, return the original similarity.
        Ignored, if `metric` is 'distance'.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:
        
        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

        Ignored in case of SampleMP (i.e. if provided `idx`).

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    idx : ndarray, optional (default: None)
        The index array that determines to which data points the columns in
        `D` correspond. Only required for SampleMP.

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gaussi matrix.
    
    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. The Journal of Machine 
           Learning Research, 13(1), 2871–2902.
    """
    # Initialization
    log = ConsoleLogging()

    # Checking input
    if idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, idx)
    io.check_valid_metric_parameter(metric)
    n = D.shape[0]
    s = D.shape[1]

    if metric == 'similarity':
        self_value = 1
    else:  # metric == 'distance':
        self_value = 0
    if test_set_ind is None:
        train_set_ind = slice(0, n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    # Start MP Gaussi
    if verbose:
        log.message('Mutual Proximity Gaussi rescaling started.', flush=True)
    D = D.copy()

    if issparse(D):
        return _mutual_proximity_gaussi_sparse(D, sample_size, min_nnz,
                                               test_set_ind, verbose, log)

    # ignore self dist/sim for parameter estimation
    if idx is None:
        np.fill_diagonal(D, np.nan)
    else:
        for j, i in enumerate(idx):
            D[i, j] = np.nan

    # Calculate mean and std
    if idx is None:
        if sample_size == 0:
            mu = np.nanmean(D[train_set_ind], 0)
            sd = np.nanstd(D[train_set_ind], 0, ddof=0)
        else:
            samples = np.random.shuffle(train_set_ind)[0:sample_size]
            mu = np.nanmean(D[samples], 0)
            sd = np.nanstd(D[samples], 0, ddof=0)
    else:
        mu = np.nanmean(D, 1)
        sd = np.nanstd(D, 1, ddof=0)
    # Avoid downstream div/0 errors
    sd[sd == 0] = 1e-7
    # set self dist/sim back to self_value to avoid scipy warnings
    if idx is None:
        np.fill_diagonal(D, self_value)
    else:
        for j, i in enumerate(idx):
            D[i, j] = self_value

    # MP Gaussi
    D_mp = np.zeros_like(D)
    for i in range(n):
        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n):
            log.message("MP_gaussi: {} of {}.".format(i + 1, n), flush=True)
        if idx is None:
            j = slice(i + 1, n)
            j_mom = j
        else:
            j = slice(0, s)
            j_mom = idx[j]

        if metric == 'similarity':
            p1 = norm.cdf(D[i, j], mu[i], sd[i])
            p2 = norm.cdf(D[i, j], mu[j_mom], sd[j_mom])
            D_mp[i, j] = (p1 * p2).ravel()
        else:
            # sf(.) := 1 - cdf(.)
            p1 = norm.sf(D[i, j], mu[i], sd[i])
            p2 = norm.sf(D[i, j], mu[j_mom], sd[j_mom])
            D_mp[i, j] = (1 - p1 * p2).ravel()

    if idx is None:
        D_mp += D_mp.T
        np.fill_diagonal(D_mp, self_value)
    else:
        # Ensure correct self distances
        for j, sample in enumerate(idx):
            D_mp[sample, j] = self_value
    return D_mp
Example #14
0
def _mutual_proximity_empiric_full(D: np.ndarray,
                                   metric: str = 'distance',
                                   test_set_ind: np.ndarray = None,
                                   min_nnz: int = 0,
                                   verbose: int = 0,
                                   n_jobs=None):
    """Transform a distance matrix with Mutual Proximity (empiric distribution).
    
    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix using 
    the empiric data distribution (EXACT, rather SLOW). The resulting 
    secondary distance/similarity matrix should show lower hubness.
    
    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.
          
        NOTE: In case of sparse ``D`, zeros are interpreted as missing values 
        and ignored during calculations. Thus, results may differ 
        from using a dense version.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        
        NOTE: In case of sparse `D`, only 'similarity' is supported.
        
    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:
        
        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set. 

    min_nnz : int, optional, default: 0
        Calculate MP between two objects `i` and `j`, iff at least ``min_nnz``
        values are present in both row ``i`` and ``j``.
        Otherwise, return the original distance/similarity.
        
        NOTE: Currently only implemented for MP empiric w/ sparse sim matrices

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
        
    Returns
    -------
    D_mp : ndarray
        Secondary distance MP empiric matrix.
    
    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. The Journal of Machine 
           Learning Research, 13(1), 2871–2902.
    """
    # Initialization
    n = D.shape[0]
    log = ConsoleLogging()

    # Check input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    if metric == 'similarity':
        self_value = 1
        exclude_value = np.inf
    else:  # metric == 'distance':
        self_value = 0
        exclude_value = -np.inf
        if issparse(D):
            raise ValueError("MP sparse only supports similarity matrices.")
    if test_set_ind is None:
        pass  # TODO implement
        #train_set_ind = slice(0, n)
    elif not np.all(~test_set_ind):
        raise NotImplementedError("MP empiric does not yet support train/"
                                  "test splits.")
        #train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    if issparse(D):
        return _mutual_proximity_empiric_sparse(D, test_set_ind, min_nnz,
                                                verbose, log, n_jobs)
    # Start MP
    D = D.copy()

    # ensure correct self distances (NOT done for sparse matrices!)
    np.fill_diagonal(D, exclude_value)

    D_mp = np.zeros_like(D)

    # Calculate MP empiric
    for i in range(n - 1):
        if verbose and ((i + 1) % 1000 == 0 or i == n - 2):
            log.message("MP_empiric: {} of {}.".format(i + 1, n - 1),
                        flush=True)
        # Calculate only triu part of matrix
        j_idx = i + 1

        dI = D[i, :][np.newaxis, :]
        dJ = D[j_idx:n, :]
        d = D[j_idx:n, i][:, np.newaxis]

        if metric == 'similarity':
            D_mp[i, j_idx:] = np.sum((dI <= d) & (dJ <= d), 1) / n  #(n - 2)
        else:  # metric == 'distance':
            D_mp[i, j_idx:] = 1 - (np.sum(
                (dI > d) & (dJ > d), 1) / n)  #(n - 2))

    # Mirror, so that matrix is symmetric
    D_mp += D_mp.T
    np.fill_diagonal(D_mp, self_value)

    return D_mp