def __init__(self,
                 D: np.ndarray,
                 secondary_distance_type: str,
                 metric: str = 'distance',
                 classes: np.ndarray = None,
                 vectors: np.ndarray = None):
        """Set up a hubness experiment for a given distance matrix.

        Parameters
        ----------
        D : ndarray
            Primary distance (or similarity) matrix; validated with
            ``io.check_distance_matrix_shape``.
        secondary_distance_type : str
            Name of the secondary distance to use. Must be a key of
            ``SEC_DIST``.
        metric : str, optional (default: 'distance')
            Validated with ``io.check_valid_metric_parameter``.
        classes : ndarray, optional
            Class labels; if given, their shape is checked against `D`.
        vectors : ndarray, optional
            Vector data; if given, its shape is checked against `D` and the
            number of columns is stored as ``embedding_dim``.

        Raises
        ------
        ValueError
            If `secondary_distance_type` is not a key of ``SEC_DIST``.
        """
        # --- Input validation (same order as the checks are documented) ---
        io.check_distance_matrix_shape(D)
        io.check_valid_metric_parameter(metric)
        if secondary_distance_type not in SEC_DIST:
            raise ValueError("Requested secondary distance type unknown.")
        if classes is not None:
            io.check_distance_matrix_shape_fits_labels(D, classes)
        if vectors is not None:
            io.check_distance_matrix_shape_fits_vectors(D, vectors)
            embedding_dim = vectors.shape[1]
        else:
            embedding_dim = None

        # --- Experiment configuration ---
        self.embedding_dim = embedding_dim
        self.original_distance = D
        self.secondary_distance_type = secondary_distance_type
        self.classes = classes
        self.vectors = vectors
        self.metric = metric
        self.n = D.shape[0]

        # --- Results, filled in later by other methods ---
        self.secondary_distance = None
        self.gk_index = None
        self.hubness = {}
        self.anti_hubs = {}
        self.max_hub_k_occurence = {}
        self.knn_accuracy = {}
Example #2
0
def local_scaling_sample(D:np.ndarray, k:int=7, metric:str='distance',
                         train_ind:np.ndarray=None, test_ind:np.ndarray=None):
    """Transform a distance matrix with Local Scaling.

    --- DRAFT version ---

    Transforms the given distance matrix into new one using local scaling [1]_
    with the given `k`-th nearest neighbor. The secondary distance between
    points ``i`` and ``j`` is ``1 - exp(-d_ij**2 / (r_i * r_j))`` where
    ``r_i`` is the distance from ``i`` to its `k`-th nearest neighbor
    (for similarities: ``exp(-s_ij**2 / (r_i * r_j))``).

    Parameters
    ----------
    D : ndarray
        The ``n x n`` distance (similarity) matrix if `train_ind` is None,
        otherwise an ``n x s`` matrix whose column ``j`` holds the distances
        to data point ``train_ind[j]``.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        'similarity' is experimental and only available with
        ``train_ind=None``.

    train_ind : ndarray, optional (default: None)
        If given, use only these data points as neighbors for rescaling.
        If None, every data point is a potential neighbor and `D` must be
        square.

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix (only the rows in `test_ind`,
        if that was given).

    Raises
    ------
    NotImplementedError
        For sparse distance matrices, and for similarity matrices combined
        with `train_ind`.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input. Without `train_ind` there is no sample to check
    # against, so fall back to the plain distance matrix check.
    if train_ind is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, train_ind)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if metric == 'similarity':
        if train_ind is not None:
            raise NotImplementedError
        # k-th LARGEST value, counted from the end, so that it stays correct
        # when test columns are removed from the row below.
        kth = -k
        exclude = -np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
    else: # metric == 'distance':
        kth = k - 1
        exclude = np.inf
        self_value = 0
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")

    # NOTE(review): np.copy() presumably does not yield a usable copy of a
    # scipy sparse matrix, so the sparse similarity path looks untested.
    D = np.copy(D)
    if test_ind is None:
        train_set_ind = slice(0, n) #take all
        n_ind = range(n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_ind)
        n_ind = test_ind

    # Column j of D refers to data point self_rows[j]; hence the self
    # distance of that point is located at D[self_rows[j], j].
    if train_ind is None:
        self_rows = np.arange(n)
    else:
        self_rows = np.asarray(train_ind)

    # Exclude self distances
    for j, sample in enumerate(self_rows):
        D[sample, j] = exclude

    # Distance of each point to its k-th nearest neighbor
    r = np.zeros(n)
    for i in range(n):
        if train_ind is None:
            if sparse:
                di = D[i, train_set_ind].toarray()
            else:
                di = D[i, train_set_ind]
        else:
            di = D[i, :] # all columns are training in this case
        r[i] = np.partition(di, kth=kth)[kth]
    # Scaling factors of the points the columns refer to
    r_cols = r if train_ind is None else r[train_ind]

    if sparse:
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        D_ls = np.zeros_like(D)

    if metric == 'similarity':
        for i in n_ind:
            if sparse and nnz[i] <= k: # Don't rescale if there are too few
                D_ls[i, :] = D[i, :]   # neighbors in the current row
            else:
                D_ls[i, :] = np.exp(-1 * D[i, :]**2 / (r[i] * r_cols))
    else:
        for i in n_ind:
            D_ls[i, :] = 1 - np.exp(-1 * D[i, :]**2 / (r[i] * r_cols))

    if sparse and test_ind is None:
        # Self similarities are left untouched for sparse matrices
        return D_ls.tocsr()

    # Ensure correct self distances. For a square matrix this is the
    # diagonal; for an n x s sample matrix the cells follow `train_ind`.
    for j, sample in enumerate(self_rows):
        D_ls[sample, j] = self_value
    if test_ind is None:
        return D_ls
    return D_ls[test_ind]
Example #3
0
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance',
                  test_ind:np.ndarray=None, n_jobs:int=1):
    """Transform a distance matrix with Local Scaling.

    Transforms the given distance matrix into new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local
    scaling methods described in [1]_, the original one and NICDM; both
    reduce hubness in distance spaces, similarly to Mutual Proximity.
    This function computes ``1 - exp(-d_ij**2 / (r_i * r_j))`` for distances
    and ``exp(-s_ij**2 / (r_i * r_j))`` for similarities, where ``r_i`` is
    the value of the `k`-th nearest neighbor of point ``i``.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    Raises
    ------
    NotImplementedError
        If `D` is sparse and `metric` is 'distance'.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if n_jobs == -1:
        n_jobs = cpu_count()
    if metric == 'similarity':
        # k-th largest similarity sits at index n-k of an ascending partition.
        # NOTE(review): this index assumes rows of length n; with `test_ind`
        # the rows are shorter (train columns only) -- TODO confirm.
        kth = n - k
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
        if sparse and n_jobs != 1:
            log.warning("Parallel processing not implemented for sparse "
                        "matrices. Using single process instead.")
            n_jobs = 1
    else: # metric == 'distance':
        # k-th smallest distance sits at index k-1 of an ascending partition
        kth = k - 1
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")
    # Work on a copy, so that the caller's matrix is not modified.
    # NOTE(review): np.copy() presumably does not produce a usable copy of a
    # scipy sparse matrix -- verify the sparse similarity path.
    D = np.copy(D)

    if test_ind is None:
        train_ind = slice(0, n) # take all
    else:
        # Only points outside the test set may serve as neighbors
        train_ind = np.setdiff1d(np.arange(n), test_ind)
    # Step 1: r[i] := value of the k-th nearest (training) neighbor of i
    if sparse:
        r = np.zeros(n)
        for i in range(n):
            di = D[i, train_ind].toarray()
            di[i] = exclude
            r[i] = np.partition(di, kth=kth)[kth]
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        # Exclude self distances from the neighbor search
        np.fill_diagonal(D, exclude)
        if n_jobs > 1:
            # r lives in shared memory and is filled by the worker processes
            r_ctype = RawArray(ctypes.c_double, n)
            r = np.frombuffer(r_ctype, dtype=np.float64)
            with Pool(processes=n_jobs,
                      initializer=_ls_load_shared_data,
                      initargs=(D, train_ind, r, r_ctype)) as pool:
                for _ in pool.imap(func=partial(_ls_calculate_r, kth=kth),
                                   iterable=range(n)):
                    pass # results handled within func
        else:
            r = np.partition(D[:, train_ind], kth=kth)[:, kth]

    # Step 2: rescale all pairwise distances with r
    if sparse or n_jobs == 1:
        # NOTE(review): this rebinds D_ls, discarding the lil_matrix that was
        # created above in the sparse case -- TODO confirm this is intended.
        D_ls = np.zeros_like(D)
        for i in range(n):
            # vectorized inner loop: calc only triu part
            tmp = np.empty(n-i)
            tmp[0] = self_tmp_value
            if metric == 'similarity':
                if sparse and nnz[i] <= k:  # Don't rescale if there are
                    tmp[1:] = np.nan        # too few neighbors in row
                else:
                    tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            else:
                tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            D_ls[i, i:] = tmp
        # copy triu to tril -> symmetric matrix (diag=zeros)
        # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
        D_ls += D_ls.T
    else:
        # NOTE(review): r_ctype is only defined if n_jobs > 1 above; any other
        # n_jobs value reaching this branch (e.g. 0) would hit a NameError.
        D_ls_ctype = RawArray(ctypes.c_double, D.size)
        D_ls = np.frombuffer(D_ls_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_ls_load_shared_data,
                  initargs=(D, train_ind, r, r_ctype, D_ls, D_ls_ctype)) as pool:
            for _ in pool.imap(func=partial(_ls_calculate_sec_dist,
                                  n=n, metric=metric,
                                  self_tmp_value=self_tmp_value),
                               iterable=range(n)):
                pass # results handled within func
        # triu is copied to tril within func
    if sparse:
        # Rows with too few neighbors keep their original similarities
        for i, nz in enumerate(nnz):
            if nz <= k: # too few neighbors
                D_ls[i, :] = D[i, :]
        return D_ls.tocsr()
    else:
        # Replace the temporary self values by the final ones
        np.fill_diagonal(D_ls, self_value)
        return D_ls
Example #4
0
def mutual_proximity_gaussi_sample(D: np.ndarray, idx: np.ndarray,
                                   metric: str = 'distance', test_set_ind: np.ndarray = None, verbose: int = 0):
    """Estimate Gaussian parameters for sample-based Mutual Proximity.

    NOTE: this is a proof-of-concept function. It only performs the
    parameter estimation step of Mutual Proximity (MP) Gaussi [1]_ and
    returns the per-row mean and standard deviation. It does NOT return a
    rescaled secondary distance matrix.

    Parameters
    ----------
    D : ndarray
        The ``n x s`` distance or similarity matrix, where ``n`` and ``s``
        are the dataset and sample size, respectively.
    idx : ndarray
        The index array that determines, to which data points the columns in
        `D` correspond.
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        Only validated; it does not influence the estimated parameters.
    test_set_ind : ndarray, optional (default: None)
        Currently unused; kept for interface compatibility.
    verbose : int, optional (default: 0)
        Currently unused; kept for interface compatibility.

    Returns
    -------
    mu : ndarray
        Mean of each row of `D`, ignoring self distances.
    sd : ndarray
        Standard deviation (``ddof=0``) of each row of `D`, ignoring self
        distances. Zeros are replaced by ``1e-7``.

    Raises
    ------
    NotImplementedError
        If `D` is a sparse matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    # Checking input
    io.check_sample_shape_fits(D, idx)
    io.check_valid_metric_parameter(metric)
    if issparse(D):
        raise NotImplementedError

    # Work on a copy, so that the caller's matrix is not modified
    D = D.copy()

    # Column j corresponds to data point idx[j], hence D[idx[j], j] is a
    # self distance. Mask these with NaN to keep them out of the statistics.
    for j, sample in enumerate(idx):
        D[sample, j] = np.nan

    # Calculate mean and std per row, w/o self values (nan)
    mu = np.nanmean(D, 1)
    sd = np.nanstd(D, 1, ddof=0)
    # Avoid downstream div/0 errors
    sd[sd == 0] = 1e-7

    # TODO: the actual MP Gaussi rescaling of `D` based on (mu, sd) is not
    # implemented in this draft; only the parameters are returned.
    return mu, sd
Example #5
0
def hubness(D:np.ndarray, k:int=5, metric='distance',
            verbose:int=0, n_jobs:int=1,
            random_state=None, shuffle_equal=True):
    """Compute hubness of a distance matrix.

    Hubness [1]_ is the skewness of the `k`-occurrence histogram (reverse
    nearest neighbor count, i.e. how often does a point occur in the
    `k`-nearest neighbor lists of other points).

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix or
        an ``n x m`` partial distances matrix (e.g. for train/test splits,
        with test objects in rows, train objects in column)

        NOTE: Partial distance matrices MUST NOT contain self distances.

    k : int, optional (default: 5)
        Neighborhood size for `k`-occurrence. Reduced to ``m - 1`` (with a
        warning), if it is not smaller than the number of columns ``m``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    n_jobs : int, optional (default: 1)
        Number of parallel processes spawned for hubness calculation.
        Value 1 (default): One process (not using multiprocessing)
        Value (-1): As many processes as number of available CPUs.

    random_state : int, optional
        Seed the RNG for reproducible results.

        NOTE: Currently only compatible with `n_jobs`=1

    shuffle_equal : bool, optional
        If true, shuffle neighbors with identical distances to avoid
        artifact hubness.
        NOTE: This is especially useful for secondary distance measures
        with a restricted number of possible values, e.g. SNN or MP empiric.

    Returns
    -------
    S_k : float
        Hubness (skewness of `k`-occurrence distribution)
    D_k : ndarray
        `k`-nearest neighbor lists
    N_k : ndarray
        `k`-occurrence list

    Raises
    ------
    ValueError
        If `random_state` is given while `n_jobs` is not 1.

    References
    ----------
    .. [1] Radovanović, M., Nanopoulos, A., & Ivanović, M. (2010).
           Hubs in Space : Popular Nearest Neighbors in High-Dimensional Data.
           Journal of Machine Learning Research, 11, 2487–2531. Retrieved from
           http://jmlr.csail.mit.edu/papers/volume11/radovanovic10a/
           radovanovic10a.pdf
    """
    # Don't use multiprocessing environment when using only one job
    if n_jobs == 1:
        return _hubness_no_multiprocessing(D=D,
                                           k=k,
                                           metric=metric,
                                           verbose=verbose,
                                           random_state=random_state,
                                           shuffle_equal=shuffle_equal)
    if random_state is not None:
        raise ValueError("Seeding the RNG is not compatible with using n_jobs > 1.")
    log = ConsoleLogging()
    io.check_is_nD_array(arr=D, n=2, arr_type='Distance')
    io.check_valid_metric_parameter(metric)
    n, m = D.shape
    if k >= m:
        k_old = k
        k = m - 1
        log.warning("Reducing k from {} to {}, so that it is less than "
                    "the total number of neighbors.".format(k_old, k))
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
        # the k smallest values of each row
        kth = np.arange(k)
    else:  # metric == 'similarity' (ensured by the check above)
        d_self = -np.inf
        sort_order = -1
        # the k largest values of each row (rows have m entries)
        kth = np.arange(m - k, m)

    if verbose:
        log.message("Hubness calculation (skewness of {}-occurrence)".format(k))

    # Work on a copy, so that the caller's matrix is not modified
    D = D.copy()
    # Correct self-distance must be ensured upstream for sparse matrices
    if not issparse(D):
        if n == m:
            # Set self dist to inf
            np.fill_diagonal(D, d_self)
        # else: partial distance matrices MUST NOT contain self distances
        # make non-finite (NaN, Inf) appear at the end of the sorted list
        D[~np.isfinite(D)] = d_self

    # Parallelization
    if n_jobs == -1:  # take all cpus
        n_processes = mp.cpu_count()  # @UndefinedVariable
    else:
        n_processes = n_jobs
    # The neighbor lists live in a shared buffer that is handed to the
    # worker processes via the pool initializer.
    D_k_ctype = RawArray(ctypes.c_int32, n*k)
    D_k = np.frombuffer(D_k_ctype, dtype=np.int32).reshape((n, k))
    with Pool(processes=n_processes,
              initializer=_hubness_load_shared_data,
              initargs=(D, D_k, )) as pool:
        for _ in pool.imap(
            func=partial(_hubness_nearest_neighbors, n=n, m=m,
                         d_self=d_self, metric=metric, kth=kth,
                         sort_order=sort_order, log=log, verbose=verbose,
                         shuffle_equal=shuffle_equal),
            iterable=range(n)):
            pass  # results handled within func

    # k-occurrence
    N_k = np.bincount(D_k.astype(int).ravel(), minlength=m)
    # Hubness
    S_k = stats.skew(N_k)

    if verbose:
        log.message("Hubness calculation done.", flush=True)

    # return hubness, k-nearest neighbors, N occurence
    return S_k, D_k, N_k
Example #6
0
def _hubness_no_multiprocessing(D:np.ndarray, k:int=5, metric='distance',
                                verbose:int=0, random_state=None,
                                shuffle_equal:bool=True):
    """Hubness calculations without multiprocessing overhead.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` distance (similarity) matrix, or an ``n x m`` partial
        matrix that must not contain self distances.
    k : int, optional (default: 5)
        Neighborhood size. Reduced to ``m - 1`` (with a warning), if it is
        not smaller than the number of columns ``m``.
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
    random_state : int, optional
        Seed for the RNG that shuffles equal values.
    shuffle_equal : bool, optional (default: True)
        If true, randomly permute each row before the neighbor search, so
        that ties are not always broken in favor of the same points.

    Returns
    -------
    S_k : float
        Hubness (skewness of `k`-occurrence distribution)
    D_k : ndarray
        `k`-nearest neighbor lists (indices stored in a float64 array)
    N_k : ndarray
        `k`-occurrence list
    """
    log = ConsoleLogging()
    io.check_is_nD_array(arr=D, n=2, arr_type='Distance')
    io.check_valid_metric_parameter(metric)
    n, m = D.shape
    if k >= m:
        k_old = k
        k = m - 1
        log.warning("Reducing k from {} to {}, so that it is less than "
                    "the total number of neighbors.".format(k_old, k))
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
        # the k smallest values of each row
        kth = np.arange(k)
    else:  # metric == 'similarity' (ensured by the check above)
        d_self = -np.inf
        sort_order = -1
        # The k largest values of each row. Rows have m (not n) entries,
        # which matters for partial n x m matrices.
        kth = np.arange(m - k, m)

    if verbose:
        log.message("Hubness calculation (skewness of {}-occurence)".format(k))
    # Work on a copy, so that the caller's matrix is not modified
    D = D.copy()
    D_k = np.zeros((n, k), dtype=np.float64)
    rnd = np.random.RandomState(random_state)

    sparse = issparse(D)
    # Correct self-distance must be ensured upstream for sparse matrices
    if not sparse:
        if n == m:
            # Set self dist to inf
            np.fill_diagonal(D, d_self)
        # else: a partial distances matrix should not contain self distances
        # make non-finite (NaN, Inf) appear at the end of the sorted list
        D[~np.isfinite(D)] = d_self

    for i in range(n):
        if verbose and ((i+1)%10000==0 or i+1==n):
            log.message("NN: {} of {}.".format(i+1, n), flush=True)
        if sparse:
            d = D[i, :].toarray().ravel() # dense copy of one row
        else: # normal ndarray
            d = D[i, :]
        if n == m:
            d[i] = d_self
        else: # this does not hold for general dissimilarities
            if metric == 'distance':
                d[d==0] = d_self
        d[~np.isfinite(d)] = d_self
        if shuffle_equal:
            # Randomize equal values in the distance matrix rows to avoid the
            # problem case if all numbers to sort are the same, which would
            # yield high hubness, even if there is none.
            rp = rnd.permutation(m)
            d2 = d[rp]
            d2idx = np.argpartition(d2, kth=kth)
            # map positions in the permuted row back to original indices
            D_k[i, :] = rp[d2idx[kth]][::sort_order]
        else:
            d_idx = np.argpartition(d, kth=kth)
            D_k[i, :] = d_idx[kth][::sort_order]

    # N-occurence
    N_k = np.bincount(D_k.astype(int).ravel(), minlength=m)
    # Hubness
    S_k = stats.skew(N_k)

    # return k-hubness, k-nearest neighbors, k-occurence
    if verbose:
        log.message("Hubness calculation done.", flush=True)
    return S_k, D_k, N_k
def snn_sample(D: np.ndarray,
               k: int = 10,
               metric='distance',
               train_ind: np.ndarray = None,
               test_ind: np.ndarray = None,
               n_jobs: int = 1):
    """Transform distance matrix using shared nearest neighbors [1]_.

    __DRAFT_VERSION__

    SNN similarity is based on computing the overlap between the `k` nearest
    neighbors of two objects. SNN approaches try to symmetrize nearest neighbor
    relations using only rank and not distance information [2]_.

    Parameters
    ----------
    D : np.ndarray
        The ``n x s`` distance (similarity) matrix, where ``s==train_ind.size``.
        Column ``j`` refers to data point ``train_ind[j]``. If `train_ind`
        is None, `D` must be the full ``n x n`` matrix.

    k : int, optional (default: 10)
        Neighborhood radius: The `k` nearest neighbors are used to calculate SNN.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether the matrix `D` is a distance or similarity matrix

    train_ind : ndarray, optional (default: None)
        If given, use only these data points as neighbors for rescaling.
        If None, all data points are used.

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_snn : ndarray
        Secondary distance SNN matrix (only the rows in `test_ind`, if that
        was given).

    References
    ----------
    .. [1] R. Jarvis and E. A. Patrick, “Clustering using a similarity measure
           based on shared near neighbors,” IEEE Transactions on Computers,
           vol. 22, pp. 1025–1034, 1973.

    .. [2] Flexer, A., & Schnitzer, D. (2013). Can Shared Nearest Neighbors
           Reduce Hubness in High-Dimensional Spaces? 2013 IEEE 13th
           International Conference on Data Mining Workshops, 460–467.
           http://doi.org/10.1109/ICDMW.2013.101
    """
    # Checking input. Without `train_ind` there is no sample to check
    # against, so fall back to the plain distance matrix check.
    if train_ind is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, train_ind)
    io.check_valid_metric_parameter(metric)
    if metric == 'distance':
        self_value = 0.
        sort_order = 1
        exclude = np.inf
    else:  # metric == 'similarity' (ensured by the check above)
        self_value = 1.
        sort_order = -1
        exclude = -np.inf
    # Work on a copy, so that the caller's matrix is not modified
    distance = D.copy()
    n = distance.shape[0]
    if train_ind is None:
        # All points are potential neighbors: column j refers to point j
        train_ind = np.arange(n)
    if test_ind is None:
        n_ind = range(n)
    else:
        n_ind = test_ind
    # Exclude self distances, located at [train_ind[j], j]
    for j, sample in enumerate(train_ind):
        distance[sample, j] = exclude

    if n_jobs == -1:
        n_jobs = cpu_count()
    if n_jobs > 1:
        # Neighborhood flags and results live in shared buffers that are
        # handed to the worker processes via the pool initializer.
        knn_ctype = RawArray(ctypes.c_bool, distance.size)
        knn = np.frombuffer(knn_ctype, dtype=bool).reshape(D.shape)
        D_snn_ctype = RawArray(ctypes.c_double, distance.size)
        D_snn = np.frombuffer(D_snn_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_snns_init,
                  initargs=(distance, knn, train_ind, D_snn)) as pool:
            for _ in pool.imap(func=partial(_snns_my_hood,
                                            k=k,
                                            sort_order=sort_order),
                               iterable=range(n)):
                pass  # Handling inside function
            for _ in pool.imap(func=partial(_snns_our_hood, k=k,
                                            metric=metric),
                               iterable=n_ind):
                pass  # Handling inside function
    else:
        knn = np.zeros_like(distance, bool)
        # find nearest neighbors for each point
        for i in range(n):
            di = distance[i, :]
            # TODO change to np.partition for PERF
            nn = np.argsort(di)[::sort_order]
            knn[i, nn[0:k]] = True
        D_snn = np.zeros_like(distance)
        # Neighborhoods of the points the columns refer to
        knn_train = knn[train_ind, :]
        for i in n_ind:
            knn_i = knn[i, :]
            # using broadcasting: number of shared neighbors of i and each
            # training point
            Dij = np.sum(np.logical_and(knn_i, knn_train), 1)
            if metric == 'distance':
                D_snn[i, :] = 1. - Dij / k
            else:  # metric == 'similarity':
                D_snn[i, :] = Dij / k

    # Ensure correct self distances and return sec. dist. matrix.
    # For a square matrix these are on the diagonal; for an n x s sample
    # matrix they follow `train_ind`.
    for j, sample in enumerate(train_ind):
        D_snn[sample, j] = self_value
    if test_ind is None:
        return D_snn
    return D_snn[test_ind]
Example #8
0
def mutual_proximity_gaussi(
    D: np.ndarray,
    metric: str = 'distance',
    sample_size: int = 0,
    min_nnz: int = 30,
    test_set_ind: np.ndarray = None,
    verbose: int = 0,
    idx: np.ndarray = None,
):
    """Transform distances with Mutual Proximity (indep. normal distributions).

    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gaussi
    variant assumes independent normal distributions (FAST).
    The resulting second. distance/similarity matrix should show lower hubness.

    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.

        NOTE: In case of sparse `D`, zeros are interpreted as missing values
        and ignored during calculations. Thus, results may differ
        from using a dense version.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: In case of sparse `D`, only 'similarity' is supported.

    sample_size : int, optional (default: 0)
        Define sample size from which Gauss parameters are estimated.
        Use all data when set to ``0``. Otherwise, up to `sample_size`
        randomly chosen training rows are used.
        Ignored in case of SampleMP (i.e. if provided `idx`).

    min_nnz : int, optional, default: 30
        Calculate MP between two objects `i` and `j`, iff at least ``min_nnz``
        values are present in both row ``i`` and ``j``.
        Otherwise, return the original similarity.
        Ignored, if `metric` is 'distance'.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

        Ignored in case of SampleMP (i.e. if provided `idx`).

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    idx : ndarray, optional (default: None)
        The index array that determines to which data points the columns in
        `D` correspond. Only required for SampleMP.

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gaussi matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    # Initialization
    log = ConsoleLogging()

    # Checking input
    if idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, idx)
    io.check_valid_metric_parameter(metric)
    n = D.shape[0]
    s = D.shape[1]

    if metric == 'similarity':
        self_value = 1
    else:  # metric == 'distance':
        self_value = 0
    if test_set_ind is None:
        train_set_ind = slice(0, n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    # Start MP Gaussi
    if verbose:
        log.message('Mutual Proximity Gaussi rescaling started.', flush=True)
    # Work on a copy, so that the caller's matrix is not modified
    D = D.copy()

    if issparse(D):
        return _mutual_proximity_gaussi_sparse(D, sample_size, min_nnz,
                                               test_set_ind, verbose, log)

    # ignore self dist/sim for parameter estimation
    if idx is None:
        np.fill_diagonal(D, np.nan)
    else:
        # column j refers to data point idx[j]
        for j, i in enumerate(idx):
            D[i, j] = np.nan

    # Calculate mean and std
    if idx is None:
        if sample_size == 0:
            rows = train_set_ind
        else:
            # Draw a random subset of the training rows.
            # np.random.shuffle() works in place and returns None (and
            # cannot handle a slice), so permute an explicit index array.
            candidates = np.arange(n)[train_set_ind]
            rows = np.random.permutation(candidates)[:sample_size]
        mu = np.nanmean(D[rows], 0)
        sd = np.nanstd(D[rows], 0, ddof=0)
    else:
        mu = np.nanmean(D, 1)
        sd = np.nanstd(D, 1, ddof=0)
    # Avoid downstream div/0 errors
    sd[sd == 0] = 1e-7
    # set self dist/sim back to self_value to avoid scipy warnings
    if idx is None:
        np.fill_diagonal(D, self_value)
    else:
        for j, i in enumerate(idx):
            D[i, j] = self_value

    # MP Gaussi
    D_mp = np.zeros_like(D)
    for i in range(n):
        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n):
            log.message("MP_gaussi: {} of {}.".format(i + 1, n), flush=True)
        if idx is None:
            # symmetric case: only calculate the upper triangle
            j = slice(i + 1, n)
            j_mom = j
        else:
            # SampleMP: all columns; their moments are those of the
            # data points the columns refer to
            j = slice(0, s)
            j_mom = idx[j]

        if metric == 'similarity':
            p1 = norm.cdf(D[i, j], mu[i], sd[i])
            p2 = norm.cdf(D[i, j], mu[j_mom], sd[j_mom])
            D_mp[i, j] = (p1 * p2).ravel()
        else:
            # sf(.) := 1 - cdf(.)
            p1 = norm.sf(D[i, j], mu[i], sd[i])
            p2 = norm.sf(D[i, j], mu[j_mom], sd[j_mom])
            D_mp[i, j] = (1 - p1 * p2).ravel()

    if idx is None:
        # copy upper to lower triangle -> symmetric matrix
        D_mp += D_mp.T
        np.fill_diagonal(D_mp, self_value)
    else:
        # Ensure correct self distances
        for j, sample in enumerate(idx):
            D_mp[sample, j] = self_value
    return D_mp
Example #9
0
 def test_check_valid_metric(self):
     """An unknown metric name must be rejected with a ValueError."""
     invalid_metric = 'dissimilarity'
     with self.assertRaises(ValueError):
         io.check_valid_metric_parameter(invalid_metric)
def predict(D:np.ndarray, target:np.ndarray, k=5,
            metric:str='distance', test_ind:np.ndarray=None, verbose:int=0,
            sample_idx=None, return_cmat=True):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter `test_ind`).
    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
        Dense matrices are copied; sparse matrices are NOT copied and are
        modified in place when `sample_idx` is given.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth) or
        ``n x c`` in case of ``c`` binarized multilabels

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to perform
        multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit
          model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    sample_idx : ndarray, optional (default: None)
        If given, column ``j`` of `D` is treated as the distances to data
        point ``sample_idx[j]``, and neighbors are only searched among
        these sample points. Cannot be combined with `test_ind`.

    return_cmat : bool, optional, default: True
        If False, only return the predictions `y_pred`.
        Otherwise also return the confusion matrices.

    Returns
    -------
    y_pred : ndarray (shape=(n_k, n, c), dtype=int)
        Predicted class labels (`n_k`... number of items in parameter `k`).
        The predictions are indices into the sorted unique values of
        `target`. Rows of points that were not classified (training points
        in test set mode) remain zero.

        HINT: Referring to the above example...
        ... ``y_pred[0]`` gives the predictions of the ``k=1`` experiment.

    cmat : ndarray (shape=(n_k x c x n_t x n_t), dtype=int)
        Confusion matrix (``n_t`` number of unique items in parameter target)

        HINT: ... ``cmat[2, 0, :, :]`` gives the confusion matrix of
        the first class in the ``k=20`` experiment in the following order:
            TN    FP
            FN    TP

    Raises
    ------
    NotImplementedError
        If both `test_ind` and `sample_idx` are given.
    """

    # Check input sanity
    log = ConsoleLogging()
    if sample_idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, sample_idx)
    io.check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    else:  # metric == 'similarity'
        d_self = -np.inf
        sort_order = -1

    # Copy, because data is changed
    if not issparse(D):
        D = D.copy()
    target = target.astype(int)
    if target.ndim == 1:
        target = target[:, np.newaxis]
    if verbose:
        log.message("Start k-NN experiment.")

    # Handle LOO-CV vs. test set mode
    if test_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)
        # An int makes np.random.permutation draw from arange(n) below;
        # the query point itself is excluded via d_self.
        train_set_ind = n
    else:
        if sample_idx is not None:
            raise NotImplementedError("Sample k-NN does not support train/"
                                      "test splits at the moment.")
        test_set_ind = np.asarray(test_ind)
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples: all points that are not held out
        train_set_ind = np.setdiff1d(np.arange(D.shape[0]), test_set_ind)

    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError as e:
        if isinstance(k, int):
            k = np.array([k])
            k_length = k.size
        elif isinstance(k, list):
            k = np.array(k)
            k_length = k.size
        else:
            raise e

    cl = np.sort(np.unique(target))
    cmat = np.zeros((k_length, target.shape[1], len(cl), len(cl)), dtype=int)
    y_pred = np.zeros((k_length, *target.shape), dtype=int)

    classes = target.copy()
    for class_idx, cur_class in enumerate(np.array(cl).ravel()):
        # change labels to 0, 1, ..., len(cl)-1
        classes[target == cur_class] = class_idx
    if sample_idx is not None:
        sample_classes = classes[sample_idx]
        # Exclude self distances/similarities of the sample points
        for col, sample in enumerate(sample_idx):
            D[sample, col] = d_self
    cl = range(len(cl))

    # Classify each point in test set
    for i in test_set_ind:
        if verbose and ((i+1)%1000==0 or i+1==n):
            log.message("Prediction: {} of {}.".format(i+1, n), flush=True)

        if issparse(D):
            row = D.getrow(i)
            ind = row.nonzero()[1]
            row = row.toarray().ravel()
        else:
            row = D[i, :]
        if sample_idx is None:
            row[i] = d_self

        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        if sample_idx is None:
            rp = train_set_ind
        elif issparse(D):
            rp = ind
        else:
            rp = np.arange(len(sample_idx))
        rp = np.random.permutation(rp)
        d2 = row[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        d2idx = d2idx[~np.isnan(d2[d2idx])] # filter NaN values
        nn_idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            # Make sure no inf/-inf/nan values are used for classification
            finite_val = np.isfinite(row[nn_idx[0:k[j]]])
            # However, if no values are finite, classify randomly
            if finite_val.sum() == 0:
                nn_idx = np.random.permutation(nn_idx)
                finite_val = np.ones_like(finite_val)
                log.warning("Query was classified randomly, because all "
                            "distances were non-finite numbers.")
            for l in range(target.shape[1]):
                if sample_idx is None:
                    l_classes = classes[:, l]
                else:
                    l_classes = sample_classes[:, l]
                nn_class = l_classes[nn_idx[0:k[j]]][finite_val]
                cs = np.bincount(nn_class.astype(int))
                max_cs = np.where(cs == np.max(cs))[0]
                seed_class = classes[i, l]
                # "tie": use nearest neighbor
                if len(max_cs) > 1:
                    y_pred[j, i, l] = nn_class[0]
                    cmat[j, l, seed_class, nn_class[0]] += 1
                # majority vote
                else:
                    y_pred[j, i, l] = cl[max_cs[0]]
                    cmat[j, l, seed_class, cl[max_cs[0]]] += 1

    if verbose:
        log.message("Finished k-NN experiment.")

    if return_cmat:
        return y_pred, cmat
    else:
        return y_pred
def r_precision(S:np.ndarray, y:np.ndarray, metric:str='distance',
                average:str='weighted', return_y_pred:int=0,
                verbose:int=0, n_jobs:int=1) -> float:
    """ Calculate R-Precision (recall at R-th position).

    Parameters
    ----------
    S : ndarray or CSR matrix
        Distance (similarity) matrix

    y : ndarray
        Target (ground truth) labels

    metric : 'distance' or 'similarity', optional, default: 'distance'
        Define, whether `S` is a distance or similarity matrix.

        NOTE: Currently only ``metric='similarity'`` with a sparse `S` is
        implemented, so the default value always raises.

    average : 'weighted', 'macro' or None, optional, default: 'weighted'
        Ignored. Weighted and macro precisions are returned.

    return_y_pred : int, optional, default: 0
        If > 0, return the labels of the `return_y_pred` nearest neighbors

    verbose : int, optional, default: 0
        Increasing level of output.

    n_jobs : int, optional, default: 1
        Number of parallel processes to use.

    Returns
    -------
    r_precision : dictionary with following keys:
        macro : float
            Macro R-Precision.

        weighted : float
            Weighted R-Precision.

        per_item : ndarray
            R-Precision at the object.

        relevant_items : ndarray
            Relevant items per class.

        y_true : ndarray
            Target labels (req. for weighting).

        y_pred : ndarray
            Labels of some k-nearest neighbors

    Raises
    ------
    NotImplementedError
        If `metric` is not 'similarity' or `S` is not sparse.
    """
    io.check_distance_matrix_shape(S)
    io.check_distance_matrix_shape_fits_labels(S, y)
    io.check_valid_metric_parameter(metric)
    log = ConsoleLogging()
    n, _ = S.shape
    S_is_sparse = issparse(S)
    if metric != 'similarity' or not S_is_sparse:
        raise NotImplementedError("Only sparse similarity matrices so far.")

    # Map labels to 0..n(labels)-1
    le = LabelEncoder()
    # Add int.min for misclassifications. Spelled out explicitly, because
    # casting NaN to int is platform dependent and warns in recent NumPy.
    incorr_orig = np.array([np.iinfo(np.int_).min], dtype=np.int_)
    le.fit(np.append(y, incorr_orig))
    y = le.transform(y)
    incorrect = le.transform(incorr_orig)
    # Number of relevant items, i.e. number of each label
    relevant_items = np.bincount(y) - 1 # one less for self class
    # R-Precision for each item (np.float alias was removed from NumPy)
    r_prec = np.zeros(n, dtype=float)

    # Classify each point in test set
    if verbose:
        log.message("Creating shared memory data.")
    n_random_pred = mp.Value(ctypes.c_int)
    n_random_pred.value = 0
    if verbose and log:
        log.message("Spawning processes for prediction.")
    y_pred = np.zeros((n, return_y_pred), dtype=float)
    kwargs = {'y_pred' : return_y_pred,
              'incorrect' : incorrect}
    # Progress is reported every `report_every` items; never let the
    # interval drop to zero (modulo by zero) for large verbosity levels.
    report_every = max(1, int(1e7 / 10**verbose)) if verbose else 1
    with mp.Pool(processes=n_jobs,
                 initializer=_load_shared_csr,
                 initargs=(S, y, n_random_pred, relevant_items)) as pool:
        for i, r in enumerate(
            pool.imap(
                func=partial(_r_prec_worker, **kwargs),
                iterable=range(n),
                chunksize=int(1e2))):
            if verbose and ((i+1) % report_every == 0 or i == n-1):
                log.message("Classification: {} of {} on {}.".format(
                            i+1, n, mp.current_process().name), flush=True)
            # The worker result is either a pair (r_prec, neighbor labels)
            # or a bare scalar; a scalar is not subscriptable.
            try:
                r_prec[i] = r[0]
                y_pred[i, :] = r[1]
            except (TypeError, IndexError):
                r_prec[i] = r
    pool.join()

    if verbose and log:
        log.message("Retrieving nearest neighbors.")
    # Work-around for new scikit-learn requirement of 1D arrays for LabelEncoder
    y_pred = np.asarray([le.inverse_transform(col) for col in y_pred.T.astype(int)]).T
    if verbose and log:
        log.message("Finishing.")
    if n_random_pred.value:
        log.warning(("{} queries were classified randomly, because all "
                     "distances were non-finite numbers or there were no other "
                     "objects in the same class.").format(n_random_pred.value))
    return_dict = {'macro' : r_prec.mean(),
                   'weighted' : np.average(r_prec, weights=relevant_items[y]),
                   'per_item' : r_prec,
                   'relevant_items' : relevant_items,
                   'y_true' : y,
                   'y_pred' : y_pred}
    return return_dict
def score(D:np.ndarray, target:np.ndarray, k=5,
          metric:str='distance', test_set_ind:np.ndarray=None, verbose:int=0,
          sample_idx=None, filter_self=True):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter `test_set_ind`).
    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth).

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to perform
        multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit
          model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    sample_idx : ndarray, optional (default: None)
        If given, column ``j`` of `D` is treated as the distances to data
        point ``sample_idx[j]``, and neighbors are only searched among
        these sample points. Cannot be combined with `test_set_ind`.

    filter_self : bool, optional, default: True
        Remove self similarities from sparse ``D``.
        This assumes that the highest similarity per row is the self
        similarity.

        NOTE: Quadratic dense matrices are always filtered for self
        distances/similarities, even if `filter_self` is set to `False`.

    Returns
    -------
    acc : ndarray (shape=(n_k x 1), dtype=float)
        Classification accuracy (`n_k`... number of items in parameter `k`)

        HINT: Referring to the above example...
        ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment.
    corr : ndarray (shape=(n_k x n), dtype=int)
        Raw vectors of correctly classified items

        HINT: ... ``corr[1, :]`` gives these items for the ``k=5`` experiment.
    cmat : ndarray (shape=(n_k x n_t x n_t), dtype=int)
        Confusion matrix (``n_t`` number of unique items in parameter target)

        HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of
        the ``k=20`` experiment.

    Raises
    ------
    NotImplementedError
        If both `test_set_ind` and `sample_idx` are given.
    """

    # Check input sanity
    log = ConsoleLogging()
    if sample_idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, sample_idx)
    io.check_distance_matrix_shape_fits_labels(D, target)
    io.check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    else:  # metric == 'similarity'
        d_self = -np.inf
        sort_order = -1

    # Copy, because data is changed
    D = D.copy()
    target = target.astype(int)
    D_is_sparse = issparse(D)

    if verbose:
        log.message("Start k-NN experiment.")
    # Handle LOO-CV vs. test set mode
    if test_set_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)
        # An int makes np.random.permutation draw from arange(n) below;
        # the query point itself is excluded via d_self.
        train_set_ind = n
    else:
        if sample_idx is not None:
            raise NotImplementedError("Sample k-NN does not support train/"
                                      "test splits at the moment.")
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples: all points that are not held out.
        # (Must range over all points in D, not just over the test set size.)
        train_set_ind = np.setdiff1d(np.arange(D.shape[0]), test_set_ind)

    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError as e:
        if isinstance(k, int):
            k = np.array([k])
            k_length = k.size
        elif isinstance(k, list):
            k = np.array(k)
            k_length = k.size
        else:
            raise e

    acc = np.zeros((k_length, 1))
    corr = np.zeros((k_length, D.shape[0]))

    cl = np.sort(np.unique(target))
    if D_is_sparse:
        # Add a label for unknown class (object w/o nonzero sim to any others)
        cl = np.append(cl, cl.max()+1)
        n_classes = len(cl) + 1
    else:
        n_classes = len(cl)
    cmat = np.zeros((k_length, n_classes, n_classes))

    classes = target.copy()
    for class_idx, cur_class in enumerate(cl):
        # change labels to 0, 1, ..., len(cl)-1
        classes[target == cur_class] = class_idx
    if sample_idx is not None:
        sample_classes = classes[sample_idx]
        # Exclude self distances/similarities of the sample points
        for col, sample in enumerate(sample_idx):
            D[sample, col] = d_self
    cl = range(len(cl))

    rnd_classif = np.zeros(k_length)
    # Classify each point in test set
    for i in test_set_ind:
        if verbose and ((i+1)%1000==0 or i+1==n):
            log.message("Prediction: {} of {}.".format(i+1, n), flush=True)

        seed_class = classes[i]

        if D_is_sparse:
            row = D.getrow(i)
        else:
            row = D[i, :]
            if sample_idx is None:
                row[i] = d_self

        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        if D_is_sparse:
            nnz = row.nnz
            rp = np.random.permutation(nnz)
            d2 = row.data[rp]
            # Partition for each k value
            kth = nnz - k - 1
            # sort the two highest similarities to end
            kth = np.append(kth, [nnz-2, nnz-1])
            # Clip negative indices (nnz < k)
            np.clip(kth, a_min=0, a_max=nnz-1, out=kth)
            # Remove duplicate k values and sort
            kth = np.unique(kth)
            d2idx = np.argpartition(d2, kth=kth)
            d2idx = d2idx[~np.isnan(d2[d2idx])][::-1]
            nn_idx = row.nonzero()[1][rp[d2idx]]
            if filter_self:
                # Highest similarity is assumed to be the self similarity
                nn_idx = nn_idx[1:]
        else:
            if sample_idx is None:
                rp = train_set_ind
            else:
                rp = np.arange(len(sample_idx))
            rp = np.random.permutation(rp)
            d2 = row[rp]
            d2idx = np.argsort(d2, axis=0)[::sort_order]
            d2idx = d2idx[~np.isnan(d2[d2idx])] # filter NaN values
            nn_idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            # Make sure no inf/-inf/nan values are used for classification
            if D_is_sparse:
                finite_val = np.isfinite(
                    row[0, nn_idx[0:k[j]]].toarray().ravel())
            else:
                finite_val = np.isfinite(row[nn_idx[0:k[j]]])
            # However, if no values are finite, classify randomly
            if finite_val.sum() == 0:
                nn_idx = np.random.permutation(nn_idx)
                finite_val = np.ones_like(finite_val)
                rnd_classif[j] += 1
            if sample_idx is None:
                nn_class = classes[nn_idx[0:k[j]]][finite_val]
            else:
                nn_class = sample_classes[nn_idx[0:k[j]]][finite_val]
            cs = np.bincount(nn_class.astype(int))
            if cs.size > 0:
                max_cs = np.where(cs == np.max(cs))[0]
            else:
                max_cs = np.array([len(cl) - 1]) # misclassification label

            # "tie": use nearest neighbor
            if len(max_cs) > 1:
                if seed_class == nn_class[0]:
                    acc[j] += 1/n
                    corr[j, i] = 1
                cmat[j, seed_class, nn_class[0]] += 1
            # majority vote
            else:
                if cl[max_cs[0]] == seed_class:
                    acc[j] += 1/n
                    corr[j, i] = 1
                cmat[j, seed_class, cl[max_cs[0]]] += 1

    if np.any(rnd_classif):
        for x in rnd_classif:
            log.warning(("{} queries were classified randomly, because all "
                        "distances were non-finite numbers.").format(x))
    if verbose:
        log.message("Finished k-NN experiment.")

    return acc, corr, cmat
Example #13
0
def _mutual_proximity_empiric_sample(D: np.ndarray,
                                     idx: np.ndarray,
                                     metric: str = 'distance',
                                     test_set_ind: np.ndarray = None,
                                     verbose: int = 0,
                                     n_jobs=None):
    """Rescale a sample distance matrix with empiric Mutual Proximity.

    NOTE: proof-of-concept function; this docstring may not cover all of
    its properties.

    Mutual Proximity (MP) [1]_ is computed from the empiric distribution of
    the given distances/similarities (EXACT, rather SLOW). The resulting
    secondary distance/similarity matrix should show lower hubness.

    Parameters
    ----------
    D : ndarray
        The ``n x s`` distance or similarity matrix, where ``n`` and ``s``
        are the dataset and sample size, respectively.

    idx : ndarray
        Index array telling which data point each column of `D` belongs to
        (column ``j`` corresponds to data point ``idx[j]``).

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Whether `D` holds distances or similarities.

    test_set_ind : ndarray, optional (default: None)
        Data points to be treated as a test set:

        - None : Rescale all rows and return the full matrix
        - ndarray : Rescale and return only the rows indexed by this array.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    n_jobs : optional
        Not used by this function.

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP empiric matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    # Set up logging and validate the input
    log = ConsoleLogging()
    io.check_sample_shape_fits(D, idx)
    io.check_valid_metric_parameter(metric)
    n = D.shape[0]
    n_samples = D.shape[1]
    is_similarity = metric == 'similarity'
    if is_similarity:
        self_value, exclude_value = 1, np.inf
    else:
        self_value, exclude_value = 0, -np.inf
        if issparse(D):
            raise ValueError("MP sparse only supports similarity matrices.")
    rows_to_rescale = range(n) if test_set_ind is None else test_set_ind

    # Work on a private copy, since self entries are overwritten below
    D = D.copy()

    if issparse(D):
        raise NotImplementedError

    # Take self distances/similarities out of the comparisons
    for col, point in enumerate(idx):
        D[point, col] = exclude_value

    D_mp = np.zeros_like(D) * np.nan

    for i in rows_to_rescale:
        if verbose and ((i + 1) % 1000 == 0 or i == n - 2):
            log.message("MP_empiric: {} of {}.".format(i + 1, n - 1),
                        flush=True)
        query_row = D[i, :][np.newaxis, :]  # shape (1, s), gets broadcast
        sample_rows = D[idx, :]  # rows of the sample points (copy)
        query_col = query_row.T
        # Normalize by the sample size
        if is_similarity:
            in_both = (query_row <= query_col) & (sample_rows <= query_col)
            D_mp[i, :] = np.sum(in_both, 1) / n_samples
        else:
            in_both = (query_row > query_col) & (sample_rows > query_col)
            D_mp[i, :] = 1 - (np.sum(in_both, 1) / n_samples)

    # Restore the proper self distances/similarities
    for col, point in enumerate(idx):
        D_mp[point, col] = self_value

    if test_set_ind is not None:
        return D_mp[test_set_ind]
    return D_mp
Example #14
0
def mutual_proximity_gammai(D: np.ndarray,
                            metric: str = 'distance',
                            min_nnz: int = 30,
                            test_set_ind: np.ndarray = None,
                            verbose: int = 0):
    """Rescale a distance matrix with Mutual Proximity (indep. Gamma distr.).

    Mutual Proximity (MP) [1]_ is applied to a distance/similarity matrix.
    The Gammai variant models the distances as independent and Gamma
    distributed (FAST). The resulting secondary distance/similarity matrix
    should show lower hubness.

    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.

        NOTE: In case of sparse `D`, zeros are interpreted as missing values
        and ignored during calculations. Thus, results may differ
        from using a dense version.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Whether `D` holds distances or similarities.

        NOTE: In case of sparse `D`, only 'similarity' is supported.

    min_nnz : int, optional, default: 30
        Calculate MP between two objects `i` and `j`, iff at least ``min_nnz``
        values are present in both row ``i`` and ``j``.
        Otherwise, return the original similarity.
        Only forwarded to the sparse implementation.

    test_set_ind : ndarray, optional (default: None)
        Data points to be hold out as part of a test set:

        - None : Estimate the distribution parameters from all points
        - ndarray : Exclude the points indexed in this array from the
          parameter estimation.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gammai matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space.
           The Journal of Machine Learning Research, 13(1), 2871–2902.
    """
    n = D.shape[0]
    log = ConsoleLogging()

    # Validate the input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    is_similarity = metric == 'similarity'
    self_value = 1 if is_similarity else 0
    if test_set_ind is not None:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
    else:
        train_set_ind = slice(0, n)

    if verbose:
        log.message('Mutual proximity Gammai rescaling started.', flush=True)
    D = D.copy()

    # Sparse matrices are handled by a dedicated implementation
    if issparse(D):
        return _mutual_proximity_gammai_sparse(D, min_nnz, test_set_ind,
                                               verbose, log)

    # Self distances must not influence the parameter estimation
    np.fill_diagonal(D, np.nan)

    mu = np.nanmean(D[train_set_ind], 0)
    va = np.nanvar(D[train_set_ind], 0, ddof=1)
    # Guard against division by zero further down
    va[va == 0] = 1e-7
    gam_a = (mu**2) / va
    gam_b = va / mu

    D_mp = np.zeros_like(D)

    # Fill the strict upper triangle row by row
    for i in range(n):
        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n):
            log.message("MP_gammai: {} of {}".format(i + 1, n), flush=True)
        upper = slice(i + 1, n)

        cdf_i = _local_gamcdf(D[i, upper], gam_a[i], gam_b[i])
        cdf_j = _local_gamcdf(D[upper, i], gam_a[upper], gam_b[upper])
        if is_similarity:
            D_mp[i, upper] = (cdf_i * cdf_j).ravel()
        else:  # distance
            tail_i = 1 - cdf_i
            tail_j = 1 - cdf_j
            D_mp[i, upper] = (1 - tail_i * tail_j).ravel()

    # Copy the upper triangle into the lower one
    D_mp += D_mp.T
    # Set the proper self distance/similarity
    np.fill_diagonal(D_mp, self_value)

    return D_mp
Example #15
0
def nicdm_sample(D:np.ndarray, k:int=7, metric:str='distance',
                 train_ind:np.ndarray=None, test_ind:np.ndarray=None):
    """Transform a distance matrix with local scaling variant NICDM.

    --- DRAFT version ---

    Transforms the given distance matrix into new one using NICDM [1]_
    with the given neighborhood radius `k` (average). There are two types of
    local scaling methods implemented. The original one and the non-iterative
    contextual dissimilarity measure, both reduce hubness in distance spaces,
    similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray
        The distance matrix. Column ``j`` holds the distances to data point
        ``train_ind[j]``; without `train_ind` this is the ``n x n``
        symmetric distance matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        Only 'distance' is supported at the moment.

    train_ind : ndarray, optional
        If given, use only these data points as neighbors for rescaling.
        If None, all ``D.shape[0]`` data points are used.

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.
          Only these rows are rescaled and returned.

    Returns
    -------
    D_nicdm : ndarray
        Secondary distance NICDM matrix.

    Raises
    ------
    NotImplementedError
        If `metric` is 'similarity' or `D` is a sparse matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    # Checking input
    if train_ind is None:
        # No sample given: every data point is a potential neighbor
        train_ind = np.arange(D.shape[0])
    io.check_sample_shape_fits(D, train_ind)
    io.check_valid_metric_parameter(metric)
    if metric == 'similarity':
        raise NotImplementedError("NICDM does not support similarity matrices "
                                  "at the moment.")
    # metric == 'distance'
    # Test for sparsity BEFORE copying: np.copy() turns a sparse matrix
    # into an object array, which issparse() no longer recognizes.
    if issparse(D):
        raise NotImplementedError(
            "Sparse distance matrices are not supported.")
    D = np.copy(D)
    kth = np.arange(k)
    exclude = np.inf
    self_value = 0

    n = D.shape[0]
    if test_ind is None:
        n_ind = range(n)
    else:
        n_ind = test_ind
    # Exclude self distances
    for j, sample in enumerate(train_ind):
        D[sample, j] = exclude

    # Statistics: mean distance to the k nearest neighbors of each point
    r = np.partition(D, kth=kth, axis=1)[:, :k].mean(axis=1)
    r_geom = _local_geomean(r)

    # Calculate secondary distances
    D_nicdm = np.zeros_like(D)
    for i in n_ind:
        # vectorized inner loop (using broadcasting)
        D_nicdm[i, :] = (r_geom * D[i, :]) / np.sqrt(r[i] * r[train_ind])

    # Ensure correct self distances. They sit at [train_ind[j], j], which
    # only coincides with the main diagonal for the full square matrix.
    for j, sample in enumerate(train_ind):
        D_nicdm[sample, j] = self_value

    if test_ind is None:
        return D_nicdm
    return D_nicdm[test_ind]
def shared_nearest_neighbors(D: np.ndarray,
                             k: int = 10,
                             metric='distance',
                             n_jobs: int = 1):
    """Compute a shared nearest neighbors (SNN) secondary measure [1]_.

    For every pair of objects, the overlap between their two `k`-nearest
    neighbor sets is counted and divided by `k`. Only the neighbor ranks
    derived from `D` enter the result, not the magnitudes stored in `D` [2]_.

    Parameters
    ----------
    D : np.ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 10)
        Neighborhood size: number of nearest neighbors forming the
        neighbor set of each object.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Whether `D` contains distances or similarities. This selects which
        end of the value range counts as "near" and the form of the result.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_snn : ndarray
        Symmetric secondary matrix. In the single-process path the entries
        are ``1 - overlap / k`` for ``metric='distance'`` and
        ``overlap / k`` for ``metric='similarity'``. The diagonal is set to
        0 (distance) or 1 (similarity).

    References
    ----------
    .. [1] R. Jarvis and E. A. Patrick, “Clustering using a similarity measure
           based on shared near neighbors,” IEEE Transactions on Computers,
           vol. 22, pp. 1025–1034, 1973.

    .. [2] Flexer, A., & Schnitzer, D. (2013). Can Shared Nearest Neighbors
           Reduce Hubness in High-Dimensional Spaces? 2013 IEEE 13th
           International Conference on Data Mining Workshops, 460–467.
           http://doi.org/10.1109/ICDMW.2013.101
    """
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    n = D.shape[0]

    # Metric-dependent settings, in this order: diagonal value of the
    # result, direction in which the partitioned indices are read, sentinel
    # keeping an object out of its own neighborhood, partition pivot.
    if metric == 'distance':
        self_value, sort_order, exclude, kth = 0., 1, np.inf, k
    if metric == 'similarity':
        self_value, sort_order, exclude, kth = 1., -1, -np.inf, n - k

    # Work on a copy, so that the caller's matrix keeps its diagonal
    distance = D.copy()
    np.fill_diagonal(distance, exclude)

    if n_jobs == -1:
        n_jobs = cpu_count()

    if n_jobs > 1:
        # RawArray-backed arrays are passed to the workers through the pool
        # initializer; the helpers (defined elsewhere) are expected to fill
        # them in place, which is why the imap results are discarded.
        shape = D.shape
        knn_ctype = RawArray(ctypes.c_bool, D.size)
        knn = np.frombuffer(knn_ctype, dtype=bool).reshape(shape)
        D_snn_ctype = RawArray(ctypes.c_double, D.size)
        D_snn = np.frombuffer(D_snn_ctype, dtype=np.float64).reshape(shape)

        find_hood = partial(_snn_my_hood,
                            k=k,
                            kth=kth,
                            sort_order=sort_order)
        compare_hoods = partial(_snn_our_hood, k=k, metric=metric)
        with Pool(processes=n_jobs,
                  initializer=_snn_init,
                  initargs=(distance, knn, D_snn)) as pool:
            # First pass: neighborhoods; second pass: pairwise overlaps.
            for worker_func in (find_hood, compare_hoods):
                for _ in pool.imap(func=worker_func, iterable=range(n)):
                    pass
    else:
        # Boolean membership matrix: knn[i, j] is True, iff j is among the
        # k nearest neighbors of i.
        knn = np.zeros_like(distance, bool)
        for row in range(n):
            ranked = np.argpartition(distance[row, :], kth=kth)[::sort_order]
            knn[row, ranked[:k]] = True

        D_snn = np.zeros_like(distance)
        as_distance = metric == 'distance'
        for row in range(n):
            # Only the upper triangle is computed; it is mirrored below.
            later = slice(row + 1, n)
            # Broadcast one membership row against all following rows
            shared = np.logical_and(knn[row, :], knn[later, :]).sum(axis=1)
            overlap = shared / k
            D_snn[row, later] = 1. - overlap if as_distance else overlap

    D_snn += D_snn.T
    np.fill_diagonal(D_snn, self_value)
    return D_snn
Example #17
0
def nicdm(D:np.ndarray, k:int=7, metric:str='distance',
          test_ind:np.ndarray=None, n_jobs:int=1):
    """Rescale a distance matrix with NICDM, a local scaling variant [1]_.

    Each distance ``D[i, j]`` is divided by ``sqrt(r[i] * r[j])`` and
    multiplied by a global factor obtained from `_local_geomean` applied
    to all ``r``. In the single-process path, ``r[i]`` is the mean of the
    `k` smallest distances from object ``i`` to the training objects
    (self distances excluded).

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance matrix. It is copied, the caller's
        array is not modified.

    k : int, optional (default: 7)
        Neighborhood size: number of nearest neighbors averaged per object.

    metric : {'distance'}, optional (default: 'distance')
        Currently, only distance matrices are supported.

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.
          Neighborhood statistics are then taken over the remaining
          (training) columns only.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_nicdm : ndarray
        Secondary distance NICDM matrix.

    Raises
    ------
    NotImplementedError
        If ``metric == 'similarity'``.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    # Checking input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    if metric == 'similarity':
        raise NotImplementedError("NICDM does not support similarity matrices "
                                  "at the moment.")

    # Partitioning at every position 0..k-1 leaves the k smallest values
    # in the leading k columns.
    kth = np.arange(k)
    if n_jobs == -1:
        n_jobs = cpu_count()

    D = np.copy(D)
    n = D.shape[0]
    train_ind = (slice(0, n) if test_ind is None
                 else np.setdiff1d(np.arange(n), test_ind))

    # Self distances must never be selected as nearest neighbors
    np.fill_diagonal(D, np.inf)

    if n_jobs > 1:
        rows = range(n)

        # Stage 1: local statistics r. The worker helper (defined
        # elsewhere) is expected to write into the RawArray-backed `r`,
        # hence the imap results are discarded.
        r_ctype = RawArray(ctypes.c_double, n)
        r = np.frombuffer(r_ctype, dtype=np.float64)
        radius_job = partial(_nicdm_calculate_r, kth=kth, k=k)
        with Pool(processes=n_jobs,
                  initializer=_nicdm_load_shared_data,
                  initargs=(D, train_ind, r, r_ctype)) as pool:
            for _ in pool.imap(func=radius_job, iterable=rows):
                pass
            r_geom = _local_geomean(r)

        # Stage 2: secondary distances, again written by the workers.
        D_nicdm_ctype = RawArray(ctypes.c_double, D.size)
        D_nicdm = np.frombuffer(
            D_nicdm_ctype, dtype=np.float64).reshape(D.shape)
        rescale_job = partial(_nicdm_calculate_sec_dist,
                              r_geom=r_geom, n=n, metric=metric)
        with Pool(processes=n_jobs,
                  initializer=_nicdm_load_shared_data,
                  initargs=(D, train_ind, r, r_ctype,
                            D_nicdm, D_nicdm_ctype)) as pool:
            for _ in pool.imap(func=rescale_job, iterable=rows):
                pass
    else:  # no multiprocessing
        neighbor_dists = np.partition(
            D[:, train_ind], kth=kth, axis=1)[:, :k]
        r = neighbor_dists.mean(axis=1)
        r_geom = _local_geomean(r)

        D_nicdm = np.zeros_like(D)
        for row in range(n):
            # Upper triangle only, one broadcast expression per row
            upper = slice(row + 1, None)
            D_nicdm[row, upper] = ((r_geom * D[row, upper])
                                   / np.sqrt(r[row] * r[upper]))
        # Mirror into the lower triangle; the diagonal stays zero
        D_nicdm += D_nicdm.T

    return D_nicdm
Example #18
0
def _mutual_proximity_empiric_full(D: np.ndarray,
                                   metric: str = 'distance',
                                   test_set_ind: np.ndarray = None,
                                   min_nnz: int = 0,
                                   verbose: int = 0,
                                   n_jobs=None):
    """Transform a distance matrix with Mutual Proximity (empiric distribution).

    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix using
    the empiric data distribution (EXACT, rather SLOW). The resulting
    secondary distance/similarity matrix should show lower hubness.

    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.

        NOTE: In case of sparse `D`, zeros are interpreted as missing values
        and ignored during calculations. Thus, results may differ
        from using a dense version.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: In case of sparse `D`, only 'similarity' is supported.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Train/test
        splits are not implemented yet, so only the following are accepted:

        - None : Rescale all distances
        - empty index array or all-False boolean mask : same as None

        Any argument that actually selects points raises
        ``NotImplementedError``.

    min_nnz : int, optional, default: 0
        Calculate MP between two objects `i` and `j`, iff at least ``min_nnz``
        values are present in both row ``i`` and ``j``.
        Otherwise, return the original distance/similarity.

        NOTE: Currently only implemented for MP empiric w/ sparse sim matrices

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    n_jobs : int, optional (default: None)
        Only forwarded to the sparse implementation; the dense computation
        in this function does not use it.

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP empiric matrix.

    Raises
    ------
    ValueError
        If `D` is sparse and ``metric`` is not 'similarity'.
    NotImplementedError
        If `test_set_ind` selects at least one point.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    # Initialization
    n = D.shape[0]
    log = ConsoleLogging()

    # Check input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    if metric == 'similarity':
        self_value = 1
        exclude_value = np.inf
    else:  # metric == 'distance':
        self_value = 0
        exclude_value = -np.inf
        if issparse(D):
            raise ValueError("MP sparse only supports similarity matrices.")

    if test_set_ind is not None:
        # Accept both boolean masks and integer index arrays. Bitwise
        # negation (``~``) is only meaningful for masks: on integer indices
        # it yields ``-i - 1``, which is non-zero for every index but -1,
        # so a check based on it would silently ignore a real test set.
        held_out = np.asarray(test_set_ind)
        if held_out.dtype == bool:
            has_test_points = bool(held_out.any())
        else:
            has_test_points = held_out.size > 0
        if has_test_points:
            raise NotImplementedError("MP empiric does not yet support train/"
                                      "test splits.")

    if issparse(D):
        return _mutual_proximity_empiric_sparse(D, test_set_ind, min_nnz,
                                                verbose, log, n_jobs)
    # Start MP
    # Work on a private copy. Integer/boolean input is promoted to float,
    # because it can neither hold the +/-inf diagonal sentinel nor the
    # fractional MP values (``zeros_like`` inherits the dtype).
    if D.dtype.kind in 'biu':
        D = D.astype(np.float64)
    else:
        D = D.copy()

    # Exclude self distances/similarities from all comparisons below
    # (NOT done for sparse matrices!)
    np.fill_diagonal(D, exclude_value)

    D_mp = np.zeros_like(D)

    # Calculate MP empiric
    for i in range(n - 1):
        if verbose and ((i + 1) % 1000 == 0 or i == n - 2):
            log.message("MP_empiric: {} of {}.".format(i + 1, n - 1),
                        flush=True)
        # Calculate only triu part of matrix
        j_idx = i + 1

        # Row i against all later rows j, compared with the pair value
        # taken from column i (equals D[i, j] for symmetric input).
        dI = D[i, :][np.newaxis, :]
        dJ = D[j_idx:n, :]
        d = D[j_idx:n, i][:, np.newaxis]

        # NOTE: counts are normalized by n, although at most n - 2 objects
        # can be counted (i and j themselves never satisfy the condition).
        if metric == 'similarity':
            # Fraction of objects at most as similar to both i and j
            # as i and j are to each other.
            D_mp[i, j_idx:] = np.sum((dI <= d) & (dJ <= d), 1) / n
        else:  # metric == 'distance':
            # One minus the fraction of objects farther away from both
            # i and j than i and j are from each other.
            D_mp[i, j_idx:] = 1 - (np.sum(
                (dI > d) & (dJ > d), 1) / n)

    # Mirror, so that matrix is symmetric
    D_mp += D_mp.T
    np.fill_diagonal(D_mp, self_value)

    return D_mp