Beispiel #1
0
def nicdm_sample(D:np.ndarray, k:int=7, metric:str='distance',
                 train_ind:np.ndarray=None, test_ind:np.ndarray=None):
    """Transform a (sample) distance matrix with the local scaling variant NICDM.

    --- DRAFT version ---

    Rescales distances with the non-iterative contextual dissimilarity
    measure (NICDM) [1]_. For every row ``i`` the local radius ``r[i]`` is the
    mean of its `k` smallest distances (self distance excluded). The
    secondary distance is then::

        D_nicdm[i, j] = r_geom * D[i, j] / sqrt(r[i] * r[train_ind[j]])

    where ``r_geom`` is computed from all radii by the module helper
    ``_local_geomean`` (presumably their geometric mean -- the helper is not
    part of this block).

    Parameters
    ----------
    D : ndarray
        The ``n x s`` distance matrix; column ``j`` holds the distances to
        data point ``train_ind[j]``. Sparse matrices are not supported.

    k : int, optional (default: 7)
        Neighborhood size used for the local radius.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        Only ``'distance'`` is implemented.

    train_ind : ndarray, optional (default: None)
        Row index of the data point each column of `D` corresponds to.
        If None, `D` is treated as a quadratic all-against-all matrix,
        i.e. ``train_ind = arange(n)``.

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Only rescale and return the rows indexed by this array.

    Returns
    -------
    D_nicdm : ndarray
        Secondary distance NICDM matrix (all rows, or ``D_nicdm[test_ind]``).

    Raises
    ------
    NotImplementedError
        If `metric` is ``'similarity'`` or `D` is a sparse matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    # Checking input
    if train_ind is None:
        # Same convention as simhub()/simhubIN(): no index array means a
        # quadratic matrix whose column j belongs to data point j.
        io.check_distance_matrix_shape(D)
        train_ind = np.arange(D.shape[1])
    else:
        io.check_sample_shape_fits(D, train_ind)
    io.check_valid_metric_parameter(metric)
    if metric == 'similarity':
        raise NotImplementedError("NICDM does not support similarity matrices "
                                  "at the moment.")
    # The sparse check must precede np.copy(): copying a sparse matrix with
    # NumPy wraps it in a 0-d object array, which issparse() no longer detects.
    if issparse(D):
        raise NotImplementedError(
            "Sparse distance matrices are not supported.")
    D = np.copy(D)
    kth = np.arange(k)
    exclude = np.inf
    self_value = 0

    n = D.shape[0]
    n_ind = range(n) if test_ind is None else test_ind
    cols = np.arange(len(train_ind))

    # Exclude self distances: column j is data point train_ind[j]
    D[train_ind, cols] = exclude

    # Statistics: mean distance to the k nearest neighbors of each row
    r = np.partition(D, kth=kth, axis=1)[:, :k].mean(axis=1)
    r_geom = _local_geomean(r)
    r_train = r[train_ind]  # radii of the column points (loop invariant)

    # Calculate secondary distances
    D_nicdm = np.zeros_like(D)
    for i in n_ind:
        # vectorized inner loop (using broadcasting)
        D_nicdm[i, :] = (r_geom * D[i, :]) / np.sqrt(r[i] * r_train)

    # Ensure correct self distances. np.fill_diagonal() would only hit the
    # self entries if train_ind == arange(s); index them explicitly instead.
    D_nicdm[train_ind, cols] = self_value
    if test_ind is None:
        return D_nicdm
    return D_nicdm[test_ind]
Beispiel #2
0
def mutual_proximity_gaussi_sample(D: np.ndarray, idx: np.ndarray,
                                   metric: str = 'distance', test_set_ind: np.ndarray = None, verbose: int = 0):
    """Estimate per-row Gaussian parameters for sample-based Mutual Proximity.

    NOTE: proof-of-concept function. It does NOT rescale `D`; it only returns
    the row-wise mean and standard deviation that a Gaussian Mutual Proximity
    (MP) [1]_ transformation would be based on.

    Parameters
    ----------
    D : ndarray
        The ``n x s`` distance or similarity matrix, where ``n`` and ``s``
        are the dataset and sample size, respectively. `D` is not modified.
    idx : ndarray
        The index array that determines, to which data points the columns in
        `D` correspond.
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        Only validated; the statistics do not depend on it.
    test_set_ind : ndarray, optional (default: None)
        Currently unused; accepted for interface compatibility.
    verbose : int, optional (default: 0)
        Currently unused; accepted for interface compatibility.

    Returns
    -------
    mu : ndarray
        Mean of each row of `D`, ignoring the self entry ``D[idx[j], j]``.
    sd : ndarray
        Standard deviation (``ddof=0``) of each row of `D`, ignoring the self
        entry. Zero deviations are replaced by ``1e-7``.

    Raises
    ------
    NotImplementedError
        If `D` is a sparse matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    # Checking input
    io.check_sample_shape_fits(D, idx)
    io.check_valid_metric_parameter(metric)
    if issparse(D):
        raise NotImplementedError

    # Work on a copy, because the self entries are masked below
    D = D.copy()

    # Mask self distances/similarities: column j belongs to data point idx[j]
    D[idx, np.arange(len(idx))] = np.nan

    # Calculate mean and std per row, w/o self values (nan)
    mu = np.nanmean(D, 1)
    sd = np.nanstd(D, 1, ddof=0)
    # Avoid downstream div/0 errors
    sd[sd == 0] = 1e-7

    return mu, sd
Beispiel #3
0
def local_scaling_sample(D:np.ndarray, k:int=7, metric:str='distance',
                         train_ind:np.ndarray=None, test_ind:np.ndarray=None):
    """Transform a (sample) distance matrix with Local Scaling.

    --- DRAFT version ---

    Rescales `D` with local scaling [1]_ using the `k`-th nearest neighbor of
    each point as local radius ``r``. For distances::

        D_ls[i, j] = 1 - exp(-D[i, j]**2 / (r[i] * r[train_ind[j]]))

    For similarities the result is ``exp(-D[i, j]**2 / (r[i] * r[j]))``.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x s`` distance (similarity) matrix; column ``j`` belongs to
        data point ``train_ind[j]``. Sparse input is only accepted for
        similarities and is experimental.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        Similarities are only supported without `train_ind`.

    train_ind : ndarray, optional (default: None)
        Row index of the data point each column of `D` corresponds to.
        If None, `D` is treated as a quadratic all-against-all matrix and the
        local radius is computed from the non-test columns only.

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    Returns
    -------
    D_ls : ndarray or sparse matrix
        Secondary distance LocalScaling matrix (all rows, or
        ``D_ls[test_ind]``). For sparse input with ``test_ind=None`` a CSR
        matrix is returned and self entries are NOT reset.

    Raises
    ------
    NotImplementedError
        For similarities combined with `train_ind`, and for sparse distance
        matrices.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    full_matrix = train_ind is None
    if full_matrix:
        # Same convention as simhub()/simhubIN(): quadratic matrix expected
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, train_ind)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if metric == 'similarity':
        if not full_matrix:
            raise NotImplementedError
        exclude = -np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
    else: # metric == 'distance':
        exclude = np.inf
        self_value = 0
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")

    # np.copy() would wrap a sparse matrix in a 0-d object array
    D = D.copy() if sparse else np.copy(D)
    if full_matrix:
        train_ind = np.arange(n)
    cols = np.arange(len(train_ind))
    if test_ind is None:
        train_set_ind = slice(0, n) #take all
        n_ind = range(n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_ind)
        n_ind = test_ind

    # Exclude self distances: column j is data point train_ind[j]
    D[train_ind, cols] = exclude

    # Local radius: k-th nearest neighbor of every row
    r = np.zeros(n)
    for i in range(n):
        if full_matrix:
            if sparse:
                # flatten the 1 x m row, so that [rank] selects an element
                di = D[i, train_set_ind].toarray().ravel()
            else:
                di = D[i, train_set_ind]
        else:
            di = D[i, :] # all columns are training in this case
        # k-th smallest distance, or k-th largest similarity among the
        # candidates actually present in di
        rank = di.size - k if metric == 'similarity' else k - 1
        r[i] = np.partition(di, kth=rank)[rank]

    if sparse:
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        D_ls = np.zeros_like(D)

    r_train = r[train_ind]  # radii of the column points (loop invariant)
    if metric == 'similarity':
        # NOTE(review): for sparse rows `**` may mean matrix power depending
        # on the SciPy sparse type -- confirm before relying on sparse input.
        for i in n_ind:
            if sparse and nnz[i] <= k: # Don't rescale if there are too few
                D_ls[i, :] = D[i, :]   # neighbors in the current row
            else:
                D_ls[i, :] = np.exp(-1 * D[i, :]**2 / (r[i] * r_train))
    else:
        for i in n_ind:
            D_ls[i, :] = 1 - np.exp(-1 * D[i, :]**2 / (r[i] * r_train))

    if test_ind is None:
        if sparse:
            return D_ls.tocsr()
        # Ensure correct self distances. np.fill_diagonal() would only hit
        # them if train_ind == arange(s); index them explicitly instead.
        D_ls[train_ind, cols] = self_value
        return D_ls
    # Ensure correct self distances
    D_ls[train_ind, cols] = self_value
    return D_ls[test_ind]
def simhub(D: np.ndarray,
           y: np.ndarray,
           train_ind: np.ndarray = None,
           test_ind: np.ndarray = None,
           s: int = 50,
           return_distances: bool = True,
           vect_usage: int = 0):
    """Calculate dissimilarity based on hubness-aware SNN distances [1]_.

    Two points are similar if their `s`-nearest-neighbor sets overlap; each
    shared neighbor is weighted by its occurrence informativeness
    ``log(m / N_s)`` and, in supervised mode, by its class information gain.

    Parameters
    ----------
    D : ndarray
        The ``n x m`` distance matrix, where ``n`` and ``m`` are the dataset
        and sample size, respectively. Values are treated as distances.

    y : ndarray or None
        Class labels. Required for supervised simhub (simhubIN + simhubPUR).
        If None, calculate unsupervised simhubIN as per equation (6) in [1]_.

    train_ind : ndarray, optional, default: None
        The index array that determines, to which data points the columns in
        `D` correspond. Not required, if `D` is a quadratic all-against-all
        distance matrix.

    test_ind : ndarray, optional, default: None
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    s : int, optional, default: 50
        Neighborhood size. Can be optimized as to minimize hubness.

    return_distances : bool, optional, default: True
        If True, return distances (1 - similarities).
        Otherwise return similarities.

    vect_usage : int, optional, default: 0
        If > 0, always use vectorization for the inner simhub loop.
        If < 0, always use nested loops.
        If == 0, this is dependent on the sample size
        and vectorization is used if ``m < 2000``.

    Returns
    -------
    D_shi : ndarray
        Secondary distance (simhub) matrix; ``D_shi[test_ind]`` if `test_ind`
        is given.

    Raises
    ------
    ValueError
        If `s` is not in ``[1, m - 1]``.

    References
    ----------
    .. [1] Tomašev, N., Mladenić, D.(2012).
           Hubness-aware shared neighbor distances for high-dimensional
           $$k$$ -nearest neighbor classification.
           Knowledge and Information Systems, 39(1), 89–122.
           http://doi.org/10.1007/s10115-012-0607-5
    """
    if train_ind is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, train_ind)
    # Assuming distances in D
    self_value = 0.
    sort_order = 1
    exclude = np.inf
    distance = D.copy()
    n, m = distance.shape
    if not 0 < s < m:
        raise ValueError("Neighbor hood size s, must be [1, {}-1], but "
                         "was {}.".format(m, s))
    n_ind = range(n) if test_ind is None else test_ind
    cols = np.arange(m)
    if train_ind is None:
        train_rows = cols      # column j is data point j
        train_sel = ...        # select all rows of knn without copying
    else:
        train_rows = np.asarray(train_ind)
        train_sel = train_ind

    # Exclude self distances
    if train_ind is None:
        np.fill_diagonal(distance, exclude)
    else:
        distance[train_rows, cols] = exclude

    knn = np.zeros_like(distance, bool)

    # find nearest neighbors for each point
    for i in range(n):
        di = distance[i, :]
        # TODO change to np.partition for PERF
        nn = np.argsort(di)[::sort_order]
        knn[i, nn[:s]] = True
    del distance

    # Reverse nearest neighbor count
    # NOTE(review): uses the first m rows of knn, not knn[train_ind] --
    # confirm that this is intended in sample mode.
    N_s = knn[:m, :].sum(axis=0)

    if y is not None:
        # Set of class labels
        C = np.unique(y)

        # Class specific reverse nearest neighbors
        N_sc = np.zeros((C.size, m))
        for c_idx, c_val in enumerate(C):
            N_sc[c_idx, :] = np.sum(knn[:m, :] * (y == c_val).reshape(-1, 1),
                                    axis=0)
        # np.alltrue was removed in NumPy 2.0; np.all is the equivalent
        assert np.all(N_sc.sum(
            axis=0) == N_s), "N_s,c(x) don't sum up to N_s(x)"

        # Account for each point being the 0th nearest neighbor
        N_sc += 1
    # In any case: the same for N_s
    N_s += 1

    if y is not None:
        # non-homogeneity (inconsistency) in occurrence
        N_sc /= N_s
        HR_s = -np.sum(N_sc * np.log(N_sc), axis=0)

        # Information gain
        max_H_s = np.log(C.size)
        info_gain = max_H_s - HR_s
    else:  # set a dummy value for unsupervised mode
        info_gain = 1

    # "occurrence informativeness"
    I_n = np.log(m / N_s)
    # weight of each potential shared neighbor (loop invariant)
    weights = I_n * info_gain

    # simhub calculation
    D_shi = np.zeros_like(D)
    if vect_usage > 0 or (vect_usage == 0 and m < 2000):
        # using vectorization and broadcasting
        for i in n_ind:
            x = np.logical_and(knn[i, :], knn[train_sel, :])
            D_shi[i, :] = np.sum(x * weights, axis=1)
    else:  # use non-vectorized loops
        for i in n_ind:
            knn_i = knn[i, :]
            for j in range(m):
                # column j belongs to data point train_rows[j], not to row j
                x = np.logical_and(knn_i, knn[train_rows[j], :])
                D_shi[i, j] = np.sum(x * weights)
    del knn
    # Normalization to [0, 1] range
    if y is None:
        D_shi /= (s * np.log(m))
    else:
        D_shi /= (s * np.log(m) * max_H_s)

    # Convert to distances
    if return_distances:
        D_shi *= -1
        D_shi += 1
    else:
        self_value = 1

    if test_ind is None:
        # Ensure correct self distances and return sec. dist. matrix
        if train_ind is None:
            np.fill_diagonal(D_shi, self_value)
        else:
            # self entries of a sample matrix are not on the diagonal
            D_shi[train_rows, cols] = self_value
        return D_shi
    # only return test-train-distances (there are no self distances here)
    return D_shi[test_ind]
def snn_sample(D: np.ndarray,
               k: int = 10,
               metric='distance',
               train_ind: np.ndarray = None,
               test_ind: np.ndarray = None,
               n_jobs: int = 1):
    """Transform distance matrix using shared nearest neighbors [1]_.

    __DRAFT_VERSION__

    SNN similarity is based on computing the overlap between the `k` nearest
    neighbors of two objects. SNN approaches try to symmetrize nearest neighbor
    relations using only rank and not distance information [2]_.

    Parameters
    ----------
    D : np.ndarray
        The ``n x s`` distance (similarity) matrix, where ``s==train_ind.size``

    k : int, optional (default: 10)
        Neighborhood radius: The `k` nearest neighbors are used to calculate SNN.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether the matrix `D` is a distance or similarity matrix

    train_ind : ndarray, optional (default: None)
        Row index of the data point each column of `D` corresponds to.
        If None, `D` is treated as a quadratic all-against-all matrix,
        i.e. ``train_ind = arange(n)``.

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_snn : ndarray
        Secondary distance (or similarity) SNN matrix; ``D_snn[test_ind]``
        if `test_ind` is given.

    References
    ----------
    .. [1] R. Jarvis and E. A. Patrick, “Clustering using a similarity measure
           based on shared near neighbors,” IEEE Transactions on Computers,
           vol. 22, pp. 1025–1034, 1973.

    .. [2] Flexer, A., & Schnitzer, D. (2013). Can Shared Nearest Neighbors
           Reduce Hubness in High-Dimensional Spaces? 2013 IEEE 13th
           International Conference on Data Mining Workshops, 460–467.
           http://doi.org/10.1109/ICDMW.2013.101
    """
    if train_ind is None:
        # Same convention as simhub()/simhubIN(): no index array means a
        # quadratic matrix whose column j belongs to data point j.
        io.check_distance_matrix_shape(D)
        train_ind = np.arange(D.shape[1])
    else:
        io.check_sample_shape_fits(D, train_ind)
    io.check_valid_metric_parameter(metric)
    if metric == 'similarity':
        self_value = 1.
        sort_order = -1
        exclude = -np.inf
    else:  # metric == 'distance':
        self_value = 0.
        sort_order = 1
        exclude = np.inf
    distance = D.copy()
    n = distance.shape[0]
    n_ind = range(n) if test_ind is None else test_ind
    cols = np.arange(len(train_ind))

    # Exclude self distances: column j is data point train_ind[j]
    distance[train_ind, cols] = exclude

    if n_jobs == -1:
        n_jobs = cpu_count()
    if n_jobs > 1:
        # Shared-memory buffers, filled in place by the worker processes
        knn_ctype = RawArray(ctypes.c_bool, distance.size)
        knn = np.frombuffer(knn_ctype, dtype=bool).reshape(D.shape)
        D_snn_ctype = RawArray(ctypes.c_double, distance.size)
        D_snn = np.frombuffer(D_snn_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_snns_init,
                  initargs=(distance, knn, train_ind, D_snn)) as pool:
            for _ in pool.imap(func=partial(_snns_my_hood,
                                            k=k,
                                            sort_order=sort_order),
                               iterable=range(n)):
                pass  # Handling inside function
            for _ in pool.imap(func=partial(_snns_our_hood, k=k,
                                            metric=metric),
                               iterable=n_ind):
                pass  # Handling inside function
    else:
        knn = np.zeros_like(distance, bool)
        # find nearest neighbors for each point
        for i in range(n):
            di = distance[i, :]
            # TODO change to np.partition for PERF
            nn = np.argsort(di)[::sort_order]
            knn[i, nn[0:k]] = True
        # neighborhoods of the column points (loop invariant)
        knn_train = knn[train_ind, :]
        D_snn = np.zeros_like(distance)
        for i in n_ind:
            # number of shared neighbors, using broadcasting
            shared = np.sum(np.logical_and(knn[i, :], knn_train), 1)
            if metric == 'distance':
                D_snn[i, :] = 1. - shared / k
            else:  # metric == 'similarity':
                D_snn[i, :] = shared / k

    # Ensure correct self distances. np.fill_diagonal() would only hit the
    # self entries if train_ind == arange(s); index them explicitly instead.
    D_snn[train_ind, cols] = self_value
    if test_ind is None:
        return D_snn
    return D_snn[test_ind]
def simhubIN(D: np.ndarray,
             train_ind: np.ndarray = None,
             test_ind: np.ndarray = None,
             s: int = 50,
             return_distances: bool = True,
             n_jobs: int = 1):
    """Calculate dissimilarity based on hubness-aware SNN distances [1]_.

    Unsupervised variant: shared `s`-nearest neighbors are weighted by their
    occurrence informativeness ``log(m / N_s)`` only.

    Parameters
    ----------
    D : ndarray
        The ``n x m`` distance matrix, where ``n`` and ``m``
        are the dataset and sample size, respectively.

    train_ind : ndarray, optional, default: None
        The index array that determines, to which data points the columns in
        `D` correspond. Not required, if `D` is a quadratic all-against-all
        distance matrix.

    test_ind : ndarray, optional, default: None
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    s : int, optional, default: 50
        Neighborhood size. Can be optimized as to minimize hubness.

    return_distances : bool, optional, default: True
        If True, return distances (1 - similarities).
        Otherwise return similarities.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_shi : ndarray
        Secondary distance (simhubIN) matrix; ``D_shi[test_ind]`` if
        `test_ind` is given.

    References
    ----------
    .. [1] Tomašev, N., & Mladenić, D. (2012).
           Hubness-aware shared neighbor distances for high-dimensional
           $$k$$ -nearest neighbor classification.
           Knowledge and Information Systems, 39(1), 89–122.
           http://doi.org/10.1007/s10115-012-0607-5
    """
    if train_ind is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, train_ind)
    # Assuming distances in D
    self_value = 0.
    sort_order = 1
    exclude = np.inf
    distance = D.copy()
    n, m = distance.shape
    n_ind = range(n) if test_ind is None else test_ind
    cols = np.arange(m)
    if train_ind is None:
        train_rows = cols      # column j is data point j
        train_sel = ...        # select all rows of knn without copying
    else:
        train_rows = np.asarray(train_ind)
        train_sel = train_ind

    # Exclude self distances
    if train_ind is None:
        np.fill_diagonal(distance, exclude)
    else:
        distance[train_rows, cols] = exclude

    if n_jobs == -1:
        n_jobs = cpu_count()
    if n_jobs > 1:
        # Shared-memory neighbor matrix, filled in place by the workers
        knn_ctype = RawArray(ctypes.c_bool, D.size)
        knn = np.frombuffer(knn_ctype, dtype=bool).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_shi_init_knn,
                  initargs=(distance, knn)) as pool:
            for _ in pool.imap(func=partial(_shi_hood,
                                            s=s,
                                            sort_order=sort_order),
                               iterable=range(n)):
                pass
    else:
        knn = np.zeros_like(distance, bool)
        # find nearest neighbors for each point
        for i in range(n):
            di = distance[i, :]
            # TODO change to np.partition for PERF
            nn = np.argsort(di)[::sort_order]
            knn[i, nn[:s]] = True
    del distance

    # "Occurence informativeness"
    # NOTE(review): uses the first m rows of knn, not knn[train_ind] --
    # confirm that this is intended in sample mode.
    occ_inf_knn = knn[:m, :].copy()
    # each point counts as its own 0th nearest neighbor
    np.fill_diagonal(occ_inf_knn, True)
    N_s = occ_inf_knn.sum(axis=0)
    I_n = np.log(m / N_s)
    del occ_inf_knn

    # simhub calculation
    if n_jobs > 1:
        D_shi_ctype = RawArray(ctypes.c_double, D.size)
        D_shi = np.frombuffer(D_shi_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_shi_init_simhub,
                  initargs=(knn, train_sel, I_n, D_shi)) as pool:
            if m < 2000:
                for _ in pool.imap(func=partial(_shi_simhub_vect, s=s),
                                   iterable=n_ind):
                    pass
            else:
                # NOTE(review): _shi_simhub is not part of this block; verify
                # that it maps column j to train_ind[j] like the serial loop.
                for _ in pool.imap(func=partial(_shi_simhub, s=s, m=m),
                                   iterable=n_ind):
                    pass
    else:
        D_shi = np.zeros_like(D)
        if m < 2000:  # using vectorization and broadcasting
            for i in n_ind:
                x = np.logical_and(knn[i, :], knn[train_sel, :])
                D_shi[i, :] = np.sum(x * I_n, axis=1)
        else:  # use non-vectorized loops
            for i in n_ind:
                knn_i = knn[i, :]
                for j in range(m):
                    # column j belongs to data point train_rows[j],
                    # not to row j
                    x = np.logical_and(knn_i, knn[train_rows[j], :])
                    D_shi[i, j] = np.sum(x * I_n)
    del knn
    # Normalization to [0, 1] range
    D_shi /= (s * np.log(m))

    # Convert to distances
    if return_distances:
        D_shi *= -1
        D_shi += 1
    else:
        self_value = 1

    if test_ind is None:
        # Ensure correct self distances and return sec. dist. matrix
        if train_ind is None:
            np.fill_diagonal(D_shi, self_value)
        else:
            # self entries of a sample matrix are not on the diagonal
            D_shi[train_rows, cols] = self_value
        return D_shi
    # only return test-train-distances (there are no self distances here)
    return D_shi[test_ind]
def predict(D:np.ndarray, target:np.ndarray, k=5,
            metric:str='distance', test_ind:np.ndarray=None, verbose:int=0,
            sample_idx=None, return_cmat=True):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter `test_ind`).
    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix, or an
        ``n x s`` sample matrix if `sample_idx` is given.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth) or
        ``n x c`` in case of ``c`` binarized multilabels

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to perform
        multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit
          model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    sample_idx : ndarray, optional (default: None)
        If given, column ``j`` of `D` is taken to belong to data point
        ``sample_idx[j]`` and only these points are used as neighbors.
        Cannot be combined with `test_ind`.

    return_cmat : bool, optional, default: True
        If False, only return the predictions `y_pred`.
        Otherwise also return the confusion matrices.

    Returns
    -------
    y_pred : ndarray (shape=(n_k, n, c), dtype=int)
        Predicted class labels (`n_k`... number of items in parameter `k`).
        Labels are class indices into the sorted unique values of `target`.

        HINT: Referring to the above example...
        ... ``y_pred[0]`` gives the predictions of the ``k=1`` experiment.

    cmat : ndarray (shape=(n_k x c x n_t x n_t), dtype=int)
        Confusion matrix (``n_t`` number of unique items in parameter target).
        Only returned if `return_cmat` is True.

        HINT: ... ``cmat[2, 0, :, :]`` gives the confusion matrix of
        the first class in the ``k=20`` experiment in the following order:
            TN    FP
            FN    TP

    Raises
    ------
    NotImplementedError
        If both `sample_idx` and `test_ind` are given.
    """

    # Check input sanity
    log = ConsoleLogging()
    if sample_idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, sample_idx)
    io.check_valid_metric_parameter(metric)
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1
    else:  # metric == 'distance':
        d_self = np.inf
        sort_order = 1

    sparse = issparse(D)
    # Copy, because data is changed. Sparse rows are densified per query
    # below, so a sparse D only needs a copy when sample self entries are set.
    if not sparse or sample_idx is not None:
        D = D.copy()
    target = target.astype(int)
    if target.ndim == 1:
        target = target[:, np.newaxis]
    if verbose:
        log.message("Start k-NN experiment.")

    # Handle LOO-CV vs. test set mode
    n_points = D.shape[0]
    if test_ind is None:
        n = n_points
        test_set_ind = range(n)
        # an int makes np.random.permutation() shuffle arange(n)
        train_set_ind = n
    else:
        if sample_idx is not None:
            raise NotImplementedError("Sample k-NN does not support train/"
                                      "test splits at the moment.")
        test_set_ind = np.asarray(test_ind)
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples: all data points not in the test set
        train_set_ind = np.setdiff1d(np.arange(n_points), test_set_ind)

    # Number of k-NN parameters (accepts int, list and array alike)
    k = np.atleast_1d(k)
    k_length = k.size

    # change labels to 0, 1, ..., n_classes-1 (index into sorted labels)
    cl, classes = np.unique(target, return_inverse=True)
    classes = classes.reshape(target.shape)
    n_classes = len(cl)
    cmat = np.zeros((k_length, target.shape[1], n_classes, n_classes),
                    dtype=int)
    y_pred = np.zeros((k_length, *target.shape), dtype=int)

    if sample_idx is not None:
        sample_classes = classes[sample_idx]
        # Exclude self distances: column j is data point sample_idx[j]
        for col, sample in enumerate(sample_idx):
            D[sample, col] = d_self

    # Classify each point in test set
    for i in test_set_ind:
        if verbose and ((i+1)%1000==0 or i+1==n):
            log.message("Prediction: {} of {}.".format(i+1, n), flush=True)

        if sparse:
            row = D.getrow(i)
            ind = row.nonzero()[1]
            row = row.toarray().ravel()
        else:
            row = D[i, :]
        if sample_idx is None:
            row[i] = d_self

        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        if sample_idx is None:
            rp = train_set_ind
        elif sparse:
            rp = ind
        else:
            rp = np.arange(len(sample_idx))
        rp = np.random.permutation(rp)
        d2 = row[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        d2idx = d2idx[~np.isnan(d2[d2idx])] # filter NaN values
        idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            # Make sure no inf/-inf/nan values are used for classification
            finite_val = np.isfinite(row[idx[0:k[j]]])
            # However, if no values are finite, classify randomly
            if finite_val.sum() == 0:
                idx = np.random.permutation(idx)
                finite_val = np.ones_like(finite_val)
                log.warning("Query was classified randomly, because all "
                            "distances were non-finite numbers.")
            for l in range(target.shape[1]):
                if sample_idx is None:
                    neighbor_classes = classes[:, l]
                else:
                    neighbor_classes = sample_classes[:, l]
                nn_class = neighbor_classes[idx[0:k[j]]][finite_val]
                cs = np.bincount(nn_class.astype(int))
                max_cs = np.where(cs == np.max(cs))[0]
                seed_class = classes[i, l]
                if len(max_cs) > 1:
                    # "tie": use nearest neighbor
                    predicted = nn_class[0]
                else:
                    # majority vote
                    predicted = max_cs[0]
                y_pred[j, i, l] = predicted
                cmat[j, l, seed_class, predicted] += 1

    if verbose:
        log.message("Finished k-NN experiment.")

    if return_cmat:
        return y_pred, cmat
    else:
        return y_pred
def score(D:np.ndarray, target:np.ndarray, k=5,
          metric:str='distance', test_set_ind:np.ndarray=None, verbose:int=0,
          sample_idx=None, filter_self=True):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter `test_set_ind`).
    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
        The input is copied; the caller's matrix is not modified.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth).

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to perform
        multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit
          model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    sample_idx : ndarray, optional (default: None)
        If given, column ``j`` of `D` is treated as holding the
        distances/similarities to data point ``sample_idx[j]``, and
        neighbors are only drawn from these sample points.
        Cannot be combined with `test_set_ind`.

    filter_self : bool, optional, default: True
        Remove self similarities from sparse ``D``.
        This assumes that the highest similarity per row is the self
        similarity.

        NOTE: Quadratic dense matrices are always filtered for self
        distances/similarities, even if `filter_self` is set to `False`.

    Returns
    -------
    acc : ndarray (shape=(n_k x 1), dtype=float)
        Classification accuracy (`n_k`... number of items in parameter `k`)

        HINT: Referring to the above example...
        ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment.
    corr : ndarray (shape=(n_k x n), dtype=int)
        Raw vectors of correctly classified items

        HINT: ... ``corr[1, :]`` gives these items for the ``k=5`` experiment.
    cmat : ndarray (shape=(n_k x n_t x n_t), dtype=int)
        Confusion matrix (``n_t`` number of unique items in parameter target)

        HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of
        the ``k=20`` experiment.

    Raises
    ------
    NotImplementedError
        If both `sample_idx` and `test_set_ind` are given.
    AttributeError
        If `k` is neither an int, a list, nor an object with a ``size``
        attribute.
    """

    # Check input sanity
    log = ConsoleLogging()
    if sample_idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, sample_idx)
    io.check_distance_matrix_shape_fits_labels(D, target)
    io.check_valid_metric_parameter(metric)
    # d_self is written to self entries so that a point can never be
    # selected as its own nearest neighbor.
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1

    # Copy, because data is changed
    D = D.copy()
    target = target.astype(int)
    D_is_sparse = issparse(D)

    if verbose:
        log.message("Start k-NN experiment.")
    # Handle LOO-CV vs. test set mode
    if test_set_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)    # dummy
        train_set_ind = n   # dummy
    else:
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples: every row of D that is not held out.
        # (The range must span all of D, not just the number of test points.)
        train_set_ind = np.setdiff1d(np.arange(D.shape[0]), test_set_ind)
        if sample_idx is not None:
            raise NotImplementedError("Sample k-NN does not support train/"
                                      "test splits at the moment.")
    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError as e:
        if isinstance(k, int):
            k = np.array([k])
            k_length = k.size
        elif isinstance(k, list):
            k = np.array(k)
            k_length = k.size
        else:
            raise e

    acc = np.zeros((k_length, 1))
    corr = np.zeros((k_length, D.shape[0]))

    cl = np.sort(np.unique(target))
    if D_is_sparse:
        # Add a label for unknown class (object w/o nonzero sim to any others)
        cl = np.append(cl, cl.max()+1)
        n_classes = len(cl) + 1
    else:
        n_classes = len(cl)
    cmat = np.zeros((k_length, n_classes, n_classes))

    classes = target.copy()
    for idx, cur_class in enumerate(cl):
        # change labels to 0, 1, ..., len(cl)-1
        classes[target == cur_class] = idx
    if sample_idx is not None:
        sample_classes = classes[sample_idx]
        # Column j of D belongs to data point sample_idx[j]:
        # mask each sample point's entry to itself.
        for j, sample in enumerate(sample_idx):
            D[sample, j] = d_self
    cl = range(len(cl))

    # Per-k counter of queries that had no finite neighbor at all
    rnd_classif = np.zeros(k_length)
    # Classify each point in test set
    for i in test_set_ind:
        if verbose and ((i+1)%1000==0 or i+1==n):
            log.message("Prediction: {} of {}.".format(i+1, n), flush=True)

        seed_class = classes[i]

        if D_is_sparse:
            row = D.getrow(i)
        else:
            row = D[i, :]
            if sample_idx is None:
                row[i] = d_self

        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        if D_is_sparse:
            # Only the stored (nonzero) entries of the row are candidates.
            # NOTE: this branch always ranks in descending order, i.e. it
            # treats the stored values as similarities.
            nnz = row.nnz
            rp = np.random.permutation(nnz)
            d2 = row.data[rp]
            # Partition for each k value
            kth = nnz - k - 1
            # sort the two highest similarities to end
            kth = np.append(kth, [nnz-2, nnz-1])
            # Clip negative indices (nnz < k)
            np.clip(kth, a_min=0, a_max=nnz-1, out=kth)
            # Remove duplicate k values and sort
            kth = np.unique(kth)
            d2idx = np.argpartition(d2, kth=kth)
            d2idx = d2idx[~np.isnan(d2[d2idx])][::-1]
            idx = row.nonzero()[1][rp[d2idx]]
            if filter_self:
                # Drop the top-ranked entry, assumed to be the self similarity
                idx = idx[1:]
        else:
            if sample_idx is None:
                rp = train_set_ind
            else:
                rp = np.arange(len(sample_idx))
            rp = np.random.permutation(rp)
            d2 = row[rp]
            d2idx = np.argsort(d2, axis=0)[::sort_order]
            d2idx = d2idx[~np.isnan(d2[d2idx])] # filter NaN values
            idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            # Make sure no inf/-inf/nan values are used for classification
            if D_is_sparse:
                finite_val = np.isfinite(row[0, idx[0:k[j]]].toarray().ravel())
            else:
                finite_val = np.isfinite(row[idx[0:k[j]]])
            # However, if no values are finite, classify randomly
            if finite_val.sum() == 0:
                idx = np.random.permutation(idx)
                finite_val = np.ones_like(finite_val)
                rnd_classif[j] += 1
            if sample_idx is None:
                nn_class = classes[idx[0:k[j]]][finite_val]
            else:
                nn_class = sample_classes[idx[0:k[j]]][finite_val]
            cs = np.bincount(nn_class.astype(int))
            if cs.size > 0:
                max_cs = np.where(cs == np.max(cs))[0]
            else:
                max_cs = np.array([len(cl) - 1]) # misclassification label

            # "tie": use nearest neighbor
            if len(max_cs) > 1:
                if seed_class == nn_class[0]:
                    acc[j] += 1/n
                    corr[j, i] = 1
                cmat[j, seed_class, nn_class[0]] += 1
            # majority vote
            else:
                if cl[max_cs[0]] == seed_class:
                    acc[j] += 1/n
                    corr[j, i] = 1
                cmat[j, seed_class, cl[max_cs[0]]] += 1

    if np.any(rnd_classif):
        for x in rnd_classif:
            log.warning(("{} queries were classified randomly, because all "
                        "distances were non-finite numbers.").format(x))
    if verbose:
        log.message("Finished k-NN experiment.")

    return acc, corr, cmat
Beispiel #9
0
def _mutual_proximity_empiric_sample(D: np.ndarray,
                                     idx: np.ndarray,
                                     metric: str = 'distance',
                                     test_set_ind: np.ndarray = None,
                                     verbose: int = 0,
                                     n_jobs=None):
    """Transform a distance matrix with Mutual Proximity (empiric distribution).

    NOTE: this docstring does not yet fully reflect the properties of this
    proof-of-concept function!

    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix using
    the empiric data distribution (EXACT, rather SLOW). The resulting
    secondary distance/similarity matrix should show lower hubness.

    Parameters
    ----------
    D : ndarray
        The ``n x s`` distance or similarity matrix, where ``n`` and ``s``
        are the dataset and sample size, respectively.
        The input is copied; the caller's matrix is not modified.

    idx : ndarray
        The index array that determines, to which data points the columns in
        `D` correspond.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Only rescale (and return) the rows indexed in this array.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    n_jobs : optional (default: None)
        Currently unused.

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP empiric matrix. If `test_set_ind` is given,
        only the rows ``D_mp[test_set_ind]`` are returned.

    Raises
    ------
    ValueError
        If `D` is sparse and `metric` is 'distance'.
    NotImplementedError
        If `D` is sparse (sparse sample MP is not implemented).

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    # Initialization and checking input
    log = ConsoleLogging()
    io.check_sample_shape_fits(D, idx)
    io.check_valid_metric_parameter(metric)
    n = D.shape[0]
    s = D.shape[1]
    # exclude_value is chosen such that a self entry never satisfies the
    # comparison used for counting below (`<=` for similarities, `>` for
    # distances) when compared against a finite value.
    if metric == 'similarity':
        self_value = 1
        exclude_value = np.inf
    else:  # metric == 'distance':
        self_value = 0
        exclude_value = -np.inf
        if issparse(D):
            raise ValueError("MP sparse only supports similarity matrices.")
    if test_set_ind is None:
        n_ind = range(n)
    else:
        n_ind = test_set_ind

    # Start MP
    D = D.copy()

    if issparse(D):
        raise NotImplementedError

    # ensure correct self distances (NOT done for sparse matrices!)
    for j, sample in enumerate(idx):
        D[sample, j] = exclude_value

    # Rows that are never processed (not in n_ind) stay NaN
    D_mp = np.zeros_like(D) * np.nan

    # Rows of the sample points. This does not depend on the query point
    # and D is not modified inside the loop, so build the copy only once.
    dJ = D[idx, :]  # fancy indexing, thus copy
    # div by n
    n_pts = s

    # Calculate MP empiric
    for i in n_ind:
        if verbose and ((i + 1) % 1000 == 0 or i == n - 2):
            log.message("MP_empiric: {} of {}.".format(i + 1, n - 1),
                        flush=True)
        dI = D[i, :][np.newaxis, :]  # shape (1, s), broadcasted afterwards
        d = dI.T  # shape (s, 1): value between point i and each sample point
        # For every sample point, count the columns whose value passes the
        # comparison against `d` both in row i and in the sample's own row.
        if metric == 'similarity':
            D_mp[i, :] = np.sum((dI <= d) & (dJ <= d), 1) / n_pts
        else:  # metric == 'distance':
            D_mp[i, :] = 1 - (np.sum((dI > d) & (dJ > d), 1) / n_pts)

    # Ensure correct self distances
    for j, sample in enumerate(idx):
        D_mp[sample, j] = self_value

    if test_set_ind is None:
        return D_mp
    else:
        return D_mp[test_set_ind]
Beispiel #10
0
def mutual_proximity_gaussi(
    D: np.ndarray,
    metric: str = 'distance',
    sample_size: int = 0,
    min_nnz: int = 30,
    test_set_ind: np.ndarray = None,
    verbose: int = 0,
    idx: np.ndarray = None,
):
    """Transform distances with Mutual Proximity (indep. normal distributions).

    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gaussi
    variant assumes independent normal distributions (FAST).
    The resulting second. distance/similarity matrix should show lower hubness.

    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.

        NOTE: In case of sparse `D`, zeros are interpreted as missing values
        and ignored during calculations. Thus, results may differ
        from using a dense version.

        The input is copied; the caller's matrix is not modified.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: In case of sparse `D`, only 'similarity' is supported.

    sample_size : int, optional (default: 0)
        Define sample size from which Gauss parameters are estimated.
        Use all data when set to ``0``.
        Ignored in case of SampleMP (i.e. if provided `idx`).

    min_nnz : int, optional, default: 30
        Calculate MP between two objects `i` and `j`, iff at least ``min_nnz``
        values are present in both row ``i`` and ``j``.
        Otherwise, return the original similarity.
        Ignored, if `metric` is 'distance'.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

        Ignored in case of SampleMP (i.e. if provided `idx`).

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    idx : ndarray, optional (default: None)
        The index array that determines to which data points the columns in
        `D` correspond. Only required for SampleMP.

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gaussi matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    # Initialization
    log = ConsoleLogging()

    # Checking input
    if idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, idx)
    io.check_valid_metric_parameter(metric)
    n = D.shape[0]
    s = D.shape[1]

    if metric == 'similarity':
        self_value = 1
    else:  # metric == 'distance':
        self_value = 0
    if test_set_ind is None:
        train_set_ind = slice(0, n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    # Start MP Gaussi
    if verbose:
        log.message('Mutual Proximity Gaussi rescaling started.', flush=True)
    D = D.copy()

    if issparse(D):
        # Sparse matrices are handled by a dedicated implementation
        return _mutual_proximity_gaussi_sparse(D, sample_size, min_nnz,
                                               test_set_ind, verbose, log)

    # ignore self dist/sim for parameter estimation
    if idx is None:
        np.fill_diagonal(D, np.nan)
    else:
        for j, i in enumerate(idx):
            D[i, j] = np.nan

    # Calculate mean and std
    if idx is None:
        if sample_size == 0:
            mu = np.nanmean(D[train_set_ind], 0)
            sd = np.nanstd(D[train_set_ind], 0, ddof=0)
        else:
            # Draw a random subset of the training rows.
            # np.random.shuffle works in place and returns None (and cannot
            # handle a slice), so materialize the candidate indices and use
            # np.random.permutation, which returns a shuffled copy.
            candidates = np.arange(n)[train_set_ind]
            samples = np.random.permutation(candidates)[0:sample_size]
            mu = np.nanmean(D[samples], 0)
            sd = np.nanstd(D[samples], 0, ddof=0)
    else:
        # SampleMP: one mean/std per data point (row), over the sample columns
        mu = np.nanmean(D, 1)
        sd = np.nanstd(D, 1, ddof=0)
    # Avoid downstream div/0 errors
    sd[sd == 0] = 1e-7
    # set self dist/sim back to self_value to avoid scipy warnings
    if idx is None:
        np.fill_diagonal(D, self_value)
    else:
        for j, i in enumerate(idx):
            D[i, j] = self_value

    # MP Gaussi
    D_mp = np.zeros_like(D)
    for i in range(n):
        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n):
            log.message("MP_gaussi: {} of {}.".format(i + 1, n), flush=True)
        if idx is None:
            # Only fill the upper triangle; it is mirrored after the loop
            j = slice(i + 1, n)
            j_mom = j
        else:
            # All sample columns; moments are looked up by data point index
            j = slice(0, s)
            j_mom = idx[j]

        if metric == 'similarity':
            p1 = norm.cdf(D[i, j], mu[i], sd[i])
            p2 = norm.cdf(D[i, j], mu[j_mom], sd[j_mom])
            D_mp[i, j] = (p1 * p2).ravel()
        else:
            # sf(.) := 1 - cdf(.)
            p1 = norm.sf(D[i, j], mu[i], sd[i])
            p2 = norm.sf(D[i, j], mu[j_mom], sd[j_mom])
            D_mp[i, j] = (1 - p1 * p2).ravel()

    if idx is None:
        # Mirror the upper triangle to obtain a symmetric matrix
        D_mp += D_mp.T
        np.fill_diagonal(D_mp, self_value)
    else:
        # Ensure correct self distances
        for j, sample in enumerate(idx):
            D_mp[sample, j] = self_value
    return D_mp