Exemple #1
0
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance',
                  test_ind:np.ndarray=None, n_jobs:int=1):
    """Transform a distance matrix with Local Scaling.

    Transforms the given distance matrix into new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local
    scaling methods implemented. The original one and NICDM, both reduce
    hubness in distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if n_jobs == -1:
        n_jobs = cpu_count()
    if metric == 'similarity':
        kth = n - k
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
        if sparse and n_jobs != 1:
            log.warning("Parallel processing not implemented for sparse "
                        "matrices. Using single process instead.")
            n_jobs = 1
    else: # metric == 'distance':
        kth = k - 1
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")
    D = np.copy(D)

    if test_ind is None:
        train_ind = slice(0, n) #take all        
    else:
        train_ind = np.setdiff1d(np.arange(n), test_ind)
    if sparse:
        r = np.zeros(n)
        for i in range(n):
            di = D[i, train_ind].toarray()
            di[i] = exclude
            r[i] = np.partition(di, kth=kth)[kth]
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        np.fill_diagonal(D, exclude)
        if n_jobs > 1:
            r_ctype = RawArray(ctypes.c_double, n)
            r = np.frombuffer(r_ctype, dtype=np.float64)
            with Pool(processes=n_jobs,
                      initializer=_ls_load_shared_data,
                      initargs=(D, train_ind, r, r_ctype)) as pool:
                for _ in pool.imap(func=partial(_ls_calculate_r, kth=kth),
                                   iterable=range(n)):
                    pass # results handled within func
        else:
            r = np.partition(D[:, train_ind], kth=kth)[:, kth]

    if sparse or n_jobs == 1:
        D_ls = np.zeros_like(D)
        for i in range(n):
            # vectorized inner loop: calc only triu part
            tmp = np.empty(n-i)
            tmp[0] = self_tmp_value
            if metric == 'similarity':
                if sparse and nnz[i] <= k:  # Don't rescale if there are
                    tmp[1:] = np.nan        # too few neighbors in row
                else:
                    tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            else:
                tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            D_ls[i, i:] = tmp
        # copy triu to tril -> symmetric matrix (diag=zeros)
        # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
        D_ls += D_ls.T
    else:
        D_ls_ctype = RawArray(ctypes.c_double, D.size)
        D_ls = np.frombuffer(D_ls_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_ls_load_shared_data,
                  initargs=(D, train_ind, r, r_ctype, D_ls, D_ls_ctype)) as pool:
            for _ in pool.imap(func=partial(_ls_calculate_sec_dist,
                                  n=n, metric=metric,
                                  self_tmp_value=self_tmp_value),
                               iterable=range(n)):
                pass # results handled within func
        # triu is copied to tril within func
    if sparse:
        for i, nz in enumerate(nnz):
            if nz <= k: # too few neighbors
                D_ls[i, :] = D[i, :]
        return D_ls.tocsr()
    else:
        np.fill_diagonal(D_ls, self_value)
        return D_ls
Exemple #2
0
def local_scaling_sample(D:np.ndarray, k:int=7, metric:str='distance',
                         train_ind:np.ndarray=None, test_ind:np.ndarray=None):
    """Transform a distance matrix with Local Scaling.

    --- DRAFT version ---

    Transforms the given distance matrix into new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local
    scaling methods implemented. The original one and NICDM, both reduce
    hubness in distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``

    train_ind : ndarray, optional
        If given, use only these data points as neighbors for rescaling.

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of Machine
           Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_sample_shape_fits(D, train_ind)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if metric == 'similarity':
        if train_ind is not None:
            raise NotImplementedError
        kth = n - k
        exclude = -np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
    else: # metric == 'distance':
        kth = k - 1
        exclude = np.inf
        self_value = 0
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")

    D = np.copy(D)
    if test_ind is None:
        train_set_ind = slice(0, n) #take all
        n_ind = range(n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_ind)
        n_ind = test_ind
    # Exclude self distances
    for j, sample in enumerate(train_ind):
        D[sample, j] = exclude
    r = np.zeros(n)
    for i in range(n):
        if train_ind is None:
            if sparse:
                di = D[i, train_set_ind].toarray()
            else:
                di = D[i, train_set_ind]
        else:
            di = D[i, :] # all columns are training in this case
        r[i] = np.partition(di, kth=kth)[kth]

    if sparse:
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        D_ls = np.zeros_like(D)

    if metric == 'similarity':
        for i in n_ind:
            if sparse and nnz[i] <= k: # Don't rescale if there are too few 
                D_ls[i, :] = D[i, :]   # neighbors in the current row
            else:
                D_ls[i, :] = np.exp(-1 * D[i, :]**2 / (r[i] * r[train_ind]))
    else:
        for i in n_ind:
            D_ls[i, :] = 1 - np.exp(-1 * D[i, :]**2 / (r[i] * r[train_ind]))

    if test_ind is None:
        if sparse:
            return D_ls.tocsr()
        else:
            np.fill_diagonal(D_ls, self_value)
            return D_ls
    else:
        # Ensure correct self distances
        for j, sample in enumerate(train_ind):
            D_ls[sample, j] = self_value
        return D_ls[test_ind]
Exemple #3
0
 def test_error(self):
     log = ConsoleLogging()
     log.error("Error")
     return self