Example #1
def test_get_chunk_n_rows_warns():
    """Check that warning is raised when working_memory is too low."""
    row_bytes = 1024 * 1024 + 1
    max_n_rows = None
    working_memory = 1
    expected = 1

    warn_msg = (
        "Could not adhere to working_memory config. Currently 1MiB, 2MiB required."
    )
    with pytest.warns(UserWarning, match=warn_msg):
        actual = get_chunk_n_rows(
            row_bytes=row_bytes,
            max_n_rows=max_n_rows,
            working_memory=working_memory,
        )

    assert actual == expected
    assert type(actual) is type(expected)

    with config_context(working_memory=working_memory):
        with pytest.warns(UserWarning, match=warn_msg):
            actual = get_chunk_n_rows(row_bytes=row_bytes,
                                      max_n_rows=max_n_rows)
        assert actual == expected
        assert type(actual) is type(expected)
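The warning in this test comes from the chunk-size arithmetic: one row needs just over 1 MiB, so a 1 MiB budget cannot hold even a single row and the required size is rounded up to 2 MiB. A minimal sketch of that arithmetic follows; it is a simplified illustration written for this note (the helper name is made up), not the scikit-learn source of get_chunk_n_rows.

import warnings

import numpy as np


def _chunk_n_rows_sketch(row_bytes, max_n_rows=None, working_memory=1):
    # working_memory is expressed in MiB; convert to bytes and see how many
    # rows of `row_bytes` bytes fit in the budget.
    chunk_n_rows = int(working_memory * (2 ** 20) // row_bytes)
    if max_n_rows is not None:
        chunk_n_rows = min(chunk_n_rows, max_n_rows)
    if chunk_n_rows < 1:
        # Not even one row fits: warn with the memory one row actually needs
        # (rounded up to whole MiB) and fall back to a single row.
        warnings.warn(
            "Could not adhere to working_memory config. "
            "Currently %.0fMiB, %.0fMiB required."
            % (working_memory, np.ceil(row_bytes * 2 ** -20)),
            UserWarning,
        )
        chunk_n_rows = 1
    return chunk_n_rows


# row_bytes = 1024 * 1024 + 1 with a 1 MiB budget: 0 rows fit, so the sketch
# warns "Currently 1MiB, 2MiB required." and returns 1, matching the test.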
def _compute_core_distances_(X, neighbors, min_samples, working_memory):
    """Compute the k-th nearest neighbor of each sample

    Equivalent to neighbors.kneighbors(X, min_samples)[0][:, -1]
    but with more memory efficiency.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data.
    neighbors : NearestNeighbors instance
        The fitted nearest neighbors estimator.
    min_samples : int
        The number of neighbors to query; the core distance of a sample is
        the distance to its ``min_samples``-th nearest neighbor.
    working_memory : int, optional
        The sought maximum memory for temporary distance matrix chunks.
        When None (default), the value of
        ``sklearn.get_config()['working_memory']`` is used.

    Returns
    -------
    core_distances : array, shape (n_samples,)
        Distance at which each sample becomes a core point.
        Points which will never be core have a distance of inf.
    """
    n_samples = X.shape[0]
    core_distances = np.empty(n_samples)
    core_distances.fill(np.nan)

    chunk_n_rows = get_chunk_n_rows(row_bytes=16 * min_samples,
                                    max_n_rows=n_samples,
                                    working_memory=working_memory)
    slices = gen_batches(n_samples, chunk_n_rows)
    for sl in slices:
        core_distances[sl] = neighbors.kneighbors(X[sl], min_samples)[0][:, -1]
    return core_distances
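A hypothetical usage sketch for the function above. It assumes the snippet's module already imports numpy as np, get_chunk_n_rows and gen_batches; the data and min_samples value are arbitrary.

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(100, 3)
min_samples = 5

# The neighbors estimator must already be fitted on X.
nbrs = NearestNeighbors(n_neighbors=min_samples).fit(X)

core_dists = _compute_core_distances_(X, nbrs, min_samples, working_memory=None)

# The chunked computation matches the one-shot kneighbors call.
assert np.allclose(core_dists, nbrs.kneighbors(X, min_samples)[0][:, -1])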
    def _compute_chunked_score_samples(self, X):

        n_samples = _num_samples(X)

        if self._max_features == X.shape[1]:
            subsample_features = False
        else:
            subsample_features = True

        # We get as many rows as possible within our working_memory budget
        # (defined by sklearn.get_config()['working_memory']) to store
        # self._max_features in each row during computation.
        #
        # Note:
        #  - this will get at least 1 row, even if 1 row of score will
        #    exceed working_memory.
        #  - this does only account for temporary memory usage while loading
        #    the data needed to compute the scores -- the returned scores
        #    themselves are 1D.

        chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features,
                                        max_n_rows=n_samples)
        slices = gen_batches(n_samples, chunk_n_rows)

        scores = np.zeros(n_samples, order="f")

        for sl in slices:
            # compute score on the slices of test samples:
            scores[sl] = self._compute_score_samples(X[sl], subsample_features)

        return scores
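For reference, gen_batches simply yields consecutive slices that cover n_samples, so the loop above touches each row exactly once. A small illustration with arbitrary numbers:

from sklearn.utils import gen_batches

# Seven samples in chunks of three rows.
print(list(gen_batches(7, 3)))
# [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]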
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory, expected):
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        actual = get_chunk_n_rows(
            row_bytes=row_bytes,
            max_n_rows=max_n_rows,
            working_memory=working_memory,
        )

    assert actual == expected
    assert type(actual) is type(expected)
    with config_context(working_memory=working_memory):
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
        assert actual == expected
        assert type(actual) is type(expected)
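The test above expects row_bytes, max_n_rows, working_memory and expected to come from a parametrize decorator that is not shown in the snippet. An illustrative decorator with assumed values, chosen so that no UserWarning is raised (these are not the project's own cases):

import pytest

# Hypothetical cases: 1 MiB = 1048576 bytes.
chunk_cases = pytest.mark.parametrize(
    "row_bytes, max_n_rows, working_memory, expected",
    [
        (1024, None, 1, 1024),          # 1 MiB budget / 1 KiB per row -> 1024 rows
        (1024, 100, 1, 100),            # capped by max_n_rows
        (2 * 1024 * 1024, None, 5, 2),  # 5 MiB budget / 2 MiB per row -> 2 rows
    ],
)
# `chunk_cases` would then decorate test_get_chunk_n_rows above.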
Example #5
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory, expected,
                          warn_msg):
    warning = None if warn_msg is None else UserWarning
    with pytest.warns(warning, match=warn_msg) as w:
        actual = get_chunk_n_rows(
            row_bytes=row_bytes,
            max_n_rows=max_n_rows,
            working_memory=working_memory,
        )

    assert actual == expected
    assert type(actual) is type(expected)
    if warn_msg is None:
        assert len(w) == 0
    with config_context(working_memory=working_memory):
        with pytest.warns(warning, match=warn_msg) as w:
            actual = get_chunk_n_rows(row_bytes=row_bytes,
                                      max_n_rows=max_n_rows)
        assert actual == expected
        assert type(actual) is type(expected)
        if warn_msg is None:
            assert len(w) == 0
Example #6
def _compute_chunked_score_samples(iforest, tree_idx, X):
    n_samples = _num_samples(X)
    if iforest._max_features == X.shape[1]:
        subsample_features = False
    else:
        subsample_features = True
    chunk_n_rows = get_chunk_n_rows(row_bytes=16 * iforest._max_features,
                                    max_n_rows=n_samples)
    slices = gen_batches(n_samples, chunk_n_rows)
    scores = np.zeros(n_samples, order="f")
    for sl in slices:
        scores[sl] = _compute_score_samples_single_tree(
            iforest, tree_idx, X[sl], subsample_features)
    return scores
    def _compute_core_distances_(self, X, neighbors, working_memory=None):

        n_samples = len(X)
        core_distances = np.empty(n_samples)
        core_distances.fill(np.nan)

        chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self.min_samples,
                                        max_n_rows=n_samples,
                                        working_memory=working_memory)
        slices = gen_batches(n_samples, chunk_n_rows)
        for sl in slices:
            core_distances[sl] = neighbors.kneighbors(
                X[sl], self.min_samples)[0][:, -1]
        return core_distances
def pairwise_distances_chunked_cuda(X, reduce_func=None, verbose=False):
    """Generate a distance matrix chunk by chunk with optional reduction
    In cases where not all of a pairwise distance matrix needs to be stored at
    once, this is used to calculate pairwise distances in
    ``working_memory``-sized chunks.  If ``reduce_func`` is given, it is run
    on each chunk and its return values are concatenated into lists, arrays
    or sparse matrices.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        A feature array. Euclidean distances are computed between all pairs
        of rows.
    reduce_func : callable, optional
        The function which is applied on each chunk of the distance matrix,
        reducing it to needed values.  ``reduce_func(D_chunk, start)``
        is called repeatedly, where ``D_chunk`` is a contiguous vertical
        slice of the pairwise distance matrix, starting at row ``start``.
        It should return one of: None; an array, a list, or a sparse matrix
        of length ``D_chunk.shape[0]``; or a tuple of such objects. Returning
        None is useful for in-place operations, rather than reductions.
        If None, pairwise_distances_chunked_cuda returns a generator of
        vertical chunks of the distance matrix.
    verbose : bool, optional (default=False)
        If True, print each slice as it is processed.

    Yields
    ------
    D_chunk : array or sparse matrix
        A contiguous slice of distance matrix, optionally processed by
        ``reduce_func``.

    Examples
    --------
    Without reduce_func:
    >>> import numpy as np
    >>> from sklearn.metrics import pairwise_distances_chunked
    >>> X = np.random.RandomState(0).rand(5, 3)
    >>> D_chunk = next(pairwise_distances_chunked(X))
    >>> D_chunk
    array([[0.  ..., 0.29..., 0.41..., 0.19..., 0.57...],
           [0.29..., 0.  ..., 0.57..., 0.41..., 0.76...],
           [0.41..., 0.57..., 0.  ..., 0.44..., 0.90...],
           [0.19..., 0.41..., 0.44..., 0.  ..., 0.51...],
           [0.57..., 0.76..., 0.90..., 0.51..., 0.  ...]])
    Retrieve all neighbors and average distance within radius r:
    >>> r = .2
    >>> def reduce_func(D_chunk, start):
    ...     neigh = [np.flatnonzero(d < r) for d in D_chunk]
    ...     avg_dist = (D_chunk * (D_chunk < r)).mean(axis=1)
    ...     return neigh, avg_dist
    >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func)
    >>> neigh, avg_dist = next(gen)
    >>> neigh
    [array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])]
    >>> avg_dist
    array([0.039..., 0.        , 0.        , 0.039..., 0.        ])
    Where r is defined per sample, we need to make use of ``start``:
    >>> r = [.2, .4, .4, .3, .1]
    >>> def reduce_func(D_chunk, start):
    ...     neigh = [np.flatnonzero(d < r[i])
    ...              for i, d in enumerate(D_chunk, start)]
    ...     return neigh
    >>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func))
    >>> neigh
    [array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])]
    Force row-by-row generation by reducing ``working_memory``:
    >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func,
    ...                                  working_memory=0)
    >>> next(gen)
    [array([0, 3])]
    >>> next(gen)
    [array([0, 1])]
    """
    X = X.astype(np.float32)
    n_samples_X = len(X)
    Y = X
    # We get as many rows as possible within our working_memory budget to
    # store len(Y) distances in each row of output.
    #
    # Note:
    #  - this will get at least 1 row, even if 1 row of distances will
    #    exceed working_memory.
    #  - this does not account for any temporary memory usage while
    #    calculating distances (e.g. difference of vectors in manhattan
    #    distance).
    chunk_n_rows = get_chunk_n_rows(row_bytes=8 * len(Y),
                                    max_n_rows=n_samples_X,
                                    working_memory=None)
    slices = gen_batches(n_samples_X, chunk_n_rows)

    X_full = torch.tensor(X).cuda()
    Xnorms = torch.norm(X_full, dim=1, keepdim=True)**2
    for sl in slices:
        if verbose:
            print(sl)
        if sl.start == 0 and sl.stop == n_samples_X:
            X_chunk = X  # a single chunk covers all samples; avoid slicing
        else:
            X_chunk = X[sl]
        pX = torch.tensor(X_chunk).cuda()
        d2 = Xnorms[sl] - 2 * torch.matmul(pX, X_full.t()) + Xnorms.t()
        d2 = torch.sqrt(torch.nn.functional.relu(d2)).cpu().numpy()
        d2.flat[sl.start::len(X) + 1] = 0
        D_chunk = d2
        if reduce_func is not None:
            chunk_size = D_chunk.shape[0]
            D_chunk = reduce_func(D_chunk, sl.start)
            _check_chunk_size(D_chunk, chunk_size)
        yield D_chunk
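A hypothetical way to consume the generator above, finding each sample's nearest other sample. It requires PyTorch with a CUDA device and assumes the snippet's module-level imports (torch, get_chunk_n_rows, gen_batches, _check_chunk_size); the data shape is arbitrary.

import numpy as np

X = np.random.RandomState(0).rand(1000, 16).astype(np.float32)


def reduce_func(D_chunk, start):
    # The generator zeros the diagonal of each chunk; exclude self-distances
    # explicitly, then keep only the index of the nearest other sample.
    rows = np.arange(D_chunk.shape[0])
    D_chunk[rows, start + rows] = np.inf
    return D_chunk.argmin(axis=1)


nearest = np.concatenate(
    list(pairwise_distances_chunked_cuda(X, reduce_func=reduce_func))
)
print(nearest.shape)  # (1000,)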
def pairwise_distances_chunked_(X, Y=None, reduce_func=None,
                               metric='euclidean', n_jobs=None,
                               working_memory=None, **kwds):
    """Generate a distance matrix chunk by chunk with optional reduction
    In cases where not all of a pairwise distance matrix needs to be stored at
    once, this is used to calculate pairwise distances in
    ``working_memory``-sized chunks.  If ``reduce_func`` is given, it is run
    on each chunk and its return values are concatenated into lists, arrays
    or sparse matrices.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or,
        [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.
    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".
    reduce_func : callable, optional
        The function which is applied on each chunk of the distance matrix,
        reducing it to needed values.  ``reduce_func(D_chunk, start)``
        is called repeatedly, where ``D_chunk`` is a contiguous vertical
        slice of the pairwise distance matrix, starting at row ``start``.
        It should return an array, a list, or a sparse matrix of length
        ``D_chunk.shape[0]``, or a tuple of such objects.
        If None, pairwise_distances_chunked returns a generator of vertical
        chunks of the distance matrix.
    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.
    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    working_memory : int, optional
        The sought maximum memory for temporary distance matrix chunks.
        When None (default), the value of
        ``sklearn.get_config()['working_memory']`` is used.
    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Yields
    ------
    D_chunk : array or sparse matrix
        A contiguous slice of distance matrix, optionally processed by
        ``reduce_func``.

    Examples
    --------
    Without reduce_func:

    >>> import numpy as np
    >>> from sklearn.metrics import pairwise_distances_chunked
    >>> X = np.random.RandomState(0).rand(5, 3)
    >>> D_chunk = next(pairwise_distances_chunked(X))
    >>> D_chunk  # doctest: +ELLIPSIS
    array([[0.  ..., 0.29..., 0.41..., 0.19..., 0.57...],
           [0.29..., 0.  ..., 0.57..., 0.41..., 0.76...],
           [0.41..., 0.57..., 0.  ..., 0.44..., 0.90...],
           [0.19..., 0.41..., 0.44..., 0.  ..., 0.51...],
           [0.57..., 0.76..., 0.90..., 0.51..., 0.  ...]])
    Retrieve all neighbors and average distance within radius r:
    >>> r = .2
    >>> def reduce_func(D_chunk, start):
    ...     neigh = [np.flatnonzero(d < r) for d in D_chunk]
    ...     avg_dist = (D_chunk * (D_chunk < r)).mean(axis=1)
    ...     return neigh, avg_dist
    >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func)
    >>> neigh, avg_dist = next(gen)
    >>> neigh
    [array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])]
    >>> avg_dist  # doctest: +ELLIPSIS
    array([0.039..., 0.        , 0.        , 0.039..., 0.        ])
    Where r is defined per sample, we need to make use of ``start``:
    >>> r = [.2, .4, .4, .3, .1]
    >>> def reduce_func(D_chunk, start):
    ...     neigh = [np.flatnonzero(d < r[i])
    ...              for i, d in enumerate(D_chunk, start)]
    ...     return neigh
    >>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func))
    >>> neigh
    [array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])]
    Force row-by-row generation by reducing ``working_memory``:
    >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func,
    ...                                  working_memory=0)
    >>> next(gen)
    [array([0, 3])]
    >>> next(gen)
    [array([0, 1])]
    """
    n_samples_X = _num_samples(X)
    if metric == 'precomputed':
        slices = (slice(0, n_samples_X),)
    else:
        if Y is None:
            Y = X
        # We get as many rows as possible within our working_memory budget to
        # store len(Y) distances in each row of output.
        #
        # Note:
        #  - this will get at least 1 row, even if 1 row of distances will
        #    exceed working_memory.
        #  - this does not account for any temporary memory usage while
        #    calculating distances (e.g. difference of vectors in manhattan
        #    distance).
        chunk_n_rows = get_chunk_n_rows(row_bytes=8 * _num_samples(Y),
                                        max_n_rows=n_samples_X,
                                        working_memory=working_memory)
        slices = gen_batches(n_samples_X, chunk_n_rows)

    # precompute data-derived metric params
    params = _precompute_metric_params(X, Y, metric=metric, **kwds)
    kwds.update(**params)

    for sl in slices:
        if sl.start == 0 and sl.stop == n_samples_X:
            X_chunk = X  # enable optimised paths for X is Y
        else:
            X_chunk = X[sl]
        D_chunk = pairwise_distances_(X_chunk, Y, metric=metric,
                                     n_jobs=n_jobs, **kwds)
        if ((X is Y or Y is None)
                and PAIRWISE_DISTANCE_FUNCTIONS.get(metric, None)
                is euclidean_distances):
            # zeroing diagonal, taking care of aliases of "euclidean",
            # i.e. "l2"
            D_chunk.flat[sl.start::_num_samples(X) + 1] = 0
        if reduce_func is not None:
            chunk_size = D_chunk.shape[0]
            D_chunk = reduce_func(D_chunk, sl.start)
            _check_chunk_size(D_chunk, chunk_size)
        yield D_chunk
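_check_chunk_size is not shown in these snippets; based on its call sites above, it presumably verifies that reduce_func returned per-row results of the chunk's length. A simplified sketch of that validation (illustrative, with a made-up name, not the library source):

def _check_chunk_size_sketch(reduced, chunk_size):
    # None means reduce_func worked in place and returned nothing.
    if reduced is None:
        return
    outputs = reduced if isinstance(reduced, tuple) else (reduced,)
    for out in outputs:
        n = out.shape[0] if hasattr(out, "shape") else len(out)
        if n != chunk_size:
            raise ValueError(
                "reduce_func returned output of length %d, expected %d"
                % (n, chunk_size)
            )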