Example 1
def _conv_array_to_sparse(arr):
    """
    Converts an array (or cudf.DataFrame) to a sparse array
    :param arr: scipy or cupy sparse matrix, cudf DataFrame,
                dense numpy or cupy array
    :return: cupy sparse CSR matrix
    """
    if has_scipy():
        from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
    else:
        from cuml.common.import_utils import dummy_function_always_false \
            as scipy_sparse_isspmatrix
    if scipy_sparse_isspmatrix(arr):
        ret = \
            cupyx.scipy.sparse.csr_matrix(arr.tocsr())
    elif cupyx.scipy.sparse.isspmatrix(arr):
        ret = arr
    elif isinstance(arr, cudf.DataFrame):
        ret = _conv_df_to_sparse(arr)
    elif isinstance(arr, np.ndarray):
        cupy_ary = rmm_cupy_ary(cp.asarray, arr, dtype=arr.dtype)
        ret = cupyx.scipy.sparse.csr_matrix(cupy_ary)
    elif isinstance(arr, cp.core.core.ndarray):
        ret = cupyx.scipy.sparse.csr_matrix(arr)
    else:
        raise ValueError("Unexpected input type %s" % type(arr))
    return ret
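
A minimal usage sketch of the helper above. The setup is an assumption, not part of the snippet: it presumes a CUDA-capable environment with cupy installed and _conv_array_to_sparse importable.

import numpy as np
import cupyx

# Hypothetical input: a small dense numpy array.
dense = np.random.rand(4, 3).astype(np.float32)

sp = _conv_array_to_sparse(dense)               # dense numpy -> cupy CSR
assert cupyx.scipy.sparse.isspmatrix_csr(sp)    # output is always CSR
assert sp.shape == (4, 3)
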
Example 2
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype, algo):
    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows,
                      centers=n_clusters,
                      n_features=ncols,
                      random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(algorithm=algo)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X,
                                  n_neighbors=n_neighbors,
                                  return_distance=False)
    del knn_cu
    gc.collect()

    if datatype == "dataframe":
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.to_numpy()
    else:
        assert isinstance(neigh_ind, cp.ndarray)

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
Example 3
    def inverse_transform(self, y, threshold=None):
        """
        Transform binary labels back to original multi-class labels

        Parameters
        ----------

        y : array of shape [n_samples, n_classes]
        threshold : float
            This value is currently ignored.

        Returns
        -------

        arr : array with original labels
        """

        if has_scipy():
            from scipy.sparse import isspmatrix as scipy_sparse_isspmatrix
        else:
            from cuml.common.import_utils import dummy_function_always_false \
                    as scipy_sparse_isspmatrix

        # If we are already given multi-class, just return it.
        if cupyx.scipy.sparse.isspmatrix(y):
            y_mapped = y.tocsr().indices.astype(self._classes_.dtype)
        elif scipy_sparse_isspmatrix(y):
            y = y.tocsr()
            y_mapped = rmm_cupy_ary(cp.array, y.indices, dtype=y.indices.dtype)
        else:
            y_mapped = rmm_cupy_ary(cp.argmax,
                                    rmm_cupy_ary(cp.asarray, y, dtype=y.dtype),
                                    axis=1).astype(y.dtype)

        return invert_labels(y_mapped, self._classes_)
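
For the dense path, the method reduces to a row-wise argmax followed by a lookup into classes_. A numpy-only illustration of that mapping (the labels are made up; this is not cuml's invert_labels):

import numpy as np

classes = np.array([2, 5, 7])
y_bin = np.array([[0, 1, 0],
                  [1, 0, 0],
                  [0, 0, 1]])

# argmax recovers each row's column index; indexing into classes
# maps it back to the original label values.
recovered = classes[np.argmax(y_bin, axis=1)]
# recovered == array([5, 2, 7])
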
Example 4
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype):
    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    X = X.astype(np.float32)

    if datatype == "dataframe":
        X = cudf.DataFrame.from_gpu_matrix(rmm.to_device(X))

    knn_cu = cuKNN()
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)

    if datatype == "dataframe":
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(neigh_ind, np.ndarray)

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
Example 5
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype, algo):
    if algo == "ivfpq":
        pytest.xfail("""See Memory access error in IVFPQ :
                        https://github.com/rapidsai/cuml/issues/3318""")

    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows, centers=n_clusters,
                      n_features=ncols, random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(algorithm=algo)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X, n_neighbors=n_neighbors,
                                  return_distance=False)
    del knn_cu
    gc.collect()

    if datatype == "dataframe":
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(neigh_ind, cp.core.core.ndarray)

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
Example 6
def test_random_projection_fit_transform(datatype, method):
    if has_scipy():
        from scipy.spatial.distance import pdist
    else:
        pytest.skip('Skipping test_random_projection_fit_transform because ' +
                    'Scipy is missing')

    eps = 0.2

    # dataset generation
    data, target = make_blobs(n_samples=800, centers=400, n_features=3000)

    # conversion to input_type
    data = data.astype(datatype)
    target = target.astype(datatype)

    # creation of model
    if method == 'gaussian':
        model = GaussianRandomProjection(eps=eps)
    else:
        model = SparseRandomProjection(eps=eps)

    # fitting the model
    model.fit(data)
    # applying transformation
    transformed_data = model.transform(data)

    original_pdist = pdist(data)
    embedded_pdist = pdist(transformed_data)

    # check the Johnson-Lindenstrauss lemma bound
    assert (np.all(((1.0 - eps) * original_pdist) <= embedded_pdist)
            and np.all(embedded_pdist <= ((1.0 + eps) * original_pdist)))
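
The final assertion encodes the Johnson-Lindenstrauss guarantee: every original pairwise distance d must map to an embedded distance d' with (1 - eps) * d <= d' <= (1 + eps) * d. The same check as a standalone helper, assuming only numpy:

import numpy as np

def jl_distortion_ok(original_pdist, embedded_pdist, eps):
    # All pairwise distances must be preserved within a (1 +/- eps) factor.
    lower = (1.0 - eps) * original_pdist
    upper = (1.0 + eps) * original_pdist
    return bool(np.all(lower <= embedded_pdist)
                and np.all(embedded_pdist <= upper))
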
Example 7
def predict(neigh_ind, _y, n_neighbors):
    if has_scipy():
        import scipy.stats as stats
    else:
        raise RuntimeError('Scipy is needed to run predict()')

    neigh_ind = neigh_ind.astype(np.int64)

    ypred, count = stats.mode(_y[neigh_ind], axis=1)
    return ypred.ravel(), count.ravel() * 1.0 / n_neighbors
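
The prediction is a plain majority vote: _y[neigh_ind] gathers each query's neighbor labels and stats.mode picks the most frequent one per row. A tiny worked example with made-up labels and indices:

import numpy as np
from scipy import stats

y = np.array([0, 0, 1, 1])            # training labels
neigh_ind = np.array([[0, 1, 2],      # 3 nearest neighbors per query row
                      [2, 3, 1]])

mode, count = stats.mode(y[neigh_ind], axis=1)
# mode.ravel()  -> [0, 1]   majority label per query
# count.ravel() -> [2, 2]   votes received by the winning label;
# count.ravel() / 3 is the probability estimate returned above.
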
Example 8
def test_self_neighboring(datatype, metric_p, nrows):
    """Test that searches using an indexed vector itself return sensible
    results for that vector

    For L2-derived metrics, this specifically exercises the slow high-precision
    mode used to correct for approximation errors in L2 computation during NN
    searches.
    """
    ncols = 1000
    n_clusters = 10
    n_neighbors = 3

    metric, p = metric_p

    if not has_scipy():
        pytest.skip('Skipping test_self_neighboring because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows,
                      centers=n_clusters,
                      n_features=ncols,
                      random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(metric=metric, n_neighbors=n_neighbors)
    knn_cu.fit(X)
    neigh_dist, neigh_ind = knn_cu.kneighbors(X,
                                              n_neighbors=n_neighbors,
                                              return_distance=True,
                                              two_pass_precision=True)

    if datatype == 'dataframe':
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.to_numpy()
        neigh_dist = neigh_dist.to_numpy()
    else:
        assert isinstance(neigh_ind, cp.ndarray)
        neigh_ind = neigh_ind.get()
        neigh_dist = neigh_dist.get()

    neigh_ind = neigh_ind[:, 0]
    neigh_dist = neigh_dist[:, 0]

    assert_array_equal(
        neigh_ind,
        np.arange(0, neigh_dist.shape[0]),
    )
    assert_allclose(neigh_dist,
                    np.zeros(neigh_dist.shape, dtype=neigh_dist.dtype),
                    atol=1e-4)
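
The invariant under test can be reproduced with scikit-learn on CPU: when a fitted index is queried with its own training points (and there are no duplicates), each point's nearest neighbor should be itself at distance zero. A quick sketch of the same property:

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.rand(50, 3)
nn = NearestNeighbors(n_neighbors=1).fit(X)
dist, ind = nn.kneighbors(X)

assert np.array_equal(ind[:, 0], np.arange(50))  # each point is its own NN
assert np.allclose(dist[:, 0], 0.0)
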
Example 9
def test_entropy_random(n_samples, base, use_handle):
    if has_scipy():
        from scipy.stats import entropy as sp_entropy
    else:
        pytest.skip('Skipping test_entropy_random because Scipy is missing')

    handle, stream = get_handle(use_handle)

    clustering, _, _, _ = \
        generate_random_labels(lambda rng: rng.randint(0, 1000, n_samples))

    # generate unnormalized probabilities (counts) from the clustering
    pk = np.bincount(clustering)

    # scipy's entropy takes (unnormalized) probabilities
    sp_S = sp_entropy(pk, base=base)
    # cuml's entropy takes the clustering labels directly
    S = entropy(np.array(clustering, dtype=np.int32), base, handle=handle)

    assert_almost_equal(S, sp_S, decimal=2)
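
The comparison works because scipy.stats.entropy normalizes whatever weights it receives, so the raw bincount of the clustering is as good as a probability vector. A numpy-only sanity check of that equivalence, with made-up labels:

import numpy as np
from scipy.stats import entropy as sp_entropy

labels = np.array([0, 0, 1, 2, 2, 2])
pk = np.bincount(labels)              # [2, 1, 3], unnormalized counts

# entropy() normalizes pk internally, so counts and probabilities agree.
assert np.isclose(sp_entropy(pk), sp_entropy(pk / pk.sum()))
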
Example 10
def test_basic_functions(labels, dtype, sparse_output):

    fit_labels, xform_labels = labels

    skl_bin = skLB(sparse_output=sparse_output)
    skl_bin.fit(fit_labels)

    fit_labels = cp.asarray(fit_labels, dtype=dtype)
    xform_labels = cp.asarray(xform_labels, dtype=dtype)

    binarizer = LabelBinarizer(sparse_output=sparse_output)
    binarizer.fit(fit_labels)

    assert array_equal(binarizer.classes_.get(),
                       np.unique(fit_labels.get()))

    xformed = binarizer.transform(xform_labels)

    if sparse_output:
        skl_bin_xformed = skl_bin.transform(xform_labels.get())

        if has_scipy():
            import scipy.sparse
        else:
            pytest.skip('Skipping test_basic_functions(sparse_output=True) ' +
                        'because Scipy is missing')

        skl_csr = scipy.sparse.coo_matrix(skl_bin_xformed).tocsr()
        cuml_csr = xformed

        assert array_equal(skl_csr.data, cuml_csr.data.get())

        # TODO: support sparse inputs
        # xformed = xformed.todense().astype(dtype)

    assert xformed.shape[1] == binarizer.classes_.shape[0]

    original = binarizer.inverse_transform(xformed)

    assert array_equal(original.get(),
                       xform_labels.get())
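
The round trip being validated is the standard binarize-then-invert contract. For reference, the same contract in scikit-learn alone, with made-up labels:

import numpy as np
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
onehot = lb.fit_transform([2, 5, 7, 5])   # one column per class in lb.classes_

# inverse_transform maps one-hot rows back to the original labels.
assert np.array_equal(lb.inverse_transform(onehot), np.array([2, 5, 7, 5]))
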
Example 11
def test_neighborhood_predictions(nrows, ncols, n_neighbors, n_clusters,
                                  datatype, algo):
    if algo == "ivfpq":
        pytest.xfail("Warning: IVFPQ might be unstable in this "
                     "version of cuML. This is due to a known issue "
                     "in the FAISS release that this cuML version "
                     "is linked to. (see FAISS issue #1421)")

    if not has_scipy():
        pytest.skip('Skipping test_neighborhood_predictions because ' +
                    'Scipy is missing')

    X, y = make_blobs(n_samples=nrows,
                      centers=n_clusters,
                      n_features=ncols,
                      random_state=0)

    if datatype == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(algorithm=algo)
    knn_cu.fit(X)
    neigh_ind = knn_cu.kneighbors(X,
                                  n_neighbors=n_neighbors,
                                  return_distance=False)
    del knn_cu
    gc.collect()

    if datatype == "dataframe":
        assert isinstance(neigh_ind, cudf.DataFrame)
        neigh_ind = neigh_ind.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(neigh_ind, cp.core.core.ndarray)

    labels, probs = predict(neigh_ind, y, n_neighbors)

    assert array_equal(labels, y)
Example 12
    sklearn_metrics = set(sklearn.neighbors.VALID_METRICS_SPARSE[algo])
    sklearn_metrics.update(sklearn.neighbors.VALID_METRICS[algo])
    return [value for value in cuml_metrics if value in sklearn_metrics]


def metric_p_combinations():
    for metric in valid_metrics():
        yield metric, 2
        if metric in ("minkowski", "lp"):
            yield metric, 3


@pytest.mark.parametrize("datatype", ["dataframe", "numpy"])
@pytest.mark.parametrize("metric_p", metric_p_combinations())
@pytest.mark.parametrize("nrows", [1000, stress_param(10000)])
@pytest.mark.skipif(not has_scipy(),
                    reason="Skipping test_self_neighboring"
                    " because Scipy is missing")
def test_self_neighboring(datatype, metric_p, nrows):
    """Test that searches using an indexed vector itself return sensible
    results for that vector

    For L2-derived metrics, this specifically exercises the slow high-precision
    mode used to correct for approximation errors in L2 computation during NN
    searches.
    """
    ncols = 1000
    n_clusters = 10
    n_neighbors = 3

    metric, p = metric_p
Example 13
    sklearn_metrics = set(sklearn.neighbors.VALID_METRICS_SPARSE[algo])
    sklearn_metrics.update(sklearn.neighbors.VALID_METRICS[algo])
    return [value for value in cuml_metrics if value in sklearn_metrics]


def metric_p_combinations():
    for metric in valid_metrics():
        yield metric, 2
        if metric in ("minkowski", "lp"):
            yield metric, 3


@pytest.mark.parametrize("datatype", ["dataframe", "numpy"])
@pytest.mark.parametrize("metric_p", metric_p_combinations())
@pytest.mark.parametrize("nrows", [1000, stress_param(10000)])
@pytest.mark.skipif(not has_scipy(), reason="Skipping test_self_neighboring"
                    " because Scipy is missing")
def test_self_neighboring(datatype, metric_p, nrows):
    """Test that searches using an indexed vector itself return sensible
    results for that vector

    For L2-derived metrics, this specifically exercises the slow high-precision
    mode used to correct for approximation errors in L2 computation during NN
    searches.
    """
    ncols = 1000
    n_clusters = 10
    n_neighbors = 3

    metric, p = metric_p
Example 14
def batched_fmin_lbfgs_b(func, x0, num_batches, fprime=None, args=(),
                         bounds=None, m=10, factr=1e7, pgtol=1e-5,
                         epsilon=1e-8,
                         iprint=-1, maxiter=15000,
                         maxls=20):
    """A batch-aware L-BFGS-B implementation to minimize a loss function `f` given
    an initial set of parameters `x0`.

    Parameters
    ----------
    func : function (x: array) -> array[M] (M = n_batches)
           The function to minimize. The function should return an array of
           size = `num_batches`
    x0 : array
         Starting parameters, concatenated across all batches
    num_batches : int
                  Number of independent batches packed into `x0`
    fprime : function (x: array) -> array[M*n_params] (optional)
             The gradient. Should return an array of derivatives for each
             parameter over batches.
             When omitted, the gradient is estimated by finite
             differencing.
    args   : Tuple
             Additional arguments to func and fprime
    bounds : List[Tuple[float, float]]
             Box constraints on the parameters
    m      : int
             L-BFGS parameter: number of previous arrays to store when
             estimating inverse Hessian.
    factr  : float
             Stopping criterion used when the objective stops progressing.
             Stop when `|f(xk+1) - f(xk)| < factr * eps_mach`,
             where `eps_mach` is the machine precision.
    pgtol  : float
             Stopping criterion when gradient is sufficiently "flat".
             Stop when |grad| < pgtol.
    epsilon : float
              Finite differencing step size when approximating `fprime`
    iprint : int
             -1 for no diagnostic info
             n=1-100 for diagnostic info every n steps.
             >100 for detailed diagnostic info
    maxiter : int
              Maximum number of L-BFGS iterations
    maxls   : int
              Maximum number of line-search iterations.

    """

    if has_scipy():
        from scipy.optimize import _lbfgsb
    else:
        raise RuntimeError("Scipy is needed to run batched_fmin_lbfgs_b")

    nvtx_range_push("LBFGS")
    n = len(x0) // num_batches

    if fprime is None:
        def fprime_f(x):
            return _fd_fprime(x, func, epsilon)
        fprime = fprime_f

    if bounds is None:
        bounds = [(None, None)] * n

    nbd = np.zeros(n, np.int32)
    low_bnd = np.zeros(n, np.float64)
    upper_bnd = np.zeros(n, np.float64)
    bounds_map = {(None, None): 0,
                  (1, None): 1,
                  (1, 1): 2,
                  (None, 1): 3}
    for i in range(0, n):
        lb, ub = bounds[i]
        if lb is not None:
            low_bnd[i] = lb
            lb = 1
        if ub is not None:
            upper_bnd[i] = ub
            ub = 1
        nbd[i] = bounds_map[lb, ub]

    # working arrays needed by L-BFGS-B implementation in SciPy.
    # One for each series
    x = [np.copy(np.array(x0[ib*n:(ib+1)*n],
                          np.float64)) for ib in range(num_batches)]
    f = [np.copy(np.array(0.0,
                          np.float64)) for ib in range(num_batches)]
    g = [np.copy(np.zeros((n,), np.float64)) for ib in range(num_batches)]
    wa = [np.copy(np.zeros(2*m*n + 5*n + 11*m*m + 8*m,
                           np.float64)) for ib in range(num_batches)]
    iwa = [np.copy(np.zeros(3*n, np.int32)) for ib in range(num_batches)]
    task = [np.copy(np.zeros(1, 'S60')) for ib in range(num_batches)]
    csave = [np.copy(np.zeros(1, 'S60')) for ib in range(num_batches)]
    lsave = [np.copy(np.zeros(4, np.int32)) for ib in range(num_batches)]
    isave = [np.copy(np.zeros(44, np.int32)) for ib in range(num_batches)]
    dsave = [np.copy(np.zeros(29, np.float64)) for ib in range(num_batches)]
    for ib in range(num_batches):
        task[ib][:] = 'START'

    n_iterations = np.zeros(num_batches, dtype=np.int32)

    converged = num_batches * [False]

    warn_flag = np.zeros(num_batches)

    while not all(converged):
        nvtx_range_push("LBFGS-ITERATION")
        for ib in range(num_batches):
            if converged[ib]:
                continue

            _lbfgsb.setulb(m, x[ib],
                           low_bnd, upper_bnd,
                           nbd,
                           f[ib], g[ib],
                           factr, pgtol,
                           wa[ib], iwa[ib],
                           task[ib],
                           iprint,
                           csave[ib],
                           lsave[ib],
                           isave[ib],
                           dsave[ib],
                           maxls)

        xk = np.concatenate(x)
        fk = func(xk)
        gk = fprime(xk)
        for ib in range(num_batches):
            if converged[ib]:
                continue
            task_str = task[ib].tostring()
            task_str_strip = task[ib].tostring().strip(b'\x00').strip()
            if task_str.startswith(b'FG'):
                # needs function evaluation
                f[ib] = fk[ib]
                g[ib] = gk[ib*n:(ib+1)*n]
            elif task_str.startswith(b'NEW_X'):
                n_iterations[ib] += 1
                if n_iterations[ib] >= maxiter:
                    task[ib][:] = 'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
            elif task_str_strip.startswith(b'CONV'):
                converged[ib] = True
                warn_flag[ib] = 0
            else:
                converged[ib] = True
                warn_flag[ib] = 2
                continue

        nvtx_range_pop()
    xk = np.concatenate(x)

    if iprint > 0:
        logger.info("CONVERGED in ({}-{}) iterations (|\\/f|={})".format(
            np.min(n_iterations),
            np.max(n_iterations),
            np.linalg.norm(fprime(xk), np.inf)))

        if (warn_flag > 0).any():
            for ib in range(num_batches):
                if warn_flag[ib] > 0:
                    logger.info("WARNING: id={} convergence issue: {}".format(
                        ib, task[ib].tostring()))

    nvtx_range_pop()
    return xk, n_iterations, warn_flag
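
A minimal usage sketch: two independent two-parameter quadratic losses packed into a single batch. The loss, gradient, and centers are invented for illustration; it only assumes the function above is importable and SciPy is present.

import numpy as np

centers = np.array([[1.0, -2.0], [3.0, 0.5]])

def loss(x):
    # One scalar loss per batch: ||x_b - c_b||^2.
    xb = x.reshape(2, 2)
    return np.sum((xb - centers) ** 2, axis=1)

def grad(x):
    # Flattened per-batch gradients, the shape fprime is expected to return.
    xb = x.reshape(2, 2)
    return (2.0 * (xb - centers)).ravel()

x0 = np.zeros(4)
xk, n_iter, warn = batched_fmin_lbfgs_b(loss, x0, num_batches=2, fprime=grad)
# xk should be close to centers.ravel() == [1.0, -2.0, 3.0, 0.5]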