Example #1
    def fit(self, y):
        """
        Fit label binarizer

        Parameters
        ----------
        y : array of shape [n_samples,] or [n_samples, n_classes]
            Target values. A 2-d matrix should contain only 0 and 1 and
            represents multilabel classification.

        Returns
        -------
        self : returns an instance of self.
        """

        self._set_output_type(y)

        if y.ndim > 2:
            raise ValueError("labels cannot be greater than 2 dimensions")

        if y.ndim == 2:

            unique_classes = cp.unique(y)
            if not cp.array_equal(unique_classes, cp.array([0, 1])):
                raise ValueError("2-d array must be binary")

            self._classes_ = CumlArray(cp.arange(0, y.shape[1]))
        else:
            self._classes_ = CumlArray(cp.unique(y).astype(y.dtype))

        cp.cuda.Stream.null.synchronize()

        return self
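
A minimal sketch of the class extraction performed above, using only CuPy (the arrays are illustrative):

import cupy as cp

y_1d = cp.array([2, 0, 2, 1])
cp.unique(y_1d)                      # classes for the 1-d case -> [0 1 2]

y_2d = cp.array([[1, 0], [0, 1]])    # binary indicator matrix
cp.arange(0, y_2d.shape[1])          # classes for the 2-d case -> [0 1]
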
Example #2
def test_stratified_binary_classification():
    X = cp.array([[0.37487513, -2.3031888, 1.662633, 0.7671007],
                  [-0.49796826, -1.0621182, -0.32518214, -0.20583323],
                  [-1.0104885, -2.4997945, 2.8952584, 1.4712684],
                  [2.008748, -2.4520662, 0.5557737, 0.07749569],
                  [0.97350526, -0.3403474, -0.58081895, -0.23199573]])

    # Needs to fail when we have just 1 occurrence of a label
    y = cp.array([0, 0, 0, 0, 1])
    with pytest.raises(ValueError):
        train_test_split(X, y, train_size=0.75, stratify=y, shuffle=True)

    y = cp.array([0, 0, 0, 1, 1])

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.75,
                                                        stratify=y,
                                                        random_state=15)

    _, y_counts = cp.unique(y, return_counts=True)
    _, train_counts = cp.unique(y_train, return_counts=True)
    _, test_counts = cp.unique(y_test, return_counts=True)

    # Ensure we have preserved the number of labels in each class
    cp.testing.assert_array_equal(train_counts + test_counts, y_counts)
Example #3
File: utils.py Project: teju85/cuml
def sorted_unique_labels(*ys):
    """Extract an ordered array of unique labels from one or more dask arrays
    of labels."""
    ys = (cp.unique(y.map_blocks(lambda x: cp.unique(x)).compute())
          for y in ys)
    labels = cp.unique(cp.concatenate(ys))
    return labels
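
For in-memory CuPy arrays the same reduction collapses to a concatenate-then-unique; a small sketch under that assumption:

import cupy as cp

y1 = cp.array([3, 1, 1])
y2 = cp.array([2, 3])
labels = cp.unique(cp.concatenate([cp.unique(y1), cp.unique(y2)]))
# labels -> array([1, 2, 3])
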
Example #4
    def _ray_fit_preprocess(self, y) -> Callable:
        """This has been separated out so that it can be easily overwritten
        should a future xgboost version remove label encoding"""
        # pylint: disable = attribute-defined-outside-init,too-many-statements
        can_use_label_encoder = True
        use_label_encoder = getattr(self, "use_label_encoder", True)
        label_encoding_check_error = (
            "The label must consist of integer "
            "labels of form 0, 1, 2, ..., [num_class - 1].")
        label_encoder_deprecation_msg = (
            "The use of label encoder in XGBClassifier is deprecated and will "
            "be removed in a future release. To remove this warning, do the "
            "following: 1) Pass option use_label_encoder=False when "
            "constructing XGBClassifier object; and 2) Encode your labels (y) "
            "as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].")

        # ray: modified this to allow for compatibility with legacy xgboost
        if (_is_cudf_df and _is_cudf_df(y)) or (_is_cudf_ser
                                                and _is_cudf_ser(y)):
            import cupy as cp  # pylint: disable=E0401

            self.classes_ = cp.unique(y.values)
            self.n_classes_ = len(self.classes_)
            can_use_label_encoder = False
            expected_classes = cp.arange(self.n_classes_)
            if (self.classes_.shape != expected_classes.shape
                    or not (self.classes_ == expected_classes).all()):
                raise ValueError(label_encoding_check_error)
        elif (_is_cupy_array and _is_cupy_array(y)):
            import cupy as cp  # pylint: disable=E0401

            self.classes_ = cp.unique(y)
            self.n_classes_ = len(self.classes_)
            can_use_label_encoder = False
            expected_classes = cp.arange(self.n_classes_)
            if (self.classes_.shape != expected_classes.shape
                    or not (self.classes_ == expected_classes).all()):
                raise ValueError(label_encoding_check_error)
        else:
            self.classes_ = np.unique(y)
            self.n_classes_ = len(self.classes_)
            if not use_label_encoder and (not np.array_equal(
                    self.classes_, np.arange(self.n_classes_))):
                raise ValueError(label_encoding_check_error)

        if use_label_encoder:
            if not can_use_label_encoder:
                raise ValueError(
                    "The option use_label_encoder=True is incompatible with "
                    "inputs of type cuDF or cuPy. Please set "
                    "use_label_encoder=False when  constructing XGBClassifier "
                    "object. NOTE:" + label_encoder_deprecation_msg)
            if hasattr(self, "use_label_encoder"):
                warnings.warn(label_encoder_deprecation_msg, UserWarning)
            self._le = XGBoostLabelEncoder().fit(y)
            label_transform = self._le.transform
        else:
            label_transform = lambda x: x  # noqa: E731

        return label_transform
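
The consecutive-integer check above boils down to comparing the unique labels against cp.arange; a standalone sketch of that validation (values are illustrative):

import cupy as cp

y = cp.array([0, 2, 1, 2])
classes = cp.unique(y)                    # -> array([0, 1, 2])
expected = cp.arange(len(classes))
valid = (classes.shape == expected.shape
         and bool((classes == expected).all()))
# valid -> True: labels already have the form 0, 1, ..., num_class - 1
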
Example #5
def setdiff1d(ar1, ar2, assume_unique=False):
    """Find the set difference of two arrays. It returns unique
    values in `ar1` that are not in `ar2`.

    Parameters
    ----------
    ar1 : cupy.ndarray
        Input array
    ar2 : cupy.ndarray
        Input array for comparison
    assume_unique : bool
        By default, False, i.e. input arrays are not unique.
        If True, input arrays are assumed to be unique. This can
        speed up the calculation.

    Returns
    -------
    setdiff1d : cupy.ndarray
        Returns a 1D array of values in `ar1` that are not in `ar2`.
        The output is guaranteed to be sorted only when
        `assume_unique=False`.

    See Also
    --------
    numpy.setdiff1d

    """
    if assume_unique:
        ar1 = cupy.ravel(ar1)
    else:
        ar1 = cupy.unique(ar1)
        ar2 = cupy.unique(ar2)
    return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
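
A quick usage sketch of setdiff1d as defined above (assuming its in1d helper is available):

import cupy

ar1 = cupy.array([1, 2, 3, 2, 4, 1])
ar2 = cupy.array([3, 4, 5, 6])
setdiff1d(ar1, ar2)   # -> array([1, 2])
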
Example #6
def setxor1d(ar1, ar2, assume_unique=False):
    """Find the set exclusive-or of two arrays.

    Parameters
    ----------
    ar1, ar2 : cupy.ndarray
        Input arrays. They are flattened if they are not already 1-D.
    assume_unique : bool
        By default, False, i.e. input arrays are not unique.
        If True, input arrays are assumed to be unique. This can
        speed up the calculation.

    Returns
    -------
    setxor1d : cupy.ndarray
        Return the sorted, unique values that are in only one
        (not both) of the input arrays.

    See Also
    --------
    numpy.setxor1d

    """
    if not assume_unique:
        ar1 = cupy.unique(ar1)
        ar2 = cupy.unique(ar2)

    aux = cupy.concatenate((ar1, ar2), axis=None)
    if aux.size == 0:
        return aux

    aux.sort()

    return aux[_setxorkernel(aux, aux.size,
                             cupy.zeros(aux.size, dtype=cupy.bool_))]
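
A usage sketch of setxor1d as defined above, mirroring the NumPy semantics it documents:

import cupy

ar1 = cupy.array([1, 2, 3, 2, 4])
ar2 = cupy.array([2, 3, 5, 7, 5])
setxor1d(ar1, ar2)   # -> array([1, 4, 5, 7])
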
Example #7
def many_to_one_GPU(ar1, ar2):
    '''Based on np.intersect1d. Special fn for LJ cc gen'''
    import cupy as cp
    ar1 = cp.asarray(ar1)  # ar1 to GPU
    ar1, ind1, inv1 = cp.unique(ar1, return_index=True,
                                return_inverse=True)  # ar1, ind1, inv1 on GPU
    ar1 = cp.asnumpy(ar1)  # ar1 to Host
    ind1 = cp.asnumpy(ind1)  # ind1 to Host
    inv1 = cp.asnumpy(inv1)  # inv1 to Host

    ar2 = cp.asarray(ar2)  # ar2 to GPU
    ar2, ind2 = cp.unique(ar2, return_index=True)  # ar2, ind2 on GPU
    ar2 = cp.asnumpy(ar2)  # ar2 to Host
    ind2 = cp.asnumpy(ind2)  # ind2 to Host

    aux = np.concatenate((ar1, ar2))  # aux on Host
    aux_sort_indices = np.argsort(aux,
                                  kind='mergesort')  # aux_sort_indices on Host
    aux = aux[aux_sort_indices]

    mask = aux[1:] == aux[:-1]  # mask on Host

    ar2_indices = ind2[aux_sort_indices[1:][mask] -
                       ar1.size]  # ar2_indices on Host

    return ar2_indices[inv1]  # return on Host
Example #8
def intersect1d_GPU(ar1, ar2, assume_unique=False, return_indices=False):
    '''Based on np.intersect1d. Special fn for LJ cc gen'''
    import cupy as cp
    assert (not assume_unique) and return_indices

    ar1 = cp.asarray(ar1)  # ar1 to GPU
    ar1, ind1 = cp.unique(ar1, return_index=True)  # ar1, ind1 on GPU
    ar1 = cp.asnumpy(ar1)  # ar1 to Host
    ind1 = cp.asnumpy(ind1)  # ind1 to Host

    ar2 = cp.asarray(ar2)  # ar2 to GPU
    ar2, ind2 = cp.unique(ar2, return_index=True)  # ar2, ind2 on GPU
    ar2 = cp.asnumpy(ar2)  # ar2 to Host
    ind2 = cp.asnumpy(ind2)  # ind2 to Host

    aux = np.concatenate((ar1, ar2))  # aux on Host
    aux_sort_indices = np.argsort(aux,
                                  kind='mergesort')  # aux_sort_indices on Host
    aux = aux[aux_sort_indices]

    mask = aux[1:] == aux[:-1]  # mask on Host
    int1d = aux[:-1][mask]  # int1d on Host

    ar1_indices = aux_sort_indices[:-1][mask]  # ar1_indices on Host
    ar2_indices = aux_sort_indices[1:][mask] - ar1.size  # ar2_indices on Host

    ar1_indices = ind1[ar1_indices]
    ar2_indices = ind2[ar2_indices]

    return int1d, ar1_indices, ar2_indices  # return on Host
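
The core trick shared by Examples #7 and #8: once both inputs are unique, concatenating and sorting makes every common value appear as an adjacent duplicate. A minimal host-side sketch of that step:

import numpy as np

ar1 = np.array([1, 3, 4])              # already unique
ar2 = np.array([3, 4, 5])              # already unique
aux = np.sort(np.concatenate((ar1, ar2)), kind='mergesort')
mask = aux[1:] == aux[:-1]             # adjacent equality marks common values
aux[:-1][mask]                         # -> array([3, 4])
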
Example #9
def test_make_blobs_scalar_parameters(dtype, n_samples, n_features, centers,
                                      cluster_std, center_box, shuffle,
                                      random_state, order):

    out, labels = cuml.make_blobs(dtype=dtype,
                                  n_samples=n_samples,
                                  n_features=n_features,
                                  centers=centers,
                                  cluster_std=0.001,
                                  center_box=center_box,
                                  shuffle=shuffle,
                                  random_state=random_state,
                                  order=order)

    assert out.shape == (n_samples, n_features), "out shape mismatch"
    assert labels.shape == (n_samples, ), "labels shape mismatch"

    if order == 'F':
        assert out.flags['F_CONTIGUOUS']
    elif order == 'C':
        assert out.flags['C_CONTIGUOUS']

    if centers is None:
        assert cp.unique(labels).shape == (3,), \
            "unexpected number of clusters"
    elif centers <= n_samples:
        assert cp.unique(labels).shape == (centers,), \
            "unexpected number of clusters"
Example #10
def confusion_matrix(client, y_true, y_pred, normalize=None, sample_weight=None):
    from cuml.dask.common.input_utils import DistributedDataHandler

    unique_classes = cp.unique(y_true.map_blocks(lambda x: cp.unique(x)).compute())
    nclasses = len(unique_classes)

    ddh = DistributedDataHandler.create([y_true, y_pred])

    cms = client.compute(
        [
            client.submit(
                local_cm, part, unique_classes, sample_weight, workers=[worker]
            )
            for worker, part in ddh.gpu_futures
        ],
        sync=True,
    )

    cm = cp.zeros((nclasses, nclasses))
    for i in cms:
        cm += i

    with np.errstate(all="ignore"):
        if normalize == "true":
            cm = cm / cm.sum(axis=1, keepdims=True)
        elif normalize == "pred":
            cm = cm / cm.sum(axis=0, keepdims=True)
        elif normalize == "all":
            cm = cm / cm.sum()
        cm = cp.nan_to_num(cm)

    return cm
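
The three normalization modes at the end reduce to row, column, or global sums; a small sketch on a fixed 2x2 matrix:

import cupy as cp

cm = cp.array([[3., 1.],
               [2., 4.]])
cm / cm.sum(axis=1, keepdims=True)   # normalize='true': each row sums to 1
cm / cm.sum(axis=0, keepdims=True)   # normalize='pred': each column sums to 1
cm / cm.sum()                        # normalize='all': all entries sum to 1
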
Example #11
def intersect1d(arr1, arr2, assume_unique=False, return_indices=False):
    """Find the intersection of two arrays.
    Returns the sorted, unique values that are in both of the input arrays.

    Parameters
    ----------
    arr1, arr2 : cupy.ndarray
        Input arrays. Arrays will be flattened if they are not in 1D.
    assume_unique : bool
        By default, False. If set True, the input arrays will be
        assumed to be unique, which speeds up the calculation. If set True,
        but the arrays are not unique, incorrect results and out-of-bounds
        indices could result.
    return_indices : bool
       By default, False. If True, the indices which correspond to the
       intersection of the two arrays are returned.

    Returns
    -------
    intersect1d : cupy.ndarray
        Sorted 1D array of common and unique elements.
    comm1 : cupy.ndarray
        The indices of the first occurrences of the common values
        in `arr1`. Only provided if `return_indices` is True.
    comm2 : cupy.ndarray
        The indices of the first occurrences of the common values
        in `arr2`. Only provided if `return_indices` is True.

    See Also
    --------
    numpy.intersect1d

    """
    if not assume_unique:
        if return_indices:
            arr1, ind1 = cupy.unique(arr1, return_index=True)
            arr2, ind2 = cupy.unique(arr2, return_index=True)
        else:
            arr1 = cupy.unique(arr1)
            arr2 = cupy.unique(arr2)
    else:
        arr1 = arr1.ravel()
        arr2 = arr2.ravel()

    if not return_indices:
        mask = _search._exists_kernel(arr1, arr2, arr2.size, False)
        return arr1[mask]

    mask, v1 = _search._exists_and_searchsorted_kernel(arr1, arr2, arr2.size,
                                                       False)
    int1d = arr1[mask]
    arr1_indices = cupy.flatnonzero(mask)
    arr2_indices = v1[mask]

    if not assume_unique:
        arr1_indices = ind1[arr1_indices]
        arr2_indices = ind2[arr2_indices]

    return int1d, arr1_indices, arr2_indices
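
A usage sketch of intersect1d as defined above, showing the index round-trip (values are illustrative):

import cupy

arr1 = cupy.array([1, 3, 4, 3])
arr2 = cupy.array([3, 1, 2, 1])
int1d, comm1, comm2 = intersect1d(arr1, arr2, return_indices=True)
# int1d -> array([1, 3]); arr1[comm1] and arr2[comm2] both equal int1d
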
Example #12
def make_classification_dataset(datatype, nrows, ncols, nclasses):
    n_real_features = min(ncols, int(max(nclasses * 2, math.ceil(ncols / 10))))
    n_clusters_per_class = min(2, max(1, int(2**n_real_features / nclasses)))
    n_redundant = min(ncols - n_real_features, max(2, math.ceil(ncols / 20)))
    try:
        X, y = data.make_classification(
            dtype=datatype,
            n_samples=nrows + 1000,
            n_features=ncols,
            random_state=SEED,
            class_sep=1.0,
            n_informative=n_real_features,
            n_clusters_per_class=n_clusters_per_class,
            n_redundant=n_redundant,
            n_classes=nclasses)

        r = dsel.train_test_split(X, y, random_state=SEED, train_size=nrows)

        if len(cp.unique(r[2])) < nclasses:
            raise ValueError("Training data does not have all classes.")

        return r

    except ValueError:
        pytest.skip(
            "Skipping the test for invalid combination of ncols/nclasses")
Example #13
def test_make_blobs(nrows, ncols, centers, cluster_std, dtype, nparts, order,
                    client):

    c = client

    nrows = int(nrows)
    X, y = make_blobs(nrows,
                      ncols,
                      centers=centers,
                      cluster_std=cluster_std,
                      dtype=dtype,
                      n_parts=nparts,
                      order=order,
                      client=client)

    assert len(X.chunks[0]) == nparts
    assert len(y.chunks[0]) == nparts

    assert X.shape == (nrows, ncols)
    assert y.shape == (nrows, )

    y_local = y.compute()
    assert len(cp.unique(y_local)) == centers

    X_ddh = DistributedDataHandler.create(data=X, client=c)
    X_first = X_ddh.gpu_futures[0][1].result()

    if order == 'F':
        assert X_first.flags['F_CONTIGUOUS']
    elif order == 'C':
        assert X_first.flags['C_CONTIGUOUS']
Example #14
def _binary_roc_auc_score(y_true, y_score):
    """Compute binary roc_auc_score using cupy"""

    if cp.unique(y_true).shape[0] == 1:
        raise ValueError("roc_auc_score cannot be used when "
                         "only one class present in y_true. ROC AUC score "
                         "is not defined in that case.")

    if cp.unique(y_score).shape[0] == 1:
        return 0.5

    fps, tps, thresholds = _binary_clf_curve(y_true, y_score)
    tpr = tps / tps[-1]
    fpr = fps / fps[-1]

    return _calculate_area_under_curve(fpr, tpr).item()
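
_calculate_area_under_curve is an internal helper not shown here; a plausible stand-in is the trapezoidal rule over the (fpr, tpr) curve, sketched with illustrative points and assuming cp.trapz:

import cupy as cp

fpr = cp.array([0.0, 0.5, 1.0])
tpr = cp.array([0.0, 1.0, 1.0])
cp.trapz(tpr, fpr)   # trapezoidal AUC -> 0.75
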
Example #15
def test_make_classification(n_samples, n_features, hypercube, n_classes,
                             n_clusters_per_class, n_informative,
                             random_state, n_parts, order, dtype,
                             client):
    from cuml.dask.datasets.classification import make_classification

    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               n_classes=n_classes, hypercube=hypercube,
                               n_clusters_per_class=n_clusters_per_class,
                               n_informative=n_informative,
                               random_state=random_state, n_parts=n_parts,
                               order=order, dtype=dtype)
    assert(len(X.chunks[0])) == n_parts
    assert(len(X.chunks[1])) == 1

    assert X.shape == (n_samples, n_features)
    assert y.shape == (n_samples, )

    assert X.dtype == dtype
    assert y.dtype == np.int64

    assert len(X.chunks[0]) == n_parts
    assert len(y.chunks[0]) == n_parts

    import cupy as cp
    y_local = y.compute()
    assert len(cp.unique(y_local)) == n_classes

    X_parts = client.sync(_extract_partitions, X)
    X_first = X_parts[0][1].result()

    if order == 'F':
        assert X_first.flags['F_CONTIGUOUS']
    elif order == 'C':
        assert X_first.flags['C_CONTIGUOUS']
Example #16
def test_map_array_incorrect_output_shape():
    labels = cp.random.randint(0, 5, size=(24, 25))
    out = cp.empty((24, 24))
    in_values = cp.unique(labels)
    out_values = cp.random.random(in_values.shape).astype(out.dtype)
    with pytest.raises(ValueError):
        map_array(labels, in_values, out_values, out=out)
Example #17
def _build_laplacian(data, spacing, mask, beta, multichannel):
    l_x, l_y, l_z = data.shape[:3]
    edges = _make_graph_edges_3d(l_x, l_y, l_z)
    weights = _compute_weights_3d(data, spacing, beta=beta, eps=1.e-10,
                                  multichannel=multichannel)
    assert weights.dtype == data.dtype
    if mask is not None:
        # Remove edges of the graph connected to masked nodes, as well
        # as corresponding weights of the edges.
        mask0 = cp.concatenate([mask[..., :-1].ravel(), mask[:, :-1].ravel(),
                                mask[:-1].ravel()])
        mask1 = cp.concatenate([mask[..., 1:].ravel(), mask[:, 1:].ravel(),
                                mask[1:].ravel()])
        ind_mask = cp.logical_and(mask0, mask1)
        edges, weights = edges[:, ind_mask], weights[ind_mask]

        # Reassign edges labels to 0, 1, ... edges_number - 1
        _, inv_idx = cp.unique(edges, return_inverse=True)
        edges = inv_idx.reshape(edges.shape)

    # Build the sparse linear system
    pixel_nb = l_x * l_y * l_z
    i_indices = edges.ravel()
    j_indices = edges[::-1].ravel()
    data = cp.concatenate((weights, weights))
    lap = sparse.coo_matrix((data, (i_indices, j_indices)),
                            shape=(pixel_nb, pixel_nb))
    # need CSR instead of COO for indexing used later in _build_linear_system
    lap = lap.tocsr()
    lap.setdiag(-cp.ravel(lap.sum(axis=0)))
    return lap
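
The relabeling step above leans on return_inverse to compact arbitrary node ids into 0 .. n_nodes - 1; a tiny sketch of that remapping:

import cupy as cp

edges = cp.array([[4, 9],
                  [9, 7]])
_, inv_idx = cp.unique(edges, return_inverse=True)
inv_idx.reshape(edges.shape)   # -> [[0, 2], [2, 1]]
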
Example #18
def clusterAverage(clu, spikeQuantity):
    # get the average of some quantity across spikes in each cluster, given the
    # quantity for each spike
    #
    # e.g.
    # > clusterDepths = clusterAverage(clu, spikeDepths)
    #
    # clu and spikeQuantity must be vector, same size
    #
    # using a super-tricky algorithm for this - when you make a sparse
    # array, the values of any duplicate indices are added. So this is the
    # fastest way I know to make the sum of the entries of spikeQuantity for each of
    # the unique entries of clu
    _, cluInds, spikeCounts = cp.unique(clu,
                                        return_inverse=True,
                                        return_counts=True)

    # summation
    q = cpx.scipy.sparse.coo_matrix(
        (spikeQuantity, (cluInds, cp.zeros(len(clu))))).toarray().flatten()

    # had sums so dividing by spike counts gives the mean depth of each cluster
    clusterQuantity = q / spikeCounts

    return clusterQuantity
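
The same duplicate-index summation can also be expressed with cp.bincount and weights; a small sketch of the per-cluster mean under that alternative (illustrative data):

import cupy as cp

clu = cp.array([5, 5, 7, 7, 7])
depths = cp.array([10., 20., 30., 30., 60.])
_, cluInds, spikeCounts = cp.unique(clu, return_inverse=True,
                                    return_counts=True)
sums = cp.bincount(cluInds, weights=depths)   # sums duplicate indices
sums / spikeCounts                            # -> array([15., 40.])
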
Example #19
    def get_states_numbers(self):
        """For all possible states, return the number of agents on the map
        in each state.

        Returns two arrays: the first holds the state ids and the second
        the number of agents currently in each state on the map.
        """
        state_ids, n_agents = cp.unique(self.current_state_ids,
                                        return_counts=True)
        return state_ids, n_agents
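
A worked instance of the counting above (illustrative state ids):

import cupy as cp

current_state_ids = cp.array([0, 2, 2, 1, 2])
state_ids, n_agents = cp.unique(current_state_ids, return_counts=True)
# state_ids -> array([0, 1, 2]); n_agents -> array([1, 1, 3])
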
Example #20
def _label2rgb_avg(label_field, image, bg_label=0, bg_color=(0, 0, 0)):
    """Visualise each segment in `label_field` with its mean color in `image`.

    Parameters
    ----------
    label_field : array of int
        A segmentation of an image.
    image : array, shape ``label_field.shape + (3,)``
        A color image of the same spatial shape as `label_field`.
    bg_label : int, optional
        A value in `label_field` to be treated as background.
    bg_color : 3-tuple of int, optional
        The color for the background label.

    Returns
    -------
    out : array, same shape and type as `image`
        The output visualization.
    """
    out = cp.zeros(label_field.shape + (3, ))
    labels = cp.unique(label_field)
    bg = labels == bg_label
    if bg.any():
        labels = labels[labels != bg_label]
        mask = (label_field == bg_label).nonzero()
        out[mask] = bg_color
    for label in labels:
        mask = (label_field == label).nonzero()
        color = image[mask].mean(axis=0)
        out[mask] = color
    return out
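
A quick usage sketch of _label2rgb_avg as defined above, on a tiny labeled image (values are illustrative):

import cupy as cp

label_field = cp.array([[0, 1],
                        [1, 2]])
image = cp.random.rand(2, 2, 3)
out = _label2rgb_avg(label_field, image)
# pixels with label 0 get bg_color; labels 1 and 2 get their mean color
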
Example #21
def test_map_array_non_contiguous_output_array():
    labels = cp.random.randint(0, 5, size=(24, 25))
    out = cp.empty((24 * 3, 25 * 2))[::3, ::2]
    in_values = cp.unique(labels)
    out_values = cp.random.random(in_values.shape).astype(out.dtype)
    with pytest.raises(ValueError):
        map_array(labels, in_values, out_values, out=out)
Example #22
def _match_label_with_color(label, colors, bg_label, bg_color):
    """Return `unique_labels` and `color_cycle` for label array and color list.

    Colors are cycled for normal labels, but the background color should only
    be used for the background.
    """
    # Temporarily set background color; it will be removed later.
    if bg_color is None:
        bg_color = (0, 0, 0)
    bg_color = _rgb_vector(bg_color)

    # map labels to their ranks among all labels from small to large
    unique_labels, mapped_labels = cp.unique(label, return_inverse=True)

    # get rank of bg_label
    # for CuPy use .ravel() instead of .flat
    bg_label_rank_list = mapped_labels[label.ravel() == bg_label]

    # The rank of each label is the index of the color it is matched to in
    # color cycle. bg_label should always be mapped to the first color, so
    # its rank must be 0. Other labels should be ranked from small to large
    # from 1.
    if len(bg_label_rank_list) > 0:
        bg_label_rank = bg_label_rank_list[0]
        mapped_labels[mapped_labels < bg_label_rank] += 1
        mapped_labels[label.ravel() == bg_label] = 0
    else:
        mapped_labels += 1

    # Modify labels and color cycle so background color is used only once.
    color_cycle = itertools.cycle(colors)
    color_cycle = itertools.chain([bg_color], color_cycle)

    return mapped_labels, color_cycle
Example #23
    def remove_indices(self):
        """make feature vector `self.features`"""
        # 0<=v,d1,d2,d3<=28 and 0<=e<=28*28=784, so v, d1, d2 and d3 each
        # need 10**2 slots while e needs 10**3 slots.
        if not self.use_d:
            features = cp.vstack((self.num_vertices, self.num_edges))
            features = features[0] + features[1] * (10**2)  # ve
        else:
            features = cp.vstack((self.num_vertices, self.num_edges,
                                  self.num_id1, self.num_id2, self.num_id3))
            features = (features[0]
                        + features[1] * 10**2
                        + features[2] * 10**(2 + 3)
                        + features[3] * 10**(2 + 3 + 2)
                        + features[4] * 10**(2 + 3 + 2 + 2))  # veid1id2id3

        self.features = cp.unique(features, return_counts=True)

        self.prob_of_measuring_0ket = cp.sum(
            self.features[1]**2) / (2**(2 * self.adjacency_mat.shape[0]))
        divide_value = np.sqrt(cp.sum(self.features[1]**2))
        self.normalized_features = (self.features[0],
                                    self.features[1] / divide_value)

        del self.indices
        del self.num_vertices
        del self.num_edges
        if self.use_d:
            del self.num_id1
            del self.num_id2
            del self.num_id3

        self.features = (cp.asnumpy(self.features[0]),
                         cp.asnumpy(self.features[1]))
        self.normalized_features = (cp.asnumpy(self.normalized_features[0]),
                                    cp.asnumpy(self.normalized_features[1]))
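
The packing above is a mixed-radix encoding: each count occupies its own block of decimal digits, so the components can be recovered by div/mod. A small sketch with the first two fields:

import cupy as cp

v = cp.array([7])
e = cp.array([23])
code = v + e * 10**2      # -> array([2307])
code % 10**2              # -> array([7])   (decode v)
code // 10**2             # -> array([23])  (decode e)
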
Example #24
def _csr_column_index1(col_idxs, Ap, Aj):
    """Construct indptr and components for populating indices and data of
    output sparse array
    Args
        col_idxs : column indices to index from input indices
        Ap : indptr of input sparse matrix
        Aj : indices of input sparse matrix
    Returns
        Bp : indptr of output sparse matrix
        Aj_mask : Input indices array with all cols not matching the index
                  index masked out with -1.
        col_counts : Number of times each unique index occurs in Aj
        sort_idxs : Indices sorted to preserve original order of idxs
    """

    idx_map, sort_idxs = cupy.unique(col_idxs, return_index=True)
    sort_idxs = sort_idxs.astype(idx_map.dtype)
    idxs = cupy.searchsorted(idx_map, col_idxs)

    col_counts = cupy.zeros(idx_map.size, dtype=col_idxs.dtype)
    cupyx.scatter_add(col_counts, idxs, 1)

    Bp, Aj_mask = _csr_column_index1_indptr(idx_map, sort_idxs, col_counts, Ap,
                                            Aj)

    return Bp, Aj_mask, col_counts, sort_idxs
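
The unique/searchsorted/scatter_add combination above is a general pattern for counting occurrences against a compact id set; a minimal sketch:

import cupy
import cupyx

col_idxs = cupy.array([2, 0, 2, 2])
idx_map, sort_idxs = cupy.unique(col_idxs, return_index=True)
idxs = cupy.searchsorted(idx_map, col_idxs)   # compact ids: [1, 0, 1, 1]
col_counts = cupy.zeros(idx_map.size, dtype=col_idxs.dtype)
cupyx.scatter_add(col_counts, idxs, 1)
# col_counts -> array([1, 3]): column 0 requested once, column 2 three times
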
Example #25
def _binary_clf_curve(y_true, y_score):

    if y_true.dtype.kind == 'f' and np.any(y_true != y_true.astype(int)):
        raise ValueError("Continuous format of y_true  " "is not supported.")

    ids = cp.argsort(-y_score)
    sorted_score = y_score[ids]

    ones = y_true[ids].astype('float32')  # for calculating true positives
    zeros = 1 - ones  # for calculating predicted positives

    # calculate groups
    group = _group_same_scores(sorted_score)
    num = int(group[-1])

    tps = cp.zeros(num, dtype='float32')
    fps = cp.zeros(num, dtype='float32')

    tps = _addup_x_in_group(group, ones, tps)
    fps = _addup_x_in_group(group, zeros, fps)

    tps = cp.cumsum(tps)
    fps = cp.cumsum(fps)
    thresholds = cp.unique(y_score)
    return fps, tps, thresholds
Example #26
def test_make_blobs(nrows, ncols, centers, cluster_std, dtype, nparts, cluster,
                    output):

    c = Client(cluster)
    try:
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(nrows,
                          ncols,
                          centers=centers,
                          cluster_std=cluster_std,
                          dtype=dtype,
                          n_parts=nparts,
                          output=output)

        assert X.npartitions == nparts
        assert y.npartitions == nparts

        X = X.compute()
        y = y.compute()

        assert X.shape == (nrows, ncols)
        assert y.shape == (nrows, 1)

        if output == 'dataframe':
            assert len(y[0].unique()) == centers
            assert X.dtypes.unique() == [dtype]

        elif output == 'array':
            import cupy as cp
            assert len(cp.unique(y)) == centers
            assert y.dtype == dtype

    finally:
        c.close()
Example #27
def shannon_entropy(image, base=2):
    """Calculate the Shannon entropy of an image.

    The Shannon entropy is defined as S = -sum(pk * log(pk)),
    where pk are frequency/probability of pixels of value k.

    Parameters
    ----------
    image : (N, M) ndarray
        Grayscale input image.
    base : float, optional
        The logarithmic base to use.

    Returns
    -------
    entropy : 0-dimensional float cupy.ndarray

    Notes
    -----
    The returned value is measured in bits or shannon (Sh) for base=2, natural
    unit (nat) for base=np.e and hartley (Hart) for base=10.

    References
    ----------
    .. [1] `https://en.wikipedia.org/wiki/Entropy_(information_theory) <https://en.wikipedia.org/wiki/Entropy_(information_theory)>`_
    .. [2] https://en.wiktionary.org/wiki/Shannon_entropy

    """  # noqa

    _, counts = cp.unique(image, return_counts=True)
    return scipy_entropy(counts, base=base)
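
A worked instance of shannon_entropy as defined above: two equally frequent pixel values give one bit of entropy (the entropy helper normalizes the counts internally):

import cupy as cp

image = cp.array([[0, 0],
                  [1, 1]])
shannon_entropy(image, base=2)   # -> 1.0
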
Example #28
def test_make_blobs(nrows, ncols, centers, cluster_std, dtype, nparts, cluster,
                    order, output):

    c = Client(cluster)
    try:
        X, y = make_blobs(nrows,
                          ncols,
                          centers=centers,
                          cluster_std=cluster_std,
                          dtype=dtype,
                          n_parts=nparts,
                          output=output,
                          order=order)

        assert X.npartitions == nparts
        assert y.npartitions == nparts

        X_local = X.compute()
        y_local = y.compute()

        assert X_local.shape == (nrows, ncols)

        if output == 'dataframe':
            assert len(y_local[0].unique()) == centers
            assert X_local.dtypes.unique() == [dtype]
            assert y_local.shape == (nrows, 1)

        elif output == 'array':
            import cupy as cp
            assert len(cp.unique(y_local)) == centers
            assert y_local.dtype == dtype
            assert y_local.shape == (nrows, )

    finally:
        c.close()
Example #29
def _match_cumulative_cdf(source, template):
    """
    Return modified source array so that the cumulative density function of
    its values matches the cumulative density function of the template.
    """
    src_values, src_unique_indices, src_counts = cp.unique(source.ravel(),
                                                           return_inverse=True,
                                                           return_counts=True)
    tmpl_values, tmpl_counts = cp.unique(template.ravel(), return_counts=True)

    # calculate normalized quantiles for each array
    src_quantiles = cp.cumsum(src_counts) / source.size
    tmpl_quantiles = cp.cumsum(tmpl_counts) / template.size

    interp_a_values = cp.interp(src_quantiles, tmpl_quantiles, tmpl_values)
    return interp_a_values[src_unique_indices].reshape(source.shape)
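
A usage sketch of _match_cumulative_cdf: when source and template have the same number of equally frequent values, matching simply remaps value for value (illustrative arrays):

import cupy as cp

source = cp.array([0., 1., 2., 3.])
template = cp.array([10., 20., 30., 40.])
_match_cumulative_cdf(source, template)
# -> array([10., 20., 30., 40.])
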
Example #30
    def _update_infection_probs(self, random_seed=None):
        """
        Updates probability of infection based on how many inhabitants
        are infectious in each city
        """
        if random_seed is None:
            self._reset_random_seed()

        else:
            cp.random.seed(random_seed)

        infected_indices = self._indices[self._is_infectious]
        quarantine = self._is_in_quarantine[infected_indices]

        quarantine = quarantine * (cp.random.random(len(infected_indices)) <=
                                   self._quarantine_effifiency)

        infected_indices = infected_indices[~quarantine]

        infectious_city_ids = self.city_id[infected_indices]

        if len(infectious_city_ids) == 0:
            self._city_infected_counts = cp.zeros(len(self.city_ids))

        else:
            city_ids, infected_counts = cp.unique(infectious_city_ids,
                                                  return_counts=True)

            _, self._city_infected_counts = self._sort_by_city_ids(
                city_ids, infected_counts, as_json=False)

        self.city_infection_probs = self._city_infected_counts / self._city_population_sizes * \
                                    self._virus.transmission_probability