def reduced_binary_einsum(arr0, sub0, arr1, sub1, sub_others):
    """Contract two operands of an einsum expression.

    Both operands must already be "reduced": no repeated subscript within
    a single operand (diagonals taken) and no subscript absent from
    ``sub_others`` that could be summed out unilaterally (unary sums done).
    The asserts below enforce both preconditions.

    Args:
        arr0, arr1: the two operand arrays (cupy.ndarray).
        sub0, sub1: per-operand subscript lists; ``sub0[i]`` labels axis
            ``i`` of ``arr0`` (likewise for operand 1).
        sub_others: subscripts that must survive this contraction (they
            appear in the final output or in other operands).

    Returns:
        tuple: ``(array, subscripts)`` for the contracted result.
    """
    set0 = set(sub0)
    set1 = set(sub1)
    assert len(set0) == len(sub0), 'operand 0 should be reduced: diagonal'
    assert len(set1) == len(sub1), 'operand 1 should be reduced: diagonal'
    # A 0-d operand acts as a plain scalar factor: multiply and concatenate
    # the (possibly empty) subscript lists.
    if len(sub0) == 0 or len(sub1) == 0:
        return arr0 * arr1, sub0 + sub1
    set_others = set(sub_others)
    shared = set0 & set1
    # Shared subscripts that must survive are batch dims (matmul batching);
    # shared subscripts that may vanish are the ones actually contracted.
    batch_dims = shared & set_others
    contract_dims = shared - batch_dims
    # Axis index triples: (batch axes, contract axes, the rest) per operand.
    bs0, cs0, ts0 = _make_transpose_axes(sub0, batch_dims, contract_dims)
    bs1, cs1, ts1 = _make_transpose_axes(sub1, batch_dims, contract_dims)
    sub_b = [sub0[axis] for axis in bs0]
    # Batch subscripts must line up in the same order on both operands.
    assert sub_b == [sub1[axis] for axis in bs1]
    sub_l = [sub0[axis] for axis in ts0]
    sub_r = [sub1[axis] for axis in ts1]
    sub_out = sub_b + sub_l + sub_r
    assert set(sub_out) <= set_others, 'operands should be reduced: unary sum'
    if len(contract_dims) == 0:
        # Use element-wise multiply when no contraction is needed
        if len(sub_out) == len(sub_others):
            # to assure final output of einsum is C-contiguous
            sub_out = sub_others
        arr0 = _expand_dims_transpose(arr0, sub0, sub_out)
        arr1 = _expand_dims_transpose(arr1, sub1, sub_out)
        return arr0 * arr1, sub_out
    # Optional fast path: hand the whole contraction to cuTENSOR when the
    # accelerator is enabled and the dtypes/subscripts are supported.
    for accelerator in _accelerator.get_routine_accelerators():
        if accelerator == _accelerator.ACCELERATOR_CUTENSOR:
            if _use_cutensor(arr0.dtype, sub0, arr1.dtype, sub1,
                             batch_dims, contract_dims):
                if len(sub_out) == len(sub_others):
                    # to assure final output of einsum is C-contiguous
                    sub_out = sub_others
                out_shape = _get_out_shape(
                    arr0.shape, sub0, arr1.shape, sub1, sub_out)
                arr_out = cupy.empty(out_shape, arr0.dtype)
                # cuTENSOR requires contiguous inputs.
                arr0 = cupy.ascontiguousarray(arr0)
                arr1 = cupy.ascontiguousarray(arr1)
                desc_0 = cutensor.create_tensor_descriptor(arr0)
                desc_1 = cutensor.create_tensor_descriptor(arr1)
                desc_out = cutensor.create_tensor_descriptor(arr_out)
                # arr_out = 1.0 * (arr0 contracted with arr1) + 0.0 * arr_out
                arr_out = cutensor.contraction(
                    1.0, arr0, desc_0, sub0, arr1, desc_1, sub1,
                    0.0, arr_out, desc_out, sub_out)
                return arr_out, sub_out
    # Fallback: flatten each operand to (batch, rows, cols) and use a
    # batched matmul; contract axes are placed last on arr0 / middle on arr1.
    tmp0, shapes0 = _flatten_transpose(arr0, [bs0, ts0, cs0])
    tmp1, shapes1 = _flatten_transpose(arr1, [bs1, cs1, ts1])
    shapes_out = shapes0[0] + shapes0[1] + shapes1[2]
    assert shapes0[0] == shapes1[0]
    arr_out = cupy.matmul(tmp0, tmp1).reshape(shapes_out)
    return arr_out, sub_out
def with_accelerators(self):
    """Select the CUB routine accelerator (or none), then restore on exit.

    Generator body for a context manager: everything before ``yield`` runs
    on entry, everything after runs on exit.
    """
    saved = _accelerator.get_routine_accelerators()
    _accelerator.set_routine_accelerators(['cub'] if self.enable_cub else [])
    yield
    _accelerator.set_routine_accelerators(saved)
def setUp(self):
    """Configure accelerators for the selected CUB backend."""
    # Snapshot current settings so tearDown can restore them.
    self.old_routine_accelerators = _acc.get_routine_accelerators()
    self.old_reduction_accelerators = _acc.get_reduction_accelerators()
    # 'device' -> CUB device-wide routines; 'block' -> CUB block reduction.
    backend_config = {'device': (['cub'], []), 'block': ([], ['cub'])}
    if self.backend in backend_config:
        routines, reductions = backend_config[self.backend]
        _acc.set_routine_accelerators(routines)
        _acc.set_reduction_accelerators(reductions)
def setUp(self):
    """Reset optimizer caches and pin the reduction accelerator under test."""
    # Optimizer contexts are cached globally; start each test clean.
    cupy.core._optimize_config._clear_all_contexts_cache()
    # Snapshot both accelerator lists up front for tearDown.
    self.old_reductions = _accelerator.get_reduction_accelerators()
    self.old_routines = _accelerator.get_routine_accelerators()
    _accelerator.set_reduction_accelerators(self.backend)
    # Turn routine accelerators off so the reduction path under test is
    # not shadowed by the cub module.
    _accelerator.set_routine_accelerators([])
    self.x = testing.shaped_arange((3, 4), cupy, dtype=cupy.float32)
def setUp(self):
    """Configure CUB accelerators for the parametrized backend/order/axis."""
    self.order, self.axis = self.order_and_axis
    # Snapshot current settings so tearDown can restore them.
    self.old_routine_accelerators = _acc.get_routine_accelerators()
    self.old_reduction_accelerators = _acc.get_reduction_accelerators()
    if self.backend == 'device':
        # The device-wide backend only handles full reductions here.
        if self.axis is not None:
            raise unittest.SkipTest('does not support')
        routines, reductions = ['cub'], []
    elif self.backend == 'block':
        routines, reductions = [], ['cub']
    else:
        return
    _acc.set_routine_accelerators(routines)
    _acc.set_reduction_accelerators(reductions)
def test_can_use_accelerator_set_unset(self):
    """Check CUB block reduction is used (and device reduction is not)."""
    saved = _accelerator.get_routine_accelerators()
    _accelerator.set_routine_accelerators([])
    arr = cupy.random.random((10, 10))
    # This is the only function we can mock; the rest is cdef'd.
    func = ('cupy.core._cub_reduction.'
            '_SimpleCubReductionKernel_get_cached_function')
    with testing.AssertFunctionIsCalled(func):
        arr.sum()
    with testing.AssertFunctionIsCalled(func):
        arr.sum(axis=1)
    with testing.AssertFunctionIsCalled(func, times_called=0):
        arr.sum(axis=0)
    _accelerator.set_routine_accelerators(saved)
def setUp(self):
    """Force the CUB routine accelerator, remembering the previous setting."""
    self.old_accelerators = _accelerator.get_routine_accelerators()
    _accelerator.set_routine_accelerators(['cub'])
def test_max_nan(self, xp, dtype):
    """max over an array containing NaN; skipped under cuTENSOR."""
    if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators():
        pytest.skip()
    arr = xp.array([float('nan'), 1, -1], dtype)
    return arr.max()
def __mul__(self, other):
    """Multiply this CSR matrix by ``other``.

    Dispatches on the type of ``other``:
      * scalar / 0-d dense -> elementwise scaling of the stored data,
      * CSR / CSC / other sparse -> sparse-sparse matmul via cuSPARSE,
      * 1-d dense -> sparse matrix-vector product (CUB or cuSPARSE),
      * 2-d dense -> sparse matrix-matrix product via cuSPARSE.

    Returns NotImplemented for unrecognized operand types so Python can
    try the reflected operation.
    """
    if cupy.isscalar(other):
        self.sum_duplicates()
        return self._with_data(self.data * other)
    elif isspmatrix_csr(other):
        self.sum_duplicates()
        other.sum_duplicates()
        # Prefer csrgemm2 when the cuSPARSE build provides it.
        if cusparse.check_availability('csrgemm2'):
            return cusparse.csrgemm2(self, other)
        elif cusparse.check_availability('csrgemm'):
            return cusparse.csrgemm(self, other)
        else:
            raise NotImplementedError
    elif csc.isspmatrix_csc(other):
        self.sum_duplicates()
        other.sum_duplicates()
        if cusparse.check_availability('csrgemm'):
            # csrgemm can consume the CSC operand directly as a
            # transposed CSR.
            return cusparse.csrgemm(self, other.T, transb=True)
        elif cusparse.check_availability('csrgemm2'):
            # csrgemm2 needs a CSR right-hand side: convert first.
            b = other.tocsr()
            b.sum_duplicates()
            return cusparse.csrgemm2(self, b)
        else:
            raise NotImplementedError
    elif base.isspmatrix(other):
        # Any other sparse format: convert to CSR and retry.
        return self * other.tocsr()
    elif base.isdense(other):
        if other.ndim == 0:
            self.sum_duplicates()
            return self._with_data(self.data * other)
        elif other.ndim == 1:
            # Sparse matrix-vector product.
            self.sum_duplicates()
            other = cupy.asfortranarray(other)
            # need extra padding to ensure not stepping on the CUB bug,
            # see cupy/cupy#3679 for discussion
            is_cub_safe = (self.indptr.data.mem.size >
                           self.indptr.size * self.indptr.dtype.itemsize)
            for accelerator in _accelerator.get_routine_accelerators():
                if (accelerator == _accelerator.ACCELERATOR_CUB
                        and is_cub_safe and other.flags.c_contiguous):
                    return cub.device_csrmv(
                        self.shape[0], self.shape[1], self.nnz,
                        self.data, self.indptr, self.indices, other)
            if (cusparse.check_availability('csrmvEx') and self.nnz > 0 and
                    cusparse.csrmvExIsAligned(self, other)):
                # csrmvEx does not work if nnz == 0
                csrmv = cusparse.csrmvEx
            elif cusparse.check_availability('csrmv'):
                csrmv = cusparse.csrmv
            elif cusparse.check_availability('spmv'):
                csrmv = cusparse.spmv
            else:
                raise NotImplementedError
            return csrmv(self, other)
        elif other.ndim == 2:
            # Sparse matrix-dense matrix product.
            self.sum_duplicates()
            if cusparse.check_availability('csrmm2'):
                csrmm = cusparse.csrmm2
            elif cusparse.check_availability('spmm'):
                csrmm = cusparse.spmm
            else:
                raise NotImplementedError
            return csrmm(self, cupy.asfortranarray(other))
        else:
            raise ValueError('could not interpret dimensions')
    else:
        return NotImplemented
def setUp(self):
    """Enable or disable the CUB routine accelerator per test parameter."""
    self.old_accelerators = _accelerator.get_routine_accelerators()
    _accelerator.set_routine_accelerators(['cub'] if self.enable_cub else [])
def setUp(self):
    """Disable all accelerators so the plain kernels are exercised."""
    # Snapshot both lists first so tearDown can restore them.
    self.old_accelerators = _acc.get_routine_accelerators()
    # Also avoid fallback to CUB via the general reduction kernel.
    self.old_reduction_accelerators = _acc.get_reduction_accelerators()
    _acc.set_routine_accelerators([])
    _acc.set_reduction_accelerators([])
def test_argmin_nan(self, xp, dtype):
    """argmin over an array containing NaN; skipped under cuTENSOR."""
    if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators():
        pytest.skip()
    arr = xp.array([float('nan'), 1, -1], dtype, order=self.order)
    return arr.argmin()
def test_ptp_all_nan(self, xp, dtype):
    """ptp over an all-NaN array; skipped under cuTENSOR."""
    if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators():
        pytest.skip()
    arr = xp.array([float('nan'), float('nan')], dtype)
    return xp.ptp(arr)
def __mul__(self, other):
    """Multiply this CSR matrix by ``other``.

    Dispatches on the type of ``other``:
      * scalar / 0-d dense -> elementwise scaling of the stored data,
      * CSR / CSC / other sparse -> sparse-sparse matmul via cuSPARSE,
      * 1-d dense -> sparse matrix-vector product (CUB or cuSPARSE),
      * 2-d dense -> sparse matrix-matrix product via cuSPARSE.

    Returns NotImplemented for unrecognized operand types so Python can
    try the reflected operation.
    """
    if cupy.isscalar(other):
        self.sum_duplicates()
        return self._with_data(self.data * other)
    elif isspmatrix_csr(other):
        self.sum_duplicates()
        other.sum_duplicates()
        # Prefer csrgemm2 when the cuSPARSE build provides it.
        if cusparse.check_availability('csrgemm2'):
            return cusparse.csrgemm2(self, other)
        elif cusparse.check_availability('csrgemm'):
            return cusparse.csrgemm(self, other)
        else:
            raise NotImplementedError
    elif csc.isspmatrix_csc(other):
        self.sum_duplicates()
        other.sum_duplicates()
        if cusparse.check_availability('csrgemm'):
            # csrgemm can consume the CSC operand directly as a
            # transposed CSR.
            return cusparse.csrgemm(self, other.T, transb=True)
        elif cusparse.check_availability('csrgemm2'):
            # csrgemm2 needs a CSR right-hand side: convert first.
            b = other.tocsr()
            b.sum_duplicates()
            return cusparse.csrgemm2(self, b)
        else:
            raise NotImplementedError
    elif base.isspmatrix(other):
        # Any other sparse format: convert to CSR and retry.
        return self * other.tocsr()
    elif base.isdense(other):
        if other.ndim == 0:
            self.sum_duplicates()
            return self._with_data(self.data * other)
        elif other.ndim == 1:
            # Sparse matrix-vector product.
            self.sum_duplicates()
            other = cupy.asfortranarray(other)
            # csrmvEx does not work if nnz == 0
            if self.nnz > 0 and cusparse.csrmvExIsAligned(self, other):
                # The CUB accelerator, when enabled, handles the aligned
                # contiguous case itself.
                for accelerator in _accelerator.get_routine_accelerators():
                    if (accelerator == _accelerator.ACCELERATOR_CUB
                            and other.flags.c_contiguous):
                        return cub.device_csrmv(
                            self.shape[0], self.shape[1], self.nnz,
                            self.data, self.indptr, self.indices, other)
                return cusparse.csrmvEx(self, other)
            else:
                if cusparse.check_availability('csrmv'):
                    csrmv = cusparse.csrmv
                elif cusparse.check_availability('spmv'):
                    csrmv = cusparse.spmv
                else:
                    raise NotImplementedError
                return csrmv(self, other)
        elif other.ndim == 2:
            # Sparse matrix-dense matrix product.
            self.sum_duplicates()
            if cusparse.check_availability('csrmm2'):
                csrmm = cusparse.csrmm2
            elif cusparse.check_availability('spmm'):
                csrmm = cusparse.spmm
            else:
                raise NotImplementedError
            return csrmm(self, cupy.asfortranarray(other))
        else:
            raise ValueError('could not interpret dimensions')
    else:
        return NotImplemented
def histogram(x, bins=10, range=None, weights=None, density=False):
    """Computes the histogram of a set of data.

    Args:
        x (cupy.ndarray): Input array.
        bins (int or cupy.ndarray): If ``bins`` is an int, it represents the
            number of bins. If ``bins`` is an :class:`~cupy.ndarray`, it
            represents a bin edges.
        range (2-tuple of float, optional): The lower and upper range of the
            bins.  If not provided, range is simply ``(x.min(), x.max())``.
            Values outside the range are ignored. The first element of the
            range must be less than or equal to the second. `range` affects
            the automatic bin computation as well. While bin width is computed
            to be optimal based on the actual data within `range`, the bin
            count will fill the entire range including portions containing no
            data.
        density (bool, optional): If False, the default, returns the number of
            samples in each bin. If True, returns the probability *density*
            function at the bin, ``bin_count / sample_count / bin_volume``.
        weights (cupy.ndarray, optional): An array of weights, of the same
            shape as `x`.  Each value in `x` only contributes its associated
            weight towards the bin count (instead of 1).
    Returns:
        tuple: ``(hist, bin_edges)`` where ``hist`` is a :class:`cupy.ndarray`
        storing the values of the histogram, and ``bin_edges`` is a
        :class:`cupy.ndarray` storing the bin edges.

    .. warning::

        This function may synchronize the device.

    .. seealso:: :func:`numpy.histogram`
    """
    if x.dtype.kind == 'c':
        # TODO(unno): comparison between complex numbers is not implemented
        raise NotImplementedError('complex number is not supported')

    if not isinstance(x, cupy.ndarray):
        raise ValueError('x must be a cupy.ndarray')

    x, weights = _ravel_and_check_weights(x, weights)
    bin_edges = _get_bin_edges(x, bins, range)

    if weights is None:
        # Unweighted path: accumulate counts as C long ('l').
        y = cupy.zeros(bin_edges.size - 1, dtype='l')
        for accelerator in _accelerator.get_routine_accelerators():
            # CUB uses int for bin counts
            # TODO(leofang): support >= 2^31 elements in x?
            if (accelerator == _accelerator.ACCELERATOR_CUB
                    and x.size <= 0x7fffffff
                    and bin_edges.size <= 0x7fffffff):
                # Need to ensure the dtype of bin_edges as it's needed for
                # both the CUB call and the correction later
                assert isinstance(bin_edges, cupy.ndarray)
                if numpy.issubdtype(x.dtype, numpy.integer):
                    # Use float64 bin edges for integer samples.
                    # (was ``numpy.float``, an alias of the builtin ``float``
                    # removed in NumPy 1.24 -- ``numpy.float64`` is the
                    # equivalent concrete dtype.)
                    bin_type = numpy.float64
                else:
                    bin_type = numpy.result_type(bin_edges.dtype, x.dtype)
                    if (bin_type == numpy.float16
                            and not common._is_fp16_supported()):
                        bin_type = numpy.float32
                    x = x.astype(bin_type, copy=False)
                acc_bin_edge = bin_edges.astype(bin_type, copy=True)
                # CUB's upper bin boundary is exclusive for all bins,
                # including the last bin, so we must shift it to comply
                # with NumPy
                if x.dtype.kind in 'ui':
                    acc_bin_edge[-1] += 1
                elif x.dtype.kind == 'f':
                    last = acc_bin_edge[-1]
                    acc_bin_edge[-1] = cupy.nextafter(last, last + 1)
                if runtime.is_hip:
                    y = y.astype(cupy.uint64, copy=False)
                y = cub.device_histogram(x, acc_bin_edge, y)
                if runtime.is_hip:
                    y = y.astype(cupy.int64, copy=False)
                break
        else:
            # No accelerator handled it: use the elementwise kernel.
            _histogram_kernel(x, bin_edges, bin_edges.size, y)
    else:
        simple_weights = (
            cupy.can_cast(weights.dtype, cupy.float64) or
            cupy.can_cast(weights.dtype, cupy.complex128))
        if not simple_weights:
            # object dtype such as Decimal are supported in NumPy, but not
            # here
            raise NotImplementedError(
                'only weights with dtype that can be cast to float or complex '
                'are supported')
        if weights.dtype.kind == 'c':
            # Complex weights: accumulate real and imaginary parts separately.
            y = cupy.zeros(bin_edges.size - 1, dtype=cupy.complex128)
            _weighted_histogram_kernel(
                x, bin_edges, bin_edges.size, weights.real, y.real)
            _weighted_histogram_kernel(
                x, bin_edges, bin_edges.size, weights.imag, y.imag)
        else:
            if weights.dtype.kind in 'bui':
                y = cupy.zeros(bin_edges.size - 1, dtype=int)
            else:
                y = cupy.zeros(bin_edges.size - 1, dtype=cupy.float64)
            _weighted_histogram_kernel(
                x, bin_edges, bin_edges.size, weights, y)

    if density:
        # Normalize counts by bin width and total count to get a density.
        db = cupy.array(cupy.diff(bin_edges), cupy.float64)
        return y / db / y.sum(), bin_edges
    return y, bin_edges