def reduced_binary_einsum(arr0, sub0, arr1, sub1, sub_others):
    """Combine two einsum operands into one operand.

    Both operands must already be "reduced": neither may contain a repeated
    subscript (asserted), and every subscript that survives into the output
    must also appear in ``sub_others`` (asserted).

    Subscripts shared by both operands are split into batch dimensions
    (also present in ``sub_others``) and contraction dimensions (absent
    from ``sub_others``).  The product is then computed by one of:

    * an element-wise multiply, when either operand has no subscripts or
      when there is nothing to contract;
    * ``cutensor.contraction``, when the cuTENSOR routine accelerator is
      enabled and ``_use_cutensor`` accepts the operands;
    * ``cupy.matmul`` on operands rearranged by ``_flatten_transpose``.

    Args:
        arr0: First operand.
        sub0: Subscripts of ``arr0``, one per axis.
        arr1: Second operand.
        sub1: Subscripts of ``arr1``, one per axis.
        sub_others: Subscripts that are still needed after this step.

    Returns:
        tuple: ``(arr_out, sub_out)``, the combined operand and its
        subscripts.
    """
    labels0 = set(sub0)
    labels1 = set(sub1)
    assert len(labels0) == len(sub0), 'operand 0 should be reduced: diagonal'
    assert len(labels1) == len(sub1), 'operand 1 should be reduced: diagonal'

    # A subscript-less operand simply scales the other one.
    if len(sub0) == 0 or len(sub1) == 0:
        return arr0 * arr1, sub0 + sub1

    labels_others = set(sub_others)
    common = labels0 & labels1
    batch_dims = common & labels_others
    contract_dims = common - batch_dims

    bs0, cs0, ts0 = _make_transpose_axes(sub0, batch_dims, contract_dims)
    bs1, cs1, ts1 = _make_transpose_axes(sub1, batch_dims, contract_dims)

    batch_labels = [sub0[axis] for axis in bs0]
    assert batch_labels == [sub1[axis] for axis in bs1]
    left_labels = [sub0[axis] for axis in ts0]
    right_labels = [sub1[axis] for axis in ts1]

    sub_out = batch_labels + left_labels + right_labels
    assert set(sub_out) <= labels_others, \
        'operands should be reduced: unary sum'

    if len(contract_dims) == 0:
        # Use element-wise multiply when no contraction is needed
        if len(sub_out) == len(sub_others):
            # to assure final output of einsum is C-contiguous
            sub_out = sub_others
        lhs = _expand_dims_transpose(arr0, sub0, sub_out)
        rhs = _expand_dims_transpose(arr1, sub1, sub_out)
        return lhs * rhs, sub_out

    for accelerator in _accelerator.get_routine_accelerators():
        if accelerator != _accelerator.ACCELERATOR_CUTENSOR:
            continue
        if not _use_cutensor(arr0.dtype, sub0, arr1.dtype, sub1,
                             batch_dims, contract_dims):
            continue
        if len(sub_out) == len(sub_others):
            # to assure final output of einsum is C-contiguous
            sub_out = sub_others
        out_shape = _get_out_shape(
            arr0.shape, sub0, arr1.shape, sub1, sub_out)
        result = cupy.empty(out_shape, arr0.dtype)
        arr0 = cupy.ascontiguousarray(arr0)
        arr1 = cupy.ascontiguousarray(arr1)
        desc_0 = cutensor.create_tensor_descriptor(arr0)
        desc_1 = cutensor.create_tensor_descriptor(arr1)
        desc_out = cutensor.create_tensor_descriptor(result)
        result = cutensor.contraction(
            1.0,
            arr0, desc_0, sub0,
            arr1, desc_1, sub1,
            0.0,
            result, desc_out, sub_out)
        return result, sub_out

    # Generic path: (batch, left, contract) @ (batch, contract, right).
    flat0, shapes0 = _flatten_transpose(arr0, [bs0, ts0, cs0])
    flat1, shapes1 = _flatten_transpose(arr1, [bs1, cs1, ts1])
    shapes_out = shapes0[0] + shapes0[1] + shapes1[2]
    assert shapes0[0] == shapes1[0]
    product = cupy.matmul(flat0, flat1)
    return product.reshape(shapes_out), sub_out
def with_accelerators(self):
    """Switch the routine accelerators around a ``yield``.

    Enables only ``'cub'`` when ``self.enable_cub`` is truthy and no routine
    accelerator otherwise, yields once, then puts back the accelerators that
    were active on entry.
    """
    saved = _accelerator.get_routine_accelerators()
    _accelerator.set_routine_accelerators(['cub'] if self.enable_cub else [])
    yield
    _accelerator.set_routine_accelerators(saved)
def setUp(self):
    """Record the current accelerators and configure them per backend.

    ``'device'`` enables CUB as a routine accelerator only; ``'block'``
    enables CUB as a reduction accelerator only.  Any other backend value
    leaves the accelerators untouched.
    """
    self.old_routine_accelerators = _acc.get_routine_accelerators()
    self.old_reduction_accelerators = _acc.get_reduction_accelerators()

    backend = self.backend
    if backend == 'device':
        routine, reduction = ['cub'], []
    elif backend == 'block':
        routine, reduction = [], ['cub']
    else:
        return

    _acc.set_routine_accelerators(routine)
    _acc.set_reduction_accelerators(reduction)
def setUp(self):
    """Prepare accelerator state and a small input array for the test.

    Clears the optimize-config context cache, saves the current reduction
    and routine accelerators on ``self`` (presumably restored by a teardown
    that is not visible here), selects ``self.backend`` as the reduction
    accelerator, disables all routine accelerators, and creates ``self.x``.
    """
    cupy._core._optimize_config._clear_all_contexts_cache()
    # Remember the reduction accelerators before overriding them.
    self.old_reductions = _accelerator.get_reduction_accelerators()
    _accelerator.set_reduction_accelerators(self.backend)
    # avoid shadowed by the cub module
    # (routine accelerators are saved and then switched off entirely)
    self.old_routines = _accelerator.get_routine_accelerators()
    _accelerator.set_routine_accelerators([])
    # float32 test input of shape (3, 4)
    self.x = testing.shaped_arange((3, 4), cupy, dtype=cupy.float32)
def setUp(self):
    """Unpack the test parameters and configure accelerators per backend.

    The current routine/reduction accelerators are stored on ``self`` first.
    ``'device'`` enables CUB as a routine accelerator only and is skipped
    when an axis is given; ``'block'`` enables CUB as a reduction
    accelerator only.  Other backend values change nothing.

    Raises:
        unittest.SkipTest: for the ``'device'`` backend with a non-``None``
            axis.
    """
    self.order, self.axis = self.order_and_axis
    self.old_routine_accelerators = _acc.get_routine_accelerators()
    self.old_reduction_accelerators = _acc.get_reduction_accelerators()

    backend = self.backend
    if backend == 'device':
        if self.axis is not None:
            raise unittest.SkipTest('does not support')
        routine, reduction = ['cub'], []
    elif backend == 'block':
        routine, reduction = [], ['cub']
    else:
        return

    _acc.set_routine_accelerators(routine)
    _acc.set_reduction_accelerators(reduction)
def setUp(self):
    """Configure accelerators per backend around a ``yield``.

    Unpacks ``self.order_and_axis``, remembers the active routine/reduction
    accelerators, applies the backend-specific configuration (``'device'``:
    CUB routine accelerator only, skipped via ``pytest.skip`` when an axis
    is given; ``'block'``: CUB reduction accelerator only), yields once and
    finally puts the remembered accelerators back.
    """
    self.order, self.axis = self.order_and_axis
    saved_routine = _acc.get_routine_accelerators()
    saved_reduction = _acc.get_reduction_accelerators()

    backend = self.backend
    wanted = None
    if backend == 'device':
        if self.axis is not None:
            pytest.skip('does not support')
        wanted = (['cub'], [])
    elif backend == 'block':
        wanted = ([], ['cub'])

    if wanted is not None:
        routine, reduction = wanted
        _acc.set_routine_accelerators(routine)
        _acc.set_reduction_accelerators(reduction)

    yield

    _acc.set_routine_accelerators(saved_routine)
    _acc.set_reduction_accelerators(saved_reduction)
def test_can_use_accelerator_set_unset(self):
    """Check how often the CUB block-reduction kernel getter is called.

    With all routine accelerators disabled, ``sum()`` over a (10, 10) array
    is expected to fetch the cached CUB reduction function twice for a full
    reduction, once for ``axis=1`` and never for ``axis=0``.

    The routine accelerators are restored in a ``finally`` clause so that a
    failing assertion cannot leak the modified global state into other
    tests.
    """
    # ensure we use CUB block reduction and not CUB device reduction
    old_routine_accelerators = _accelerator.get_routine_accelerators()
    try:
        _accelerator.set_routine_accelerators([])

        a = cupy.random.random((10, 10))
        # this is the only function we can mock; the rest is cdef'd
        func_name = ''.join(('cupy._core._cub_reduction.',
                             '_SimpleCubReductionKernel_get_cached_function'))
        func = _cub_reduction._SimpleCubReductionKernel_get_cached_function
        with testing.AssertFunctionIsCalled(
                func_name, wraps=func, times_called=2):  # two passes
            a.sum()
        with testing.AssertFunctionIsCalled(
                func_name, wraps=func, times_called=1):  # one pass
            a.sum(axis=1)
        with testing.AssertFunctionIsCalled(
                func_name, wraps=func, times_called=0):  # not used
            a.sum(axis=0)
    finally:
        # Always undo the global accelerator change, even on failure.
        _accelerator.set_routine_accelerators(old_routine_accelerators)
def setUp(self):
    """Save the active routine accelerators and enable only CUB.

    The previous setting is kept in ``self.old_accelerators`` (presumably
    restored by a teardown that is not visible here).
    """
    self.old_accelerators = _accelerator.get_routine_accelerators()
    _accelerator.set_routine_accelerators(['cub'])
def __mul__(self, other):
    """Multiply this matrix by a scalar, a sparse matrix or a dense array.

    Dispatch, in order:

    * scalar (or 0-dim dense array): scales the stored data;
    * CSR operand: sparse-sparse product via ``csrgemm2`` or ``csrgemm``;
    * CSC operand: ``csrgemm`` with a transposed operand (not on HIP), else
      ``csrgemm2`` after converting the operand to CSR;
    * any other sparse matrix: converted to CSR and multiplied again;
    * 1-dim dense array: sparse matrix-vector product (CUB when enabled and
      considered safe, otherwise a cuSPARSE routine);
    * 2-dim dense array: sparse-dense matrix product.

    Returns:
        The product, or ``NotImplemented`` when ``other`` is of an
        unsupported kind.

    Raises:
        NotImplementedError: if no suitable cuSPARSE routine is available.
        ValueError: if a dense operand has more than two dimensions.
    """
    if cupy.isscalar(other):
        self.sum_duplicates()
        return self._with_data(self.data * other)

    if isspmatrix_csr(other):
        self.sum_duplicates()
        other.sum_duplicates()
        if cusparse.check_availability('csrgemm2'):
            return cusparse.csrgemm2(self, other)
        if cusparse.check_availability('csrgemm'):
            return cusparse.csrgemm(self, other)
        raise NotImplementedError

    if csc.isspmatrix_csc(other):
        self.sum_duplicates()
        other.sum_duplicates()
        if cusparse.check_availability('csrgemm') and not runtime.is_hip:
            # trans=True is still buggy as of ROCm 4.2.0
            return cusparse.csrgemm(self, other.T, transb=True)
        if cusparse.check_availability('csrgemm2'):
            rhs = other.tocsr()
            rhs.sum_duplicates()
            return cusparse.csrgemm2(self, rhs)
        raise NotImplementedError

    if base.isspmatrix(other):
        return self * other.tocsr()

    if not base.isdense(other):
        return NotImplemented

    if other.ndim == 0:
        self.sum_duplicates()
        return self._with_data(self.data * other)

    if other.ndim == 1:
        self.sum_duplicates()
        other = cupy.asfortranarray(other)
        # need extra padding to ensure not stepping on the CUB bug,
        # see cupy/cupy#3679 for discussion
        has_padding = (self.indptr.data.mem.size >
                       self.indptr.size * self.indptr.dtype.itemsize)
        # CUB spmv is buggy since CUDA 11.0, see
        # https://github.com/cupy/cupy/issues/3822#issuecomment-782607637
        # (non-short-circuit ``&`` on purpose: the version is always queried)
        is_cub_safe = has_padding & (cub._get_cuda_build_version() < 11000)
        for accelerator in _accelerator.get_routine_accelerators():
            if (accelerator == _accelerator.ACCELERATOR_CUB
                    and not runtime.is_hip
                    and is_cub_safe and other.flags.c_contiguous):
                return cub.device_csrmv(
                    self.shape[0], self.shape[1], self.nnz,
                    self.data, self.indptr, self.indices, other)
        if (cusparse.check_availability('csrmvEx') and self.nnz > 0 and
                cusparse.csrmvExIsAligned(self, other)):
            # csrmvEx does not work if nnz == 0
            return cusparse.csrmvEx(self, other)
        if cusparse.check_availability('csrmv'):
            return cusparse.csrmv(self, other)
        if cusparse.check_availability('spmv'):
            return cusparse.spmv(self, other)
        raise NotImplementedError

    if other.ndim == 2:
        self.sum_duplicates()
        if cusparse.check_availability('csrmm2'):
            matmat = cusparse.csrmm2
        elif cusparse.check_availability('spmm'):
            matmat = cusparse.spmm
        else:
            raise NotImplementedError
        return matmat(self, cupy.asfortranarray(other))

    raise ValueError('could not interpret dimensions')
def test_max_nan(self, xp, dtype):
    """Return ``max()`` of an array containing a NaN.

    The test is skipped when cuTENSOR is among the routine accelerators.
    """
    enabled = _acc.get_routine_accelerators()
    if _acc.ACCELERATOR_CUTENSOR in enabled:
        pytest.skip()
    values = [float('nan'), 1, -1]
    return xp.array(values, dtype, order=self.order).max()
def test_ptp_all_nan(self, xp, dtype):
    """Return ``ptp`` of an array made only of NaNs.

    The test is skipped when cuTENSOR is among the routine accelerators.
    """
    enabled = _acc.get_routine_accelerators()
    if _acc.ACCELERATOR_CUTENSOR in enabled:
        pytest.skip()
    nan = float('nan')
    return xp.ptp(xp.array([nan, nan], dtype))
def _try_use_cutensornet(*args, **kwargs):
    """Evaluate an einsum expression with cuTensorNet when possible.

    Args:
        *args: The positional einsum arguments; they are normalized with
            ``_get_einsum_operands``.
        **kwargs: The einsum keyword arguments.  Only ``dtype`` and
            ``optimize`` are inspected here, and nothing is removed from the
            mapping.

    Returns:
        The result of ``cutensornet.contract``, or ``None`` when the backend
        is not used: running on HIP, the cuTensorNet routine accelerator is
        not enabled, cuTensorNet cannot be imported (a warning is issued),
        there is a single operand, an operand is empty or 0-dim, or the
        result dtype is not float32/float64/complex64/complex128.
    """
    if cupy.cuda.runtime.is_hip:
        return None

    if (_accelerator.ACCELERATOR_CUTENSORNET not in
            _accelerator.get_routine_accelerators()):
        return None

    if cutensornet is None:
        warnings.warn(
            'using the cuTensorNet backend was requested but it cannot be '
            'imported -- maybe you forgot to install cuQuantum Python? '
            'Please do "pip install cuquantum-python" or "conda install '
            '-c conda-forge cuquantum-python" and retry',
            stacklevel=2)
        return None

    # cannot pop as we might still need kwargs later
    dtype = kwargs.get('dtype', None)
    path = kwargs.get('optimize', False)
    if path is True:
        path = 'greedy'

    # we do very lightweight pre-processing here just to inspect the
    # operands; the actual input verification is deferred to cuTensorNet
    # which can generate far better diagnostic messages
    args = _get_einsum_operands(args)
    operands = [cupy.asarray(op) for op in args[1]]

    if len(operands) == 1:
        # As of cuTENSOR 1.5.0 it still chokes with some common operations
        # like trace ("ii->") so it's easier to just skip all single-operand
        # cases instead of whitelisting what could be done explicitly
        return None

    if (any(op.size == 0 for op in operands) or
            any(len(op.shape) == 0 for op in operands)):
        # To cuTensorNet the shape is invalid
        return None

    # all input dtypes must be identical (to a numerical dtype)
    result_dtype = cupy.result_type(*operands) if dtype is None else dtype
    if result_dtype not in (
            cupy.float32, cupy.float64, cupy.complex64, cupy.complex128):
        return None
    operands = [op.astype(result_dtype, copy=False) for op in operands]

    # prepare cutn inputs
    device = cupy.cuda.runtime.getDevice()
    # Reuse one handle per device.  The default of ``dict.get`` is evaluated
    # eagerly, so passing ``cutensornet.create()`` there would create a new
    # handle on every call and never store it in the cache.
    handle = cutn_handle_cache.get(device)
    if handle is None:
        handle = cutensornet.create()
        cutn_handle_cache[device] = handle
    cutn_options = {
        'device_id': device,
        'handle': handle,
        'memory_limit': 4**31,  # TODO(leofang): fix?
    }

    # TODO(leofang): support all valid combinations:
    # - path from user, contract with cutn (done)
    # - path from cupy, contract with cutn (not yet)
    # - path from cutn, contract with cutn (done)
    # - path from cutn, contract with cupy (not yet)
    raise_warning = False
    if path is False:
        # following the same convention (contracting from the right) as would
        # be produced by _iter_path_pairs(), but converting to a list of pairs
        # due to cuTensorNet's requirement
        path = [(i - 1, i - 2) for i in range(len(operands), 1, -1)]
    elif len(path) and path[0] == 'einsum_path':
        # let cuTensorNet check if the format is correct
        path = path[1:]
    elif len(path) == 2:
        if isinstance(path[1], (int, float)):
            raise_warning = True
        if path[0] != 'cutensornet':
            raise_warning = True
        path = None
    else:  # path is a string
        if path != 'cutensornet':
            raise_warning = True
        path = None
    if raise_warning:
        warnings.warn(
            'the cuTensorNet backend ignores the "optimize" option '
            'except when an explicit contraction path is provided '
            'or when optimize=False (disable optimization); also, '
            'the maximum intermediate size, if set, is ignored',
            stacklevel=2)
    cutn_optimizer = {'path': path} if path else None

    if len(args) == 2:
        # subscripts string followed by the operands
        out = cutensornet.contract(
            args[0], *operands, options=cutn_options,
            optimize=cutn_optimizer)
    elif len(args) == 3:
        # interleaved format: operand, sublist, operand, sublist, ...[, out]
        inputs = [i for pair in zip(operands, args[0]) for i in pair]
        if args[2] is not None:
            inputs.append(args[2])
        out = cutensornet.contract(
            *inputs, options=cutn_options, optimize=cutn_optimizer)
    else:
        assert False

    return out
def histogram(x, bins=10, range=None, weights=None, density=False):
    """Computes the histogram of a set of data.

    Args:
        x (cupy.ndarray): Input array.
        bins (int or cupy.ndarray): If ``bins`` is an int, it represents the
            number of bins. If ``bins`` is an :class:`~cupy.ndarray`, it
            represents the bin edges.
        range (2-tuple of float, optional): The lower and upper range of the
            bins.  If not provided, range is simply ``(x.min(), x.max())``.
            Values outside the range are ignored. The first element of the
            range must be less than or equal to the second. `range` affects
            the automatic bin computation as well. While bin width is
            computed to be optimal based on the actual data within `range`,
            the bin count will fill the entire range including portions
            containing no data.
        weights (cupy.ndarray, optional): An array of weights, of the same
            shape as `x`.  Each value in `x` only contributes its associated
            weight towards the bin count (instead of 1).
        density (bool, optional): If False, the default, returns the number of
            samples in each bin. If True, returns the probability *density*
            function at the bin, ``bin_count / sample_count / bin_volume``.

    Returns:
        tuple: ``(hist, bin_edges)`` where ``hist`` is a :class:`cupy.ndarray`
        storing the values of the histogram, and ``bin_edges`` is a
        :class:`cupy.ndarray` storing the bin edges.

    Raises:
        ValueError: If ``x`` is not a :class:`cupy.ndarray`.
        NotImplementedError: If ``x`` is complex, or if ``weights`` has a
            dtype that can be cast neither to float nor to complex.

    .. warning::

        This function may synchronize the device.

    .. seealso:: :func:`numpy.histogram`
    """
    # Check the container type before touching ``x.dtype`` so that a
    # non-ndarray input gets the intended ValueError rather than an
    # AttributeError.
    if not isinstance(x, cupy.ndarray):
        raise ValueError('x must be a cupy.ndarray')

    if x.dtype.kind == 'c':
        # TODO(unno): comparison between complex numbers is not implemented
        raise NotImplementedError('complex number is not supported')

    x, weights = _ravel_and_check_weights(x, weights)
    bin_edges = _get_bin_edges(x, bins, range)

    if weights is None:
        y = cupy.zeros(bin_edges.size - 1, dtype=cupy.int64)
        for accelerator in _accelerator.get_routine_accelerators():
            # CUB uses int for bin counts
            # TODO(leofang): support >= 2^31 elements in x?
            if (accelerator == _accelerator.ACCELERATOR_CUB
                    and x.size <= 0x7fffffff
                    and bin_edges.size <= 0x7fffffff):
                # Need to ensure the dtype of bin_edges as it's needed for both
                # the CUB call and the correction later
                assert isinstance(bin_edges, cupy.ndarray)
                if numpy.issubdtype(x.dtype, numpy.integer):
                    bin_type = float
                else:
                    bin_type = numpy.result_type(bin_edges.dtype, x.dtype)
                    if (bin_type == numpy.float16 and
                            not common._is_fp16_supported()):
                        bin_type = numpy.float32
                    x = x.astype(bin_type, copy=False)
                acc_bin_edge = bin_edges.astype(bin_type, copy=True)
                # CUB's upper bin boundary is exclusive for all bins, including
                # the last bin, so we must shift it to comply with NumPy
                if x.dtype.kind in 'ui':
                    acc_bin_edge[-1] += 1
                elif x.dtype.kind == 'f':
                    last = acc_bin_edge[-1]
                    acc_bin_edge[-1] = cupy.nextafter(last, last + 1)
                # On HIP the counts are passed to CUB as uint64 and converted
                # back to int64 afterwards.
                if runtime.is_hip:
                    y = y.astype(cupy.uint64, copy=False)
                y = cub.device_histogram(x, acc_bin_edge, y)
                if runtime.is_hip:
                    y = y.astype(cupy.int64, copy=False)
                break
        else:
            # No accelerator handled the request: use the generic kernel.
            _histogram_kernel(x, bin_edges, bin_edges.size, y)
    else:
        simple_weights = (
            cupy.can_cast(weights.dtype, cupy.float64) or
            cupy.can_cast(weights.dtype, cupy.complex128)
        )
        if not simple_weights:
            # object dtype such as Decimal are supported in NumPy, but not here
            raise NotImplementedError(
                'only weights with dtype that can be cast to float or complex '
                'are supported')
        if weights.dtype.kind == 'c':
            # Accumulate the real and imaginary parts separately.
            y = cupy.zeros(bin_edges.size - 1, dtype=cupy.complex128)
            _weighted_histogram_kernel(
                x, bin_edges, bin_edges.size, weights.real, y.real)
            _weighted_histogram_kernel(
                x, bin_edges, bin_edges.size, weights.imag, y.imag)
        else:
            if weights.dtype.kind in 'bui':
                y = cupy.zeros(bin_edges.size - 1, dtype=int)
            else:
                y = cupy.zeros(bin_edges.size - 1, dtype=cupy.float64)
            _weighted_histogram_kernel(
                x, bin_edges, bin_edges.size, weights, y)

    if density:
        db = cupy.array(cupy.diff(bin_edges), cupy.float64)
        return y / db / y.sum(), bin_edges
    return y, bin_edges
def setUp(self):
    """Save the active accelerators and disable all of them.

    Both the routine and the reduction accelerators are stored on ``self``
    (presumably restored by a teardown that is not visible here) and then
    set to empty lists.
    """
    self.old_accelerators = _acc.get_routine_accelerators()
    _acc.set_routine_accelerators([])
    # also avoid fallback to CUB via the general reduction kernel
    self.old_reduction_accelerators = _acc.get_reduction_accelerators()
    _acc.set_reduction_accelerators([])