def filter(self, data, strict=False, allow_downcast=None):
    """Validate ``data`` against this type and return it as a GPU array.

    Parameters
    ----------
    data
        Candidate value.
    strict : bool
        When true, ``data`` must already be a ``gpuarray.GpuArray`` whose
        typecode equals ``self.typecode``; no conversion is attempted.
    allow_downcast : bool or None
        When true (and ``strict`` is false), ``data`` is converted to
        ``self.typecode`` without any precision check.  Otherwise the
        conversion is only done when upcasting ``self.dtype`` with the
        dtype of ``data`` yields ``self.dtype``.

    Returns
    -------
    The validated (possibly converted) array.

    Raises
    ------
    TypeError
        Wrong container type or typecode in strict mode, a dtype that
        cannot be stored without risking precision loss, a wrong number
        of dimensions, or a non-unit length on a broadcastable dimension.
    """
    if strict:
        if not isinstance(data, gpuarray.GpuArray):
            raise TypeError("%s expected a GpuArray object." % self,
                            data, type(data))
        if self.typecode != data.typecode:
            raise TypeError("%s expected typecode %d (dtype %s), "
                            "got %d (dtype %s)." %
                            (self, self.typecode, self.dtype,
                             data.typecode, str(data.dtype)))
        # fallthrough to ndim check
    elif allow_downcast:
        data = gpuarray.array(data, dtype=self.typecode, copy=False,
                              ndmin=len(self.broadcastable))
    else:
        if not hasattr(data, 'dtype'):
            # Lists and Python scalars carry no dtype; convert them first
            # so the precision check below has something to inspect
            # instead of failing with AttributeError.
            data = gpuarray.array(data, copy=False)
        up_dtype = scalar.upcast(self.dtype, data.dtype)
        if up_dtype == self.dtype:
            data = gpuarray.array(data, dtype=self.dtype, copy=False)
        else:
            raise TypeError("%s cannot store a value of dtype %s "
                            "without risking loss of precision." %
                            (self, data.dtype))

    if self.ndim != data.ndim:
        raise TypeError("Wrong number of dimensions: expected %s, "
                        "got %s with shape %s." % (self.ndim, data.ndim,
                                                   data.shape), data)
    shp = data.shape
    for i, b in enumerate(self.broadcastable):
        if b and shp[i] != 1:
            raise TypeError("Non-unit value on shape on a broadcastable"
                            " dimension.", shp, self.broadcastable)
    return data
def filter(self, data, strict=False, allow_downcast=None):
    """Validate ``data`` against this type and return it as a GPU array.

    A ``gpuarray.GpuArray`` whose typecode already equals
    ``self.typecode`` skips every conversion branch and goes straight to
    the dimension checks.

    Parameters
    ----------
    data
        Candidate value.
    strict : bool
        When true, no conversion is attempted; anything that is not a
        ``GpuArray`` of the right typecode is rejected.
    allow_downcast : bool or None
        When true, convert to ``self.typecode`` unconditionally.  When
        ``None``, a plain Python ``float`` is also converted
        unconditionally if ``self.dtype`` equals ``config.floatX``.

    Returns
    -------
    The validated (possibly converted) array.

    Raises
    ------
    TypeError
        Wrong container or typecode in strict mode, a dtype that cannot
        be stored without risking precision loss, a wrong number of
        dimensions, or a non-unit length on a broadcastable dimension.
    """
    if (isinstance(data, gpuarray.GpuArray) and
            data.typecode == self.typecode):
        # This is just to make this condition not enter the
        # following branches
        pass
    elif strict:
        if not isinstance(data, gpuarray.GpuArray):
            raise TypeError("%s expected a GpuArray object." % self,
                            data, type(data))
        if self.typecode != data.typecode:
            raise TypeError("%s expected typecode %d (dtype %s), "
                            "got %d (dtype %s)." %
                            (self, self.typecode, self.dtype,
                             data.typecode, str(data.dtype)))
        # NOTE(review): getting here requires a GpuArray with a matching
        # typecode, which the first branch already intercepted, so this
        # context check never runs as written.
        if self.context != data.context:
            raise TypeError("data context does not match type context")
        # fallthrough to ndim check
    elif (allow_downcast or
          (allow_downcast is None and
           type(data) == float and
           self.dtype == config.floatX)):
        data = gpuarray.array(data, dtype=self.typecode, copy=False,
                              ndmin=len(self.broadcastable),
                              context=self.context)
    else:
        if not hasattr(data, 'dtype'):
            converted_data = theano._asarray(data, self.dtype)
            # We use the `values_eq` static function from TensorType
            # to handle NaN values.
            if TensorType.values_eq(numpy.asarray(data),
                                    converted_data,
                                    force_same_dtype=False):
                data = converted_data
            # Upload whichever representation was kept, so that ``data``
            # always has a dtype for the precision check below; a lossy
            # conversion then surfaces as TypeError, not AttributeError.
            data = gpuarray.array(data, context=self.context)

        up_dtype = scalar.upcast(self.dtype, data.dtype)
        if up_dtype == self.dtype:
            data = gpuarray.array(data, dtype=self.dtype, copy=False,
                                  context=self.context)
        else:
            raise TypeError("%s cannot store a value of dtype %s "
                            "without risking loss of precision." %
                            (self, data.dtype))

    if self.ndim != data.ndim:
        raise TypeError(
            "Wrong number of dimensions: expected %s, "
            "got %s with shape %s." % (self.ndim, data.ndim, data.shape),
            data)
    shp = data.shape
    for i, b in enumerate(self.broadcastable):
        if b and shp[i] != 1:
            raise TypeError(
                "Non-unit value on shape on a broadcastable"
                " dimension.", shp, self.broadcastable)
    return data
def filter(self, data, strict=False, allow_downcast=None):
    """Return ``data`` as a GPU array acceptable for this type.

    The value is converted only when needed: a ``gpuarray.GpuArray``
    that already has ``self.typecode`` is passed straight to the
    dimension checks.  With ``strict`` nothing is converted.  With
    ``allow_downcast`` true (or ``None`` for a plain Python ``float``
    when ``self.dtype`` is ``config.floatX``) the value is converted
    unconditionally.  Otherwise it is converted only if upcasting
    ``self.dtype`` with the value's dtype yields ``self.dtype``.

    Raises ``TypeError`` on a wrong container/typecode in strict mode,
    on a dtype that risks precision loss, on a wrong number of
    dimensions and on a non-unit broadcastable dimension.
    """
    matches_already = (isinstance(data, gpuarray.GpuArray) and
                       data.typecode == self.typecode)

    if not matches_already:
        if strict:
            if not isinstance(data, gpuarray.GpuArray):
                raise TypeError("%s expected a GpuArray object." % self,
                                data, type(data))
            if self.typecode != data.typecode:
                raise TypeError("%s expected typecode %d (dtype %s), "
                                "got %d (dtype %s)." %
                                (self, self.typecode, self.dtype,
                                 data.typecode, str(data.dtype)))
            # NOTE(review): unreachable as written -- a GpuArray with a
            # matching typecode never enters this branch.
            if self.context != data.context:
                raise TypeError("data context does not match type context")
        elif (allow_downcast or
              (allow_downcast is None and
               type(data) == float and
               self.dtype == config.floatX)):
            data = gpuarray.array(data, dtype=self.typecode, copy=False,
                                  ndmin=len(self.broadcastable),
                                  context=self.context)
        else:
            if not hasattr(data, 'dtype'):
                # Objects without a dtype (e.g. lists) are uploaded
                # first.  The dtype is expected to match below, and
                # copy=False avoids a second object on the GPU.
                data = gpuarray.array(data, copy=False,
                                      context=self.context)
            if scalar.upcast(self.dtype, data.dtype) != self.dtype:
                raise TypeError("%s cannot store a value of dtype %s "
                                "without risking loss of precision." %
                                (self, data.dtype))
            data = gpuarray.array(data, dtype=self.dtype, copy=False,
                                  context=self.context)

    if self.ndim != data.ndim:
        raise TypeError(
            "Wrong number of dimensions: expected %s, "
            "got %s with shape %s." % (self.ndim, data.ndim, data.shape),
            data)

    shp = data.shape
    if any(is_bcast and shp[axis] != 1
           for axis, is_bcast in enumerate(self.broadcastable)):
        raise TypeError(
            "Non-unit value on shape on a broadcastable dimension.",
            shp, self.broadcastable)
    return data
def filter(self, data, strict=False, allow_downcast=None):
    """Validate ``data`` against this type and return it as a GPU array.

    A ``gpuarray.GpuArray`` whose typecode already equals
    ``self.typecode`` skips every conversion branch.

    Parameters
    ----------
    data
        Candidate value.
    strict : bool
        When true, no conversion is attempted.
    allow_downcast : bool or None
        When true, convert to ``self.typecode`` unconditionally.  When
        ``None``, a plain Python ``float`` is also converted
        unconditionally if ``self.dtype`` equals ``config.floatX``.

    Returns
    -------
    The validated (possibly converted) array.

    Raises
    ------
    TypeError
        Wrong container or typecode in strict mode, a dtype that cannot
        be stored without risking precision loss, a wrong number of
        dimensions, or a non-unit length on a broadcastable dimension.
    """
    if (isinstance(data, gpuarray.GpuArray) and
            data.typecode == self.typecode):
        # This is just to make this condition not enter the
        # following branches
        pass
    elif strict:
        if not isinstance(data, gpuarray.GpuArray):
            raise TypeError("%s expected a GpuArray object." % self,
                            data, type(data))
        if self.typecode != data.typecode:
            raise TypeError("%s expected typecode %d (dtype %s), "
                            "got %d (dtype %s)." %
                            (self, self.typecode, self.dtype,
                             data.typecode, str(data.dtype)))
        # NOTE(review): only a GpuArray with a matching typecode could
        # reach this line, and the first branch already took that case,
        # so the context comparison is never executed as written.
        if self.context != data.context:
            raise TypeError("data context does not match type context")
        # fallthrough to ndim check
    elif (allow_downcast or
          (allow_downcast is None and
           type(data) == float and
           self.dtype == config.floatX)):
        data = gpuarray.array(data, dtype=self.typecode, copy=False,
                              ndmin=len(self.broadcastable),
                              context=self.context)
    else:
        if not hasattr(data, 'dtype'):
            converted_data = theano._asarray(data, self.dtype)
            # We use the `values_eq` static function from TensorType
            # to handle NaN values.
            if TensorType.values_eq(numpy.asarray(data),
                                    converted_data,
                                    force_same_dtype=False):
                data = converted_data
            # Always end up with an object that has a dtype, so a lossy
            # conversion is reported by the precision check below as a
            # TypeError rather than an AttributeError on ``data.dtype``.
            data = gpuarray.array(data, context=self.context)

        up_dtype = scalar.upcast(self.dtype, data.dtype)
        if up_dtype == self.dtype:
            data = gpuarray.array(data, dtype=self.dtype, copy=False,
                                  context=self.context)
        else:
            raise TypeError("%s cannot store a value of dtype %s "
                            "without risking loss of precision." %
                            (self, data.dtype))

    if self.ndim != data.ndim:
        raise TypeError("Wrong number of dimensions: expected %s, "
                        "got %s with shape %s." % (self.ndim, data.ndim,
                                                   data.shape), data)
    shp = data.shape
    for i, b in enumerate(self.broadcastable):
        if b and shp[i] != 1:
            raise TypeError("Non-unit value on shape on a broadcastable"
                            " dimension.", shp, self.broadcastable)
    return data
def filter(self, data, strict=False, allow_downcast=None):
    """Coerce/validate ``data`` for this GPU array type.

    * A ``gpuarray.GpuArray`` with ``self.typecode`` is accepted as is.
    * ``strict``: anything else is rejected with ``TypeError``.
    * ``allow_downcast`` true, or ``None`` together with a plain Python
      ``float`` while ``self.dtype == config.floatX``: convert
      unconditionally to ``self.typecode``.
    * otherwise: convert only when ``scalar.upcast`` of ``self.dtype``
      and the value's dtype equals ``self.dtype``, else ``TypeError``.

    Afterwards the number of dimensions and the broadcastable pattern
    are checked (``TypeError`` on mismatch) and the array is returned.
    """
    is_gpu = isinstance(data, gpuarray.GpuArray)
    if is_gpu and data.typecode == self.typecode:
        pass  # nothing to convert
    elif strict:
        if not is_gpu:
            raise TypeError("%s expected a GpuArray object." % self,
                            data, type(data))
        if self.typecode != data.typecode:
            message = ("%s expected typecode %d (dtype %s), "
                       "got %d (dtype %s)." %
                       (self, self.typecode, self.dtype,
                        data.typecode, str(data.dtype)))
            raise TypeError(message)
        # NOTE(review): not reachable as written; the first branch has
        # already handled GpuArrays whose typecode matches.
        if self.context != data.context:
            raise TypeError("data context does not match type context")
    elif (allow_downcast or
          (allow_downcast is None and
           type(data) == float and
           self.dtype == config.floatX)):
        data = gpuarray.array(data, dtype=self.typecode, copy=False,
                              ndmin=len(self.broadcastable),
                              context=self.context)
    else:
        if not hasattr(data, 'dtype'):
            # Dtype-less objects such as lists are uploaded first; the
            # dtype is expected to match below, and copy=False keeps the
            # second call from creating another object on the GPU.
            data = gpuarray.array(data, copy=False, context=self.context)
        if scalar.upcast(self.dtype, data.dtype) == self.dtype:
            data = gpuarray.array(data, dtype=self.dtype, copy=False,
                                  context=self.context)
        else:
            raise TypeError("%s cannot store a value of dtype %s "
                            "without risking loss of precision." %
                            (self, data.dtype))

    if self.ndim != data.ndim:
        raise TypeError("Wrong number of dimensions: expected %s, "
                        "got %s with shape %s." %
                        (self.ndim, data.ndim, data.shape), data)

    shp = data.shape
    for axis, must_be_one in enumerate(self.broadcastable):
        if not must_be_one:
            continue
        if shp[axis] != 1:
            raise TypeError("Non-unit value on shape on a broadcastable"
                            " dimension.", shp, self.broadcastable)
    return data
def setUp(self):
    """Create the symbolic GPU variables and the constant GPU tensor.

    ``input``, ``filters`` and ``topgrad`` each get a fresh variable from
    ``gpu_ftensor4()``; ``constant_tensor`` is a zero-filled float32
    array of shape (3, 5, 7, 11) uploaded to the test context.
    """
    self.input = gpu_ftensor4()
    self.filters = gpu_ftensor4()
    self.topgrad = gpu_ftensor4()

    host_zeros = numpy.zeros((3, 5, 7, 11), dtype='float32')
    self.constant_tensor = gpuarray.array(
        host_zeros, context=get_context(test_ctx_name))
def gen_gpuarray(
    shape_orig,
    dtype="float32",
    offseted_outer=False,
    offseted_inner=False,
    sliced=1,
    order="c",
    nozeros=False,
    incr=0,
    ctx=None,
    cls=None,
):
    """Build a random host array and a GPU copy with matching layout.

    Parameters
    ----------
    shape_orig : tuple
        Shape both returned arrays must end up with.
    dtype
        Element type of the generated data.
    offseted_outer, offseted_inner : bool
        Allocate one extra element along the first / last axis and slice
        it off again, so the results are views with an offset.
    sliced : int or bool
        Step used for a final ``[::sliced]`` on the first axis; ``True``
        means 2 and ``False`` means 1.  The allocation is enlarged by
        ``abs(sliced)`` beforehand.
    order : {"c", "f"}
        Memory order of the host array before it is uploaded.
    nozeros : bool
        Draw values from [1, 10) instead of [0, 10).
    incr
        Constant added to every generated value.
    ctx, cls
        Forwarded to ``gpuarray.array`` as ``context`` and ``cls``.

    Returns
    -------
    (a, b)
        The host array and the GPU array; both are asserted to have
        shape ``shape_orig`` and equal contents.
    """
    if sliced is True:
        sliced = 2
    elif sliced is False:
        sliced = 1

    shape = numpy.asarray(shape_orig).copy()
    # 0-d requests have no axis to enlarge, offset or slice.
    has_axes = len(shape) > 0

    if sliced != 1 and has_axes:
        shape[0] *= numpy.absolute(sliced)
    if offseted_outer and has_axes:
        shape[0] += 1
    if offseted_inner and has_axes:
        shape[-1] += 1

    low = 1.0 if nozeros else 0.0
    a = numpy.random.uniform(low, 10.0, shape)
    a += incr
    a = numpy.asarray(a, dtype=dtype)

    assert order in ["c", "f"]
    if order == "f" and has_axes:
        a = numpy.asfortranarray(a)
    b = gpuarray.array(a, context=ctx, cls=cls)
    if order == "f" and has_axes and b.size > 1:
        assert b.flags["F_CONTIGUOUS"]

    if offseted_outer and has_axes:
        b = b[1:]
        a = a[1:]
    if offseted_inner and has_axes:
        # The b[..., 1:] act as the test for this subtensor case.
        b = b[..., 1:]
        a = a[..., 1:]
    if sliced != 1 and has_axes:
        a = a[::sliced]
        b = b[::sliced]

    assert a.shape == shape_orig, (a.shape, shape_orig)
    assert b.shape == shape_orig, (b.shape, shape_orig)
    assert numpy.allclose(a, numpy.asarray(b)), (a, numpy.asarray(b))

    return a, b
def rand_gpuarray(*shape, **kwargs):
    """Return a GPU array of the given shape with values in [-1, 1).

    Parameters
    ----------
    *shape : int
        Dimensions, forwarded to ``rng.rand``.
    dtype : keyword, optional
        Element type of the result; defaults to ``theano.config.floatX``.
    cls : keyword, optional
        Forwarded to ``gpuarray.array``.

    Raises
    ------
    TypeError
        If any keyword other than ``dtype`` or ``cls`` is given.
    """
    r = rng.rand(*shape) * 2 - 1
    dtype = kwargs.pop("dtype", theano.config.floatX)
    cls = kwargs.pop("cls", None)
    if kwargs:
        # Interpolate the name into the message; previously the format
        # string and the name were passed as two separate exception args.
        raise TypeError("Unexpected argument %s" % next(iter(kwargs)))
    return gpuarray.array(r, dtype=dtype, cls=cls,
                          context=get_context(test_ctx_name))
def gen_gpuarray(shape_orig, dtype='float32', offseted_outer=False,
                 offseted_inner=False, sliced=1, order='c', nozeros=False,
                 incr=0, ctx=None, cls=None):
    """Build a random host array and a GPU copy with matching layout.

    Parameters
    ----------
    shape_orig : tuple
        Shape both returned arrays must end up with.
    dtype
        Element type of the generated data.
    offseted_outer, offseted_inner : bool
        Allocate one extra element along the first / last axis and slice
        it off again, so the results are views with an offset.
    sliced : int or bool
        Step used for a final ``[::sliced]`` on the first axis; ``True``
        means 2 and ``False`` means 1.  The allocation is enlarged by
        ``abs(sliced)`` beforehand.
    order : {'c', 'f'}
        With ``'f'`` both arrays are converted to Fortran order after
        the upload.
    nozeros : bool
        Draw values from [1, 10) instead of [0, 10).
    incr
        Constant added to every generated value.
    ctx, cls
        Forwarded to ``gpuarray.array`` as ``context`` and ``cls``.

    Returns
    -------
    (a, b)
        The host array and the GPU array; both are asserted to have
        shape ``shape_orig`` and equal contents.
    """
    if sliced is True:
        sliced = 2
    elif sliced is False:
        sliced = 1

    shape = numpy.asarray(shape_orig).copy()
    # 0-d requests have no axis to enlarge, offset or slice.
    has_axes = len(shape) > 0

    if sliced != 1 and has_axes:
        shape[0] *= numpy.absolute(sliced)
    if offseted_outer and has_axes:
        shape[0] += 1
    if offseted_inner and has_axes:
        shape[-1] += 1

    low = 1.0 if nozeros else 0.0
    a = numpy.random.uniform(low, 10.0, shape)
    a += incr
    a = numpy.asarray(a, dtype=dtype)
    b = gpuarray.array(a, context=ctx, cls=cls)

    assert order in ['c', 'f']
    if order == 'f' and has_axes:
        a = numpy.asfortranarray(a)
        b = gpuarray.asfortranarray(b)
    if order == 'f' and has_axes and b.size > 1:
        assert b.flags['F_CONTIGUOUS']

    if offseted_outer and has_axes:
        b = b[1:]
        a = a[1:]
    if offseted_inner and has_axes:
        # The b[..., 1:] act as the test for this subtensor case.
        b = b[..., 1:]
        a = a[..., 1:]
    if sliced != 1 and has_axes:
        a = a[::sliced]
        b = b[::sliced]

    assert a.shape == shape_orig, (a.shape, shape_orig)
    assert b.shape == shape_orig, (b.shape, shape_orig)
    assert numpy.allclose(a, numpy.asarray(b)), (a, numpy.asarray(b))

    return a, b
def rand_gpuarray(*shape, **kwargs):
    """Return a GPU array of the given shape with values in [-1, 1).

    Parameters
    ----------
    *shape : int
        Dimensions, forwarded to ``rng.rand``.
    dtype : keyword, optional
        Element type of the result; defaults to ``theano.config.floatX``.
    cls : keyword, optional
        Forwarded to ``gpuarray.array``.

    Raises
    ------
    TypeError
        If any keyword other than ``dtype`` or ``cls`` is given.
    """
    r = rng.rand(*shape) * 2 - 1
    dtype = kwargs.pop('dtype', theano.config.floatX)
    cls = kwargs.pop('cls', None)
    if kwargs:
        # The offending name is now interpolated into the message instead
        # of being passed next to an unformatted '%s' template.
        raise TypeError('Unexpected argument %s' % next(iter(kwargs)))
    return gpuarray.array(r, dtype=dtype, cls=cls,
                          context=get_context(test_ctx_name))
def transfer_not_contiguous(shp, dtype):
    """Round-trip a reversed (negative-stride) host array through the GPU.

    A random array of shape ``shp`` is reversed along its first axis,
    uploaded with ``gpu_ndarray.array`` and read back with
    ``numpy.asarray``; values, shapes, dtypes and strides are compared.

    NOTE(review): ``dtype`` is accepted but never used here -- the data
    keeps whatever dtype ``numpy.random.rand`` produces.
    """
    host = (numpy.random.rand(*shp) * 10)[::-1]
    device = gpu_ndarray.array(host, context=ctx)
    back = numpy.asarray(device)

    assert numpy.allclose(back, host)
    assert host.shape == device.shape == back.shape

    # The array read back is C contiguous, so only its first stride has
    # the opposite sign of the reversed source.
    flipped_strides = (-back.strides[0], ) + back.strides[1:]
    assert host.strides == device.strides == flipped_strides

    assert host.dtype == device.dtype == back.dtype
    assert back.flags.c_contiguous
def transfer_not_contiguous(shp, dtype):
    """Check host -> GPU -> host transfer of a non-contiguous array.

    The source is a random array of shape ``shp`` reversed along axis 0.
    After uploading it and converting the GPU array back with
    ``numpy.asarray``, contents, shapes, strides and dtypes must agree.

    NOTE(review): the ``dtype`` argument is not used by this function.
    """
    src = numpy.random.rand(*shp) * 10
    src = src[::-1]
    on_gpu = gpu_ndarray.array(src, context=ctx)
    restored = numpy.asarray(on_gpu)

    assert numpy.allclose(restored, src)
    assert src.shape == on_gpu.shape == restored.shape

    # the restored array is C contiguous: same strides as the source
    # except for the sign of the leading one
    expected_strides = (-restored.strides[0],) + restored.strides[1:]
    assert src.strides == on_gpu.strides == expected_strides

    assert src.dtype == on_gpu.dtype == restored.dtype
    assert restored.flags.c_contiguous
def test_transfer_gpu_gpu():
    """``GpuToGpu`` must stay in the compiled graph and return its input.

    The function is compiled with two transfer-cutting rewrites excluded
    (presumably so the single ``GpuToGpu`` node is not optimised away);
    the graph must then hold exactly that one node and the output must
    equal the input.
    """
    gpu_type = GpuArrayType(dtype='float32', broadcastable=(False, False),
                            context_name=test_ctx_name)
    g = gpu_type()

    host_vals = np.asarray(rng.rand(5, 4), dtype='float32')
    dev_vals = gpuarray.array(host_vals, context=get_context(test_ctx_name))

    mode = mode_with_gpu.excluding('cut_gpua_host_transfers',
                                   'local_cut_gpua_host_gpua')
    fn = theano.function([g], GpuToGpu(test_ctx_name)(g), mode=mode)

    nodes = fn.maker.fgraph.toposort()
    assert len(nodes) == 1
    assert isinstance(nodes[0].op, GpuToGpu)

    result = fn(dev_vals)
    assert GpuArrayType.values_eq(result, dev_vals)
def test_transfer_cpu_gpu():
    """Host -> GPU and GPU -> host transfers must preserve the values."""
    host_var = T.fmatrix('a')
    gpu_var = GpuArrayType(dtype='float32',
                           broadcastable=(False, False))('g')

    host_vals = np.asarray(rng.rand(5, 4), dtype='float32')
    dev_vals = gpuarray.array(host_vals, context=get_context(test_ctx_name))

    # Upload: the result must equal the array uploaded directly.
    to_gpu = theano.function([host_var], GpuFromHost(test_ctx_name)(host_var))
    uploaded = to_gpu(host_vals)
    assert GpuArrayType.values_eq(uploaded, dev_vals)

    # Download: the result must equal the original host array.
    to_host = theano.function([gpu_var], host_from_gpu(gpu_var))
    downloaded = to_host(dev_vals)
    assert np.all(downloaded == host_vals)
def test_transfer_cpu_gpu():
    """Round-trip a float32 matrix between host and GPU in both directions."""
    cpu_in = T.fmatrix('a')
    gpu_in = GpuArrayType(dtype='float32',
                          broadcastable=(False, False))('g')

    cpu_data = numpy.asarray(rng.rand(5, 4), dtype='float32')
    gpu_data = gpuarray.array(cpu_data, context=get_context(test_ctx_name))

    # host -> GPU
    upload = theano.function([cpu_in], GpuFromHost(test_ctx_name)(cpu_in))
    assert GpuArrayType.values_eq(upload(cpu_data), gpu_data)

    # GPU -> host
    download = theano.function([gpu_in], host_from_gpu(gpu_in))
    assert numpy.all(download(gpu_data) == cpu_data)
def filter(self, data, strict=False, allow_downcast=None):
    """Validate ``data`` against this type and return it as a GPU array.

    Parameters
    ----------
    data
        Candidate value.
    strict : bool
        When true, ``data`` must already be a ``gpuarray.GpuArray`` whose
        typecode equals ``self.typecode``; no conversion is attempted.
    allow_downcast : bool or None
        When true (and ``strict`` is false), ``data`` is converted to
        ``self.typecode`` without any precision check.  Otherwise the
        conversion is only done when upcasting ``self.dtype`` with the
        dtype of ``data`` yields ``self.dtype``.

    Returns
    -------
    The validated (possibly converted) array.

    Raises
    ------
    TypeError
        Wrong container type or typecode in strict mode, a dtype that
        cannot be stored without risking precision loss, a wrong number
        of dimensions, or a non-unit length on a broadcastable dimension.
    """
    if strict:
        if not isinstance(data, gpuarray.GpuArray):
            raise TypeError("%s expected a GpuArray object." % self,
                            data, type(data))
        if self.typecode != data.typecode:
            raise TypeError("%s expected typecode %d (dtype %s), "
                            "got %d (dtype %s)." %
                            (self, self.typecode, self.dtype,
                             data.typecode, str(data.dtype)))
        # fallthrough to ndim check
    elif allow_downcast:
        data = gpuarray.array(data, dtype=self.typecode, copy=False,
                              ndmin=len(self.broadcastable))
    else:
        if not hasattr(data, 'dtype'):
            # Values without a dtype (lists, Python scalars) are
            # converted first; otherwise ``data.dtype`` below would raise
            # AttributeError instead of running the precision check.
            data = gpuarray.array(data, copy=False)
        up_dtype = scalar.upcast(self.dtype, data.dtype)
        if up_dtype == self.dtype:
            data = gpuarray.array(data, dtype=self.typecode, copy=False)
        else:
            raise TypeError("%s cannot store a value of dtype %s "
                            "without risking loss of precision." %
                            (self, data.dtype))

    if self.ndim != data.ndim:
        raise TypeError(
            "Wrong number of dimensions: expected %s, "
            "got %s with shape %s." % (self.ndim, data.ndim, data.shape),
            data)
    shp = data.shape
    for i, b in enumerate(self.broadcastable):
        if b and shp[i] != 1:
            raise TypeError(
                "Non-unit value on shape on a broadcastable"
                " dimension.", shp, self.broadcastable)
    return data
def transfer_fortran(shp, dtype):
    """Round-trip a Fortran-ordered host array through the GPU.

    A random array of shape ``shp`` is converted with
    ``numpy.asfortranarray``, uploaded, and read back; shapes, dtypes,
    strides, F-contiguity and values are compared.

    NOTE(review): ``dtype`` is accepted but never used in this function.
    """
    c_ordered = numpy.random.rand(*shp) * 10
    host = numpy.asfortranarray(c_ordered)
    if len(shp) > 1:
        # Make sure the conversion really changed the memory layout.
        assert host.strides != c_ordered.strides

    device = gpu_ndarray.array(host, context=ctx)
    back = numpy.asarray(device)

    assert host.shape == device.shape == back.shape
    assert host.dtype == device.dtype == back.dtype
    assert host.flags.f_contiguous
    assert back.flags.f_contiguous
    assert host.strides == device.strides == back.strides
    assert numpy.allclose(back, host)
def test_transfer_strided():
    """Transfers must also work for strided (every-other-column) views."""
    # This is just to ensure that it works in theano
    # libgpuarray has a much more comprehensive suit of tests to
    # ensure correctness
    cpu_in = T.fmatrix('a')
    gpu_in = GpuArrayType(dtype='float32',
                          broadcastable=(False, False))('g')

    full_host = numpy.asarray(rng.rand(5, 8), dtype='float32')
    full_dev = gpuarray.array(full_host, context=get_context(test_ctx_name))

    # Take the same strided view on both sides.
    cpu_data = full_host[:, ::2]
    gpu_data = full_dev[:, ::2]

    upload = theano.function([cpu_in], GpuFromHost(test_ctx_name)(cpu_in))
    assert GpuArrayType.values_eq(upload(cpu_data), gpu_data)

    download = theano.function([gpu_in], host_from_gpu(gpu_in))
    assert numpy.all(download(gpu_data) == cpu_data)
def test_transfer_strided():
    """Host <-> GPU transfers of column-strided views keep the values."""
    # This is just to ensure that it works in theano
    # libgpuarray has a much more comprehensive suit of tests to
    # ensure correctness
    host_var = T.fmatrix('a')
    gpu_var = GpuArrayType(dtype='float32',
                           broadcastable=(False, False))('g')

    base_host = np.asarray(rng.rand(5, 8), dtype='float32')
    base_dev = gpuarray.array(base_host, context=get_context(test_ctx_name))

    every_other = (slice(None), slice(None, None, 2))
    host_vals = base_host[every_other]
    dev_vals = base_dev[every_other]

    to_gpu = theano.function([host_var], GpuFromHost(test_ctx_name)(host_var))
    uploaded = to_gpu(host_vals)
    assert GpuArrayType.values_eq(uploaded, dev_vals)

    to_host = theano.function([gpu_var], host_from_gpu(gpu_var))
    downloaded = to_host(dev_vals)
    assert np.all(downloaded == host_vals)
def filter_inplace(self, data, old_data, strict=False, allow_downcast=None):
    """Validate ``data`` for this type, reusing ``old_data`` when possible.

    Host-side values are converted with NumPy first and only moved to
    the GPU at the very end: either written into ``old_data`` (when it
    exists, has the same shape and is C- or F-contiguous) or uploaded as
    a new array on ``self.context``.

    Parameters
    ----------
    data
        Candidate value.
    old_data
        Previously stored GPU array, or ``None``.
    strict : bool
        When true, no conversion is attempted; anything that is not a
        ``GpuArray`` with ``self.typecode`` is rejected.
    allow_downcast : bool or None
        When true, convert to this type's dtype unconditionally.  When
        ``None``, a plain Python ``float`` is also converted
        unconditionally if ``self.dtype`` equals ``config.floatX``.

    Returns
    -------
    A GPU array holding the validated value (``old_data`` itself when it
    was overwritten in place).

    Raises
    ------
    TypeError
        Wrong container or typecode in strict mode, a dtype that cannot
        be stored without risking precision loss, a wrong number of
        dimensions, or a non-unit length on a broadcastable dimension.
    """
    if isinstance(data, gpuarray.GpuArray) and data.typecode == self.typecode:
        # This is just to make this condition not enter the
        # following branches
        pass
    elif strict:
        if not isinstance(data, gpuarray.GpuArray):
            raise TypeError(f"{self} expected a GpuArray object.", data, type(data))
        if self.typecode != data.typecode:
            raise TypeError(
                f"{self} expected typecode {int(self.typecode)} (dtype {self.dtype}), "
                f"got {int(data.typecode)} (dtype {data.dtype}).")
        # NOTE(review): only a GpuArray with a matching typecode could
        # reach this line, and the first branch already took that case,
        # so this context check never runs as written.
        if self.context != data.context:
            raise TypeError("data context does not match type context")
        # fallthrough to ndim check
    elif allow_downcast or (allow_downcast is None
                            and type(data) == float
                            and self.dtype == config.floatX):
        if not isinstance(data, gpuarray.GpuArray):
            # np.asarray copies only when required.  ``np.array(...,
            # copy=False)`` raises under NumPy 2 whenever a copy is
            # unavoidable, which is always the case for a Python float.
            data = np.asarray(data, dtype=self.dtype)
            missing = len(self.broadcastable) - data.ndim
            if missing > 0:
                # Same effect as ``ndmin``: prepend length-1 axes.
                data = data.reshape((1,) * missing + data.shape)
        else:
            data = gpuarray.array(
                data,
                dtype=self.typecode,
                copy=False,
                ndmin=len(self.broadcastable),
                context=self.context,
            )
    else:
        if not hasattr(data, "dtype"):
            converted_data = _asarray(data, self.dtype)
            raw_data = np.asarray(data)
            # We use the `values_eq` static function from TensorType
            # to handle NaN values.
            if TensorType.values_eq(raw_data, converted_data,
                                    force_same_dtype=False):
                data = converted_data
            else:
                # Keep the naturally-typed array so that the precision
                # check below reports the problem as a TypeError instead
                # of failing with AttributeError on ``data.dtype``.
                data = raw_data

        up_dtype = scalar.upcast(self.dtype, data.dtype)
        if up_dtype == self.dtype:
            if not isinstance(data, gpuarray.GpuArray):
                data = np.asarray(data, dtype=self.dtype)
            else:
                data = gpuarray.array(data, dtype=self.dtype, copy=False)
        else:
            raise TypeError(
                f"{self} cannot store a value of dtype {data.dtype} "
                "without risking loss of precision.")

    if self.ndim != data.ndim:
        raise TypeError(
            f"Wrong number of dimensions: expected {self.ndim}, "
            f"got {data.ndim} with shape {data.shape}.",
            data,
        )
    shp = data.shape
    for i, b in enumerate(self.broadcastable):
        if b and shp[i] != 1:
            raise TypeError(
                "Non-unit value on shape on a broadcastable" " dimension.",
                shp,
                self.broadcastable,
            )

    if not isinstance(data, gpuarray.GpuArray):
        if (old_data is not None
                and old_data.shape == data.shape
                and (
                    # write() only work if the destitation is contiguous.
                    old_data.flags["C_CONTIGUOUS"]
                    or old_data.flags["F_CONTIGUOUS"])):
            old_data.write(data)
            data = old_data
        else:
            data = pygpu.array(data, context=self.context)
    return data
def thunk(): context = inputs[0][0].context # Size of the matrices to invert. z = outputs[0] # Matrix. A = inputs[0][0] # Solution vectors. b = inputs[1][0] assert (len(A.shape) == 2) assert (len(b.shape) == 2) if self.trans in ['T', 'C']: trans = 1 l, n = A.shape k, m = b.shape elif self.trans == 'N': trans = 0 n, l = A.shape k, m = b.shape else: raise ValueError('Invalid value for trans') if l != n: raise ValueError('A must be a square matrix') if n != k: raise ValueError('A and b must be aligned.') lda = max(1, n) ldb = max(1, k, m) # We copy A and b as cusolver operates inplace b = gpuarray.array(b, copy=True, order='F') if not self.inplace: A = gpuarray.array(A, copy=True) A_ptr = A.gpudata b_ptr = b.gpudata # cusolver expects a F ordered matrix, but A is not explicitly # converted between C and F order, instead we switch the # "transpose" flag. if A.flags['C_CONTIGUOUS']: trans = 1 - trans workspace_size = cusolver.cusolverDnSgetrf_bufferSize( cusolver_handle, n, n, A_ptr, lda) if (thunk.workspace is None or thunk.workspace.size != workspace_size): thunk.workspace = gpuarray.zeros((workspace_size, ), dtype='float32', context=context) if thunk.pivots is None or thunk.pivots.size != min(n, n): thunk.pivots = gpuarray.zeros((min(n, n), ), dtype='float32', context=context) if thunk.dev_info is None: thunk.dev_info = gpuarray.zeros((1, ), dtype='float32', context=context) workspace_ptr = thunk.workspace.gpudata pivots_ptr = thunk.pivots.gpudata dev_info_ptr = thunk.dev_info.gpudata cusolver.cusolverDnSgetrf(cusolver_handle, n, n, A_ptr, lda, workspace_ptr, pivots_ptr, dev_info_ptr) cusolver.cusolverDnSgetrs(cusolver_handle, trans, n, m, A_ptr, lda, pivots_ptr, b_ptr, ldb, dev_info_ptr) z[0] = b
def as_gpuarray(x):
    """Return ``x`` as a GPU array, passing ``copy=False`` to pygpu."""
    converted = gpuarray.array(x, copy=False)
    return converted
def perform(self, node, inp, out):
    """Upload the single input to the GPU and store it in the output cell.

    ``inp`` must hold exactly one value and ``out`` exactly one storage
    cell; the value is first turned into a NumPy array.
    """
    (host_value,) = inp
    (out_cell,) = out
    as_ndarray = numpy.asarray(host_value)
    out_cell[0] = gpuarray.array(as_ndarray)
def perform(self, node, inp, out):
    """Convert the single input to a GPU array and store it in ``out``.

    ``inp`` must hold exactly one value and ``out`` exactly one storage
    cell.  ``node`` is not used.
    """
    x, = inp
    z, = out
    # The former ``type = node.outputs[0].type`` local was never read
    # and shadowed the builtin, so it has been dropped.
    z[0] = gpuarray.array(x)
def perform(self, node, inp, out, ctx):
    """Convert the single input to a GPU array on ``ctx``.

    ``inp`` must hold exactly one value and ``out`` exactly one storage
    cell, which receives the new array.
    """
    (value,) = inp
    (out_cell,) = out
    on_device = gpuarray.array(value, context=ctx)
    out_cell[0] = on_device
def thunk(): context = inputs[0][0].context # Size of the matrices to invert. z = outputs[0] # Matrix. A = inputs[0][0] # Solution vectors. b = inputs[1][0] assert(len(A.shape) == 2) assert(len(b.shape) == 2) if self.trans in ['T', 'C']: trans = 1 l, n = A.shape k, m = b.shape elif self.trans == 'N': trans = 0 n, l = A.shape k, m = b.shape else: raise ValueError('Invalid value for trans') if l != n: raise ValueError('A must be a square matrix') if n != k: raise ValueError('A and b must be aligned.') lda = max(1, n) ldb = max(1, k, m) # We copy A and b as cusolver operates inplace b = gpuarray.array(b, copy=True, order='F') if not self.inplace: A = gpuarray.array(A, copy=True) A_ptr = A.gpudata b_ptr = b.gpudata # cusolver expects a F ordered matrix, but A is not explicitly # converted between C and F order, instead we switch the # "transpose" flag. if A.flags['C_CONTIGUOUS']: trans = 1 - trans workspace_size = cusolver.cusolverDnSgetrf_bufferSize( cusolver_handle, n, n, A_ptr, lda) if (thunk.workspace is None or thunk.workspace.size != workspace_size): thunk.workspace = gpuarray.zeros((workspace_size,), dtype='float32', context=context) if thunk.pivots is None or thunk.pivots.size != min(n, n): thunk.pivots = gpuarray.zeros((min(n, n),), dtype='float32', context=context) if thunk.dev_info is None: thunk.dev_info = gpuarray.zeros((1,), dtype='float32', context=context) workspace_ptr = thunk.workspace.gpudata pivots_ptr = thunk.pivots.gpudata dev_info_ptr = thunk.dev_info.gpudata cusolver.cusolverDnSgetrf( cusolver_handle, n, n, A_ptr, lda, workspace_ptr, pivots_ptr, dev_info_ptr) cusolver.cusolverDnSgetrs( cusolver_handle, trans, n, m, A_ptr, lda, pivots_ptr, b_ptr, ldb, dev_info_ptr) z[0] = b
def ufunc21(name, a, b, out=None, context=None):
    """Call a ufunc with 2 inputs and 1 output.

    NOTE(review): despite the name and signature, the body below only
    computes on ``a``: ``b`` takes part in the context/class detection
    and nowhere else, and the metadata is asserted to describe a ufunc
    with one input and one output.  Confirm whether this is intended.

    Parameters
    ----------
    name : str
        Name of the NumPy ufunc.
    a, b : `array-like`
        Input arrays to which the ufunc should be applied.
    out : `pygpu.gpuarray.GpuArray`, optional
        Array in which to store the result.
    context : `pygpu.gpuarray.GpuContext`, optional
        Use this GPU context to evaluate the GPU kernel. For ``None``,
        if no GPU array is among the provided parameters, a default
        GPU context must have been set.

    Returns
    -------
    out : `pygpu.gpuarray.GpuArray`
        Result of the computation. If ``out`` was given, the returned
        object is a reference to it.
        The type of the returned array is `pygpu._array.ndgpuarray` if

        - no GPU array was among the parameters or
        - one of the parameters had type `pygpu._array.ndgpuarray`.

    Raises
    ------
    ValueError
        If the GPU arrays among the parameters (or ``context``) do not
        all share the same context.
    TypeError
        If no valid signature is found for the input types, or for
        negation of a boolean array on NumPy >= 1.13.
    NotImplementedError
        If the ufunc metadata provides none of ``c_op``, ``c_func`` or
        ``oper_fmt``.
    """
    # Lazy import to avoid circular dependency
    from pygpu._array import ndgpuarray

    # --- Prepare input array --- #

    # Determine GPU context and class. Use the "highest" class present in the
    # inputs, defaulting to `ndgpuarray`
    need_context = True
    cls = None
    for ary in (a, b, out):
        if isinstance(ary, GpuArray):
            if context is not None and ary.context != context:
                raise ValueError('cannot mix contexts')
            context = ary.context
            # A plain `GpuArray` class is replaced by whatever class a
            # later GPU array in the tuple has.
            if cls is None or cls == GpuArray:
                cls = ary.__class__
            need_context = False

    if need_context and context is None:
        context = get_default_context()
        cls = ndgpuarray

    # Cast input to `GpuArray` of the right dtype if necessary
    # TODO: figure out what to do here exactly (scalars and such)
    if isinstance(a, (GpuArray, numpy.ndarray)):
        # Keep Fortran order only for arrays that are exclusively
        # F-contiguous.
        if a.flags.f_contiguous and not a.flags.c_contiguous:
            order = 'F'
        else:
            order = 'C'

        # Determine signature here to avoid creating an intermediate GPU array
        sig = find_smallest_valid_signature(name, (a, ), (out, ))
        if not sig:
            raise TypeError('ufunc {!r} not supported for the input types, '
                            'and the inputs could not be safely coerced'
                            ''.format(name))

        tc_in, _ = sig.split('->')
        a = array(a, dtype=tc_in, copy=False, order=order,
                  context=context, cls=cls)
    else:
        # Anything else is converted first, then the signature is looked
        # up on the converted array.
        a = array(a, context=context, cls=cls)

        sig = find_smallest_valid_signature(name, (a, ), (out, ))
        if not sig:
            raise TypeError('ufunc {!r} not supported for the input types, '
                            'and the inputs could not be safely coerced'
                            ''.format(name))

    # Upcast input if necessary
    tc_in, tc_out = sig.split('->')
    if a.dtype < tc_in:
        a = a.astype(tc_in)

    # Create output array if not provided
    if out is None:
        out = empty(a.shape, dtype=tc_out, context=context, cls=cls)

    # --- Generate code strings for GpuElemwise --- #

    # C dtypes for casting
    c_dtype_in = dtype_to_ctype(tc_in)
    c_dtype_out = dtype_to_ctype(tc_out)

    meta = ufunc_metadata[name]
    assert meta['nin'] == 1
    assert meta['nout'] == 1

    # Create `oper` string
    if meta['c_op'] is not None:
        # Case 1: unary operator
        unop = meta['c_op']
        if a.dtype == numpy.bool and unop == '-':
            if parse_version(numpy.__version__) >= parse_version('1.13'):
                # Numpy >= 1.13 raises a TypeError
                raise TypeError(
                    'negation of boolean arrays is not supported, use '
                    '`logical_not` instead')
            else:
                # Warn and remap to logical not
                warnings.warn(
                    'using negation (`-`) with boolean arrays is '
                    'deprecated, use `logical_not` (`~`) instead; '
                    'the current behavior will be changed along '
                    "with NumPy's", FutureWarning)
                unop = '!'
        oper = 'out = ({odt}) {}a'.format(unop, odt=c_dtype_out)
        preamble = ''

    elif meta['c_func'] is not None:
        # Case 2: C function
        c_func = meta['c_func']

        if name in ('abs', 'absolute'):
            # Special case
            if numpy.dtype(tc_out).kind == 'u':
                # Shortcut for abs() with unsigned int. This also fixes a CUDA
                # quirk that makes abs() crash with unsigned int input.
                out[:] = a
                return out
            elif numpy.dtype(tc_out).kind == 'f':
                c_func = 'fabs'
            else:
                c_func = 'abs'

        oper = 'out = ({odt}) {}(a)'.format(c_func, odt=c_dtype_out)
        preamble_tpl = mako.template.Template(meta['oper_preamble_tpl'])
        preamble = preamble_tpl.render(idt=c_dtype_in, odt=c_dtype_out)

    elif meta['oper_fmt'] is not None:
        # Case 3: custom implementation with `oper` template
        oper = meta['oper_fmt'].format(idt=c_dtype_in, odt=c_dtype_out)
        preamble_tpl = mako.template.Template(meta['oper_preamble_tpl'])
        preamble = preamble_tpl.render(idt=c_dtype_in, odt=c_dtype_out)

    else:
        # Case 4: not implemented
        raise NotImplementedError('ufunc {!r} not implemented'.format(name))

    # --- Generate and run GpuElemwise kernel --- #
    a_arg = as_argument(a, 'a', read=True)
    args = [arg('out', out.dtype, write=True), a_arg]

    ker = GpuElemwise(context, oper, args, preamble=preamble)
    ker(out, a)
    return out