def elemwise_layouts_mixed(shape, offseted_outer, offseted_inner, sliced, order):
    """Check ``a + b`` (array plus scalar) through every kernel call path.

    A float32 array with the requested layout options is produced by
    ``gen_gpuarray`` (presumably a host array and its GPU counterpart --
    confirm against the helper), a float32 scalar ``2.0`` is added to it
    by an ``ElemwiseKernel``, and the device result of each call path is
    compared against the host-side sum with ``numpy.allclose``.
    """
    host_a, dev_a = gen_gpuarray(shape, dtype='float32', sliced=sliced,
                                 order=order,
                                 offseted_outer=offseted_outer,
                                 offseted_inner=offseted_inner,
                                 ctx=context)
    addend = numpy.asarray(2.0, dtype='float32')
    kern = ElemwiseKernel(context, "float *a, float b, float *c",
                          "c[i] = a[i] + b")
    reference = host_a + addend

    # The plain call comes first (it will use contig or basic); the
    # basic, dimspec and specialized paths are then exercised explicitly.
    call_paths = (kern, kern.call_basic, kern.call_dimspec,
                  kern.call_specialized)
    for launch in call_paths:
        # A fresh output buffer per path so a stale result cannot pass.
        result = gpuarray.empty(shape, dtype='float32', context=context)
        launch(dev_a, addend, result)
        assert numpy.allclose(numpy.asarray(result), reference)
def generate_kernel(self, node, nodename):
    """Build the ElemwiseKernel applying ``self.scalar_op`` per element.

    Kernel arguments are named ``i<n>`` for the node inputs and ``o<n>``
    for the node outputs.  An output whose index appears in
    ``self.inplace_pattern`` gets no argument of its own; the scalar code
    writes it into the input the pattern maps it to.

    :param node: apply node whose inputs/outputs define the arguments.
    :param nodename: name handed to the scalar op's code generators.
    :return: an ``ElemwiseKernel`` (created with ``None`` as its first
        argument) whose preamble carries the accepted support code plus
        ``npy_*`` -> ``ga_*`` type aliases.
    :raises SupportCodeError: if the scalar op returns per-apply support
        code, or global support code other than the ``THEANO_MACRO_MOD``
        macro.
    """
    inps = [make_argument(i, 'i%d' % (n,))
            for n, i in enumerate(node.inputs)]
    scal_ins = [scalar.Scalar(i.dtype) for i in node.inputs]

    outs = [make_argument(o, 'o%d' % (n,))
            for n, o in enumerate(node.outputs)
            if n not in self.inplace_pattern]
    scal_out = [scalar.Scalar(o.dtype) for o in node.outputs]

    # Scalar-level stand-in node used only to ask the scalar op for code.
    fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
                      [o() for o in scal_out])

    try:
        code = self.scalar_op.c_support_code_apply(fake_node, nodename)
        if code:
            raise SupportCodeError(code)
    except MethodNotDefined:
        pass

    support_code = ""
    try:
        support_code = self.scalar_op.c_support_code()
    except MethodNotDefined:
        pass

    # The macro is fine, the C++ struct is not.
    if support_code.strip() not in (
            "", "#define THEANO_MACRO_MOD(x,y) (x % y)"):
        raise SupportCodeError(support_code)

    # Expression each scalar output is written to: the aliased input for
    # in-place outputs, otherwise the next dedicated output argument.
    own_outs = iter(outs)
    scal_out = []
    for n in range(len(fake_node.outputs)):
        if n in self.inplace_pattern:
            target = inps[self.inplace_pattern[n]]
        else:
            target = next(own_outs)
        scal_out.append(target.name + '[i]')

    kop = self.scalar_op.c_code(fake_node, nodename + '_scalar',
                                [i.name + '[i]' for i in inps],
                                scal_out,
                                dict(fail='return;'))

    # Translate types for scalar composite ops (except complex).
    # Every directive is emitted on its own line: preprocessor directives
    # are line-based, so they must never share a line.
    type_aliases = (
        ('float64', 'double'),
        ('float32', 'float'),
        ('uint8', 'ubyte'),
        ('int8', 'byte'),
        ('uint16', 'ushort'),
        ('int16', 'short'),
        ('uint32', 'uint'),
        ('int32', 'int'),
        ('uint64', 'ulong'),
        ('int64', 'long'),
    )
    support_code += "\n" + "".join(
        "#define npy_%s ga_%s\n" % alias for alias in type_aliases)

    return ElemwiseKernel(None, inps + outs, kop, preamble=support_code)
def generate_kernel(self, node, nodename):
    """Build the ElemwiseKernel applying ``self.scalar_op`` per element.

    Kernel arguments are named ``i<n>`` for the node inputs and ``o<n>``
    for the node outputs.  An output whose index appears in
    ``self.inplace_pattern`` gets no argument of its own; the scalar code
    writes it into the input the pattern maps it to.

    :param node: apply node whose inputs/outputs define the arguments.
    :param nodename: prefix for the name given to the scalar op's
        ``c_code`` (``nodename + '_scalar'``).
    :return: an ``ElemwiseKernel`` (created with ``None`` as its first
        argument) whose preamble declares the fixed-width integer types,
        the ``ga_*`` aliases, and includes the Python/NumPy headers.
    """
    inps = [make_argument(i, 'i%d' % (n,))
            for n, i in enumerate(node.inputs)]
    scal_ins = [scalar.Scalar(i.dtype) for i in node.inputs]

    outs = [make_argument(o, 'o%d' % (n,))
            for n, o in enumerate(node.outputs)
            if n not in self.inplace_pattern]
    scal_out = [scalar.Scalar(o.dtype) for o in node.outputs]

    # Scalar-level stand-in node used only to ask the scalar op for code.
    fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
                      [o() for o in scal_out])

    # Expression each scalar output is written to: the aliased input for
    # in-place outputs, otherwise the next dedicated output argument.
    own_outs = iter(outs)
    scal_out = []
    for n in range(len(node.outputs)):
        if n in self.inplace_pattern:
            target = inps[self.inplace_pattern[n]]
        else:
            target = next(own_outs)
        scal_out.append(target.name + '[i]')

    kop = self.scalar_op.c_code(fake_node, nodename + '_scalar',
                                [i.name + '[i]' for i in inps],
                                scal_out,
                                dict(fail='return;'))

    # Translate types for scalar composite ops (except complex).
    # The MSVC branch uses typedefs: a ``#define signed __int8 int8_t``
    # would turn the keyword ``signed`` into a macro and never declare
    # ``int8_t``.  Lines are joined explicitly because preprocessor
    # directives must each sit on their own line.
    preamble_lines = (
        "",
        "#ifdef _MSC_VER",
        "typedef signed __int8 int8_t;",
        "typedef unsigned __int8 uint8_t;",
        "typedef signed __int16 int16_t;",
        "typedef unsigned __int16 uint16_t;",
        "typedef signed __int32 int32_t;",
        "typedef unsigned __int32 uint32_t;",
        "typedef signed __int64 int64_t;",
        "typedef unsigned __int64 uint64_t;",
        "#else",
        "#include <stdint.h>",
        "#endif",
        "#define ga_bool uint8_t",
        "#define ga_byte int8_t",
        "#define ga_ubyte uint8_t",
        "#define ga_short int16_t",
        "#define ga_ushort uint16_t",
        "#define ga_int int32_t",
        "#define ga_uint uint32_t",
        "#define ga_long int64_t",
        "#define ga_ulong uint64_t",
        "#define ga_float float",
        "#define ga_double double",
        "#define ga_half uint16_t",
        "#include <Python.h>",
        "#include <numpy/npy_common.h>",
        "",
    )
    support_code = "\n".join(preamble_lines)

    return ElemwiseKernel(None, inps + outs, kop, preamble=support_code)
def elemwise_collapse(dtype1, dtype2, shape1, shape2, expected):
    """Check dimension collapsing for an element-wise add of two arrays.

    Two arrays of the given dtypes and (equal-rank) shapes are generated,
    ``check_args`` is asked to collapse/broadcast them and must report
    ``expected`` dimensions, and an ``ElemwiseKernel`` sum -- with and
    without an extra int8 scalar operand, with and without collapsing --
    must match the host-side result.  For rank-4 shapes whose first
    dimension is not 1, a ``sliced=2`` variant of the first operand is
    additionally run through ``check_args``.
    """
    assert len(shape1) == len(shape2)

    # int8 does not cause problematic upcasts
    scal = numpy.asarray(1, dtype='int8')

    a_cpu, a_gpu = gen_gpuarray(shape1, dtype1, ctx=context)
    b_cpu, b_gpu = gen_gpuarray(shape2, dtype2, ctx=context)

    # Output takes the larger extent of the two operands on every axis.
    out_shape = [max(shape1[axis], shape2[axis])
                 for axis in range(len(shape1))]
    o = gpuarray.empty(out_shape, dtype=(a_cpu + b_cpu).dtype,
                       context=context)

    # --- array + array ---
    n, nd, dims, strs, offsets, contig = check_args(
        (a_gpu, b_gpu), collapse=True, broadcast=True)
    assert nd == expected, (shape1, shape2, dims, nd, expected)

    out_cpu = a_cpu + b_cpu
    k = ElemwiseKernel(context,
                       [ArrayArg(numpy.dtype(dtype1), 'a'),
                        ArrayArg(numpy.dtype(dtype2), 'b'),
                        ArrayArg(o.dtype, 'o')],
                       "o[i] = a[i] + b[i]")
    for do_collapse in (True, False):
        k(a_gpu, b_gpu, o, collapse=do_collapse, broadcast=True)
        assert numpy.allclose(numpy.asarray(o), out_cpu)

    broadcast = any(extent == 1 for extent in shape1 + shape2)

    # --- array + array + scalar ---
    n, nd, dims, strs, offsets, contig = check_args(
        (a_gpu, b_gpu, scal), collapse=True, broadcast=True)
    assert nd == expected

    out_cpu = a_cpu + b_cpu + scal
    k = ElemwiseKernel(context,
                       [ArrayArg(numpy.dtype(dtype1), 'a'),
                        ArrayArg(numpy.dtype(dtype2), 'b'),
                        ScalarArg(scal.dtype, 's'),
                        ArrayArg(o.dtype, 'o')],
                       "o[i] = a[i] + b[i] + s")
    for do_collapse in (True, False):
        k(a_gpu, b_gpu, scal, o, collapse=do_collapse, broadcast=True)
        assert numpy.allclose(numpy.asarray(o), out_cpu)

    # --- sliced first operand (rank-4, non-unit leading dimension) ---
    if len(shape1) == 4 and shape1[0] != 1:
        # With the sliced operand, a case that otherwise collapses to one
        # dimension is expected to need two.
        expected2 = 2 if expected == 1 else expected

        c_cpu, c_gpu = gen_gpuarray(shape1, dtype=dtype1, sliced=2,
                                    ctx=context)
        n, nd, dims, strs, offsets, contig = check_args(
            (c_gpu, b_gpu), collapse=True, broadcast=True)
        if broadcast:
            assert nd >= expected
        else:
            assert nd == expected2
def make_node(self, *inputs):
    """Create the Apply node for this op and attach its kernel.

    Every input is converted with ``as_gpuarray_variable``; all converted
    inputs must have the same number of dimensions.  An output dimension
    is broadcastable only if it is broadcastable in every input.  The
    scalar op's C code and the ``ElemwiseKernel`` built from it are
    stored on the node as ``tag.kcode`` and ``tag.kernel``.

    :param inputs: the operands; ``self.nin`` of them when ``self.nin``
        is positive.
    :return: the new ``Apply`` node.
    :raises TypeError: on a wrong argument count or mismatched ranks.
    :raises SupportCodeError: if the scalar op returns per-apply support
        code, or global support code other than the ``THEANO_MACRO_MOD``
        macro.
    """
    _inputs = [as_gpuarray_variable(i) for i in inputs]
    if self.nin > 0 and len(_inputs) != self.nin:
        raise TypeError("Wrong argument count", (self.nin, len(_inputs)))

    # Compare against the *converted* first input: a raw argument is not
    # guaranteed to carry a ``.type``.
    ndim = _inputs[0].type.ndim
    for i in _inputs[1:]:
        if i.type.ndim != ndim:
            raise TypeError('mismatched rank amongst inputs')

    broadcastable = [all(i.type.broadcastable[d] for i in _inputs)
                     for d in range(ndim)]

    assert self.nout > 0

    # Arguments and scalar types are derived from the converted inputs so
    # that non-variable arguments are handled as well.
    inps = [make_argument(i, 'i%d' % (n,))
            for n, i in enumerate(_inputs)]
    scal_ins = [scalar.Scalar(i.dtype) for i in _inputs]

    res = Apply(self, _inputs,
                [GpuArrayType(o.dtype, broadcastable)()
                 for o in self.scalar_op.output_types(scal_ins)])

    outs = [make_argument(o, 'o%d' % (n,))
            for n, o in enumerate(res.outputs)]
    scal_out = [scalar.Scalar(o.dtype) for o in res.outputs]

    # Scalar-level stand-in node used only to ask the scalar op for code.
    fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
                      [o() for o in scal_out])

    kcode = self.scalar_op.c_code(fake_node, 'kcode',
                                  [i.expr() for i in inps],
                                  [o.expr() for o in outs],
                                  sub=dict(fail='return;'))
    res.tag.kcode = kcode

    try:
        code = self.scalar_op.c_support_code_apply(fake_node, 'kcode')
        if code:
            raise SupportCodeError(code)
    except MethodNotDefined:
        pass

    support_code = ""
    try:
        support_code += self.scalar_op.c_support_code()
    except MethodNotDefined:
        pass

    # Avoid the C++ complex struct.  Empty support code (the scalar op
    # defines none) is as acceptable as the plain macro.
    if support_code.strip() not in (
            "", "#define THEANO_MACRO_MOD(x,y) (x % y)"):
        raise SupportCodeError(support_code)

    k = ElemwiseKernel(None, inps + outs, kcode, preamble=support_code)
    res.tag.kernel = k

    return res