def local_gpua_advanced_incsubtensor(node):
    # This optimization is disabled if cuda is not active
    if pygpu.get_default_context().kind != "cuda":
        return None

    x, y, ilist = node.inputs

    # Gpu Ops needs both inputs to have the same dtype
    if x.type.dtype != y.type.dtype:
        dtype = scalar.upcast(x.type.dtype, y.type.dtype)
        if x.type.dtype != dtype:
            x = tensor.cast(x, dtype)
        if y.type.dtype != dtype:
            y = tensor.cast(y, dtype)

    set_instead_of_inc = node.op.set_instead_of_inc
    active_device_no = theano.sandbox.cuda.active_device_number()
    device_properties = theano.sandbox.cuda.device_properties
    compute_capability = device_properties(active_device_no)['major']

    # The _dev20 kernel needs a device with compute capability >= 2.0
    # and matrix (2d) operands; otherwise fall back to the generic op.
    if compute_capability < 2 or x.ndim != 2 or y.ndim != 2:
        return [GpuAdvancedIncSubtensor1(
            set_instead_of_inc=set_instead_of_inc)(x, y, ilist)]
    else:
        return [GpuAdvancedIncSubtensor1_dev20(
            set_instead_of_inc=set_instead_of_inc)(x, y, ilist)]
def c_header_dirs(self):
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')
    cuda_root = config.cuda.root
    if cuda_root:
        return [os.path.join(cuda_root, 'include')]
    else:
        return []
def c_header_dirs(self):
    if pygpu.get_default_context().kind == "opencl":
        raise MethodNotDefined("cuda only")
    cuda_root = config.cuda.root
    res = [os.path.dirname(__file__)]
    if cuda_root:
        res.append(os.path.join(cuda_root, "include"))
    return res
def generate_kernel(self, node, odtype, redux):
    # Only Add and Mul reductions are supported; scalar_op.identity
    # supplies the neutral element of the reduction (0 for Add, 1 for Mul).
    if isinstance(self.scalar_op, scalar.basic.Add):
        reduce_expr = "a + b"
    elif isinstance(self.scalar_op, scalar.basic.Mul):
        reduce_expr = "a * b"
    else:
        raise NotImplementedError("only Add and Mul reductions are supported")
    return ReductionKernel(pygpu.get_default_context(), odtype,
                           self.scalar_op.identity, reduce_expr, redux,
                           arguments=[make_argument(node.inputs[0], 'a')],
                           init_nd=node.inputs[0].ndim)
def local_gpua_advanced_incsubtensor(node):
    # This optimization is disabled if cuda is not active
    if pygpu.get_default_context().kind != "cuda":
        return None

    x, y = node.inputs[0:2]
    set_instead_of_inc = node.op.set_instead_of_inc
    active_device_no = theano.sandbox.cuda.active_device_number()
    device_properties = theano.sandbox.cuda.device_properties
    compute_capability = device_properties(active_device_no)["major"]

    if compute_capability < 2 or x.ndim != 2 or y.ndim != 2:
        return GpuAdvancedIncSubtensor1(set_instead_of_inc=set_instead_of_inc)
    else:
        return GpuAdvancedIncSubtensor1_dev20(
            set_instead_of_inc=set_instead_of_inc)
def varlp_moreau_integrand_gpuary(abs_f, p, sigma, num_newton_iter, out=None):
    """Integrand of the variable Lp Moreau envelope, GpuArray version.

    Parameters
    ----------
    abs_f : `array-like`
        Magnitude of the input function (scalar or vectorial) to the
        functional.
    p : `array-like`
        Spatially varying exponent of the Lp modular. Must have same
        shape and dtype as ``abs_f``.
    sigma : positive float
        Step-size-like parameter of the envelope.
    num_newton_iter : positive int
        Number of Newton iterations to perform for the places where
        ``1 < p < 2``.
    out : `pygpu.gpuarray.GpuArray`, optional
        Array where the result should be stored. Its ``shape`` and
        ``dtype`` must match those of ``abs_f``.

    Returns
    -------
    out : `pygpu.gpuarray.GpuArray`
        Integrand of the Moreau envelope. If ``out`` was provided, the
        returned object is a reference to it.

    Examples
    --------
    Exponent ``p = 1`` gives the Huber function of ``abs_f``, that is
    ``abs_f ** 2 / (2 * sigma)`` if ``abs_f <= sigma`` and
    ``abs_f - sigma / 2`` otherwise:

    >>> abs_f = pygpu.gpuarray.array([0.0, 0.5, 1.0, 1.5, 2.0])
    >>> p1 = pygpu.gpuarray.array([1.0, 1.0, 1.0, 1.0, 1.0])
    >>> sigma = 1.0
    >>> result = varlp_moreau_integrand_gpuary(abs_f, p1, sigma,
    ...                                        num_newton_iter=1)
    >>> np.allclose(result, [0, 0.125, 0.5, 1.0, 1.5])
    True
    >>> sigma = 0.5
    >>> result = varlp_moreau_integrand_gpuary(abs_f, p1, sigma,
    ...                                        num_newton_iter=1)
    >>> np.allclose(result, [0, 0.25, 0.75, 1.25, 1.75])
    True

    With ``p = 2`` one gets ``abs_f ** 2 / (1 + 2 * sigma)``:

    >>> p2 = pygpu.gpuarray.array([2.0, 2.0, 2.0, 2.0, 2.0])
    >>> sigma = 0.5
    >>> result = varlp_moreau_integrand_gpuary(abs_f, p2, sigma,
    ...                                        num_newton_iter=1)
    >>> np.allclose(result, [0, 0.125, 0.5, 1.125, 2])
    True
    """
    ctx = pygpu.get_default_context()
    assert ctx is not None

    abs_f = pygpu.gpuarray.array(abs_f, copy=False)
    p = pygpu.gpuarray.array(p, copy=False)
    assert abs_f.dtype in (np.dtype('float32'), np.dtype('float64'))
    if out is None:
        out = abs_f._empty_like_me()
    sigma = float(sigma)
    num_newton_iter = int(num_newton_iter)

    args = [abs_f, p, sigma, num_newton_iter]
    argnames = ['abs_f', 'p', 'sigma', 'num_newton_iter']

    # Render the preamble code from the mako template using the specific
    # definitions of dtype, maximum and power.
    if abs_f.dtype == np.dtype('float32') and ctx.kind == b'opencl':
        raise NotImplementedError("OpenCL kernels currently not supported "
                                  "for 'float32' data type")

    # Render the preamble source from templates
    pre_tpl = mako.template.Template(newton_tpl_str + moreau_integr_tpl_str)
    power = 'powf' if abs_f.dtype == np.dtype('float32') else 'pow'
    minimum = 'fminf' if abs_f.dtype == np.dtype('float32') else 'fmin'
    maximum = 'fmaxf' if abs_f.dtype == np.dtype('float32') else 'fmax'
    preamble = pre_tpl.render(dtype=DTYPE_TO_CTYPE[abs_f.dtype],
                              maximum=maximum, minimum=minimum, power=power)

    # Define the elementwise expression
    expr = 'out = varlp_moreau_integrand(abs_f, p, sigma, num_newton_iter)'

    return elemwise(args, argnames, expr, preamble, out, 'out')
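# NumPy-only reference for the doctests above; a hedged sketch of the two
# closed-form cases stated in the docstring, useful for checking values
# without a GPU context. `huber_envelope` and `quadratic_envelope` are
# illustrative helper names, not part of this module.
import numpy as np


def huber_envelope(abs_f, sigma):
    """p = 1 case: Huber function of ``abs_f`` with parameter ``sigma``."""
    return np.where(abs_f <= sigma,
                    abs_f ** 2 / (2 * sigma),
                    abs_f - sigma / 2)


def quadratic_envelope(abs_f, sigma):
    """p = 2 case: ``abs_f ** 2 / (1 + 2 * sigma)``."""
    return abs_f ** 2 / (1 + 2 * sigma)


abs_f = np.array([0.0, 0.5, 1.0, 1.5, 2.0])
print(huber_envelope(abs_f, sigma=1.0))      # -> [0, 0.125, 0.5, 1.0, 1.5]
print(huber_envelope(abs_f, sigma=0.5))      # -> [0, 0.25, 0.75, 1.25, 1.75]
print(quadratic_envelope(abs_f, sigma=0.5))  # -> [0, 0.125, 0.5, 1.125, 2]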
def varlp_cc_integrand_gpuary(abs_f, p, out=None):
    """Integrand for the variable Lp convex conjugate, GpuArray version.

    Parameters
    ----------
    abs_f : `array-like`
        Magnitude of the input function (scalar or vectorial) to the
        functional.
    p : `array-like`
        Spatially varying exponent of the Lp modular. Must have same
        shape and dtype as ``abs_f``.
    out : `pygpu.gpuarray.GpuArray`, optional
        Array where the result should be stored. Its ``shape`` and
        ``dtype`` must match those of ``abs_f``.

    Returns
    -------
    out : `pygpu.gpuarray.GpuArray`
        Integrand of the convex conjugate. If ``out`` was provided, the
        returned object is a reference to it.

    Examples
    --------
    Exponent ``p = 1`` gives the indicator of the unit ball:

    >>> abs_f = pygpu.gpuarray.array([0.0, 0.5, 1.0, 1.5, 2.0])
    >>> p1 = pygpu.gpuarray.array([1.0, 1.0, 1.0, 1.0, 1.0])
    >>> result = varlp_cc_integrand_gpuary(abs_f, p1)
    >>> np.allclose(result, [0, 0, 0, np.inf, np.inf])
    True

    With ``p = 2`` one gets ``abs_f ** 2 / 4``:

    >>> p2 = pygpu.gpuarray.array([2.0, 2.0, 2.0, 2.0, 2.0])
    >>> result = varlp_cc_integrand_gpuary(abs_f, p2)
    >>> np.allclose(result, np.asarray(abs_f) ** 2 / 4)
    True

    For other ``p`` values, the result is ``abs_f**(p/(p-1)) * r``, where
    ``r = p**(-1/(p-1)) - p**(-p/(p-1))``:

    >>> p15 = pygpu.gpuarray.array([1.5, 1.5, 1.5, 1.5, 1.5])
    >>> result = varlp_cc_integrand_gpuary(abs_f, p15)
    >>> p = np.asarray(p15)
    >>> r = p ** (-1 / (p - 1)) - p ** (-p / (p - 1))
    >>> np.allclose(result, np.asarray(abs_f) ** (p / (p - 1)) * r)
    True
    """
    ctx = pygpu.get_default_context()
    assert ctx is not None

    abs_f = pygpu.gpuarray.array(abs_f, copy=False)
    p = pygpu.gpuarray.array(p, copy=False)
    assert abs_f.dtype in (np.dtype('float32'), np.dtype('float64'))
    if out is None:
        out = abs_f._empty_like_me()

    args = [abs_f, p]
    argnames = ['abs_f', 'p']

    # Render the preamble code from the mako template using the specific
    # definitions of dtype, maximum and power.
    if abs_f.dtype == np.dtype('float32') and ctx.kind == b'opencl':
        raise NotImplementedError("OpenCL kernels currently not supported "
                                  "for 'float32' data type")

    # Render the preamble source from templates
    pre_tpl = mako.template.Template(cc_integr_tpl_str)
    power = 'powf' if abs_f.dtype == np.dtype('float32') else 'pow'
    preamble = pre_tpl.render(dtype=DTYPE_TO_CTYPE[abs_f.dtype], power=power)

    # Define the elementwise expression
    expr = 'out = varlp_cc_integrand(abs_f, p)'

    return elemwise(args, argnames, expr, preamble, out, 'out')
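# NumPy-only reference for the convex-conjugate integrand doctests above;
# a sketch that evaluates the formulas stated in the docstring on the CPU
# (unit-ball indicator for p = 1, abs_f ** 2 / 4 for p = 2, and the general
# abs_f ** (p / (p - 1)) * r expression otherwise). Variable names are
# illustrative only.
import numpy as np

abs_f = np.array([0.0, 0.5, 1.0, 1.5, 2.0])

# p = 1: indicator function of the unit ball (0 inside, +inf outside).
integrand_p1 = np.where(abs_f <= 1, 0.0, np.inf)

# p = 2: quadratic case.
integrand_p2 = abs_f ** 2 / 4

# 1 < p < 2, here p = 1.5, using r = p**(-1/(p-1)) - p**(-p/(p-1)).
p = 1.5
r = p ** (-1 / (p - 1)) - p ** (-p / (p - 1))
integrand_p15 = abs_f ** (p / (p - 1)) * r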
def varlp_cc_prox_factor_gpuary(abs_f, p, sigma, num_newton_iter, out=None):
    """Multiplicative factor for the variable Lp cc prox, GpuArray version.

    Parameters
    ----------
    abs_f : `array-like`
        Magnitude of the input function (scalar or vectorial) to the
        proximal.
    p : `array-like`
        Spatially varying exponent of the Lp modular. Must have same
        shape and dtype as ``abs_f``.
    sigma : positive float
        Step-size-like parameter of the proximal.
    num_newton_iter : positive int
        Number of Newton iterations to perform for the places where
        ``1 < p < 2``.
    out : `pygpu.gpuarray.GpuArray`, optional
        Array where the result should be stored. Its ``shape`` and
        ``dtype`` must match those of ``abs_f``.

    Returns
    -------
    out : `pygpu.gpuarray.GpuArray`
        Factor for the proximal operator of the convex conjugate.
        If ``out`` was provided, the returned object is a reference to it.

    Examples
    --------
    When ``abs_f == 0``, the returned value is always 0. Otherwise,
    exponent ``p = 1`` gives ``min(1, 1 / abs_f)``:

    >>> abs_f = np.array([0.0, 0.5, 1.0, 1.5, 2.0])
    >>> p1 = np.array([1.0, 1.0, 1.0, 1.0, 1.0])
    >>> sigma = 1.0
    >>> result = varlp_cc_prox_factor_gpuary(abs_f, p1, sigma,
    ...                                      num_newton_iter=1)
    >>> np.allclose(result, [0, 1, 1, 2.0 / 3.0, 0.5])
    True

    With ``p = 2`` one gets ``2 / (2 + sigma)``:

    >>> p2 = np.array([2.0, 2.0, 2.0, 2.0, 2.0])
    >>> sigma = 0.5
    >>> result = varlp_cc_prox_factor_gpuary(abs_f, p2, sigma,
    ...                                      num_newton_iter=1)
    >>> np.allclose(result, [0] + [0.8] * 4)
    True

    For other ``p`` values, the result is ``1 - v / abs_f``, where ``v``
    satisfies the equation ``v + sigma**(1-p) * p * v**(p-1) = abs_f``:

    >>> p15 = pygpu.gpuarray.array([1.5, 1.5, 1.5, 1.5, 1.5])
    >>> sigma = 1.0
    >>> result = varlp_cc_prox_factor_gpuary(abs_f, p15, sigma,
    ...                                      num_newton_iter=10)
    >>> v = (1 - np.asarray(result)) * abs_f
    >>> p = np.asarray(p15)
    >>> lhs = v + sigma ** (1 - p) * p * v ** (p - 1)
    >>> np.allclose(lhs, abs_f)
    True
    """
    ctx = pygpu.get_default_context()
    assert ctx is not None

    abs_f = pygpu.gpuarray.array(abs_f, copy=False)
    p = pygpu.gpuarray.array(p, copy=False)
    assert abs_f.dtype in (np.dtype('float32'), np.dtype('float64'))
    if out is None:
        out = abs_f._empty_like_me()
    sigma = float(sigma)
    num_newton_iter = int(num_newton_iter)

    args = [abs_f, p, sigma, num_newton_iter]
    argnames = ['abs_f', 'p', 'sigma', 'num_newton_iter']

    # Render the preamble code from the mako template using the specific
    # definitions of dtype, maximum and power.
    if abs_f.dtype == np.dtype('float32') and ctx.kind == b'opencl':
        raise NotImplementedError("OpenCL kernels currently not supported "
                                  "for 'float32' data type")

    # Render the preamble source from templates
    pre_tpl = mako.template.Template(newton_tpl_str + cc_prox_tpl_str)
    power = 'powf' if abs_f.dtype == np.dtype('float32') else 'pow'
    minimum = 'fminf' if abs_f.dtype == np.dtype('float32') else 'fmin'
    maximum = 'fmaxf' if abs_f.dtype == np.dtype('float32') else 'fmax'
    preamble = pre_tpl.render(dtype=DTYPE_TO_CTYPE[abs_f.dtype],
                              maximum=maximum, minimum=minimum, power=power)

    # Define the elementwise expression
    expr = 'out = varlp_cc_prox_factor(abs_f, p, sigma, num_newton_iter)'

    return elemwise(args, argnames, expr, preamble, out, 'out')
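# NumPy-only reference for the prox-factor doctests above; a sketch of the
# two closed-form cases from the docstring (p = 1 -> min(1, 1 / abs_f),
# p = 2 -> 2 / (2 + sigma)), both forced to 0 where abs_f == 0. Variable
# names are illustrative only.
import numpy as np

abs_f = np.array([0.0, 0.5, 1.0, 1.5, 2.0])

# p = 1 case; the errstate guard silences the harmless 1/0 warning that the
# np.where mask discards anyway.
with np.errstate(divide='ignore'):
    factor_p1 = np.where(abs_f == 0, 0.0, np.minimum(1.0, 1.0 / abs_f))
print(factor_p1)  # -> [0, 1, 1, 2/3, 0.5]

# p = 2 case with sigma = 0.5.
sigma = 0.5
factor_p2 = np.where(abs_f == 0, 0.0, 2.0 / (2.0 + sigma))
print(factor_p2)  # -> [0, 0.8, 0.8, 0.8, 0.8]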
def c_init_code(self):
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')
    return ['setup_ext_cuda();']
def c_headers(self):
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')
    return ['<stdint.h>', '<stdio.h>', 'cuda.h', '<gpuarray/extension.h>',
            '<numpy_compat.h>', '<gpuarray/ext_cuda.h>',
            '<gpuarray/types.h>']
def c_headers(self):
    if pygpu.get_default_context().kind == "opencl":
        raise MethodNotDefined("cuda only")
    return ["cuda.h", "<numpy_compat.h>", "<gpuarray_helper.h>",
            "<gpuarray/types.h>"]
def c_header_dirs(self):
    if pygpu.get_default_context().kind == 'opencl':
        raise MethodNotDefined('cuda only')
    cuda_root = config.cuda.root
    if cuda_root:
        return [os.path.join(cuda_root, 'include')]
    # Fall back to an empty list so callers always receive a list.
    return []