def __call__(self, persid):
    from theano.gpuarray.type import get_context
    from theano.gpuarray import pygpu

    # Persistent IDs have the form "<array_type>.<name>"; the named array
    # is stored as a member of the same zip archive.
    array_type, name = persid.split(".")
    if name in self.cache:
        return self.cache[name]
    ret = None
    if array_type == "gpuarray":
        with self.zip_file.open(name) as f:
            ctx_name = pickle.load(f)
            array = np.lib.format.read_array(f)
        if config.experimental.unpickle_gpu_on_cpu:
            # directly return numpy array
            warnings.warn(
                "config.experimental.unpickle_gpu_on_cpu is set "
                "to True. Unpickling GpuArray as numpy.ndarray"
            )
            ret = array
        elif pygpu:
            ret = pygpu.array(array, context=get_context(ctx_name))
        else:
            raise ImportError("pygpu not found. Cannot unpickle GpuArray")
    else:
        with self.zip_file.open(name) as f:
            ret = np.lib.format.read_array(f)
    self.cache[name] = ret
    return ret
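# A minimal usage sketch, not from the original source: the __call__ above is
# designed to serve as pickle's persistent_load hook, resolving persistent IDs
# against arrays stored in the same zip file. The class name
# PersistentGpuArrayLoad and the archive layout ("model.zip" containing a
# pickled member named "pkl") are assumptions for illustration.
import pickle
import zipfile


def load_from_zip(path="model.zip"):
    with zipfile.ZipFile(path) as zf:
        loader = PersistentGpuArrayLoad(zf)  # wraps the __call__ defined above

        class _Unpickler(pickle.Unpickler):
            def persistent_load(self, persid):
                return loader(persid)

        with zf.open("pkl") as f:
            return _Unpickler(f).load()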
def perform(self, node, inputs, output_storage):
    x = inputs[0]
    z = output_storage[0]
    z[0] = pygpu.empty(x.shape, dtype=x.dtype,
                       context=get_context(self.context_name))
    # my_op is the GpuElemwise kernel compiled in make_node.
    self.my_op(x, z[0])
def test_hostfromgpu_shape_i():
    # Test that the shape is lifted over hostfromgpu
    m = mode_with_gpu.including(
        "local_dot_to_dot22", "local_dot22_to_dot22scalar", "specialize"
    )
    a = tt.fmatrix("a")
    ca = theano.gpuarray.type.GpuArrayType("float32", (False, False))()
    av = np.asarray(np.random.rand(5, 4), dtype="float32")
    cv = gpuarray.asarray(
        np.random.rand(5, 4), dtype="float32", context=get_context(test_ctx_name)
    )

    f = theano.function([a], GpuFromHost(test_ctx_name)(a), mode=m)
    assert any(isinstance(x.op, GpuFromHost) for x in f.maker.fgraph.toposort())
    f = theano.function([a], GpuFromHost(test_ctx_name)(a).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, tt.opt.Shape_i)
    assert isinstance(topo[1].op, tt.opt.Shape_i)
    assert isinstance(topo[2].op, tt.opt.MakeVector)
    assert tuple(f(av)) == (5, 4)

    f = theano.function([ca], host_from_gpu(ca), mode=m)
    assert host_from_gpu in [x.op for x in f.maker.fgraph.toposort()]
    f = theano.function([ca], host_from_gpu(ca).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, theano.compile.Shape_i)
    assert isinstance(topo[1].op, theano.compile.Shape_i)
    assert isinstance(topo[2].op, tt.opt.MakeVector)
    assert tuple(f(cv)) == (5, 4)
def get_params(self, node):
    pygpu_gpuarray = pytest.importorskip("pygpu.gpuarray")
    return self.params_type.get_params(
        typecode=pygpu_gpuarray.dtype_to_typecode(self.dtype),
        context=get_context(self.context_name),
    )
def setup_method(self):
    self.input = gpu_ftensor4()
    self.filters = gpu_ftensor4()
    self.topgrad = gpu_ftensor4()
    self.constant_tensor = gpuarray.array(
        np.zeros((3, 5, 7, 11), dtype="float32"),
        context=get_context(test_ctx_name),
    )
    super().setup_method()
def rand_gpuarray(*shape, **kwargs):
    r = rng.rand(*shape) * 2 - 1
    dtype = kwargs.pop("dtype", theano.config.floatX)
    cls = kwargs.pop("cls", None)
    if len(kwargs) != 0:
        raise TypeError("Unexpected argument %s" % list(kwargs.keys())[0])
    return gpuarray.array(r, dtype=dtype, cls=cls, context=get_context(test_ctx_name))
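# Hypothetical call to the helper above (assumes the module-level rng,
# test_ctx_name and gpuarray imports the surrounding test file provides);
# values land in [-1, 1) because of the `* 2 - 1` rescaling.
g = rand_gpuarray(3, 4, dtype="float32")
assert g.shape == (3, 4)
assert np.all(np.abs(np.asarray(g)) <= 1)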
def make_node(self, x):
    x = as_gpuarray_variable(x, self.context_name)
    x_arg = pygpu.elemwise.arg('x', 'float32', read=True)
    c_arg = pygpu.elemwise.arg('c', 'float32', read=True, write=True)
    self.my_op = pygpu.elemwise.GpuElemwise(
        get_context(self.context_name),
        "c = " + str(self.a) + " * x + " + str(self.b),
        [x_arg, c_arg],
        convert_f16=True,
    )
    return Apply(self, [x], [x.type()])
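# A standalone sketch of the pygpu.elemwise.GpuElemwise pattern used in
# make_node/perform above, assuming an initialized Theano GPU context (using
# the default context name None is an assumption). The kernel computes
# c = 2 * x + 3 elementwise, mirroring the "a * x + b" expression built above.
import numpy as np
import pygpu
from theano.gpuarray.type import get_context

ctx = get_context(None)
x_arg = pygpu.elemwise.arg("x", "float32", read=True)
c_arg = pygpu.elemwise.arg("c", "float32", read=True, write=True)
kernel = pygpu.elemwise.GpuElemwise(ctx, "c = 2 * x + 3", [x_arg, c_arg],
                                    convert_f16=True)

x = pygpu.array(np.arange(4, dtype="float32"), context=ctx)
c = pygpu.empty(x.shape, dtype=x.dtype, context=ctx)
kernel(x, c)  # c is now [3, 5, 7, 9]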
def test_transfer_cpu_gpu():
    a = tt.fmatrix("a")
    g = GpuArrayType(dtype="float32", broadcastable=(False, False))("g")
    av = np.asarray(rng.rand(5, 4), dtype="float32")
    gv = gpuarray.array(av, context=get_context(test_ctx_name))

    f = theano.function([a], GpuFromHost(test_ctx_name)(a))
    fv = f(av)
    assert GpuArrayType.values_eq(fv, gv)

    f = theano.function([g], host_from_gpu(g))
    fv = f(gv)
    assert np.all(fv == av)
def setup_method(self):
    super().setup_method()
    test_ctx = get_context(test_ctx_name)
    if test_ctx.kind != b"cuda":
        pytest.skip("Cuda specific tests")
    self.max_threads_dim0 = test_ctx.maxlsize0
    self.max_grid_size1 = test_ctx.maxgsize2
    self.op_class = CumOp
    # The CPU implementation is not so accurate, which makes DebugMode fail.
    # Since propagating .tag.values_eq_approx to the output of every
    # GpuFromHost seems overkill, we just relax the rtol for these tests.
    self.old_rtol = tt.float32_rtol
    tt.float32_rtol *= 2
def test_transfer_gpu_gpu():
    g = GpuArrayType(
        dtype="float32", broadcastable=(False, False), context_name=test_ctx_name
    )()
    av = np.asarray(rng.rand(5, 4), dtype="float32")
    gv = gpuarray.array(av, context=get_context(test_ctx_name))
    mode = mode_with_gpu.excluding(
        "cut_gpua_host_transfers", "local_cut_gpua_host_gpua"
    )
    f = theano.function([g], GpuToGpu(test_ctx_name)(g), mode=mode)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuToGpu)
    fv = f(gv)
    assert GpuArrayType.values_eq(fv, gv)
def test_transfer_strided():
    # This is just to ensure that it works in theano;
    # libgpuarray has a much more comprehensive suite of tests to
    # ensure correctness.
    a = tt.fmatrix("a")
    g = GpuArrayType(dtype="float32", broadcastable=(False, False))("g")
    av = np.asarray(rng.rand(5, 8), dtype="float32")
    gv = gpuarray.array(av, context=get_context(test_ctx_name))
    av = av[:, ::2]
    gv = gv[:, ::2]

    f = theano.function([a], GpuFromHost(test_ctx_name)(a))
    fv = f(av)
    assert GpuArrayType.values_eq(fv, gv)

    f = theano.function([g], host_from_gpu(g))
    fv = f(gv)
    assert np.all(fv == av)
def test_shape():
    x = GpuArrayType(dtype="float32", broadcastable=[False, False, False])()
    v = gpuarray.zeros((3, 4, 5), dtype="float32", context=get_context(test_ctx_name))
    f = theano.function([x], x.shape)
    topo = f.maker.fgraph.toposort()
    assert np.all(f(v) == (3, 4, 5))
    if theano.config.mode != "FAST_COMPILE":
        assert len(topo) == 4
        assert isinstance(topo[0].op, tt.opt.Shape_i)
        assert isinstance(topo[1].op, tt.opt.Shape_i)
        assert isinstance(topo[2].op, tt.opt.Shape_i)
        assert isinstance(topo[3].op, tt.opt.MakeVector)
    mode = mode_with_gpu.excluding("local_shape_to_shape_i")
    f = theano.function([x], x.shape, mode=mode)
    topo = f.maker.fgraph.toposort()
    assert np.all(f(v) == (3, 4, 5))
    assert len(topo) == 1
    assert isinstance(topo[0].op, tt.Shape)
def setup_method(self):
    if get_context(test_ctx_name).kind != b"cuda":
        pytest.skip("Cuda specific tests")
def get_params(self, node):
    return get_context(self.context_name)
def transfer(x, target):
    try:
        get_context(target)
        return as_gpuarray_variable(x, target)
    except ContextNotDefined:
        # The target context is not registered; fall through and
        # return None so the caller can keep x on the CPU.
        pass
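# Sketch of the intended calling convention (the context name "dev0" is an
# assumption): transfer() returns a GPU variable when `target` names a
# registered context and None otherwise.
x = tt.fmatrix("x")
moved = transfer(x, "dev0")
if moved is None:
    moved = x  # context "dev0" not initialized; stay on the CPU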
def get_params(self, node):
    return self.params_type.get_params(
        self, context=get_context(self.context_name), keepdims=self.keepdims
    )
def get_params(self, node):
    return self.params_type.get_params(
        self, context=get_context(self.context_name), dtype_int64=self.dtype_int64
    )
def local_opt(node):
    if type(node.op) in OP:
        # Either one of our inputs is on the gpu or
        # all of our clients are on the gpu
        replace = False
        # TODO: Maybe set context_name with infer_context_name()?
        context_name = None
        # We replace if any input is a host_from_gpu
        for i in node.inputs:
            if i.owner and i.owner.op == host_from_gpu and move_to_gpu(i):
                context_name = i.owner.inputs[0].type.context_name
                replace = True
                break

        if not replace:
            # We replace if *all* clients are on the GPU
            clients = [c for o in node.outputs for c in o.clients]
            replace = len(clients) != 0
            for c, idx in clients:
                if c == "output" or not isinstance(c.op, GpuFromHost):
                    replace = False
            # TODO: check that the clients want the same context?
            if replace:
                # All clients are GpuFromHost and we have at least one
                context_name = clients[0][0].op.context_name

        # Check if we should replace
        if (
            not replace
            or (cuda_only and get_context(context_name).kind != b"cuda")
            or any(["complex" in getattr(i, "dtype", "") for i in node.inputs])
        ):
            return False

        # Tag the inputs with the context in case
        # the context was derived from the outputs
        for i in node.inputs:
            i.tag.context_name = context_name

        new_op = maker(node.op, context_name, node.inputs, node.outputs)

        # This is needed as sometimes new_op inherits from OP.
        if new_op and new_op != node.op:
            if isinstance(new_op, Op):
                new_outputs = new_op(*node.inputs, return_list=True)
                to_cpu_fn = safe_to_cpu
            elif isinstance(new_op, (tuple, list)):
                new_outputs = new_op
                to_cpu_fn = safe_to_cpu
            else:
                # suppose it is a variable on the GPU
                new_outputs = [new_op]

                def to_cpu_fn(x):
                    return x.transfer("cpu")

            # Copy stack traces onto the gpu outputs,
            # and also onto the HostFromGpu outputs.
            on_cpu = []
            for old_output, new_output in zip(node.outputs, new_outputs):
                copy_stack_trace(old_output, new_output)
                cpu = to_cpu_fn(new_output)
                on_cpu.append(cpu)
                copy_stack_trace(old_output, cpu)
            return on_cpu
    return False
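# For context, a hedged sketch of the factory this closure presumably lives in,
# following Theano's op_lifter pattern (OP, cuda_only and maker are the
# enclosing scope's names; local_optimizer is Theano's registration decorator):
def op_lifter(OP, cuda_only=False):
    """Decorator turning `maker` into a local optimizer over the ops in OP."""

    def f(maker):
        def local_opt(node):
            ...  # body as above, closing over OP, cuda_only and maker

        local_opt.__name__ = maker.__name__
        return local_optimizer(OP)(local_opt)

    return f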
def test_may_share_memory():
    ctx = get_context(test_ctx_name)
    a = pygpu.empty((5, 4), context=ctx)
    b = pygpu.empty((5, 4), context=ctx)
    may_share_memory_core(a, b)
def gpu_alloc_expected(x, *shp):
    g = gpuarray.empty(shp, dtype=x.dtype, context=get_context(test_ctx_name))
    g[:] = x
    return g
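# Hypothetical check exercising the helper above (assumes an initialized
# test_ctx_name context): the result is a GPU array of the requested shape
# filled with the broadcast scalar.
expected = gpu_alloc_expected(np.float32(7), 2, 3)
assert expected.shape == (2, 3)
assert np.all(np.asarray(expected) == 7)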