def test_hostfromgpu_shape_i():
    """
    Check that `.shape` is lifted over gpu_from_host and host_from_gpu.

    For each transfer op the test first verifies the op is present when the
    transferred value itself is the output, then verifies that asking only
    for the shape compiles to Shape_i, Shape_i, MakeVector.
    """
    basic_ops = theano.sandbox.gpuarray.basic_ops
    mode = mode_with_gpu.including('local_dot_to_dot22',
                                   'local_dot22_to_dot22scalar',
                                   'specialize')
    host_var = T.fmatrix('a')
    gpu_var = theano.sandbox.gpuarray.type.GpuArrayType(
        'float32', (False, False))()
    host_val = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
    gpu_val = gpuarray.asarray(numpy.random.rand(5, 4), dtype='float32')
    to_gpu = basic_ops.gpu_from_host
    to_host = basic_ops.host_from_gpu

    # Host -> GPU direction.
    fn = theano.function([host_var], to_gpu(host_var), mode=mode)
    assert to_gpu in [apply_node.op
                      for apply_node in fn.maker.fgraph.toposort()]
    fn = theano.function([host_var], to_gpu(host_var).shape, mode=mode)
    nodes = fn.maker.fgraph.toposort()
    assert isinstance(nodes[0].op, T.opt.Shape_i)
    assert isinstance(nodes[1].op, T.opt.Shape_i)
    assert isinstance(nodes[2].op, T.opt.MakeVector)
    assert tuple(fn(host_val)) == (5, 4)

    # GPU -> host direction.
    fn = theano.function([gpu_var], to_host(gpu_var), mode=mode)
    assert to_host in [apply_node.op
                       for apply_node in fn.maker.fgraph.toposort()]
    fn = theano.function([gpu_var], to_host(gpu_var).shape, mode=mode)
    nodes = fn.maker.fgraph.toposort()
    assert isinstance(nodes[0].op, theano.compile.Shape_i)
    assert isinstance(nodes[1].op, theano.compile.Shape_i)
    assert isinstance(nodes[2].op, theano.tensor.opt.MakeVector)
    assert tuple(fn(gpu_val)) == (5, 4)
def test_hostfromgpu_shape_i():
    """
    Test that the shape computation is lifted over the transfer ops.

    Covers both gpu_from_host (CPU input) and host_from_gpu (GPU input).
    """
    def compiled_ops(fn):
        # Ops of the compiled graph, in topological order.
        return [apply_node.op for apply_node in fn.maker.fgraph.toposort()]

    m = mode_with_gpu.including('local_dot_to_dot22',
                                'local_dot22_to_dot22scalar',
                                'specialize')
    a = T.fmatrix('a')
    ca = theano.sandbox.gpuarray.type.GpuArrayType('float32',
                                                   (False, False))()
    a_value = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
    ca_value = gpuarray.asarray(numpy.random.rand(5, 4), dtype='float32')
    upload = theano.sandbox.gpuarray.basic_ops.gpu_from_host
    download = theano.sandbox.gpuarray.basic_ops.host_from_gpu

    # The transfer itself must stay in the graph when it is the output.
    f = theano.function([a], upload(a), mode=m)
    assert upload in compiled_ops(f)

    # Asking only for the shape must not need the transfer result.
    f = theano.function([a], upload(a).shape, mode=m)
    ops = compiled_ops(f)
    assert isinstance(ops[0], T.opt.Shape_i)
    assert isinstance(ops[1], T.opt.Shape_i)
    assert isinstance(ops[2], T.opt.MakeVector)
    assert tuple(f(a_value)) == (5, 4)

    f = theano.function([ca], download(ca), mode=m)
    assert download in compiled_ops(f)

    f = theano.function([ca], download(ca).shape, mode=m)
    ops = compiled_ops(f)
    assert isinstance(ops[0], theano.compile.Shape_i)
    assert isinstance(ops[1], theano.compile.Shape_i)
    assert isinstance(ops[2], theano.tensor.opt.MakeVector)
    assert tuple(f(ca_value)) == (5, 4)
def local_gpua_careduce(node):
    """
    Try to replace an Add/Mul reduction with GpuCAReduceCuda.

    Returns None when the scalar op is not Add/Mul, when the input is not
    float32, or when neither the direct nor the reshaped reduction is
    supported.  Returns the GPU op itself when the direct reduction is
    supported, and a one-element list holding the replacement output when
    the reshaped fallback is used.
    """
    scalar_op = node.op.scalar_op
    if not isinstance(scalar_op, (scalar.basic.Add, scalar.basic.Mul)):
        return

    x, = node.inputs
    greduce = GpuCAReduceCuda(scalar_op, axis=node.op.axis)
    if x.dtype != "float32":
        return

    # We need to have the make node called, otherwise the mask can
    # be None
    gvar = greduce(x)
    if gvar.owner.op.supports_c_code([gpu_from_host(x)]):
        return greduce

    # Try to make a simpler pattern based on reshaping.
    # The principle is that if two adjacent dimensions have
    # the same value in the reduce_mask, then we can reshape
    # to make them a single dimension, do the reduction, and
    # then reshape to get them back.
    if node.op.axis is None:
        reduce_mask = [1] * x.type.ndim
    else:
        reduce_mask = [0] * x.type.ndim
        for a in node.op.axis:
            assert reduce_mask[a] == 0
            reduce_mask[a] = 1

    shape_of = node.fgraph.shape_feature.shape_of
    x_shape = shape_of[x]

    new_in_shp = [x_shape[0]]
    new_mask = [reduce_mask[0]]
    for i in range(1, x.type.ndim):
        if reduce_mask[i] == reduce_mask[i - 1]:
            new_in_shp[-1] *= x_shape[i]
        else:
            new_mask.append(reduce_mask[i])
            new_in_shp.append(x_shape[i])

    # The op is constructed as (scalar_op, axis=...), exactly like the
    # direct attempt above, so translate the merged mask back into the
    # list of reduced axes.  (The previous code passed the mask as the
    # scalar op and referenced an unbound name ``scalar_op``.)
    new_axis = [idx for idx, m in enumerate(new_mask) if m == 1]
    new_greduce = GpuCAReduceCuda(scalar_op, axis=new_axis)

    reshaped_x = x.reshape(tensor.stack(*new_in_shp))
    gpu_reshaped_x = gpu_from_host(reshaped_x)

    # As above: query the op attached to the apply node so that make_node
    # has been called before supports_c_code is asked.
    new_gvar = new_greduce(gpu_reshaped_x)
    if new_gvar.owner.op.supports_c_code([gpu_reshaped_x]):
        reduce_reshaped_x = host_from_gpu(new_gvar)

        if reduce_reshaped_x.ndim != node.outputs[0].ndim:
            unreshaped_reduce = reduce_reshaped_x.reshape(
                tensor.stack(*shape_of[node.outputs[0]]))
        else:
            unreshaped_reduce = reduce_reshaped_x
        return [unreshaped_reduce]
def local_gpua_careduce(node):
    """
    Try to replace an Add/Mul reduction with GpuCAReduceCuda.

    Returns None when the scalar op is not Add/Mul, when the input is not
    float32, or when neither the direct nor the reshaped reduction is
    supported.  Returns the GPU op itself when the direct reduction is
    supported, and a one-element list holding the replacement output when
    the reshaped fallback is used.
    """
    scalar_op = node.op.scalar_op
    if not isinstance(scalar_op, (scalar.basic.Add, scalar.basic.Mul)):
        return

    x, = node.inputs
    greduce = GpuCAReduceCuda(scalar_op, axis=node.op.axis)
    if x.dtype != "float32":
        return

    # We need to have the make node called, otherwise the mask can
    # be None
    gvar = greduce(x)
    if gvar.owner.op.supports_c_code([gpu_from_host(x)]):
        return greduce

    # Try to make a simpler pattern based on reshaping.
    # The principle is that if two adjacent dimensions have
    # the same value in the reduce_mask, then we can reshape
    # to make them a single dimension, do the reduction, and
    # then reshape to get them back.
    if node.op.axis is None:
        reduce_mask = [1] * x.type.ndim
    else:
        reduce_mask = [0] * x.type.ndim
        for a in node.op.axis:
            assert reduce_mask[a] == 0
            reduce_mask[a] = 1

    shape_of = node.fgraph.shape_feature.shape_of
    x_shape = shape_of[x]

    new_in_shp = [x_shape[0]]
    new_mask = [reduce_mask[0]]
    for i in range(1, x.type.ndim):
        if reduce_mask[i] == reduce_mask[i - 1]:
            new_in_shp[-1] *= x_shape[i]
        else:
            new_mask.append(reduce_mask[i])
            new_in_shp.append(x_shape[i])

    # The op is constructed as (scalar_op, axis=...), exactly like the
    # direct attempt above, so translate the merged mask back into the
    # list of reduced axes.  (The previous code passed the mask as the
    # scalar op and referenced an unbound name ``scalar_op``.)
    new_axis = [idx for idx, m in enumerate(new_mask) if m == 1]
    new_greduce = GpuCAReduceCuda(scalar_op, axis=new_axis)

    reshaped_x = x.reshape(tensor.stack(*new_in_shp))
    gpu_reshaped_x = gpu_from_host(reshaped_x)

    # As above: query the op attached to the apply node so that make_node
    # has been called before supports_c_code is asked.
    new_gvar = new_greduce(gpu_reshaped_x)
    if new_gvar.owner.op.supports_c_code([gpu_reshaped_x]):
        reduce_reshaped_x = host_from_gpu(new_gvar)

        if reduce_reshaped_x.ndim != node.outputs[0].ndim:
            unreshaped_reduce = reduce_reshaped_x.reshape(
                tensor.stack(*shape_of[node.outputs[0]]))
        else:
            unreshaped_reduce = reduce_reshaped_x
        return [unreshaped_reduce]
def make_graph(img, kern):
    """
    Scatter `img` into a zero buffer of the logical shape, then convolve.

    The buffer has shape (img.shape[0],) + op.imshp_logical and `img` is
    written at every rstride-th row / cstride-th column of it.  The result
    is moved with gpu_from_host and handed to `ret` together with `kern`.

    NOTE(review): `op`, `rstride`, `cstride` and `ret` are free names here;
    presumably they come from an enclosing scope -- confirm.
    """
    zero = numpy.asarray(0, dtype=img.dtype)
    logical_buf = tensor.alloc(zero, img.shape[0], *op.imshp_logical)
    strided_slots = logical_buf[:, :, ::rstride, ::cstride]
    padded_img = tensor.set_subtensor(strided_slots, img)
    return ret(gpu_from_host(padded_img), kern)
def make_graph(img, kern):
    """
    Build the graph that embeds `img` in its logical shape and applies `ret`.

    NOTE(review): relies on `op`, `rstride`, `cstride` and `ret` being
    defined outside this function -- confirm against the enclosing scope.
    """
    # Zero-filled buffer: one entry per image in the batch, logical shape
    # for the remaining dimensions.
    fill_value = numpy.asarray(0, dtype=img.dtype)
    batch_size = img.shape[0]
    canvas = tensor.alloc(fill_value, batch_size, *op.imshp_logical)

    # Place the physical image on the strided positions of the canvas.
    embedded = tensor.set_subtensor(canvas[:, :, ::rstride, ::cstride], img)

    on_gpu = gpu_from_host(embedded)
    return ret(on_gpu, kern)
def test_transfer_cpu_gpu():
    """Transfer a 5x4 float32 matrix host->GPU and GPU->host and compare."""
    host_sym = T.fmatrix('a')
    gpu_sym = GpuArrayType(dtype='float32',
                           broadcastable=(False, False))('g')

    host_val = numpy.asarray(rng.rand(5, 4), dtype='float32')
    gpu_val = gpuarray.array(host_val)

    # Host -> GPU: result must equal the GPU copy of the same data.
    upload = theano.function([host_sym], gpu_from_host(host_sym))
    uploaded = upload(host_val)
    assert GpuArrayType.values_eq(uploaded, gpu_val)

    # GPU -> host: result must equal the original host data.
    download = theano.function([gpu_sym], host_from_gpu(gpu_sym))
    downloaded = download(gpu_val)
    assert numpy.all(downloaded == host_val)
def test_transfer_cpu_gpu():
    """Check gpu_from_host and host_from_gpu on a contiguous matrix."""
    matrix_type = GpuArrayType(dtype='float32', broadcastable=(False, False))
    a = T.fmatrix('a')
    g = matrix_type('g')

    cpu_data = numpy.asarray(rng.rand(5, 4), dtype='float32')
    gpu_data = gpuarray.array(cpu_data)

    to_gpu_fn = theano.function([a], gpu_from_host(a))
    assert GpuArrayType.values_eq(to_gpu_fn(cpu_data), gpu_data)

    to_cpu_fn = theano.function([g], host_from_gpu(g))
    assert numpy.all(to_cpu_fn(gpu_data) == cpu_data)
def apply(self, fgraph):
    """
    Route each graph input through host_from_gpu(gpu_from_host(input)).

    Inputs are skipped when they already have a GpuArrayType, or when
    their only client is the graph output or a gpu_from_host node.
    A TypeError raised while building or installing the replacement is
    ignored, so such inputs are left untouched.
    """
    for inp in fgraph.inputs:
        if isinstance(inp.type, GpuArrayType):
            continue

        if len(inp.clients) == 1:
            sole_client = inp.clients[0][0]
            if sole_client == "output" or sole_client.op == gpu_from_host:
                continue

        try:
            new_input = host_from_gpu(gpu_from_host(inp))
            fgraph.replace_validate(inp, new_input,
                                    "InputToGpuOptimizer")
        except TypeError:
            # This could fail if the inputs are not TensorTypes
            pass
def apply(self, fgraph):
    """
    Route each graph input through host_from_gpu(gpu_from_host(input)).

    Inputs are skipped when they already have a GpuArrayType, or when
    their only client is the graph output or a gpu_from_host node.
    A TypeError raised while building or installing the replacement is
    ignored, so such inputs are left untouched.
    """
    for inp in fgraph.inputs:
        if isinstance(inp.type, GpuArrayType):
            continue

        if len(inp.clients) == 1:
            sole_client = inp.clients[0][0]
            if sole_client == 'output' or sole_client.op == gpu_from_host:
                continue

        try:
            new_input = host_from_gpu(gpu_from_host(inp))
            fgraph.replace_validate(inp, new_input,
                                    "InputToGpuOptimizer")
        except TypeError:
            # This could fail if the inputs are not TensorTypes
            pass
def local_gpua_subtensor(node):
    """
    Decide how to lift a subtensor node.

    Special case: the indexed variable is host_from_gpu(gpu_from_host(v))
    where `v` has no owner, and it has exactly one client.  Then:
      * if any client of the node's output is the graph output or has a
        GpuArrayType input/output, return None;
      * otherwise return the node's own output routed through
        gpu_from_host and host_from_gpu.
    In every other case return a GpuSubtensor with the same idx_list.
    """
    def uses_gpu_or_is_output(client):
        if client == 'output':
            return True
        return any([isinstance(v.type, GpuArrayType)
                    for v in client.inputs + client.outputs])

    x = node.inputs[0]
    if x.owner and isinstance(x.owner.op, HostFromGpu):
        gpu_x = x.owner.inputs[0]
        # The transferred value is a shared var or an input of the graph.
        fresh_transfer = (gpu_x.owner and
                          isinstance(gpu_x.owner.op, GpuFromHost) and
                          not gpu_x.owner.inputs[0].owner)
        if fresh_transfer and len(x.clients) == 1:
            result = node.outputs[0]
            if any([uses_gpu_or_is_output(client)
                    for client, _ in result.clients]):
                return
            return [host_from_gpu(gpu_from_host(result))]

    return GpuSubtensor(node.op.idx_list)
def test_transfer_strided():
    """Check both transfer directions on a column-strided matrix view."""
    # This only ensures that strided data works through theano;
    # compyte has a much more comprehensive suite of tests for correctness.
    every_other_column = (slice(None), slice(None, None, 2))

    host_sym = T.fmatrix('a')
    gpu_sym = GpuArrayType(dtype='float32',
                           broadcastable=(False, False))('g')

    dense_host = numpy.asarray(rng.rand(5, 8), dtype='float32')
    dense_gpu = gpuarray.array(dense_host)
    host_view = dense_host[every_other_column]
    gpu_view = dense_gpu[every_other_column]

    upload = theano.function([host_sym], gpu_from_host(host_sym))
    uploaded = upload(host_view)
    assert GpuArrayType.values_eq(uploaded, gpu_view)

    download = theano.function([gpu_sym], host_from_gpu(gpu_sym))
    downloaded = download(gpu_view)
    assert numpy.all(downloaded == host_view)
def local_gpua_subtensor(node):
    """
    Lift a subtensor to GpuSubtensor unless its input was only just moved.

    When the indexed variable comes from HostFromGpu(GpuFromHost(v)) with
    `v` having no owner, and that variable has a single client, the result
    is either None (some client of the output is 'output' or touches a
    GpuArrayType variable) or the node output wrapped in a
    gpu_from_host / host_from_gpu round trip.  Otherwise a GpuSubtensor
    built from the node's idx_list is returned.
    """
    gpu_subtensor = lambda: GpuSubtensor(node.op.idx_list)

    source = node.inputs[0]
    if not (source.owner and isinstance(source.owner.op, HostFromGpu)):
        return gpu_subtensor()

    gpu_source = source.owner.inputs[0]
    if not (gpu_source.owner and
            isinstance(gpu_source.owner.op, GpuFromHost) and
            # And it is a shared var or an input of the graph.
            not gpu_source.owner.inputs[0].owner):
        return gpu_subtensor()

    if len(source.clients) != 1:
        return gpu_subtensor()

    blocked = False
    for client, _ in node.outputs[0].clients:
        if client == 'output':
            blocked = True
            break
        if any([isinstance(v.type, GpuArrayType)
                for v in client.inputs + client.outputs]):
            blocked = True
            break
    if blocked:
        return None
    return [host_from_gpu(gpu_from_host(node.outputs[0]))]
def test_transfer_strided():
    """Transfer non-contiguous (every second column) data in both directions."""
    # Smoke test only: compyte has a much more comprehensive suite of
    # tests to ensure correctness.
    a = T.fmatrix('a')
    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')

    full_cpu = numpy.asarray(rng.rand(5, 8), dtype='float32')
    full_gpu = gpuarray.array(full_cpu)
    # Take the strided views after the GPU copy was made from the full data.
    cpu_strided = full_cpu[:, ::2]
    gpu_strided = full_gpu[:, ::2]

    to_gpu_fn = theano.function([a], gpu_from_host(a))
    assert GpuArrayType.values_eq(to_gpu_fn(cpu_strided), gpu_strided)

    to_cpu_fn = theano.function([g], host_from_gpu(g))
    assert numpy.all(to_cpu_fn(gpu_strided) == cpu_strided)
def local_gpu_conv(node):
    """
    gpu_from_host(conv) -> gpu_conv(gpu_from_host)

    conv(host_from_gpu) -> host_from_gpu(gpu_conv)

    Returns None when the conv op has a logical kernel shape different from
    its physical one; otherwise a one-element list with the GPU output.
    """
    def gpu_conv_builder(conv_op):
        # Returns None, a GpuConv op, or a function (img, kern) -> output.
        kshp_logical = conv_op.kshp_logical
        if kshp_logical is not None and kshp_logical != conv_op.kshp:
            return None

        # logical_img_hw is None at this point in the original code, so
        # the op is always built without a logical image shape.
        gpu_op = GpuConv(border_mode=conv_op.out_mode,
                         subsample=(conv_op.dx, conv_op.dy),
                         logical_img_hw=None,
                         logical_kern_hw=conv_op.kshp_logical,
                         logical_kern_align_top=(
                             conv_op.kshp_logical_top_aligned),
                         kshp=conv_op.kshp,
                         version=conv_op.version,
                         verbose=conv_op.verbose,
                         imshp=conv_op.imshp,
                         )

        if (conv_op.imshp_logical is not None and
                conv_op.imshp_logical[1:3] != conv_op.imshp[1:3]):
            # Logical and physical image shapes differ: embed the image in
            # a zero buffer of the logical shape before convolving.
            rstride = int(numpy.ceil(conv_op.imshp_logical[1] /
                                     float(conv_op.imshp[1])))
            cstride = int(numpy.ceil(conv_op.imshp_logical[2] /
                                     float(conv_op.imshp[2])))

            def make_graph(img, kern):
                zero = numpy.asarray(0, dtype=img.dtype)
                buf = tensor.alloc(zero, img.shape[0],
                                   *conv_op.imshp_logical)
                embedded = tensor.set_subtensor(
                    buf[:, :, ::rstride, ::cstride], img)
                return gpu_op(gpu_from_host(embedded), kern)

            return make_graph
        return gpu_op

    def values_eq_approx(a, b):
        """
        Comparison used so DebugMode does not raise useless errors caused
        by rounding.

        The reduction runs over the two last dimensions, so the absolute
        error can grow when the number of reduced elements is significant.
        """
        assert a.ndim == 4
        # For float32 the default atol is 1e-5
        atol = 3e-5 if a.shape[-1] * a.shape[-2] > 100 else None
        return GpuArrayType.values_eq_approx(a, b, atol=atol)

    img, kern = node.inputs
    gpu_conv = gpu_conv_builder(node.op)
    if gpu_conv is None:
        return

    gpu_out = gpu_conv(gpu_from_host(img), gpu_from_host(kern))
    # In some cases the ConvOp broadcasts the last 2 dimensions
    # differently than the gpu ConvOp.
    host_out = tensor.patternbroadcast(host_from_gpu(gpu_out),
                                       node.outputs[0].broadcastable)
    # op_lifter wants the output on the GPU.
    out = gpu_from_host(host_out)
    out.values_eq_approx = values_eq_approx
    return [out]
def local_gpua_specifyShape(node):
    """
    Rebuild a specify_shape call with its first input moved by gpu_from_host.

    Returns None when the first input already has a GpuArrayType.
    """
    first = node.inputs[0]
    if isinstance(first.type, GpuArrayType):
        return
    args = [gpu_from_host(first)]
    args.extend(node.inputs[1:])
    return tensor.specify_shape(*args)
def safe_to_gpu(x):
    """Return gpu_from_host(x) if x has a TensorType, otherwise x itself."""
    return gpu_from_host(x) if isinstance(x.type, tensor.TensorType) else x
def local_gpu_conv(node):
    """
    gpu_from_host(conv) -> gpu_conv(gpu_from_host)

    conv(host_from_gpu) -> host_from_gpu(gpu_conv)

    Gives up (returns None) when the kernel's logical shape is set and
    differs from its physical shape.
    """
    def make_gpu_conv(cpu_op):
        # Result is None (unsupported), a GpuConv op, or a graph-building
        # function taking (img, kern).
        if (cpu_op.kshp_logical is not None and
                cpu_op.kshp_logical != cpu_op.kshp):
            return None

        conv_kwargs = dict(
            border_mode=cpu_op.out_mode,
            subsample=(cpu_op.dx, cpu_op.dy),
            # The original passes a variable that is still None here.
            logical_img_hw=None,
            logical_kern_hw=cpu_op.kshp_logical,
            logical_kern_align_top=cpu_op.kshp_logical_top_aligned,
            kshp=cpu_op.kshp,
            version=cpu_op.version,
            verbose=cpu_op.verbose,
            imshp=cpu_op.imshp,
        )
        gpu_conv_op = GpuConv(**conv_kwargs)

        if cpu_op.imshp_logical is None:
            return gpu_conv_op
        logical_img_hw = cpu_op.imshp_logical[1:3]
        if logical_img_hw != cpu_op.imshp[1:3]:
            row_step = int(numpy.ceil(cpu_op.imshp_logical[1] /
                                      float(cpu_op.imshp[1])))
            col_step = int(numpy.ceil(cpu_op.imshp_logical[2] /
                                      float(cpu_op.imshp[2])))

            def make_graph(img, kern):
                # Spread the physical image over a zero buffer that has
                # the logical image shape, then convolve on the GPU.
                buf = tensor.alloc(numpy.asarray(0, dtype=img.dtype),
                                   img.shape[0], *cpu_op.imshp_logical)
                img = tensor.set_subtensor(
                    buf[:, :, ::row_step, ::col_step], img)
                img = gpu_from_host(img)
                return gpu_conv_op(img, kern)

            return make_graph
        return gpu_conv_op

    def values_eq_approx(a, b):
        """
        Looser equality check, needed so that DebugMode does not raise
        useless errors due to rounding error.

        We reduce over the two last dimensions, which can raise the
        absolute error when many elements are reduced.
        """
        assert a.ndim == 4
        reduced_elements = a.shape[-1] * a.shape[-2]
        if reduced_elements > 100:
            # For float32 the default atol is 1e-5
            tolerance = 3e-5
        else:
            tolerance = None
        return GpuArrayType.values_eq_approx(a, b, atol=tolerance)

    img, kern = node.inputs
    gpu_conv = make_gpu_conv(node.op)
    if gpu_conv is None:
        return

    result = gpu_conv(gpu_from_host(img), gpu_from_host(kern))
    # In some cases the ConvOp broadcasts the last 2 dimensions
    # differently than the gpu ConvOp.
    result = tensor.patternbroadcast(host_from_gpu(result),
                                     node.outputs[0].broadcastable)
    # op_lifter wants the output on the GPU.
    result = gpu_from_host(result)
    result.values_eq_approx = values_eq_approx
    return [result]
def local_gpua_careduce(node):
    """
    Try to replace an Add/Mul/Maximum/Minimum reduction with a GPU reduce op.

    GpuCAReduceCPY is used when the device name starts with 'opencl' (only
    for scalar.add / scalar.mul), GpuCAReduceCuda otherwise.  Returns the
    GPU op when the reduction can be used directly, a one-element list with
    the replacement output when the reshaped fallback is supported, and
    None otherwise.
    """
    scalar_op = node.op.scalar_op
    supported = (scalar.Add, scalar.Mul, scalar.Maximum, scalar.Minimum)
    if not isinstance(scalar_op, supported):
        return

    dev = theano.sandbox.gpuarray.init_dev.device
    if dev.startswith('opencl'):
        op = GpuCAReduceCPY
        if scalar_op not in [scalar.add, scalar.mul]:
            # We don't support yet all reduction with cpy code.
            return
    else:
        op = GpuCAReduceCuda

    x, = node.inputs
    out_dtype = getattr(node.op, 'dtype', None)
    acc_dtype = getattr(node.op, 'acc_dtype', None)

    greduce = op(scalar_op, axis=node.op.axis,
                 dtype=out_dtype, acc_dtype=acc_dtype)
    # We need to have the make node called, otherwise the mask can
    # be None
    gvar = greduce(x)
    if (op is GpuCAReduceCPY or
            gvar.owner.op.supports_c_code([gpu_from_host(x)])):
        return greduce

    # Try to make a simpler pattern based on reshaping.
    # If two adjacent dimensions have the same value in the reduce_mask,
    # we can reshape to make them a single dimension, do the reduction,
    # and then reshape to get them back.
    ndim = x.type.ndim
    if node.op.axis is None:
        reduce_mask = [1] * ndim
    else:
        reduce_mask = [0] * ndim
        for a in node.op.axis:
            assert reduce_mask[a] == 0
            reduce_mask[a] = 1

    shape_of = node.fgraph.shape_feature.shape_of
    x_shape = shape_of[x]

    new_in_shp = [x_shape[0]]
    new_mask = [reduce_mask[0]]
    for i in range(1, ndim):
        if reduce_mask[i] != reduce_mask[i - 1]:
            new_mask.append(reduce_mask[i])
            new_in_shp.append(x_shape[i])
        else:
            new_in_shp[-1] *= x_shape[i]

    new_axis = [idx for idx, m in enumerate(new_mask) if m == 1]
    greduce = op(scalar_op, axis=new_axis, reduce_mask=new_mask,
                 dtype=out_dtype, acc_dtype=acc_dtype)

    reshaped_x = x.reshape(tensor.stack(*new_in_shp))
    gpu_reshaped_x = gpu_from_host(reshaped_x)
    # We need to have the make node called, otherwise the mask can
    # be None
    gvar = greduce(gpu_reshaped_x)

    if not greduce.supports_c_code([gpu_reshaped_x]):
        return

    reduce_reshaped_x = host_from_gpu(greduce(gpu_reshaped_x))
    if reduce_reshaped_x.ndim == node.outputs[0].ndim:
        unreshaped_reduce = reduce_reshaped_x
    else:
        unreshaped_reduce = reduce_reshaped_x.reshape(
            tensor.stack(*shape_of[node.outputs[0]]))
    return [unreshaped_reduce]
def local_gpua_shape(node):
    """
    Replace a shape node by the shape of its input moved with gpu_from_host.

    Returns None when the input already has a GpuArrayType.
    """
    # op_lifter will call this opt too frequently as the output is
    # always on the CPU.
    source = node.inputs[0]
    if not isinstance(source.type, GpuArrayType):
        return [gpu_from_host(source).shape]
    return None
def local_gpua_specifyShape(node):
    """
    Re-apply tensor.specify_shape with the first input passed through
    gpu_from_host and the remaining inputs unchanged.

    Nothing is returned when the first input already has a GpuArrayType.
    """
    if not isinstance(node.inputs[0].type, GpuArrayType):
        gpu_first = gpu_from_host(node.inputs[0])
        remaining = node.inputs[1:]
        return tensor.specify_shape(*([gpu_first] + remaining))
def test_one_sequence_one_output_weights_gpu1(self):
    """
    Run a one-sequence, one-output scan in GPU mode and check it.

    The scan computes x_t = u_t * W_in + x_{t-1} * W.  The test compares
    the compiled result against a numpy reference, then checks the number
    of transfer nodes in the outer graph and that the scan's inner graph
    contains a GpuElemwise and no transfer at all.
    """
    def f_rnn(u_t, x_tm1, W_in, W):
        return u_t * W_in + x_tm1 * W

    u = theano.tensor.fvector('u')
    x0 = theano.tensor.fscalar('x0')
    W_in = theano.tensor.fscalar('win')
    W = theano.tensor.fscalar('w')

    mode = mode_with_gpu.excluding('InputToGpuOptimizer')
    output, updates = theano.scan(f_rnn,
                                  u,
                                  x0,
                                  [W_in, W],
                                  n_steps=None,
                                  truncate_gradient=-1,
                                  go_backwards=False,
                                  mode=mode)

    output = gpu_from_host(output)
    f2 = theano.function([u, x0, W_in, W],
                         output,
                         updates=updates,
                         allow_input_downcast=True,
                         mode=mode)

    # Numeric inputs.  The draws are made in the same order as before
    # (u, x0, W, W_in) so a given seed yields the same values; they get
    # their own names instead of rebinding the symbolic W / W_in.
    rng = numpy.random.RandomState(utt.fetch_seed())
    v_u = rng.uniform(size=(4,), low=-5., high=5.)
    v_x0 = rng.uniform()
    v_W = rng.uniform()
    v_W_in = rng.uniform()

    v_u = numpy.asarray(v_u, dtype='float32')
    v_x0 = numpy.asarray(v_x0, dtype='float32')
    v_W = numpy.asarray(v_W, dtype='float32')
    v_W_in = numpy.asarray(v_W_in, dtype='float32')

    # compute the output in numpy
    v_out = numpy.zeros((4,))
    v_out[0] = v_u[0] * v_W_in + v_x0 * v_W
    for step in range(1, 4):
        v_out[step] = v_u[step] * v_W_in + v_out[step - 1] * v_W

    theano_values = f2(v_u, v_x0, v_W_in, v_W)
    utt.assert_allclose(theano_values, v_out)

    # Outer graph: exactly one Scan node, no transfer back to the host and
    # four transfers to the GPU.  (The duplicated lookup that was marked
    # "TO DEL" has been removed.)
    topo = f2.maker.fgraph.toposort()
    assert sum([isinstance(node.op, HostFromGpu)
                for node in topo]) == 0
    assert sum([isinstance(node.op, GpuFromHost)
                for node in topo]) == 4

    scan_node = [node for node in topo
                 if isinstance(node.op, theano.scan_module.scan_op.Scan)]
    assert len(scan_node) == 1
    scan_node = scan_node[0]
    scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

    # check that there is no gpu transfer in the inner loop.
    assert any([isinstance(node.op, GpuElemwise)
                for node in scan_node_topo])
    assert not any([isinstance(node.op, HostFromGpu)
                    for node in scan_node_topo])
    assert not any([isinstance(node.op, GpuFromHost)
                    for node in scan_node_topo])
def safe_to_gpu(x):
    """Apply gpu_from_host to TensorType variables; pass anything else through."""
    if not isinstance(x.type, tensor.TensorType):
        return x
    return gpu_from_host(x)
def test_one_sequence_one_output_weights_gpu1(self):
    """
    Run a one-sequence, one-output scan in GPU mode and check it.

    The scan computes x_t = u_t * W_in + x_{t-1} * W.  The test compares
    the compiled result against a numpy reference, then checks the number
    of transfer nodes in the outer graph and that the scan's inner graph
    contains a GpuElemwise and no transfer at all.
    """
    def f_rnn(u_t, x_tm1, W_in, W):
        return u_t * W_in + x_tm1 * W

    u = theano.tensor.fvector('u')
    x0 = theano.tensor.fscalar('x0')
    W_in = theano.tensor.fscalar('win')
    W = theano.tensor.fscalar('w')

    mode = mode_with_gpu.excluding('InputToGpuOptimizer')
    output, updates = theano.scan(f_rnn,
                                  u,
                                  x0,
                                  [W_in, W],
                                  n_steps=None,
                                  truncate_gradient=-1,
                                  go_backwards=False,
                                  mode=mode)

    output = gpu_from_host(output)
    f2 = theano.function([u, x0, W_in, W],
                         output,
                         updates=updates,
                         allow_input_downcast=True,
                         mode=mode)

    # Numeric inputs.  The draws are made in the same order as before
    # (u, x0, W, W_in) so a given seed yields the same values; they get
    # their own names instead of rebinding the symbolic W / W_in.
    rng = numpy.random.RandomState(utt.fetch_seed())
    v_u = rng.uniform(size=(4, ), low=-5., high=5.)
    v_x0 = rng.uniform()
    v_W = rng.uniform()
    v_W_in = rng.uniform()

    v_u = numpy.asarray(v_u, dtype='float32')
    v_x0 = numpy.asarray(v_x0, dtype='float32')
    v_W = numpy.asarray(v_W, dtype='float32')
    v_W_in = numpy.asarray(v_W_in, dtype='float32')

    # compute the output in numpy
    v_out = numpy.zeros((4, ))
    v_out[0] = v_u[0] * v_W_in + v_x0 * v_W
    for step in range(1, 4):
        v_out[step] = v_u[step] * v_W_in + v_out[step - 1] * v_W

    theano_values = f2(v_u, v_x0, v_W_in, v_W)
    utt.assert_allclose(theano_values, v_out)

    # Outer graph: exactly one Scan node, no transfer back to the host and
    # four transfers to the GPU.  (The duplicated lookup that was marked
    # "TO DEL" has been removed.)
    topo = f2.maker.fgraph.toposort()
    assert sum([isinstance(node.op, HostFromGpu) for node in topo]) == 0
    assert sum([isinstance(node.op, GpuFromHost) for node in topo]) == 4

    scan_node = [
        node for node in topo
        if isinstance(node.op, theano.scan_module.scan_op.Scan)
    ]
    assert len(scan_node) == 1
    scan_node = scan_node[0]
    scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

    # check that there is no gpu transfer in the inner loop.
    assert any(
        [isinstance(node.op, GpuElemwise) for node in scan_node_topo])
    assert not any(
        [isinstance(node.op, HostFromGpu) for node in scan_node_topo])
    assert not any(
        [isinstance(node.op, GpuFromHost) for node in scan_node_topo])