def test_gpu_opt():
    if not cuda.cuda_available:
        # Skip test if cuda_ndarray is not available.
        from nose.plugins.skip import SkipTest
        raise SkipTest('Optional package cuda not available')

    # We test the case where we put the op on the gpu when the output
    # is moved to the gpu.
    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.MultinomialFromUniform('auto')(p, u)
    assert m.dtype == 'float32', m.dtype
    m_gpu = cuda.gpu_from_host(m)

    f = function([p, u], m_gpu, allow_input_downcast=True,
                 mode=get_mode(True))
    assert any([type(node.op) is multinomial.GpuMultinomialFromUniform
                for node in f.maker.fgraph.toposort()])
    pval = numpy.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = numpy.ones_like(pval[:, 0]) * 0.5
    mval = f(pval, uval)

    # Test with a row, it was failing in the past.
    r = tensor.frow()
    m = multinomial.MultinomialFromUniform('auto')(r, u)
    assert m.dtype == 'float32', m.dtype
    m_gpu = cuda.gpu_from_host(m)

    f = function([r, u], m_gpu, allow_input_downcast=True,
                 mode=get_mode(True))
    assert any([type(node.op) is multinomial.GpuMultinomialFromUniform
                for node in f.maker.fgraph.toposort()])
    pval = numpy.arange(1 * 4, dtype='float32').reshape((1, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = numpy.ones_like(pval[:, 0]) * 0.5
    mval2 = f(pval, uval)
def test_default_conv():
    """Just test that we introduce the right GPU convolution version."""
    img = theano.tensor.ftensor4()
    fil = theano.tensor.ftensor4()

    c = theano.tensor.nnet.conv2d(img, fil)
    f = theano.function([img, fil], c, mode=theano_mode)
    if cuda.dnn.dnn_available():
        assert any([isinstance(a.op, GpuDnnConv)
                    for a in f.maker.fgraph.apply_nodes])
    else:
        assert any([isinstance(a.op, cuda.blas.GpuCorrMM)
                    for a in f.maker.fgraph.apply_nodes])

    mode = theano_mode.excluding('local_conv_dnn', 'local_conv_gemm')
    f = theano.function([img, fil], c, mode=mode)
    assert any([isinstance(a.op, cuda.blas.GpuConv)
                for a in f.maker.fgraph.apply_nodes])

    mode = theano_mode.excluding('conv_dnn', 'conv_gemm')
    f = theano.function([img, fil], c, mode=mode)
    assert any([isinstance(a.op, cuda.blas.GpuConv)
                for a in f.maker.fgraph.apply_nodes])
def test_log1msigm_to_softplus(self):
    x = T.matrix()

    out = T.log(1 - sigmoid(x))
    f = theano.function([x], out, mode=self.m)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2
    assert isinstance(topo[0].op.scalar_op,
                      theano.tensor.nnet.sigm.ScalarSoftplus)
    assert isinstance(topo[1].op.scalar_op, theano.scalar.Neg)
    f(numpy.random.rand(54, 11).astype(config.floatX))

    # Same test with a flatten
    out = T.log(1 - T.flatten(sigmoid(x)))
    f = theano.function([x], out, mode=self.m)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 3
    assert isinstance(topo[0].op, T.Flatten)
    assert isinstance(topo[1].op.scalar_op,
                      theano.tensor.nnet.sigm.ScalarSoftplus)
    assert isinstance(topo[2].op.scalar_op, theano.scalar.Neg)
    f(numpy.random.rand(54, 11).astype(config.floatX))

    # Same test with a reshape
    out = T.log(1 - sigmoid(x).reshape([x.size]))
    f = theano.function([x], out, mode=self.m)
    topo = f.maker.fgraph.toposort()
    # assert len(topo) == 3
    assert any(isinstance(node.op, T.Reshape) for node in topo)
    assert any(isinstance(getattr(node.op, 'scalar_op', None),
                          theano.tensor.nnet.sigm.ScalarSoftplus)
               for node in topo)
    f(numpy.random.rand(54, 11).astype(config.floatX))
def test_local_sampling_dot_csr():
    if not theano.config.cxx:
        raise SkipTest("G++ not available, so we need to skip this test.")
    mode = theano.compile.mode.get_default_mode()
    mode = mode.including("specialize", "local_sampling_dot_csr")

    for sp_format in ['csr']:  # Not implemented for other formats
        inputs = [tensor.matrix(),
                  tensor.matrix(),
                  getattr(theano.sparse, sp_format + '_matrix')()]

        f = theano.function(inputs,
                            sparse.sampling_dot(*inputs),
                            mode=mode)

        if theano.config.blas.ldflags:
            assert not any(isinstance(node.op, sparse.SamplingDot)
                           for node in f.maker.fgraph.toposort())
        else:
            # SamplingDotCSR's C implementation needs blas, so it should
            # not be inserted
            assert not any(isinstance(node.op, sparse.opt.SamplingDotCSR)
                           for node in f.maker.fgraph.toposort())
def test_pooling_opt():
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)

    x = T.ftensor4()

    f = theano.function([x],
                        max_pool_2d(x, ds=(2, 2), ignore_border=True),
                        mode=mode_with_gpu)
    assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
                for n in f.maker.fgraph.toposort()])

    f = theano.function([x],
                        T.grad(max_pool_2d(x, ds=(2, 2),
                                           ignore_border=True).sum(), x),
                        mode=mode_with_gpu.including("cudnn"))
    assert any([isinstance(n.op, cuda.dnn.GpuDnnPoolGrad)
                for n in f.maker.fgraph.toposort()])
def _compile_and_check(self, inputs, outputs, numeric_inputs, cls,
                       excluding=None, warn=True, check_topo=True):
    """This tests the infer_shape method only.

    When testing with input values with shapes that take the same value
    over different dimensions (for instance, a square matrix, or a tensor3
    with shape (n, n, n), or (m, n, m)), it is not possible to detect if
    the output shape was computed correctly, or if some shapes with the
    same value have been mixed up. For instance, if the infer_shape uses
    the width of a matrix instead of its height, then testing with only
    square matrices will not detect the problem. If warn=True, we emit a
    warning when testing with such values.

    :param check_topo: If True, we check that the Op was removed from the
        graph. False is useful to test the not-implemented case.

    """
    mode = self.mode
    if excluding:
        mode = mode.excluding(*excluding)
    if warn:
        for var, inp in zip(inputs, numeric_inputs):
            if isinstance(inp, (int, float, list, tuple)):
                inp = var.type.filter(inp)
            if not hasattr(inp, "shape"):
                continue
            # remove broadcasted dims as it is sure they can't be
            # changed to prevent the same dim problem.
            if hasattr(var.type, "broadcastable"):
                shp = [inp.shape[i] for i in range(inp.ndim)
                       if not var.type.broadcastable[i]]
            else:
                shp = inp.shape
            if len(set(shp)) != len(shp):
                _logger.warn(
                    "While testing the shape inference, we received an"
                    " input with a shape that has some repeated values: %s"
                    ", like a square matrix. This makes it impossible to"
                    " check if the values for these dimensions have been"
                    " correctly used, or if they have been mixed up.",
                    str(inp.shape))
                break

    outputs_function = theano.function(inputs, outputs, mode=mode)
    shapes_function = theano.function(inputs, [o.shape for o in outputs],
                                      mode=mode)
    # theano.printing.debugprint(shapes_function)

    # Check that the Op is removed from the compiled function.
    if check_topo:
        topo_shape = shapes_function.maker.fgraph.toposort()
        assert not any(isinstance(t.op, cls) for t in topo_shape)
    topo_out = outputs_function.maker.fgraph.toposort()
    assert any(isinstance(t.op, cls) for t in topo_out)

    # Check that the shape produced agrees with the actual shape.
    numeric_outputs = outputs_function(*numeric_inputs)
    numeric_shapes = shapes_function(*numeric_inputs)
    for out, shape in zip(numeric_outputs, numeric_shapes):
        assert numpy.all(out.shape == shape), (out.shape, shape)
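# A minimal, hypothetical usage sketch of _compile_and_check() above.  It
# assumes the method lives on a test class that also provides `self.mode`
# (theano.tests.unittest_tools.InferShapeTester is used here as that base
# class); the Op class and the test name are illustrative choices, not part
# of the original listing.
import numpy
import theano.tensor as tensor
from theano.tests import unittest_tools as utt


class TestSumInferShape(utt.InferShapeTester):
    def test_sum_infer_shape(self):
        x = tensor.matrix()
        self._compile_and_check(
            [x],                         # symbolic inputs
            [x.sum(axis=0)],             # symbolic outputs
            [numpy.random.rand(5, 7)],   # numeric inputs; non-square on purpose
            tensor.elemwise.Sum)         # Op expected to vanish from the shape graph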
def test_dnn_tag():
    """
    Test that if cudnn isn't available we raise an error, and that if it
    is available, we use it.
    """
    x = T.ftensor4()
    old = theano.config.on_opt_error
    theano.config.on_opt_error = "raise"

    sio = StringIO()
    handler = logging.StreamHandler(sio)
    logging.getLogger("theano.compile.tests.test_dnn").addHandler(handler)
    # Silence original handler when intentionally generating warning messages
    logging.getLogger("theano").removeHandler(theano.logging_default_handler)
    raised = False
    try:
        f = theano.function([x],
                            max_pool_2d(x, ds=(2, 2), ignore_border=True),
                            mode=mode_with_gpu.including("cudnn"))
    except (AssertionError, RuntimeError):
        assert not cuda.dnn.dnn_available()
        raised = True
    finally:
        theano.config.on_opt_error = old
        logging.getLogger(
            "theano.compile.tests.test_dnn").removeHandler(handler)
        logging.getLogger("theano").addHandler(theano.logging_default_handler)

    if not raised:
        assert cuda.dnn.dnn_available()
        assert any([isinstance(n.op, cuda.dnn.GpuDnnPool)
                    for n in f.maker.fgraph.toposort()])
def test_neibs_manual(self):
    shape = (2, 3, 4, 4)
    for dtype in self.dtypes:
        images = shared(
            numpy.arange(numpy.prod(shape), dtype=dtype).reshape(shape))
        neib_shape = T.as_tensor_variable((2, 2))

        for border in ['valid', 'ignore_borders']:
            f = function([],
                         images2neibs(images, neib_shape, mode=border),
                         mode=self.mode)
            assert any([isinstance(node.op, self.op)
                        for node in f.maker.fgraph.toposort()])

            # print images.get_value(borrow=True)
            neibs = f()
            # print neibs
            assert numpy.allclose(
                neibs,
                [[0, 1, 4, 5], [2, 3, 6, 7], [8, 9, 12, 13],
                 [10, 11, 14, 15], [16, 17, 20, 21], [18, 19, 22, 23],
                 [24, 25, 28, 29], [26, 27, 30, 31], [32, 33, 36, 37],
                 [34, 35, 38, 39], [40, 41, 44, 45], [42, 43, 46, 47],
                 [48, 49, 52, 53], [50, 51, 54, 55], [56, 57, 60, 61],
                 [58, 59, 62, 63], [64, 65, 68, 69], [66, 67, 70, 71],
                 [72, 73, 76, 77], [74, 75, 78, 79], [80, 81, 84, 85],
                 [82, 83, 86, 87], [88, 89, 92, 93], [90, 91, 94, 95]])
            g = function([],
                         neibs2images(neibs, neib_shape, images.shape),
                         mode=self.mode)

            assert numpy.allclose(images.get_value(borrow=True), g())
def test_neibs(self):
    for shape, pshape in [((10, 7, 18, 18), (2, 2)),
                          ((10, 7, 6, 18), (3, 2)),
                          ((5, 7, 66, 66), (33, 33)),
                          ((5, 7, 68, 66), (34, 33))]:
        for border in ['valid', 'ignore_borders']:
            for dtype in self.dtypes:
                images = shared(
                    numpy.arange(numpy.prod(shape),
                                 dtype=dtype).reshape(shape))
                neib_shape = T.as_tensor_variable(pshape)

                f = function([],
                             images2neibs(images, neib_shape, mode=border),
                             mode=self.mode)

                # print images.get_value(borrow=True)
                neibs = f()
                # print neibs
                g = function([],
                             neibs2images(neibs, neib_shape, images.shape),
                             mode=self.mode)
                assert any([isinstance(node.op, self.op)
                            for node in f.maker.fgraph.toposort()])

                # print g()
                assert numpy.allclose(images.get_value(borrow=True), g())
def local_opt(node):
    dev = theano.sandbox.gpuarray.init_dev.device
    if cuda_only and not dev.startswith('cuda'):
        return

    if type(node.op) in OP:
        # Either one of our inputs is on the gpu or
        # all of our clients are on the gpu
        if (any([i.owner and i.owner.op == host_from_gpu
                 for i in node.inputs]) or
            all([c != 'output' and c.op == gpu_from_host
                 for c, idx in node.outputs[0].clients])):
            new_op = maker(node)
            # This is needed as sometimes new_op inherits from OP.
            if new_op and new_op != node.op:
                if isinstance(new_op, theano.Op):
                    return [safe_to_cpu(o) for o in
                            new_op(*node.inputs, return_list=True)]
                elif isinstance(new_op, (tuple, list)):
                    return [safe_to_cpu(o) for o in new_op]
                else:  # suppose it is a variable on the GPU
                    return [host_from_gpu(new_op)]
    return False
def body(mode, gpu):
    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.MultinomialFromUniform('auto')(p, u)
    f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)
    if gpu:
        assert any([type(node.op) is multinomial.GpuMultinomialFromUniform
                    for node in f.maker.fgraph.toposort()])

    pval = numpy.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = numpy.ones_like(pval[:, 0]) * 0.5
    mval = f(pval, uval)

    assert mval.shape == pval.shape
    if config.cast_policy == 'custom':
        assert mval.dtype == pval.dtype
    elif config.cast_policy == 'numpy+floatX':
        assert mval.dtype == config.floatX
    elif config.cast_policy == 'numpy':
        assert mval.dtype == 'float64'
    else:
        raise NotImplementedError(config.cast_policy)
    assert numpy.allclose(mval.sum(axis=1), 2)
    asdf = numpy.asarray([0, 0, 2, 0]) + 0 * pval
    assert numpy.allclose(mval, asdf)  # broadcast over all rows
def body(mode, gpu):
    # the m*2 allows the multinomial to reuse output
    f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)

    if gpu:
        assert any([type(node.op) is multinomial.GpuMultinomialFromUniform
                    for node in f.maker.fgraph.toposort()])

    # test that both first and second samples can be drawn
    assert numpy.allclose(f([[1, 0], [0, 1]], [.1, .1]),
                          [[2, 0], [0, 2]])

    # test that both second labels can be drawn
    r = f([[.2, .8], [.3, .7]], [.31, .31])
    assert numpy.allclose(r, [[0, 2], [0, 2]]), r

    # test that both first labels can be drawn
    r = f([[.2, .8], [.3, .7]], [.21, .21])
    assert numpy.allclose(r, [[0, 2], [2, 0]]), r

    # change the size to make sure output gets reallocated ok
    # and also make sure that the GPU version doesn't screw up the
    # transposed-ness
    r = f([[.2, .8]], [.25])
    assert numpy.allclose(r, [[0, 2]]), r
def local_gpu_multinomial(node):
    if type(node.op) is MultinomialFromUniform:
        p, u = node.inputs
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
            any([i.owner and
                 isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu)
                 for i in node.inputs])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [host_from_gpu(gpu_op(*[gpu_from_host(i)
                                           for i in node.inputs])).T]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        p, u = multi.inputs
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in multi.inputs]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
def list_of_nodes(inputs, outputs):
    """Return the apply nodes of the graph between inputs and outputs."""
    return stack_search(
        deque([o.owner for o in outputs]),
        lambda o: [inp.owner for inp in o.inputs
                   if inp.owner
                   and not any(i in inp.owner.outputs for i in inputs)])
def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
                    apply_time, apply_cimpl, message, outputs_size,
                    other_time):
    # Scan overhead profile
    if any([isinstance(node.op, Scan) and v > 0
            for (_, node), v in apply_time.items()]):
        print
        print 'Scan overhead:'
        print ('<Scan op time(s)> <sub scan fct time(s)> <sub scan op '
               'time(s)> <sub scan fct time(% scan op time)> <sub scan '
               'op time(% scan op time)> <node>')
        total_super_scan_time = 0
        total_scan_fct_time = 0
        total_scan_op_time = 0
        for (_, node), v in apply_time.items():
            if isinstance(node.op, Scan):
                if v > 0:
                    scan_fct_time = node.op.mode_instance.fn_time
                    scan_op_time = node.op.mode_instance.local_time
                    total_super_scan_time += v
                    total_scan_fct_time += scan_fct_time
                    total_scan_op_time += scan_op_time
                    print ' %5.1fs %5.1fs %5.1fs %5.1f%% %5.1f%%' % (
                        v,
                        scan_fct_time,
                        scan_op_time,
                        scan_fct_time / v * 100,
                        scan_op_time / v * 100), node
                else:
                    print (' The node took 0s, so we can not compute the '
                           'overhead'), node
        print ' total %5.1fs %5.1fs %5.1fs %5.1f%% %5.1f%%' % (
            total_super_scan_time,
            total_scan_fct_time,
            total_scan_op_time,
            total_scan_fct_time / total_super_scan_time * 100,
            total_scan_op_time / total_super_scan_time * 100)
def test_logical_shapes(self):
    seed_rng()
    for stride in range(1, 4):
        kshp = (10, 2, 10, 10)
        featshp = (3, 10, 11, 11)

        a = tensor.ftensor4()
        A = tensor.ftensor4()

        # Need to transpose first two dimensions of kernel, and reverse
        # index kernel image dims (for correlation)
        kernel_rotated = tensor.transpose(A, axes=[1, 0, 2, 3])

        featshp_logical = (featshp[0], featshp[1],
                           featshp[2] * stride, featshp[3] * stride)
        kshp_rotated = (kshp[1], kshp[0], kshp[2], kshp[3])
        # print featshp, kshp_rotated, featshp_logical[1:], kshp[2:]
        image_estimate = tensor.nnet.conv2d(a, kernel_rotated,
                                            border_mode='full',
                                            image_shape=featshp,
                                            filter_shape=kshp_rotated,
                                            imshp_logical=featshp_logical[1:],
                                            kshp_logical=kshp[2:])

        func = theano.function([a, A], image_estimate, mode=mode_with_gpu)
        # theano.printing.debugprint(func,)
        assert any([isinstance(node.op, GpuConv)
                    for node in func.maker.fgraph.toposort()])

        a_in = numpy.random.randn(*featshp).astype("float32")
        A_in = numpy.random.randn(*kshp).astype("float32")

        func(a_in, A_in)
def cmp(a_shp, b_shp):
    a0 = my_rand(*a_shp)
    a = tcn.shared_constructor(a0, 'a')
    cval = my_rand(a_shp[0], b_shp[1])
    c = tcn.shared_constructor(cval.copy(), 'c')

    b = tcn.fmatrix('b')
    b2 = tcn.fmatrix('b2')

    f = pfunc([b, b2],
              [tensor.dot(a, b2) + c],
              updates=[(a, tensor.dot(a, b) + c)],
              mode=mode_with_gpu)

    assert any([node.op == tcn.blas.gpu_gemm_no_inplace
                for node in f.maker.fgraph.toposort()])
    bval = my_rand(*b_shp)
    bval2 = my_rand(*b_shp)
    rval = f(bval, bval2)

    assert numpy.allclose(numpy.dot(a0, bval) + cval, a.get_value())
    assert numpy.allclose(numpy.dot(a0, bval2) + cval, rval)

    # Try with a matrix equal to a0, but with strides in both dims
    a.set_value(a0)
    a.set_value(
        a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
        borrow=True)
    f(bval, bval2)
def _toposort(edges):
    """
    Topological sort algorithm by Kahn [1] - O(nodes + edges)

    inputs:
        edges - a dict of the form {a: {b, c}} where b and c depend on a
    outputs:
        L - an ordered list of nodes that satisfy the dependencies of edges

    >>> _toposort({1: {2, 3}, 2: (3, )})
    [1, 2, 3]

    Closely follows the wikipedia page [2]

    [1] Kahn, Arthur B. (1962), "Topological sorting of large networks",
        Communications of the ACM
    [2] http://en.wikipedia.org/wiki/Toposort#Algorithms
    """
    incoming_edges = reverse_dict(edges)
    incoming_edges = dict((k, set(val))
                          for k, val in incoming_edges.items())
    S = set((v for v in edges if v not in incoming_edges))
    L = []

    while S:
        n = S.pop()
        L.append(n)
        for m in edges.get(n, ()):
            assert n in incoming_edges[m]
            incoming_edges[m].remove(n)
            if not incoming_edges[m]:
                S.add(m)
    if any(incoming_edges.get(v, None) for v in edges):
        raise ValueError("Input has cycles")
    return L
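# _toposort() above relies on a reverse_dict() helper that is not shown in
# this listing.  A minimal sketch of what such a helper could look like
# (the real implementation may differ):
def reverse_dict(d):
    """Reverse the direction of a dependence dict.

    e.g. {'a': (1, 2), 'b': (2, 3), 'c': ()} becomes
         {1: ('a',), 2: ('a', 'b'), 3: ('b',)}
    """
    result = {}
    for key in d:
        for val in d[key]:
            # accumulate, as several keys may point at the same value
            result[val] = result.get(val, tuple()) + (key,)
    return result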
def local_gpua_subtensor(node):
    x = node.inputs[0]
    if (x.owner and isinstance(x.owner.op, HostFromGpu)):
        gpu_x = x.owner.inputs[0]
        if (gpu_x.owner and
                isinstance(gpu_x.owner.op, GpuFromHost) and
                # And it is a shared var or an input of the graph.
                not gpu_x.owner.inputs[0].owner):
            if len(x.clients) == 1:
                if any([n == 'output' or
                        any([isinstance(v.type, GpuArrayType)
                             for v in n.inputs + n.outputs])
                        for n, _ in node.outputs[0].clients]):
                    return
                else:
                    return [host_from_gpu(gpu_from_host(node.outputs[0]))]

    return GpuSubtensor(node.op.idx_list)
def compile_args():
    """
    These args will be received by compile_str() in the preargs parameter.
    They will also be included in the "hard" part of the key module.
    """
    flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
    if config.nvcc.fastmath:
        flags.append('-use_fast_math')
    cuda_ndarray_cuh_hash = hash_from_file(
        os.path.join(os.path.split(__file__)[0], 'cuda_ndarray.cuh'))
    flags.append('-DCUDA_NDARRAY_CUH=' + cuda_ndarray_cuh_hash)

    # NumPy 1.7 deprecated the old API.  I updated most of the places to
    # use the new API, but not everywhere.  When finished, enable the
    # following macro to assert that we don't bring in new code that uses
    # the old API.
    flags.append("-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION")

    # NumPy 1.7 deprecated the following macros, but they didn't exist in
    # older NumPy versions, so define the new names there.
    numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
    if bool(numpy_ver < [1, 7]):
        flags.append("-D NPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
        flags.append("-D NPY_ARRAY_ALIGNED=NPY_ALIGNED")
        flags.append("-D NPY_ARRAY_WRITEABLE=NPY_WRITEABLE")
        flags.append("-D NPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL")
        flags.append("-D NPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS")
        flags.append("-D NPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS")

    # If the user didn't specify architecture flags, add them.
    if not any(['-arch=sm_' in f for f in flags]):
        # We compile cuda_ndarray.cu during import.
        # We should not add device properties at that time,
        # as the device is not selected yet!
        # TODO: re-compile cuda_ndarray when we bind to a GPU?
        import theano.sandbox.cuda
        if hasattr(theano.sandbox, 'cuda'):
            n = theano.sandbox.cuda.use.device_number
            if n is None:
                _logger.warn(
                    "We try to get compilation arguments for CUDA"
                    " code, but the GPU device is not initialized."
                    " This is probably caused by an Op that works on"
                    " the GPU but doesn't inherit from GpuOp."
                    " We initialize the GPU now.")
                theano.sandbox.cuda.use(
                    "gpu",
                    force=True,
                    default_to_move_computation_to_gpu=False,
                    move_shared_float32_to_gpu=False,
                    enable_cuda=False)
                n = theano.sandbox.cuda.use.device_number
            p = theano.sandbox.cuda.device_properties(n)
            flags.append('-arch=sm_' + str(p['major']) + str(p['minor']))

    return flags
def filter_nvcc_flags(s):
    assert isinstance(s, str)
    flags = [flag for flag in s.split(' ') if flag]
    if any([f for f in flags if not f.startswith("-")]):
        raise ValueError(
            "Theano nvcc.flags support only parameter/value pairs without"
            " space between them. e.g.: '--machine 64' is not supported,"
            " but '--machine=64' is supported. Please add the '=' symbol."
            " nvcc.flags value is '%s'" % s)
    return ' '.join(flags)
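# Hypothetical illustration of filter_nvcc_flags() above: flags whose value
# is attached with '=' pass through (extra spaces are collapsed), while a
# space-separated value is rejected because '64' has no leading '-'.
assert filter_nvcc_flags('-O3  --machine=64') == '-O3 --machine=64'
try:
    filter_nvcc_flags('--machine 64')
except ValueError:
    pass  # expected: value given with a space instead of '='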
def shape_of_variables(fgraph, input_shapes):
    """
    Compute the numeric shape of all intermediate variables given input
    shapes.

    Inputs:
        fgraph - the theano.FunctionGraph in question
        input_shapes - a dict mapping input to shape

    Outputs:
        shapes - a dict mapping variable to shape

    WARNING : This modifies the fgraph. Not pure.

    >>> import theano
    >>> x = theano.tensor.matrix('x')
    >>> y = x[512:]; y.name = 'y'
    >>> fgraph = theano.FunctionGraph([x], [y], clone=False)
    >>> shape_of_variables(fgraph, {x: (1024, 1024)})
    {y: (512, 1024), x: (1024, 1024)}
    """
    if not hasattr(fgraph, 'shape_feature'):
        fgraph.attach_feature(theano.tensor.opt.ShapeFeature())

    input_dims = [dimension for inp in fgraph.inputs
                  for dimension in fgraph.shape_feature.shape_of[inp]]

    output_dims = [dimension
                   for shape in fgraph.shape_feature.shape_of.values()
                   for dimension in shape]

    compute_shapes = theano.function(input_dims, output_dims)

    if any([i not in fgraph.inputs for i in input_shapes.keys()]):
        raise ValueError(
            "input_shapes keys aren't in the fgraph.inputs. FunctionGraph()"
            " interface changed. Now by default, it clones the graph it"
            " receives. To have the old behavior, give it the new parameter"
            " `clone=False`.")

    numeric_input_dims = [dim for inp in fgraph.inputs
                          for dim in input_shapes[inp]]
    numeric_output_dims = compute_shapes(*numeric_input_dims)

    sym_to_num_dict = dict(zip(output_dims, numeric_output_dims))

    l = {}
    for var in fgraph.shape_feature.shape_of:
        l[var] = tuple(sym_to_num_dict[sym]
                       for sym in fgraph.shape_feature.shape_of[var])
    return l
def get_flags(*types):
    def get_dtype(t):
        if isinstance(t, (str, unicode)):
            return numpy.dtype(t)
        elif isinstance(t, Type):
            return t.dtype
        elif isinstance(t, Variable):
            return t.type.dtype
        else:
            raise TypeError("can't get a dtype from %s" % (type(t),))
    dtypes = [get_dtype(t) for t in types]
    flags = dict(cluda=True)
    if any(d == numpy.float64 for d in dtypes):
        flags['have_double'] = True
    if any(d.itemsize < 4 for d in dtypes):
        flags['have_small'] = True
    if any(d.kind == 'c' for d in dtypes):
        flags['have_complex'] = True
    if any(d == numpy.float16 for d in dtypes):
        flags['have_half'] = True
    return flags
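# Hypothetical example calls to get_flags() above; plain dtype strings are
# enough to trigger the corresponding capability flags.
get_flags('float32')           # -> {'cluda': True}
get_flags('float64', 'int8')   # -> {'cluda': True, 'have_double': True,
                               #     'have_small': True}
get_flags('complex64')         # -> {'cluda': True, 'have_complex': True}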
def test_nvidia_driver3():
    """
    Test that the gpu device is initialized by theano when we build a
    function with a gpu op.

    The driver should always be tested during theano initialization of
    the gpu device.
    """
    var = cuda.fvector()
    f = theano.function([var], var + 1, mode=mode_with_gpu,
                        profile=False)
    topo = f.maker.fgraph.toposort()
    assert any([isinstance(node.op, cuda.GpuElemwise) for node in topo])
    assert theano.sandbox.cuda.use.device_number is not None
def test_local_csm_properties_csm():
    data = tensor.vector()
    indices, indptr, shape = (tensor.ivector(), tensor.ivector(),
                              tensor.ivector())
    mode = theano.compile.mode.get_default_mode()
    mode = mode.including("specialize", "local_csm_properties_csm")
    for CS, cast in [(sparse.CSC, sp.csc_matrix),
                     (sparse.CSR, sp.csr_matrix)]:
        f = theano.function([data, indices, indptr, shape],
                            sparse.csm_properties(
                                CS(data, indices, indptr, shape)),
                            mode=mode)
        assert not any(
            isinstance(node.op, (sparse.CSM, sparse.CSMProperties))
            for node in f.maker.fgraph.toposort())
        v = cast(random_lil((10, 40), config.floatX, 3))
        f(v.data, v.indices, v.indptr, v.shape)
def test_local_mul_s_v():
    if not theano.config.cxx:
        raise SkipTest("G++ not available, so we need to skip this test.")
    mode = theano.compile.mode.get_default_mode()
    mode = mode.including("specialize", "local_mul_s_v")

    for sp_format in ['csr']:  # Not implemented for other formats
        inputs = [getattr(theano.sparse, sp_format + '_matrix')(),
                  tensor.vector()]

        f = theano.function(inputs,
                            sparse.mul_s_v(*inputs),
                            mode=mode)

        assert not any(isinstance(node.op, sparse.MulSV)
                       for node in f.maker.fgraph.toposort())
def test_local_hard_sigmoid(self):
    x = tensor.matrix('x')
    s = sigmoid(x)

    mode = self.get_mode('local_hard_sigmoid')
    f = theano.function([x], s, mode=mode)
    topo = f.maker.fgraph.toposort()
    assert topo[0].op == sigmoid
    assert len(topo) == 1

    mode = self.get_mode().including('local_hard_sigmoid')
    f = theano.function([x], s, mode=mode)
    topo = f.maker.fgraph.toposort()
    assert len(topo) > 1
    assert not any([n.op == sigmoid for n in topo])

    ux_v = f([[-50, -10, -4, -1, 0, 1, 4, 10, 50]])
def __str__(self):
    if self.name:
        return self.name
    else:
        b = self.broadcastable
        named_broadcastable = {(): 'scalar',
                               (False,): 'vector',
                               (False, True): 'col',
                               (True, False): 'row',
                               (False, False): 'matrix'}
        if b in named_broadcastable:
            bcast = named_broadcastable[b]
        else:
            if any(b):
                bcast = str(b)
            else:
                bcast = '%iD' % len(b)
        return "TensorType(%s, %s)" % (str(self.dtype), bcast)
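# Illustrative output of the __str__ method above, assuming it belongs to a
# TensorType-like class with `dtype` and `broadcastable` attributes (the
# class itself is not shown in this listing):
#   str(TensorType('float64', (False, False)))       -> 'TensorType(float64, matrix)'
#   str(TensorType('float32', (True, False)))        -> 'TensorType(float32, row)'
#   str(TensorType('int64', (False, False, False)))  -> 'TensorType(int64, 3D)'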
def test_local_csm_grad_c():
    raise SkipTest("Opt disabled as it doesn't support unsorted indices")
    if not theano.config.cxx:
        raise SkipTest("G++ not available, so we need to skip this test.")
    data = tensor.vector()
    indices, indptr, shape = (tensor.ivector(), tensor.ivector(),
                              tensor.ivector())
    mode = theano.compile.mode.get_default_mode()

    if theano.config.mode == 'FAST_COMPILE':
        mode = theano.compile.Mode(linker='c|py', optimizer='fast_compile')

    mode = mode.including("specialize", "local_csm_grad_c")
    for CS, cast in [(sparse.CSC, sp.csc_matrix),
                     (sparse.CSR, sp.csr_matrix)]:
        cost = tensor.sum(sparse.DenseFromSparse()(CS(data, indices, indptr,
                                                      shape)))
        f = theano.function([data, indices, indptr, shape],
                            tensor.grad(cost, data),
                            mode=mode)
        assert not any(isinstance(node.op, sparse.CSMGrad)
                       for node in f.maker.fgraph.toposort())
        v = cast(random_lil((10, 40), config.floatX, 3))
        f(v.data, v.indices, v.indptr, v.shape)
def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
            iters=10, order='C'):
    """
    :param execute: If True, execute a Theano function that should call gemm.
    :param verbose: If True, will print some Theano flags and env variables.
    :param M,N,K: The M,N,K size used by gemm.
    :param iters: The number of calls to gemm to do.

    :return: a tuple (execution time,
                      str that represents the implementation used)
    """

    if verbose:
        print 'Some Theano flags:'
        print '    blas.ldflags=', theano.config.blas.ldflags
        print '    compiledir=', theano.config.compiledir
        print '    floatX=', theano.config.floatX
        print '    device=', theano.config.device
        print 'Some OS information:'
        print '    sys.platform=', sys.platform
        print '    sys.version=', sys.version
        print '    sys.prefix=', sys.prefix
        print 'Some environment variables:'
        print '    MKL_NUM_THREADS=', os.getenv('MKL_NUM_THREADS')
        print '    OMP_NUM_THREADS=', os.getenv('OMP_NUM_THREADS')
        print '    GOTO_NUM_THREADS=', os.getenv('GOTO_NUM_THREADS')
        print
        print ('Numpy config: (used when the Theano flag'
               ' "blas.ldflags" is empty)')
        numpy.show_config()
        print 'Numpy dot module:', numpy.dot.__module__
        print 'Numpy location:', numpy.__file__
        print 'Numpy version:', numpy.__version__
        if (theano.config.device.startswith("gpu") or
                theano.config.init_gpu_device.startswith("gpu")):
            print 'nvcc version:'
            subprocess.call((theano.sandbox.cuda.nvcc_compiler.nvcc_path,
                             "--version"))
            print

    a = theano.shared(numpy.ones((M, N), dtype=theano.config.floatX,
                                 order=order))
    b = theano.shared(numpy.ones((N, K), dtype=theano.config.floatX,
                                 order=order))
    c = theano.shared(numpy.ones((M, K), dtype=theano.config.floatX,
                                 order=order))
    f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])

    if any([x.op.__class__.__name__ == 'Gemm' for x in
            f.maker.fgraph.toposort()]):
        c_impl = [hasattr(thunk, 'cthunk')
                  for node, thunk in zip(f.fn.nodes, f.fn.thunks)
                  if node.op.__class__.__name__ == "Gemm"]
        assert len(c_impl) == 1
        if c_impl[0]:
            impl = 'CPU (with direct Theano binding to blas)'
        else:
            impl = ('CPU (without direct Theano binding to blas but with'
                    ' numpy/scipy binding to blas)')
    elif any([x.op.__class__.__name__ == 'GpuGemm' for x in
              f.maker.fgraph.toposort()]):
        impl = 'GPU'
    else:
        impl = 'ERROR, unable to tell if Theano used the cpu or the gpu:\n'
        impl += str(f.maker.fgraph.toposort())

    t0 = 0
    t1 = -1

    if execute:
        sync = (hasattr(theano, "sandbox") and
                hasattr(theano.sandbox, "cuda") and
                theano.sandbox.cuda.cuda_available)
        t0 = time.time()
        for i in range(iters):
            f()
        if sync:
            theano.sandbox.cuda.synchronize()
        t1 = time.time()
    return t1 - t0, impl
def depends((a, b)):
    """Returns True if a depends on b."""
    return (any(bout in a.inputs for bout in b.outputs) or
            any(depends((ainp.owner, b)) for ainp in a.inputs
                if ainp.owner))
def test_downsample():
    shps = [(1, 1, 1, 12), (1, 1, 2, 2), (1, 1, 1, 1), (1, 1, 4, 4),
            (1, 1, 10, 11), (1, 2, 2, 2), (3, 5, 4, 4), (25, 1, 7, 7),
            (1, 1, 12, 12), (1, 1, 2, 14), (1, 1, 12, 14), (1, 1, 14, 14),
            (1, 1, 16, 16), (1, 1, 18, 18), (1, 1, 24, 24), (1, 6, 24, 24),
            (10, 1, 24, 24), (10, 6, 24, 24), (30, 6, 12, 12),
            (30, 2, 24, 24), (30, 6, 24, 24), (10, 10, 10, 11),
            (1, 1, 10, 1025), (1, 1, 10, 1023), (1, 1, 1025, 10),
            (1, 1, 1023, 10), (65536, 1, 10, 10), (1, 65536, 10, 10)]

    numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)

    for shp in shps:
        for ds in (2, 2), (3, 2), (1, 1):
            if ds[0] > shp[2]:
                continue
            if ds[1] > shp[3]:
                continue

            # GpuDownsampleFactorMax doesn't like having more than 512
            # columns in the output tensor.
            if float(shp[3]) / ds[1] > 512:
                continue

            for ignore_border in (True, False):
                # print 'test_downsample', shp, ds, ignore_border
                ds_op = DownsampleFactorMax(ds, ignore_border=ignore_border)

                a = tcn.shared_constructor(my_rand(*shp), 'a')
                f = pfunc([], ds_op(tensor.as_tensor_variable(a)),
                          mode=mode_with_gpu.excluding('cudnn'))
                f2 = pfunc([], ds_op(tensor.as_tensor_variable(a)),
                           mode=mode_without_gpu)
                assert any([isinstance(node.op,
                                       tcn.blas.GpuDownsampleFactorMax)
                            for node in f.maker.fgraph.toposort()])
                assert any([isinstance(node.op, DownsampleFactorMax)
                            for node in f2.maker.fgraph.toposort()])
                assert numpy.allclose(f(), f2())

                # The grad is too slow on GT220 GPU.
                # This causes the computer to freeze...
                # Remove this when it gets optimized enough.
                # This only bypasses the last 2 checks.
                # Those tests were passing in all modes on a GTX470.
                if shp[0] > 30000 or shp[1] > 30000:
                    continue

                g = pfunc([],
                          tensor.grad(
                              ds_op(tensor.as_tensor_variable(a)).sum(), a),
                          mode=mode_with_gpu.excluding('cudnn'))
                g2 = pfunc([],
                           tensor.grad(
                               ds_op(tensor.as_tensor_variable(a)).sum(), a),
                           mode=mode_without_gpu)
                assert any([isinstance(node.op,
                                       tcn.blas.GpuDownsampleFactorMaxGrad)
                            for node in g.maker.fgraph.toposort()])
                assert any([isinstance(node.op, DownsampleFactorMaxGrad)
                            for node in g2.maker.fgraph.toposort()])
                assert numpy.allclose(g(), g2()), shp

                ggf = gradient.Lop(
                    tensor.grad(
                        (ds_op(tensor.as_tensor_variable(a)) ** 2).sum(), a),
                    a, a)

                ref_mode = copy.copy(mode_without_gpu)
                ref_mode.check_py_code = False
                gpu_mode = copy.copy(mode_with_gpu)
                gpu_mode.check_py_code = False

                gg = pfunc([], ggf, mode=gpu_mode)
                gg2 = pfunc([], ggf, mode=ref_mode)
                assert any([isinstance(
                    node.op, tcn.blas.GpuDownsampleFactorMaxGradGrad)
                    for node in gg.maker.fgraph.toposort()])
                assert any([isinstance(node.op, DownsampleFactorMaxGradGrad)
                            for node in gg2.maker.fgraph.toposort()])
                assert numpy.allclose(gg(), gg2()), shp