def test_neibs_gpu():
    """Round-trip images through images2neibs / neibs2images in GPU mode.

    For each (image shape, neighbourhood shape) pair the test compiles
    images2neibs twice with ``mode_with_gpu``, checks that a
    GpuImages2Neibs node is present, compares both results, then rebuilds
    the images with neibs2images and compares them with the input.

    Raises SkipTest when ``cuda.cuda_available`` is falsy.
    """
    # Truthiness test instead of ``== False``: also skips on other falsy
    # values such as None.
    if not cuda.cuda_available:
        raise SkipTest('Optional package cuda disabled')

    cases = [((100, 40, 18, 18), (2, 2)),
             ((100, 40, 6, 18), (3, 2)),
             ((10, 40, 66, 66), (33, 33)),
             ((10, 40, 68, 66), (34, 33))]

    for shape, pshape in cases:
        data = numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape)
        images = shared(data)
        neib_shape = T.as_tensor_variable(pshape)

        f = function([], images2neibs(images, neib_shape),
                     mode=mode_with_gpu)
        f_gpu = function([], images2neibs(images, neib_shape),
                         mode=mode_with_gpu)
        assert any(isinstance(node.op, GpuImages2Neibs)
                   for node in f_gpu.maker.env.toposort())

        neibs = numpy.asarray(f_gpu())
        assert numpy.allclose(neibs, f())

        g = function([], neibs2images(neibs, neib_shape, images.shape),
                     mode=mode_with_gpu)
        # NOTE(review): this inspects ``f`` again, not ``g`` -- confirm
        # whether the inverse graph was meant to be checked here.
        assert any(isinstance(node.op, GpuImages2Neibs)
                   for node in f.maker.env.toposort())
        assert numpy.allclose(images.get_value(borrow=True), g())
def test_downsample(): shps = [ (1, 1, 1, 12), (1, 1, 2, 2), (1, 1, 1, 1), (1, 1, 4, 4), (1, 1, 10, 11), (1, 2, 2, 2), (3, 5, 4, 4), (25, 1, 7, 7), (1, 1, 12, 12), (1, 1, 2, 14), (1, 1, 12, 14), (1, 1, 14, 14), (1, 1, 16, 16), (1, 1, 18, 18), (1, 1, 24, 24), (1, 6, 24, 24), (10, 1, 24, 24), (10, 6, 24, 24), (30, 6, 12, 12), (30, 2, 24, 24), (30, 6, 24, 24), (10, 10, 10, 11), (1, 1, 10, 1025), (1, 1, 10, 1023), (1, 1, 1025, 10), (1, 1, 1023, 10), ] numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps) for shp in shps: for ds in (2, 2), (3, 2), (1, 1): if ds[0] > shp[2]: continue if ds[1] > shp[3]: continue # GpuDownsampleFactorMax doesn't like having more than 512 columns # in the output tensor. if float(shp[3]) / ds[1] > 512: continue for ignore_border in (True, False): print "test_downsample", shp, ds, ignore_border ds_op = DownsampleFactorMax(ds, ignore_border=ignore_border) a = tcn.shared_constructor(my_rand(*shp), "a") f = pfunc([], ds_op(tensor.as_tensor_variable(a)), mode=mode_with_gpu) f2 = pfunc([], ds_op(tensor.as_tensor_variable(a)), mode=mode_without_gpu) assert any([isinstance(node.op, tcn.blas.GpuDownsampleFactorMax) for node in f.maker.env.toposort()]) assert any([isinstance(node.op, DownsampleFactorMax) for node in f2.maker.env.toposort()]) assert numpy.allclose(f(), f2()) g = pfunc([], tensor.grad(ds_op(tensor.as_tensor_variable(a)).sum(), a), mode=mode_with_gpu) g2 = pfunc([], tensor.grad(ds_op(tensor.as_tensor_variable(a)).sum(), a), mode=mode_without_gpu) assert any( [isinstance(node.op, tcn.blas.GpuDownsampleFactorMaxGrad) for node in g.maker.env.toposort()] ) assert any([isinstance(node.op, DownsampleFactorMaxGrad) for node in g2.maker.env.toposort()]) assert numpy.allclose(g(), g2())
def _compile_and_check(self, inputs, outputs, numeric_inputs, cls,
                       excluding=None):
    """This tests the infer_shape method only.

    :param inputs: symbolic inputs of the graph.
    :param outputs: symbolic outputs of the graph.
    :param numeric_inputs: values fed to the compiled functions.
    :param cls: Op class expected in the outputs graph and absent from
        the shapes-only graph.
    :param excluding: optional names passed to ``self.mode.excluding``.
    """
    mode = self.mode.excluding(*excluding) if excluding else self.mode

    compute_outputs = theano.function(inputs, outputs, mode=mode)
    compute_shapes = theano.function(inputs, [out.shape for out in outputs],
                                     mode=mode)

    # Check that the Op is removed from the shapes-only function...
    shape_ops = [node.op for node in compute_shapes.maker.env.toposort()]
    assert not any(isinstance(op, cls) for op in shape_ops)
    # ...while it is still present in the function computing the outputs.
    output_ops = [node.op for node in compute_outputs.maker.env.toposort()]
    assert any(isinstance(op, cls) for op in output_ops)

    # Check that the shape produced agrees with the actual shape.
    actual_values = compute_outputs(*numeric_inputs)
    inferred_shapes = compute_shapes(*numeric_inputs)
    for value, inferred in zip(actual_values, inferred_shapes):
        assert numpy.all(value.shape == inferred)
def test_default_conv():
    """Just test that we introduce the right GPU convolution version.

    With the default ``theano_mode`` the graph must hold GpuDnnConv when
    cuDNN is available and GpuCorrMM otherwise; with either pair of
    optimizations excluded it must hold GpuConv.
    """
    def has_op(fn, op_cls):
        # True when any apply node of the compiled graph uses op_cls.
        return any(isinstance(node.op, op_cls)
                   for node in fn.maker.fgraph.apply_nodes)

    img = theano.tensor.ftensor4()
    fil = theano.tensor.ftensor4()
    c = theano.tensor.nnet.conv2d(img, fil)

    f = theano.function([img, fil], c, mode=theano_mode)
    if cuda.dnn.dnn_available():
        assert has_op(f, GpuDnnConv)
    else:
        assert has_op(f, cuda.blas.GpuCorrMM)

    excluded_sets = (('local_conv_dnn', 'local_conv_gemm'),
                     ('conv_dnn', 'conv_gemm'))
    for excluded in excluded_sets:
        mode = theano_mode.excluding(*excluded)
        f = theano.function([img, fil], c, mode=mode)
        assert has_op(f, cuda.blas.GpuConv)
def test_pooling_opt():
    """Check that pooling graphs contain the cuda.dnn pooling ops.

    The forward max_pool_2d graph must hold a GpuDnnPool node and its
    gradient graph a GpuDnnPoolGrad node.  Skipped when
    ``cuda.dnn.dnn_available()`` is false.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)

    def graph_has(fn, op_cls):
        return any(isinstance(n.op, op_cls)
                   for n in fn.maker.fgraph.toposort())

    x = T.ftensor4()

    pooled = max_pool_2d(x, ds=(2, 2), ignore_border=True)
    f = theano.function([x], pooled, mode=mode_with_gpu)
    assert graph_has(f, cuda.dnn.GpuDnnPool)

    cost = max_pool_2d(x, ds=(2, 2), ignore_border=True).sum()
    f = theano.function([x], T.grad(cost, x),
                        mode=mode_with_gpu.including("cudnn"))
    assert graph_has(f, cuda.dnn.GpuDnnPoolGrad)
def test_gpu_opt():
    """Check that MultinomialFromUniform is replaced by its GPU op.

    The op output is moved to the GPU with ``cuda.gpu_from_host`` and the
    compiled graph must contain GpuMultinomialFromUniform, first for a
    matrix of probabilities and then for a row.
    """
    if not cuda.cuda_available:
        # Skip test if cuda_ndarray is not available.
        from nose.plugins.skip import SkipTest
        raise SkipTest('Optional package cuda not available')

    p = tensor.fmatrix()
    u = tensor.fvector()

    def check(probs, n_rows):
        # We test the case where we put the op on the gpu when the output
        # is moved to the gpu.
        m = multinomial.MultinomialFromUniform('auto')(probs, u)
        assert m.dtype == 'float32', m.dtype
        m_gpu = cuda.gpu_from_host(m)

        f = function([probs, u], m_gpu, allow_input_downcast=True,
                     mode=get_mode(True))
        assert any(type(node.op) is multinomial.GpuMultinomialFromUniform
                   for node in f.maker.fgraph.toposort())

        pval = numpy.arange(n_rows * 4,
                            dtype='float32').reshape((n_rows, 4)) + 0.1
        pval = pval / pval.sum(axis=1)[:, None]
        uval = numpy.ones_like(pval[:, 0]) * 0.5
        # Only checks that the call runs; the result is not inspected.
        f(pval, uval)

    check(p, 10000)
    # Test with a row, it was failing in the past.
    check(tensor.frow(), 1)
def test_default_conv():
    """Just test that we introduce the right GPU convolution version.

    The default ``theano_mode`` must give GpuDnnConv (cuDNN available) or
    GpuCorrMM (otherwise); excluding either pair of optimizations must
    give GpuConv.
    """
    def graph_uses(compiled, op_cls):
        nodes = compiled.maker.fgraph.apply_nodes
        return any(isinstance(a.op, op_cls) for a in nodes)

    img = theano.tensor.ftensor4()
    fil = theano.tensor.ftensor4()
    c = theano.tensor.nnet.conv2d(img, fil)

    f = theano.function([img, fil], c, mode=theano_mode)
    if cuda.dnn.dnn_available():
        assert graph_uses(f, GpuDnnConv)
    else:
        assert graph_uses(f, cuda.blas.GpuCorrMM)

    for names in [('local_conv_dnn', 'local_conv_gemm'),
                  ('conv_dnn', 'conv_gemm')]:
        mode = theano_mode.excluding(*names)
        f = theano.function([img, fil], c, mode=mode)
        assert graph_uses(f, cuda.blas.GpuConv)
def test_neibs_gpu():
    """Round-trip images through images2neibs / neibs2images in GPU mode.

    Each (image shape, neighbourhood shape) pair is compiled twice with
    ``mode_with_gpu``; the graph must contain a GpuImages2Neibs node,
    both compiled functions must agree, and neibs2images must give back
    the original images.

    Raises SkipTest when ``cuda.cuda_available`` is falsy.
    """
    # Truthiness test instead of ``== False``: also skips on other falsy
    # values such as None.
    if not cuda.cuda_available:
        raise SkipTest('Optional package cuda disabled')

    def has_gpu_neibs(fn):
        return any(isinstance(node.op, GpuImages2Neibs)
                   for node in fn.maker.env.toposort())

    for shape, pshape in [((100, 40, 18, 18), (2, 2)),
                          ((100, 40, 6, 18), (3, 2)),
                          ((10, 40, 66, 66), (33, 33)),
                          ((10, 40, 68, 66), (34, 33))]:
        images = shared(
            numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
        neib_shape = T.as_tensor_variable(pshape)

        f = function([], images2neibs(images, neib_shape),
                     mode=mode_with_gpu)
        f_gpu = function([], images2neibs(images, neib_shape),
                         mode=mode_with_gpu)
        assert has_gpu_neibs(f_gpu)

        neibs = numpy.asarray(f_gpu())
        assert numpy.allclose(neibs, f())

        g = function([], neibs2images(neibs, neib_shape, images.shape),
                     mode=mode_with_gpu)
        # NOTE(review): this inspects ``f`` again, not ``g`` -- confirm
        # whether the inverse graph was meant to be checked here.
        assert has_gpu_neibs(f)
        assert numpy.allclose(images.get_value(borrow=True), g())
def test_local_sampling_dot_csr():
    """Check which sampling-dot op is absent after the csr optimization.

    With blas ldflags configured no SamplingDot node may remain; without
    them no SamplingDotCSR node may be inserted.  Skipped when
    ``theano.config.cxx`` is empty.
    """
    if not theano.config.cxx:
        raise SkipTest("G++ not available, so we need to skip this test.")

    default_mode = theano.compile.mode.get_default_mode()
    mode = default_mode.including("specialize", "local_sampling_dot_csr")

    for sp_format in ['csr']:  # Not implemented for other format
        make_sparse = getattr(theano.sparse, sp_format + '_matrix')
        inputs = [tensor.matrix(), tensor.matrix(), make_sparse()]
        f = theano.function(inputs, sparse.sampling_dot(*inputs), mode=mode)

        if theano.config.blas.ldflags:
            forbidden = sparse.SamplingDot
        else:
            # SamplingDotCSR's C implementation needs blas, so it should
            # not be inserted
            forbidden = sparse.opt.SamplingDotCSR

        assert not any(isinstance(node.op, forbidden)
                       for node in f.maker.fgraph.toposort())
def _compile_and_check(self, inputs, outputs, numeric_inputs, cls,
                       excluding=None, warn=True, check_topo=True):
    """This tests the infer_shape method only

    When testing with input values with shapes that take the same
    value over different dimensions (for instance, a square matrix,
    or a tensor3 with shape (n, n, n), or (m, n, m)), it is not
    possible to detect if the output shape was computed correctly,
    or if some shapes with the same value have been mixed up. For
    instance, if the infer_shape uses the width of a matrix instead
    of its height, then testing with only square matrices will not
    detect the problem. If warn=True, we emit a warning when testing
    with such values.

    :param check_topo: If True, we check that the Op was removed
        from the graph. False is useful to test not implemented case.
    """
    def significant_dims(var, value):
        # Remove broadcasted dims as it is sure they can't be changed to
        # prevent the same dim problem.
        if not hasattr(var.type, "broadcastable"):
            return value.shape
        return [value.shape[i] for i in range(value.ndim)
                if not var.type.broadcastable[i]]

    mode = self.mode.excluding(*excluding) if excluding else self.mode

    if warn:
        for var, inp in zip(inputs, numeric_inputs):
            if isinstance(inp, (int, float, list, tuple)):
                inp = var.type.filter(inp)
            if not hasattr(inp, "shape"):
                continue
            shp = significant_dims(var, inp)
            if len(set(shp)) == len(shp):
                continue
            _logger.warn(
                "While testing the shape inference, we received an"
                " input with a shape that has some repeated values: %s"
                ", like a square matrix. This makes it impossible to"
                " check if the values for these dimensions have been"
                " correctly used, or if they have been mixed up.",
                str(inp.shape))
            # One warning is enough.
            break

    compute_outputs = theano.function(inputs, outputs, mode=mode)
    compute_shapes = theano.function(inputs, [o.shape for o in outputs],
                                     mode=mode)

    # Check that the Op is removed from the compiled function.
    if check_topo:
        shape_nodes = compute_shapes.maker.fgraph.toposort()
        assert not any(isinstance(node.op, cls) for node in shape_nodes)
    output_nodes = compute_outputs.maker.fgraph.toposort()
    assert any(isinstance(node.op, cls) for node in output_nodes)

    # Check that the shape produced agrees with the actual shape.
    actual_values = compute_outputs(*numeric_inputs)
    inferred_shapes = compute_shapes(*numeric_inputs)
    for value, inferred in zip(actual_values, inferred_shapes):
        assert numpy.all(value.shape == inferred)
def _get_kernel_flags(self, *dtypes):
    """Return the '|'-joined kernel flag string for the given dtypes.

    'GA_USE_CLUDA' is always present; 'GA_USE_DOUBLE' is added when any
    dtype equals float64 and 'GA_USE_SMALL' when any dtype has an
    itemsize below 4 bytes.

    :param dtypes: anything accepted by ``numpy.dtype``.
    :return: str with the flags in the order listed above.
    """
    resolved = [numpy.dtype(spec) for spec in dtypes]

    # (flag, predicate) pairs, in the order the flags must appear.
    optional_flags = (
        ('GA_USE_DOUBLE', lambda dt: dt == numpy.float64),
        ('GA_USE_SMALL', lambda dt: dt.itemsize < 4),
    )

    flags = ['GA_USE_CLUDA']
    for flag, applies in optional_flags:
        if any(applies(dt) for dt in resolved):
            flags.append(flag)
    return '|'.join(flags)
def test_GpuCrossentropySoftmax1HotWithBiasDx(): """ This is basic test for GpuCrossentropySoftmax1HotWithBiasDx We check that we loop when their is too much threads TODO: check that we loop when their is too much block(>32*1024) """ n_in = 1000 batch_size = 4097 n_out = 1250 # Seed numpy.random with config.unittests.rseed utt.seed_rng() softmax_output_value = numpy.random.rand(batch_size, n_out).astype("float32") dnll_value = numpy.asarray(numpy.random.rand(batch_size), dtype="float32") y_idx_value = numpy.random.randint(low=0, high=5, size=batch_size) softmax_output = T.fmatrix() softmax_output /= softmax_output.sum(axis=1).reshape(softmax_output.shape[1], 1) op = theano.tensor.nnet.crossentropy_softmax_1hot_with_bias_dx(dnll_value, softmax_output, y_idx_value) cpu_f = theano.function([softmax_output], op, mode=mode_without_gpu) gpu_f = theano.function([softmax_output], op, mode=mode_with_gpu) # theano.printing.debugprint(cpu_f) # theano.printing.debugprint(gpu_f) assert any( [isinstance(node.op, T.nnet.CrossentropySoftmax1HotWithBiasDx) for node in cpu_f.maker.fgraph.toposort()] ) assert any( [isinstance(node.op, cuda.nnet.GpuCrossentropySoftmax1HotWithBiasDx) for node in gpu_f.maker.fgraph.toposort()] ) cpu_out = cpu_f(softmax_output_value) gpu_out = gpu_f(softmax_output_value) rtol = 1e-5 atol = 1e-6 if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol): abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out) scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol) max_i = scaled_err.argmax() print "max err index:", max_i, max_i / batch_size, print max_i % batch_size, max_i / n_out, max_i & n_out print "At that index:" print "err:", scaled_err.flatten()[max_i] print "absolute error:", abs_err.flatten()[max_i] print "relative error:", rel_err.flatten()[max_i] print "cpu_out:", cpu_out.flatten()[max_i] print "gpu_out:", gpu_out.flatten()[max_i] print "softmax_output_value:", softmax_output_value.flatten()[max_i] print "dnll_value:", 
dnll_value[max_i / n_out] print "y_idx_value:", y_idx_value[max_i / n_out] assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (rtol, atol)
def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order='C'): """ :param execute: If True, execute a Theano function that should call gemm. :param verbose: If True, will print some Theano flags and env variables. :param M,N,K: The M,N,K size used by gemm. :param iters: The number of calls to gemm to do. :return: a tuple (execution time, str that represents the implementation used) """ a = theano.shared(numpy.ones((M, N), dtype=theano.config.floatX, order=order)) b = theano.shared(numpy.ones((N, K), dtype=theano.config.floatX, order=order)) c = theano.shared(numpy.ones((M, K), dtype=theano.config.floatX, order=order)) f = theano.function([], updates={c: 0.4 * c + .8 * T.dot(a, b)}) if verbose: print 'Some Theano flags:' print ' blas.ldflags=', theano.config.blas.ldflags print ' compiledir=', theano.config.compiledir print ' floatX=', theano.config.floatX print 'Some environment variables:' print ' MKL_NUM_THREADS=', os.getenv('MKL_NUM_THREADS') print ' OMP_NUM_THREADS=', os.getenv('OMP_NUM_THREADS') print ' GOTO_NUM_THREADS=', os.getenv('GOTO_NUM_THREADS') print print ('Numpy config: (used when the Theano flag' ' "blas.ldflags" is empty)') numpy.show_config() print 'Numpy dot module:', numpy.dot.__module__ print 'Numpy location:', numpy.__file__ print 'Numpy version:', numpy.__version__ print t0 = 0 t1 = -1 if any([x.op.__class__.__name__ == 'Gemm' for x in f.maker.env.toposort()]): impl = 'cpu' elif any([x.op.__class__.__name__ == 'GpuGemm' for x in f.maker.env.toposort()]): impl = 'gpu' else: impl = 'ERROR, unable to tell if Theano used the cpu or the gpu:\n' impl += str(f.maker.env.toposort()) if execute: t0 = time.time() for i in range(iters): f() t1 = time.time() return t1 - t0, impl
def execute(execute=True, verbose=True): a = theano.shared(numpy.ones(shapes, dtype=theano.config.floatX)) b = theano.shared(numpy.ones(shapes, dtype=theano.config.floatX)) c = theano.shared(numpy.ones(shapes, dtype=theano.config.floatX)) f = theano.function([], updates={c: 0.4 * c + .8 * T.dot(a, b)}) if verbose: print 'Some theano flags:' print ' blas.ldflags=', theano.config.blas.ldflags print ' compiledir=', theano.config.compiledir print ' floatX=', theano.config.floatX print 'Some env flags:' print ' MKL_NUM_THREADS=', os.getenv('MKL_NUM_THREADS') print ' OMP_NUM_THREADS=', os.getenv('OMP_NUM_THREADS') print ' GOTO_NUM_THREADS=', os.getenv('GOTO_NUM_THREADS') print print( 'Numpy config: (used when the theano flags' ' "blas.ldflags" is empty)') numpy.show_config() print 'Numpy dot module:', numpy.dot.__module__ print 'Numpy file location that was loaded:', numpy.__file__ print 'Numpy version:', numpy.__version__ print if any([ x.op.__class__.__name__ == 'Gemm' for x in f.maker.env.toposort() ]): print 'Used the cpu' elif any([ x.op.__class__.__name__ == 'GpuGemm' for x in f.maker.env.toposort() ]): print 'Used the gpu' else: print 'ERROR, not able to tell if theano used the cpu or the gpu' print f.maker.env.toposort() t0 = 0 t1 = -1 if execute: t0 = time.time() for i in range(iters): f() t1 = time.time() if verbose and execute: print print 'This execution time took %.2fs' % (t1 - t0) print print( 'Try to run this script a few times. Experience show that' ' the first time is not as fast as followings call. The' ' difference is not big, but consistent.') return t1 - t0
def _compile_and_check(self, inputs, outputs, numeric_inputs, cls,
                       excluding=None, warn=True):
    """This tests the infer_shape method only

    When testing with input values with shapes that take the same
    value over different dimensions (for instance, a square matrix,
    or a tensor3 with shape (n, n, n), or (m, n, m)), it is not
    possible to detect if the output shape was computed correctly,
    or if some shapes with the same value have been mixed up. For
    instance, if the infer_shape uses the width of a matrix instead
    of its height, then testing with only square matrices will not
    detect the problem. If warn=True, we emit a warning when testing
    with such values.
    """
    mode = self.mode
    if excluding:
        mode = mode.excluding(*excluding)

    if warn:
        for var, inp in zip(inputs, numeric_inputs):
            if isinstance(inp, (int, float, list, tuple)):
                inp = var.type.filter(inp)
            if not hasattr(inp, "shape"):
                continue

            # remove broadcasted dims as it is sure they can't be
            # changed to prevent the same dim problem.
            shp = inp.shape
            if hasattr(var.type, "broadcastable"):
                bcast = var.type.broadcastable
                shp = [inp.shape[i] for i in range(inp.ndim)
                       if not bcast[i]]

            has_repeated_dim = len(set(shp)) != len(shp)
            if has_repeated_dim:
                _logger.warn(
                    "While testing the shape inference, we received an"
                    " input with a shape that has some repeated values: %s"
                    ", like a square matrix. This makes it impossible to"
                    " check if the values for these dimensions have been"
                    " correctly used, or if they have been mixed up.",
                    str(inp.shape))
                break

    fn_outputs = theano.function(inputs, outputs, mode=mode)
    fn_shapes = theano.function(inputs, [o.shape for o in outputs],
                                mode=mode)

    # Check that the Op is removed from the compiled function.
    ops_in_shapes = [node.op for node in fn_shapes.maker.fgraph.toposort()]
    assert not any(isinstance(op, cls) for op in ops_in_shapes)
    ops_in_outputs = [node.op for node in fn_outputs.maker.fgraph.toposort()]
    assert any(isinstance(op, cls) for op in ops_in_outputs)

    # Check that the shape produced agrees with the actual shape.
    computed = fn_outputs(*numeric_inputs)
    inferred = fn_shapes(*numeric_inputs)
    for value, shape in zip(computed, inferred):
        assert numpy.all(value.shape == shape)
def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, iters=10, order="C"): """ :param execute: If True, execute a Theano function that should call gemm. :param verbose: If True, will print some Theano flags and env variables. :param M,N,K: The M,N,K size used by gemm. :param iters: The number of calls to gemm to do. :return: a tuple (execution time, str that represents the implementation used) """ if verbose: print "Some Theano flags:" print " blas.ldflags=", theano.config.blas.ldflags print " compiledir=", theano.config.compiledir print " floatX=", theano.config.floatX print "Some environment variables:" print " MKL_NUM_THREADS=", os.getenv("MKL_NUM_THREADS") print " OMP_NUM_THREADS=", os.getenv("OMP_NUM_THREADS") print " GOTO_NUM_THREADS=", os.getenv("GOTO_NUM_THREADS") print print ("Numpy config: (used when the Theano flag" ' "blas.ldflags" is empty)') numpy.show_config() print "Numpy dot module:", numpy.dot.__module__ print "Numpy location:", numpy.__file__ print "Numpy version:", numpy.__version__ print a = theano.shared(numpy.ones((M, N), dtype=theano.config.floatX, order=order)) b = theano.shared(numpy.ones((N, K), dtype=theano.config.floatX, order=order)) c = theano.shared(numpy.ones((M, K), dtype=theano.config.floatX, order=order)) f = theano.function([], updates={c: 0.4 * c + 0.8 * T.dot(a, b)}, mode=theano.compile.ProfileMode()) if any([x.op.__class__.__name__ == "Gemm" for x in f.maker.env.toposort()]): c_impl = f.profile.apply_cimpl.values() assert len(c_impl) == 1 if c_impl[0]: impl = "CPU (with direct Theano binding to blas)" else: impl = "CPU (without direct Theano binding to blas but with numpy/scipy binding to blas)" elif any([x.op.__class__.__name__ == "GpuGemm" for x in f.maker.env.toposort()]): impl = "GPU" else: impl = "ERROR, unable to tell if Theano used the cpu or the gpu:\n" impl += str(f.maker.env.toposort()) t0 = 0 t1 = -1 if execute: t0 = time.time() for i in range(iters): f() t1 = time.time() return t1 - t0, impl
def execute(execute=True, verbose=True): a = theano.shared(numpy.ones(shapes, dtype=theano.config.floatX)) b = theano.shared(numpy.ones(shapes, dtype=theano.config.floatX)) c = theano.shared(numpy.ones(shapes, dtype=theano.config.floatX)) f = theano.function([], updates={c: 0.4 * c + .8 * T.dot(a, b)}) if verbose: print 'Some theano flags:' print ' blas.ldflags=', theano.config.blas.ldflags print ' compiledir=', theano.config.compiledir print ' floatX=', theano.config.floatX print 'Some env flags:' print ' MKL_NUM_THREADS=', os.getenv('MKL_NUM_THREADS') print ' OMP_NUM_THREADS=', os.getenv('OMP_NUM_THREADS') print ' GOTO_NUM_THREADS=', os.getenv('GOTO_NUM_THREADS') print print ('Numpy config: (used when the theano flags' ' "blas.ldflags" is empty)') numpy.show_config() print 'Numpy dot module:', numpy.dot.__module__ print 'Numpy file location that was loaded:', numpy.__file__ print 'Numpy version:', numpy.__version__ print if any([x.op.__class__.__name__ == 'Gemm' for x in f.maker.env.toposort()]): print 'Used the cpu' elif any([x.op.__class__.__name__ == 'GpuGemm' for x in f.maker.env.toposort()]): print 'Used the gpu' else: print 'ERROR, not able to tell if theano used the cpu or the gpu' print f.maker.env.toposort() t0 = 0 t1 = -1 if execute: t0 = time.time() for i in range(iters): f() t1 = time.time() if verbose and execute: print print 'This execution time took %.2fs' % (t1 - t0) print print ('Try to run this script a few times. Experience show that' ' the first time is not as fast as followings call. The' ' difference is not big, but consistent.') return t1 - t0
def _compile_and_check(self, inputs, outputs, numeric_inputs, cls):
    """Check shape inference for the Op class ``cls``.

    Compiles one function for ``outputs`` and one for their shapes with
    ``self.mode``; ``cls`` must appear in the former and be absent from
    the latter, and the inferred shapes must match the shapes of the
    computed values for ``numeric_inputs``.
    """
    def uses_cls(compiled):
        return any(isinstance(node.op, cls)
                   for node in compiled.maker.env.toposort())

    value_fn = theano.function(inputs, outputs, mode=self.mode)
    shape_fn = theano.function(inputs, [out.shape for out in outputs],
                               mode=self.mode)

    # Check that the Op is removed from the compiled function.
    assert not uses_cls(shape_fn)
    assert uses_cls(value_fn)

    # Check that the shape produced agrees with the actual shape.
    values = value_fn(*numeric_inputs)
    shapes = shape_fn(*numeric_inputs)
    for value, shape in zip(values, shapes):
        assert numpy.all(value.shape == shape)
def _toposort(edges):
    """ Topological sort algorithm by Kahn [1] - O(nodes + edges)

    inputs:
        edges - a dict of the form {a: {b, c}} where b and c depend on a
    outputs:
        L - an ordered list of nodes that satisfy the dependencies of edges

    Raises ValueError when some node still has unprocessed incoming
    edges once the queue is empty, i.e. the input has cycles.

    >>> _toposort({1: {2, 3}, 2: (3, )})
    [1, 2, 3]

    Closely follows the wikipedia page [2]

    [1] Kahn, Arthur B. (1962), "Topological sorting of large networks",
    Communications of the ACM
    [2] http://en.wikipedia.org/wiki/Toposort#Algorithms
    """
    # node -> mutable set of nodes it still waits on
    pending = dict((node, set(parents))
                   for node, parents in reverse_dict(edges).items())
    # nodes with no incoming edge can be emitted right away
    ready = set(node for node in edges if node not in pending)
    ordered = []

    while ready:
        current = ready.pop()
        ordered.append(current)
        for child in edges.get(current, ()):
            waiting_on = pending[child]
            assert current in waiting_on
            waiting_on.remove(current)
            if not waiting_on:
                ready.add(child)

    if any(pending.get(node, None) for node in edges):
        raise ValueError("Input has cycles")
    return ordered
def body(mode, gpu):
    """Compile ``m * 2`` with ``mode`` and check the samples drawn.

    ``p``, ``u`` and ``m`` come from the enclosing scope.  When ``gpu``
    is true the graph must contain GpuMultinomialFromUniform.
    """
    # the m*2 allows the multinomial to reuse output
    f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)

    if gpu:
        op_types = [type(node.op) for node in f.maker.fgraph.toposort()]
        assert any(t is multinomial.GpuMultinomialFromUniform
                   for t in op_types)

    # test that both first and second samples can be drawn
    identity_draw = f([[1, 0], [0, 1]], [0.1, 0.1])
    assert numpy.allclose(identity_draw, [[2, 0], [0, 2]])

    two_rows = [[0.2, 0.8], [0.3, 0.7]]

    # test that both second labels can be drawn
    r = f(two_rows, [0.31, 0.31])
    assert numpy.allclose(r, [[0, 2], [0, 2]]), r

    # test that both first labels can be drawn
    r = f(two_rows, [0.21, 0.21])
    assert numpy.allclose(r, [[0, 2], [2, 0]]), r

    # change the size to make sure output gets reallocated ok
    # and also make sure that the GPU version doesn't screw up the
    # transposed-ness
    r = f([[0.2, 0.8]], [0.25])
    assert numpy.allclose(r, [[0, 2]]), r
def body(mode, gpu):
    """Run MultinomialFromUniform on a large input and check the result.

    Checks the output shape, the dtype expected for the current
    ``config.cast_policy`` and the values.  When ``gpu`` is true the
    graph must contain GpuMultinomialFromUniform.
    """
    p = tensor.fmatrix()
    u = tensor.fvector()
    m = multinomial.MultinomialFromUniform('auto')(p, u)
    f = function([p, u], m * 2, allow_input_downcast=True, mode=mode)

    if gpu:
        assert any(type(node.op) is multinomial.GpuMultinomialFromUniform
                   for node in f.maker.env.toposort())

    n_rows, n_cols = 10000, 4
    pval = numpy.arange(n_rows * n_cols,
                        dtype='float32').reshape((n_rows, n_cols)) + 0.1
    pval = pval / pval.sum(axis=1)[:, None]
    uval = numpy.ones_like(pval[:, 0]) * 0.5
    mval = f(pval, uval)

    assert mval.shape == pval.shape

    # The expected output dtype depends on the casting policy.
    policy = config.cast_policy
    if policy == 'custom':
        expected_dtype = pval.dtype
    elif policy == 'numpy+floatX':
        expected_dtype = config.floatX
    elif policy == 'numpy':
        expected_dtype = 'float64'
    else:
        raise NotImplementedError(config.cast_policy)
    assert mval.dtype == expected_dtype

    assert numpy.allclose(mval.sum(axis=1), 2)
    # broadcast over all rows
    expected = numpy.asarray([0, 0, 2, 0]) + 0 * pval
    assert numpy.allclose(mval, expected)
def cmp(a_shp, b_shp):
    """Check the update a <- dot(a, b) + exp(c) against numpy.

    The compiled graph must contain ``tcn.blas.gpu_gemm_inplace``.  The
    function is then run once more with ``a`` holding a reversed view in
    both dimensions; that second run only checks that the call succeeds.
    """
    a0 = my_rand(*a_shp)
    a = tcn.shared_constructor(a0, 'a')

    b = tensor.fmatrix('b')
    c = tensor.fmatrix('c')
    new_a = tensor.dot(a, b) + tensor.exp(c)
    f = pfunc([b, c], [], updates=[(a, new_a)], mode=mode_with_gpu)

    graph_ops = [node.op for node in f.maker.fgraph.toposort()]
    assert any(op == tcn.blas.gpu_gemm_inplace for op in graph_ops)

    bval = my_rand(*b_shp)
    cval = my_rand(a_shp[0], b_shp[1])
    f(bval, cval)

    expected = numpy.dot(a0, bval) + numpy.exp(cval)
    assert numpy.allclose(expected, a.get_value())

    # Try with a matrix equal to a0, but with strides in both dims
    a.set_value(a0)
    internal = a.get_value(borrow=True, return_internal_type=True)
    a.set_value(internal[::-1, ::-1], borrow=True)
    f(bval, cval)
def profile_printer(fct_name, compile_time, fct_call_time, fct_call, apply_time, apply_cimpl, message, outputs_size, other_time): # Scan overhead profile if any([isinstance(node.op, Scan) and v > 0 for (_, node), v in apply_time.items()]): print print 'Scan overhead:' print ('<Scan op time(s)> <sub scan fct time(s)> <sub scan op ' 'time(s)> <sub scan fct time(% scan op time)> <sub scan ' 'op time(% scan op time)> <node>') total_super_scan_time = 0 total_scan_fct_time = 0 total_scan_op_time = 0 for (_, node), v in apply_time.items(): if isinstance(node.op, Scan): if v > 0: scan_fct_time = node.op.mode_instance.fn_time scan_op_time = node.op.mode_instance.local_time total_super_scan_time += v total_scan_fct_time += scan_fct_time total_scan_op_time += scan_op_time print ' %5.1fs %5.1fs %5.1fs %5.1f%% %5.1f%%' % ( v, scan_fct_time, scan_op_time, scan_fct_time / v * 100, scan_op_time / v * 100), node else: print (' The node took 0s, so we can not compute the ' 'overhead'), node print ' total %5.1fs %5.1fs %5.1fs %5.1f%% %5.1f%%' % ( total_super_scan_time, total_scan_fct_time, total_scan_op_time, total_scan_fct_time / total_super_scan_time * 100, total_scan_op_time / total_super_scan_time * 100)
def test_neibs(self):
    """Round-trip images through images2neibs and neibs2images.

    Runs over every shape pair, border mode and dtype in
    ``self.dtypes``.  For the 'valid' mode the compiled graph must
    contain ``self.op``.
    """
    cases = [((100, 40, 18, 18), (2, 2)),
             ((100, 40, 6, 18), (3, 2)),
             ((10, 40, 66, 66), (33, 33)),
             ((10, 40, 68, 66), (34, 33))]

    for shape, pshape in cases:
        n_elements = numpy.prod(shape)
        for border in ['valid', 'ignore_borders']:
            for dtype in self.dtypes:
                data = numpy.arange(n_elements, dtype=dtype).reshape(shape)
                images = shared(data)
                neib_shape = T.as_tensor_variable(pshape)

                f = function([],
                             images2neibs(images, neib_shape, mode=border),
                             mode=self.mode)
                neibs = f()
                g = function([],
                             neibs2images(neibs, neib_shape, images.shape),
                             mode=self.mode)

                if border == 'valid':
                    assert any(isinstance(node.op, self.op)
                               for node in f.maker.fgraph.toposort())

                assert numpy.allclose(images.get_value(borrow=True), g())
def test_elemwise_composite_float64():
    """Check that no float64 ends up inside a fused GPU Composite.

    Test that we don't fuse composite elemwise with float64 somewhere
    inside: nvcc by default downcasts them to float32.  We would need to
    tell it not to do so, but that is possible only on some devices.
    """
    a = tensor.fmatrix()
    b = tensor.fmatrix()
    av = theano._asarray(numpy.random.rand(4, 4), dtype='float32')
    bv = numpy.ones((4, 4), dtype='float32')

    def get_all_basic_scalar(composite_op):
        """Return the apply nodes of ``composite_op``, expanding nested
        Composite ops."""
        basic_nodes = []
        for inner in composite_op.fgraph.toposort():
            # toposort() yields apply nodes (their .inputs/.outputs are
            # read below), so the Composite test must look at ``.op``;
            # testing the node itself never matched.
            if isinstance(inner.op, theano.scalar.Composite):
                basic_nodes += get_all_basic_scalar(inner.op)
            else:
                basic_nodes.append(inner)
        return basic_nodes

    modes = [mode_with_gpu,
             mode_with_gpu.excluding('gpu_after_fusion'),
             mode_with_gpu.excluding('elemwise_fusion')]
    for mode in modes:
        expr = tensor.cast(
            tensor.lt(tensor.cast(a, 'float64') ** 2, b), 'float32')
        f = pfunc([a, b], expr, mode=mode)

        out = f(av, bv)
        assert numpy.all(out == ((av ** 2) < bv))

        for node in f.maker.fgraph.toposort():
            if not isinstance(node.op, cuda.GpuElemwise):
                continue
            if not isinstance(node.op.scalar_op, theano.scalar.Composite):
                continue
            for s in get_all_basic_scalar(node.op.scalar_op):
                assert not any(i.type.dtype == 'float64'
                               for i in s.inputs + s.outputs)
def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64',
                nstreams=None):
    """
    Sample `n` (currently `n` needs to be 1) times from a multinomial
    distribution defined by probabilities pvals.

    Example : pvals = [[.98, .01, .01], [.01, .98, .01]] will
    probably result in [[1,0,0],[0,1,0]].

    .. note::
        `size` and `ndim` are only there to keep the same signature as
        other uniform, binomial, normal, etc.
        todo : adapt multinomial to take that into account

    :raises TypeError: if `pvals` is None.
    :raises ValueError: if `size` contains an int dimension <= 0.
    :raises NotImplementedError: unless n == 1 and pvals.ndim == 2.
    """
    if pvals is None:
        raise TypeError("You have to specify pvals")
    pvals = as_tensor_variable(pvals)

    if size is not None:
        if any(isinstance(i, int) and i <= 0 for i in size):
            raise ValueError(
                "The specified size contains a dimension with value <= 0",
                size)

    if not (n == 1 and pvals.ndim == 2):
        raise NotImplementedError(("MRG_RandomStreams.multinomial only"
                                   " implemented with n == 1 and pvals.ndim = 2"))

    # Only ndim and size are used; the broadcast pattern returned by the
    # helper was computed and then never read, so it is dropped here.
    ndim, size, _ = raw_random._infer_ndim_bcast(ndim, size, pvals[:, 0])
    assert ndim == 1

    unis = self.uniform(size=size, ndim=1, nstreams=nstreams)
    op = multinomial.MultinomialFromUniform(dtype)
    return op(pvals, unis)
def test_neibs_manual(self):
    """Compare images2neibs with a hand-written result for 2x2 patches.

    For every dtype in ``self.dtypes`` and both border modes the compiled
    graph must contain ``self.op``, the neighbourhoods must equal the
    table below and neibs2images must give back the input.
    """
    shape = (2, 3, 4, 4)
    expected_neibs = [
        [0, 1, 4, 5], [2, 3, 6, 7], [8, 9, 12, 13], [10, 11, 14, 15],
        [16, 17, 20, 21], [18, 19, 22, 23], [24, 25, 28, 29],
        [26, 27, 30, 31], [32, 33, 36, 37], [34, 35, 38, 39],
        [40, 41, 44, 45], [42, 43, 46, 47], [48, 49, 52, 53],
        [50, 51, 54, 55], [56, 57, 60, 61], [58, 59, 62, 63],
        [64, 65, 68, 69], [66, 67, 70, 71], [72, 73, 76, 77],
        [74, 75, 78, 79], [80, 81, 84, 85], [82, 83, 86, 87],
        [88, 89, 92, 93], [90, 91, 94, 95],
    ]

    for dtype in self.dtypes:
        data = numpy.arange(numpy.prod(shape), dtype=dtype).reshape(shape)
        images = shared(data)
        neib_shape = T.as_tensor_variable((2, 2))

        for border in ['valid', 'ignore_borders']:
            f = function([], images2neibs(images, neib_shape, mode=border),
                         mode=self.mode)
            assert any(isinstance(node.op, self.op)
                       for node in f.maker.fgraph.toposort())

            neibs = f()
            assert numpy.allclose(neibs, expected_neibs)

            g = function([], neibs2images(neibs, neib_shape, images.shape),
                         mode=self.mode)
            assert numpy.allclose(images.get_value(borrow=True), g())
def test_elemwise_composite_float64():
    """Check that no fused GPU elemwise contains a float64 scalar.

    Per the original note: nvcc downcasts float64 to float32 by default,
    and telling it not to is only possible on some devices, so composite
    elemwise ops with float64 inside must not be fused.
    """
    a = tensor.fmatrix()
    b = tensor.fmatrix()
    av = theano._asarray(numpy.random.rand(4, 4), dtype='float32')
    bv = numpy.ones((4, 4), dtype='float32')

    def get_all_basic_scalar(composite_op):
        # Flatten the content of a Composite into a single list.
        # NOTE(review): toposort() presumably yields Apply nodes, in which
        # case this isinstance test against an Op class never matches and
        # the recursion is dead code -- confirm whether `i.op` was meant.
        collected = []
        for i in composite_op.env.toposort():
            if isinstance(i, theano.scalar.Composite):
                collected.extend(get_all_basic_scalar(i))
            else:
                collected.append(i)
        return collected

    modes = [mode_with_gpu,
             mode_with_gpu.excluding('gpu_after_fusion'),
             mode_with_gpu.excluding('elemwise_fusion')]
    for mode in modes:
        squared = tensor.cast(a, 'float64') ** 2
        expr = tensor.cast(tensor.lt(squared, b), 'float32')
        f = pfunc([a, b], expr, mode=mode)

        out = f(av, bv)
        assert numpy.all(out == ((av ** 2) < bv))

        for node in f.maker.env.toposort():
            if not isinstance(node.op, cuda.GpuElemwise):
                continue
            if not isinstance(node.op.scalar_op, theano.scalar.Composite):
                continue
            for s in get_all_basic_scalar(node.op.scalar_op):
                assert not any(v.type.dtype == 'float64'
                               for v in s.inputs + s.outputs)
def cmp(a_shp, b_shp):
    """Run dot(a, b) + c with `a` also updated, and compare with numpy.

    Asserts that the compiled graph contains gpu_gemm_no_inplace, checks
    both the returned value and the updated shared variable, then calls
    the function again with `a` holding a view reversed on both axes.
    """
    a_init = my_rand(*a_shp)
    a = tcn.shared_constructor(a_init, 'a')
    c_init = my_rand(a_shp[0], b_shp[1])
    c = tcn.shared_constructor(c_init.copy(), 'c')

    b = tcn.fmatrix('b')
    b2 = tcn.fmatrix('b2')

    result_expr = tensor.dot(a, b2) + c
    new_a_expr = tensor.dot(a, b) + c
    f = pfunc([b, b2], [result_expr],
              updates=[(a, new_a_expr)],
              mode=mode_with_gpu)

    topo = f.maker.env.toposort()
    assert any(node.op == tcn.blas.gpu_gemm_no_inplace for node in topo)

    b_num = my_rand(*b_shp)
    b2_num = my_rand(*b_shp)
    returned = f(b_num, b2_num)

    assert numpy.allclose(numpy.dot(a_init, b_num) + c_init, a.get_value())
    assert numpy.allclose(numpy.dot(a_init, b2_num) + c_init, returned)

    # Try with a matrix equal to a_init, but with strides in both dims
    a.set_value(a_init)
    internal = a.get_value(borrow=True, return_internal_type=True)
    a.set_value(internal[::-1, ::-1], borrow=True)
    f(b_num, b2_num)
def list_of_nodes(inputs, outputs):
    """
    Return the apply nodes of the graph between inputs and outputs.

    The search starts from the owners of `outputs` and does not expand
    through a node that produces one of `inputs`.
    """
    def expand(apply_node):
        # Owners of this node's inputs, skipping any owner that has one
        # of `inputs` among its outputs.
        parents = []
        for inp in apply_node.inputs:
            owner = inp.owner
            if owner and not any(i in owner.outputs for i in inputs):
                parents.append(owner)
        return parents

    start = deque(o.owner for o in outputs)
    return stack_search(start, expand)
def local_gpu_multinomial(node):
    """Replace MultinomialFromUniform by GpuMultinomialFromUniform.

    Two patterns are handled, both only when the two inputs and the
    output are all float32:

    * `node` is a MultinomialFromUniform with at least one input coming
      from a HostFromGpu;
    * `node` is a GpuFromHost applied to a MultinomialFromUniform output.

    Returns a one-element replacement list, or None when nothing applies.
    """
    def all_float32(apply_node):
        probs, unis = apply_node.inputs
        sample, = apply_node.outputs
        return probs.dtype == unis.dtype == sample.dtype == 'float32'

    def gpu_version(apply_node):
        gpu_op = GpuMultinomialFromUniform(apply_node.op.odtype)
        return gpu_op(*[gpu_from_host(i) for i in apply_node.inputs])

    if type(node.op) is MultinomialFromUniform:
        if all_float32(node) and any(
                i.owner and
                isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu)
                for i in node.inputs):
            return [host_from_gpu(gpu_version(node)).T]

    source = node.inputs[0].owner
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            source and type(source.op) is MultinomialFromUniform):
        if all_float32(source):
            transposed = gpu_version(source).T
            # The dimshuffle is on the cpu, but will be moved to the gpu
            # by an opt.
            return [gpu_from_host(transposed)]
def cmp(a_shp, b_shp):
    """Run dot(a, b) + c with `a` also updated, and compare with numpy.

    Asserts that the compiled graph contains gpu_gemm_no_inplace, checks
    both the returned value and the updated shared variable, then calls
    the function again with `a` holding a view reversed on both axes.
    """
    a_init = my_rand(*a_shp)
    a = tcn.shared_constructor(a_init, 'a')
    c_init = my_rand(a_shp[0], b_shp[1])
    c = tcn.shared_constructor(c_init.copy(), 'c')

    b = tcn.fmatrix('b')
    b2 = tcn.fmatrix('b2')

    result_expr = tensor.dot(a, b2) + c
    new_a_expr = tensor.dot(a, b) + c
    f = pfunc([b, b2], [result_expr],
              updates=[(a, new_a_expr)],
              mode=mode_with_gpu)

    topo = f.maker.env.toposort()
    assert any(node.op == tcn.blas.gpu_gemm_no_inplace for node in topo)

    b_num = my_rand(*b_shp)
    b2_num = my_rand(*b_shp)
    returned = f(b_num, b2_num)

    assert numpy.allclose(numpy.dot(a_init, b_num) + c_init, a.get_value())
    assert numpy.allclose(numpy.dot(a_init, b2_num) + c_init, returned)

    # Try with a matrix equal to a_init, but with strides in both dims
    a.set_value(a_init)
    internal = a.get_value(borrow=True, return_internal_type=True)
    a.set_value(internal[::-1, ::-1], borrow=True)
    f(b_num, b2_num)
def local_opt(node):
    """Try to lift `node` to the GPU using the enclosing `maker`.

    Applies only when the op type is in `OP` and either one input comes
    from host_from_gpu or every client of the first output is a
    gpu_from_host. Returns the list of replacement variables, or False
    when the node is left untouched.
    """
    if type(node.op) not in OP:
        return False

    # Either one of our inputs is on the gpu or
    # all of our client are on the gpu
    worth_moving = any(i.owner and i.owner.op == host_from_gpu
                       for i in node.inputs)
    if not worth_moving:
        worth_moving = all(c != 'output' and c.op == gpu_from_host
                           for c, idx in node.outputs[0].clients)
    if not worth_moving:
        return False

    new_op = maker(node)
    # This is needed as sometimes new_op inherit from OP.
    if not (new_op and new_op != node.op):
        return False

    if isinstance(new_op, theano.Op):
        gpu_outputs = new_op(*node.inputs, return_list=True)
        return [safe_to_cpu(o) for o in gpu_outputs]
    if isinstance(new_op, (tuple, list)):
        return [safe_to_cpu(o) for o in new_op]
    # suppose it is a variable on the GPU
    return [host_from_gpu(new_op)]
def cmp(a_shp, b_shp):
    """Update `a` with dot(a, b) + exp(c) and compare with numpy.

    Asserts that the compiled graph contains gpu_gemm_inplace, checks the
    updated shared variable, then calls the function again with `a`
    holding a view reversed on both axes.
    """
    a_init = my_rand(*a_shp)
    a = tcn.shared_constructor(a_init, 'a')

    b = tensor.fmatrix('b')
    c = tensor.fmatrix('c')

    new_a_expr = tensor.dot(a, b) + tensor.exp(c)
    f = pfunc([b, c], [], updates=[(a, new_a_expr)], mode=mode_with_gpu)

    topo = f.maker.fgraph.toposort()
    assert any(node.op == tcn.blas.gpu_gemm_inplace for node in topo)

    b_num = my_rand(*b_shp)
    c_num = my_rand(a_shp[0], b_shp[1])
    f(b_num, c_num)

    reference = numpy.dot(a_init, b_num) + numpy.exp(c_num)
    assert numpy.allclose(reference, a.get_value())

    # Try with a matrix equal to a_init, but with strides in both dims
    a.set_value(a_init)
    internal = a.get_value(borrow=True, return_internal_type=True)
    a.set_value(internal[::-1, ::-1], borrow=True)
    f(b_num, c_num)
def test_logical_shapes(self):
    """Compile and run a 'full' conv2d with logical shapes for strides 1-3.

    The compiled graph must contain a GpuConv node; the function is then
    called once on random float32 data (no numerical check is made).
    """
    seed_rng()
    kshp = (10, 2, 10, 10)
    featshp = (3, 10, 11, 11)
    # Kernel shape once its first two dimensions are swapped.
    kshp_rotated = (kshp[1], kshp[0], kshp[2], kshp[3])

    for stride in range(1, 4):
        a = tensor.ftensor4()
        A = tensor.ftensor4()

        # Need to transpose first two dimensions of kernel, and reverse
        # index kernel image dims (for correlation)
        kernel_rotated = tensor.transpose(A, axes=[1, 0, 2, 3])

        featshp_logical = (featshp[0], featshp[1],
                           featshp[2] * stride, featshp[3] * stride)

        image_estimate = tensor.nnet.conv2d(
            a, kernel_rotated,
            border_mode='full',
            image_shape=featshp,
            filter_shape=kshp_rotated,
            imshp_logical=featshp_logical[1:],
            kshp_logical=kshp[2:])

        func = theano.function([a, A], image_estimate, mode=theano_mode)
        topo = func.maker.fgraph.toposort()
        assert any(isinstance(node.op, theano.sandbox.cuda.blas.GpuConv)
                   for node in topo)

        a_in = numpy.random.randn(*featshp).astype("float32")
        A_in = numpy.random.randn(*kshp).astype("float32")
        func(a_in, A_in)
def test_neibs(self):
    """Round-trip images -> neibs -> images for several shapes.

    Runs for each (image shape, neighbourhood shape) pair, both border
    modes and every dtype in `self.dtypes`. In 'valid' mode the compiled
    graph must also contain `self.op`.
    """
    cases = [((10, 7, 18, 18), (2, 2)),
             ((10, 7, 6, 18), (3, 2)),
             ((5, 7, 66, 66), (33, 33)),
             ((5, 7, 68, 66), (34, 33))]

    for shape, pshape in cases:
        for border in ['valid', 'ignore_borders']:
            for dtype in self.dtypes:
                data = numpy.arange(numpy.prod(shape),
                                    dtype=dtype).reshape(shape)
                images = shared(data)
                neib_shape = T.as_tensor_variable(pshape)

                f = function([],
                             images2neibs(images, neib_shape, mode=border),
                             mode=self.mode)
                neibs = f()

                g = function([],
                             neibs2images(neibs, neib_shape, images.shape),
                             mode=self.mode)

                if border == 'valid':
                    topo = f.maker.fgraph.toposort()
                    assert any(isinstance(node.op, self.op)
                               for node in topo)

                assert numpy.allclose(images.get_value(borrow=True), g())
def local_gpua_subtensor(node):
    """Choose how to lift a subtensor node to the GPU.

    Special case: the indexed variable is HostFromGpu(GpuFromHost(v))
    where `v` has no owner (a shared variable or a graph input) and the
    host variable has exactly one client. Then:

    * if a client of the output is 'output' or has a GpuArrayType input
      or output, return None;
    * otherwise return the output wrapped in
      host_from_gpu(gpu_from_host(...)).

    In every other case return a GpuSubtensor with the same idx_list.
    """
    x = node.inputs[0]
    x_owner = x.owner
    if x_owner and isinstance(x_owner.op, HostFromGpu):
        gpu_owner = x_owner.inputs[0].owner
        # And it is a shared var or an input of the graph.
        direct_transfer = (gpu_owner and
                           isinstance(gpu_owner.op, GpuFromHost) and
                           not gpu_owner.inputs[0].owner)
        if direct_transfer and len(x.clients) == 1:

            def touches_gpu(client):
                if client == 'output':
                    return True
                return any(isinstance(v.type, GpuArrayType)
                           for v in client.inputs + client.outputs)

            if any(touches_gpu(n) for n, _ in node.outputs[0].clients):
                return
            return [host_from_gpu(gpu_from_host(node.outputs[0]))]

    return GpuSubtensor(node.op.idx_list)
def create(self, input_storage=None, trustme=False):
    """Build the function, then attach per-function profiling to it.

    After delegating to the parent `create`, this:

    * refuses to continue (raises Exception) when CUDA is enabled and the
      CUDA_LAUNCH_BLOCKING environment variable is not '1';
    * registers a fresh ProfileStats in `self.mode.profile_stats` and on
      the returned function as `.profile`;
    * replaces `ret.fn` with a wrapper that exposes this function's stats
      on the mode while the original fn runs;
    * imports `run_cthunk` lazily if any node has a C implementation.
    """
    global run_cthunk

    ret = super(Profile_Maker, self).create(input_storage, trustme)

    cuda_enabled = (hasattr(theano, 'sandbox') and
                    hasattr(theano.sandbox, 'cuda') and
                    theano.sandbox.cuda.cuda_enabled)
    if cuda_enabled and os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
        raise Exception(
            "You are running the Theano profiler with CUDA enabled."
            " Theano GPU ops execution is asynchronous by default."
            " So by default, the profile is useless."
            " You must set the environment variable"
            " CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA driver to"
            " synchronize the execution to get a meaningful profile.")

    # create a function-specific storage container for profiling info
    profile = ProfileStats(atexit_print=False)
    self.mode.profile_stats[ret] = profile
    ret.profile = profile

    # initialize the timers
    for i, node in enumerate(ret.maker.fgraph.toposort()):
        profile.apply_time[node] = 0.0

        # a thunk_group is a list of the thunks from each linker
        # corresponding to the i'th position in the toposort.
        thunks = ret.fn.thunk_groups[i]
        assert len(thunks) == 1
        profile.apply_cimpl[node] = hasattr(thunks[0], 'cthunk')

    # Here we replace the linker function.
    # This ugliness makes WrapLinker (an object that *generates*
    # functions and is not function-specific) work with ProfileStats
    # objects which are function-specific.

    # Capture the old fn in the closure. This is important since new_fn
    # is about to take its place as ret.fn.
    ret_fn = ret.fn

    def new_fn():
        mode = self.mode
        stats = mode.profile_stats[ret]
        mode.apply_time = stats.apply_time
        mode.variable_shape = stats.variable_shape
        ret_fn()
        # delete the old apply_time variable
        # because it doesn't mean the same thing anymore.
        # This prevents old code from looking like it still works.
        del mode.apply_time
        del mode.variable_shape

    ret.fn = new_fn

    if run_cthunk is None and any(profile.apply_cimpl.values()):
        # Lazy import to avoid compilation when importing theano.
        from theano.gof.cutils import run_cthunk

    return ret
def compile_args():
    """
    Build the list of nvcc flags handed to compile_str() as preargs.

    As stated in the original docstring, these flags are also part of
    the "hard" part of the module key.

    The list is made of the user flags from `config.nvcc.flags`,
    optionally `-use_fast_math`, a define carrying the hash of
    cuda_ndarray.cuh, NumPy API compatibility defines and, when the user
    gave no `-arch=sm_` flag, one derived from the device properties.
    """
    flags = [flag for flag in config.nvcc.flags.split(" ") if flag]
    if config.nvcc.fastmath:
        flags.append("-use_fast_math")

    here = os.path.dirname(__file__)
    cuda_ndarray_cuh_hash = hash_from_file(
        os.path.join(here, "cuda_ndarray.cuh"))
    flags.append("-DCUDA_NDARRAY_CUH=" + cuda_ndarray_cuh_hash)

    # NumPy 1.7 Deprecate the old API. I updated most of the places
    # to use the new API, but not everywhere. When finished, enable
    # the following macro to assert that we don't bring new code
    # that use the old API.
    flags.append("-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION")

    # numpy 1.7 deprecated the following macros, but the new names did
    # not exist in older versions.
    numpy_ver = [int(part) for part in numpy.__version__.split(".")[:2]]
    if numpy_ver < [1, 7]:
        flags.extend([
            "-D NPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY",
            "-D NPY_ARRAY_ALIGNED=NPY_ALIGNED",
            "-D NPY_ARRAY_WRITEABLE=NPY_WRITEABLE",
            "-D NPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL",
            "-D NPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS",
            "-D NPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS",
        ])

    # If the user didn't specify architecture flags add them
    user_gave_arch = any("-arch=sm_" in f for f in flags)
    if not user_gave_arch:
        # We compile cuda_ndarray.cu during import.
        # We should not add device properties at that time.
        # As the device is not selected yet!
        # TODO: re-compile cuda_ndarray when we bind to a GPU?
        import theano.sandbox.cuda
        if hasattr(theano.sandbox, "cuda"):
            cuda = theano.sandbox.cuda
            device = cuda.use.device_number
            if device is None:
                _logger.warn(
                    "We try to get compilation arguments for CUDA"
                    " code, but the GPU device is not initialized."
                    " This is probably caused by an Op that work on"
                    " the GPU that don't inherit from GpuOp."
                    " We Initialize the GPU now.")
                cuda.use(
                    "gpu",
                    force=True,
                    default_to_move_computation_to_gpu=False,
                    move_shared_float32_to_gpu=False,
                    enable_cuda=False)
                device = cuda.use.device_number
            props = cuda.device_properties(device)
            flags.append(
                "-arch=sm_" + str(props["major"]) + str(props["minor"]))

    return flags
def test_many_arg_elemwise():
    """Check that + and * elemwise ops accept very many arguments on gpu.

    Per the original docstring this exercises the optimization
    theano/sandbox/cuda/opt.py:local_gpu_huge_add_or_mul.
    Each op is compiled with 25 float32 arguments of 2 to 5 dimensions in
    GPU and CPU modes, through both local_gpu_elemwise_0 and
    local_gpu_elemwise_1, and all results are compared.
    """
    rng = numpy.random.RandomState([1, 2, 3])

    def uses_gpu_elemwise(fn):
        return any(isinstance(node.op, cuda.GpuElemwise)
                   for node in fn.maker.env.nodes)

    for num_args in [25]:
        for op_to_test in [theano.tensor.add, theano.tensor.mul]:
            for nb_dim in [2, 3, 4, 5]:
                shapes = [rng.randint(1, 5) for _ in range(nb_dim)]
                args = [numpy.cast['float32'](rng.randn(*shapes))
                        for _ in xrange(0, num_args)]
                arg_type = theano.tensor.TensorType
                symb_args = [arg_type('float32', (False,) * nb_dim)()
                             for _ in xrange(0, num_args)]

                outputs = []
                for mode in [mode_with_gpu, mode_without_gpu]:
                    must_be_on_gpu = mode is mode_with_gpu

                    # test the optimization local_gpu_elemwise_0
                    f = theano.function(
                        symb_args, op_to_test(*symb_args),
                        mode=mode.excluding("local_gpu_elemwise_1"))
                    outputs.append(f(*args))
                    # assert that the test was done on the gpu.
                    if must_be_on_gpu:
                        assert uses_gpu_elemwise(f)

                    # test the optimization local_gpu_elemwise_1
                    f = theano.function(
                        symb_args,
                        cuda.gpu_from_host(op_to_test(*symb_args)),
                        mode=mode.excluding("local_gpu_elemwise_0"))
                    out = f(*args)
                    # assert that the test was done on the gpu.
                    if must_be_on_gpu:
                        assert uses_gpu_elemwise(f)
                    assert numpy.allclose(out, outputs[-1])

                results_gpu, results_cpu = outputs
                assert numpy.allclose(results_gpu, results_cpu)
def test_many_arg_elemwise():
    """Check that + and * elemwise ops accept very many arguments on gpu.

    Per the original docstring this exercises the optimization
    theano/sandbox/cuda/opt.py:local_gpu_huge_add_or_mul.
    Each op is compiled with 25 float32 arguments of 2 to 5 dimensions in
    GPU and CPU modes, through both local_gpu_elemwise_0 and
    local_gpu_elemwise_1, and all results are compared.
    """
    rng = numpy.random.RandomState([1, 2, 3])

    def uses_gpu_elemwise(fn):
        return any(isinstance(node.op, cuda.GpuElemwise)
                   for node in fn.maker.fgraph.apply_nodes)

    for num_args in [25]:
        for op_to_test in [theano.tensor.add, theano.tensor.mul]:
            for nb_dim in [2, 3, 4, 5]:
                shapes = [rng.randint(1, 5) for _ in range(nb_dim)]
                args = [numpy.cast['float32'](rng.randn(*shapes))
                        for _ in xrange(0, num_args)]
                arg_type = theano.tensor.TensorType
                symb_args = [arg_type('float32', (False,) * nb_dim)()
                             for _ in xrange(0, num_args)]

                outputs = []
                for mode in [mode_with_gpu, mode_without_gpu]:
                    must_be_on_gpu = mode is mode_with_gpu

                    # test the optimization local_gpu_elemwise_0
                    f = theano.function(
                        symb_args, op_to_test(*symb_args),
                        mode=mode.excluding("local_gpu_elemwise_1"))
                    outputs.append(f(*args))
                    # assert that the test was done on the gpu.
                    if must_be_on_gpu:
                        assert uses_gpu_elemwise(f)

                    # test the optimization local_gpu_elemwise_1
                    f = theano.function(
                        symb_args,
                        cuda.gpu_from_host(op_to_test(*symb_args)),
                        mode=mode.excluding("local_gpu_elemwise_0"))
                    out = f(*args)
                    # assert that the test was done on the gpu.
                    if must_be_on_gpu:
                        assert uses_gpu_elemwise(f)
                    assert numpy.allclose(out, outputs[-1])

                results_gpu, results_cpu = outputs
                assert numpy.allclose(results_gpu, results_cpu)
def compile_args():
    """
    Build the list of nvcc flags handed to compile_str() as preargs.

    As stated in the original docstring, these flags are also part of
    the "hard" part of the module key.

    The list is made of the user flags from `config.nvcc.flags`,
    optionally `-use_fast_math`, a define carrying the hash of
    cuda_ndarray.cuh, NumPy API compatibility defines and, when the user
    gave no `-arch=sm_` flag, one derived from the device properties.
    """
    flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
    if config.nvcc.fastmath:
        flags.append('-use_fast_math')

    here = os.path.dirname(__file__)
    cuda_ndarray_cuh_hash = hash_from_file(
        os.path.join(here, 'cuda_ndarray.cuh'))
    flags.append('-DCUDA_NDARRAY_CUH=' + cuda_ndarray_cuh_hash)

    # NumPy 1.7 Deprecate the old API. I updated most of the places
    # to use the new API, but not everywhere. When finished, enable
    # the following macro to assert that we don't bring new code
    # that use the old API.
    flags.append("-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION")

    # numpy 1.7 deprecated the following macros, but the new names did
    # not exist in older versions.
    numpy_ver = [int(part) for part in numpy.__version__.split('.')[:2]]
    if numpy_ver < [1, 7]:
        flags.extend([
            "-D NPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY",
            "-D NPY_ARRAY_ALIGNED=NPY_ALIGNED",
            "-D NPY_ARRAY_WRITEABLE=NPY_WRITEABLE",
            "-D NPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL",
            "-D NPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS",
            "-D NPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS",
        ])

    # If the user didn't specify architecture flags add them
    user_gave_arch = any('-arch=sm_' in f for f in flags)
    if not user_gave_arch:
        # We compile cuda_ndarray.cu during import.
        # We should not add device properties at that time.
        # As the device is not selected yet!
        # TODO: re-compile cuda_ndarray when we bind to a GPU?
        import theano.sandbox.cuda
        if hasattr(theano.sandbox, 'cuda'):
            cuda = theano.sandbox.cuda
            device = cuda.use.device_number
            if device is None:
                _logger.warn(
                    "We try to get compilation arguments for CUDA"
                    " code, but the GPU device is not initialized."
                    " This is probably caused by an Op that work on"
                    " the GPU that don't inherit from GpuOp."
                    " We Initialize the GPU now.")
                cuda.use(
                    "gpu",
                    force=True,
                    default_to_move_computation_to_gpu=False,
                    move_shared_float32_to_gpu=False,
                    enable_cuda=False)
                device = cuda.use.device_number
            props = cuda.device_properties(device)
            flags.append(
                '-arch=sm_' + str(props['major']) + str(props['minor']))

    return flags
def create(self, input_storage=None, trustme=False):
    """Build the function, then attach per-function profiling to it.

    After delegating to the parent `create`, this:

    * refuses to continue (raises Exception) when CUDA is enabled and the
      CUDA_LAUNCH_BLOCKING environment variable is not '1';
    * registers a fresh ProfileStats in `self.mode.profile_stats` and on
      the returned function as `.profile`;
    * replaces `ret.fn` with a wrapper that exposes this function's stats
      on the mode while the original fn runs;
    * imports `run_cthunk` lazily if any node has a C implementation.
    """
    global run_cthunk

    ret = super(Profile_Maker, self).create(input_storage, trustme)

    cuda_enabled = (hasattr(theano, 'sandbox') and
                    hasattr(theano.sandbox, 'cuda') and
                    theano.sandbox.cuda.cuda_enabled)
    if cuda_enabled and os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
        raise Exception(
            "You are running the Theano profiler with CUDA enabled."
            " Theano GPU ops execution is asynchronous by default."
            " So by default, the profile is useless."
            " You must set the environment variable"
            " CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA driver to"
            " synchronize the execution to get a meaningful profile.")

    # create a function-specific storage container for profiling info
    profile = ProfileStats(atexit_print=False)
    self.mode.profile_stats[ret] = profile
    ret.profile = profile

    # initialize the timers
    for i, node in enumerate(ret.maker.fgraph.toposort()):
        profile.apply_time[node] = 0.0

        # a thunk_group is a list of the thunks from each linker
        # corresponding to the i'th position in the toposort.
        thunks = ret.fn.thunk_groups[i]
        assert len(thunks) == 1
        profile.apply_cimpl[node] = hasattr(thunks[0], 'cthunk')

    # Here we replace the linker function.
    # This ugliness makes WrapLinker (an object that *generates*
    # functions and is not function-specific) work with ProfileStats
    # objects which are function-specific.

    # Capture the old fn in the closure. This is important since new_fn
    # is about to take its place as ret.fn.
    ret_fn = ret.fn

    def new_fn():
        mode = self.mode
        stats = mode.profile_stats[ret]
        mode.apply_time = stats.apply_time
        mode.variable_shape = stats.variable_shape
        ret_fn()
        # delete the old apply_time variable
        # because it doesn't mean the same thing anymore.
        # This prevents old code from looking like it still works.
        del mode.apply_time
        del mode.variable_shape

    ret.fn = new_fn

    if run_cthunk is None and any(profile.apply_cimpl.values()):
        # Lazy import to avoid compilation when importing theano.
        from theano.gof.cutils import run_cthunk

    return ret
def filter_nvcc_flags(s):
    """Validate and normalise the nvcc.flags string.

    The string is split on single spaces and empty pieces are dropped.
    Every remaining piece must start with '-'; otherwise ValueError is
    raised (this catches '--machine 64' style pairs, which must be
    written '--machine=64'). Returns the pieces joined by one space.
    """
    assert isinstance(s, str)
    pieces = [piece for piece in s.split(' ') if piece]
    for piece in pieces:
        if not piece.startswith("-"):
            raise ValueError(
                "Theano nvcc.flags support only parameter/value pairs without"
                " space between them. e.g.: '--machine 64' is not supported,"
                " but '--machine=64' is supported. Please add the '=' symbol."
                " nvcc.flags value is '%s'" % s)
    return ' '.join(pieces)
def test_reshape():
    """Check that reshape is done by GpuReshape and gives correct values.

    Covers a plain reshape, a reshape followed by a subtraction (also
    checking the input is left unmodified) with CudaNdarray inputs and
    then with arrays built by theano._asarray, and finally the gradient.
    """
    a = tcn.CudaNdarrayType((False,))()
    b = tcn.CudaNdarrayType((False, False))()
    c = T.reshape(a, [2, 3])

    def flat_values():
        return theano._asarray([0, 1, 2, 3, 4, 5], dtype='float32')

    def grid_values():
        return theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float32')

    def has_gpu_reshape(fn):
        topo = fn.maker.fgraph.toposort()
        return any(isinstance(node.op, B.GpuReshape) for node in topo)

    # basic
    f = theano.function([a], c, mode=mode_with_gpu)
    fv = f(cuda_ndarray.CudaNdarray(flat_values()))
    assert has_gpu_reshape(f)
    assert numpy.all(fv == numpy.asarray([[0, 1, 2], [3, 4, 5]]))

    # test that it works without inplace operations
    a_val = cuda_ndarray.CudaNdarray(flat_values())
    a_val_copy = cuda_ndarray.CudaNdarray(flat_values())
    b_val = cuda_ndarray.CudaNdarray(grid_values())

    f_sub = theano.function([a, b], c - b, mode=mode_with_gpu)
    assert has_gpu_reshape(f_sub)
    assert numpy.all(f_sub(a_val, b_val) == 0.0)
    assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy))

    # test that it works with inplace operations
    a_val = flat_values()
    a_val_copy = flat_values()
    b_val = grid_values()

    f_sub = theano.function([a, b], c - b, mode=mode_with_gpu)
    assert has_gpu_reshape(f_sub)
    assert numpy.all(f_sub(a_val, b_val) == 0.0)
    assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy))

    # verify gradient
    def just_vals(v):
        return T.Reshape(2)(v, theano._asarray([2, 3], dtype='int32'))

    utt.verify_grad(just_vals, [a_val])
def local_gpua_subtensor(node):
    """Choose how to lift a subtensor node to the GPU.

    Special case: the indexed variable is HostFromGpu(GpuFromHost(v))
    where `v` has no owner (a shared variable or a graph input) and the
    host variable has exactly one client. Then:

    * if a client of the output is 'output' or has a GpuArrayType input
      or output, return None;
    * otherwise return the output wrapped in
      host_from_gpu(gpu_from_host(...)).

    In every other case return a GpuSubtensor with the same idx_list.
    """
    x = node.inputs[0]
    x_owner = x.owner
    if x_owner and isinstance(x_owner.op, HostFromGpu):
        gpu_owner = x_owner.inputs[0].owner
        # And it is a shared var or an input of the graph.
        direct_transfer = (gpu_owner and
                           isinstance(gpu_owner.op, GpuFromHost) and
                           not gpu_owner.inputs[0].owner)
        if direct_transfer and len(x.clients) == 1:

            def touches_gpu(client):
                if client == 'output':
                    return True
                return any(isinstance(v.type, GpuArrayType)
                           for v in client.inputs + client.outputs)

            if any(touches_gpu(n) for n, _ in node.outputs[0].clients):
                return
            return [host_from_gpu(gpu_from_host(node.outputs[0]))]

    return GpuSubtensor(node.op.idx_list)
def test_reshape():
    """Check that reshape is done by GpuReshape and gives correct values.

    Covers a plain reshape, a reshape followed by a subtraction (also
    checking the input is left unmodified) with CudaNdarray inputs and
    then with arrays built by theano._asarray, and finally the gradient.
    """
    a = tcn.CudaNdarrayType((False,))()
    b = tcn.CudaNdarrayType((False, False))()
    c = T.reshape(a, [2, 3])

    def flat_values():
        return theano._asarray([0, 1, 2, 3, 4, 5], dtype='float32')

    def grid_values():
        return theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float32')

    def has_gpu_reshape(fn):
        topo = fn.maker.env.toposort()
        return any(isinstance(node.op, B.GpuReshape) for node in topo)

    # basic
    f = theano.function([a], c, mode=mode_with_gpu)
    fv = f(cuda_ndarray.CudaNdarray(flat_values()))
    assert has_gpu_reshape(f)
    assert numpy.all(fv == numpy.asarray([[0, 1, 2], [3, 4, 5]]))

    # test that it works without inplace operations
    a_val = cuda_ndarray.CudaNdarray(flat_values())
    a_val_copy = cuda_ndarray.CudaNdarray(flat_values())
    b_val = cuda_ndarray.CudaNdarray(grid_values())

    f_sub = theano.function([a, b], c - b, mode=mode_with_gpu)
    assert has_gpu_reshape(f_sub)
    assert numpy.all(f_sub(a_val, b_val) == 0.0)
    assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy))

    # test that it works with inplace operations
    a_val = flat_values()
    a_val_copy = flat_values()
    b_val = grid_values()

    f_sub = theano.function([a, b], c - b, mode=mode_with_gpu)
    assert has_gpu_reshape(f_sub)
    assert numpy.all(f_sub(a_val, b_val) == 0.0)
    assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy))

    # verify gradient
    def just_vals(v):
        return T.Reshape(2)(v, theano._asarray([2, 3], dtype='int32'))

    utt.verify_grad(just_vals, [a_val])
def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64',
                nstreams=None):
    """
    Draw `n` samples (only `n == 1` is implemented) from the multinomial
    distributions whose probabilities are given by `pvals`.

    Example : pvals = [[.98, .01, .01], [.01, .98, .01]] will
    probably result in [[1,0,0],[0,1,0]].

    .. note::
        - `size` and `ndim` are only there to keep the same signature as
          other uniform, binomial, normal, etc.; passing either one
          raises ValueError.
          todo : adapt multinomial to take that into account

        - Does not do any value checking on pvals, i.e. there is no
          check that the elements are non-negative, less than 1, or
          sum to 1. passing pvals = [[-2., 2.]] will result in
          sampling [[0, 0]]

    Raises TypeError when `pvals` is missing and NotImplementedError
    unless `n == 1` and `pvals` has two dimensions.
    """
    if pvals is None:
        raise TypeError("You have to specify pvals")
    pvals = as_tensor_variable(pvals)

    if size is not None:
        # Only plain Python ints can be validated here.
        for dim in size:
            if isinstance(dim, int) and dim <= 0:
                raise ValueError(
                    "The specified size contains a dimension with value <= 0",
                    size)

    if not (n == 1 and pvals.ndim == 2):
        raise NotImplementedError(
            ("MRG_RandomStreams.multinomial only"
             " implemented with n == 1 and pvals.ndim = 2"))

    if size is not None:
        raise ValueError(
            "Provided a size argument to "
            "MRG_RandomStreams.multinomial, which does not use "
            "the size argument.")
    if ndim is not None:
        raise ValueError(
            "Provided an ndim argument to "
            "MRG_RandomStreams.multinomial, which does not use "
            "the ndim argument.")

    ndim, size, bcast = raw_random._infer_ndim_bcast(
        ndim, size, pvals[:, 0])
    assert ndim == 1
    # NOTE: `bcast` is extended here but is not read again in this method.
    bcast = bcast + (pvals.type.broadcastable[-1],)
    uniform_draws = self.uniform(size=size, ndim=1, nstreams=nstreams)
    sampler = multinomial.MultinomialFromUniform(dtype)
    return sampler(pvals, uniform_draws)
def test__toposort():
    """Check the ordering returned by _toposort on a small graph.

    For every element `a` of the result, no element `b` located at the
    same position or later may list `a` in `edges[b]`.
    """
    edges = {
        1: set((4, 6, 7)),
        2: set((4, 6, 7)),
        3: set((5, 7)),
        4: set((6, 7)),
        5: set((7,)),
    }
    order = _toposort(edges)
    for position, a in enumerate(order):
        for b in order[position:]:
            assert a not in edges.get(b, ())
def _compile_and_check(self, inputs, outputs, numeric_inputs, cls,
                       excluding=None):
    """This tests the infer_shape method only.

    Compiles one function for `outputs` and one for their shapes, using
    `self.mode` minus the optimizations named in `excluding`. An op of
    class `cls` must be present in the first graph and absent from the
    second. Both functions are then run on `numeric_inputs` and each
    computed shape is compared with the actual output shape.
    """
    mode = self.mode.excluding(*excluding) if excluding else self.mode

    def contains_cls(fn):
        return any(isinstance(t.op, cls)
                   for t in fn.maker.fgraph.toposort())

    outputs_function = theano.function(inputs, outputs, mode=mode)
    shapes_function = theano.function(
        inputs, [o.shape for o in outputs], mode=mode)

    # Check that the Op is removed from the compiled function.
    assert not contains_cls(shapes_function)
    assert contains_cls(outputs_function)

    # Check that the shape produced agrees with the actual shape.
    numeric_outputs = outputs_function(*numeric_inputs)
    numeric_shapes = shapes_function(*numeric_inputs)
    for value, inferred in zip(numeric_outputs, numeric_shapes):
        assert numpy.all(value.shape == inferred)
def test_nvidia_driver3():
    """Test that the gpu device is initialized by theano when
    we build a function with a gpu op.

    The driver should always be tested during theano initialization
    of the gpu device.
    """
    var = cuda.fvector()
    f = theano.function([var], var + 1, mode=mode_with_gpu, profile=False)
    gpu_nodes = [node for node in f.maker.fgraph.toposort()
                 if isinstance(node.op, cuda.GpuElemwise)]
    assert gpu_nodes
    assert theano.sandbox.cuda.use.device_number is not None
def get_flags(*types):
    """Build the dict of flags describing the dtypes found in `types`.

    Each argument may be a dtype name (string), a Type or a Variable.
    The result always contains ``cluda=True`` and additionally:

    * ``have_double``  if any dtype is float64,
    * ``have_small``   if any dtype has an itemsize below 4 bytes,
    * ``have_complex`` if any dtype is of complex kind,
    * ``have_half``    if any dtype is float16.

    Raises TypeError for an argument of any other kind.
    """
    # `unicode` only exists on Python 2; fall back to `str` alone so the
    # function also works on Python 3.
    try:
        string_types = (str, unicode)
    except NameError:
        string_types = (str,)

    def get_dtype(t):
        if isinstance(t, string_types):
            return numpy.dtype(t)
        elif isinstance(t, Type):
            # NOTE(review): the checks below use `.itemsize` and `.kind`,
            # so this assumes `Type.dtype` is a numpy.dtype -- confirm.
            return t.dtype
        elif isinstance(t, Variable):
            return t.type.dtype
        else:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise TypeError("can't get a dtype from %s" % (type(t),))

    dtypes = [get_dtype(t) for t in types]
    flags = dict(cluda=True)
    if any(d == numpy.float64 for d in dtypes):
        flags['have_double'] = True
    if any(d.itemsize < 4 for d in dtypes):
        flags['have_small'] = True
    if any(d.kind == 'c' for d in dtypes):
        flags['have_complex'] = True
    if any(d == numpy.float16 for d in dtypes):
        flags['have_half'] = True
    return flags
def test_pooling_opt():
    """Check that max pooling and its gradient are lifted to cuDNN ops.

    Skipped when cuDNN is not available. The forward graph must contain
    a GpuDnnPool node and the gradient graph a GpuDnnPoolGrad node.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)

    def has_op(fn, op_class):
        return any(isinstance(n.op, op_class)
                   for n in fn.maker.fgraph.toposort())

    x = T.ftensor4()

    forward = max_pool_2d(x, ds=(2, 2), ignore_border=True)
    f = theano.function([x], forward, mode=mode_with_gpu)
    assert has_op(f, cuda.dnn.GpuDnnPool)

    cost = max_pool_2d(x, ds=(2, 2), ignore_border=True).sum()
    f = theano.function([x], T.grad(cost, x),
                        mode=mode_with_gpu.including("cudnn"))
    assert has_op(f, cuda.dnn.GpuDnnPoolGrad)