def test_setitem_matrix_tensor3():
    a = numpy.arange(27)
    a.resize((3, 3, 3))
    a = theano._asarray(a, dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)

    b = theano._asarray([7, 8, 9], dtype='float32')
    _b = cuda_ndarray.CudaNdarray(b)

    # set middle row through cube to 7, 8, 9
    _a[:, 1, 1] = _b
    a[:, 1, 1] = b
    assert numpy.allclose(a, numpy.asarray(_a))

    # test direct transfer from numpy: assigning a numpy array is expected
    # to raise NotImplementedError
    try:
        _a[:, 1, 1] = b * 100
        a[:, 1, 1] = b * 100
        raise Exception("CudaNdarray.__setitem__ should have returned an error")
    except NotImplementedError:
        pass

    row = theano._asarray([777, 888, 999], dtype='float32')
    _a[1, 1, :] = row
    a[1, 1, :] = row
    assert numpy.allclose(a, numpy.asarray(_a))
def __init__(self, input, n_in, n_out, activation, rng=RandomState(1234),
             layer_name="HiddenLayer", W=None, b=None, borrow=True):
    if W is not None:
        self.W = shared(value=W, borrow=borrow, name=layer_name+'_W')
    elif activation in (relu, softplus):
        W_val = _asarray(rng.normal(loc=0, scale=0.01, size=(n_in, n_out)),
                         dtype=floatX)
        self.W = shared(W_val, name=layer_name+"_W", borrow=borrow)
    else:
        # uniformly sampled W (Glorot/Xavier range)
        low = -sqrt(6. / (n_in + n_out))
        high = sqrt(6. / (n_in + n_out))
        values = rng.uniform(low=low, high=high, size=(n_in, n_out))
        W_val = _asarray(values, dtype=floatX)
        if activation == sigmoid:
            W_val *= 4
        self.W = shared(value=W_val, borrow=borrow, name=layer_name+'_W')

    if b is not None:
        self.b = shared(b, name=layer_name+"_b", borrow=borrow)
    elif activation in (relu, softplus):
        b_val = ones((n_out,), dtype=floatX)
        self.b = shared(value=b_val, borrow=True)
    else:
        # Initialize b with zeros
        self.b = shared(value=zeros((n_out,), dtype=floatX), borrow=True)

    # Parameters of the model
    self.params = [self.W, self.b]

    # Output of the hidden layer
    self.output = activation(T.dot(input, self.W) + self.b)
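# Added example (hedged): a minimal numpy-only sketch of the weight-initialization
# rule used in the HiddenLayer constructor above -- uniform samples in
# +/- sqrt(6 / (n_in + n_out)), scaled by 4 for sigmoid units, and a small normal
# init for relu/softplus. The helper name init_hidden_W is illustrative, not part
# of the original code.
import numpy

def init_hidden_W(rng, n_in, n_out, activation='tanh', dtype='float32'):
    if activation in ('relu', 'softplus'):
        W = rng.normal(loc=0.0, scale=0.01, size=(n_in, n_out))
    else:
        bound = numpy.sqrt(6.0 / (n_in + n_out))
        W = rng.uniform(low=-bound, high=bound, size=(n_in, n_out))
        if activation == 'sigmoid':
            W *= 4  # keep sigmoid units away from their saturated region
    return W.astype(dtype)

_rng = numpy.random.RandomState(1234)
assert init_hidden_W(_rng, 784, 500, activation='sigmoid').shape == (784, 500)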
def subtest(shape_1, shape_2, rng): #print >> sys.stdout, "INFO: shapes", shape_1, shape_2 a = theano._asarray(rng.randn(*shape_1), dtype='float32') b = cuda_ndarray.CudaNdarray(a) aa = a.reshape(shape_2) bb = b.reshape(shape_2) n_bb = numpy.asarray(bb) # print n_bb assert numpy.all(aa == n_bb) assert aa.shape == n_bb.shape # Test the not contiguous case shape_1_2x = (shape_1[0] * 2,) + shape_1[1:] a = theano._asarray(rng.randn(*shape_1_2x), dtype='float32') b = cuda_ndarray.CudaNdarray(a) a = a[::2] b = b[::2] aa = a.reshape(shape_2) bb = b.reshape(shape_2) n_bb = numpy.asarray(bb) # print n_bb assert numpy.all(aa == n_bb) assert aa.shape == n_bb.shape
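# Added note (hedged): the "not contiguous" branch of subtest() above depends on
# step slicing producing a strided view; a quick numpy-only check of that setup.
import numpy

_a = numpy.arange(12, dtype='float32').reshape(6, 2)
_v = _a[::2]                                   # shape (3, 2), non-unit row stride
assert not _v.flags['C_CONTIGUOUS']
assert numpy.allclose(_v.reshape(2, 3).ravel(), _v.ravel())  # reshape copies here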
def set_input_space(self, space):
    self.input_space = space

    if isinstance(space, VectorSpace):
        self.requires_reformat = False
        self.input_dim = space.dim
    else:
        self.requires_reformat = True
        self.input_dim = space.get_total_dimension()
        self.desired_space = VectorSpace(self.input_dim)

    self.output_space = VectorSpace(self.dim)

    # we cannot set this in __init__() as we're not sure about the input
    # dimensions yet
    if self.istdev is not None:
        W = self.rng.randn(self.input_dim, self.dim) * self.istdev
        b = self.rng.randn(self.dim,) * self.istdev
    else:
        W = np.zeros((self.input_dim, self.dim))
        b = np.zeros((self.dim,))

    self.W = theano.shared(theano._asarray(W, dtype=theano.config.floatX),
                           name=(self.layer_name + '_W'))
    self.b = theano.shared(theano._asarray(b, dtype=theano.config.floatX),
                           name=(self.layer_name + '_b'))
def test_elemwise2(): """ Several kinds of elemwise expressions with dimension permutations """ rng = numpy.random.RandomState(int(time.time())) shape = (3, 5) for pattern in [(0, 1), (1, 0)]: a = tcn.shared_constructor(theano._asarray(rng.rand(*shape), dtype='float32'), name=None) b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))() f = pfunc([b], [], updates=[(a, (a + b).dimshuffle(pattern))], mode=mode_with_gpu) has_elemwise = False for i, node in enumerate(f.maker.env.toposort()): has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise) assert not has_elemwise #let debugmode catch errors f(theano._asarray(rng.rand(*shape), dtype='float32') * .3) shape = (3, 4, 5, 6) a = tcn.shared_constructor(theano._asarray(rng.rand(*shape), dtype='float32'), 'a') b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))() f = pfunc([b], [], updates=[(a, (a + b).dimshuffle([2, 0, 3, 1]) * tensor.exp(b ** a).dimshuffle([2, 0, 3, 1]))], mode=mode_with_gpu) has_elemwise = False for i, node in enumerate(f.maker.env.toposort()): has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise) assert not has_elemwise #let debugmode catch errors f(theano._asarray(rng.rand(*shape), dtype='float32'))
def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy, direction): ishape = (bs, ch, rImg1, rImg2) kshape = (nf, ch, rFlt1, rFlt2) subsample = (subsx, subsy) npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32') npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32') i = cuda_tensor4() k = cuda_tensor4() if direction == 'fprop': cpuval = py_conv(npy_img, npy_kern, 'valid', subsample) op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid', subsample=subsample)(i, k) f = theano.function([i, k], op, mode=theano_mode) gpuval = f(npy_img, npy_kern[:,:,::-1,::-1]) elif direction == 'bprop img': cpuval = py_conv(npy_img, npy_kern, 'full', subsample) op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs( border_mode='valid', subsample=subsample)(i, k) f = theano.function([i, k], op, mode=theano_mode) gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img) elif direction == 'bprop kern': cpuval = py_conv(npy_img, npy_kern, 'valid', subsample) op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights( border_mode='valid', subsample=subsample)(i, k) f = theano.function([i, k], op, mode=theano_mode) gpuval = numpy.array(f( npy_img.transpose(1, 0, 2, 3), npy_kern.transpose(1, 0, 2, 3)[:,:,::-1,::-1])).transpose( 1, 0, 2, 3) assert_allclose(cpuval, gpuval, rtol=1e-4)
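# Added note (hedged): the kernel flips above (npy_kern[:, :, ::-1, ::-1]) exist
# because GpuCorrMM computes cross-correlation while the py_conv reference computes
# true convolution; flipping the filter turns one into the other. 1-D numpy check:
import numpy

_x = numpy.asarray([1., 2., 3., 4.])
_k = numpy.asarray([1., 0., -1.])
assert numpy.allclose(numpy.convolve(_x, _k, mode='valid'),
                      numpy.correlate(_x, _k[::-1], mode='valid'))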
def test_setitem_rightvalue_ndarray_fails():
    """
    Now we don't automatically add dimensions to broadcast
    """
    a = numpy.arange(3 * 4 * 5)
    a.resize((3, 4, 5))
    a = theano._asarray(a, dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)

    b = theano._asarray([7, 8, 9, 10], dtype='float32')
    _b = cuda_ndarray.CudaNdarray(b)
    b5 = theano._asarray([7, 8, 9, 10, 11], dtype='float32')
    _b5 = cuda_ndarray.CudaNdarray(b5)

    # attempt to assign the ndarray b with setitem
    _a[:, :, 1] = _b
    a[:, :, 1] = b
    assert numpy.allclose(numpy.asarray(_a), a)

    # test direct transfer from numpy to a contiguous region,
    # with the same number of dimensions
    mat = numpy.random.rand(4, 5).astype('float32')
    _a[2, :, :] = mat
    a[2, :, :] = mat
    assert numpy.allclose(numpy.asarray(_a), a)

    # without the same number of dimensions
    try:
        _a[0, :, :] = mat
        # a[0, :, :] = mat
        # assert numpy.allclose(numpy.asarray(_a), a)
    except ValueError:
        pass
def test_setitem_matrix_bad_ndim(): a = numpy.arange(27) a.resize((3, 3, 3)) a = theano._asarray(a, dtype='float32') _a = cuda_ndarray.CudaNdarray(a) b = theano._asarray([7, 8], dtype='float32') _b = cuda_ndarray.CudaNdarray(b) try: # attempt to assign the ndarray b with setitem _a[:, :, 1] = _b assert False except ValueError as e: # print e assert True # test direct transfert from numpy try: # attempt to assign the ndarray b with setitem _a[1, :, :] = b assert False except ValueError as e: # print e assert True
def test_elemwise1(): """ Several kinds of elemwise expressions with no broadcasting, non power-of-two shape """ shape = (3, 4) a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.5, 'a') b = tensor.fmatrix() #let debugmode catch any mistakes print >> sys.stdout, "STARTING FUNCTION 1" f = pfunc([b], [], updates=[(a, b ** a)], mode=mode_with_gpu) for i, node in enumerate(f.maker.env.toposort()): print i, node f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3) print >> sys.stdout, "STARTING FUNCTION 2" #let debugmode catch any mistakes f = pfunc([b], [], updates=[(a, tensor.exp(b ** a))], mode=mode_with_gpu) for i, node in enumerate(f.maker.env.toposort()): print i, node f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3) print >> sys.stdout, "STARTING FUNCTION 3" #let debugmode catch any mistakes f = pfunc([b], [], updates=[(a, a + b * tensor.exp(b ** a))], mode=mode_with_gpu) f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)
def get_updates(self, grads): grads = OrderedDict(grads) updates = OrderedDict() for param in grads.keys(): # mean_squared_grad := E[g^2]_{t-1} mean_square_grad = theano.shared(theano._asarray(param.get_value() * 0., dtype=theano.config.floatX), name='mean_square_grad_' + param.name, borrow=False) self.parameters.append(mean_square_grad) # mean_square_dx := E[(\Delta x)^2]_{t-1} mean_square_dx = theano.shared(theano._asarray(param.get_value() * 0., dtype=theano.config.floatX), name='mean_square_dx_' + param.name, borrow=False) self.parameters.append(mean_square_dx) # Accumulate gradient new_mean_squared_grad = self.decay * mean_square_grad + (1 - self.decay) * T.sqr(grads[param]) # Compute update rms_dx_tm1 = T.sqrt(mean_square_dx + self.epsilon) rms_grad_t = T.sqrt(new_mean_squared_grad + self.epsilon) delta_x_t = - rms_dx_tm1 / rms_grad_t * grads[param] # Accumulate updates new_mean_square_dx = self.decay * mean_square_dx + (1 - self.decay) * T.sqr(delta_x_t) # Apply update updates[mean_square_grad] = new_mean_squared_grad updates[mean_square_dx] = new_mean_square_dx updates[param] = param + delta_x_t return updates
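# Added example (hedged): the symbolic updates above implement AdaDelta. A scalar
# numpy sketch of one step, to make the two running averages concrete; the decay
# and epsilon values are illustrative, and adadelta_step is not part of the class.
import numpy

def adadelta_step(param, grad, ms_grad, ms_dx, decay=0.95, epsilon=1e-6):
    ms_grad = decay * ms_grad + (1 - decay) * grad ** 2          # E[g^2]_t
    delta_x = -numpy.sqrt(ms_dx + epsilon) / numpy.sqrt(ms_grad + epsilon) * grad
    ms_dx = decay * ms_dx + (1 - decay) * delta_x ** 2           # E[(dx)^2]_t
    return param + delta_x, ms_grad, ms_dx

_p, _msg, _msd = adadelta_step(1.0, 0.5, 0.0, 0.0)
_p, _msg, _msd = adadelta_step(_p, 0.4, _msg, _msd)
assert _msg > 0 and _msd > 0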
def test_sum(): shape = (2,3) a0 = theano._asarray(numpy.arange(shape[0]*shape[1]).reshape(shape), dtype='float32') b0 = cuda_ndarray.CudaNdarray(a0) assert numpy.allclose(a0.sum(), numpy.asarray(b0.reduce_sum([1,1]))) a0sum = a0.sum(axis=0) b0sum = b0.reduce_sum([1,0]) print 'asum\n',a0sum print 'bsum\n',numpy.asarray(b0sum) assert numpy.allclose(a0.sum(axis=0), numpy.asarray(b0.reduce_sum([1,0]))) assert numpy.allclose(a0.sum(axis=1), numpy.asarray(b0.reduce_sum([0,1]))) assert numpy.allclose(a0, numpy.asarray(b0.reduce_sum([0,0]))) shape = (3,4,5,6,7,8) a0 = theano._asarray(numpy.arange(3*4*5*6*7*8).reshape(shape), dtype='float32') b0 = cuda_ndarray.CudaNdarray(a0) assert numpy.allclose(a0.sum(axis=5).sum(axis=3).sum(axis=0), numpy.asarray(b0.reduce_sum([1,0,0,1,0,1]))) shape = (16,2048) a0 = theano._asarray(numpy.arange(16*2048).reshape(shape), dtype='float32') b0 = cuda_ndarray.CudaNdarray(a0) assert numpy.allclose(a0.sum(axis=0), numpy.asarray(b0.reduce_sum([1,0]))) shape = (16,10) a0 = theano._asarray(numpy.arange(160).reshape(shape), dtype='float32') b0 = cuda_ndarray.CudaNdarray(a0) assert numpy.allclose(a0.sum(), numpy.asarray(b0.reduce_sum([1,1])))
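# Added note (hedged): reduce_sum takes a per-dimension mask (1 = sum over that
# axis, 0 = keep it). This is the numpy axis tuple it corresponds to; mask_to_axes
# is an illustrative helper, not CudaNdarray API.
import numpy

def mask_to_axes(mask):
    return tuple(i for i, m in enumerate(mask) if m)

_a = numpy.arange(6, dtype='float32').reshape(2, 3)
assert numpy.allclose(_a.sum(axis=mask_to_axes([1, 0])), _a.sum(axis=0))
assert numpy.allclose(_a.sum(axis=mask_to_axes([0, 1])), _a.sum(axis=1))
assert numpy.allclose(_a.sum(axis=mask_to_axes([1, 1])), _a.sum())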
def new_filters_expbounds( cls, rng, input, n_in, n_out, n_terms, dtype=None, eps=1e-1, exponent_range=(1.0, 3.0), filter_range=1.0 ): """Return a KouhLayer instance with random parameters The parameters are drawn on a range [typically] suitable for fine-tuning by gradient descent. :param input: a tensor of shape (n_examples, n_in) :type n_in: positive int :param n_in: number of input dimensions :type n_out: positive int :param n_out: number of dimensions in rval.output :param nterms: each (of n_out) complex-cell firing rate will be determined from this many 'simple cell' responses. :param eps: this amount is added to the softplus of filter responses as a baseline firing rate (that prevents a subsequent error from ``pow(0, p)``) :returns: KouhLayer instance with freshly-allocated random weights. """ if input.type.ndim != 2: raise TypeError("matrix expected for input") if dtype is None: dtype = input.dtype _logger.debug("dtype %s" % dtype) def shared_uniform(low, high, size, name): return _shared_uniform(rng, low, high, size, dtype, name) f_list = [ shared_uniform( low=-2.0 / numpy.sqrt(n_in), high=2.0 / numpy.sqrt(n_in), size=(n_in, n_out), name="f_%i" % i ) for i in xrange(n_terms) ] b_list = [shared_uniform(low=0, high=0.01, size=(n_out,), name="b_%i" % i) for i in xrange(n_terms)] # x_list = [theano._asarray(eps, dtype=dtype)+softplus(tensor.dot(input, f_list[i])) for i in xrange(n_terms)] filter_range = theano._asarray(filter_range, dtype=dtype) half_filter_range = theano._asarray(filter_range / 2, dtype=dtype) x_list = [ theano._asarray(filter_range + eps, dtype=dtype) + half_filter_range * softsign(tensor.dot(input, f_list[i]) + b_list[i]) for i in xrange(n_terms) ] rval = cls.new_expbounds(rng, x_list, n_out, dtype=dtype, params=f_list + b_list, exponent_range=exponent_range) rval.f_list = f_list rval.input = input # add the input to the returned object rval.filter_l1 = sum(abs(fi).sum() for fi in f_list) rval.filter_l2_sqr = sum((fi ** 2).sum() for fi in f_list) return rval
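# Added example (hedged): a numpy sketch of the bounded filter response built in
# new_filters_expbounds above. softsign maps into (-1, 1), so the response stays in
# (eps + filter_range/2, eps + 3*filter_range/2), i.e. strictly positive, which
# keeps a later pow(x, p) well defined. bounded_response is an illustrative name.
import numpy

def bounded_response(s, eps=1e-1, filter_range=1.0):
    softsign = s / (1.0 + numpy.abs(s))
    return filter_range + eps + 0.5 * filter_range * softsign

_s = numpy.linspace(-100.0, 100.0, 5)
_x = bounded_response(_s)
assert numpy.all(_x > 0.6) and numpy.all(_x < 1.6)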
def test_invalid_arg(self): img = theano._asarray(numpy.empty((1, 1, 1, 1)), dtype='float32') kern = theano._asarray(numpy.empty((1, 1, 1, 1)), dtype='float32') for i in self.conv_ops: assert_raises(ValueError, i, img, kern, border_mode=(-1, 0)) assert_raises(ValueError, i, img, kern, border_mode=(0, -1)) assert_raises(ValueError, i, img, kern, border_mode='not border')
def sharedX(value, name=None, borrow=True, keep_on_cpu=False): """ Transform value into a shared variable of type floatX """ if keep_on_cpu: return T._shared(theano._asarray(value, dtype=theano.config.floatX), name=name, borrow=borrow) return theano.shared(theano._asarray(value, dtype=theano.config.floatX), name=name, borrow=borrow)
def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op): ishape = (bs, ch, rImg1, rImg2) kshape = (nf, ch, rFlt1, rFlt2) npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32') npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32') i = cuda.CudaNdarrayType( broadcastable=[sh == 1 for sh in npy_img.shape])() k = cuda.CudaNdarrayType( broadcastable=[sh == 1 for sh in npy_kern.shape])() # TODO: also test custom pad values corr_op = op(mode, subsample)(i, k) # try to compile reference implementation without shape, # so we don't have to compile hundreds of versions conv_op = tensor.nnet.conv2d(i, k[:, :, ::-1, ::-1], border_mode=mode, subsample=subsample) try: conv_op_di = theano.grad(conv_op.sum(), i) conv_op_dk = theano.grad(conv_op.sum(), k) except Exception: # compile with shape information only when needed conv_op = tensor.nnet.conv2d(i, k[:, :, ::-1, ::-1], ishape, kshape, mode, subsample) conv_op_di = theano.grad(conv_op.sum(), i) conv_op_dk = theano.grad(conv_op.sum(), k) corr_op_di = theano.grad(corr_op.sum(), i) corr_op_dk = theano.grad(corr_op.sum(), k) outputs = [corr_op, conv_op, corr_op_di, conv_op_di, corr_op_dk, conv_op_dk] try: conv_op_dik = theano.grad(conv_op_di.sum(), k) conv_op_dki = theano.grad(conv_op_dk.sum(), i) corr_op_dik = theano.grad(corr_op_di.sum(), k) corr_op_dki = theano.grad(corr_op_dk.sum(), i) outputs.extend([corr_op_dik, conv_op_dik, corr_op_dki, conv_op_dki]) except Exception: # skip if the reference implementation can't do it pass f = theano.function([i, k], outputs, mode=theano_mode.excluding('conv_dnn', 'conv_gemm')) allvals = f(npy_img, npy_kern) for a, b, oa, ob, p in zip(allvals[::2], allvals[1::2], outputs[::2], outputs[1::2], ('top', 'dtop/dbottom', 'dtop/dweight', 'dtop/dbottom/dweight', 'dtop/dweight/dbottom')): assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2] assert_allclose(a, b, rtol=1e-4)
def learning_rates_setup(self, base_lr, **kwargs): """ Initializes parameter-specific learning rate dictionary and shared variables for the annealed base learning rate and iteration number. Parameters ---------- base_lr : float The base learning rate before annealing or parameter-specific scaling. Notes ----- Parameter-specific learning rates can be set by passing keyword arguments <name>_lr, where name is the .name attribute of a given parameter. """ # Take care of learning rate scales for individual parameters self.learning_rates = {} # Base learning rate per example. self.base_lr = theano._asarray(base_lr, dtype=floatX) # Keep track of names already seen lr_names_seen = set() for parameter in self.params: lr_name = '%s_lr' % parameter.name if lr_name in lr_names_seen: print >> sys.stderr, ('Warning: In SGDOptimizer, ' 'at least two parameters have the same name. ' 'Both will be affected by the keyword argument ' '%s.' % lr_name) lr_names_seen.add(parameter.name) thislr = kwargs.get(lr_name, 1.) self.learning_rates[parameter] = sharedX(thislr, lr_name) # Verify that no ..._lr keyword argument is ignored for lr_name in lr_names_seen: if lr_name in kwargs: kwargs.pop(lr_name) for kw in kwargs.iterkeys(): if kw[-3:] == '_lr': print >> sys.stderr, ('Warning: in SGDOptimizer, ' 'keyword argument %s will be ignored, ' 'because no parameter was found with name %s.' % (kw, kw[:-3])) # A shared variable for storing the iteration number. self.iteration = sharedX(theano._asarray(0, dtype='int32'), name='iter') # A shared variable for storing the annealed base learning rate, used # to lower the learning rate gradually after a certain amount of time. self.annealed = sharedX(base_lr, 'annealed')
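# Added example (hedged): the '<parameter name>_lr' keyword convention handled in
# learning_rates_setup above, reduced to a plain-Python sketch. collect_lr_scales
# is an illustrative helper, not SGDOptimizer API.
def collect_lr_scales(param_names, **kwargs):
    return dict((name, kwargs.get('%s_lr' % name, 1.0)) for name in param_names)

assert collect_lr_scales(['W', 'b'], W_lr=0.1) == {'W': 0.1, 'b': 1.0}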
def test_gemm_vector_vector():
    a = theano._asarray(numpy.random.rand(5, 1), dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)
    b = theano._asarray(numpy.random.rand(1, 5), dtype='float32')
    _b = cuda_ndarray.CudaNdarray(b)

    _c = cuda_ndarray.dot(_a, _b)
    assert _c.shape == (5, 5)
    assert numpy.allclose(_c, numpy.dot(a, b))

    _c = cuda_ndarray.dot(_b, _a)
    assert _c.shape == (1, 1)
    assert numpy.allclose(_c, numpy.dot(b, a))
def test_dimshuffle(self): utt.seed_rng() rng = numpy.random.RandomState(utt.fetch_seed()) # 2d -> 0d a = theano._asarray(rng.randn(1,1), dtype='float32') b = cuda_ndarray.CudaNdarray(a) assert numpy.allclose(numpy.transpose(a), cuda_ndarray.dimshuffle(b,())) # Test when we drop a axis that don't have shape 1 a = theano._asarray(rng.randn(2,1), dtype='float32') b = cuda_ndarray.CudaNdarray(a) self.assertRaises(ValueError, cuda_ndarray.dimshuffle, b,()) # Test that we can't take a dimensions multiple time a = theano._asarray(rng.randn(2,1), dtype='float32') b = cuda_ndarray.CudaNdarray(a) self.assertRaises(ValueError, cuda_ndarray.dimshuffle, b,(1,1)) # 1d a = theano._asarray(rng.randn(3,), dtype='float32') b = cuda_ndarray.CudaNdarray(a) assert numpy.allclose(numpy.transpose(a), cuda_ndarray.dimshuffle(b,(0,))) assert numpy.allclose(a[None,:,None], cuda_ndarray.dimshuffle(b,(-1,0,-1))) # 2d a = theano._asarray(rng.randn(3,11), dtype='float32') b = cuda_ndarray.CudaNdarray(a) assert numpy.allclose(numpy.transpose(a), cuda_ndarray.dimshuffle(b,(1,0))) assert numpy.allclose(numpy.transpose(a)[None,:,None,:,None], cuda_ndarray.dimshuffle(b,(-1,1,-1,0,-1))) # 2d -> 1d a = theano._asarray(rng.randn(1,11), dtype='float32') b = cuda_ndarray.CudaNdarray(a) assert numpy.allclose(a[:,], cuda_ndarray.dimshuffle(b,(1,))) a = theano._asarray(rng.randn(11,1), dtype='float32') b = cuda_ndarray.CudaNdarray(a) assert numpy.allclose(a.reshape((11,)), cuda_ndarray.dimshuffle(b,(0,))) # 3d a = theano._asarray(rng.randn(3,4,5), dtype='float32') b = cuda_ndarray.CudaNdarray(a) assert numpy.allclose(a, cuda_ndarray.dimshuffle(b,(0,1,2))) assert numpy.allclose(numpy.swapaxes(a,0,1), cuda_ndarray.dimshuffle(b,(1,0,2))) assert numpy.allclose(numpy.swapaxes(a,0,2), cuda_ndarray.dimshuffle(b,(2,1,0))) assert numpy.allclose(numpy.swapaxes(a,1,2), cuda_ndarray.dimshuffle(b,(0,2,1))) assert numpy.allclose(numpy.swapaxes(a,1,2)[None,:,None,:,:,None], cuda_ndarray.dimshuffle(b,(-1,0,-1,2,1,-1))) # 4d a = theano._asarray(rng.randn(3,11,4,5), dtype='float32') b = cuda_ndarray.CudaNdarray(a) assert numpy.allclose(numpy.swapaxes(a,0,1), cuda_ndarray.dimshuffle(b,(1,0,2,3))) assert numpy.allclose(numpy.swapaxes(a,0,2), cuda_ndarray.dimshuffle(b,(2,1,0,3))) assert numpy.allclose(numpy.swapaxes(a,0,3), cuda_ndarray.dimshuffle(b,(3,1,2,0))) assert numpy.allclose(numpy.swapaxes(a,0,3), cuda_ndarray.dimshuffle(b,(3,1,2,0))) assert numpy.allclose(numpy.swapaxes(a,0,3)[None,:,None,:,:,:], cuda_ndarray.dimshuffle(b,(-1,3,-1,1,2,0)))
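# Added note (hedged): a numpy equivalent of the dimshuffle patterns tested above,
# for patterns that keep every input axis; -1 inserts a new broadcastable axis.
# apply_dimshuffle is an illustrative helper, not the CUDA implementation.
import numpy

def apply_dimshuffle(a, pattern):
    kept = [p for p in pattern if p != -1]
    out = numpy.transpose(a, kept)
    index = tuple(numpy.newaxis if p == -1 else slice(None) for p in pattern)
    return out[index]

_a = numpy.arange(33, dtype='float32').reshape(3, 11)
assert apply_dimshuffle(_a, (1, 0)).shape == (11, 3)
assert apply_dimshuffle(_a, (-1, 1, -1, 0, -1)).shape == (1, 11, 1, 3, 1)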
def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)
    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    i = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_kern.shape])()

    # TODO: also test custom pad values
    corr_op = op(mode, subsample)(i, k)
    conv_op = tensor.nnet.conv2d(i, k[:, :, ::-1, ::-1],
                                 border_mode=mode,
                                 subsample=subsample)
    conv_op_di = theano.grad(conv_op.sum(), i)
    conv_op_dk = theano.grad(conv_op.sum(), k)
    corr_op_di = theano.grad(corr_op.sum(), i)
    corr_op_dk = theano.grad(corr_op.sum(), k)
    outputs = [corr_op, conv_op,
               corr_op_di, conv_op_di,
               corr_op_dk, conv_op_dk]

    conv_op_dik = theano.grad(conv_op_di.sum(), k)
    conv_op_dki = theano.grad(conv_op_dk.sum(), i)
    corr_op_dik = theano.grad(corr_op_di.sum(), k)
    corr_op_dki = theano.grad(corr_op_dk.sum(), i)
    outputs.extend([corr_op_dik, conv_op_dik,
                    corr_op_dki, conv_op_dki])

    if not theano.config.blas.ldflags:
        # Some of the operations are not transferred to the GPU, and
        # without BLAS the abstract Op will not be optimized to CorrMM
        # either, so we have to accept the use of the slow Python
        # convolution in that case.
        mode = theano_mode.excluding('AbstractConvCheck')
    else:
        mode = theano_mode

    f = theano.function([i, k], outputs, mode=mode)

    allvals = f(npy_img, npy_kern)

    for a, b, oa, ob, p in zip(allvals[::2], allvals[1::2],
                               outputs[::2], outputs[1::2],
                               ('top', 'dtop/dbottom', 'dtop/dweight',
                                'dtop/dbottom/dweight',
                                'dtop/dweight/dbottom')):
        assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2]
        assert_allclose(a, b, rtol=1e-4)
def test_elemwise_fusion(): """ Test the the GpuElemwise fusion work correctly""" shape = (3,4) a = cuda.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a') b = tensor.fmatrix() c = tensor.fmatrix() f = pfunc([b,c], [a+b+c], mode=mode_with_gpu) topo = f.maker.env.toposort() for i, node in enumerate(topo): print >> sys.stdout, i, node assert len(topo)==4 assert isinstance(topo[2].op.scalar_op,theano.scalar.basic.Composite) #let debugmode catch errors f(theano._asarray(numpy.random.rand(*shape), dtype='float32'), theano._asarray(numpy.random.rand(*shape), dtype='float32'))
def test_gemm_directly(): for direction in ['fprop', 'bprop img', 'bprop kern']: print 'Testing direction: ' + direction for bs in range(1, 5): for ch in range(1,4): for nf in range(1,4): for rImg1 in range(5, 9): for rImg2 in range(5, 9): for rFlt1 in range(2, 4): for rFlt2 in range(2, 4): for subsx in range(1, 3) if direction == 'fprop' else [1]: for subsy in range(1, 3) if direction == 'fprop' else [1]: ishape = (bs, ch, rImg1, rImg2) kshape = (nf, ch, rFlt1, rFlt2) subsample = (subsx, subsy) npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32') npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32') i = cuda_tensor4() k = cuda_tensor4() if direction == 'fprop': cpuval = py_conv(npy_img, npy_kern, 'valid', subsample) op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid', subsample=subsample)(i, k) f = theano.function([i, k], op, mode=theano_mode) gpuval = f(npy_img, npy_kern[:,:,::-1,::-1]) elif direction == 'bprop img': cpuval = py_conv(npy_img, npy_kern, 'full', subsample) op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(border_mode='valid', subsample=subsample)(i, k) f = theano.function([i, k], op, mode=theano_mode) gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img) elif direction == 'bprop kern': cpuval = py_conv(npy_img, npy_kern, 'valid', subsample) op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(border_mode='valid', subsample=subsample)(i, k) f = theano.function([i, k], op, mode=theano_mode) gpuval = numpy.array(f(npy_img.transpose(1, 0, 2, 3), npy_kern.transpose(1, 0, 2, 3)[:,:,::-1,::-1])).transpose(1, 0, 2, 3) if not numpy.allclose(cpuval, gpuval, rtol=1e-4): print "Test failed for" print "direction: ", direction print "ishape: ", ishape print "kshape: ", kshape print "subsample: ", subsample assert False
def test_elemwise4(): """ Test that two vectors can be broadcast to form an outer product (by performing rank-1 matrix update""" shape = (3,4) a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a') b = tensor.fvector() c = tensor.fvector() f = pfunc([b,c], [], updates=[(a, (a+b.dimshuffle('x', 0)*c.dimshuffle(0, 'x')))], mode=mode_with_gpu) has_elemwise = False for i, node in enumerate(f.maker.env.toposort()): print >> sys.stdout, i, node has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise) assert not has_elemwise #let debugmode catch errors f(theano._asarray(numpy.random.rand(4), dtype='float32'), theano._asarray(numpy.random.rand(3), dtype='float32'))
def perform(self, node, inp, out_):
    x, = inp
    out, = out_
    if out[0] is None:
        out[0] = theano._asarray(x.shape[self.i], dtype='int64')
    else:
        out[0][...] = x.shape[self.i]
def __init__(self, input, n_in, n_out, activation, rng, layer_name="LogReg",
             W=None, b=None, borrow=True):
    # Weight matrix W
    if W is not None:
        self.W = shared(W, name=layer_name+"_W", borrow=borrow)
    elif activation in (relu, softplus):
        W_val = _asarray(rng.normal(loc=0, scale=0.01, size=(n_in, n_out)),
                         dtype=floatX)
        self.W = shared(W_val, name=layer_name+"_W", borrow=borrow)
    else:
        self.W = shared(zeros((n_in, n_out), dtype=floatX),
                        name=layer_name+"_W", borrow=borrow)

    # Bias vector
    if b is not None:
        self.b = shared(b, name=layer_name+"_b", borrow=borrow)
    elif activation in (relu, softplus):
        b_val = ones((n_out,), dtype=floatX)
        self.b = shared(value=b_val, borrow=True)
    else:
        self.b = shared(zeros((n_out,), dtype=floatX),
                        name=layer_name+"_b", borrow=borrow)

    # Vector of prediction probabilities
    self.p_y_given_x = softmax(T.dot(input, self.W) + self.b)
    # Prediction
    self.y_pred = T.argmax(self.p_y_given_x, axis=1)
    # Parameters of the model
    self.params = [self.W, self.b]
def __init__(self, num_components=None, min_variance=0.0, whiten=False): """ :type num_components: int :param num_components: this many components will be preserved, in decreasing order of variance (default None keeps all) :type min_variance: float :param min_variance: components with normalized variance [0-1] below this threshold will be discarded :type whiten: bool :param whiten: whether or not to divide projected features by their standard deviation """ super(_PCABase, self).__init__() self.num_components = num_components self.min_variance = min_variance self.whiten = whiten self.W = None self.v = None self.mean = None self.component_cutoff = theano.shared( theano._asarray(0, dtype='int64'), name='component_cutoff') # This module really has no adjustable parameters -- once train() # is called once, they are frozen, and are not modified via gradient # descent. self._params = []
def test_host_to_device():
    print >> sys.stdout, 'starting test_host_to_dev'
    for shape in ((), (3,), (2, 3), (3, 4, 5, 6)):
        a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
        b = cuda_ndarray.CudaNdarray(a)
        c = numpy.asarray(b)
        assert numpy.all(a == c)
def test_may_share_memory(): a = scipy.sparse.csc_matrix(scipy.sparse.eye(5, 3)) b = scipy.sparse.csc_matrix(scipy.sparse.eye(4, 3)) as_ar = lambda a: theano._asarray(a, dtype="int32") for a_, b_, rep in [ (a, a, True), (b, b, True), (a, b, False), (a, a.data, True), (a, a.indptr, True), (a, a.indices, True), (a, as_ar(a.shape), False), (a.data, a, True), (a.indptr, a, True), (a.indices, a, True), (as_ar(a.shape), a, False), (b, b.data, True), (b, b.indptr, True), (b, b.indices, True), (b, as_ar(b.shape), False), (b.data, b, True), (b.indptr, b, True), (b.indices, b, True), (as_ar(b.shape), b, False), (b.data, a, False), (b.indptr, a, False), (b.indices, a, False), (as_ar(b.shape), a, False), ]: assert SparseType.may_share_memory(a_, b_) == rep
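# Added note (hedged): a dense analogue of the sparse may_share_memory cases above;
# a CSC matrix aliases its own .data/.indptr/.indices buffers but not a freshly
# built shape array. Assumes scipy is importable, as in the test.
import numpy
import scipy.sparse

_m = scipy.sparse.csc_matrix(scipy.sparse.eye(5, 3))
assert numpy.may_share_memory(_m.data, _m.data)
assert not numpy.may_share_memory(_m.data, numpy.asarray(_m.shape, dtype='int32'))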
def test_setitem_matrix_bad_ndim(): a = numpy.arange(27) a.resize((3,3,3)) a = theano._asarray(a, dtype='float32') _a = cuda_ndarray.CudaNdarray(a) b = theano._asarray([7,8], dtype='float32') _b = cuda_ndarray.CudaNdarray(b) try: # attempt to assign the ndarray b with setitem _a[:,:,1] = _b assert False except NotImplementedError, e: #print e assert True
def perform(self, node, inputs, output_storage):
    a = inputs[0]
    axis = inputs[1]
    z = output_storage[0]
    z[0] = theano._asarray(np.argsort(a, axis, self.kind, self.order),
                           dtype=node.outputs[0].dtype)
def __init__(self, which_set, multi_target=False): assert which_set in ['train', 'test'] self.which_set = which_set X = SmallNORB.load(which_set, 'dat') # Casts to the GPU-supported float type, using theano._asarray(), a # safer alternative to numpy.asarray(). # # TODO: move the dtype-casting to the view_converter's output space, # once dtypes-for-spaces is merged into master. X = theano._asarray(X, theano.config.floatX) # Formats data as rows in a matrix, for DenseDesignMatrix X = X.reshape(-1, 2*numpy.prod(self.original_image_shape)) # This is uint8 y = SmallNORB.load(which_set, 'cat') if multi_target: y_extra = SmallNORB.load(which_set, 'info') y = numpy.hstack((y[:, numpy.newaxis], y_extra)) datum_shape = ((2, ) + # two stereo images self.original_image_shape + (1, )) # one color channel # 's' is the stereo channel: 0 (left) or 1 (right) axes = ('b', 's', 0, 1, 'c') view_converter = StereoViewConverter(datum_shape, axes) super(SmallNORB, self).__init__(X=X, y=y, view_converter=view_converter)
def new_filters_expbounds(cls, rng, input, n_in, n_out, n_terms, dtype=None, eps=1e-1, exponent_range=(1.0, 3.0), filter_range=1.0): """Return a KouhLayer instance with random parameters The parameters are drawn on a range [typically] suitable for fine-tuning by gradient descent. :param input: a tensor of shape (n_examples, n_in) :type n_in: positive int :param n_in: number of input dimensions :type n_out: positive int :param n_out: number of dimensions in rval.output :param nterms: each (of n_out) complex-cell firing rate will be determined from this many 'simple cell' responses. :param eps: this amount is added to the softplus of filter responses as a baseline firing rate (that prevents a subsequent error from ``pow(0, p)``) :returns: KouhLayer instance with freshly-allocated random weights. """ if input.type.ndim != 2: raise TypeError('matrix expected for input') if dtype is None: dtype = input.dtype _logger.debug('dtype %s' % dtype) def shared_uniform(low, high, size, name): return _shared_uniform(rng, low, high, size, dtype, name) f_list = [ shared_uniform(low=-2.0 / numpy.sqrt(n_in), high=2.0 / numpy.sqrt(n_in), size=(n_in, n_out), name='f_%i' % i) for i in xrange(n_terms) ] b_list = [ shared_uniform(low=0, high=.01, size=(n_out, ), name='b_%i' % i) for i in xrange(n_terms) ] #x_list = [theano._asarray(eps, dtype=dtype)+softplus(tensor.dot(input, f_list[i])) for i in xrange(n_terms)] filter_range = theano._asarray(filter_range, dtype=dtype) half_filter_range = theano._asarray(filter_range / 2, dtype=dtype) x_list = [ theano._asarray(filter_range + eps, dtype=dtype) + half_filter_range * softsign(tensor.dot(input, f_list[i]) + b_list[i]) for i in xrange(n_terms) ] rval = cls.new_expbounds(rng, x_list, n_out, dtype=dtype, params=f_list + b_list, exponent_range=exponent_range) rval.f_list = f_list rval.input = input # add the input to the returned object rval.filter_l1 = sum(abs(fi).sum() for fi in f_list) rval.filter_l2_sqr = sum((fi**2).sum() for fi in f_list) return rval
def perform(self, node, inputs, output_storage):
    # Fixed by GWT: ensure output from numpy matches expected output dtype
    # Addresses hyperopt issue #58
    output_storage[0][0] = theano._asarray(numpy.argsort(inputs[0]),
                                           dtype=node.outputs[0].type.dtype)
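# Added note (hedged): why the explicit cast above matters -- numpy.argsort returns
# the platform-dependent integer type (intp), while the symbolic output declares a
# fixed dtype, so the result is normalized with theano._asarray. numpy-only check:
import numpy

_idx = numpy.argsort(numpy.asarray([0.3, 0.1, 0.2]))
_idx64 = numpy.asarray(_idx, dtype='int64')
assert _idx64.dtype == numpy.dtype('int64')
assert list(_idx64) == [1, 2, 0]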
op = careduce_op(scalar_op, axis=pattern) pat = tensor_pattern_to_gpu_pattern(shape, pattern) #GpuCAReduce{maximum} support only those patterns if scalar_op is theano.scalar.maximum and pat not in [ (0, 1), (0, 1, 1), (0, 1, 1)]: continue a = tensor.TensorType('float32', (False,) * len(shape))() dim_pattern = range(len(shape)) dim_pattern[0] = 1 dim_pattern[1] = 0 a = a.dimshuffle(dim_pattern) b = op(a) val = numpy.random.rand(numpy.prod(shape)).reshape(shape) # val = numpy.ones(shape) # val = numpy.arange(numpy.prod(shape)).reshape(shape) val = theano._asarray(val, dtype='float32') f = theano.function([a], b, mode=mode_with_gpu) f2 = theano.function([a], b, mode=mode_without_gpu) assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()] assert op.__class__ in [x.op.__class__ for x in f2.maker.fgraph.toposort()] assert _allclose(f2(val), f(val)), ('shape', shape, 'pattern', pattern, sum([shape[i] for i in pattern])) #test with broadcast for shape, pattern in [((5,),[0]), ((5,4),[0,1]),((5,4),[0]), ((5,4,3),[0]),((5,4,3),[0,1]), ((5,4,3),[2]),((5,4,3),[0,1,2]),
def rand_cuda_ndarray(shape): return cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape), dtype='float32'))
def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize, ignore_error=False, n_train=10, gpu_only=False, cpu_only=False, float_atol=1e-06, check_isfinite=True, pickle=False, verbose=0, version=-1): """Run the nnet2 function on 1 or 2 devices, and compares the results. float_atol: None mean use the default value. check_isfinite: the debug mode option. We forward this value to debug mode. For some parameter CrossentropyCategorical1Hot op generate inf when not optimized. """ if config.mode == 'DEBUG_MODE': n_train = 1 # Change global tolerance, used in DebugMode for instance orig_float32_atol = theano.tensor.basic.float32_atol try: if float_atol: #print "float_atol", float_atol theano.tensor.basic.float32_atol = float_atol if gpu_only and cpu_only: raise ValueError("Please use only one of cpu_only and gpu_only") elif cpu_only: use_gpu = False compare = False elif gpu_only: use_gpu = True compare = False else: compare = True if not compare: return run_conv_nnet2_classif(use_gpu=use_gpu, seed=seed, isize=isize, ksize=ksize, bsize=bsize, n_train=n_train, check_isfinite=check_isfinite, pickle=pickle, verbose=verbose, version=version) utt.seed_rng(seed) # Seeds numpy.random with seed train_cpu, params_cpu, x_shape, y_shape, mode_cpu = \ build_conv_nnet2_classif( use_gpu=False, isize=isize, ksize=ksize, n_batch=bsize, verbose=verbose, version=version, check_isfinite=check_isfinite) utt.seed_rng(seed) # Seeds numpy.random with seed train_gpu, params_gpu, x_shape_gpu, y_shape_gpu, mode_gpu = \ build_conv_nnet2_classif( use_gpu=True, isize=isize, ksize=ksize, n_batch=bsize, verbose=verbose, version=version, check_isfinite=check_isfinite) assert x_shape == x_shape_gpu assert y_shape == y_shape_gpu xval = my_rand(*x_shape) yval = my_rand(*y_shape) lr = theano._asarray(0.01, dtype='float32') time_cpu = 0 time_gpu = 0 for i in range(n_train): # Train one batch on CPU t0 = time.time() rval_cpu = train_cpu(xval, yval, lr)[0] t1 = time.time() time_cpu += (t1 - t0) # Train one batch on GPU t0 = time.time() rval_gpu = train_gpu(xval, yval, lr)[0] t1 = time.time() time_gpu += (t1 - t0) # Compare results if (verbose or not numpy.allclose( rval_cpu, rval_gpu, rtol=1e-5, atol=float_atol)): print "At batch:", i + 1 print "CPU:", rval_cpu print "GPU:", rval_gpu print "abs diff:", numpy.absolute(rval_gpu - rval_cpu) print "rel diff:", numpy.absolute( (rval_gpu - rval_cpu) / rval_gpu) if not ignore_error: assert numpy.allclose(rval_cpu, rval_gpu, rtol=1e-5, atol=float_atol) # Synchronize parameters to start from the same point next time if i < n_train - 1: for cpu_p, gpu_p in zip(params_cpu, params_gpu): cpu_p.set_value(gpu_p.get_value(borrow=False), borrow=True) finally: theano.tensor.basic.float32_atol = orig_float32_atol if pickle: if isinstance(cpu_mode, theano.compile.ProfileMode): import pickle print "BEGIN CPU profile mode dump" print pickle.dumps(cpu_mode) print "END CPU profile mode dump" if isinstance(gpu_mode, theano.compile.ProfileMode): import pickle print "BEGIN GPU profile mode dump" print pickle.dumps(gpu_mode) print "END GPU profile mode dump"
def my_rand(*shape): return theano._asarray(numpy.random.rand(*shape), dtype='float32')
def __init__(self, input, n_in_maps, n_out_maps, kernel_shape, video_shape, batch_size, activation, layer_name="Conv", rng=RandomState(1234), borrow=True, stride=1, W=None, b=None, b_scale=0.1, W_scale=0.01, fast_conv=False): """ video_shape: (frames, height, width) kernel_shape: (frames, height, width) W_shape: (out, in, kern_frames, kern_height, kern_width) """ self.__dict__.update(locals()) del self.self # init W #print type(W) if type(W) != numpy.ndarray: W_flag = (W != None) else: W_flag = (W.all() != None) if W_flag: self.W = shared(array(W, dtype=floatX), borrow=borrow, name=layer_name+'_W') # wudi made it shared else: # fan in: filter time x filter height x filter width x input maps fan_in = prod(kernel_shape)*n_in_maps norm_scale = 2. * sqrt( 1. / fan_in ) if activation in ('relu', 'softplus', 'leaky_relu'): print activation norm_scale = W_scale W_shape = (n_out_maps, n_in_maps)+kernel_shape W_val = _asarray(rng.normal(loc=0, scale=norm_scale, size=W_shape),\ dtype=floatX) # W_val = ones(W_shape, dtype=floatX)*W_scale self.W = shared(value=W_val, borrow=borrow, name=layer_name+'_W') self.params = [self.W] # init bias if type(b) != numpy.ndarray: b_flag = (b != None) else: b_flag = (b.all() != None) if b_flag: self.b = shared(array(b, dtype=floatX), name=layer_name+"_b", borrow=borrow) # wudi made it shared elif activation in ('relu', 'softplus', 'leaky_relu'): # print b_scale b_val = (ones((n_out_maps,), dtype=floatX)*b_scale).astype(floatX) self.b = shared(b_val, name=layer_name+"_b", borrow=borrow) else: b_val = zeros((n_out_maps,), dtype=floatX) self.b = shared(b_val, name=layer_name+"_b", borrow=borrow) self.params.append(self.b) # 3D convolution; dimshuffle: last 3 dimensions must be (in, h, w) n_fr, h, w = video_shape n_fr_k, h_k, w_k = kernel_shape out = conv3d( signals=input.dimshuffle([0,2,1,3,4]), filters=self.W.dimshuffle([0,2,1,3,4]), signals_shape=(batch_size, n_fr, n_in_maps, h, w), filters_shape=(n_out_maps, n_fr_k, n_in_maps, h_k, w_k), border_mode='valid', fast_conv=fast_conv, stride=stride ).dimshuffle([0,2,1,3,4]) out += self.b.dimshuffle('x',0,'x','x','x') self.output = eval(activation)(out)
def perform(self, node, inp, out_):
    x, = inp
    out, = out_
    out[0] = theano._asarray(x.shape, dtype='int64')
def filter(self, data, strict=False, allow_downcast=None): """ Convert `data` to something which can be associated to a `TensorVariable`. This function is not meant to be called in user code. It is for `Linker` instances to use when running a compiled graph. """ # Explicit error message when one accidentally uses a Variable as # input (typical mistake, especially with shared variables). if isinstance(data, Variable): raise TypeError( 'Expected an array-like object, but found a Variable: ' 'maybe you are trying to call a function on a (possibly ' 'shared) variable instead of a numeric array?') if ((type(data) is numpy.ndarray) and (data.dtype == self.numpy_dtype)): if data.dtype.num != self.numpy_dtype.num: data = theano._asarray(data, dtype=self.dtype) # -- now fall through to ndim check elif ((type(data) is numpy.memmap) and (data.dtype == self.numpy_dtype)): # numpy.memmap is a "safe" subclass of ndarray, # so we can use it whereever we expect a base ndarray. # however, casting it would defeat the purpose of not # loading the whole data into memory pass elif strict: # If any of the two conditions above was not met, # we raise a meaningful TypeError. if not (type(data) is numpy.ndarray): raise TypeError("%s expected a ndarray object." % self, data, type(data)) if data.dtype != self.numpy_dtype: raise TypeError(("%s expected a ndarray object with " "dtype = %s (got %s).") % (self, self.numpy_dtype, data.dtype)) assert False, "This point should never be reached." else: if allow_downcast: # Convert to self.dtype, regardless of the type of data data = theano._asarray(data, dtype=self.dtype) # TODO: consider to pad shape with ones to make it consistent # with self.broadcastable... like vector->row type thing else: if isinstance(data, numpy.ndarray): # Check if self.dtype can accurately represent data # (do not try to convert the data) up_dtype = scal.upcast(self.dtype, data.dtype) if up_dtype == self.dtype: # Bug in the following line when data is a # scalar array, see # http://projects.scipy.org/numpy/ticket/1611 # data = data.astype(self.dtype) data = theano._asarray(data, dtype=self.dtype) if up_dtype != self.dtype: err_msg = ( '%s cannot store a value of dtype %s without ' 'risking loss of precision. If you do not mind ' 'this loss, you can: ' '1) explicitly cast your data to %s, or ' '2) set "allow_input_downcast=True" when calling ' '"function".' % (self, data.dtype, self.dtype)) raise TypeError(err_msg, data) elif (allow_downcast is None and type(data) is float and self.dtype == theano.config.floatX): # Special case where we allow downcasting of Python float # literals to floatX, even when floatX=='float32' data = theano._asarray(data, self.dtype) else: # data has to be converted. # Check that this conversion is lossless converted_data = theano._asarray(data, self.dtype) # We use the `values_eq` static function from TensorType # to handle NaN values. if TensorType.values_eq(numpy.asarray(data), converted_data, force_same_dtype=False): data = converted_data else: # Do not print a too long description of data # (ndarray truncates it, but it's not sure for data) str_data = str(data) if len(str_data) > 80: str_data = str_data[:75] + '(...)' err_msg = ( '%s cannot store accurately value %s, ' 'it would be represented as %s. ' 'If you do not mind this precision loss, you can: ' '1) explicitly convert your data to a numpy array ' 'of dtype %s, or ' '2) set "allow_input_downcast=True" when calling ' '"function".' 
% (self, data, converted_data, self.dtype)) raise TypeError(err_msg, data) if self.ndim != data.ndim: raise TypeError("Wrong number of dimensions: expected %s," " got %s with shape %s." % (self.ndim, data.ndim, data.shape)) if not data.flags.aligned: try: msg = "object buffer" + str(data.data) except AttributeError: msg = "" raise TypeError("The numpy.ndarray object is not aligned." " Theano C code does not support that.", msg, "object shape", data.shape, "object strides", data.strides, "object dtype", data.dtype) i = 0 for b in self.broadcastable: if b and data.shape[i] != 1: raise TypeError("Non-unit value on shape on a broadcastable" " dimension.", data.shape, self.broadcastable) i += 1 if (self.filter_checks_isfinite and not numpy.all(numpy.isfinite(data))): raise ValueError("non-finite elements not allowed") return data
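# Added example (hedged): a numpy sketch of the lossless-conversion test that
# filter() performs above when allow_downcast is None -- cast, then accept the cast
# only if the round trip preserved the values (the real code uses values_eq, which
# also treats NaN as equal). can_store_losslessly is an illustrative name.
import numpy

def can_store_losslessly(data, dtype):
    converted = numpy.asarray(data, dtype=dtype)
    return numpy.array_equal(numpy.asarray(data), converted)

assert can_store_losslessly([1, 2, 3], 'float32')       # exact in float32
assert not can_store_losslessly([1e-300], 'float32')    # underflows to 0.0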
def sharedX(value, name=None, borrow=False, dtype=None): if dtype is None: dtype = theano.config.floatX return theano.shared(theano._asarray(value, dtype=dtype), name=name, borrow=borrow)
def castX(value): return theano._asarray(value, dtype=theano.config.floatX)
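# Added usage note (hedged): castX above normalizes any array-like value to
# theano.config.floatX, so downstream GPU code sees a consistent float dtype
# (commonly 'float32' when running on the GPU). Assumes Theano is importable and
# castX as defined above is in scope.
import numpy
import theano

_x = castX(numpy.zeros(3, dtype='float64'))
assert _x.dtype == numpy.dtype(theano.config.floatX)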
def my_zeros(*shape): return theano._asarray(numpy.zeros(*shape), dtype='float32')
def just_vals(v): return T.Reshape(2)(v, theano._asarray([2, 3], dtype='int32'))
def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST if use_gpu: shared_fn = tcn.shared_constructor else: shared_fn = shared #cumulativ rounding error affect this comparaison of result. So we lower the tolerance. #TODO: why the last two example see the error lower? We are converging? #n_train=10, n_batch=3, n_kern=1, n_kern1=1, error see of 1e-9 #n_train=10, n_batch=3, n_kern=10, n_kern1=1, error see of -1.27777e-06 #n_train=10, n_batch=3, n_kern=10, n_kern1=10, error see of -6.91377e-05 #n_train=10, n_batch=30, n_kern=10, n_kern1=10, error see of -0.00185963 #n_train=10, n_batch=60, n_kern=10, n_kern1=10, error see of -5.26905e-05 #n_train=30, n_batch=60, n_kern=10, n_kern1=10, error see of -3.8147e-06 #n_train=30, n_batch=60, n_kern=20, n_kern1=10, error see of 6.82771e-05 #n_train=30, n_batch=60, n_kern=20, n_kern1=30, error see of 0.000231534 n_batch = 60 shape_img = (n_batch, 1, 32, 32) n_kern = 20 shape_kern = (n_kern, 1, 5, 5) n_kern1 = 10 shape_kern1 = (n_kern1, n_kern, 5, 5) n_train = 30 if config.mode == 'DEBUG_MODE': n_train = 1 logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d( tuple(shape_img[2:]), tuple(shape_kern[2:]), 'valid') logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d( (logical_hid_shape[0] // 2, logical_hid_shape[1] // 2), tuple(shape_kern1[2:]), 'valid') n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1] n_out = 10 w0 = shared_fn(0.01 * (my_rand(*shape_kern) - 0.5), 'w0') b0 = shared_fn(my_zeros((n_kern, )), 'b0') w1 = shared_fn(0.01 * (my_rand(*shape_kern1) - 0.5), 'w1') b1 = shared_fn(my_zeros((n_kern1, )), 'b1') v = shared_fn(my_zeros((n_hid, n_out)), 'c') c = shared_fn(my_zeros(n_out), 'c') x = tensor.Tensor(dtype='float32', broadcastable=(0, 1, 0, 0))('x') y = tensor.fmatrix('y') lr = tensor.fscalar('lr') conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1) conv_op1 = conv.ConvOp( (n_kern, logical_hid_shape[0] / 2, logical_hid_shape[1] / 2), shape_kern1[2:], n_kern1, n_batch, 1, 1) hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))) hid1 = tensor.tanh( conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle((0, 'x', 'x'))) hid_flat = hid1.reshape((n_batch, n_hid)) out = tensor.tanh(tensor.dot(hid_flat, v) + c) loss = tensor.sum(0.5 * (out - y)**2 * lr) #print 'loss type', loss.type params = [w0, b0, w1, b1, v, c] gparams = tensor.grad(loss, params) mode = get_mode(use_gpu) #print 'building pfunc ...' train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p, g in zip(params, gparams)]) # for i, n in enumerate(train.maker.fgraph.toposort()): # print i, n xval = my_rand(*shape_img) yval = my_rand(n_batch, n_out) # int32 make all 0... lr = theano._asarray(0.01, dtype='float32') for i in xrange(n_train): rval = train(xval, yval, lr) print_mode(mode) return rval
def sharedX(value, name=None, borrow=False): """Transform value into a shared variable of type floatX""" return theano.shared(theano._asarray(value, dtype=theano.config.floatX), name=name, borrow=borrow)
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), kern_stride=(1, 1), version=-1, verbose=0, random=True, print_=None, id=None, rtol=1e-5, atol=1e-8, nb_iter=0, ones=False, compile_kshp=None, theano_mode=None, cls=None): # # This function is the core of several of the big unit-test drivers, # but it can also be used very directly on its own to test a specific # kind of convolution. # # See `test_example` (above) for an example of how to use this directly. # # :param kshape: (4d)The shape of the kernel at run time. # :param compile_kshp: (2d) hardcode the shape of the kernel in # the generated code This is supposed to be # faster, but we need to check That we raise # an error if the input have the wrong shape. # if ones: assert not random npy_img = theano._asarray(numpy.ones(ishape), dtype='float32') npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32') elif random: npy_img = theano._asarray(numpy.random.rand(*ishape) + 1, dtype='float32') npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2, dtype='float32') else: npy_img = theano._asarray(numpy.arange( numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1 npy_kern = -( theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1) img = cuda_ndarray.CudaNdarray(npy_img) kern = cuda_ndarray.CudaNdarray(npy_kern) # we take the stride after the transfert as we make c_contiguous # data on the GPU. if img_stride != (1, 1): img = img[:, :, ::img_stride[0], ::img_stride[1]] npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]] if kern_stride != (1, 1): kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]] npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]] i = cuda.CudaNdarrayType(broadcastable=[sh == 1 for sh in npy_img.shape])() k = cuda.CudaNdarrayType(broadcastable=[sh == 1 for sh in npy_kern.shape])() op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode, subsample=subsample, version=version, verbose=verbose, kshp=compile_kshp)(i, k) f = theano.function([i, k], op, mode=theano_mode) if cls is not None: assert any([ isinstance(node.op, cls) for node in f.maker.fgraph.toposort() ]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort()) t2 = time.time() gpuval = f(img, kern) t3 = time.time() for i in range(nb_iter): gpuval2 = f(img, kern) assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all() gpuval = numpy.asarray(gpuval) # CPU val computed after GPU val to get the GPU errors. t0 = time.time() cpuval = py_conv(npy_img, npy_kern, mode, subsample) t1 = time.time() assert gpuval.shape == cpuval.shape, ("shape mismatch", gpuval.shape, cpuval.shape) assert_allclose(cpuval, gpuval, rtol=rtol, atol=atol) assert numpy.all(numpy.isfinite(gpuval)), gpuval assert [(sh == 1) is br for sh, br in zip(cpuval.shape[:2], op.type.broadcastable[:2])] if (t2 is not None): if mode == 'valid': approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2 else: approx_fp = (ishape[0] * kshape[0] * kshape[1] * kshape[2] * kshape[3] * ishape[2] * ishape[3] * 2) approx_fp /= 1e6 cpu_mflops = approx_fp / (t1 - t0) gpu_mflops = approx_fp / (t3 - t2) if verbose > 0: print('%15s' % str(ishape), '%15s' % str(kshape), end=' ', file=sys.stdout) print('%12.5f %7.2f %7.2f %7.1f' % (approx_fp, cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)), file=sys.stdout)
def as_ar(a): return theano._asarray(a, dtype="int32")
def perform(self, node, inp, out):
    from theano.sandbox.cuda import filter as cuda_filter
    x, = inp
    z, = out
    z[0] = cuda_filter(theano._asarray(x, dtype='float32'),
                       tuple([0] * x.ndim), 0, z[0])
def perform(self, node, inp, out_):
    (x,) = inp
    (out,) = out_
    out[0] = theano._asarray(np.shape(x), dtype="int64")
def castX(x): return theano._asarray(x, dtype=theano.config.floatX)
def new_expbounds(cls, rng, x_list, n_out, dtype=None, params=None, updates=None, exponent_range=(1.0, 3.0)): """ """ if params is None: params = [] if updates is None: updates = [] if dtype is None: dtype = x_list[0].dtype n_terms = len(x_list) def shared_uniform(low, high, size, name): return _shared_uniform(rng, low, high, size, dtype, name) use_softmax_w = True if use_softmax_w: w = shared_uniform(low=-.1, high=.1, size=(n_out, n_terms), name='Kouh2008::w') w_sm = theano.tensor.nnet.softmax(w) w_list = [w_sm[:, i] for i in xrange(n_terms)] w_l1 = abs(w).sum() w_l2_sqr = (w**2).sum() else: w_list = [ shared_uniform(low=-2.0 / n_terms, high=2.0 / n_terms, size=(n_out, ), name='w_%i' % i) for i in xrange(n_terms) ] w_l1 = sum(abs(wi).sum() for wi in w_list) w_l2_sqr = sum((wi**2).sum() for wi in w_list) e_range_low, e_range_high = exponent_range e_range_low = theano._asarray(e_range_low, dtype=dtype) e_range_high = theano._asarray(e_range_high, dtype=dtype) e_range_mag = e_range_high - e_range_low if e_range_mag < 0: raise ValueError('exponent range must have low <= high') p_unbounded = shared_uniform(low=-0.1, high=0.1, size=(n_out, ), name='p') q_unbounded = shared_uniform(low=-0.1, high=0.1, size=(n_out, ), name='q') r_unbounded = shared_uniform(low=-0.1, high=0.1, size=(n_out, ), name='r') k_unbounded = shared_uniform(low=-0.2, high=0.2, size=(n_out, ), name='k') # biases p = tensor.nnet.sigmoid(p_unbounded) * e_range_mag + e_range_low q = tensor.nnet.sigmoid(q_unbounded) * e_range_mag + e_range_low r = tensor.nnet.sigmoid(r_unbounded) * \ theano._asarray(1.0/e_range_low - 1.0/e_range_high, dtype=dtype) \ + theano._asarray(1.0/e_range_high, dtype=dtype) k = softsign(k_unbounded) if use_softmax_w: rval = cls( w_list, x_list, p, q, r, k, params=[p_unbounded, q_unbounded, r_unbounded, k_unbounded, w ] + params, updates=updates) else: rval = cls( w_list, x_list, p, q, r, k, params=[p_unbounded, q_unbounded, r_unbounded, k_unbounded] + w_list + params, updates=updates) rval.p_unbounded = p_unbounded rval.q_unbounded = q_unbounded rval.r_unbounded = r_unbounded rval.k_unbounded = k_unbounded rval.exp_l1 = abs(p_unbounded).sum() + abs(q_unbounded).sum() + abs( r_unbounded).sum() rval.exp_l2_sqr = (p_unbounded**2).sum() + (q_unbounded**2).sum() + ( r_unbounded**2).sum() rval.w_l1 = w_l1 rval.w_l2_sqr = w_l2_sqr return rval
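# Added example (hedged): the exponent re-parameterization used in new_expbounds
# above -- an unconstrained shared value is squashed by a sigmoid and rescaled so
# the exponent always lies inside exponent_range. bounded_exponent is an
# illustrative numpy helper, not part of the class.
import numpy

def bounded_exponent(p_unbounded, low=1.0, high=3.0):
    sig = 1.0 / (1.0 + numpy.exp(-p_unbounded))
    return sig * (high - low) + low

assert abs(bounded_exponent(0.0) - 2.0) < 1e-12     # sigmoid(0) = 0.5 -> midpoint
assert 1.0 <= bounded_exponent(-50.0) <= bounded_exponent(50.0) <= 3.0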
def sharedX(x): return theano.shared(theano._asarray(x, dtype=theano.config.floatX))
def _shared_uniform(rng, low, high, size, dtype, name=None): return shared( theano._asarray(rng.uniform(low=low, high=high, size=size), dtype=dtype), name)
def _infer_ndim_bcast(ndim, shape, *args): """ Infer the number of dimensions from the shape or the other arguments. :rtype: (int, variable, tuple) triple, where the variable is an integer vector, and the tuple contains Booleans. :returns: the first element returned is the inferred number of dimensions. The second element is the shape inferred (combining symbolic and constant informations from shape and args). The third element is a broadcasting pattern corresponding to that shape. """ # Find the minimum value of ndim required by the *args if args: args_ndim = max(arg.ndim for arg in args) else: args_ndim = 0 # there is a convention that -1 means the corresponding shape of a # potentially-broadcasted symbolic arg if (isinstance(shape, (tuple, list)) and numpy.all(numpy.asarray(shape)>=0)): bcast = [(s==1) for s in shape] v_shape = tensor.TensorConstant(type=tensor.lvector, data=theano._asarray(shape, dtype='int64')) shape_ndim = len(shape) if ndim is None: ndim = shape_ndim else: if shape_ndim != ndim: raise ValueError('ndim should be equal to len(shape), but\n', 'ndim = %s, len(shape) = %s, shape = %s' % (ndim, shape_ndim, shape)) elif isinstance(shape, (tuple, list)): # there is a convention that -1 means the corresponding shape of a # potentially-broadcasted symbolic arg # # This case combines together symbolic and non-symbolic shape # information if ndim is None: ndim=args_ndim else: ndim = max(args_ndim, ndim) ndim = max(args_ndim, len(shape)) shape = [-1]*(ndim - len(shape))+list(shape) bcast = [] pre_v_shape = [] for i,s in enumerate(shape): if hasattr(s, 'type'): # s is symbolic bcast.append(False) # todo - introspect further pre_v_shape.append(s) else: if s >= 0: pre_v_shape.append(tensor.as_tensor_variable(s)) bcast.append((s==1)) elif s == -1: n_a_i = 0 for a in args: # ndim: _ _ _ _ _ _ # ashp: s0 s1 s2 s3 # i if i >= ndim - a.ndim: n_a_i += 1 a_i = i + a.ndim -ndim if not a.broadcastable[a_i]: pre_v_shape.append(a.shape[a_i]) bcast.append(False) break else: if n_a_i == 0: raise ValueError(('Auto-shape of -1 must overlap' 'with the shape of one of the broadcastable' 'inputs')) else: pre_v_shape.append(tensor.as_tensor_variable(1)) bcast.append(True) else: ValueError('negative shape', s) # post-condition: shape may still contain both symbolic and non-symbolic things v_shape = tensor.stack(*pre_v_shape) elif shape is None: # The number of drawn samples will be determined automatically, # but we need to know ndim if not args: raise TypeError(('_infer_ndim_bcast cannot infer shape without' ' either shape or args')) template = reduce(lambda a,b:a+b, args) v_shape = template.shape bcast = template.broadcastable ndim = template.ndim else: v_shape = tensor.as_tensor_variable(shape) if ndim is None: ndim = tensor.get_vector_length(v_shape) bcast = [False]*ndim if not (v_shape.dtype.startswith('int') or v_shape.dtype.startswith('uint')): raise TypeError('shape must be an integer vector or list', v_shape.dtype) if args_ndim > ndim: raise ValueError('ndim should be at least as big as required by args value', (ndim, args_ndim), args) assert ndim == len(bcast) return ndim, tensor.cast(v_shape, 'int32'), tuple(bcast)
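# Added note (hedged): the broadcast pattern _infer_ndim_bcast builds above follows
# the usual Theano convention -- with a fully constant shape, a dimension is
# broadcastable exactly when its size is 1 (and -1 entries defer to the args).
_shape = (2, 1, 3)
assert tuple(s == 1 for s in _shape) == (False, True, False)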
def test_sum():
    """
    test sum pattern 1, 11, 10, 01, 001, 010, 100, 110, 011, 111,
    0011, 0101, 0111, 1011, 1111

    test sum pattern implemented with reshape:
    1000, 0100, 0010, 0001, 11111

    others implemented by reshape that are not tested
    0011,0101,0110,1001,1010,1100
    1110,1101,1011

    TODO: test with broadcast
    """
    for shape, pattern in [
            ((100,3,1300),[1]),
            ((0,),[0]),((5,),[0]),
            ((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),  # need something bigger than 32 for some opt test.
            ((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
            ((0,0,0,0),[0,1,2,3]),
            ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
            ((5,4,3,10,11),[1,2]),
            ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
            # test shapes bigger than 4096 on each dimension to make sure that
            # we work correctly when we don't have enough threads/blocks in
            # each dimension
            ((4100,3),[0]),((3,4101),[0]),#10
            ((1024,33),[0]),((33,1024),[0]),#10
            ((1025,33),[0]),((33,1025),[0]),#10
            ((4100,3),[1]),((3,4101),[1]),#01
            ((1024,33),[1]),((33,1024),[1]),#01
            ((1025,33),[1]),((33,1025),[1]),#01
            ((4100,3),[0,1]),((3,4101),[0,1]),#11
            ((1024,33),[0,1]),((33,1024),[0,1]),#01
            ((1025,33),[0,1]),((33,1025),[0,1]),#01
            ((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
            ((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
            ((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
            ((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
            ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
            #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
            ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
            ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
            ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
            ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
            ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
            ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
            # test patterns implemented by reshape
            ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
            ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
            ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
            ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
            ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
            ]:
        a = tensor.TensorType('float32', (False,) * len(shape))()
        b = T.Sum(pattern)(a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
        # val = numpy.ones(shape)
        # val = numpy.arange(numpy.prod(shape)).reshape(shape)
        val = theano._asarray(val, dtype='float32')
        f = theano.function([a], b, mode=mode_with_gpu)
        f2 = theano.function([a], b, mode=mode_without_gpu)
        assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
        if val.size == 0:
            assert f2(val) == f(val), ('shape', shape, 'pattern', pattern)
        else:
            try:
                # We raise the error threshold as we sum a big matrix and this
                # causes small rounding differences with some seeds.
                # Example: in debug mode with unittests.rseed=9275.
                orig_rtol = theano.tensor.basic.float32_rtol
                theano.tensor.basic.float32_rtol = 2e-5
                assert _allclose(f2(val), f(val)), (
                    'shape', shape, 'pattern', pattern,
                    sum([shape[i] for i in pattern]), f2(val), f(val), val)
            finally:
                theano.tensor.basic.float32_rtol = orig_rtol

    # test with dimshuffle
    # we shuffle the 2 outer dims.
    for shape, pattern in [#((5,),[0]),
                           ((5,4),[0,1]),((5,4),[0]),
                           ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
                           ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
        a = tensor.TensorType('float32', (False,) * len(shape))()
        dim_pattern = range(len(shape))
        dim_pattern[0] = 1
        dim_pattern[1] = 0
        a = a.dimshuffle(dim_pattern)
        b = T.Sum(pattern)(a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
        # val = numpy.ones(shape)
        # val = numpy.arange(numpy.prod(shape)).reshape(shape)
        val = theano._asarray(val, dtype='float32')
        f = theano.function([a], b, mode=mode_with_gpu)
        f2 = theano.function([a], b, mode=mode_without_gpu)
        assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
        assert _allclose(f2(val), f(val)), (
            'shape', shape, 'pattern', pattern,
            sum([shape[i] for i in pattern]))

    # test with broadcast
    for shape, pattern in [((5,),[0]),
                           ((5,4),[0,1]),((5,4),[0]),
                           ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
                           ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
        shape = numpy.asarray(shape) * 2
        a = tensor.TensorType('float32', (False,) * len(shape))()
        a2 = tcn.CudaNdarrayType((False,) * len(shape))()
        b = T.Sum(pattern)(a)
        b2 = T.Sum(pattern)(a2)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
        # val = numpy.ones(shape)
        # val = numpy.arange(numpy.prod(shape)).reshape(shape)
        val = theano._asarray(val, dtype='float32')
        val2 = cuda.CudaNdarray(val)
        if len(shape) == 1:
            val = val[::2]
            val2 = val2[::2]
        elif len(shape) == 2:
            val = val[::2, ::2]
            val2 = val2[::2, ::2]
        elif len(shape) == 3:
            val = val[::2, ::2, ::2]
            val2 = val2[::2, ::2, ::2]
        elif len(shape) == 4:
            val = val[::2, ::2, ::2, ::2]
            val2 = val2[::2, ::2, ::2, ::2]
        f = theano.function([a], b, mode=mode_without_gpu)
        f2 = theano.function([a2], b2, mode=mode_with_gpu)
        assert tcn.GpuSum in [x.op.__class__ for x in f2.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f.maker.env.toposort()]
        assert _allclose(f2(val2), f(val)), (
            'shape', shape, 'pattern', pattern,
            sum([shape[i] for i in pattern]))
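# A minimal sketch (not part of the original test file) of the single check
# that test_sum repeats for every (shape, pattern) pair: compile the same Sum
# graph with and without the GPU optimizer and compare results. It assumes the
# module-level names used above (tensor, T, tcn, theano, numpy, _allclose,
# mode_with_gpu, mode_without_gpu).
def check_one_sum(shape=(5, 4, 3), pattern=[1]):
    a = tensor.TensorType('float32', (False,) * len(shape))()
    b = T.Sum(pattern)(a)
    val = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f_gpu = theano.function([a], b, mode=mode_with_gpu)
    f_cpu = theano.function([a], b, mode=mode_without_gpu)
    # The GPU graph should contain a GpuSum node, the CPU graph a Sum node.
    assert tcn.GpuSum in [node.op.__class__
                          for node in f_gpu.maker.env.toposort()]
    assert T.Sum in [node.op.__class__
                     for node in f_cpu.maker.env.toposort()]
    assert _allclose(f_cpu(val), f_gpu(val))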
def shared_dataset(data_x):
    """Function that loads the dataset into shared variables."""
    if conf.get('normalize', True):
        return sharedX(data_x, borrow=True)
    else:
        return theano.shared(theano._asarray(data_x), borrow=True)
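# shared_dataset relies on a sharedX helper defined elsewhere. In
# pylearn2-style code sharedX is typically a thin wrapper that casts to
# theano.config.floatX before creating the shared variable; the sketch below
# is an assumed equivalent for illustration, not this project's actual
# definition.
def sharedX_sketch(value, name=None, borrow=False):
    return theano.shared(theano._asarray(value, dtype=theano.config.floatX),
                         name=name, borrow=borrow)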
def test_careduce():
    """
    test sum pattern 1, 11, 10, 01, 001, 010, 100, 110, 011, 111,
    0011, 0101, 0111, 1011, 1111

    test sum pattern implemented with reshape:
    1000, 0100, 0010, 0001, 11111

    others implemented by reshape that are not tested
    0011,0101,0110,1001,1010,1100
    1110,1101,1011

    TODO: test with broadcast
    """
    for scalar_op, careduce_op in [
            (theano.scalar.add, tensor.elemwise.CAReduceDtype),
            (theano.scalar.maximum, tensor.CAReduce)]:
        for shape, pattern in [
                ((1,1),(1,)),
                ((1,0),(1,)),
                ((0,1),(1,)),
                ((0,0),(1,)),
                ((0,0,0),(1,2)),
                ((0,0,0,0),(1,2,3)),
                ((2,1),(1,)),
                ((1,2),(1,)),
                ((100,3,1300),[1]),
                ((0,),[0]),((5,),[0]),
                ((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),  # need something bigger than 32 for some opt test.
                ((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
                ((0,0,0,0),[0,1,2,3]),
                ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
                ((5,4,3,10,11),[1,2]),
                ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
                # test shapes bigger than 4096 on each dimension to make sure
                # that we work correctly when we don't have enough
                # threads/blocks in each dimension
                ((4100,3),[0]),((3,4101),[0]),#10
                ((1024,33),[0]),((33,1024),[0]),#10
                ((1025,33),[0]),((33,1025),[0]),#10
                ((4100,3),[1]),((3,4101),[1]),#01
                ((1024,33),[1]),((33,1024),[1]),#01
                ((1025,33),[1]),((33,1025),[1]),#01
                ((4100,3),[0,1]),((3,4101),[0,1]),#11
                ((1024,33),[0,1]),((33,1024),[0,1]),#01
                ((1025,33),[0,1]),((33,1025),[0,1]),#01
                ((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]), ((3,65536,1), [0]),#100
                ((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
                ((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
                ((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
                ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
                #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
                ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
                ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
                ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
                ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
                ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
                ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
                # test patterns implemented by reshape
                ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
                ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
                ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
                ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
                ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
                ]:
            op = careduce_op(scalar_op, axis=pattern)
            pat = tensor_pattern_to_gpu_pattern(shape, pattern)
            # GpuCAReduce{maximum} supports only those patterns
            if scalar_op is theano.scalar.maximum and pat not in [
                    (0, 1), (0, 1, 1), (0, 1, 1)]:
                continue

            a = tensor.TensorType('float32', (False,) * len(shape))()
            b = op(a)
            val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
            # val = numpy.ones(shape)
            # val = numpy.arange(numpy.prod(shape)).reshape(shape)
            val = theano._asarray(val, dtype='float32')
            f = theano.function([a], b, mode=mode_with_gpu)
            f2 = theano.function([a], b, mode=mode_without_gpu)
            assert tcn.GpuCAReduce in [x.op.__class__
                                       for x in f.maker.fgraph.toposort()]
            assert op.__class__ in [x.op.__class__
                                    for x in f2.maker.fgraph.toposort()]

            f_caused_value_error = False
            try:
                f_out = f(val)
            except ValueError, e:
                exc = e
                f_caused_value_error = True

            f2_caused_value_error = False
            try:
                f2_out = f2(val)
            except ValueError, e:
                exc2 = e
                f2_caused_value_error = True

            if f_caused_value_error != f2_caused_value_error:
                if f_caused_value_error:
                    print 'f caused this value error:'
                    print exc
                else:
                    print 'f did not raise a value error, but should have'
                if f2_caused_value_error:
                    print 'f2 caused this value error:'
                    print exc2
                else:
                    print 'f should not have raised a value error'
                print 'shape was: ', shape
                print 'pattern was: ', pattern
                assert False

            try:
                # We raise the error threshold as we sum a big matrix and this
                # causes small rounding differences with some seeds.
                # Example: in debug mode with unittests.rseed=9275.
                orig_rtol = theano.tensor.basic.float32_rtol
                theano.tensor.basic.float32_rtol = 2e-5
                assert _allclose(f_out, f2_out), (
                    'shape', shape, 'pattern', pattern,
                    sum([shape[i] for i in pattern]), f2(val), f(val), val)
            finally:
                theano.tensor.basic.float32_rtol = orig_rtol
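# tensor_pattern_to_gpu_pattern is defined elsewhere in this test module. The
# stand-in below only illustrates the convention used by the `#100`, `#0011`,
# ... comments in the shape list: a tuple with 1 for every reduced axis and 0
# for every kept axis. It is an assumption for illustration, not the project's
# actual helper (which may also account for size-0/1 dimensions).
def tensor_pattern_to_gpu_pattern_sketch(shape, axis):
    return tuple(1 if i in axis else 0 for i in range(len(shape)))

# e.g. tensor_pattern_to_gpu_pattern_sketch((4100, 4, 3), [0]) == (1, 0, 0),
# i.e. the "100" reduction pattern.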
def perform(self, node, inputs, output_storage):
    a = inputs[0]
    axis = inputs[1]
    z = output_storage[0]
    z[0] = theano._asarray(np.argsort(a, axis, self.kind, self.order),
                           dtype=node.outputs[0].dtype)
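# A small standalone check (illustration only) of what perform() computes:
# numpy.argsort along the requested axis, then a cast to the dtype declared by
# the node's output. The op attributes self.kind / self.order map to numpy's
# `kind` ('quicksort', 'mergesort', ...) and `order` (structured arrays)
# arguments.
import numpy as np

_a = np.array([[3., 1., 2.],
               [0., 5., 4.]], dtype='float32')
_idx = np.argsort(_a, axis=1, kind='quicksort')
# Per row, the indices that would sort the values:
assert (_idx == np.array([[1, 2, 0],
                          [0, 2, 1]])).all()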
def test_huge_elemwise_fusion():
    """ Test that the GpuElemwise fusion works correctly.

    We check that one node gets fused with only part of its inputs when there
    are too many inputs, which would otherwise bust the 256-byte limit on
    kernel parameters.
    """
    shape = (2, 3, 4, 5, 6)
    ttype = tensor.tensor(dtype='float32',
                          broadcastable=(False,) * len(shape))
    gpu_ptr_size = theano.sandbox.cuda.opt.get_device_type_sizes(
        )['gpu_ptr_size']
    if gpu_ptr_size == 8:
        nb_in = 7
        len_topo = 10
    elif gpu_ptr_size == 4:
        nb_in = 8
        len_topo = 11
    else:
        raise Exception("Unexpected value for gpu_ptr_size", gpu_ptr_size)
    vars = [tensor.tanh(ttype) for x in range(nb_in)]
    f = pfunc(vars, [reduce(operator.sub, vars)], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    #theano.printing.debugprint(f)
    #for i, node in enumerate(topo):
    #    print >> sys.stdout, i, node
    assert len(topo) == len_topo
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 2
    assert isinstance(topo[-3].op.scalar_op, theano.scalar.basic.Sub)
    assert isinstance(topo[-2].op.scalar_op, theano.scalar.basic.Composite)
    # let debugmode catch errors
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(*[gen() for i in range(nb_in)])

    # Test the case where we can't put the computation on the GPU: there are
    # too many dimensions on the input for the op to take even 2 inputs.
    shape = (1, 2, 3, 4, 5, 6, 7, 2, 2, 3, 2, 1, 2, 2, 2,)
    ttype = tensor.tensor(dtype='float32',
                          broadcastable=(False,) * len(shape))
    vars = [tensor.tanh(ttype) for x in range(7)]
    f = pfunc(
        vars,
        [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] - vars[5] - vars[6]],
        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    #theano.printing.debugprint(f)
    assert len(topo) == 1
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 0
    assert sum([isinstance(node.op, tensor.Elemwise) for node in topo]) == 1
    # let debugmode catch errors
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(gen(), gen(), gen(), gen(), gen(), gen(), gen())

    def gen(shape):
        return theano._asarray(numpy.random.rand(*shape), dtype='float32')

    max_var = 16  # excluded
    for shape in [(2,),
                  (2, 2),
                  (2, 2, 2),
                  (2, 2, 2, 2),
                  (2, 2, 2, 2, 2),  # 5d
                  (2, 2, 2, 2, 2, 2),
                  # (2, 2, 2, 2, 2, 2, 2),
                  # (2, 2, 2, 2, 2, 2, 2, 2),
                  # (2, 2, 2, 1, 1, 1, 1, 2, 2),  # 9d
                  ]:
        vals = [cuda.shared_constructor(gen(shape)) for x in range(max_var)]
        for use_tan in [True, False]:
            if use_tan:
                vars = [tensor.tanh(x) for x in vals]
            else:
                vars = vals
            for nb_var in range(1, max_var):
                out = reduce(lambda x, y: x + y, vars[:nb_var])
                if not isinstance(out.type, CudaNdarrayType):
                    out = cuda.gpu_from_host(out)
                f = pfunc([], [out], mode=mode_with_gpu)
                topo = f.maker.fgraph.toposort()
                #print shape, nb_var, use_tan, len(topo)
                assert (sum([isinstance(node.op, cuda.GpuElemwise)
                             for node in topo]) == len(topo)
                        or (nb_var == 1 and use_tan == False))
                assert sum([isinstance(node.op, tensor.Elemwise)
                            for node in topo]) == 0
                # let debugmode catch errors
                f()
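# The assertions above repeatedly count node types in a compiled graph. A
# small helper of that flavor (an illustrative sketch, not part of the
# original test file) makes the intent explicit:
def count_ops(f, op_class):
    """Number of nodes of `op_class` in the optimized graph of function `f`."""
    return sum(isinstance(node.op, op_class)
               for node in f.maker.fgraph.toposort())

# e.g. `assert count_ops(f, cuda.GpuElemwise) == 2` is equivalent to the
# explicit list comprehension used in test_huge_elemwise_fusion.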
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
                    print_=None, id=None, rtol=1e-5, atol=1e-8, nb_iter=0,
                    ones=False, compile_kshp=None):
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
    # kind of convolution.
    #
    # See `test_example` (above) for an example of how to use this directly.
    #
    # :param kshape: (4d) the shape of the kernel at run time.
    # :param compile_kshp: (2d) hardcode the shape of the kernel in the
    #     generated code. This is supposed to be faster, but we need to check
    #     that we raise an error if the input has the wrong shape.
    #
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(numpy.arange(
            numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)

    img = cuda_ndarray.CudaNdarray(npy_img)
    kern = cuda_ndarray.CudaNdarray(npy_kern)

    # We take the stride after the transfer, as we make c_contiguous data on
    # the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    t2 = None
    rval = True
    try:
        t0 = time.time()
        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = cuda_tensor4()
        k = cuda_tensor4()
        op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
                                              subsample=subsample,
                                              version=version,
                                              verbose=verbose,
                                              kshp=compile_kshp)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(img, kern)
        t2 = time.time()
        for i in range(nb_iter):
            gpuval2 = f(img, kern)
            assert numpy.allclose(numpy.asarray(gpuval),
                                  numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            print >> sys.stdout, "ERROR: shape mismatch", gpuval.shape, cpuval.shape
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError, e:
        print >> sys.stdout, '_params_allgood Failed allclose', e
        rval = False
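# A direct-use sketch of the driver above, for illustration only. The shapes
# are hypothetical 4-d (batch, channels, rows, cols) values; it assumes the
# module-level fixtures _params_allgood relies on (py_conv, cuda_tensor4,
# theano_mode) are available as in the test file, and that the (truncated)
# driver ultimately reports `rval`, i.e. whether the GPU result matched the
# CPU reference within rtol.
def run_one_conv_case():
    ishape = (2, 1, 8, 8)   # a small image batch
    kshape = (3, 1, 3, 3)   # three 3x3 kernels
    ok = _params_allgood(ishape, kshape, mode='valid',
                         subsample=(1, 1), version=-1, verbose=0)
    assert ok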