import numpy as np
import theano
from theano.gpuarray import dnn
from theano.gpuarray.type import gpuarray_shared_constructor


def __init__(self, input_size, hidden_size, dtype=theano.config.floatX):
    self.grub = dnn.RNNBlock(dtype=dtype, hidden_size=hidden_size,
                             num_layers=1, rnn_mode='gru')
    self.input_size = input_size
    self.hidden_size = hidden_size
    # get_param_size expects (batch_size, input_size); the parameter count
    # does not depend on the batch size, so 1 is used as a placeholder.
    psize = self.grub.get_param_size((1, input_size))
    self.params = gpuarray_shared_constructor(
        np.zeros(psize, dtype=theano.config.floatX))
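# A minimal usage sketch (not from the source): how a block built like the one
# above would be applied. The names (block, params, X) and the sizes are
# illustrative assumptions only.
import theano.tensor as T

hidden_size = 16
block = dnn.RNNBlock(dtype=theano.config.floatX, hidden_size=hidden_size,
                     num_layers=1, rnn_mode='gru')
params = gpuarray_shared_constructor(
    np.zeros(block.get_param_size((1, 8)), dtype=theano.config.floatX))
X = T.tensor3('X')  # time-major input: (timesteps, batch, input_size=8)
h0 = T.zeros((1, X.shape[1], hidden_size), dtype=theano.config.floatX)
# For rnn_mode='gru', apply() returns (sequence_output, last_hidden_state).
y, hN = block.apply(params, X, h0)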
def __init__(self, rng, n_hidden, x, E, xmask, is_train, dropout,
             mode='lstm', n_layer=1, pre_state=None, **kwargs):
    # assumes: import theano, theano.tensor as T, numpy as np,
    # from theano.gpuarray import dnn,
    # from theano.gpuarray.type import gpuarray_shared_constructor
    self.is_train = is_train
    self.dropout = dropout
    self.rng = rng
    self.xmask = xmask
    shape = x.shape
    # Look up embeddings for the flattened indices, then restore the
    # (timesteps, batch, embedding_dim) layout.
    embd = E[x.flatten()]
    embd = embd.reshape([shape[0], shape[1], -1])
    if pre_state is None:
        h0 = T.zeros((n_layer, shape[1], n_hidden), dtype=theano.config.floatX)
        pre_state = [h0]
        if mode == 'lstm':
            c0 = T.zeros((n_layer, shape[1], n_hidden),
                         dtype=theano.config.floatX)
            pre_state.append(c0)
    rnnb = dnn.RNNBlock(dtype=theano.config.floatX, hidden_size=n_hidden,
                        num_layers=n_layer, rnn_mode=mode, input_mode='skip',
                        direction_mode='unidirectional')
    # In 'skip' input mode the input size must equal the hidden size, and the
    # parameter count is independent of the batch size.
    psize = rnnb.get_param_size([1, n_hidden])
    print(psize)
    params_cudnn = gpuarray_shared_constructor(
        np.zeros((psize,), dtype=theano.config.floatX))
    # l = np.sqrt(6.) / np.sqrt(4 * n_hidden)
    # pvalue = np.asarray(self.rng.uniform(low=-l, high=l, size=(psize,)),
    #                     dtype=theano.config.floatX)
    # params_cudnn = gpuarray_shared_constructor(pvalue, name='cudnn')
    self.params = [params_cudnn]
    if mode == 'lstm':
        h = rnnb.apply(params_cudnn, embd, pre_state[0], pre_state[1])[0]
    else:
        h = rnnb.apply(params_cudnn, embd, pre_state[0])[0]
    h = h * self.xmask.dimshuffle(0, 1, 'x')
    # Dropout: apply a binary mask at training time, rescale by (1 - dropout)
    # at evaluation time to keep the expected activation unchanged.
    if self.dropout > 0:
        drop_mask = self.rng.binomial(n=1, p=1 - self.dropout, size=h.shape,
                                      dtype=theano.config.floatX)
        self.activation = T.switch(self.is_train, h * drop_mask,
                                   h * (1 - self.dropout))
    else:
        self.activation = T.switch(self.is_train, h, h)  # no-op switch
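# A hedged sketch (not in the source) of how the is_train switch above is
# typically driven: is_train is fed as a scalar at call time, so one compiled
# graph serves both the training (masked) and evaluation (rescaled) paths.
# All names below are illustrative assumptions.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

is_train = T.iscalar('is_train')  # 1 -> training, 0 -> evaluation
h = T.matrix('h')
dropout = 0.5
srng = RandomStreams(seed=123)
mask = srng.binomial(n=1, p=1 - dropout, size=h.shape,
                     dtype=theano.config.floatX)
act = T.switch(is_train, h * mask, h * (1 - dropout))
f = theano.function([h, is_train], act)
f(np.ones((2, 3), dtype=theano.config.floatX), 1)  # training path
f(np.ones((2, 3), dtype=theano.config.floatX), 0)  # evaluation path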
def _params_to_cudnn(self):
    from theano.gpuarray import dnn
    from theano.gpuarray.type import gpuarray_shared_constructor
    assert dnn.dnn_available(None)
    self._rnn_block = dnn.RNNBlock(theano.config.floatX, self.hidden_dim,
                                   num_layers=1, input_mode="linear",
                                   rnn_mode=self.rnn_type,
                                   direction_mode="unidirectional")
    param_size = self._rnn_block.get_param_size(
        [self.n_batch, self.input_dim])  # TODO: study about n_batch
    # `Constant(0.0)` is assumed to be a zero-filling initializer from the
    # surrounding project; it allocates the flat cuDNN parameter vector.
    self.params = [gpuarray_shared_constructor(Constant(0.0)(param_size))]
    # split_params returns views into the flat vector, one per weight/bias
    # block of the layer; writing through them updates the shared storage.
    cs = self._rnn_block.split_params(
        self.params[0], layer=0,
        input_size=[self.n_batch, self.input_dim])  # TODO: multi layer support
    for c, p in zip(cs, self.non_cudnn_params):
        c[:] = p.get_value(borrow=True, return_internal_type=True)
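# A short inspection sketch (an assumption, not source code): printing the
# pieces that split_params() yields makes the flat layout concrete. For a GRU
# layer the pieces alternate weight matrix / bias per gate, input-to-hidden
# first, then hidden-to-hidden (cf. the "irm, irb, ... rnm, rnb" ordering
# noted in the test further below).
import numpy as np
import theano
from theano.gpuarray import dnn
from theano.gpuarray.type import gpuarray_shared_constructor

block = dnn.RNNBlock(theano.config.floatX, hidden_size=4, num_layers=1,
                     rnn_mode='gru')
flat = gpuarray_shared_constructor(
    np.zeros(block.get_param_size([1, 3]), dtype=theano.config.floatX))
for i, piece in enumerate(block.split_params(flat, layer=0, input_size=[1, 3])):
    print(i, piece.shape)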
# (fragment: the statement preceding c0_val is truncated in the source)
c0_val = np.random.random(
    (depth, batch_size, hidden_dim)).astype(theano.config.floatX)
start = time.time()

X = T.tensor3('X')
Y = T.tensor3('Y')
h0 = T.tensor3('h0')
c0 = T.tensor3('c0')

rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, network_type,
                    input_mode='skip')
psize = rnnb.get_param_size([batch_size, hidden_dim])
params_cudnn = gpuarray_shared_constructor(
    np.zeros((psize,), dtype=theano.config.floatX))
# lstm = LSTM(input_dim, hidden_dim)
output = rnnb.apply(params_cudnn, X, h0, c0)[0]  # only hidden states
cost = T.mean((Y - output) ** 2)
grads = T.grad(cost, params_cudnn)
cudnn_fn = theano.function(
    inputs=[],
    outputs=output,
)  # NOTE: this call is truncated in the source; since no inputs are declared,
   # the original presumably bound X/h0/c0 via `givens` (not recoverable here)
def test_gru(depth, input_dim, hidden_dim):
    '''hidden_dim and output_dim are usually the same'''
    model = Model()  # collects parameters and keeps track of layers
    X = T.tensor3('X')    # input
    h0 = T.tensor3('h0')  # initial hidden state of the recurrent nets
    last_layer = WrapperLayer(X)
    last_dim = input_dim
    for i in range(depth):
        gru = GRU(last_dim, hidden_dim, last_layer,
                  name="layer_{}".format(i + 1), s0=h0[i, :, :])
        model.add_layer(gru)
        last_layer = gru
        last_dim = hidden_dim
    params = model.get_params()
    print("Printing order of params. Important to know as this will help "
          "set params for cudnn_rnn")
    model.print_params()
    # list_of_param_values = [p.get_value() for p in params]
    output = last_layer.output()  # output tensor
    forward_fun = theano.function([X, h0], output)  # forward function
    # Y = T.tensor3('Y')  # proxy tensor to match the rnn output against
    '''For checking the gradient, the loss is defined as follows; here
    `output` is the theano tensor representing the output of the rnn op.'''
    # loss = T.mean((Y - output) * (Y - output))  # mean squared error
    # grad = T.grad(loss, params)  # gradients w.r.t. parameters
    # get_grad = theano.function([X, h0, Y], grad)  # NOTE: left commented out
    #                                               # in the source, but test1
    #                                               # below needs it to run

    rnnb = dnn.RNNBlock('float32', hidden_dim, depth, 'gru')
    psize = rnnb.get_param_size([2, input_dim])
    params_cudnn = theano.shared(numpy.zeros((psize,), dtype='float32'))
    # cuDNN GRU parameter order per layer:
    # irm, irb, ium, iub, inm, inb, rrm, rrb, rum, rub, rnm, rnb
    l0params = rnnb.split_params(params_cudnn, 0, [2, input_dim])
    for i, p in enumerate(l0params):
        val = params[i].get_value()
        p[:] = val
    cudnn_rnn_gru_output = rnnb.apply(params_cudnn, X, h0)
    # import sys; sys.exit(0)
    '''
    loss_rnn = T.mean((Y - output_cudnn) * (Y - output_cudnn))
    grad_cudnn = T.grad(loss, params_cudnn)
    '''
    cudnn_rnn_forward_fun = theano.function([X, h0], cudnn_rnn_gru_output)
    # h0 = numpy.random.random((1, 2, hidden_dim)).astype('float32')
    # inp1 = numpy.random.random((5, 2, input_dim)).astype('float32')
    # out = cudnn_rnn_forward_fun(inp1, h0)
    # for s in out:
    #     print(s.shape)
    # import sys; sys.exit(0)

    def test0(bs, ts):
        '''bs: batch_size, ts: number of timesteps'''
        h0 = numpy.random.random((depth, bs, hidden_dim)).astype('float32')
        inp1 = numpy.random.random((bs, ts, input_dim)).astype('float32')
        out1 = forward_fun(inp1, h0)
        # check the output shape
        assert out1.shape == (bs, ts, hidden_dim)
        # the cuDNN path is time-major, hence the transpose; note apply()
        # returns (sequence_output, last_hidden), so these names are swapped
        hy, y = cudnn_rnn_forward_fun(inp1.transpose((1, 0, 2)), h0)
        print(hy.shape, y.shape)
        assert check_equality_two_nd_array(numpy.asarray(hy)[-1],
                                           numpy.asarray(y)[0])
        print(out1.shape)
        print(numpy.asarray(hy).transpose((1, 0, 2)).shape)
        assert check_equality_two_nd_array(out1.transpose((1, 0, 2)),
                                           numpy.asarray(hy))
        sys.exit(0)  # debug exit left in the source

    def test1(bs, ts):
        '''bs: batch_size, ts: number of timesteps'''
        inp1 = numpy.random.random((bs, ts, input_dim)).astype('float32')
        h0 = numpy.random.random((depth, bs, hidden_dim)).astype('float32')
        Y = numpy.random.random((bs, ts, hidden_dim)).astype('float32')
        grad1 = get_grad(inp1, h0, Y)  # requires get_grad, commented out above
        '''
        grad_cudnn = get_grad_cudnn(inp1, h0, Y)
        compare grad with cudnn_grad here:
        for g, g_hat in zip(grad1, grad_cudnn):
            check_equality_two_nd_array(g, g_hat)
        '''

    test0(2, 5)
    print("passed test0 -1")
    import sys
    sys.exit(0)  # debug exit: the remaining tests are unreachable in the source
    test0(1, 10)
    print("passed test0 -2")
    test1(5, 3)
    print("passed test1 -1")
    test1(6, 10)
    print("passed test1 -2")
def __init__(self, rng, n_hidden, x, xmask, is_train, dropout, mode='gru',
             n_layer=1, pre_state=None, **kwargs):
    prefix = "BiGRU_"
    Wc = norm_weight(n_hidden * 2, n_hidden, name=prefix + 'Wc')
    bc = zero_bias(n_hidden, prefix + 'bc')
    self.is_train = is_train
    self.dropout = dropout
    self.rng = rng
    self.xmask = xmask
    if pre_state is None:
        # cuDNN expects the first state dimension to be
        # num_layers * num_directions; this layer is bidirectional.
        h0 = T.zeros((2 * n_layer, x.shape[1], n_hidden),
                     dtype=theano.config.floatX)
        pre_state = [h0]
        if mode == 'lstm':
            c0 = T.zeros((2 * n_layer, x.shape[1], n_hidden),
                         dtype=theano.config.floatX)
            pre_state.append(c0)
    rnnb = dnn.RNNBlock(dtype=theano.config.floatX, hidden_size=n_hidden,
                        num_layers=n_layer, rnn_mode=mode, input_mode='skip',
                        direction_mode='bidirectional')
    psize = rnnb.get_param_size([1, n_hidden])
    print(psize)
    params_cudnn = gpuarray_shared_constructor(
        np.zeros((psize,), dtype=theano.config.floatX))
    # l = np.sqrt(6.) / np.sqrt(4 * n_hidden)
    # pvalue = np.asarray(self.rng.uniform(low=-l, high=l, size=(psize,)),
    #                     dtype=theano.config.floatX)
    # params_cudnn = gpuarray_shared_constructor(pvalue, name='cudnn')
    self.params = [params_cudnn]
    if mode == 'lstm':
        h = rnnb.apply(params_cudnn, x, pre_state[0], pre_state[1])[0]
    else:
        h = rnnb.apply(params_cudnn, x, pre_state[0])[0]
    h = h * self.xmask.dimshuffle(0, 1, 'x')
    self.context = h
    # mask-weighted mean over time, then a dense projection
    ctx_mean = (h * self.xmask[:, :, None]).sum(0) / self.xmask.sum(0)[:, None]
    activation = T.tanh(T.dot(ctx_mean, Wc) + bc)
    # Dropout on the pooled activation. (The source overwrote self.activation
    # with a dropout of `h` here, which silently discarded the Wc/bc
    # projection; dropping out the projected activation is the evident intent.)
    if self.dropout > 0:
        drop_mask = self.rng.binomial(n=1, p=1 - self.dropout,
                                      size=activation.shape,
                                      dtype=theano.config.floatX)
        self.activation = T.switch(self.is_train, activation * drop_mask,
                                   activation * (1 - self.dropout))
    else:
        self.activation = activation
def test_dnn_rnn_lstm():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    utt.seed_rng()

    # test params
    input_dim = 32
    hidden_dim = 16
    batch_size = 2
    depth = 3
    timesteps = 5

    # test code
    X = T.tensor3('X')
    Y = T.tensor3('Y')
    h0 = T.tensor3('h0')
    c0 = T.tensor3('c0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'lstm')
    psize = rnnb.get_param_size([batch_size, input_dim])
    params_cudnn = gpuarray_shared_constructor(
        np.zeros((psize,), dtype=theano.config.floatX))

    model = Model()
    last_layer = WrapperLayer(X)
    last_dim = input_dim
    for i in range(depth):
        lstm = LSTM(last_dim, hidden_dim, last_layer,
                    s0=h0[i, :, :], c0=c0[i, :, :])
        model.add_layer(lstm)
        last_layer = lstm
        last_dim = hidden_dim
        layer_params = lstm.get_params()
        dnn_params = rnnb.split_params(params_cudnn, i,
                                       [batch_size, input_dim])
        for j, p in enumerate(dnn_params):
            p[:] = layer_params[j].get_value(borrow=True,
                                             return_internal_type=True)

    def funcs(out, params):
        fn = theano.function([X, h0, c0], out, mode=mode_with_gpu)
        cost = T.mean((Y - out) ** 2)
        grad = T.grad(cost, [X, h0, c0] + params)
        grad_fn = theano.function([X, Y, h0, c0], grad, mode=mode_with_gpu)
        return fn, grad_fn

    ref_fn, ref_grad_fn = funcs(last_layer.output(), model.get_params())
    cudnn_fn, cudnn_grad_fn = funcs(rnnb.apply(params_cudnn, X, h0, c0)[0],
                                    [params_cudnn])

    x_val = np.random.random(
        (timesteps, batch_size, input_dim)).astype(theano.config.floatX)
    y_val = np.random.random(
        (timesteps, batch_size, hidden_dim)).astype(theano.config.floatX)
    h0_val = np.random.random(
        (depth, batch_size, hidden_dim)).astype(theano.config.floatX)
    c0_val = np.random.random(
        (depth, batch_size, hidden_dim)).astype(theano.config.floatX)

    ref_out = ref_fn(x_val, h0_val, c0_val)
    cudnn_out = cudnn_fn(x_val, h0_val, c0_val)
    utt.assert_allclose(ref_out, cudnn_out)

    ref_grads = ref_grad_fn(x_val, y_val, h0_val, c0_val)
    cudnn_grads = cudnn_grad_fn(x_val, y_val, h0_val, c0_val)
    utt.assert_allclose(ref_grads[0], cudnn_grads[0])
    utt.assert_allclose(ref_grads[1], cudnn_grads[1])
    utt.assert_allclose(ref_grads[2], cudnn_grads[2])

    ref_grads_params = ref_grads[3:]
    cudnn_grads_params = gpuarray_shared_constructor(cudnn_grads[3])
    for i in range(depth):
        cudnn_grads_layer = rnnb.split_params(cudnn_grads_params, i,
                                              [batch_size, input_dim])
        ref_grads_layer = ref_grads_params[
            i * len(cudnn_grads_layer):(i + 1) * len(cudnn_grads_layer)]
        for j, g in enumerate(cudnn_grads_layer):
            utt.assert_allclose(ref_grads_layer[j], g)
def rnn_dnn(X, hidden_size, rnn_mode, num_layers=1, parameters=None,
            h0=None, c0=None, input_mode='linear',
            direction_mode='unidirectional', dropout=0., name=None):
    """CuDNN v5 RNN implementation.

    Parameters
    ----------
    X : input variable or placeholder
        shape=(batch_size, timesteps, input_dims)
    hidden_size : int
        the number of units within the RNN model.
    rnn_mode : {'rnn_relu', 'rnn_tanh', 'lstm', 'gru'}
        See the cuDNN documentation for ``cudnnRNNMode_t``.
    num_layers : int
        the number of layers for the RNN model.
    h0 : tensor
        h0 with shape [num_layers, batch_size, hidden_size]
    c0 : tensor
        c0 (lstm only) with shape [num_layers, batch_size, hidden_size]
    parameters : list of tensors
        vector containing all flattened weights and biases; see
        `backend.init.lstm`, `backend.init.gru`, and `backend.init.rnn`
        for more information
    input_mode : {'linear', 'skip'}
        linear: the input is multiplied by a biased matrix.
        skip: no operation is performed on the input; its size must match
        the hidden size. (cuDNN docs: ``cudnnRNNInputMode_t``)
    direction_mode : {'unidirectional', 'bidirectional'}
        unidirectional: the network operates recurrently from the first
        input to the last.
        bidirectional: the network operates from first to last, then from
        last to first, and concatenates the results at each layer.
    dropout : float (0.0-1.0)
        whether to enable dropout. When it is 0, dropout is disabled.

    Returns
    -------
    [output, hidden_states, cell_states] for lstm
    [output, hidden_states] for gru and rnn

    output_shape: (batch_size, timesteps, hidden_size)
    hidden_shape: (num_layers, batch_size, hidden_size)
    cell_shape:   (num_layers, batch_size, hidden_size)
    """
    if CONFIG['device'] == 'cpu':
        raise Exception('This op is not supported on CPU.')
    if name is None:
        name = uuid()
    # ====== check arguments ====== #
    if rnn_mode not in ('rnn_relu', 'rnn_tanh', 'lstm', 'gru'):
        raise ValueError("rnn_mode=%s must be: 'rnn_relu', 'rnn_tanh', "
                         "'lstm', 'gru'" % rnn_mode)
    if input_mode not in ('linear', 'skip'):
        raise ValueError("input_mode=%s must be: 'linear', 'skip'" % input_mode)
    if direction_mode not in ('unidirectional', 'bidirectional'):
        raise ValueError("direction_mode=%s must be: 'unidirectional', "
                         "'bidirectional'" % direction_mode)
    is_bidirectional = direction_mode == 'bidirectional'

    # ====== helper function ====== #
    def check_init_states(s0, nb_layers, batch_size):
        if s0 is None:
            return None
        if s0.ndim < 3:
            s0 = expand_dims(s0, dim=0)
        s0shape = get_shape(s0)
        if s0shape[0] == 1 and s0shape[0] != nb_layers:
            s0 = repeat(s0, n=nb_layers, axes=0)
        if s0shape[1] == 1:
            s0 = repeat(s0, n=batch_size, axes=1)
        return s0

    # ====== create RNNBlock ====== #
    input_shape = get_shape(X)
    if X.ndim != 3:
        raise ValueError('Input must be a 3-D tensor, but X is %d-D' % X.ndim)
    if input_shape[-1] != hidden_size and 'skip' in input_mode:
        raise ValueError('In skip input mode, the input size must equal the '
                         'hidden size, but input_size=%d != hidden_size=%d'
                         % (input_shape[-1], hidden_size))
    # NOTE: dimshuffling X here triggers a number of GpuArray/cuDNN errors,
    # so the transpose is deferred until apply() below.
    batch_size = X.shape[0]
    rnnb = dnn.RNNBlock(dtype=theano.config.floatX, hidden_size=hidden_size,
                        num_layers=num_layers, rnn_mode=rnn_mode,
                        input_mode=input_mode, direction_mode=direction_mode,
                        context_name=None)
    # layer info (note: for bidirectional RNNs, the outputs of the previous
    # layer's two directions are concatenated)
    layer_info = [input_shape[-1], hidden_size] + \
        [hidden_size * (2 if is_bidirectional else 1),
         hidden_size] * (num_layers - 1)
    # the parameter count does not depend on the batch size, so any value
    # (here 12) works for the batch dimension
    nb_params = rnnb.get_param_size([12, input_shape[-1]])
    # ====== create parameters ====== #
    if parameters is None:
        if rnn_mode == 'lstm':
            from odin.backend.init import lstm as init_func
        elif rnn_mode == 'gru':
            from odin.backend.init import gru as init_func
        else:
            from odin.backend.init import rnn as init_func
        parameters = np.concatenate(
            [init_func(layer_info[i * 2], layer_info[i * 2 + 1],
                       one_vector=True, return_variable=False,
                       bidirectional=is_bidirectional)
             for i in range(num_layers)]).astype(FLOATX)
        parameters = variable(parameters, name=name)
    assert nb_params == get_shape(parameters)[0], \
        "Require %d parameters but only %d provided" % \
        (nb_params, get_shape(parameters)[0])
    # check initial states (cuDNN wants num_layers * num_directions states)
    num_layers = num_layers * 2 if is_bidirectional else num_layers
    h0 = zeros((num_layers, batch_size, hidden_size)) if h0 is None else h0
    h0 = check_init_states(h0, num_layers, batch_size)
    c0 = (zeros((num_layers, batch_size, hidden_size))
          if rnn_mode == 'lstm' and c0 is None else c0)
    c0 = check_init_states(c0, num_layers, batch_size)
    # ====== get output ====== #
    output = rnnb.apply(w=parameters, x=X.dimshuffle(1, 0, 2), hx=h0, cx=c0)
    output = [output[0].dimshuffle(1, 0, 2)] + list(output[1:])
    add_shape(output[0], (input_shape[0], input_shape[1],
                          hidden_size * (2 if is_bidirectional else 1)))
    for o in output[1:]:
        add_shape(o, (num_layers, input_shape[0], hidden_size))
    return output
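# A hedged usage sketch for rnn_dnn (not from the source): calling it with a
# batch-major input, assuming the surrounding module also exposes `variable`
# and FLOATX as used above. Shapes follow the docstring.
import numpy as np

X_in = variable(np.random.randn(8, 20, 64).astype(FLOATX),
                name='X_in')  # (batch, time, feat)
y, hN, cN = rnn_dnn(X_in, hidden_size=128, rnn_mode='lstm', num_layers=2)
# y: (8, 20, 128); hN and cN: (2, 8, 128)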
def __init__(self, num_layers=1, direction=0, **kwargs):
    # the context has to be provided in THEANO_FLAGS, e.g. contexts=gpu0->cuda0
    context_name = kwargs.get('device', str(theano.config.device))
    # if context_name == 'cpu':
    #     context_name = 'gpu0'
    kwargs['device'] = context_name
    # kwargs['n_out'] *= 2
    super(RNNBlockLayer, self).__init__(**kwargs)
    self.params = {}
    # self.attrs['n_out'] /= 2
    # self.set_attr('nout', self.attrs['n_out'] / 4)
    from theano.gpuarray import dnn
    from theano.gpuarray.type import gpuarray_shared_constructor
    from theano.tensor.extra_ops import cpu_contiguous
    # from theano.sandbox.cuda.basic_ops import gpu_contiguous

    rnnb = dnn.RNNBlock(
        dtype=theano.config.floatX,
        hidden_size=self.attrs['n_out'],
        num_layers=num_layers,
        rnn_mode='lstm',
        input_mode='linear',
        # direction == 0 selects the bidirectional variant
        direction_mode='unidirectional' if direction != 0 else 'bidirectional',
        context_name=context_name if context_name != 'cpu' else 'gpu0')

    buffer_size = 1  # self.attrs['n_out'] * num_layers
    # X = self.get_linear_forward_output()
    # X = T.concatenate([s.output for s in self.sources], axis=2)[::direction or 1]
    X = cpu_contiguous(T.concatenate([s.output for s in self.sources],
                                     axis=2)[::direction or 1])
    # X = cpu_contiguous(self.sources[0].output[::direction or 1])
    # X = T.concatenate([X, T.zeros((X.shape[0], batch_size - X.shape[1] + 1,
    #                                X.shape[2]), X.dtype)], axis=1)[:, :-1]
    n_in = sum([s.attrs['n_out'] for s in self.sources])
    psize = rnnb.get_param_size([buffer_size, n_in])
    l = numpy.sqrt(6.) / numpy.sqrt(4 * self.attrs['n_out'])
    pvalue = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(psize,)),
                           dtype=theano.config.floatX)
    if context_name == 'cpu':
        params_cudnn = self.add_param(
            self.create_bias(psize, name='cudnn_%s' % self.name))
    else:
        params_cudnn = self.add_param(
            gpuarray_shared_constructor(pvalue, target=context_name,
                                        name='cudnn_%s' % self.name))
    c_init = cpu_contiguous(T.alloc(numpy.cast[theano.config.floatX](0),
                                    num_layers, X.shape[1],
                                    self.attrs['n_out']))
    h_init = cpu_contiguous(T.alloc(numpy.cast[theano.config.floatX](0),
                                    num_layers, X.shape[1],
                                    self.attrs['n_out']))

    W_out = self.add_param(self.create_random_uniform_weights(
        self.attrs['n_out'], self.y_in[self.attrs['target']].n_out))
    b_out = self.add_param(
        self.create_bias(self.y_in[self.attrs['target']].n_out))

    if context_name == 'cpu':
        self.cost_val = T.constant(0)
        self.error_val = T.constant(0)
        self.known_grads = {}
        return

    out = rnnb.apply(params_cudnn, X, h_init, c_init)[0]
    out = out[::-1]
    out = T.dot(out, W_out) + b_out
    self.y_m = out.reshape((out.shape[0] * out.shape[1], out.shape[2]))

    self.i = (self.index.flatten() > 0).nonzero()
    self.y_data_flat = self.y_in[self.attrs['target']].flatten()
    nll, _ = T.nnet.crossentropy_softmax_1hot(
        x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
    self.cost_val = T.sum(nll)
    # self.cost_val = -T.sum(T.log(out[:, self.y_in[self.attrs['target']]
    #     .flatten()][(self.index.flatten() > 0).nonzero()]))
    self.known_grads = {params_cudnn: T.grad(self.cost_val, params_cudnn)}
    self.output = out
    self.index = self.sources[0].index
    self.error_val = T.sum(T.neq(T.argmax(self.y_m[self.i], axis=-1),
                                 self.y_data_flat[self.i]))
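# A brief sketch (an assumption, not source code) of the THEANO_FLAGS setting
# the comment at the top of __init__ refers to: context names must be mapped
# to CUDA devices before theano is imported.
import os
os.environ['THEANO_FLAGS'] = 'contexts=gpu0->cuda0'  # would clobber other flags
import theano  # theano now recognizes the context name 'gpu0'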
hidden_size = args.hidden_size
n_batch = args.n_batch
print(args)

X = T.tensor3('X', dtype='float32')
h0 = T.tensor3('h0', dtype='float32')
Y = T.tensor3('Y', dtype='float32')

x_val = np.random.randn(seq_len, batch_size, input_size).astype('float32')
# bidirectional output concatenates both directions, hence hidden_size * 2
y_val = np.random.randn(seq_len, batch_size,
                        hidden_size * 2).astype('float32')
# 2 = num_layers * num_directions for a single bidirectional layer
h0_val = np.random.randn(2, batch_size, hidden_size).astype('float32')

rnnb = dnn.RNNBlock(dtype=theano.config.floatX, hidden_size=hidden_size,
                    num_layers=1, rnn_mode='gru',
                    direction_mode='bidirectional')
# the parameter count is independent of the batch size passed here
psize = rnnb.get_param_size((50, input_size))
params_cudnn = gpuarray_shared_constructor(
    np.zeros(psize, dtype=theano.config.floatX))
outp, h = rnnb.apply(params_cudnn, X, h0)
cost = T.mean((Y - outp) ** 2)
grads = T.grad(cost, params_cudnn)
print("CuDNN bidirectional GRU")
# --------------- compile-time test --------------- #
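# A hedged sketch (the source truncates here) of what the compile-time test
# presumably measures: wall-clock time for theano.function to build the
# forward/backward graph of the bidirectional GRU above.
import time

t0 = time.time()
fwd_bwd_fn = theano.function([X, Y, h0], [cost, grads])
print("compile time: %.2fs" % (time.time() - t0))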