    def __init__(self, input_size, hidden_size, dtype=theano.config.floatX):
        self.grub = dnn.RNNBlock(dtype=dtype,
                                 hidden_size=hidden_size,
                                 num_layers=1,
                                 rnn_mode='gru')

        self.input_size = input_size
        self.hidden_size = hidden_size

        psize = self.grub.get_param_size((1, input_size))

        self.params = gpuarray_shared_constructor(
            np.zeros(psize, dtype=theano.config.floatX))
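
A minimal usage sketch (illustrative sizes; a CuDNN-enabled GPU context is assumed) of driving a block like the one above: allocate the flat parameter buffer, then feed a time-major input and an initial hidden state to RNNBlock.apply and compile a function.

import numpy as np
import theano
import theano.tensor as T
from theano.gpuarray import dnn
from theano.gpuarray.type import gpuarray_shared_constructor

# Hedged sketch: a 1-layer GRU RNNBlock, exercised end to end.
input_size, hidden_size = 8, 16
rnnb = dnn.RNNBlock(dtype=theano.config.floatX,
                    hidden_size=hidden_size,
                    num_layers=1,
                    rnn_mode='gru')
psize = rnnb.get_param_size((1, input_size))
params = gpuarray_shared_constructor(
    np.zeros((psize,), dtype=theano.config.floatX))

X = T.tensor3('X')    # (timesteps, batch, input_size)
h0 = T.tensor3('h0')  # (num_layers, batch, hidden_size)
y, hy = rnnb.apply(params, X, h0)  # sequence output and final hidden state
fn = theano.function([X, h0], [y, hy])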
Example #2
    def __init__(self, rng, n_hidden, x,
                 E, xmask, is_train, dropout, mode='lstm',
                 n_layer=1, pre_state=None, **kwargs):

        self.is_train = is_train
        self.dropout = dropout

        self.rng = rng
        self.xmask = xmask

        shape = x.shape
        embd = E[x.flatten()]
        embd = embd.reshape([shape[0], shape[1], -1])

        if pre_state is None:
            h0 = T.zeros((n_layer, shape[1], n_hidden), dtype=theano.config.floatX)
            pre_state = [h0, ]
            if mode == 'lstm':
                c0 = T.zeros((n_layer, shape[1], n_hidden), dtype=theano.config.floatX)
                pre_state.append(c0)


        rnnb = dnn.RNNBlock(dtype=theano.config.floatX,
                            hidden_size=n_hidden,
                            num_layers=n_layer,
                            rnn_mode=mode,
                            input_mode='skip',
                            direction_mode='unidirectional')
        psize = rnnb.get_param_size([1, n_hidden])
        print(psize)
        params_cudnn = gpuarray_shared_constructor(
            np.zeros((psize,), dtype=theano.config.floatX)
        )
        #l = np.sqrt(6.) / np.sqrt(4 * n_hidden)
        #pvalue = np.asarray(self.rng.uniform(low=-l, high=l, size=(psize,)), dtype=theano.config.floatX)
        #params_cudnn=gpuarray_shared_constructor(pvalue,name='cudnn')
        self.params = [params_cudnn, ]

        if mode == 'lstm':
            h = rnnb.apply(params_cudnn, embd, pre_state[0], pre_state[1])[0]
        else:
            h = rnnb.apply(params_cudnn, embd, pre_state[0])[0]

        h = h * self.xmask.dimshuffle(0, 1, 'x')

        # Dropout
        if self.dropout > 0:
            drop_mask = self.rng.binomial(n=1, p=1 - self.dropout, size=h.shape, dtype=theano.config.floatX)
            self.activation = T.switch(self.is_train, h * drop_mask, h * (1 - self.dropout))
        else:
            self.activation = T.switch(self.is_train, h, h)
Example #3
    def _params_to_cudnn(self):
        from theano.gpuarray import dnn
        from theano.gpuarray.type import gpuarray_shared_constructor
        assert dnn.dnn_available(None)
        self._rnn_block = dnn.RNNBlock(theano.config.floatX,
                                       self.hidden_dim,
                                       num_layers=1,
                                       input_mode="linear",
                                       rnn_mode=self.rnn_type,
                                       direction_mode="unidirectional")
        param_size = self._rnn_block.get_param_size(
            [self.n_batch, self.input_dim])  # TODO: study about n_batch
        self.params = [gpuarray_shared_constructor(Constant(0.0)(param_size))]
        cs = self._rnn_block.split_params(self.params[0],
                                          layer=0,
                                          input_size=[
                                              self.n_batch, self.input_dim
                                          ])  # TODO: multi layer support
        for c, p in zip(cs, self.non_cudnn_params):
            c[:] = p.get_value(borrow=True, return_internal_type=True)
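
A standalone sketch of the same copy-into-views pattern (illustrative sizes; a CuDNN-enabled GPU context is assumed): split_params returns one writable view per weight matrix and bias of the requested layer, and assigning through a view with [:] updates the flat parameter buffer in place.

import numpy as np
import theano
from theano.gpuarray import dnn
from theano.gpuarray.type import gpuarray_shared_constructor

# Hedged sketch: filling the flat CuDNN parameter buffer through
# the per-gate views that split_params exposes.
batch, input_dim, hidden_dim = 2, 8, 16
rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, num_layers=1,
                    rnn_mode='gru', input_mode='linear',
                    direction_mode='unidirectional')
psize = rnnb.get_param_size([batch, input_dim])
flat = gpuarray_shared_constructor(
    np.zeros((psize,), dtype=theano.config.floatX))
views = rnnb.split_params(flat, layer=0, input_size=[batch, input_dim])
for i, v in enumerate(views):
    vals = np.zeros(np.asarray(v).shape, dtype=theano.config.floatX)
    v[:] = vals  # writes through the view into `flat`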
Example #4
c0_val = np.random.random((depth, batch_size, hidden_dim)).astype(
    theano.config.floatX
)

start = time.time()

X = T.tensor3('X')
Y = T.tensor3('Y')
h0 = T.tensor3('h0')
c0 = T.tensor3('c0')

rnnb = dnn.RNNBlock(
    theano.config.floatX,
    hidden_dim,
    depth,
    network_type,
    input_mode='skip'
)
psize = rnnb.get_param_size([batch_size, hidden_dim])
params_cudnn = gpuarray_shared_constructor(
    np.zeros((psize,), dtype=theano.config.floatX)
)

# lstm = LSTM(input_dim, hidden_dim)
output = rnnb.apply(params_cudnn, X, h0, c0)[0]  # Only hidden states
cost = T.mean((Y - output) ** 2)
grads = T.grad(cost, params_cudnn)
cudnn_fn = theano.function(
    inputs=[],
    outputs=output,
Example #5
def test_gru(depth, input_dim, hidden_dim):
    '''hidden_dim and output_dim are usually the same'''

    model = Model()  # To collect parameters and keep track of layers

    X = T.tensor3('X')  # input
    h0 = T.tensor3('h0')  # initial hidden state of recurrent nets

    last_layer = WrapperLayer(X)
    last_dim = input_dim
    for i in range(depth):
        gru = GRU(last_dim,
                  hidden_dim,
                  last_layer,
                  name="layer_{}".format(i + 1),
                  s0=h0[i, :, :])
        model.add_layer(gru)
        last_layer = gru
        last_dim = hidden_dim

    params = model.get_params()
    print(
        "Printing order of params. Important to know as this will help set params for cudnn_rnn"
    )

    model.print_params()

    #list_of_param_values = [p.get_value() for p in params] #list of param values

    output = last_layer.output()  # output tensor

    forward_fun = theano.function([X, h0], output)  #forward function

    #Y = T.tensor3('Y') # proxy tensor with which we want to match the output of rnn to get a loss
    '''For checking gradients, the loss is defined as follows;
    here 'output' is the theano tensor representing the output of the RNN op.'''

    #loss = T.mean((Y - output)*(Y - output)) # mean square error

    #grad = T.grad(loss, params) # list of gradient with respect to parameters

    #get_grad = theano.function([X, h0, Y], grad) # getting list of gradients
    rnnb = dnn.RNNBlock('float32', hidden_dim, depth, 'gru')
    psize = rnnb.get_param_size([2, input_dim])
    params_cudnn = theano.shared(numpy.zeros((psize, ), dtype='float32'))
    # irm, irb, ium, iub, inm, inb, rrm, rrb, rum, rub, rnm, rnb
    l0params = rnnb.split_params(params_cudnn, 0, [2, input_dim])
    for i, p in enumerate(l0params):
        val = params[i].get_value()
        p[:] = val

    cudnn_rnn_gru_output = rnnb.apply(params_cudnn, X, h0)

    #import sys;sys.exit(0)
    '''
	loss_rnn = T.mean((Y-output_cudnn)*(Y - output_cudnn))
	grad_cudnn = T.grad(loss, params_cudnn)
	'''

    cudnn_rnn_forward_fun = theano.function([X, h0], cudnn_rnn_gru_output)

    # h0 = numpy.random.random((1, 2, hidden_dim)).astype('float32')
    # inp1 = numpy.random.random((5, 2, input_dim)).astype('float32')
    # out = cudnn_rnn_forward_fun(inp1, h0)
    # for s in out:
    # 	print(s.shape)
    # import sys;sys.exit(0)

    def test0(bs, ts):
        '''
		bs: batch_size
		ts: number of timesteps
		'''
        h0 = numpy.random.random((depth, bs, hidden_dim)).astype('float32')
        inp1 = numpy.random.random((bs, ts, input_dim)).astype('float32')
        out1 = forward_fun(inp1, h0)
        # '''checking output shape'''
        assert (out1.shape == (bs, ts, hidden_dim))

        hy, y = cudnn_rnn_forward_fun(inp1.transpose((1, 0, 2)), h0)
        print(hy.shape, y.shape)

        assert (check_equality_two_nd_array(
            numpy.asarray(hy)[-1],
            numpy.asarray(y)[0]))
        print(out1.shape)
        print(numpy.asarray(hy).transpose((1, 0, 2)).shape)

        assert (check_equality_two_nd_array(out1.transpose((1, 0, 2)),
                                            numpy.asarray(hy)))
        sys.exit(0)

    def test1(bs, ts):
        '''
		bs: batch_size
		ts: number of timesteps
		'''
        inp1 = numpy.random.random((bs, ts, input_dim)).astype('float32')
        h0 = numpy.random.random((depth, bs, hidden_dim)).astype('float32')
        Y = numpy.random.random((bs, ts, hidden_dim)).astype('float32')

        grad1 = get_grad(inp1, h0, Y)
        '''
		grad_cudnn = get_grad_cudnn(inp1, h0, Y)
		'''
        '''
			compare grad with cudnn_grad here
		'''
        '''
		for g, g_hat in zip(grad1, grad_cudnn):
			check_equality_two_nd_array(g, g_hat)
		'''

    test0(2, 5)
    print("passed test0 -1")
    import sys
    sys.exit(0)
    test0(1, 10)
    print("passed test0 -2")

    test1(5, 3)
    print("passed test1 -1")

    test1(6, 10)
    print("passed test1 -2")
Example #6
    def __init__(self,
                 rng,
                 n_hidden,
                 x,
                 xmask,
                 is_train,
                 dropout,
                 mode='gru',
                 n_layer=1,
                 pre_state=None,
                 **kwargs):

        prefix = "BiGRU_"
        Wc = norm_weight(n_hidden * 2, n_hidden, name=prefix + 'Wc')
        bc = zero_bias(n_hidden, prefix + 'bc')

        self.is_train = is_train
        self.dropout = dropout

        self.rng = rng
        self.xmask = xmask

        if pre_state is None:
            # bidirectional RNNs need one initial state per layer and direction
            h0 = T.zeros((n_layer * 2, x.shape[1], n_hidden),
                         dtype=theano.config.floatX)
            pre_state = [
                h0,
            ]
            if mode == 'lstm':
                c0 = T.zeros((n_layer * 2, x.shape[1], n_hidden),
                             dtype=theano.config.floatX)
                pre_state.append(c0)

        rnnb = dnn.RNNBlock(dtype=theano.config.floatX,
                            hidden_size=n_hidden,
                            num_layers=n_layer,
                            rnn_mode=mode,
                            input_mode='skip',
                            direction_mode='bidirectional')
        psize = rnnb.get_param_size([1, n_hidden])
        print(psize)
        params_cudnn = gpuarray_shared_constructor(
            np.zeros((psize, ), dtype=theano.config.floatX))
        #l = np.sqrt(6.) / np.sqrt(4 * n_hidden)
        #pvalue = np.asarray(self.rng.uniform(low=-l, high=l, size=(psize,)), dtype=theano.config.floatX)
        #params_cudnn=gpuarray_shared_constructor(pvalue,name='cudnn')
        self.params = [
            params_cudnn,
        ]

        if mode == 'lstm':
            h = rnnb.apply(params_cudnn, x, pre_state[0], pre_state[1])[0]
        else:
            h = rnnb.apply(params_cudnn, x, pre_state[0])[0]

        h = h * self.xmask.dimshuffle(0, 1, 'x')
        self.context = h

        ctx_mean = (h *
                    self.xmask[:, :, None]).sum(0) / self.xmask.sum(0)[:, None]

        self.activation = T.tanh(T.dot(ctx_mean, Wc) + bc)

        # Dropout
        if self.dropout > 0:
            drop_mask = self.rng.binomial(n=1,
                                          p=1 - self.dropout,
                                          size=h.shape,
                                          dtype=theano.config.floatX)
            self.activation = T.switch(self.is_train, h * drop_mask,
                                       h * (1 - self.dropout))
        else:
            self.activation = T.switch(self.is_train, h, h)
Example #7
def test_dnn_rnn_lstm():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    utt.seed_rng()

    # test params
    input_dim = 32
    hidden_dim = 16
    batch_size = 2
    depth = 3
    timesteps = 5

    # test code
    X = T.tensor3('X')
    Y = T.tensor3('Y')
    h0 = T.tensor3('h0')
    c0 = T.tensor3('c0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'lstm')
    psize = rnnb.get_param_size([batch_size, input_dim])
    params_cudnn = gpuarray_shared_constructor(
        np.zeros((psize, ), dtype=theano.config.floatX))

    model = Model()
    last_layer = WrapperLayer(X)
    last_dim = input_dim
    for i in range(depth):
        lstm = LSTM(last_dim,
                    hidden_dim,
                    last_layer,
                    s0=h0[i, :, :],
                    c0=c0[i, :, :])
        model.add_layer(lstm)
        last_layer = lstm
        last_dim = hidden_dim
        layer_params = lstm.get_params()
        dnn_params = rnnb.split_params(params_cudnn, i,
                                       [batch_size, input_dim])
        for j, p in enumerate(dnn_params):
            p[:] = layer_params[j].get_value(borrow=True,
                                             return_internal_type=True)

    def funcs(out, params):
        fn = theano.function([X, h0, c0], out, mode=mode_with_gpu)
        cost = T.mean((Y - out)**2)
        grad = T.grad(cost, [X, h0, c0] + params)
        grad_fn = theano.function([X, Y, h0, c0], grad, mode=mode_with_gpu)
        return fn, grad_fn

    ref_fn, ref_grad_fn = funcs(last_layer.output(), model.get_params())
    cudnn_fn, cudnn_grad_fn = funcs(
        rnnb.apply(params_cudnn, X, h0, c0)[0], [params_cudnn])

    x_val = np.random.random(
        (timesteps, batch_size, input_dim)).astype(theano.config.floatX)
    y_val = np.random.random(
        (timesteps, batch_size, hidden_dim)).astype(theano.config.floatX)
    h0_val = np.random.random(
        (depth, batch_size, hidden_dim)).astype(theano.config.floatX)
    c0_val = np.random.random(
        (depth, batch_size, hidden_dim)).astype(theano.config.floatX)

    ref_out = ref_fn(x_val, h0_val, c0_val)
    cudnn_out = cudnn_fn(x_val, h0_val, c0_val)

    utt.assert_allclose(ref_out, cudnn_out)

    ref_grads = ref_grad_fn(x_val, y_val, h0_val, c0_val)
    cudnn_grads = cudnn_grad_fn(x_val, y_val, h0_val, c0_val)

    utt.assert_allclose(ref_grads[0], cudnn_grads[0])
    utt.assert_allclose(ref_grads[1], cudnn_grads[1])
    utt.assert_allclose(ref_grads[2], cudnn_grads[2])

    ref_grads_params = ref_grads[3:]
    cudnn_grads_params = gpuarray_shared_constructor(cudnn_grads[3])

    for i in range(depth):
        cudnn_grads_layer = rnnb.split_params(cudnn_grads_params, i,
                                              [batch_size, input_dim])
        ref_grads_layer = ref_grads_params[i * len(cudnn_grads_layer):(i + 1) *
                                           len(cudnn_grads_layer)]
        for j, g in enumerate(cudnn_grads_layer):
            utt.assert_allclose(ref_grads_layer[j], g)
Example #8
def rnn_dnn(X, hidden_size, rnn_mode,
            num_layers=1,
            parameters=None,
            h0=None, c0=None,
            input_mode='linear',
            direction_mode='unidirectional',
            dropout=0., name=None):
    """CuDNN v5 RNN implementation.

    Parameters
    ----------
    X : input variable or placeholder
        shape=(batch_size, timesteps, input_dims)
    hidden_size : int
        the number of units within the RNN model.
    rnn_mode : {'rnn_relu', 'rnn_tanh', 'lstm', 'gru'}
        See cudnn documentation for ``cudnnRNNMode_t``.
    num_layers : int
        the number of layers for the RNN model.
    h0: tensor
        h0 with shape [num_layers, batch_size, hidden_size]
    c0: tensor
        c0 (lstm) with shape [num_layers, batch_size, hidden_size]
    parameters: list of tensor
        a vector containing all flattened weights and biases;
        check `backend.init.lstm`, `backend.init.gru`, and `backend.init.rnn`
        for more information
    input_mode : {'linear', 'skip'}
        linear: a biased matrix multiplication is applied to the input.
        skip: no operation is performed on the input; its size must
        match the hidden size.
        (CuDNN docs: cudnnRNNInputMode_t)
    direction_mode : {'unidirectional', 'bidirectional'}
        unidirectional: The network operates recurrently from the
                        first input to the last.
        bidirectional: The network operates from first to last then from last
                       to first and concatenates the results at each layer.
    dropout: float (0.0-1.0)
        dropout probability; when it is 0, dropout is disabled.

    Returns
    -------
    [output, hidden_states, cell_states] for lstm
    [output, hidden_states] for gru and rnn

    output_shape: (batch_size, timesteps, hidden_size)
    hidden_shape: (num_layers, batch_size, hidden_size)
    cell_shape: (num_layers, batch_size, hidden_size)

    """
    if CONFIG['device'] == 'cpu':
        raise Exception('This opt is not supported with CPU.')
    if name is None: name = uuid()
    # ====== Check arguments ====== #
    if rnn_mode not in ('rnn_relu', 'rnn_tanh', 'lstm', 'gru'):
        raise ValueError("rnn_mode=%s must be: 'rnn_relu', 'rnn_tanh', 'lstm', 'gru'"
                         % rnn_mode)
    if input_mode not in ('linear', 'skip'):
        raise ValueError("input_mode=%s must be: 'linear', 'skip'" % input_mode)
    if direction_mode not in ('unidirectional', 'bidirectional'):
        raise ValueError("direction_mode=%s must be: 'unidirectional', 'bidirectional'"
                         % direction_mode)
    is_bidirectional = direction_mode == 'bidirectional'

    # ====== helper function ====== #
    def check_init_states(s0, nb_layers, batch_size):
        if s0 is None: return None
        if s0.ndim < 3:
            s0 = expand_dims(s0, dim=0)
        s0shape = get_shape(s0)
        if s0shape[0] == 1 and s0shape[0] != nb_layers:
            s0 = repeat(s0, n=nb_layers, axes=0)
        if s0shape[1] == 1:
            s0 = repeat(s0, n=batch_size, axes=1)
        return s0
    # ====== create RNNBlock ====== #
    input_shape = get_shape(X)
    if X.ndim != 3:
        raise ValueError('Input must be 3-D tensor, but X is %d-D tensor' % X.ndim)
    if input_shape[-1] != hidden_size and 'skip' in input_mode:
        raise ValueError('In skip_input mode, input size must be equal to hidden size'
                         ', but input_size=%d != hidden_size=%d' %
                         (input_shape[-1], hidden_size))
    # If we dimshuffle X here, many GpuArray/cuDNN errors occur,
    # so the dimshuffle is deferred to the apply() call below.
    batch_size = X.shape[0]
    rnnb = dnn.RNNBlock(dtype=theano.config.floatX, hidden_size=hidden_size,
                        num_layers=num_layers, rnn_mode=rnn_mode,
                        input_mode=input_mode, direction_mode=direction_mode,
                        context_name=None)
    # layer info (note in case of bidirectional, output from previous
    # layers are concatenated).
    layer_info = [input_shape[-1], hidden_size] + \
                 [hidden_size * (2 if is_bidirectional else 1),
                  hidden_size] * (num_layers - 1)
    nb_params = rnnb.get_param_size([12, input_shape[-1]])
    # ====== create parameters ====== #
    # check parameters
    if parameters is None:
        if rnn_mode == 'lstm':
            from odin.backend.init import lstm as init_func
        elif rnn_mode == 'gru':
            from odin.backend.init import gru as init_func
        else:
            from odin.backend.init import rnn as init_func
        parameters = np.concatenate([init_func(layer_info[i * 2], layer_info[i * 2 + 1],
                                     one_vector=True, return_variable=False,
                                     bidirectional=True if is_bidirectional else False)
                                     for i in range(num_layers)]).astype(FLOATX)
        parameters = variable(parameters, name=name)
    assert nb_params == get_shape(parameters)[0], \
        "Require %d parameters but only %d provided" % (nb_params, get_shape(parameters)[0])
    # check initial states
    num_layers = num_layers * 2 if is_bidirectional else num_layers
    h0 = zeros((num_layers, batch_size, hidden_size)) if h0 is None else h0
    h0 = check_init_states(h0, num_layers, batch_size)
    c0 = (zeros((num_layers, batch_size, hidden_size))
          if rnn_mode == 'lstm' and c0 is None else c0)
    c0 = check_init_states(c0, num_layers, batch_size)
    # ====== get output ====== #
    output = rnnb.apply(w=parameters, x=X.dimshuffle(1, 0, 2),
                        hx=h0, cx=c0)
    output = [output[0].dimshuffle(1, 0, 2)] + list(output[1:])
    add_shape(output[0], (input_shape[0], input_shape[1],
                          hidden_size * (2 if is_bidirectional else 1)))
    for o in output[1:]:
        add_shape(o, (num_layers, input_shape[0], hidden_size))
    return output
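
rnn_dnn accepts batch-major input and dimshuffles it to the time-major layout that RNNBlock.apply consumes, then shuffles the output back. A minimal standalone sketch of that pattern (illustrative sizes, plain Theano without the odin helpers; a CuDNN-enabled GPU context is assumed):

import numpy as np
import theano
import theano.tensor as T
from theano.gpuarray import dnn
from theano.gpuarray.type import gpuarray_shared_constructor

# Hedged sketch: batch-major in, batch-major out, time-major inside.
batch, timesteps, input_dim, hidden = 4, 7, 10, 16
rnnb = dnn.RNNBlock(dtype=theano.config.floatX, hidden_size=hidden,
                    num_layers=1, rnn_mode='gru',
                    input_mode='linear', direction_mode='unidirectional')
psize = rnnb.get_param_size([batch, input_dim])
params = gpuarray_shared_constructor(
    np.zeros((psize,), dtype=theano.config.floatX))

X = T.tensor3('X')    # (batch, timesteps, input_dim)
h0 = T.tensor3('h0')  # (num_layers, batch, hidden)
y, hy = rnnb.apply(params, X.dimshuffle(1, 0, 2), h0)
y = y.dimshuffle(1, 0, 2)  # back to (batch, timesteps, hidden)
fn = theano.function([X, h0], [y, hy])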
Example #9
    def __init__(self, num_layers=1, direction=0, **kwargs):
        # this has to be provided in THEANO_FLAGS as e.g. contexts=gpu0->cuda0
        context_name = kwargs.get('device', str(theano.config.device))
        #if context_name == 'cpu':
        #  context_name = 'gpu0'
        kwargs['device'] = context_name
        #kwargs['n_out'] *= 2
        super(RNNBlockLayer, self).__init__(**kwargs)
        self.params = {}
        #self.attrs['n_out'] /= 2
        #self.set_attr('nout', self.attrs['n_out'] / 4)
        from theano.gpuarray import dnn
        from theano.gpuarray.type import gpuarray_shared_constructor
        from theano.tensor.extra_ops import cpu_contiguous
        #from theano.sandbox.cuda.basic_ops import gpu_contiguous

        rnnb = dnn.RNNBlock(
            dtype=theano.config.floatX,
            hidden_size=self.attrs['n_out'],
            num_layers=num_layers,
            rnn_mode='lstm',
            input_mode='linear',
            direction_mode='unidirectional'
            if direction != 0 else 'bidirectional',
            context_name=context_name if context_name != 'cpu' else 'gpu0')

        buffer_size = 1  # self.attrs['n_out'] * num_layers
        #X = self.get_linear_forward_output()
        #X = T.concatenate([s.output for s in self.sources],axis=2)[::direction or 1]
        X = cpu_contiguous(
            T.concatenate([s.output for s in self.sources],
                          axis=2)[::direction or 1])
        #X = cpu_contiguous(self.sources[0].output[::direction or 1])
        #X = T.concatenate([X,T.zeros((X.shape[0],batch_size - X.shape[1] + 1,X.shape[2]),X.dtype)],axis=1)[:,:-1]
        n_in = sum([s.attrs['n_out'] for s in self.sources])
        psize = rnnb.get_param_size([buffer_size, n_in])
        l = numpy.sqrt(6.) / numpy.sqrt(4 * self.attrs['n_out'])
        pvalue = numpy.asarray(self.rng.uniform(low=-l, high=l,
                                                size=(psize, )),
                               dtype=theano.config.floatX)
        if context_name == 'cpu':
            params_cudnn = self.add_param(
                self.create_bias(psize, name='cudnn_%s' % self.name))
        else:
            params_cudnn = self.add_param(
                gpuarray_shared_constructor(pvalue,
                                            target=context_name,
                                            name='cudnn_%s' % self.name))
        c_init = cpu_contiguous(
            T.alloc(numpy.cast[theano.config.floatX](0), num_layers,
                    X.shape[1], self.attrs['n_out']))
        h_init = cpu_contiguous(
            T.alloc(numpy.cast[theano.config.floatX](0), num_layers,
                    X.shape[1], self.attrs['n_out']))

        W_out = self.add_param(
            self.create_random_uniform_weights(
                self.attrs['n_out'], self.y_in[self.attrs['target']].n_out))
        b_out = self.add_param(
            self.create_bias(self.y_in[self.attrs['target']].n_out))

        if context_name == 'cpu':
            self.cost_val = T.constant(0)
            self.error_val = T.constant(0)
            self.known_grads = {}
            return

        out = rnnb.apply(params_cudnn, X, h_init, c_init)[0]
        out = out[::-1]
        out = T.dot(out, W_out) + b_out
        self.y_m = out.reshape((out.shape[0] * out.shape[1], out.shape[2]))

        self.i = (self.index.flatten() > 0).nonzero()
        self.y_data_flat = self.y_in[self.attrs['target']].flatten()
        nll, _ = T.nnet.crossentropy_softmax_1hot(
            x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
        self.cost_val = T.sum(nll)

        #self.cost_val = -T.sum(T.log(out[:,self.y_in[self.attrs['target']].flatten()][(self.index.flatten()>0).nonzero()]))
        self.known_grads = {params_cudnn: T.grad(self.cost_val, params_cudnn)}
        self.output = out
        self.index = self.sources[0].index

        self.error_val = T.sum(
            T.neq(T.argmax(self.y_m[self.i], axis=-1),
                  self.y_data_flat[self.i]))
Example #10
hidden_size = args.hidden_size
n_batch = args.n_batch

print(args)

X = T.tensor3('X', dtype='float32')
h0 = T.tensor3('h0', dtype='float32')
Y = T.tensor3('Y', dtype='float32')

x_val = np.random.randn(seq_len, batch_size, input_size).astype('float32')
y_val = np.random.randn(seq_len, batch_size, hidden_size * 2).astype('float32')
h0_val = np.random.randn(2, batch_size, hidden_size).astype('float32')

rnnb = dnn.RNNBlock(dtype=theano.config.floatX,
                    hidden_size=hidden_size,
                    num_layers=1,
                    rnn_mode='gru',
                    direction_mode='bidirectional')

psize = rnnb.get_param_size((50, input_size))

params_cudnn = gpuarray_shared_constructor(
    np.zeros(psize, dtype=theano.config.floatX))

outp, h = rnnb.apply(params_cudnn, X, h0)

cost = T.mean((Y - outp)**2)
grads = T.grad(cost, params_cudnn)

print("CuDNN bidirectional GRU")
#---------------Compile time test-----------------#