Example #1
    def __init__(self,
                 input_size,
                 state_size,
                 batch_size,
                 use_layer_norm=False,
                 nematus_compat=False,
                 dropout_input=None,
                 dropout_state=None):
        self.state_to_gates = tf.Variable(
            numpy.concatenate([ortho_weight(state_size),
                               ortho_weight(state_size)], axis=1),
            name='state_to_gates')
        self.input_to_gates = tf.Variable(
            numpy.concatenate([norm_weight(input_size, state_size),
                               norm_weight(input_size, state_size)], axis=1),
            name='input_to_gates')
        self.gates_bias = tf.Variable(
            numpy.zeros((2 * state_size,)).astype('float32'),
            name='gates_bias')

        self.state_to_proposal = tf.Variable(ortho_weight(state_size),
                                             name='state_to_proposal')
        self.input_to_proposal = tf.Variable(
            norm_weight(input_size, state_size),
            name='input_to_proposal')
        self.proposal_bias = tf.Variable(
            numpy.zeros((state_size,)).astype('float32'),
            name='proposal_bias')
        self.nematus_compat = nematus_compat
        self.use_layer_norm = use_layer_norm

        if self.use_layer_norm:
            with tf.name_scope('gates_x_norm'):
                self.gates_x_norm = LayerNormLayer(2 * state_size)
            with tf.name_scope('gates_state_norm'):
                self.gates_state_norm = LayerNormLayer(2 * state_size)
            with tf.name_scope('proposal_x_norm'):
                self.proposal_x_norm = LayerNormLayer(state_size)
            with tf.name_scope('proposal_state_norm'):
                self.proposal_state_norm = LayerNormLayer(state_size)

        # Create dropout masks for input values (reused at every timestep).
        if dropout_input is None:
            self.dropout_mask_input_to_gates = None
            self.dropout_mask_input_to_proposal = None
        else:
            ones = tf.ones([batch_size, input_size])
            self.dropout_mask_input_to_gates = dropout_input(ones)
            self.dropout_mask_input_to_proposal = dropout_input(ones)

        # Create dropout masks for state values (reused at every timestep).
        if dropout_state is None:
            self.dropout_mask_state_to_gates = None
            self.dropout_mask_state_to_proposal = None
        else:
            ones = tf.ones([batch_size, state_size])
            self.dropout_mask_state_to_gates = dropout_state(ones)
            self.dropout_mask_state_to_proposal = dropout_state(ones)
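
The dropout_input and dropout_state arguments in the recurrent examples are callables that turn a tensor of ones into a per-batch dropout mask; the mask is built once in the constructor and reused at every timestep, so the dropped positions stay fixed across the sequence. A minimal sketch of how such a callable might be built and passed in (the keep probability and helper name are illustrative assumptions; older TensorFlow 1.x releases spell the argument keep_prob instead of rate):

import tensorflow as tf

def make_dropout_fn(keep_prob):
    """Return a callable that samples a scaled binary dropout mask."""
    def dropout_fn(ones):
        # tf.nn.dropout zeroes entries with probability (1 - keep_prob)
        # and rescales survivors by 1 / keep_prob.
        return tf.nn.dropout(ones, rate=1.0 - keep_prob)
    return dropout_fn

# Hypothetical usage with the constructor above:
#   dropout_input=make_dropout_fn(0.8), dropout_state=make_dropout_fn(0.8)
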
Example #2
def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    # embedding to gates transformation weights, biases
    W = numpy.concatenate(
        [norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[pp(prefix, 'W')] = W
    params[pp(prefix, 'b')] = numpy.zeros((2 * dim, )).astype('float32')

    # recurrent transformation weights for gates
    U = numpy.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[pp(prefix, 'U')] = U

    # embedding to hidden state proposal weights, biases
    Wx = norm_weight(nin, dim)
    params[pp(prefix, 'Wx')] = Wx
    params[pp(prefix, 'bx')] = numpy.zeros((dim, )).astype('float32')

    # recurrent transformation weights for hidden state proposal
    Ux = ortho_weight(dim)
    params[pp(prefix, 'Ux')] = Ux

    return params
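
param_init_gru relies on three helpers that are not shown in these snippets: pp, which builds prefixed parameter names, and the norm_weight / ortho_weight initializers. The sketch below follows the conventional dl4mt/Nematus-style implementations, which is an assumption; only the call signatures are taken from the examples.

import numpy

def pp(prefix, name):
    # Prefixed parameter name, e.g. pp('encoder', 'W') -> 'encoder_W'.
    return '%s_%s' % (prefix, name)

def ortho_weight(ndim):
    # Square orthogonal matrix from the SVD of a random Gaussian matrix.
    W = numpy.random.randn(ndim, ndim)
    u, _, _ = numpy.linalg.svd(W)
    return u.astype('float32')

def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    # Scaled Gaussian weights; orthogonal when the matrix is square.
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin)
    else:
        W = scale * numpy.random.randn(nin, nout)
    return W.astype('float32')
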
Example #3
    def __init__(self,
                 context,
                 context_state_size,
                 context_mask,
                 state_size,
                 hidden_size,
                 use_layer_norm=False,
                 dropout_context=None,
                 dropout_state=None):
        init = initializers.norm_weight(state_size, hidden_size)
        self.state_to_hidden = tf.compat.v1.get_variable('state_to_hidden',
                                                         initializer=init)
        #TODO: Nematus uses ortho_weight here - important?
        init = initializers.norm_weight(context_state_size, hidden_size)
        self.context_to_hidden = tf.compat.v1.get_variable('context_to_hidden',
                                                           initializer=init)
        self.hidden_bias = tf.compat.v1.get_variable(
            'hidden_bias', [hidden_size], initializer=tf.zeros_initializer)
        init = initializers.norm_weight(hidden_size, 1)
        self.hidden_to_score = tf.compat.v1.get_variable('hidden_to_score',
                                                         initializer=init)
        self.use_layer_norm = use_layer_norm
        if self.use_layer_norm:
            with tf.compat.v1.variable_scope('hidden_context_norm'):
                self.hidden_context_norm = self.use_layer_norm(
                    layer_size=hidden_size)
            with tf.compat.v1.variable_scope('hidden_state_norm'):
                self.hidden_state_norm = self.use_layer_norm(
                    layer_size=hidden_size)
        self.context = context
        self.context_mask = context_mask

        batch_size = tf.shape(input=context)[1]

        # Create a dropout mask for context values (reused at every timestep).
        if dropout_context is None:
            self.dropout_mask_context_to_hidden = None
        else:
            ones = tf.ones([batch_size, context_state_size])
            self.dropout_mask_context_to_hidden = dropout_context(ones)

        # Create a dropout mask for state values (reused at every timestep).
        if dropout_state is None:
            self.dropout_mask_state_to_hidden = None
        else:
            ones = tf.ones([batch_size, state_size])
            self.dropout_mask_state_to_hidden = dropout_state(ones)

        # precompute these activations, they are the same at each step
        # Ideally the compiler would have figured out that too
        context = apply_dropout_mask(context,
                                     self.dropout_mask_context_to_hidden, True)
        self.hidden_from_context = matmul3d(context, self.context_to_hidden)
        self.hidden_from_context += self.hidden_bias
        if self.use_layer_norm:
            self.hidden_from_context = \
                self.hidden_context_norm.forward(self.hidden_from_context)
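
The attention constructors precompute hidden_from_context using two helpers that are not part of the snippet: matmul3d, which multiplies a time-major 3-D tensor by a 2-D weight matrix, and apply_dropout_mask, which applies a per-batch mask (broadcast over the time axis when the input is 3-D). A rough sketch, inferred from the call sites rather than copied from the source:

import tensorflow as tf

def matmul3d(x3d, matrix):
    # x3d: [time, batch, in_dim]; matrix: [in_dim, out_dim].
    shape = tf.shape(x3d)
    x2d = tf.reshape(x3d, [-1, shape[2]])
    y2d = tf.matmul(x2d, matrix)
    return tf.reshape(y2d, [shape[0], shape[1], tf.shape(matrix)[1]])

def apply_dropout_mask(x, mask, input_is_3d=False):
    if mask is None:          # dropout disabled
        return x
    if input_is_3d:
        # mask: [batch, features] -> broadcast over every timestep of x.
        mask = tf.expand_dims(mask, axis=0)
    return x * mask
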
Example #4
    def __init__(self,
                 context,
                 context_state_size,
                 context_mask,
                 state_size,
                 hidden_size,
                 use_layer_norm=False,
                 dropout_context=None,
                 dropout_state=None):
        self.state_to_hidden = tf.Variable(
                                norm_weight(state_size, hidden_size),
                                name='state_to_hidden')
        self.context_to_hidden = tf.Variable( #TODO: Nematus uses ortho_weight here - important?
                                    norm_weight(context_state_size, hidden_size), 
                                    name='context_to_hidden')
        self.hidden_bias = tf.Variable(
                            numpy.zeros((hidden_size,)).astype('float32'),
                            name='hidden_bias')
        self.hidden_to_score = tf.Variable(
                                norm_weight(hidden_size, 1),
                                name='hidden_to_score')
        self.use_layer_norm = use_layer_norm
        if self.use_layer_norm:
            with tf.name_scope('hidden_context_norm'):
                self.hidden_context_norm = LayerNormLayer(layer_size=hidden_size)
            with tf.name_scope('hidden_state_norm'):
                self.hidden_state_norm = LayerNormLayer(layer_size=hidden_size)
        self.context = context
        self.context_mask = context_mask

        batch_size = tf.shape(context)[1]

        # Create a dropout mask for context values (reused at every timestep).
        if dropout_context is None:
            self.dropout_mask_context_to_hidden = None
        else:
            ones = tf.ones([batch_size, context_state_size])
            self.dropout_mask_context_to_hidden = dropout_context(ones)

        # Create a dropout mask for state values (reused at every timestep).
        if dropout_state is None:
            self.dropout_mask_state_to_hidden = None
        else:
            ones = tf.ones([batch_size, state_size])
            self.dropout_mask_state_to_hidden = dropout_state(ones)

        # precompute these activations, they are the same at each step
        # Ideally the compiler would have figured out that too
        context = apply_dropout_mask(context,
                                     self.dropout_mask_context_to_hidden, True)
        self.hidden_from_context = matmul3d(context, self.context_to_hidden)
        self.hidden_from_context += self.hidden_bias
        if self.use_layer_norm:
            self.hidden_from_context = \
                self.hidden_context_norm.forward(self.hidden_from_context, input_is_3d=True)
Example #5
    def __init__(self,
                 context,
                 context_state_size,
                 context_mask,
                 state_size,
                 hidden_size,
                 use_layer_norm=False,
                 dropout_context=None,
                 dropout_state=None):
        init = initializers.norm_weight(state_size, hidden_size)
        self.state_to_hidden = tf.get_variable('state_to_hidden',
                                               initializer=init)
        #TODO: Nematus uses ortho_weight here - important?
        init = initializers.norm_weight(context_state_size, hidden_size)
        self.context_to_hidden = tf.get_variable('context_to_hidden',
                                                 initializer=init)
        self.hidden_bias = tf.get_variable('hidden_bias', [hidden_size],
                                           initializer=tf.zeros_initializer)
        init = initializers.norm_weight(hidden_size, 1)
        self.hidden_to_score = tf.get_variable('hidden_to_score',
                                               initializer=init)
        self.use_layer_norm = use_layer_norm
        if self.use_layer_norm:
            with tf.variable_scope('hidden_context_norm'):
                self.hidden_context_norm = LayerNormLayer(layer_size=hidden_size)
            with tf.variable_scope('hidden_state_norm'):
                self.hidden_state_norm = LayerNormLayer(layer_size=hidden_size)
        self.context = context
        self.context_mask = context_mask

        batch_size = tf.shape(context)[1]

        # Create a dropout mask for context values (reused at every timestep).
        if dropout_context is None:
            self.dropout_mask_context_to_hidden = None
        else:
            ones = tf.ones([batch_size, context_state_size])
            self.dropout_mask_context_to_hidden = dropout_context(ones)

        # Create a dropout mask for state values (reused at every timestep).
        if dropout_state is None:
            self.dropout_mask_state_to_hidden = None
        else:
            ones = tf.ones([batch_size, state_size])
            self.dropout_mask_state_to_hidden = dropout_state(ones)

        # precompute these activations, they are the same at each step
        # Ideally the compiler would have figured out that too
        context = apply_dropout_mask(context,
                                     self.dropout_mask_context_to_hidden, True)
        self.hidden_from_context = matmul3d(context, self.context_to_hidden)
        self.hidden_from_context += self.hidden_bias
        if self.use_layer_norm:
            self.hidden_from_context = \
                self.hidden_context_norm.forward(self.hidden_from_context)
Example #6
 def __init__(self,
              vocabulary_sizes,
              dim_per_factor):
     assert len(vocabulary_sizes) == len(dim_per_factor)
     self.embedding_matrices = [
         tf.Variable(norm_weight(vocab_size, dim), name='embeddings')
             for vocab_size, dim in zip(vocabulary_sizes, dim_per_factor)]
Example #7
 def __init__(self, vocabulary_sizes, dim_per_factor):
     assert len(vocabulary_sizes) == len(dim_per_factor)
     self.embedding_matrices = []
     for i in range(len(vocabulary_sizes)):
         vocab_size, dim = vocabulary_sizes[i], dim_per_factor[i]
         var_name = 'embeddings' if i == 0 else 'embeddings_' + str(i)
         init = initializers.norm_weight(vocab_size, dim)
         matrix = tf.get_variable(var_name, initializer=init)
         self.embedding_matrices.append(matrix)
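
Once the per-factor embedding matrices exist, the usual pattern is to look up each factor separately and concatenate along the feature axis. A hypothetical usage sketch; the factored input layout ([factors, seq_len, batch] integer ids) is an assumption for illustration:

import tensorflow as tf

def lookup_factored_embeddings(embedding_matrices, x):
    # x: int32 ids with shape [factors, seq_len, batch].
    embedded = [tf.nn.embedding_lookup(matrix, x[i])
                for i, matrix in enumerate(embedding_matrices)]
    # Result: [seq_len, batch, sum(dim_per_factor)].
    return tf.concat(embedded, axis=-1)
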
Example #8
 def __init__(self, vocabulary_sizes, dim_per_factor):
     assert len(vocabulary_sizes) == len(dim_per_factor)
     self.embedding_matrices = []
     for i in range(len(vocabulary_sizes)):
         vocab_size, dim = vocabulary_sizes[i], dim_per_factor[i]
         var_name = 'embeddings' if i == 0 else 'embeddings_' + str(i)
         init = initializers.norm_weight(vocab_size, dim)
         matrix = tf.get_variable(var_name, initializer=init)
         self.embedding_matrices.append(matrix)
Example #9
def init_params(options):
    params = OrderedDict()
    
    #embedding
    params['Wemb'] = norm_weight(options['n_words_src'],
                                 options['dim_word'])
    params['Wemb_dec'] = norm_weight(options['n_words_tgt'],
                                     options['dim_word'])
    #encoder: bidirectional RNN
    params = param_init_gru(options,params,
                            prefix='encoder',
                            nin=options['dim_word'],
                            dim=options['dim'])
    params = param_init_gru(options,params,
                            prefix='encoder_r',
                            nin=options['dim_word'],
                            dim=options['dim'])
    ctxdim = 2*options['dim']
    #init state, init cell
    params = param_init_fflayer(options,params,prefix='ff_state',
                                nin=ctxdim,nout=options['dim'])
    #decoder
    params = param_init_gru_cond(options,params,
                                 prefix='decoder',
                                 nin=options['dim_word'],
                                 dim=options['dim'],
                                 dimctx=ctxdim)
    #readout
    params = param_init_fflayer(options,params,prefix='ff_logit_lstm',
                                nin=options['dim'],nout=options['dim_word'],
                                ortho=False)
    params = param_init_fflayer(options,params,prefix='ff_logit_prev',
                                nin=options['dim_word'],
                                nout=options['dim_word'],ortho=False)
    params = param_init_fflayer(options,params,prefix='ff_logit_ctx',
                                nin=ctxdim,nout=options['dim_word'],
                                ortho=False)
    params = param_init_fflayer(options,params,prefix='ff_logit', 
                                 nin=options['dim_word'],
                                 nout=options['n_words_tgt'])
    
    return params
Example #10
def param_init_fflayer(options,
                       params,
                       prefix='ff',
                       nin=None,
                       nout=None,
                       ortho=True):
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[pp(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho)
    params[pp(prefix, 'b')] = numpy.zeros((nout, )).astype('float32')

    return params
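
The parameters created by param_init_fflayer are consumed by a matching feed-forward apply step: a dot product, a bias, and a nonlinearity. An illustrative numpy version of that step, using the pp helper sketched after Example #2 and assuming a tanh activation:

import numpy

def fflayer_apply(params, state_below, prefix='ff', activ=numpy.tanh):
    # state_below: [batch, nin] -> returns [batch, nout].
    W = params[pp(prefix, 'W')]
    b = params[pp(prefix, 'b')]
    return activ(numpy.dot(state_below, W) + b)
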
Example #11
 def __init__(self,
              in_size,
              out_size,
              batch_size,
              non_linearity=tf.nn.tanh,
              W=None,
              use_layer_norm=False,
              dropout_input=None):
     if W is None:
         W = tf.Variable(norm_weight(in_size, out_size), name='W')
     self.W = W
     self.b = tf.Variable(numpy.zeros((out_size,)).astype('float32'), name='b')
     self.non_linearity = non_linearity
     self.use_layer_norm = use_layer_norm
     if use_layer_norm:
         self.layer_norm = LayerNormLayer(layer_size=out_size)
     # Create a dropout mask for input values (reused at every timestep).
     if dropout_input is None:
         self.dropout_mask = None
     else:
         ones = tf.ones([batch_size, in_size])
         self.dropout_mask = dropout_input(ones)
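
Several constructors instantiate LayerNormLayer, whose definition is not shown. A minimal stand-in, assuming standard layer normalization with a learned gain and bias over the feature axis; the variable names, the epsilon, and the use of tf.Variable instead of get_variable are simplifications, not taken from the source:

import tensorflow as tf

class LayerNormLayer(object):
    def __init__(self, layer_size, eps=1e-5):
        self.gain = tf.Variable(tf.ones([layer_size]), name='layer_norm_gain')
        self.bias = tf.Variable(tf.zeros([layer_size]), name='layer_norm_bias')
        self.eps = eps

    def forward(self, x, input_is_3d=False):
        # Normalize over the last (feature) axis; works for 2-D and 3-D inputs.
        mean = tf.reduce_mean(x, axis=-1, keepdims=True)
        var = tf.reduce_mean(tf.square(x - mean), axis=-1, keepdims=True)
        normed = (x - mean) / tf.sqrt(var + self.eps)
        return normed * self.gain + self.bias
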
Example #12
 def __init__(self,
              in_size,
              out_size,
              batch_size,
              non_linearity=tf.nn.tanh,
              W=None,
              use_layer_norm=False,
              dropout_input=None):
     if W is None:
         init = initializers.norm_weight(in_size, out_size)
         W = tf.get_variable('W', initializer=init)
     self.W = W
     self.b = tf.get_variable('b', [out_size],
                              initializer=tf.zeros_initializer)
     self.non_linearity = non_linearity
     self.use_layer_norm = use_layer_norm
     if use_layer_norm:
         self.layer_norm = LayerNormLayer(layer_size=out_size)
     # Create a dropout mask for input values (reused at every timestep).
     if dropout_input is None:
         self.dropout_mask = None
     else:
         ones = tf.ones([batch_size, in_size])
         self.dropout_mask = dropout_input(ones)
Example #13
 def __init__(self,
              in_size,
              out_size,
              batch_size,
              non_linearity=tf.nn.tanh,
              W=None,
              use_layer_norm=False,
              dropout_input=None):
     if W is None:
         init = initializers.norm_weight(in_size, out_size)
         W = tf.get_variable('W', initializer=init)
     self.W = W
     self.b = tf.get_variable('b', [out_size],
                              initializer=tf.zeros_initializer)
     self.non_linearity = non_linearity
     self.use_layer_norm = use_layer_norm
     if use_layer_norm:
         self.layer_norm = LayerNormLayer(layer_size=out_size)
     # Create a dropout mask for input values (reused at every timestep).
     if dropout_input is None:
         self.dropout_mask = None
     else:
         ones = tf.ones([batch_size, in_size])
         self.dropout_mask = dropout_input(ones)
Example #14
    def __init__(self,
                 input_size,
                 state_size,
                 batch_size,
                 use_layer_norm=False,
                 legacy_bias_type=LegacyBiasType.NEMATUS_COMPAT_FALSE,
                 dropout_input=None,
                 dropout_state=None):
        init = tf.concat([initializers.ortho_weight(state_size),
                          initializers.ortho_weight(state_size)],
                         axis=1)
        self.state_to_gates = tf.get_variable('state_to_gates',
                                              initializer=init)
        if input_size > 0:
            init = tf.concat([initializers.norm_weight(input_size, state_size),
                              initializers.norm_weight(input_size, state_size)],
                             axis=1)
            self.input_to_gates = tf.get_variable('input_to_gates',
                                                  initializer=init)

        if input_size == 0 and legacy_bias_type == LegacyBiasType.NEMATUS_COMPAT_FALSE:
            self.gates_bias = None
        else:
            self.gates_bias = tf.get_variable('gates_bias', [2 * state_size],
                                              initializer=tf.zeros_initializer)

        init = initializers.ortho_weight(state_size)
        self.state_to_proposal = tf.get_variable('state_to_proposal',
                                                 initializer=init)
        if input_size > 0:
            init = initializers.norm_weight(input_size, state_size)
            self.input_to_proposal = tf.get_variable('input_to_proposal',
                                                     initializer=init)

        if input_size == 0 and legacy_bias_type == LegacyBiasType.NEMATUS_COMPAT_FALSE:
            self.proposal_bias = None
        else:
            self.proposal_bias = tf.get_variable(
                'proposal_bias', [state_size],
                initializer=tf.zeros_initializer)

        self.legacy_bias_type = legacy_bias_type
        self.use_layer_norm = use_layer_norm

        self.gates_state_norm = None
        self.proposal_state_norm = None
        self.gates_x_norm = None
        self.proposal_x_norm = None
        if self.use_layer_norm:
            with tf.variable_scope('gates_state_norm'):
                self.gates_state_norm = LayerNormLayer(2 * state_size)
            with tf.variable_scope('proposal_state_norm'):
                self.proposal_state_norm = LayerNormLayer(state_size)
            if input_size > 0:
                with tf.variable_scope('gates_x_norm'):
                    self.gates_x_norm = LayerNormLayer(2 * state_size)
                with tf.variable_scope('proposal_x_norm'):
                    self.proposal_x_norm = LayerNormLayer(state_size)

        # Create dropout masks for input values (reused at every timestep).
        if dropout_input is None:
            self.dropout_mask_input_to_gates = None
            self.dropout_mask_input_to_proposal = None
        else:
            ones = tf.ones([batch_size, input_size])
            self.dropout_mask_input_to_gates = dropout_input(ones)
            self.dropout_mask_input_to_proposal = dropout_input(ones)

        # Create dropout masks for state values (reused at every timestep).
        if dropout_state is None:
            self.dropout_mask_state_to_gates = None
            self.dropout_mask_state_to_proposal = None
        else:
            ones = tf.ones([batch_size, state_size])
            self.dropout_mask_state_to_gates = dropout_state(ones)
            self.dropout_mask_state_to_proposal = dropout_state(ones)
Example #15
 def __init__(self, vocabulary_size, dim_per_factor):
     self.embedding_matrices = [
         tf.Variable(norm_weight(vocabulary_size, dim), name='embeddings')
         for dim in dim_per_factor
     ]
Example #16
def init_params(options):
    params = OrderedDict()

    # embedding
    for factor in range(options['factors']):
        params[embedding_name(factor)] = norm_weight(
            options['n_words_src'], options['dim_per_factor'][factor])

    params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word'])

    # encoder: bidirectional RNN
    params = get_layer_param(options['encoder'])(options,
                                                 params,
                                                 prefix='encoder',
                                                 nin=options['dim_word'],
                                                 dim=options['dim'])
    params = get_layer_param(options['encoder'])(options,
                                                 params,
                                                 prefix='encoder_r',
                                                 nin=options['dim_word'],
                                                 dim=options['dim'])
    ctxdim = 2 * options['dim']

    # init_state, init_cell
    params = get_layer_param('ff')(options,
                                   params,
                                   prefix='ff_state',
                                   nin=ctxdim,
                                   nout=options['dim'])
    # decoder
    params = get_layer_param(options['decoder'])(options,
                                                 params,
                                                 prefix='decoder',
                                                 nin=options['dim_word'],
                                                 dim=options['dim'],
                                                 dimctx=ctxdim)
    # readout
    params = get_layer_param('ff')(options,
                                   params,
                                   prefix='ff_logit_lstm',
                                   nin=options['dim'],
                                   nout=options['dim_word'],
                                   ortho=False)
    params = get_layer_param('ff')(options,
                                   params,
                                   prefix='ff_logit_prev',
                                   nin=options['dim_word'],
                                   nout=options['dim_word'],
                                   ortho=False)
    params = get_layer_param('ff')(options,
                                   params,
                                   prefix='ff_logit_ctx',
                                   nin=ctxdim,
                                   nout=options['dim_word'],
                                   ortho=False)
    params = get_layer_param('ff')(options,
                                   params,
                                   prefix='ff_logit',
                                   nin=options['dim_word'],
                                   nout=options['n_words'])

    return params
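
init_params in Example #16 uses two small dispatch helpers that are not shown: embedding_name, which names the per-factor source embeddings, and get_layer_param, which maps a layer type string to its param_init_* function. A plausible sketch, inferred from the call sites (the exact naming scheme and table contents are assumptions):

def embedding_name(factor):
    # First factor keeps the historical name 'Wemb'; later factors get a suffix.
    return 'Wemb' if factor == 0 else 'Wemb%d' % factor

def get_layer_param(name):
    # Map a layer type (e.g. 'gru', 'gru_cond', 'ff') to its init function.
    table = {'ff': param_init_fflayer,
             'gru': param_init_gru,
             'gru_cond': param_init_gru_cond}
    return table[name]
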
Example #17
 def __init__(self,
              vocabulary_size,
              embedding_size):
     self.embeddings = tf.Variable(norm_weight(vocabulary_size, embedding_size),
                                   name='embeddings')
Example #18
    def __init__(self, 
                 input_size, 
                 state_size,
                 batch_size,
                 use_layer_norm=False,
                 legacy_bias_type=LegacyBiasType.NEMATUS_COMPAT_FALSE,
                 dropout_input=None,
                 dropout_state=None):
        init = tf.concat([initializers.ortho_weight(state_size),
                          initializers.ortho_weight(state_size)],
                         axis=1)
        self.state_to_gates = tf.get_variable('state_to_gates',
                                              initializer=init)
        if input_size > 0:
            init = tf.concat([initializers.norm_weight(input_size, state_size),
                              initializers.norm_weight(input_size, state_size)],
                             axis=1)
            self.input_to_gates = tf.get_variable('input_to_gates',
                                                  initializer=init)

        if input_size > 0 or legacy_bias_type == LegacyBiasType.THEANO_A:
            self.gates_bias = tf.get_variable('gates_bias', [2*state_size],
                                          initializer=tf.zeros_initializer)
        else:
            self.gates_bias = None

        init = initializers.ortho_weight(state_size)
        self.state_to_proposal = tf.get_variable('state_to_proposal',
                                                 initializer=init)
        if input_size > 0:
            init = initializers.norm_weight(input_size, state_size)
            self.input_to_proposal = tf.get_variable('input_to_proposal',
                                                     initializer=init)

        if input_size > 0 or legacy_bias_type == LegacyBiasType.THEANO_A:
            self.proposal_bias = tf.get_variable('proposal_bias', [state_size],
                                             initializer=tf.zeros_initializer)
        else:
            self.proposal_bias = None

        self.legacy_bias_type = legacy_bias_type
        self.use_layer_norm = use_layer_norm

        self.gates_state_norm = None
        self.proposal_state_norm = None
        self.gates_x_norm = None
        self.proposal_x_norm = None
        if self.use_layer_norm:
            with tf.variable_scope('gates_state_norm'):
                self.gates_state_norm = LayerNormLayer(2*state_size)
            with tf.variable_scope('proposal_state_norm'):
                self.proposal_state_norm = LayerNormLayer(state_size)
            if input_size > 0:
                with tf.variable_scope('gates_x_norm'):
                    self.gates_x_norm = LayerNormLayer(2*state_size)
                with tf.variable_scope('proposal_x_norm'):
                    self.proposal_x_norm = LayerNormLayer(state_size)

        # Create dropout masks for input values (reused at every timestep).
        if dropout_input is None:
            self.dropout_mask_input_to_gates = None
            self.dropout_mask_input_to_proposal = None
        else:
            ones = tf.ones([batch_size, input_size])
            self.dropout_mask_input_to_gates = dropout_input(ones)
            self.dropout_mask_input_to_proposal = dropout_input(ones)

        # Create dropout masks for state values (reused at every timestep).
        if dropout_state is None:
            self.dropout_mask_state_to_gates = None
            self.dropout_mask_state_to_proposal = None
        else:
            ones = tf.ones([batch_size, state_size])
            self.dropout_mask_state_to_gates = dropout_state(ones)
            self.dropout_mask_state_to_proposal = dropout_state(ones)
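
Examples #14 and #18 consult input_size and a LegacyBiasType value to decide whether the gate and proposal biases exist at all. The enum itself is not shown; based only on the members referenced here (NEMATUS_COMPAT_FALSE and THEANO_A), a stand-in definition could look like this:

import enum

class LegacyBiasType(enum.Enum):
    # Members referenced in the examples above; the real enum may
    # define additional variants.
    NEMATUS_COMPAT_FALSE = 1
    THEANO_A = 2
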
Example #19
def param_init_gru_cond(options,
                        params,
                        prefix='gru_cond',
                        nin=None,
                        dim=None,
                        dimctx=None,
                        nin_nonlin=None,
                        dim_nonlin=None):
    if nin is None:
        nin = options['dim']
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']
    if nin_nonlin is None:
        nin_nonlin = nin
    if dim_nonlin is None:
        dim_nonlin = dim

    W = numpy.concatenate(
        [norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[pp(prefix, 'W')] = W
    params[pp(prefix, 'b')] = numpy.zeros((2 * dim, )).astype('float32')
    U = numpy.concatenate([ortho_weight(dim_nonlin),
                           ortho_weight(dim_nonlin)],
                          axis=1)
    params[pp(prefix, 'U')] = U

    Wx = norm_weight(nin_nonlin, dim_nonlin)
    params[pp(prefix, 'Wx')] = Wx
    Ux = ortho_weight(dim_nonlin)
    params[pp(prefix, 'Ux')] = Ux
    params[pp(prefix, 'bx')] = numpy.zeros((dim_nonlin, )).astype('float32')

    U_nl = numpy.concatenate(
        [ortho_weight(dim_nonlin),
         ortho_weight(dim_nonlin)], axis=1)
    params[pp(prefix, 'U_nl')] = U_nl
    params[pp(prefix, 'b_nl')] = numpy.zeros(
        (2 * dim_nonlin, )).astype('float32')

    Ux_nl = ortho_weight(dim_nonlin)
    params[pp(prefix, 'Ux_nl')] = Ux_nl
    params[pp(prefix, 'bx_nl')] = numpy.zeros((dim_nonlin, )).astype('float32')

    # context to LSTM
    Wc = norm_weight(dimctx, dim * 2)
    params[pp(prefix, 'Wc')] = Wc

    Wcx = norm_weight(dimctx, dim)
    params[pp(prefix, 'Wcx')] = Wcx

    # attention: combined -> hidden
    W_comb_att = norm_weight(dim, dimctx)
    params[pp(prefix, 'W_comb_att')] = W_comb_att

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx)
    params[pp(prefix, 'Wc_att')] = Wc_att

    # attention: hidden bias
    b_att = numpy.zeros((dimctx, )).astype('float32')
    params[pp(prefix, 'b_att')] = b_att

    # attention:
    U_att = norm_weight(dimctx, 1)
    params[pp(prefix, 'U_att')] = U_att
    c_att = numpy.zeros((1, )).astype('float32')
    params[pp(prefix, 'c_tt')] = c_att

    return params
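
The attention parameters created at the end of param_init_gru_cond (Wc_att, b_att, W_comb_att, U_att and the 'c_tt' bias) are typically combined into alignment scores with a tanh MLP followed by a softmax over source positions. An illustrative numpy sketch, using the pp helper sketched after Example #2; it ignores the source mask and is not taken from the example itself:

import numpy

def attention_weights(params, prev_state, context, prefix='decoder'):
    # prev_state: [batch, dim]; context: [src_len, batch, dimctx].
    pctx = numpy.dot(context, params[pp(prefix, 'Wc_att')]) \
           + params[pp(prefix, 'b_att')]
    pstate = numpy.dot(prev_state, params[pp(prefix, 'W_comb_att')])
    combined = numpy.tanh(pctx + pstate[None, :, :])
    # One scalar score per source position, then a softmax over src_len.
    e = numpy.dot(combined, params[pp(prefix, 'U_att')])[:, :, 0] \
        + params[pp(prefix, 'c_tt')]
    e = numpy.exp(e - e.max(axis=0, keepdims=True))
    return e / e.sum(axis=0, keepdims=True)
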