def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    # embedding to gates transformation weights, biases
    W = numpy.concatenate(
        [norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[pp(prefix, 'W')] = W
    params[pp(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')

    # recurrent transformation weights for gates
    U = numpy.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[pp(prefix, 'U')] = U

    # embedding to hidden state proposal weights, biases
    Wx = norm_weight(nin, dim)
    params[pp(prefix, 'Wx')] = Wx
    params[pp(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')

    # recurrent transformation weights for hidden state proposal
    Ux = ortho_weight(dim)
    params[pp(prefix, 'Ux')] = Ux

    return params
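param_init_gru relies on three dl4mt-style helpers that are not shown in this excerpt: ortho_weight, norm_weight and pp. The following is a minimal sketch of plausible definitions plus a small usage example; the helper bodies are assumptions made only to make the parameter shapes concrete, and the real implementations (e.g. their scaling or orthogonal fallback behaviour) may differ.

import numpy

def ortho_weight(ndim):
    # Square orthogonal matrix obtained from the SVD of a random Gaussian (assumed definition).
    W = numpy.random.randn(ndim, ndim)
    u, _, _ = numpy.linalg.svd(W)
    return u.astype('float32')

def norm_weight(nin, nout=None, scale=0.01):
    # Scaled Gaussian matrix; square when nout is omitted (assumed definition).
    if nout is None:
        nout = nin
    return (scale * numpy.random.randn(nin, nout)).astype('float32')

def pp(prefix, name):
    # Parameter naming helper, e.g. pp('encoder', 'W') -> 'encoder_W' (assumed definition).
    return '%s_%s' % (prefix, name)

# Example: 500-dim embeddings, 1024-dim hidden state.
params = param_init_gru({'dim_proj': 1024}, {}, prefix='encoder', nin=500)
assert params['encoder_W'].shape == (500, 2048)    # update and reset gates
assert params['encoder_Ux'].shape == (1024, 1024)  # hidden-state proposal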
def __init__(self,
             input_size,
             state_size,
             batch_size,
             use_layer_norm=False,
             nematus_compat=False,
             dropout_input=None,
             dropout_state=None):
    self.state_to_gates = tf.Variable(
        numpy.concatenate(
            [ortho_weight(state_size), ortho_weight(state_size)], axis=1),
        name='state_to_gates')
    self.input_to_gates = tf.Variable(
        numpy.concatenate(
            [norm_weight(input_size, state_size),
             norm_weight(input_size, state_size)], axis=1),
        name='input_to_gates')
    self.gates_bias = tf.Variable(
        numpy.zeros((2 * state_size,)).astype('float32'),
        name='gates_bias')
    self.state_to_proposal = tf.Variable(
        ortho_weight(state_size), name='state_to_proposal')
    self.input_to_proposal = tf.Variable(
        norm_weight(input_size, state_size), name='input_to_proposal')
    self.proposal_bias = tf.Variable(
        numpy.zeros((state_size,)).astype('float32'),
        name='proposal_bias')

    self.nematus_compat = nematus_compat
    self.use_layer_norm = use_layer_norm

    if self.use_layer_norm:
        with tf.name_scope('gates_x_norm'):
            self.gates_x_norm = LayerNormLayer(2 * state_size)
        with tf.name_scope('gates_state_norm'):
            self.gates_state_norm = LayerNormLayer(2 * state_size)
        with tf.name_scope('proposal_x_norm'):
            self.proposal_x_norm = LayerNormLayer(state_size)
        with tf.name_scope('proposal_state_norm'):
            self.proposal_state_norm = LayerNormLayer(state_size)

    # Create dropout masks for input values (reused at every timestep).
    if dropout_input is None:
        self.dropout_mask_input_to_gates = None
        self.dropout_mask_input_to_proposal = None
    else:
        ones = tf.ones([batch_size, input_size])
        self.dropout_mask_input_to_gates = dropout_input(ones)
        self.dropout_mask_input_to_proposal = dropout_input(ones)

    # Create dropout masks for state values (reused at every timestep).
    if dropout_state is None:
        self.dropout_mask_state_to_gates = None
        self.dropout_mask_state_to_proposal = None
    else:
        ones = tf.ones([batch_size, state_size])
        self.dropout_mask_state_to_gates = dropout_state(ones)
        self.dropout_mask_state_to_proposal = dropout_state(ones)
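The dropout_input and dropout_state arguments above are expected to be callables that map a tensor of ones to a dropout mask, which the constructor caches and reuses at every timestep. A hedged sketch of how such callables might be supplied (TF1-style API, matching the tf.Variable/tf.name_scope usage above); the class name in the commented call is only illustrative, since the enclosing class is not shown in this excerpt.

import tensorflow as tf

def make_dropout_fn(keep_prob):
    # Returns a callable that turns a ones tensor into a scaled dropout mask.
    return lambda x: tf.nn.dropout(x, keep_prob=keep_prob)

# Hypothetical construction (class name assumed, not shown above):
# cell = GRUStep(input_size=512, state_size=1024, batch_size=80,
#                use_layer_norm=True,
#                dropout_input=make_dropout_fn(0.8),
#                dropout_state=make_dropout_fn(0.8))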
def __init__(self,
             input_size,
             state_size,
             batch_size,
             use_layer_norm=False,
             legacy_bias_type=LegacyBiasType.NEMATUS_COMPAT_FALSE,
             dropout_input=None,
             dropout_state=None):
    init = tf.concat([
        initializers.ortho_weight(state_size),
        initializers.ortho_weight(state_size)
    ], axis=1)
    self.state_to_gates = tf.get_variable('state_to_gates',
                                          initializer=init)
    if input_size > 0:
        init = tf.concat([
            initializers.norm_weight(input_size, state_size),
            initializers.norm_weight(input_size, state_size)
        ], axis=1)
        self.input_to_gates = tf.get_variable('input_to_gates',
                                              initializer=init)
    if input_size == 0 and legacy_bias_type == LegacyBiasType.NEMATUS_COMPAT_FALSE:
        self.gates_bias = None
    else:
        self.gates_bias = tf.get_variable('gates_bias', [2 * state_size],
                                          initializer=tf.zeros_initializer)

    init = initializers.ortho_weight(state_size)
    self.state_to_proposal = tf.get_variable('state_to_proposal',
                                             initializer=init)
    if input_size > 0:
        init = initializers.norm_weight(input_size, state_size)
        self.input_to_proposal = tf.get_variable('input_to_proposal',
                                                 initializer=init)
    if input_size == 0 and legacy_bias_type == LegacyBiasType.NEMATUS_COMPAT_FALSE:
        self.proposal_bias = None
    else:
        self.proposal_bias = tf.get_variable(
            'proposal_bias', [state_size],
            initializer=tf.zeros_initializer)

    self.legacy_bias_type = legacy_bias_type
    self.use_layer_norm = use_layer_norm

    self.gates_state_norm = None
    self.proposal_state_norm = None
    self.gates_x_norm = None
    self.proposal_x_norm = None
    if self.use_layer_norm:
        with tf.variable_scope('gates_state_norm'):
            self.gates_state_norm = LayerNormLayer(2 * state_size)
        with tf.variable_scope('proposal_state_norm'):
            self.proposal_state_norm = LayerNormLayer(state_size)
        if input_size > 0:
            with tf.variable_scope('gates_x_norm'):
                self.gates_x_norm = LayerNormLayer(2 * state_size)
            with tf.variable_scope('proposal_x_norm'):
                self.proposal_x_norm = LayerNormLayer(state_size)

    # Create dropout masks for input values (reused at every timestep).
    if dropout_input is None:
        self.dropout_mask_input_to_gates = None
        self.dropout_mask_input_to_proposal = None
    else:
        ones = tf.ones([batch_size, input_size])
        self.dropout_mask_input_to_gates = dropout_input(ones)
        self.dropout_mask_input_to_proposal = dropout_input(ones)

    # Create dropout masks for state values (reused at every timestep).
    if dropout_state is None:
        self.dropout_mask_state_to_gates = None
        self.dropout_mask_state_to_proposal = None
    else:
        ones = tf.ones([batch_size, state_size])
        self.dropout_mask_state_to_gates = dropout_state(ones)
        self.dropout_mask_state_to_proposal = dropout_state(ones)
def __init__(self,
             input_size,
             state_size,
             batch_size,
             use_layer_norm=False,
             legacy_bias_type=LegacyBiasType.NEMATUS_COMPAT_FALSE,
             dropout_input=None,
             dropout_state=None):
    init = tf.concat([initializers.ortho_weight(state_size),
                      initializers.ortho_weight(state_size)], axis=1)
    self.state_to_gates = tf.get_variable('state_to_gates',
                                          initializer=init)
    if input_size > 0:
        init = tf.concat([initializers.norm_weight(input_size, state_size),
                          initializers.norm_weight(input_size, state_size)],
                         axis=1)
        self.input_to_gates = tf.get_variable('input_to_gates',
                                              initializer=init)
    if input_size > 0 or legacy_bias_type == LegacyBiasType.THEANO_A:
        self.gates_bias = tf.get_variable('gates_bias', [2 * state_size],
                                          initializer=tf.zeros_initializer)
    else:
        self.gates_bias = None

    init = initializers.ortho_weight(state_size)
    self.state_to_proposal = tf.get_variable('state_to_proposal',
                                             initializer=init)
    if input_size > 0:
        init = initializers.norm_weight(input_size, state_size)
        self.input_to_proposal = tf.get_variable('input_to_proposal',
                                                 initializer=init)
    if input_size > 0 or legacy_bias_type == LegacyBiasType.THEANO_A:
        self.proposal_bias = tf.get_variable('proposal_bias', [state_size],
                                             initializer=tf.zeros_initializer)
    else:
        self.proposal_bias = None

    self.legacy_bias_type = legacy_bias_type
    self.use_layer_norm = use_layer_norm

    self.gates_state_norm = None
    self.proposal_state_norm = None
    self.gates_x_norm = None
    self.proposal_x_norm = None
    if self.use_layer_norm:
        with tf.variable_scope('gates_state_norm'):
            self.gates_state_norm = LayerNormLayer(2 * state_size)
        with tf.variable_scope('proposal_state_norm'):
            self.proposal_state_norm = LayerNormLayer(state_size)
        if input_size > 0:
            with tf.variable_scope('gates_x_norm'):
                self.gates_x_norm = LayerNormLayer(2 * state_size)
            with tf.variable_scope('proposal_x_norm'):
                self.proposal_x_norm = LayerNormLayer(state_size)

    # Create dropout masks for input values (reused at every timestep).
    if dropout_input is None:
        self.dropout_mask_input_to_gates = None
        self.dropout_mask_input_to_proposal = None
    else:
        ones = tf.ones([batch_size, input_size])
        self.dropout_mask_input_to_gates = dropout_input(ones)
        self.dropout_mask_input_to_proposal = dropout_input(ones)

    # Create dropout masks for state values (reused at every timestep).
    if dropout_state is None:
        self.dropout_mask_state_to_gates = None
        self.dropout_mask_state_to_proposal = None
    else:
        ones = tf.ones([batch_size, state_size])
        self.dropout_mask_state_to_gates = dropout_state(ones)
        self.dropout_mask_state_to_proposal = dropout_state(ones)
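Both constructor variants above reference a LegacyBiasType that is not defined in this excerpt. A minimal sketch consistent with the comparisons used above follows; only the two members actually referenced are included, and the real definition may contain additional members or use a different representation.

import enum

class LegacyBiasType(enum.Enum):
    # Member names taken from the checks in the constructors above;
    # the real definition may include further members (assumption).
    THEANO_A = 1
    NEMATUS_COMPAT_FALSE = 2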
def param_init_gru_cond(options, params, prefix='gru_cond',
                        nin=None, dim=None, dimctx=None,
                        nin_nonlin=None, dim_nonlin=None):
    if nin is None:
        nin = options['dim']
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']
    if nin_nonlin is None:
        nin_nonlin = nin
    if dim_nonlin is None:
        dim_nonlin = dim

    # embedding to gates transformation weights, biases
    W = numpy.concatenate(
        [norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[pp(prefix, 'W')] = W
    params[pp(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')

    # recurrent transformation weights for gates
    U = numpy.concatenate(
        [ortho_weight(dim_nonlin), ortho_weight(dim_nonlin)], axis=1)
    params[pp(prefix, 'U')] = U

    # embedding to hidden state proposal weights, biases
    Wx = norm_weight(nin_nonlin, dim_nonlin)
    params[pp(prefix, 'Wx')] = Wx
    Ux = ortho_weight(dim_nonlin)
    params[pp(prefix, 'Ux')] = Ux
    params[pp(prefix, 'bx')] = numpy.zeros((dim_nonlin,)).astype('float32')

    # second (post-attention) GRU transition: gates
    U_nl = numpy.concatenate(
        [ortho_weight(dim_nonlin), ortho_weight(dim_nonlin)], axis=1)
    params[pp(prefix, 'U_nl')] = U_nl
    params[pp(prefix, 'b_nl')] = numpy.zeros((2 * dim_nonlin,)).astype('float32')

    # second (post-attention) GRU transition: hidden state proposal
    Ux_nl = ortho_weight(dim_nonlin)
    params[pp(prefix, 'Ux_nl')] = Ux_nl
    params[pp(prefix, 'bx_nl')] = numpy.zeros((dim_nonlin,)).astype('float32')

    # context to GRU gates and hidden state proposal
    Wc = norm_weight(dimctx, dim * 2)
    params[pp(prefix, 'Wc')] = Wc
    Wcx = norm_weight(dimctx, dim)
    params[pp(prefix, 'Wcx')] = Wcx

    # attention: combined -> hidden
    W_comb_att = norm_weight(dim, dimctx)
    params[pp(prefix, 'W_comb_att')] = W_comb_att

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx)
    params[pp(prefix, 'Wc_att')] = Wc_att

    # attention: hidden bias
    b_att = numpy.zeros((dimctx,)).astype('float32')
    params[pp(prefix, 'b_att')] = b_att

    # attention: hidden -> score
    U_att = norm_weight(dimctx, 1)
    params[pp(prefix, 'U_att')] = U_att
    c_att = numpy.zeros((1,)).astype('float32')
    params[pp(prefix, 'c_tt')] = c_att

    return params
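As a shape check, the following usage sketch assumes the same helper definitions sketched after param_init_gru above. With a 1000-dim decoder state, 500-dim target embeddings and a 2000-dim context, the conditional GRU and attention parameters come out as follows:

options = {'dim': 1000}
params = param_init_gru_cond(options, {}, prefix='decoder',
                             nin=500, dimctx=2000)
assert params['decoder_Wc'].shape == (2000, 2000)           # context -> gates
assert params['decoder_Wcx'].shape == (2000, 1000)          # context -> proposal
assert params['decoder_W_comb_att'].shape == (1000, 2000)   # state -> attention hidden
assert params['decoder_Wc_att'].shape == (2000, 2000)       # context -> attention hidden
assert params['decoder_U_att'].shape == (2000, 1)           # attention hidden -> score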