def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
    """
    Gated Recurrent Unit (GRU)

    The following equations define the GRU:
        u   = sig(x_t Wu + h_t-1 Uu + bu)
        r   = sig(x_t Wr + h_t-1 Ur + br)
        h   = tanh(x_t Wx + (s_t-1 . r) Ux + bx)
        s_t = (1 - u) . h + u . s_t-1

    Below, some of the parameters are initialized together and later sliced:
        W = [Wu Wr], i.e. the (horizontal) concatenation of Wu and Wr
        b = [bu br]
        U = [Uu Ur]
    """
    if nin is None:
        nin = options['dim_word']
    if dim is None:
        dim = options['dim_proj']

    W = numpy.concatenate([norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')

    U = numpy.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    Wx = norm_weight(nin, dim)
    params[_p(prefix, 'Wx')] = Wx
    Ux = ortho_weight(dim)
    params[_p(prefix, 'Ux')] = Ux
    params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')

    return params

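
# The functions in this collection rely on a handful of small initializer
# helpers (norm_weight, ortho_weight, uniform_weight, zero_bias / zero_vector,
# and the name-joining helpers _p / prfx) that are defined elsewhere.  A
# minimal sketch of plausible definitions, inferred from usage rather than
# taken from this file, is:

import numpy

def _p(prefix, name):
    # Join a layer prefix and a parameter name, e.g. _p('gru', 'W') -> 'gru_W'.
    return '%s_%s' % (prefix, name)

def ortho_weight(ndim):
    # Square orthogonal matrix taken from the SVD of a random Gaussian matrix.
    W = numpy.random.randn(ndim, ndim)
    u, s, v = numpy.linalg.svd(W)
    return u.astype('float32')

def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    # Scaled Gaussian weights; orthogonal initialization when the matrix is square.
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        return ortho_weight(nin)
    return (scale * numpy.random.randn(nin, nout)).astype('float32')

def uniform_weight(nin, nout, scale=0.1):
    # Uniform weights in [-scale, scale] (the exact scale varies by repository).
    return numpy.random.uniform(low=-scale, high=scale, size=(nin, nout)).astype('float32')

def zero_bias(dim):
    # All-zero bias vector; some snippets call the same helper zero_vector.
    return numpy.zeros((dim,)).astype('float32')
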
def param_init_decoder(options, params, prefix='decoder_lstm'):
    n_x = options['n_x']
    n_h = options['n_h']

    W = np.concatenate([uniform_weight(n_x, n_h), uniform_weight(n_x, n_h),
                        uniform_weight(n_x, n_h), uniform_weight(n_x, n_h)], axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(n_h), ortho_weight(n_h),
                        ortho_weight(n_h), ortho_weight(n_h)], axis=1)
    params[_p(prefix, 'U')] = U

    params[_p(prefix, 'b')] = zero_bias(4 * n_h)
    # Set the forget-gate slice of the bias to 3 (high initial forget-gate bias).
    params[_p(prefix, 'b')][n_h:2 * n_h] = 3 * np.ones((n_h,)).astype(theano.config.floatX)

    return params

def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    # embedding to gates transformation weights, biases
    W = np.concatenate([norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = np.zeros((2 * dim,)).astype('float32')

    # recurrent transformation weights for gates
    U = np.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # embedding to hidden state proposal weights, biases
    Wx = norm_weight(nin, dim)
    params[_p(prefix, 'Wx')] = Wx
    params[_p(prefix, 'bx')] = np.zeros((dim,)).astype('float32')

    # recurrent transformation weights for hidden state proposal
    Ux = ortho_weight(dim)
    params[_p(prefix, 'Ux')] = Ux

    return params

def param_init_lstm(options, params, prefix='lstm', nin=None, dim=None):
    """
    Init the LSTM parameters
    """
    assert nin is not None and dim is not None

    # input to hidden weights
    W = numpy.concatenate([norm_weight(nin, dim), norm_weight(nin, dim),
                           norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    # hidden to hidden (recurrent) weights
    U = numpy.concatenate([ortho_weight(dim), ortho_weight(dim),
                           ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # biases
    b = numpy.zeros((4 * dim,))
    params[_p(prefix, 'b')] = b.astype(theano.config.floatX)

    return params

def param_init_decoder(options, params, prefix='decoder_gru'):
    n_x = options['n_x']
    n_h = options['n_h']

    W = np.concatenate([uniform_weight(n_x, n_h), uniform_weight(n_x, n_h)], axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(n_h), ortho_weight(n_h)], axis=1)
    params[_p(prefix, 'U')] = U
    params[_p(prefix, 'b')] = zero_bias(2 * n_h)

    Wx = uniform_weight(n_x, n_h)
    params[_p(prefix, 'Wx')] = Wx
    Ux = ortho_weight(n_h)
    params[_p(prefix, 'Ux')] = Ux
    params[_p(prefix, 'bx')] = zero_bias(n_h)

    params[_p(prefix, 'b0')] = zero_bias(n_h)

    return params

def param_init_lstm_concat(self, options, params, nin, dim, dimctx, prefix='lstm_concat'):
    # input to LSTM
    W = np.concatenate([norm_weight(nin, dim), norm_weight(nin, dim),
                        norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    # LSTM to LSTM
    U = np.concatenate([ortho_weight(dim), ortho_weight(dim),
                        ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # bias to LSTM
    params[_p(prefix, 'b')] = np.zeros((4 * dim,)).astype('float32')

    # context to LSTM
    Wc = norm_weight(dimctx, dim * 4)
    params[_p(prefix, 'Wc')] = Wc

    if options['selector']:
        # attention: selector
        W_sel = norm_weight(dim, 1)
        params[_p(prefix, 'W_sel')] = W_sel
        b_sel = np.float32(0.)
        params[_p(prefix, 'b_sel')] = b_sel

    return params

def param_init(options, params, nin, dim, dimctx, prefix='lstm_cond'):
    # input to LSTM
    W = np.concatenate([norm_weight(nin, dim), norm_weight(nin, dim),
                        norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    # ctx to LSTM
    V = np.concatenate([norm_weight(dimctx, dim), norm_weight(dimctx, dim),
                        norm_weight(dimctx, dim), norm_weight(dimctx, dim)], axis=1)
    params[_p(prefix, 'V')] = V

    # LSTM to LSTM
    U = np.concatenate([ortho_weight(dim), ortho_weight(dim),
                        ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    params[_p(prefix, 'b')] = np.zeros((4 * dim,)).astype('float32')

    return params

def param_init_encoder(options, params, prefix='lstm_encoder'):
    n_x = options['n_x']
    n_h = options['n_h']

    W = np.concatenate([uniform_weight(n_x, n_h), uniform_weight(n_x, n_h),
                        uniform_weight(n_x, n_h), uniform_weight(n_x, n_h)], axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(n_h), ortho_weight(n_h),
                        ortho_weight(n_h), ortho_weight(n_h)], axis=1)
    params[_p(prefix, 'U')] = U

    params[_p(prefix, 'b')] = zero_bias(4 * n_h)

    # It is observed that setting a high initial forget gate bias for LSTMs can
    # give slightly better results (Le et al., 2015). Hence, the initial forget
    # gate bias is set to 3.
    params[_p(prefix, 'b')][n_h:2 * n_h] = 3 * np.ones((n_h,)).astype(theano.config.floatX)

    return params

def param_init_encoder(options, params, prefix='lstm_encoder'):
    n_x = options['n_x']
    n_h = options['n_h']

    W = np.concatenate([uniform_weight(n_x, n_h), uniform_weight(n_x, n_h),
                        uniform_weight(n_x, n_h), uniform_weight(n_x, n_h)], axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(n_h), ortho_weight(n_h),
                        ortho_weight(n_h), ortho_weight(n_h)], axis=1)
    params[_p(prefix, 'U')] = U

    params[_p(prefix, 'b')] = zero_bias(4 * n_h)

    # It is observed that setting a high initial forget gate bias for LSTMs can
    # give slightly better results (Le et al., 2015). Hence, the initial forget
    # gate bias is set to 3.
    params[_p(prefix, 'b')][n_h:2 * n_h] = 3 * np.ones((n_h,)).astype(theano.config.floatX)

    return params

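
# The slice [n_h:2*n_h] above picks out the forget-gate block because the four
# gate matrices are stacked side by side; note that the stacking order differs
# between snippets in this collection (e.g. [Wi Wf Wc Wo] vs. [Wi Wc Wf Wo]).
# Downstream layer code usually extracts one gate's block with a helper along
# these lines (a sketch of the commonly used version, not defined in this file):

def _slice(_x, n, dim):
    # Return the n-th dim-wide block along the last axis of a stacked
    # pre-activation; e.g. n=1 selects the second (here: forget) gate block.
    if _x.ndim == 3:
        return _x[:, :, n * dim:(n + 1) * dim]
    return _x[:, n * dim:(n + 1) * dim]
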
def param_init_decoder(options, params, prefix='decoder_gru'):
    n_x = options['n_x']
    n_h = options['n_h']

    W = np.concatenate([uniform_weight(n_x, n_h), uniform_weight(n_x, n_h)], axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(n_h), ortho_weight(n_h)], axis=1)
    params[_p(prefix, 'U')] = U
    params[_p(prefix, 'b')] = zero_bias(2 * n_h)

    Wx = uniform_weight(n_x, n_h)
    params[_p(prefix, 'Wx')] = Wx
    Ux = ortho_weight(n_h)
    params[_p(prefix, 'Ux')] = Ux
    params[_p(prefix, 'bx')] = zero_bias(n_h)

    params[_p(prefix, 'b0')] = zero_bias(n_h)

    return params

def param_init_lstm(options, params, prefix='lstm', nin=None, dim=None):
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    W = numpy.concatenate([norm_weight(nin, dim), norm_weight(nin, dim),
                           norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[prfx(prefix, 'W')] = W

    U = numpy.concatenate([ortho_weight(dim), ortho_weight(dim),
                           ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[prfx(prefix, 'U')] = U

    params[prfx(prefix, 'b')] = numpy.zeros((4 * dim,)).astype('float32')

    return params

def param_init_lstm_cond(self, options, params, prefix='lstm_cond',
                         nin=None, dim=None, dimctx=None):
    # nin=512 dim=512 dimctx=2048
    if nin is None:
        nin = options['word_dim']
    if dim is None:
        dim = options['lstm_dim']
    if dimctx is None:
        dimctx = options['ctx_dim']

    # input to LSTM
    W = np.concatenate([norm_weight(nin, dim), norm_weight(nin, dim),
                        norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W  # bo_lstm_W: (512, 2048)

    # LSTM to LSTM
    U = np.concatenate([ortho_weight(dim), ortho_weight(dim),
                        ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U  # bo_lstm_U: (512, 2048)

    # bias to LSTM
    params[_p(prefix, 'b')] = np.zeros((4 * dim,)).astype('float32')  # bo_lstm_b: (2048,)

    # attention: context -> hidden
    # Wc_att = norm_weight(dimctx, ortho=False)
    Wc_att = norm_weight(dim, ortho=False)
    params[_p(prefix, 'Wc_att')] = Wc_att  # bo_lstm_Wc_att: (2048, 2048)

    # attention: LSTM -> hidden
    # Wd_att = norm_weight(dim, dimctx)
    Wd_att = norm_weight(dim, dim)
    params[_p(prefix, 'Wd_att')] = Wd_att  # bo_lstm_Wd_att: (512, 2048)

    # attention: hidden bias
    # b_att = np.zeros((dimctx,)).astype('float32')
    b_att = np.zeros((dim,)).astype('float32')
    params[_p(prefix, 'b_att')] = b_att  # bo_lstm_b_att: (2048,)

    # attention:
    # U_att = norm_weight(dimctx, 1)
    U_att = norm_weight(dim, 28)
    params[_p(prefix, 'U_att')] = U_att  # bo_lstm_U_att: (2048, 1)
    c_att = np.zeros((1,)).astype('float32')
    params[_p(prefix, 'c_att')] = c_att  # bo_lstm_c_att: (1,)

    if options['selector']:
        # attention: selector
        W_sel = norm_weight(dim, 1)
        params[_p(prefix, 'W_sel')] = W_sel  # bo_lstm_W_sel: (512, 1)
        b_sel = np.float32(0.)
        params[_p(prefix, 'b_sel')] = b_sel  # bo_lstm_b_sel: 0

    return params

def param_init_lstm_peep(options, params, prefix='lstm', nin=None, dim=None):
    """
    Code based on http://deeplearning.net/tutorial/code/lstm.py and Jamie's GRU code

    Long Short Term Memory Unit (LSTM)

    LSTM is defined by the following equations,
        W = [Wi Wf Wc Wo]   # input weights
        b = [bi bf bc bo]   # biases
        U = [Ui Uf Uc Uo]   # recurrent weights
        Pi Pf Po c_t-1      # peephole params and the previous cell, c_t-1

        i_t = sig(Wi x_t + Ui h_t-1 + Pi c_t-1 + bi)
        f_t = sig(Wf x_t + Uf h_t-1 + Pf c_t-1 + bf)
        c_t = f_t c_t-1 + i_t tanh(Wc x_t + Uc h_t-1 + bc)
        o_t = sig(Wo x_t + Uo h_t-1 + Po c_t-1 + bo)
        h_t = o_t tanh(c_t)
    """
    if nin is None:
        nin = options['dim_word']
    if dim is None:
        dim = options['dim_proj']

    # The input weight matrix is 4x wide: input gate, forget gate, output gate, and cell input.
    W = numpy.concatenate([norm_weight(nin, dim), norm_weight(nin, dim),
                           norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = numpy.zeros((4 * dim,)).astype('float32')

    # The recurrent weight matrix
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim),  # remember this is ortho_weight(dim, dim)
                           ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # Peephole weight vectors, all initialized to zero.
    # Peephole weights are diagonal as in Graves' paper.
    params[_p(prefix, 'Pi')] = numpy.zeros((dim,)).astype('float32')
    params[_p(prefix, 'Pf')] = numpy.zeros((dim,)).astype('float32')
    params[_p(prefix, 'Po')] = numpy.zeros((dim,)).astype('float32')

    # The initial h_0 and cell are created in lstm_layer or passed in.
    # initialize forget gates to one?
    return params

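
# A minimal NumPy sketch (illustration only, not taken from this file) of how a
# single peephole-LSTM step would consume the parameters initialized above,
# following the docstring equations and their [Wi Wf Wc Wo] stacking; x_t is
# assumed to be (batch, nin), h_prev and c_prev are (batch, dim).

def lstm_peep_step(params, prefix, x_t, h_prev, c_prev, dim):
    sigmoid = lambda z: 1. / (1. + numpy.exp(-z))
    # Stacked affine transform, then the four gate blocks are sliced out.
    preact = (x_t.dot(params[_p(prefix, 'W')]) +
              h_prev.dot(params[_p(prefix, 'U')]) +
              params[_p(prefix, 'b')])
    i = sigmoid(preact[:, 0 * dim:1 * dim] + params[_p(prefix, 'Pi')] * c_prev)
    f = sigmoid(preact[:, 1 * dim:2 * dim] + params[_p(prefix, 'Pf')] * c_prev)
    c = f * c_prev + i * numpy.tanh(preact[:, 2 * dim:3 * dim])
    o = sigmoid(preact[:, 3 * dim:4 * dim] + params[_p(prefix, 'Po')] * c_prev)
    h = o * numpy.tanh(c)
    return h, c
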
def param_init_gru(options, param, prefix='gru', nin=None, dim=None):
    param[prefix + '_W'] = numpy.concatenate([norm_weight(nin, dim),
                                              norm_weight(nin, dim)], axis=1)
    param[prefix + '_U'] = numpy.concatenate([ortho_weight(dim),
                                              ortho_weight(dim)], axis=1)
    param[prefix + '_b'] = zero_vector(2 * dim)
    param[prefix + '_Wx'] = norm_weight(nin, dim)
    param[prefix + '_Ux'] = ortho_weight(dim)
    param[prefix + '_bx'] = zero_vector(dim)
    return param

def __init__(self, n_input, n_output):
    """
    :type n_input: int
    :param n_input: dimension of the input vector

    :type n_output: int
    :param n_output: number of hidden units in this layer
    """
    # Parameters of this LSTM layer
    self._name = "attention_blstm_fusion"
    self.n_input = n_input
    self.n_output = n_output

    # Wh = [Wi, Wc, Wf, Wo]
    Wh = np.concatenate([np.random.randn(n_input, n_output).astype(theano.config.floatX),
                         np.random.randn(n_input, n_output).astype(theano.config.floatX),
                         np.random.randn(n_input, n_output).astype(theano.config.floatX),
                         np.random.randn(n_input, n_output).astype(theano.config.floatX)],
                        axis=1)
    self.Wh = theano.shared(value=Wh, name='Wh', borrow=True)

    # Uh = [Ui, Uc, Uf, Uo]
    Uh = np.concatenate([ortho_weight(n_output, n_output),
                         ortho_weight(n_output, n_output),
                         ortho_weight(n_output, n_output),
                         ortho_weight(n_output, n_output)],
                        axis=1)
    self.Uh = theano.shared(value=Uh, name='Uh', borrow=True)

    # bh = [bi, bc, bf, bo]
    bh = np.zeros((n_output * 4,)).astype(theano.config.floatX)
    self.bh = theano.shared(value=bh, name='bh', borrow=True)

    # Parameters of the reverse direction
    Wh_reverse = np.concatenate([np.random.randn(n_input, n_output).astype(theano.config.floatX),
                                 np.random.randn(n_input, n_output).astype(theano.config.floatX),
                                 np.random.randn(n_input, n_output).astype(theano.config.floatX),
                                 np.random.randn(n_input, n_output).astype(theano.config.floatX)],
                                axis=1)
    self.Wh_reverse = theano.shared(value=Wh_reverse, name='Wh_reverse', borrow=True)

    # Uh_reverse = [Ui, Uc, Uf, Uo]
    Uh_reverse = np.concatenate([ortho_weight(n_output, n_output),
                                 ortho_weight(n_output, n_output),
                                 ortho_weight(n_output, n_output),
                                 ortho_weight(n_output, n_output)],
                                axis=1)
    self.Uh_reverse = theano.shared(value=Uh_reverse, name='Uh_reverse', borrow=True)

    # bh_reverse = [bi, bc, bf, bo]
    bh_reverse = np.zeros((n_output * 4,)).astype(theano.config.floatX)
    self.bh_reverse = theano.shared(value=bh_reverse, name='bh_reverse', borrow=True)

    self._output = np.zeros(2, )
    self.params = [self.Wh, self.Uh, self.bh,
                   self.Wh_reverse, self.Uh_reverse, self.bh_reverse]

    # Attention parameters
    Wa = 0.01 * np.random.rand(n_input, n_input).astype(theano.config.floatX)
    Ua = 0.01 * np.random.rand(n_output, n_input).astype(theano.config.floatX)
    Va = 0.01 * np.random.rand(n_input, 1).astype(theano.config.floatX)
    self.Wa = theano.shared(value=Wa, name='Wa', borrow=True)
    self.Ua = theano.shared(value=Ua, name='Ua', borrow=True)
    # self.Va = theano.shared(value=Va, name='Va', borrow=True)
    self.params.extend([self.Wa, self.Ua])  # , self.Va])

def __init__(self, n_input, n_output):
    """
    :type n_input: int
    :param n_input: dimension of the input vector

    :type n_output: int
    :param n_output: number of hidden units in this layer
    """
    # Parameters of this LSTM layer
    self._name = "LSTM_MASK"
    self.n_input = n_input
    self.n_output = n_output

    # Wh = [Wi, Wc, Wf, Wo]
    Wh = np.concatenate([ortho_weight(n_input, n_output),
                         ortho_weight(n_input, n_output),
                         ortho_weight(n_input, n_output),
                         ortho_weight(n_input, n_output)],
                        axis=1)
    self.Wh = theano.shared(value=Wh, name='Wh', borrow=True)

    # Uh = [Ui, Uc, Uf, Uo]
    Uh = np.concatenate([ortho_weight(n_output, n_output),
                         ortho_weight(n_output, n_output),
                         ortho_weight(n_output, n_output),
                         ortho_weight(n_output, n_output)],
                        axis=1)
    self.Uh = theano.shared(value=Uh, name='Uh', borrow=True)

    # bh = [bi, bc, bf, bo]
    bh = np.zeros((n_output * 4,)).astype(theano.config.floatX)
    self.bh = theano.shared(value=bh, name='bh', borrow=True)

    self.params = [self.Wh, self.Uh, self.bh]

def param_init_lstm(options, params, prefix='lstm'):
    """
    Init the LSTM parameter:

    :see: init_params
    """
    W = numpy.concatenate([ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj'])], axis=1)
    params[_p(prefix, 'W')] = W

    U = numpy.concatenate([ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj'])], axis=1)
    params[_p(prefix, 'U')] = U

    b = numpy.zeros((4 * options['dim_proj'],))
    params[_p(prefix, 'b')] = b.astype(config.floatX)

    return params

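
# A minimal usage sketch (the option values and the OrderedDict container are
# illustrative assumptions) showing how these param_init_* functions compose a
# flat parameter dictionary before the entries are wrapped in theano.shared:

from collections import OrderedDict

options = {'dim_proj': 128}
params = OrderedDict()
params = param_init_lstm(options, params, prefix='lstm')
# params now holds 'lstm_W' and 'lstm_U' of shape (128, 512) and 'lstm_b' of
# shape (512,), i.e. the four gate blocks stacked along the second axis.
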
def init_params(self):
    Wi_values = utils.ortho_weight(self.dim)
    self.Wi = theano.shared(Wi_values, name="LSTM_Wi")
    Wf_values = utils.ortho_weight(self.dim)
    self.Wf = theano.shared(Wf_values, name="LSTM_Wf")
    Wc_values = utils.ortho_weight(self.dim)
    self.Wc = theano.shared(Wc_values, name="LSTM_Wc")
    Wo_values = utils.ortho_weight(self.dim)
    self.Wo = theano.shared(Wo_values, name="LSTM_Wo")

    Ui_values = utils.ortho_weight(self.dim)
    self.Ui = theano.shared(Ui_values, name="LSTM_Ui")
    Uf_values = utils.ortho_weight(self.dim)
    self.Uf = theano.shared(Uf_values, name="LSTM_Uf")
    Uc_values = utils.ortho_weight(self.dim)
    self.Uc = theano.shared(Uc_values, name="LSTM_Uc")
    Uo_values = utils.ortho_weight(self.dim)
    self.Uo = theano.shared(Uo_values, name="LSTM_Uo")

    b_values = np.zeros((self.dim,), dtype=theano.config.floatX)
    self.bi = theano.shared(b_values, name="LSTM_bi")
    self.bf = theano.shared(b_values, name="LSTM_bf")
    self.bc = theano.shared(b_values, name="LSTM_bc")
    self.bo = theano.shared(b_values, name="LSTM_bo")

    self.params = [self.Wi, self.Ui, self.bi,
                   self.Wf, self.Uf, self.bf,
                   self.Wc, self.Uc, self.bc,
                   self.Wo, self.Uo, self.bo]

def param_init_lstm_cond(self, options, params, nin, dim, dimctx, prefix='lstm_cond'):
    # input to LSTM
    W = np.concatenate([norm_weight(nin, dim), norm_weight(nin, dim),
                        norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    # LSTM to LSTM
    U = np.concatenate([ortho_weight(dim), ortho_weight(dim),
                        ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # bias to LSTM
    params[_p(prefix, 'b')] = np.zeros((4 * dim,)).astype('float32')

    # context to LSTM
    Wc = norm_weight(dimctx, dim * 4)
    params[_p(prefix, 'Wc')] = Wc

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx, ortho=False)
    params[_p(prefix, 'Wc_att')] = Wc_att

    # attention: LSTM -> hidden
    Wd_att = norm_weight(dim, dimctx)
    params[_p(prefix, 'Wd_att')] = Wd_att

    # attention: hidden bias
    b_att = np.zeros((dimctx,)).astype('float32')
    params[_p(prefix, 'b_att')] = b_att

    # attention:
    U_att = norm_weight(dimctx, 1)
    params[_p(prefix, 'U_att')] = U_att
    c_att = np.zeros((1,)).astype('float32')
    params[_p(prefix, 'c_tt')] = c_att

    if options['selector']:
        # attention: selector
        W_sel = norm_weight(dim, 1)
        params[_p(prefix, 'W_sel')] = W_sel
        b_sel = np.float32(0.)
        params[_p(prefix, 'b_sel')] = b_sel

    return params

def _init_gru(in_dim, hid_dim, prefix_):
    param[prefix_ + '_W'] = numpy.concatenate([uniform_weight(in_dim, hid_dim),
                                               uniform_weight(in_dim, hid_dim)], axis=1)
    param[prefix_ + '_Wx'] = uniform_weight(in_dim, hid_dim)
    param[prefix_ + '_U'] = numpy.concatenate([ortho_weight(hid_dim),
                                               ortho_weight(hid_dim)], axis=1)
    param[prefix_ + '_b'] = zero_vector(2 * hid_dim)
    param[prefix_ + '_Ux'] = ortho_weight(hid_dim)
    param[prefix_ + '_bx'] = zero_vector(hid_dim)

def LSTM(input_x, rnn_size, batch_size):
    # input_x: (batch_size, steps, embedding_size)
    # Note: despite the function name, the recurrence below is a GRU
    # (see the "GRU" variable scope and the update/reset-gate equations).
    num_steps = int(input_x.get_shape()[1])
    embedding_size = int(input_x.get_shape()[2])

    # define parameters
    W = tf.get_variable("W", initializer=tf.concat(1, [uniform_weight(embedding_size, rnn_size),
                                                       uniform_weight(embedding_size, rnn_size)]))
    U = tf.get_variable("U", initializer=tf.concat(1, [ortho_weight(rnn_size),
                                                       ortho_weight(rnn_size)]))
    b = tf.get_variable("b", initializer=tf.zeros([2 * rnn_size]))
    Wx = tf.get_variable("Wx", initializer=uniform_weight(embedding_size, rnn_size))
    Ux = tf.get_variable("Ux", initializer=ortho_weight(rnn_size))
    bx = tf.get_variable("bx", initializer=tf.zeros([rnn_size]))

    h_ = tf.zeros([batch_size, rnn_size])
    one = tf.fill([batch_size, rnn_size], 1.)

    state_below = tf.transpose(
        tf.batch_matmul(input_x, tf.tile(tf.reshape(W, [1, embedding_size, 2 * rnn_size]),
                                         [batch_size, 1, 1])) + b,
        perm=[1, 0, 2])
    state_belowx = tf.transpose(
        tf.batch_matmul(input_x, tf.tile(tf.reshape(Wx, [1, embedding_size, rnn_size]),
                                         [batch_size, 1, 1])) + bx,
        perm=[1, 0, 2])  # (steps, batch_size, rnn_size)

    output = []  # (steps, batch_size, rnn_size)
    with tf.variable_scope("GRU"):
        for time_step in range(num_steps):
            preact = tf.matmul(h_, U)
            preact = tf.add(preact, state_below[time_step])
            r = tf.nn.sigmoid(_slice(preact, 0, rnn_size))
            u = tf.nn.sigmoid(_slice(preact, 1, rnn_size))

            preactx = tf.matmul(h_, Ux)
            preactx = tf.mul(preactx, r)
            preactx = tf.add(preactx, state_belowx[time_step])
            h = tf.tanh(preactx)

            h_ = tf.add(tf.mul(u, h_), tf.mul(tf.sub(one, u), h))
            output.append(h_)

    output = tf.transpose(output, perm=[1, 0, 2])
    return output  # (batch_size, steps, rnn_size)

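
# A hypothetical call of the function above (batch size, step count, embedding
# size, and scope name are illustrative assumptions), using the same legacy
# TensorFlow (~0.x) API that the function itself relies on:

input_x = tf.placeholder(tf.float32, [32, 20, 300])  # (batch, steps, embedding)
with tf.variable_scope("encoder"):
    hidden_states = LSTM(input_x, rnn_size=256, batch_size=32)  # -> (32, 20, 256)
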
def param_init_lstm(self, params, nin, dim, prefix='lstm'):
    assert prefix is not None

    # Stack the weight matrices for faster dot prods
    W = np.concatenate([norm_weight(nin, dim), norm_weight(nin, dim),
                        norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W  # to_lstm_W: (512, 2048)

    U = np.concatenate([ortho_weight(dim), ortho_weight(dim),
                        ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U  # to_lstm_U: (512, 2048)

    params[_p(prefix, 'b')] = np.zeros((4 * dim,)).astype('float32')  # to_lstm_b: (2048,)

    return params

def param_init(params, nin, dim, prefix='lstm'):
    assert prefix is not None

    # Stack the weight matrices for faster dot prods
    W = np.concatenate([norm_weight(nin, dim), norm_weight(nin, dim),
                        norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(dim), ortho_weight(dim),
                        ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    params[_p(prefix, 'b')] = np.zeros((4 * dim,)).astype('float32')

    return params

def param_init_decoder(options, params, prefix='decoder'):
    n_x = options['n_x']
    n_h = options['n_h']
    n_z = options['n_z']

    W = np.concatenate([uniform_weight(n_x, n_h), uniform_weight(n_x, n_h),
                        uniform_weight(n_x, n_h), uniform_weight(n_x, n_h)], axis=1)
    params[_p(prefix, 'W')] = W

    U = np.concatenate([ortho_weight(n_h), ortho_weight(n_h),
                        ortho_weight(n_h), ortho_weight(n_h)], axis=1)
    params[_p(prefix, 'U')] = U

    C = np.concatenate([uniform_weight(n_z, n_h), uniform_weight(n_z, n_h),
                        uniform_weight(n_z, n_h), uniform_weight(n_z, n_h)], axis=1)
    params[_p(prefix, 'C')] = C

    params[_p(prefix, 'b')] = zero_bias(4 * n_h)
    # Set the forget-gate slice of the bias to 3 (high initial forget-gate bias).
    params[_p(prefix, 'b')][n_h:2 * n_h] = 3 * np.ones((n_h,)).astype(theano.config.floatX)

    C0 = uniform_weight(n_z, n_h)
    params[_p(prefix, 'C0')] = C0
    params[_p(prefix, 'b0')] = zero_bias(n_h)

    # params[_p(prefix,'b_y')] = zero_bias(n_x)  # 48

    return params

def param_init_gru(prefix='gru', nin=None, dim=None):
    # Gated Recurrent Unit (GRU)
    params = {}

    W = [norm_weight(nin, dim), norm_weight(nin, dim)]
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b1')] = np.zeros((dim,), dtype=np.float32)
    params[_p(prefix, 'b2')] = np.zeros((dim,), dtype=np.float32)

    U = [ortho_weight(dim), ortho_weight(dim)]
    params[_p(prefix, 'U')] = U

    Wx = norm_weight(nin, dim)
    params[_p(prefix, 'Wx')] = Wx
    Ux = ortho_weight(dim)
    params[_p(prefix, 'Ux')] = Ux
    params[_p(prefix, 'bx')] = np.zeros((dim,), dtype=np.float32)

    return params[_p(prefix, 'W')][0], params[_p(prefix, 'W')][1], \
        params[_p(prefix, 'U')][0], params[_p(prefix, 'U')][1], \
        params[_p(prefix, 'b1')], params[_p(prefix, 'b2')], \
        params[_p(prefix, 'Wx')], params[_p(prefix, 'Ux')], params[_p(prefix, 'bx')]

def param_init_cnn(options, params, prefix='cnn'):
    feature_maps = options['feature_maps']
    filter_hs = options['filter_hs']

    # Fixed image shape
    num_chn = len(options['W'])
    if options['bidir']:
        num_chn = num_chn * 2
    if options['combine']:
        num_chn = num_chn * 2
    image_shape = (options['batch_size'], num_chn, options['maxlen'], options['dim_proj'])
    img_h = image_shape[2]
    img_w = image_shape[3]
    options['image_shape'] = image_shape

    # init filters, biases
    filter_shapes = []
    pool_sizes = []
    filter_w = options['dim_proj']
    for filter_h in filter_hs:
        filter_shape = (feature_maps, num_chn, filter_h, filter_w)
        pool_size = (img_h - filter_h + 1, img_w - filter_w + 1)

        # 4 different initializations of the filters
        if options['init'] == 'uniform':
            params['cnn_f' + str(filter_h)] = numpy.random.uniform(
                low=-0.01, high=0.01, size=filter_shape).astype(config.floatX)
        elif options['init'] == 'xavier':
            fan_in = numpy.prod(filter_shape[1:])
            fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(pool_size))
            W_bound = numpy.sqrt(6. / (fan_in + fan_out))
            params['cnn_f' + str(filter_h)] = numpy.random.uniform(
                low=-W_bound, high=W_bound, size=filter_shape).astype(config.floatX)
        elif options['init'] == 'gaussian':
            params['cnn_f' + str(filter_h)] = numpy.random.normal(
                size=filter_shape).astype(config.floatX)
        elif options['init'] == 'ortho':
            W_ortho = ortho_weight(numpy.prod(filter_shape[1:]))
            W_ortho = numpy.reshape(W_ortho[:filter_shape[0]], filter_shape)
            params['cnn_f' + str(filter_h)] = W_ortho

        params['cnn_b' + str(filter_h)] = numpy.zeros((filter_shape[0],)).astype(config.floatX)
        filter_shapes.append(filter_shape)
        pool_sizes.append(pool_size)

    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    return params

def param_init_lnlstm(options, params, prefix='lnlstm', nin=None, dim=None):
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    W = numpy.concatenate([norm_weight(nin, dim), norm_weight(nin, dim),
                           norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[prfx(prefix, 'W')] = W

    U = numpy.concatenate([ortho_weight(dim), ortho_weight(dim),
                           ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[prfx(prefix, 'U')] = U
    params[prfx(prefix, 'b')] = numpy.zeros((4 * dim,)).astype('float32')

    # lateral parameters
    scale_add = 0.0
    scale_mul = 1.0
    params[prfx(prefix, 'b1')] = scale_add * numpy.ones((4 * dim)).astype('float32')
    params[prfx(prefix, 'b2')] = scale_add * numpy.ones((4 * dim)).astype('float32')
    params[prfx(prefix, 'b3')] = scale_add * numpy.ones((1 * dim)).astype('float32')
    params[prfx(prefix, 's1')] = scale_mul * numpy.ones((4 * dim)).astype('float32')
    params[prfx(prefix, 's2')] = scale_mul * numpy.ones((4 * dim)).astype('float32')
    params[prfx(prefix, 's3')] = scale_mul * numpy.ones((1 * dim)).astype('float32')

    return params

def param_init_gru(options, param, prefix='gru', nin=None, dim=None):
    param[prefix + '_W'] = numpy.concatenate([norm_weight(nin, dim),
                                              norm_weight(nin, dim)], axis=1)
    param[prefix + '_U'] = numpy.concatenate([ortho_weight(dim),
                                              ortho_weight(dim)], axis=1)
    param[prefix + '_b'] = zero_vector(2 * dim)
    param[prefix + '_Wx'] = norm_weight(nin, dim)
    param[prefix + '_Ux'] = ortho_weight(dim)
    param[prefix + '_bx'] = zero_vector(dim)
    return param

def param_init_lngru(options, params, prefix='lngru', nin=None, dim=None):
    """
    Gated Recurrent Unit (GRU) with LN
    """
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    W = numpy.concatenate([norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W.astype('float32')
    params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')

    U = numpy.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U.astype('float32')

    Wx = norm_weight(nin, dim)
    params[_p(prefix, 'Wx')] = Wx.astype('float32')
    Ux = ortho_weight(dim)
    params[_p(prefix, 'Ux')] = Ux.astype('float32')
    params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')

    # LN parameters
    scale_add = 0.0
    scale_mul = 1.0
    params[_p(prefix, 'b1')] = scale_add * numpy.ones((2 * dim)).astype('float32')
    params[_p(prefix, 'b2')] = scale_add * numpy.ones((1 * dim)).astype('float32')
    params[_p(prefix, 'b3')] = scale_add * numpy.ones((2 * dim)).astype('float32')
    params[_p(prefix, 'b4')] = scale_add * numpy.ones((1 * dim)).astype('float32')
    params[_p(prefix, 's1')] = scale_mul * numpy.ones((2 * dim)).astype('float32')
    params[_p(prefix, 's2')] = scale_mul * numpy.ones((1 * dim)).astype('float32')
    params[_p(prefix, 's3')] = scale_mul * numpy.ones((2 * dim)).astype('float32')
    params[_p(prefix, 's4')] = scale_mul * numpy.ones((1 * dim)).astype('float32')

    return params

def param_init_gru_cond(options, param, prefix='gru_cond',
                        nin=None, dim=None, dimctx=None,
                        nin_nonlin=None, dim_nonlin=None):
    if nin_nonlin is None:
        nin_nonlin = nin
    if dim_nonlin is None:
        dim_nonlin = dim

    param = param_init_gru(options, param, prefix=prefix, nin=nin, dim=dim)

    param[prefix + '_U_nl'] = numpy.concatenate([ortho_weight(dim_nonlin),
                                                 ortho_weight(dim_nonlin)], axis=1)
    param[prefix + '_b_nl'] = zero_vector(2 * dim_nonlin)
    param[prefix + '_Ux_nl'] = ortho_weight(dim_nonlin)
    param[prefix + '_bx_nl'] = zero_vector(dim_nonlin)

    # context to LSTM
    param[prefix + '_Wc'] = uniform_weight(dimctx, dim * 2)
    param[prefix + '_Wcx'] = uniform_weight(dimctx, dim)

    # attention: combined -> hidden
    param[prefix + '_W_comb_att'] = uniform_weight(dim, dimctx)

    # attention: context -> hidden
    param[prefix + '_Wc_att'] = uniform_weight(dimctx, dimctx)

    # attention: hidden bias
    param[prefix + '_b_att'] = zero_vector(dimctx)

    # attention:
    param[prefix + '_U_att'] = uniform_weight(dimctx, 1)
    param[prefix + '_c_att'] = zero_vector(1)

    return param

def param_init_gru(options, params, prefix='gru', nin=None, dim=None, hiero=False):
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    if not hiero:
        W = numpy.concatenate([norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
        params[prfx(prefix, 'W')] = W
        params[prfx(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')

    U = numpy.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[prfx(prefix, 'U')] = U

    Wx = norm_weight(nin, dim)
    params[prfx(prefix, 'Wx')] = Wx
    Ux = ortho_weight(dim)
    params[prfx(prefix, 'Ux')] = Ux
    params[prfx(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')

    return params

def param_init_gru_rmn(params, prefix='gru_rmn', nin=None, dim=None,
                       vocab_size=None, memory_dim=None, memory_size=None):
    assert dim == memory_dim, 'Should be fixed!'

    # first GRU params
    W = numpy.concatenate([norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
    U = numpy.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U
    params[_p(prefix, 'Wx')] = norm_weight(nin, dim)
    params[_p(prefix, 'Ux')] = ortho_weight(dim)
    params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')

    # memory block params
    params[_p(prefix, 'M')] = norm_weight(vocab_size, memory_dim)
    params[_p(prefix, 'C')] = norm_weight(vocab_size, memory_dim)
    params[_p(prefix, 'T')] = norm_weight(memory_size, memory_dim)

    # second GRU params
    params[_p(prefix, 'Wz')] = norm_weight(dim, memory_dim, ortho=False)
    params[_p(prefix, 'Wr')] = norm_weight(dim, memory_dim, ortho=False)
    params[_p(prefix, 'W2')] = norm_weight(dim, memory_dim, ortho=False)
    params[_p(prefix, 'Uz')] = ortho_weight(dim)
    params[_p(prefix, 'Ur')] = ortho_weight(dim)
    params[_p(prefix, 'U2')] = ortho_weight(dim)

    return params

def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
    """
    Gated Recurrent Unit (GRU)
    """
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    W = numpy.concatenate([norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')

    U = numpy.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    Wx = norm_weight(nin, dim)
    params[_p(prefix, 'Wx')] = Wx
    Ux = ortho_weight(dim)
    params[_p(prefix, 'Ux')] = Ux
    params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')

    return params

def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
    """
    Gated Recurrent Unit (GRU)
    """
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    W = numpy.concatenate([norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')

    U = numpy.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    Wx = norm_weight(nin, dim)
    params[_p(prefix, 'Wx')] = Wx
    Ux = ortho_weight(dim)
    params[_p(prefix, 'Ux')] = Ux
    params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')

    return params

def param_init_decoder(options, params, prefix='decoder_vanilla'):
    n_x = options['n_x']
    n_h = options['n_h']

    W = uniform_weight(n_x, n_h)
    params[_p(prefix, 'W')] = W

    U = ortho_weight(n_h)
    params[_p(prefix, 'U')] = U

    params[_p(prefix, 'b')] = zero_bias(n_h)

    return params

def param_init_gru_cond(options, params, prefix='gru_cond',
                        nin=None, dim=None, dimctx=None,
                        nin_nonlin=None, dim_nonlin=None):
    if nin is None:
        nin = options['dim']
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']
    if nin_nonlin is None:
        nin_nonlin = nin
    if dim_nonlin is None:
        dim_nonlin = dim

    W = np.concatenate([norm_weight(nin, dim), norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = np.zeros((2 * dim,)).astype('float32')
    U = np.concatenate([ortho_weight(dim_nonlin), ortho_weight(dim_nonlin)], axis=1)
    params[_p(prefix, 'U')] = U

    Wx = norm_weight(nin_nonlin, dim_nonlin)
    params[_p(prefix, 'Wx')] = Wx
    Ux = ortho_weight(dim_nonlin)
    params[_p(prefix, 'Ux')] = Ux
    params[_p(prefix, 'bx')] = np.zeros((dim_nonlin,)).astype('float32')

    U_nl = np.concatenate([ortho_weight(dim_nonlin), ortho_weight(dim_nonlin)], axis=1)
    params[_p(prefix, 'U_nl')] = U_nl
    params[_p(prefix, 'b_nl')] = np.zeros((2 * dim_nonlin,)).astype('float32')
    Ux_nl = ortho_weight(dim_nonlin)
    params[_p(prefix, 'Ux_nl')] = Ux_nl
    params[_p(prefix, 'bx_nl')] = np.zeros((dim_nonlin,)).astype('float32')

    # context to LSTM
    Wc = norm_weight(dimctx, dim * 2)
    params[_p(prefix, 'Wc')] = Wc
    Wcx = norm_weight(dimctx, dim)
    params[_p(prefix, 'Wcx')] = Wcx

    # attention: combined -> hidden
    W_comb_att = norm_weight(dim, dimctx)
    params[_p(prefix, 'W_comb_att')] = W_comb_att

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx)
    params[_p(prefix, 'Wc_att')] = Wc_att

    # attention: hidden bias
    b_att = np.zeros((dimctx,)).astype('float32')
    params[_p(prefix, 'b_att')] = b_att

    # attention:
    U_att = norm_weight(dimctx, 1)
    params[_p(prefix, 'U_att')] = U_att
    c_att = np.zeros((1,)).astype('float32')
    params[_p(prefix, 'c_tt')] = c_att

    return params
