def mlp_layer(tparams, state_below, options, prefix='predictor'):
    layer_num = len(options['dims'])
    for i in range(layer_num - 1):
        if i == 0:
            output = tensor.dot(state_below, tparams[_p(prefix, i)])
            output = tanh(output)
        elif i == layer_num - 2:
            output = tensor.dot(output, tparams[_p(prefix, i)])
            output = rectifier(output)
        else:
            output = tensor.dot(output, tparams[_p(prefix, i)])
            output = tanh(output)
    return output
def _step(m_, h_, c_, a_, ct_, pctx_, dp_=None, dp_att_=None):
    # attention: project the hidden state (and, if present, the previous
    # context) into the annotation space
    if _p(prefix, 'Wct_att') in tparams:
        pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')]) + \
            tensor.dot(ct_, tparams[_p(prefix, 'Wct_att')])
    else:
        pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])
    pctx_ += pstate_[:, None, :]
    pctx_ = tanh(pctx_)
    # attention scores: dot with U_att, then add the scalar bias c_att
    alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + \
        tparams[_p(prefix, 'c_att')]
    alpha_shp = alpha.shape
    alpha = tensor.nnet.softmax(alpha.reshape((alpha_shp[0], alpha_shp[1])))
    ctx_ = (new_ctx * alpha[:, :, None]).sum(1)  # current context

    preact = tensor.dot(h_, tparams[_p(prefix, 'U')]) + \
        tensor.dot(ctx_, tparams[_p(prefix, 'Wc')]) + \
        tparams[_p(prefix, 'b')]

    i = tensor.nnet.sigmoid(_slice(preact, 0, dim))
    f = tensor.nnet.sigmoid(_slice(preact, 1, dim))
    o = tensor.nnet.sigmoid(_slice(preact, 2, dim))
    c = tensor.tanh(_slice(preact, 3, dim))

    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_
    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_

    rval = [h, c, alpha, ctx_, i, f, o]
    return rval
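# The _step above interleaves attention with the LSTM update. For reference,
# the additive soft-attention part on its own reduces to a few lines of
# NumPy. This is only a sketch: it assumes the annotations in `ctx` are
# already projected into the attention space (the role of Wc_att above), and
# every array name here is illustrative rather than part of the module.
import numpy as np

def soft_attention(h, ctx, Wd_att, U_att, c_att):
    """h: (dim,) hidden state, ctx: (n_annot, ctx_dim) projected annotations,
    Wd_att: (dim, ctx_dim), U_att: (ctx_dim,), c_att: scalar bias."""
    pctx = np.tanh(ctx + h.dot(Wd_att))          # broadcast the state over annotations
    e = pctx.dot(U_att) + c_att                  # unnormalised scores, shape (n_annot,)
    alpha = np.exp(e - e.max())
    alpha /= alpha.sum()                         # softmax over annotations
    return alpha, (ctx * alpha[:, None]).sum(0)  # weights and expected context

# toy usage
rng = np.random.RandomState(0)
alpha, ctx_t = soft_attention(rng.randn(4), rng.randn(6, 5),
                              rng.randn(4, 5), rng.randn(5), 0.0)
print(alpha.sum(), ctx_t.shape)  # 1.0, (5,)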
def forward(self):
    temp = np.vstack((np.ones((1, self._input.shape[1])), self._input))
    self._forward_cache_acted = [temp]
    self._forward_cache_raw = [temp]
    if not self._constructed:
        print("use the build method before forwarding.")
        assert 0
    times = len(self._layers) - 1
    for i in range(times):
        temp = np.dot(self._weights[i], temp)
        self._forward_cache_raw.append(temp)
        if not self._activations[i]:
            pass
        elif self._activations[i].lower() == 'sigmoid':
            temp = util.sigmoid(temp)
        elif self._activations[i].lower() == 'tanh':
            temp = util.tanh(temp)
        elif self._activations[i].lower() == 'relu':
            temp = util.relu(temp)
        else:
            print("Activation function should be None, 'sigmoid', 'tanh' or 'relu'.")
            assert 0
        self._forward_cache_acted.append(temp)
    self._predictions = temp
    return temp
def forward(self, X):
    # Z = relu(X.dot(self.W1) + self.b1)   # alternative hidden activation
    Z = tanh(X.dot(self.W1) + self.b1)
    ret = sigmoid(Z.dot(self.W2) + self.b2)
    return ret, Z
def forward(self, x_t):
    self.t += 1
    t = self.t
    h = self.h[t - 1]
    self.input_gate[t] = sigmoid(np.dot(self.W_hi, h) + np.dot(self.W_xi, x_t) + self.b_i)
    self.forget_gate[t] = sigmoid(np.dot(self.W_hf, h) + np.dot(self.W_xf, x_t) + self.b_f)
    self.output_gate[t] = sigmoid(np.dot(self.W_ho, h) + np.dot(self.W_xo, x_t) + self.b_o)
    self.cell_update[t] = tanh(np.dot(self.W_hj, h) + np.dot(self.W_xj, x_t) + self.b_j)
    self.c[t] = self.input_gate[t] * self.cell_update[t] + self.forget_gate[t] * self.c[t - 1]
    self.ct[t] = tanh(self.c[t])
    self.h[t] = self.output_gate[t] * self.ct[t]
    self.x[t] = x_t
    return self.h[t]
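# The same cell update, written as a dependency-free NumPy step with the four
# gates stacked into single weight matrices. A minimal sketch for reference;
# the parameter names and shapes are assumptions, not attributes of the class
# above.
import numpy as np

def np_sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x_t, h_prev, c_prev, W_x, W_h, b):
    """W_x: (4*dim, x_dim), W_h: (4*dim, dim), b: (4*dim,);
    gate order is [input, forget, output, candidate]."""
    dim = h_prev.shape[0]
    pre = W_x.dot(x_t) + W_h.dot(h_prev) + b
    i = np_sigmoid(pre[0 * dim:1 * dim])
    f = np_sigmoid(pre[1 * dim:2 * dim])
    o = np_sigmoid(pre[2 * dim:3 * dim])
    g = np.tanh(pre[3 * dim:4 * dim])
    c_t = f * c_prev + i * g       # new memory cell
    h_t = o * np.tanh(c_t)         # new hidden state
    return h_t, c_t

# run a short toy sequence
rng = np.random.RandomState(0)
dim, x_dim = 3, 5
h, c = np.zeros(dim), np.zeros(dim)
W_x, W_h, b = rng.randn(4 * dim, x_dim), rng.randn(4 * dim, dim), np.zeros(4 * dim)
for _ in range(4):
    h, c = lstm_step(rng.randn(x_dim), h, c, W_x, W_h, b)
print(h.shape, c.shape)  # (3,) (3,)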
def forward_pass(self, inputs):
    # declare the variables used in the forward pass
    self.inputs = inputs
    self.n_inp = len(inputs)
    self.vr = []
    self.vz = []
    self.v_h = []
    self.vo = []
    self.r = []
    self.z = []
    self._h = []
    self.h = {}
    self.o = []
    self.h[-1] = np.zeros((self.h_size, 1))

    # run the recurrence over the input sequence
    for i in range(self.n_inp):
        # reset gate
        self.r.append(
            sigmoid(np.dot(self.w['ur'], inputs[i]) +
                    np.dot(self.w['wr'], self.h[i - 1]) + self.b['r']))
        # update gate
        self.z.append(
            sigmoid(np.dot(self.w['uz'], inputs[i]) +
                    np.dot(self.w['wz'], self.h[i - 1]) + self.b['z']))
        # candidate state, with the reset gate applied to the previous state
        self._h.append(
            tanh(np.dot(self.w['u_h'], inputs[i]) +
                 np.dot(self.w['w_h'], np.multiply(self.h[i - 1], self.r[i])) +
                 self.b['_h']))
        # blend the previous state and the candidate with the update gate
        self.h[i] = np.multiply(self.z[i], self.h[i - 1]) + \
            np.multiply(1 - self.z[i], self._h[i])
        # output distribution
        self.o.append(softmax(np.dot(self.w['wo'], self.h[i]) + self.b['o']))
    return self.o
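# The same reset/update/candidate computation as a small standalone NumPy
# step. A sketch only: the weight names (U_r, W_r, ...) are illustrative and
# are not the `self.w` / `self.b` dictionaries used by the class above.
import numpy as np

def np_sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gru_step(x_t, h_prev, U_r, W_r, b_r, U_z, W_z, b_z, U_h, W_h, b_h):
    r = np_sigmoid(U_r.dot(x_t) + W_r.dot(h_prev) + b_r)            # reset gate
    z = np_sigmoid(U_z.dot(x_t) + W_z.dot(h_prev) + b_z)            # update gate
    h_tilde = np.tanh(U_h.dot(x_t) + W_h.dot(r * h_prev) + b_h)     # candidate state
    return z * h_prev + (1.0 - z) * h_tilde                         # same blend as above

# toy usage with column-vector states, matching the (h_size, 1) shape above
rng = np.random.RandomState(0)
n_h, n_x = 3, 4
h = np.zeros((n_h, 1))
for _ in range(5):
    h = gru_step(rng.randn(n_x, 1), h,
                 rng.randn(n_h, n_x), rng.randn(n_h, n_h), np.zeros((n_h, 1)),
                 rng.randn(n_h, n_x), rng.randn(n_h, n_h), np.zeros((n_h, 1)),
                 rng.randn(n_h, n_x), rng.randn(n_h, n_h), np.zeros((n_h, 1)))
print(h.shape)  # (3, 1)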
def mlp_attention_layer(tparams, state_below, options, prefix='attention'):
    mean_emb = state_below.mean(1)
    attention_vec = tensor.dot(state_below, tparams[_p(prefix, 'W_att')]) + \
        tparams[_p(prefix, 'b')]
    attention_vec += tensor.dot(mean_emb, tparams[_p(prefix, 'Wm')])[:, None, :]
    attention_vec = tanh(attention_vec)
    alpha = tensor.dot(attention_vec, tparams[_p(prefix, 'U_att')]) + \
        tparams[_p(prefix, 'c_att')]
    alpha_shp = alpha.shape
    alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0], alpha_shp[1]]))
    output = (state_below * alpha[:, :, None]).sum(1)
    return output
def forward(self, x_t):
    self.t += 1
    t = self.t
    h = self.h[t - 1]
    self.input_gate[t] = sigmoid(
        np.dot(self.W_hi, h) + np.dot(self.W_xi, x_t) + self.b_i)
    # forget gate uses its own recurrent weight W_hf (not W_hi)
    self.forget_gate[t] = sigmoid(
        np.dot(self.W_hf, h) + np.dot(self.W_xf, x_t) + self.b_f)
    self.output_gate[t] = sigmoid(
        np.dot(self.W_ho, h) + np.dot(self.W_xo, x_t) + self.b_o)
    self.cell_update[t] = tanh(
        np.dot(self.W_hj, h) + np.dot(self.W_xj, x_t) + self.b_j)
    self.c[t] = self.input_gate[t] * self.cell_update[t] + \
        self.forget_gate[t] * self.c[t - 1]
    self.ct[t] = tanh(self.c[t])
    self.h[t] = self.output_gate[t] * self.ct[t]
    self.x[t] = x_t
    return self.h[t]
def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
    """ Each variable is one time slice of the LSTM
    m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
    a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist),
    ct_- (context), pctx_ (projected context), dp_/dp_att_ (dropout masks)
    """
    # attention computation
    # [described in equations (4), (5), (6) in
    # section "3.1.2 Decoder: Long Short Term Memory Network"]
    pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')]) + \
        tensor.dot(ct_, tparams[_p(prefix, 'Wct_att')])
    pctx_ = pctx_ + pstate_[:, None, :]
    pctx_list = []
    pctx_list.append(pctx_)
    pctx_ = tanh(pctx_)
    alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + \
        tparams[_p(prefix, 'c_tt')]
    alpha_pre = alpha
    alpha_shp = alpha.shape
    alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
    ctx_ = (context * alpha[:, :, None]).sum(1)  # current context
    alpha_sample = alpha  # you can return something else reasonable here to debug

    preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
    preact += x_
    preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

    # Recover the activations to the lstm gates
    # [equation (1)]
    i = _slice(preact, 0, dim)
    f = _slice(preact, 1, dim)
    o = _slice(preact, 2, dim)
    if options['use_dropout_lstm']:
        i = i * _slice(dp_, 0, dim)
        f = f * _slice(dp_, 1, dim)
        o = o * _slice(dp_, 2, dim)
    i = tensor.nnet.sigmoid(i)
    f = tensor.nnet.sigmoid(f)
    o = tensor.nnet.sigmoid(o)
    c = tensor.tanh(_slice(preact, 3, dim))

    # compute the new memory/hidden state
    # if the mask is 0, just copy the previous state
    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_

    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_

    rval = [h, c, alpha, alpha_sample, ctx_]
    rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
    return rval
def predict(self, input):
    L = np.shape(input)[0]
    az = np.zeros((L, self.Nhidden))
    ar = np.zeros((L, self.Nhidden))
    ahhat = np.zeros((L, self.Nhidden))
    ah = np.zeros((L, self.Nhidden))

    a1 = tanh(np.dot(input, self.w1) + self.b1)

    # first time step: the previous hidden state is all zeros
    x = np.concatenate((np.zeros(self.Nhidden), a1[0, :]))
    az[0, :] = sigm(np.dot(x, self.wz) + self.bz)
    ar[0, :] = sigm(np.dot(x, self.wr) + self.br)
    ahhat[0, :] = tanh(np.dot(x, self.wh) + self.bh)
    ah[0, :] = az[0, :] * ahhat[0, :]

    for i in range(1, L):
        x = np.concatenate((ah[i - 1, :], a1[i, :]))
        az[i, :] = sigm(np.dot(x, self.wz) + self.bz)
        ar[i, :] = sigm(np.dot(x, self.wr) + self.br)
        x = np.concatenate((ar[i, :] * ah[i - 1, :], a1[i, :]))
        ahhat[i, :] = tanh(np.dot(x, self.wh) + self.bh)
        ah[i, :] = (1 - az[i, :]) * ah[i - 1, :] + az[i, :] * ahhat[i, :]

    a2 = tanh(np.dot(ah, self.w2) + self.b2)
    return [a1, az, ar, ahhat, ah, a2]
def backward(self, learning_rate=0.01):
    # using mse for loss
    self._gradients = []
    mse = np.average(np.square(self._forward_cache_acted[-1] - self._labels))
    d_mse_yhat = np.average(2 * (self._forward_cache_acted[-1] - self._labels))
    times = len(self._layers) - 1
    dx = np.ones((self._forward_cache_raw[times - 2].shape[0], 1))
    for i in range(times - 1, -1, -1):  # in reverse order
        act = self._activations[i]
        d_act = None
        if not act:
            d_act = np.ones(self._forward_cache_raw[i + 1].shape)
        elif act.lower() == 'sigmoid':
            d_act = util.sigmoid(self._forward_cache_raw[i + 1]) * (
                1 - util.sigmoid(self._forward_cache_raw[i + 1]))
        elif act.lower() == 'relu':
            d_act = (self._forward_cache_raw[i + 1] > 0).astype('float32')
        elif act.lower() == 'tanh':
            d_act = 1 - np.square(util.tanh(self._forward_cache_raw[i + 1]))
        if i != times - 1:
            dw = np.dot(dx * d_act,
                        self._forward_cache_raw[i].T) / self._labels.shape[1]
        else:
            dw = np.dot(d_act,
                        self._forward_cache_raw[i].T) / self._labels.shape[1]
        dx = np.dot(d_act.T, self._weights[i]).T
        self._gradients.insert(0, dw * d_mse_yhat)
    for i in range(times):
        self._weights[i] = self._weights[i] - learning_rate * self._gradients[i]
    return mse
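# Hand-derived backward passes like the one above are easiest to trust once
# they are checked against finite differences. The self-contained sketch
# below gradient-checks a single tanh layer under an MSE loss; it does not
# call the class above, and all names here are illustrative.
import numpy as np

def mse_loss(W, X, Y):
    return np.mean((np.tanh(W.dot(X)) - Y) ** 2)

def mse_loss_grad(W, X, Y):
    """Analytic gradient of mse_loss w.r.t. W (chain rule through tanh)."""
    A = np.tanh(W.dot(X))
    delta = 2.0 * (A - Y) * (1.0 - A ** 2) / Y.size
    return delta.dot(X.T)

def numeric_grad(f, W, eps=1e-6):
    """Central finite differences, one weight at a time."""
    G = np.zeros_like(W)
    for idx in np.ndindex(*W.shape):
        W[idx] += eps
        fp = f(W)
        W[idx] -= 2 * eps
        fm = f(W)
        W[idx] += eps
        G[idx] = (fp - fm) / (2 * eps)
    return G

rng = np.random.RandomState(0)
W, X, Y = rng.randn(2, 3), rng.randn(3, 5), rng.randn(2, 5)
diff = np.max(np.abs(mse_loss_grad(W, X, Y) -
                     numeric_grad(lambda w: mse_loss(w, X, Y), W)))
print(diff)  # should be on the order of 1e-9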
def build_model(tparams, options, sampling=True):
    """ Builds the entire computational graph used for training

    Basically does a forward pass through the data and calculates the cost
    function.

    [This function builds a model described in Section 3.1.2 onwards
    as the convolutional features are precomputed, some extra features
    which were not used are also implemented here.]

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    sampling : boolean
        [If it is true, when using stochastic attention, follows
        the learning rule described in section 4. at the bottom left of
        page 5]

    Returns
    -------
    trng : theano random number generator
        Used for dropout, stochastic attention, etc.
    use_noise : theano shared variable
        flag that toggles noise on and off
    [x, mask, ctx] : theano variables
        Represent the captions, binary mask, and annotations
        for a single batch (see dimensions below)
    alphas : theano variables
        Attention weights
    alpha_sample : theano variable
        Sampled attention weights used in REINFORCE for stochastic
        attention [see the learning rule in eq (12)]
    cost : theano variable
        negative log likelihood
    opt_outs : OrderedDict
        extra outputs required depending on configuration in options
    """
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x #annotations x dim
    ctx = tensor.tensor3('ctx', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # index into the word embedding matrix, shift it forward in time
    emb = tparams['Wemb'][x.flatten()].reshape(
        [n_timesteps, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams, ctx.dimshuffle(1, 0, 2),
                                       options,
                                       prefix='encoder')[0].dimshuffle(1, 0, 2)
        ctx_rev = get_layer('lstm')[1](tparams,
                                       ctx.dimshuffle(1, 0, 2)[:, ::-1, :],
                                       options,
                                       prefix='encoder_rev')[0][:, ::-1, :].dimshuffle(1, 0, 2)
        ctx0 = tensor.concatenate((ctx_fwd, ctx_rev), axis=2)
    else:
        ctx0 = ctx

    # initial state/cell [top right on page 4]
    ctx_mean = ctx0.mean(1)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams, ctx_mean, options,
                                      prefix='ff_init_%d' % lidx,
                                      activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)

    init_state = get_layer('ff')[1](tparams, ctx_mean, options,
                                    prefix='ff_state', activ='tanh')
    init_memory = get_layer('ff')[1](tparams, ctx_mean, options,
                                     prefix='ff_memory', activ='tanh')

    # lstm decoder
    # [equation (1), (2), (3) in section 3.1.2]
    attn_updates = []
    proj, updates = get_layer('lstm_cond')[1](tparams, emb, options,
                                              prefix='decoder',
                                              mask=mask, context=ctx0,
                                              one_step=False,
                                              init_state=init_state,
                                              init_memory=init_memory,
                                              trng=trng,
                                              use_noise=use_noise,
                                              sampling=sampling)
    attn_updates += updates
    proj_h = proj[0]

    # optional deep attention
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state = get_layer('ff')[1](tparams, ctx_mean, options,
                                            prefix='ff_state_%d' % lidx,
                                            activ='tanh')
            init_memory = get_layer('ff')[1](tparams, ctx_mean, options,
                                             prefix='ff_memory_%d' % lidx,
                                             activ='tanh')
            proj, updates = get_layer('lstm_cond')[1](tparams, proj_h, options,
                                                      prefix='decoder_%d' % lidx,
                                                      mask=mask, context=ctx0,
                                                      one_step=False,
                                                      init_state=init_state,
                                                      init_memory=init_memory,
                                                      trng=trng,
                                                      use_noise=use_noise,
                                                      sampling=sampling)
            attn_updates += updates
            proj_h = proj[0]

    alphas = proj[2]
    alpha_sample = proj[3]
    ctxs = proj[4]

    # [beta value explained in note 4.2.1 "doubly stochastic attention"]
    if options['selector']:
        sels = proj[5]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)

    # compute word probabilities
    # [equation (7)]
    logit = get_layer('ff')[1](tparams, proj_h, options,
                               prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs, options,
                                    prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)

    # compute softmax
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # Index into the computed probability to give the log likelihood
    x_flat = x.flatten()
    p_flat = probs.flatten()
    cost = -tensor.log(
        p_flat[tensor.arange(x_flat.shape[0]) * probs.shape[1] + x_flat] + 1e-8)
    cost = cost.reshape([x.shape[0], x.shape[1]])
    masked_cost = cost * mask
    cost = masked_cost.sum(0)

    # optional outputs
    opt_outs = dict()
    if options['selector']:
        opt_outs['selector'] = sels
    if options['attn_type'] == 'stochastic':
        opt_outs['masked_cost'] = masked_cost  # need this for reinforce later
        opt_outs['attn_updates'] = attn_updates  # this is to update the rng

    return trng, use_noise, [x, mask, ctx], alphas, alpha_sample, cost, opt_outs
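# The cost at the end of build_model relies on a flat-indexing trick:
# p_flat[arange(N) * vocab_size + x_flat] picks out, for every (timestep,
# sample) pair, the probability the softmax assigned to the reference word.
# The NumPy sketch below reproduces that masked negative log-likelihood and
# checks it against direct 2-D indexing; shapes and names are illustrative.
import numpy as np

def masked_nll(probs, x, mask, eps=1e-8):
    """probs: (n_timesteps * n_samples, vocab) softmax rows,
    x, mask: (n_timesteps, n_samples); returns the per-sample cost."""
    x_flat = x.flatten()
    p_flat = probs.flatten()
    nll = -np.log(p_flat[np.arange(x_flat.shape[0]) * probs.shape[1] + x_flat] + eps)
    return (nll.reshape(x.shape) * mask).sum(0)   # zero out padding, sum over time

# toy check against direct indexing
rng = np.random.RandomState(0)
T, S, V = 4, 2, 7
logits = rng.randn(T * S, V)
probs = np.exp(logits) / np.exp(logits).sum(1, keepdims=True)
x = rng.randint(0, V, size=(T, S))
mask = np.ones((T, S))
direct = -(np.log(probs[np.arange(T * S), x.flatten()] + 1e-8).reshape(T, S) * mask).sum(0)
assert np.allclose(masked_nll(probs, x, mask), direct)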
def lstm_cond_layer(tparams, state_below, options, prefix='lstm',
                    mask=None, context=None, one_step=False,
                    init_memory=None, init_state=None,
                    trng=None, use_noise=None, sampling=True,
                    argmax=False, **kwargs):

    assert context, 'Context must be provided'

    if one_step:
        assert init_memory, 'previous memory must be provided'
        assert init_state, 'previous state must be provided'

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # infer lstm dimension
    dim = tparams[_p(prefix, 'U')].shape[0]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)
    # initial/previous memory
    if init_memory is None:
        init_memory = tensor.alloc(0., n_samples, dim)

    # projected context
    pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) + \
        tparams[_p(prefix, 'b_att')]
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            pctx_ = tensor.dot(pctx_, tparams[_p(prefix, 'W_att_%d' % lidx)]) + \
                tparams[_p(prefix, 'b_att_%d' % lidx)]
            # note to self: this used to be options['n_layers_att'] - 1,
            # so no extra non-linearity if n_layers_att < 3
            if lidx < options['n_layers_att']:
                pctx_ = tanh(pctx_)

    # projected x
    # state_below is timesteps*num samples by d in training
    # (TODO change to notation of paper)
    # this is n * d during sampling
    state_below = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tparams[_p(prefix, 'b')]

    # additional parameters for stochastic hard attention
    if options['attn_type'] == 'stochastic':
        # temperature for softmax
        temperature = options.get("temperature", 1)
        # [see (Section 4.1): Stochastic "Hard" Attention]
        semi_sampling_p = options.get("semi_sampling_p", 0.5)
        temperature_c = theano.shared(numpy.float32(temperature),
                                      name='temperature_c')
        h_sampling_mask = trng.binomial((1,), p=semi_sampling_p, n=1,
                                        dtype=theano.config.floatX).sum()

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist),
        ct_- (context), pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network"]
        pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])
        pctx_ = pctx_ + pstate_[:, None, :]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + \
            tparams[_p(prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape
        if options['attn_type'] == 'deterministic':
            alpha = tensor.nnet.softmax(
                alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            ctx_ = (context * alpha[:, :, None]).sum(1)  # current context
            alpha_sample = alpha  # you can return something else reasonable here to debug
        else:
            alpha = tensor.nnet.softmax(
                temperature_c * alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            # TODO return alpha_sample
            if sampling:
                alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha, dtype=theano.config.floatX) \
                    + (1. - h_sampling_mask) * alpha
            else:
                if argmax:
                    alpha_sample = tensor.cast(
                        tensor.eq(tensor.arange(alpha_shp[1])[None, :],
                                  tensor.argmax(alpha, axis=1, keepdims=True)),
                        theano.config.floatX)
                else:
                    alpha_sample = alpha
            ctx_ = (context * alpha_sample[:, :, None]).sum(1)  # current context

        if options['selector']:
            sel_ = tensor.nnet.sigmoid(
                tensor.dot(h_, tparams[_p(prefix, 'W_sel')]) +
                tparams[_p(prefix, 'b_sel')])
            sel_ = sel_.reshape([sel_.shape[0]])
            ctx_ = sel_[:, None] * ctx_

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        if options['selector']:
            rval += [sel_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
        return rval

    if options['use_dropout_lstm']:
        if options['selector']:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, sel_, pctx_: \
                _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        else:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, pctx_: \
                _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        dp_shape = state_below.shape
        if one_step:
            dp_mask = tensor.switch(
                use_noise,
                trng.binomial((dp_shape[0], 3 * dim), p=0.5, n=1,
                              dtype=state_below.dtype),
                tensor.alloc(0.5, dp_shape[0], 3 * dim))
        else:
            dp_mask = tensor.switch(
                use_noise,
                trng.binomial((dp_shape[0], dp_shape[1], 3 * dim),
                              p=0.5, n=1, dtype=state_below.dtype),
                tensor.alloc(0.5, dp_shape[0], dp_shape[1], 3 * dim))
    else:
        if options['selector']:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, sel_, pctx_: \
                _step(m_, x_, h_, c_, a_, as_, ct_, pctx_)
        else:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, pctx_: \
                _step(m_, x_, h_, c_, a_, as_, ct_, pctx_)

    if one_step:
        if options['use_dropout_lstm']:
            if options['selector']:
                rval = _step0(mask, state_below, dp_mask, init_state,
                              init_memory, None, None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, dp_mask, init_state,
                              init_memory, None, None, None, pctx_)
        else:
            if options['selector']:
                rval = _step0(mask, state_below, init_state, init_memory,
                              None, None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, init_state, init_memory,
                              None, None, None, pctx_)
        return rval
    else:
        seqs = [mask, state_below]
        if options['use_dropout_lstm']:
            seqs += [dp_mask]
        outputs_info = [init_state,
                        init_memory,
                        tensor.alloc(0., n_samples, pctx_.shape[1]),
                        tensor.alloc(0., n_samples, pctx_.shape[1]),
                        tensor.alloc(0., n_samples, context.shape[2])]
        if options['selector']:
            outputs_info += [tensor.alloc(0., n_samples)]
        outputs_info += [None, None, None, None, None, None, None] + \
            [None]  # *options['n_layers_att']
        rval, updates = theano.scan(_step0,
                                    sequences=seqs,
                                    outputs_info=outputs_info,
                                    non_sequences=[pctx_],
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps,
                                    profile=False)
        return rval, updates
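# lstm_cond_layer switches between deterministic (soft) attention, which uses
# the expected context under alpha, and stochastic (hard) attention, which
# samples a single annotation. A minimal NumPy sketch of just that choice,
# ignoring the temperature and semi-sampling mask used above; names are
# illustrative.
import numpy as np

def select_context(ctx, alpha, hard=False, rng=None):
    """ctx: (n_annot, dim) annotations, alpha: (n_annot,) attention weights."""
    if not hard:
        return (ctx * alpha[:, None]).sum(0)   # soft: expectation under alpha
    onehot = rng.multinomial(1, alpha)         # hard: sample one annotation
    return ctx[np.argmax(onehot)]

rng = np.random.RandomState(0)
ctx = rng.randn(6, 4)
alpha = np.full(6, 1.0 / 6)
print(select_context(ctx, alpha).shape)                      # (4,)
print(select_context(ctx, alpha, hard=True, rng=rng).shape)  # (4,)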
def supAnalyser(self, X, freq, vocabulary, top=20):
    result_score = []
    result_word = []
    for i in range(self.cat):
        result_score.append([0.0] * top)
        result_word.append([''] * top)

    num_sent = np.size(X, 0)
    allKids = [[]] * num_sent

    for i in range(num_sent):
        x = X[i]
        sl = len(x)
        words_embedded = self.WL[:, x]
        unsup_tree = self.forwardProp([], words_embedded, False, None,
                                      self.theta, freq)
        allKids[i] = unsup_tree.kids

        sup_tree = rnntree.rnntree(self.d, sl, words_embedded)
        nodeUnder = np.ones([2 * sl - 1, 1])

        # calculate n1, n2 and n1+n2 for each internal node of the tree
        # and store them in nodeUnder
        for j in range(sl, 2 * sl - 1):
            kids = allKids[i][j]
            n1 = nodeUnder[kids[0]]
            n2 = nodeUnder[kids[1]]
            nodeUnder[j] = n1 + n2

        for j in range(2 * sl - 1):
            kids = allKids[i][j]
            c1 = sup_tree.nodeFeatures[:, kids[0]]
            c2 = sup_tree.nodeFeatures[:, kids[1]]

            # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1])
            p = tanh(np.dot(self.W1, c1) + np.dot(self.W2, c2) + self.b1)
            # See last paragraph in Section 2.3
            p_norm1 = p / norm(p)

            # Eq. (7) in the paper (for special case of 1d label)
            sm = softmax(np.dot(self.Wlab, p_norm1) + self.blab)

            # keep the top-scoring words/phrases for every category
            for ind in range(self.cat):
                max_score = sm[ind]
                min_score = min(result_score[ind])
                if max_score > min_score:
                    min_ind = result_score[ind].index(min_score)
                    result_score[ind][min_ind] = max_score
                    if j < sl:
                        result_word[ind][min_ind] = vocabulary[x[j]]
                    else:
                        # recover the phrase under this node by walking its kids
                        stk = []
                        stk.extend(list(kids))
                        stk.reverse()
                        words = []
                        while len(stk) != 0:
                            current = stk.pop()
                            if current < sl:
                                words.append(vocabulary[x[current]])
                            else:
                                toExtend = []
                                toExtend.extend(list(allKids[i][current]))
                                toExtend.reverse()
                                stk.extend(toExtend)
                        result_word[ind][min_ind] = ' '.join(words)
    return (result_score, result_word)
def supAnalyser(self, X, freq, vocabulary, top=20):
    result_score = []
    result_word = []
    for i in range(self.cat):
        result_score.append([0.0] * top)
        result_word.append([''] * top)

    num_sent = np.size(X, 0)
    allKids = [[]] * num_sent

    for i in range(num_sent):
        x = X[i]
        sl = len(x)
        words_embedded = self.WL[:, x]
        unsup_tree = self.forwardProp([], words_embedded, False, None,
                                      self.theta, freq)
        allKids[i] = unsup_tree.kids

        sup_tree = rnntree.rnntree(self.d, sl, words_embedded)
        nodeUnder = np.ones([2 * sl - 1, 1])

        # calculate n1, n2 and n1+n2 for each internal node of the tree
        # and store them in nodeUnder
        for j in range(sl, 2 * sl - 1):
            kids = allKids[i][j]
            n1 = nodeUnder[kids[0]]
            n2 = nodeUnder[kids[1]]
            nodeUnder[j] = n1 + n2

        for j in range(2 * sl - 1):
            kids = allKids[i][j]
            c1 = sup_tree.nodeFeatures[:, kids[0]]
            c2 = sup_tree.nodeFeatures[:, kids[1]]

            # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1])
            p = tanh(np.dot(self.W1, c1) + np.dot(self.W2, c2) + self.b1)
            # See last paragraph in Section 2.3
            p_norm1 = p / norm(p)

            # Eq. (7) in the paper (for special case of 1d label)
            sm = softmax(np.dot(self.Wlab, p_norm1) + self.blab)
            max_score = max(sm)
            ind = list(sm).index(max_score)
            min_score = min(result_score[ind])
            if max_score > min_score:
                min_ind = result_score[ind].index(min_score)
                result_score[ind][min_ind] = max_score
                if j < sl:
                    result_word[ind][min_ind] = vocabulary[x[j]]
                else:
                    # recover the phrase under this node by walking its kids
                    stk = []
                    stk.extend(list(kids))
                    stk.reverse()
                    words = []
                    while len(stk) != 0:
                        current = stk.pop()
                        if current < sl:
                            words.append(vocabulary[x[current]])
                        else:
                            toExtend = []
                            toExtend.extend(list(allKids[i][current]))
                            toExtend.reverse()
                            stk.extend(toExtend)
                    result_word[ind][min_ind] = ' '.join(words)
    return (result_score, result_word)
def forwardProp(self, allKids, words_embedded, updateWlab, label, theta, freq):
    # allKids holds every node of the tree: row i stores the indices of the
    # children of node i
    (W1, W2, W3, W4, Wlab, b1, b2, b3, blab, WL) = self.getParams(theta)
    # sl is the number of word embeddings (columns of words_embedded)
    sl = np.size(words_embedded, 1)
    sentree = rnntree.rnntree(self.d, sl, words_embedded)
    collapsed_sentence = range(sl)

    if updateWlab:
        # compute the sentiment (classification) error
        temp_label = np.zeros(self.cat)
        # label is the current class; label-1 because the list is 0-indexed,
        # i.e. the position of the current label is set to 1
        temp_label[label - 1] = 1.0
        nodeUnder = np.ones([2 * sl - 1, 1])

        # n1, n2 are the child counts of the two kids: calculate n1, n2 and
        # n1+n2 for each node in the tree and store them in nodeUnder
        for i in range(sl, 2 * sl - 1):
            kids = allKids[i]
            n1 = nodeUnder[kids[0]]  # left child
            n2 = nodeUnder[kids[1]]  # right child
            nodeUnder[i] = n1 + n2   # number of leaves under node i

        cat_size = self.cat
        sentree.catDelta = np.zeros([cat_size, 2 * sl - 1])
        sentree.catDelta_out = np.zeros([self.d, 2 * sl - 1])

        # classifier on single words
        for i in range(sl):
            sm = softmax(np.dot(Wlab, words_embedded[:, i]) + blab)
            # note: unlike the paper, the label error is taken directly
            # as y_i - h(x)
            lbl_sm = (1 - self.alpha) * (temp_label - sm)
            # nodeScores has two parts: indices 0..sl-1 are filled here,
            # indices sl..2*sl-2 in the loop below
            sentree.nodeScores[i] = 1.0 / 2.0 * (np.dot(lbl_sm, (temp_label - sm)))
            sentree.catDelta[:, i] = -np.dot(lbl_sm, softmax_prime(sm))

        # classifier on internal nodes
        for i in range(sl, 2 * sl - 1):
            kids = allKids[i]
            c1 = sentree.nodeFeatures[:, kids[0]]  # word vector of the left child
            c2 = sentree.nodeFeatures[:, kids[1]]  # word vector of the right child

            # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1])
            p = tanh(np.dot(W1, c1) + np.dot(W2, c2) + b1)
            # See last paragraph in Section 2.3
            p_norm1 = p / norm(p)

            # Eq. (7) in the paper (for special case of 1d label)
            sm = softmax(np.dot(Wlab, p_norm1) + blab)
            beta = 0.5  # beta does not appear in the paper
            lbl_sm = beta * (1.0 - self.alpha) * (temp_label - sm)
            sentree.catDelta[:, i] = -np.dot(lbl_sm, softmax_prime(sm))
            J = 1.0 / 2.0 * (np.dot(lbl_sm, (temp_label - sm)))

            sentree.nodeFeatures[:, i] = p_norm1
            sentree.nodeFeatures_unnormalized[:, i] = p
            sentree.nodeScores[i] = J

        sentree.numkids = nodeUnder
        sentree.kids = allKids
    else:
        # compute the reconstruction error
        for j in range(sl - 1):
            size2 = np.size(words_embedded, 1)
            c1 = words_embedded[:, 0:-1]
            c2 = words_embedded[:, 1:]
            freq1 = freq[0:-1]
            freq2 = freq[1:]

            p = tanh(np.dot(W1, c1) + np.dot(W2, c2) +
                     np.reshape(b1, [self.d, 1]) * ([1] * (size2 - 1)))
            p_norm1 = p / np.sqrt(sum(p ** 2))

            # y1, y2 below are the reconstructions c1', c2' of the paper,
            # decoded from p
            y1_unnormalized = tanh(np.dot(W3, p_norm1) +
                                   np.reshape(b2, [self.d, 1]) * ([1] * (size2 - 1)))
            y2_unnormalized = tanh(np.dot(W4, p_norm1) +
                                   np.reshape(b3, [self.d, 1]) * ([1] * (size2 - 1)))
            y1 = y1_unnormalized / np.sqrt(sum(y1_unnormalized ** 2))
            y2 = y2_unnormalized / np.sqrt(sum(y2_unnormalized ** 2))

            y1c1 = self.alpha * (y1 - c1)
            y2c2 = self.alpha * (y2 - c2)

            # Eq. (4) in the paper: reconstruction error, one value per
            # candidate pair
            J = 1.0 / 2.0 * sum((y1c1) * (y1 - c1) + (y2c2) * (y2 - c2))

            # find the pair with the smallest reconstruction error for
            # constructing the tree
            J_min = min(J)
            J_minpos = np.argmin(J)

            # store the reconstructions (c1', c2') with the smallest error
            sentree.node_y1c1[:, sl + j] = y1c1[:, J_minpos]
            sentree.node_y2c2[:, sl + j] = y2c2[:, J_minpos]
            # deltas for backpropagating through the reconstruction
            sentree.nodeDelta_out1[:, sl + j] = np.dot(
                norm1tanh_prime(y1_unnormalized[:, J_minpos]), y1c1[:, J_minpos])
            sentree.nodeDelta_out2[:, sl + j] = np.dot(
                norm1tanh_prime(y2_unnormalized[:, J_minpos]), y2c2[:, J_minpos])

            # collapse the chosen pair into its parent
            words_embedded = np.delete(words_embedded, J_minpos + 1, 1)
            words_embedded[:, J_minpos] = p_norm1[:, J_minpos]

            sentree.nodeFeatures[:, sl + j] = p_norm1[:, J_minpos]
            sentree.nodeFeatures_unnormalized[:, sl + j] = p[:, J_minpos]
            sentree.nodeScores[sl + j] = J_min
            sentree.pp[collapsed_sentence[J_minpos]] = sl + j
            sentree.pp[collapsed_sentence[J_minpos + 1]] = sl + j
            sentree.kids[sl + j, :] = [collapsed_sentence[J_minpos],
                                       collapsed_sentence[J_minpos + 1]]
            sentree.numkids[sl + j] = sentree.numkids[sentree.kids[sl + j, 0]] + \
                sentree.numkids[sentree.kids[sl + j, 1]]

            freq = np.delete(freq, J_minpos + 1)
            freq[J_minpos] = (sentree.numkids[sentree.kids[sl + j, 0]] * freq1[J_minpos] +
                              sentree.numkids[sentree.kids[sl + j, 1]] * freq2[J_minpos]) / \
                (sentree.numkids[sentree.kids[sl + j, 0]] +
                 sentree.numkids[sentree.kids[sl + j, 1]])

            collapsed_sentence = np.delete(collapsed_sentence, J_minpos + 1)
            collapsed_sentence[J_minpos] = sl + j
    return sentree
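# The unsupervised branch of forwardProp greedily merges the adjacent pair of
# vectors with the smallest reconstruction error, replaces the pair by its
# parent, and repeats. The stripped-down NumPy sketch below shows that greedy
# loop on its own; it ignores the alpha weighting and word-frequency
# bookkeeping of the original, and all names are illustrative.
import numpy as np

def greedy_rae_merge(X, W1, W2, W3, W4, b1, b2, b3):
    """X: (d, n) column word vectors. Returns the sequence of merge positions."""
    X = X.copy()
    merges = []
    while X.shape[1] > 1:
        c1, c2 = X[:, :-1], X[:, 1:]                        # all adjacent pairs
        p = np.tanh(W1.dot(c1) + W2.dot(c2) + b1[:, None])  # encode each pair
        p = p / np.sqrt((p ** 2).sum(0))                    # length-normalise parents
        y1 = np.tanh(W3.dot(p) + b2[:, None])               # decode both children
        y2 = np.tanh(W4.dot(p) + b3[:, None])
        y1 = y1 / np.sqrt((y1 ** 2).sum(0))
        y2 = y2 / np.sqrt((y2 ** 2).sum(0))
        err = 0.5 * (((y1 - c1) ** 2) + ((y2 - c2) ** 2)).sum(0)
        k = int(np.argmin(err))                             # cheapest pair to merge
        merges.append(k)
        X[:, k] = p[:, k]                                   # replace the pair by its parent
        X = np.delete(X, k + 1, axis=1)
    return merges

rng = np.random.RandomState(0)
d = 4
print(greedy_rae_merge(rng.randn(d, 5),
                       rng.randn(d, d), rng.randn(d, d),
                       rng.randn(d, d), rng.randn(d, d),
                       np.zeros(d), np.zeros(d), np.zeros(d)))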
def build_model(tparams, options):
    """ Builds the entire computational graph used for training

    Basically does a forward pass through the data and calculates the cost
    function.

    [This function builds a model described in Section 3.1.2 onwards
    as the convolutional features are precomputed, some extra features
    which were not used are also implemented here.]

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters

    Returns
    -------
    trng : theano random number generator
        Used for dropout, etc.
    use_noise : theano shared variable
        flag that toggles noise on and off
    [x, mask, ctx, cnn_features] : theano variables
        Represent the captions, binary mask, and annotations
        for a single batch (see dimensions below)
    alphas : theano variables
        Attention weights
    cost : theano variable
        negative log likelihood
    opt_outs : OrderedDict
        extra outputs required depending on configuration in options
    """
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    # mask: #words x #samples
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x #visual_words x dim
    if options['with_glove']:
        ctx = tensor.tensor3('ctx', dtype='float32')
        new_ctx = ctx
    else:
        ctx = tensor.matrix('ctx', dtype='int32')
        new_ctx = tparams['VCemb'][ctx]
    # fc7 features: #samples x dim
    cnn_features = tensor.matrix('cnn_feats', dtype='float32')

    # index into the word embedding matrix, shift it forward in time,
    # the first element is zero. Shape: #timesteps x #samples x dim_word
    emb = tparams['Wemb'][x.flatten()].reshape(
        [x.shape[0], x.shape[1], options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    # forward-backward lstm encoder
    if options['lstm_encoder']:
        rval, encoder_alphas = get_layer('lstm_cond_nox')[1](tparams, options,
                                                             prefix='encoder',
                                                             context=new_ctx)
        ctx0 = rval.dimshuffle(1, 0, 2)
    else:
        ctx0 = new_ctx

    for lidx in range(options['n_layers_lstm']):
        init_state_prefix = 'CNNTrans_%d' % lidx if lidx > 0 else 'CNNTrans'
        init_memory_prefix = 'CNN_memory_%d' % lidx if lidx > 0 else 'CNN_memory'
        lstm_prefix = 'decoder_%d' % lidx if lidx > 0 else 'decoder'
        lstm_inps = proj_h if lidx > 0 else emb
        init_state = get_layer('ff')[1](tparams, cnn_features, options,
                                        prefix=init_state_prefix, activ='tanh')
        init_memory = get_layer('ff')[1](tparams, cnn_features, options,
                                         prefix=init_memory_prefix, activ='tanh')
        attn_updates = []
        proj, updates = get_layer('lstm_cond')[1](tparams, lstm_inps, options,
                                                  prefix=lstm_prefix,
                                                  mask=mask, context=ctx0,
                                                  one_step=False,
                                                  init_state=init_state,
                                                  init_memory=init_memory,
                                                  trng=trng,
                                                  use_noise=use_noise)
        attn_updates += updates
        proj_h = proj[0]

    alphas = proj[2]
    ctxs = proj[4]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)

    # compute word probabilities
    # [equation (7)]
    logit = get_layer('ff')[1](tparams, proj_h, options,
                               prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs, options,
                                    prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)

    # compute softmax
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # Index into the computed probability to give the log likelihood
    x_flat = x.flatten()
    p_flat = probs.flatten()
    cost = -tensor.log(
        p_flat[tensor.arange(x_flat.shape[0]) * probs.shape[1] + x_flat] + 1e-8)
    cost = cost.reshape([x.shape[0], x.shape[1]])
    masked_cost = cost * mask
    # align_cost = (-standard_aligns*alphas).sum(2)
    cost = masked_cost.sum(0)

    # optional outputs
    opt_outs = dict()

    if options['lstm_encoder']:
        return trng, use_noise, [x, mask, ctx, cnn_features], \
            [alphas, encoder_alphas], cost, opt_outs
    else:
        return trng, use_noise, [x, mask, ctx, cnn_features], \
            [alphas], cost, opt_outs
def build_sampler(tparams, options, use_noise, trng):
    """ Builds a sampler used for generating from the model

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    use_noise : boolean
        If true, add noise to the sampling
    trng : random number generator

    Returns
    -------
    f_init : theano function
        Input: annotation, Output: initial lstm state and memory
        (also performs transformation on ctx0 if using lstm_encoder)
    f_next : theano function
        Takes the previous word/state/memory + ctx0 and runs one
        step through the lstm (used for beam search)
    """
    # context: #annotations x dim
    if options['with_glove']:
        ctx = tensor.matrix('ctx_sampler', dtype='float32')
        new_ctx = ctx
    else:
        ctx = tensor.vector('ctx_sampler', dtype='int32')
        new_ctx = tparams['VCemb'][ctx]

    if options['lstm_encoder']:
        ctx0, _ = get_layer('lstm_cond_nox')[1](tparams, options,
                                                prefix='encoder',
                                                context=new_ctx)
    else:
        ctx0 = new_ctx

    # initial state/cell
    cnn_features = tensor.vector('x_feats', dtype='float32')
    init_state, init_memory = [], []
    for lidx in range(options['n_layers_lstm']):
        init_state_prefix = 'CNNTrans_%d' % lidx if lidx > 0 else 'CNNTrans'
        init_memory_prefix = 'CNN_memory_%d' % lidx if lidx > 0 else 'CNN_memory'
        init_state.append(get_layer('ff')[1](tparams, cnn_features, options,
                                             prefix=init_state_prefix,
                                             activ='tanh'))
        init_memory.append(get_layer('ff')[1](tparams, cnn_features, options,
                                              prefix=init_memory_prefix,
                                              activ='tanh'))

    print 'Building f_init...',
    f_init = theano.function([ctx, cnn_features],
                             [ctx0] + init_state + init_memory,
                             name='f_init', profile=False,
                             allow_input_downcast=True)
    print 'Done'

    # build f_next
    x = tensor.vector('x_sampler', dtype='int64')
    init_state = []
    init_memory = []
    for lidx in range(options['n_layers_lstm']):
        init_state.append(tensor.matrix('init_state', dtype='float32'))
        init_memory.append(tensor.matrix('init_memory', dtype='float32'))

    # for the first word (which is coded with -1), emb should be all zero
    emb = tensor.switch(x[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][x])

    next_state, next_memory, ctxs = [], [], []
    for lidx in range(options['n_layers_lstm']):
        decoder_prefix = 'decoder_%d' % lidx if lidx > 0 else 'decoder'
        inps = proj_h if lidx > 0 else emb
        proj = get_layer('lstm_cond')[1](tparams, inps, options,
                                         prefix=decoder_prefix,
                                         context=ctx0,
                                         one_step=True,
                                         init_state=init_state[lidx],
                                         init_memory=init_memory[lidx],
                                         trng=trng,
                                         use_noise=use_noise)
        next_state.append(proj[0])
        next_memory.append(proj[1])
        ctxs.append(proj[4])
        next_alpha = proj[2]
        proj_h = proj[0]

    if options['use_dropout']:
        proj_h = dropout_layer(proj[0], use_noise, trng)
    else:
        proj_h = proj[0]

    logit = get_layer('ff')[1](tparams, proj_h, options,
                               prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs[-1], options,
                                    prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    f_next = theano.function([x, ctx0] + init_state + init_memory,
                             [next_probs, next_sample, next_alpha] +
                             next_state + next_memory,
                             name='f_next', profile=False,
                             allow_input_downcast=True)

    return f_init, f_next
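# A sketch of how the f_init / f_next pair returned above is typically driven
# for greedy decoding. It assumes a single decoder layer, so that f_init
# returns [ctx0, state, memory] and f_next returns [probs, sample, alpha,
# state, memory], and it assumes the end-of-sentence token has id 0; both are
# assumptions, not guarantees made by the code above.
import numpy as np

def greedy_decode(f_init, f_next, ctx, cnn_feats, maxlen=30, eos_id=0):
    ctx0, state, memory = f_init(ctx, cnn_feats)
    word = np.array([-1], dtype='int64')        # -1 signals "no previous word"
    caption = []
    for _ in range(maxlen):
        probs, _, _, state, memory = f_next(word, ctx0, state, memory)
        word = probs.argmax(1).astype('int64')  # greedy choice of the next word
        if word[0] == eos_id:
            break
        caption.append(int(word[0]))
    return caption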
def build_sampler(tparams, options, use_noise, trng, sampling=True):
    """ Builds a sampler used for generating from the model

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    use_noise : boolean
        If true, add noise to the sampling
    trng : random number generator
    sampling : boolean
        [If it is true, when using stochastic attention, follows
        the learning rule described in section 4. at the bottom left of
        page 5]

    Returns
    -------
    f_init : theano function
        Input: annotation, Output: initial lstm state and memory
        (also performs transformation on ctx0 if using lstm_encoder)
    f_next : theano function
        Takes the previous word/state/memory + ctx0 and runs one
        step through the lstm (used for beam search)
    """
    # context: #annotations x dim
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams, ctx, options,
                                       prefix='encoder')[0]
        ctx_rev = get_layer('lstm')[1](tparams, ctx[::-1, :], options,
                                       prefix='encoder_rev')[0][::-1, :]
        ctx = tensor.concatenate((ctx_fwd, ctx_rev), axis=1)

    # initial state/cell
    ctx_mean = ctx.mean(0)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams, ctx_mean, options,
                                      prefix='ff_init_%d' % lidx,
                                      activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)
    init_state = [get_layer('ff')[1](tparams, ctx_mean, options,
                                     prefix='ff_state', activ='tanh')]
    init_memory = [get_layer('ff')[1](tparams, ctx_mean, options,
                                      prefix='ff_memory', activ='tanh')]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(get_layer('ff')[1](tparams, ctx_mean, options,
                                                 prefix='ff_state_%d' % lidx,
                                                 activ='tanh'))
            init_memory.append(get_layer('ff')[1](tparams, ctx_mean, options,
                                                  prefix='ff_memory_%d' % lidx,
                                                  activ='tanh'))

    print 'Building f_init...',
    f_init = theano.function([ctx], [ctx] + init_state + init_memory,
                             name='f_init', profile=False,
                             allow_input_downcast=True)
    print 'Done'

    # build f_next
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    x = tensor.vector('x_sampler', dtype='int64')
    init_state = [tensor.matrix('init_state', dtype='float32')]
    init_memory = [tensor.matrix('init_memory', dtype='float32')]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(tensor.matrix('init_state', dtype='float32'))
            init_memory.append(tensor.matrix('init_memory', dtype='float32'))

    # for the first word (which is coded with -1), emb should be all zero
    emb = tensor.switch(x[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][x])

    proj = get_layer('lstm_cond')[1](tparams, emb, options,
                                     prefix='decoder',
                                     mask=None, context=ctx,
                                     one_step=True,
                                     init_state=init_state[0],
                                     init_memory=init_memory[0],
                                     trng=trng,
                                     use_noise=use_noise,
                                     sampling=sampling)

    next_state, next_memory, ctxs = [proj[0]], [proj[1]], [proj[4]]
    proj_h = proj[0]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            proj = get_layer('lstm_cond')[1](tparams, proj_h, options,
                                             prefix='decoder_%d' % lidx,
                                             context=ctx,
                                             one_step=True,
                                             init_state=init_state[lidx],
                                             init_memory=init_memory[lidx],
                                             trng=trng,
                                             use_noise=use_noise,
                                             sampling=sampling)
            next_state.append(proj[0])
            next_memory.append(proj[1])
            ctxs.append(proj[4])
            proj_h = proj[0]

    if options['use_dropout']:
        proj_h = dropout_layer(proj[0], use_noise, trng)
    else:
        proj_h = proj[0]
    logit = get_layer('ff')[1](tparams, proj_h, options,
                               prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs[-1], options,
                                    prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    f_next = theano.function([x, ctx] + init_state + init_memory,
                             [next_probs, next_sample] + next_state + next_memory,
                             name='f_next', profile=False,
                             allow_input_downcast=True)

    return f_init, f_next
def build_model(tparams, options, sampling=True):
    """ Builds the entire computational graph used for training

    Basically does a forward pass through the data and calculates the cost function

    [This function builds the model described in Section 3.1.2 onwards;
    as the convolutional features are precomputed, some extra features
    which were not used are also implemented here.]

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    sampling : boolean
        [If it is true, when using stochastic attention, follows
        the learning rule described in section 4. at the bottom left of
        page 5]
    Returns
    -------
    trng: theano random number generator
        Used for dropout, stochastic attention, etc.
    use_noise: theano shared variable
        flag that toggles noise on and off
    [x, mask, ctx]: theano variables
        Represent the captions, binary mask, and annotations
        for a single batch (see dimensions below)
    alphas: theano variables
        Attention weights
    alpha_sample: theano variable
        Sampled attention weights used in REINFORCE for stochastic
        attention: [see the learning rule in eq (12)]
    cost: theano variable
        negative log likelihood
    opt_outs: OrderedDict
        extra outputs required depending on configuration in options
    """
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x #annotations x dim
    ctx = tensor.tensor3('ctx', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # index into the word embedding matrix, shift it forward in time
    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams, ctx.dimshuffle(1, 0, 2),
                                       options, prefix='encoder')[0].dimshuffle(1, 0, 2)
        ctx_rev = get_layer('lstm')[1](tparams, ctx.dimshuffle(1, 0, 2)[:, ::-1, :],
                                       options, prefix='encoder_rev')[0][:, ::-1, :].dimshuffle(1, 0, 2)
        ctx0 = tensor.concatenate((ctx_fwd, ctx_rev), axis=2)
    else:
        ctx0 = ctx

    # initial state/cell [top right on page 4]
    ctx_mean = ctx0.mean(1)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams, ctx_mean, options,
                                      prefix='ff_init_%d' % lidx, activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)

    init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh')
    init_memory = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_memory', activ='tanh')

    # lstm decoder
    # [equation (1), (2), (3) in section 3.1.2]
    attn_updates = []
    proj, updates = get_layer('lstm_cond')[1](tparams, emb, options,
                                              prefix='decoder',
                                              mask=mask, context=ctx0,
                                              one_step=False,
                                              init_state=init_state,
                                              init_memory=init_memory,
                                              trng=trng,
                                              use_noise=use_noise,
                                              sampling=sampling)
    attn_updates += updates
    proj_h = proj[0]

    # optional deep attention
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state = get_layer('ff')[1](tparams, ctx_mean, options,
                                            prefix='ff_state_%d' % lidx, activ='tanh')
            init_memory = get_layer('ff')[1](tparams, ctx_mean, options,
                                             prefix='ff_memory_%d' % lidx, activ='tanh')
            proj, updates = get_layer('lstm_cond')[1](tparams, proj_h, options,
                                                      prefix='decoder_%d' % lidx,
                                                      mask=mask, context=ctx0,
                                                      one_step=False,
                                                      init_state=init_state,
                                                      init_memory=init_memory,
                                                      trng=trng,
                                                      use_noise=use_noise,
                                                      sampling=sampling)
            attn_updates += updates
            proj_h = proj[0]

    alphas = proj[2]
    alpha_sample = proj[3]
    ctxs = proj[4]

    # [beta value explained in note 4.2.1 "doubly stochastic attention"]
    if options['selector']:
        sels = proj[5]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)

    # compute word probabilities
    # [equation (7)]
    logit = get_layer('ff')[1](tparams, proj_h, options, prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options,
                                       prefix='ff_logit_h%d' % lidx, activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)

    # compute softmax
    logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # Index into the computed probability to give the log likelihood
    x_flat = x.flatten()
    p_flat = probs.flatten()
    cost = -tensor.log(p_flat[tensor.arange(x_flat.shape[0]) * probs.shape[1] + x_flat] + 1e-8)
    cost = cost.reshape([x.shape[0], x.shape[1]])
    masked_cost = cost * mask
    cost = (masked_cost).sum(0)

    # optional outputs
    opt_outs = dict()
    if options['selector']:
        opt_outs['selector'] = sels
    if options['attn_type'] == 'stochastic':
        opt_outs['masked_cost'] = masked_cost  # need this for reinforce later
        opt_outs['attn_updates'] = attn_updates  # this is to update the rng

    return trng, use_noise, [x, mask, ctx], alphas, alpha_sample, cost, opt_outs
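# The cost at the end of build_model indexes the flattened softmax output with the
# flattened word ids and masks out padding before summing over time. A minimal NumPy
# sketch of that indexing; the helper name `masked_nll` and the toy arrays are
# illustrative, not part of the original code:
import numpy as np

def masked_nll(probs, x, mask):
    """probs: (n_timesteps*n_samples, n_words) softmax rows,
    x: (n_timesteps, n_samples) integer word ids,
    mask: (n_timesteps, n_samples) 0/1 float mask."""
    x_flat = x.flatten()
    p_flat = probs.flatten()
    # pick probs[t*n_samples + s, x[t, s]] for every position, as in build_model
    cost = -np.log(p_flat[np.arange(x_flat.shape[0]) * probs.shape[1] + x_flat] + 1e-8)
    cost = cost.reshape(x.shape)
    return (cost * mask).sum(0)  # one NLL per sample

# toy check: 2 timesteps, 1 sample, vocabulary of 3 words
probs = np.array([[0.2, 0.5, 0.3],
                  [0.1, 0.1, 0.8]])
x = np.array([[1], [2]])
mask = np.ones((2, 1), dtype='float32')
print(masked_nll(probs, x, mask))  # -(log 0.5 + log 0.8)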
def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
    """ Each variable is one time slice of the LSTM
    m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
    a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist),
    ct_- (context), pctx_ (projected context), dp_/dp_att_ (dropout masks)
    """
    # attention computation
    # [described in equations (4), (5), (6) in
    #  section "3.1.2 Decoder: Long Short-Term Memory Network"]
    pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])
    pctx_ = pctx_ + pstate_[:, None, :]
    pctx_list = []
    pctx_list.append(pctx_)
    pctx_ = tanh(pctx_)
    alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p(prefix, 'c_tt')]
    alpha_pre = alpha
    alpha_shp = alpha.shape

    if options['attn_type'] == 'deterministic':
        alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
        ctx_ = (context * alpha[:, :, None]).sum(1)  # current context
        alpha_sample = alpha  # you can return something else reasonable here to debug
    else:
        alpha = tensor.nnet.softmax(temperature_c * alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
        # TODO return alpha_sample
        if sampling:
            alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha, dtype=theano.config.floatX) \
                + (1. - h_sampling_mask) * alpha
        else:
            if argmax:
                alpha_sample = tensor.cast(
                    tensor.eq(tensor.arange(alpha_shp[1])[None, :],
                              tensor.argmax(alpha, axis=1, keepdims=True)),
                    theano.config.floatX)
            else:
                alpha_sample = alpha
        ctx_ = (context * alpha_sample[:, :, None]).sum(1)  # current context

    if options['selector']:
        sel_ = tensor.nnet.sigmoid(tensor.dot(h_, tparams[_p(prefix, 'W_sel')]) + tparams[_p(prefix, 'b_sel')])
        sel_ = sel_.reshape([sel_.shape[0]])
        ctx_ = sel_[:, None] * ctx_

    preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
    preact += x_
    preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

    # Recover the activations to the lstm gates
    # [equation (1)]
    i = _slice(preact, 0, dim)
    f = _slice(preact, 1, dim)
    o = _slice(preact, 2, dim)
    if options['use_dropout_lstm']:
        i = i * _slice(dp_, 0, dim)
        f = f * _slice(dp_, 1, dim)
        o = o * _slice(dp_, 2, dim)
    i = tensor.nnet.sigmoid(i)
    f = tensor.nnet.sigmoid(f)
    o = tensor.nnet.sigmoid(o)
    c = tensor.tanh(_slice(preact, 3, dim))

    # compute the new memory/hidden state
    # if the mask is 0, just copy the previous state
    c = f * c_ + i * c
    c = m_[:, None] * c + (1. - m_)[:, None] * c_

    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_

    rval = [h, c, alpha, alpha_sample, ctx_]
    if options['selector']:
        rval += [sel_]
    rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
    return rval
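# The deterministic branch of _step implements the soft attention of eqs. (4)-(6):
# project the previous hidden state, add it to the (pre-projected) annotations,
# squash with tanh, score each annotation, softmax, and take the expected context.
# A minimal NumPy sketch under simplified assumptions: the raw annotations stand in
# for the pre-projected context, and the helper name `soft_attention_step` is
# illustrative, not part of the original code.
import numpy as np

def soft_attention_step(h, ctx, Wd_att, U_att, c_att):
    """h: (n_samples, dim), ctx: (n_samples, n_annotations, ctx_dim),
    Wd_att: (dim, ctx_dim), U_att: (ctx_dim, 1), c_att: (1,)."""
    pctx = ctx + np.dot(h, Wd_att)[:, None, :]                 # eq. (4): add the projected state
    e = np.dot(np.tanh(pctx), U_att)[:, :, 0] + c_att          # unnormalised score per annotation
    e = e - e.max(axis=1, keepdims=True)                       # numerical stability
    alpha = np.exp(e) / np.exp(e).sum(axis=1, keepdims=True)   # eq. (5): softmax over annotations
    ctx_ = (ctx * alpha[:, :, None]).sum(1)                    # eq. (6): expected context vector
    return alpha, ctx_

# toy shapes: 2 samples, 5 annotations, ctx_dim 4, dim 3
rng = np.random.RandomState(0)
alpha, ctx_ = soft_attention_step(rng.randn(2, 3), rng.randn(2, 5, 4),
                                  rng.randn(3, 4), rng.randn(4, 1), np.zeros(1))
print(alpha.sum(axis=1))  # each row sums to 1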
def build_sampler(tparams, options, use_noise, trng, sampling=True):
    """ Builds a sampler used for generating from the model

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    use_noise: boolean
        If true, add noise to the sampling
    trng: random number generator
    sampling : boolean
        [If it is true, when using stochastic attention, follows
        the learning rule described in section 4. at the bottom left of
        page 5]
    Returns
    -------
    f_init : theano function
        Input: annotation, Output: initial lstm state and memory
        (also performs transformation on ctx0 if using lstm_encoder)
    f_next: theano function
        Takes the previous word/state/memory + ctx0 and runs one
        step through the lstm (used for beam search)
    """
    # context: #annotations x dim
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams, ctx, options, prefix='encoder')[0]
        ctx_rev = get_layer('lstm')[1](tparams, ctx[::-1, :], options,
                                       prefix='encoder_rev')[0][::-1, :]
        ctx = tensor.concatenate((ctx_fwd, ctx_rev), axis=1)

    # initial state/cell
    ctx_mean = ctx.mean(0)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams, ctx_mean, options,
                                      prefix='ff_init_%d' % lidx, activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)

    init_state = [get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh')]
    init_memory = [get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_memory', activ='tanh')]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(get_layer('ff')[1](tparams, ctx_mean, options,
                                                 prefix='ff_state_%d' % lidx, activ='tanh'))
            init_memory.append(get_layer('ff')[1](tparams, ctx_mean, options,
                                                  prefix='ff_memory_%d' % lidx, activ='tanh'))

    print 'Building f_init...',
    f_init = theano.function([ctx], [ctx] + init_state + init_memory,
                             name='f_init', profile=False, allow_input_downcast=True)
    print 'Done'

    # build f_next
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    x = tensor.vector('x_sampler', dtype='int64')
    init_state = [tensor.matrix('init_state', dtype='float32')]
    init_memory = [tensor.matrix('init_memory', dtype='float32')]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(tensor.matrix('init_state', dtype='float32'))
            init_memory.append(tensor.matrix('init_memory', dtype='float32'))

    # for the first word (which is coded with -1), emb should be all zero
    emb = tensor.switch(x[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][x])

    proj = get_layer('lstm_cond')[1](tparams, emb, options,
                                     prefix='decoder',
                                     mask=None, context=ctx,
                                     one_step=True,
                                     init_state=init_state[0],
                                     init_memory=init_memory[0],
                                     trng=trng,
                                     use_noise=use_noise,
                                     sampling=sampling)
    next_state, next_memory, ctxs = [proj[0]], [proj[1]], [proj[4]]
    proj_h = proj[0]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            proj = get_layer('lstm_cond')[1](tparams, proj_h, options,
                                             prefix='decoder_%d' % lidx,
                                             context=ctx,
                                             one_step=True,
                                             init_state=init_state[lidx],
                                             init_memory=init_memory[lidx],
                                             trng=trng,
                                             use_noise=use_noise,
                                             sampling=sampling)
            next_state.append(proj[0])
            next_memory.append(proj[1])
            ctxs.append(proj[4])
            proj_h = proj[0]

    if options['use_dropout']:
        proj_h = dropout_layer(proj[0], use_noise, trng)
    else:
        proj_h = proj[0]

    logit = get_layer('ff')[1](tparams, proj_h, options, prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs[-1], options, prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options,
                                       prefix='ff_logit_h%d' % lidx, activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear')
    logit_shp = logit.shape

    # next word probability
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    f_next = theano.function([x, ctx] + init_state + init_memory,
                             [next_probs, next_sample] + next_state + next_memory,
                             name='f_next', profile=False, allow_input_downcast=True)

    return f_init, f_next
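# build_sampler returns f_init/f_next, which the caller drives step by step (the
# original code does beam search). A minimal greedy driver, assuming only the
# input/output ordering visible above and that word id 0 marks end-of-sentence;
# the helper name `greedy_decode` is illustrative, not part of the original code.
import numpy as np

def greedy_decode(f_init, f_next, ctx0, maxlen=30, eos_id=0):
    """ctx0: (n_annotations, ctx_dim) annotation matrix for one image."""
    rval = f_init(ctx0)
    ctx = rval[0]
    states = rval[1:]                      # init_state list followed by init_memory list
    word = np.array([-1], dtype='int64')   # -1 is the start token in build_sampler
    caption = []
    for _ in range(maxlen):
        rval = f_next(word, ctx, *states)
        next_probs = rval[0]
        states = rval[2:]                  # next_state list followed by next_memory list
        word = next_probs.argmax(axis=1).astype('int64')   # greedy choice instead of sampling
        if word[0] == eos_id:
            break
        caption.append(int(word[0]))
    return caption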
def forwardProp(self, allKids, words_embedded, updateWlab, label, theta, freq):
    (W1, W2, W3, W4, Wlab, b1, b2, b3, blab, WL) = self.getParams(theta)
    # sl is the number of columns of words_embedded, i.e. the number of words in the sentence
    # allKids is empty at first: before training the parse tree has not been built yet;
    # the tree structure only appears after training, and allKids changes as the algorithm runs
    sl = np.size(words_embedded, 1)
    sentree = rnntree.rnntree(self.d, sl, words_embedded)
    collapsed_sentence = range(sl)

    # updateWlab mainly computes the sentiment error and corrects the sentiment weights;
    # that error also takes p as input, so p has to be computed here as well
    if updateWlab:
        temp_label = np.zeros(self.cat)
        # if cat = 4, temp_label starts as (0, 0, 0, 0); the next line sets the position of the label to 1
        temp_label[label - 1] = 1.0
        nodeUnder = np.ones([2 * sl - 1, 1])

        # this loop counts how many leaves sit under each node;
        # kids holds two values, the left and right child, so allKids[i] describes the
        # i-th internal node: allKids[i][0] is its left child, allKids[i][1] its right child
        for i in range(sl, 2 * sl - 1):  # calculate n1, n2 and n1+n2 for each node in the sentree and store in nodeUnder
            kids = allKids[i]
            n1 = nodeUnder[kids[0]]
            n2 = nodeUnder[kids[1]]
            nodeUnder[i] = n1 + n2

        cat_size = self.cat
        sentree.catDelta = np.zeros([cat_size, 2 * sl - 1])
        sentree.catDelta_out = np.zeros([self.d, 2 * sl - 1])

        # classifier on single words
        # process every word, i.e. every leaf node
        # (note: the sentiment error is computed for the leaves as well)
        for i in range(sl):
            sm = softmax(np.dot(Wlab, words_embedded[:, i]) + blab)
            # sentree.nodeScores stores the sentiment error of the node;
            # sentree.catDelta holds the related gradient term
            lbl_sm = (1 - self.alpha) * (temp_label - sm)
            sentree.nodeScores[i] = 1.0 / 2.0 * (np.dot(lbl_sm, (temp_label - sm)))
            sentree.catDelta[:, i] = -np.dot(lbl_sm, softmax_prime(sm))

        # sm = sigmoid(self.Wlab*words_embedded + self.blab)
        # lbl_sm = (1-self.alpha)*(label[:,np.ones(sl,1)] - sm)
        # sentree.nodeScores[:sl] = 1/2*(lbl_sm.*(label(:,ones(sl,1)) - sm))
        # sentree.catDelta[:, :sl] = -(lbl_sm).*sigmoid_prime(sm)

        # indices beyond sl are the parent (internal) nodes of the words
        for i in range(sl, 2 * sl - 1):
            kids = allKids[i]

            # c1, c2 are the vectors of the left and right children
            c1 = sentree.nodeFeatures[:, kids[0]]
            c2 = sentree.nodeFeatures[:, kids[1]]

            # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1])
            # compute p, the parent representation whose score decides which two nodes merge
            p = tanh(np.dot(W1, c1) + np.dot(W2, c2) + b1)

            # See last paragraph in Section 2.3
            p_norm1 = p / norm(p)

            # Eq. (7) in the paper (for special case of 1d label)
            # sm = sigmoid(np.dot(Wlab,p_norm1) + blab)
            # sm is the predicted sentiment label of this node
            sm = softmax(np.dot(Wlab, p_norm1) + blab)
            beta = 0.5

            # lbl_sm = beta * (1.0-self.alpha)*(label - sm)
            lbl_sm = beta * (1.0 - self.alpha) * (temp_label - sm)
            # sentree.catDelta[:, i] = -softmax_prime(sm)[:,label-1]
            # J = -(1.0-self.alpha)*np.log(sm[label-1])
            # sentree.catDelta[:, i] = -np.dot(lbl_sm,sigmoid_prime(sm))
            sentree.catDelta[:, i] = -np.dot(lbl_sm, softmax_prime(sm))
            # J = 1.0/2.0*(np.dot(lbl_sm,(label - sm)))
            J = 1.0 / 2.0 * (np.dot(lbl_sm, (temp_label - sm)))

            sentree.nodeFeatures[:, i] = p_norm1
            sentree.nodeFeatures_unnormalized[:, i] = p
            sentree.nodeScores[i] = J

        sentree.numkids = nodeUnder
        sentree.kids = allKids
    else:
        # this branch computes the reconstruction error
        for j in range(sl - 1):
            size2 = np.size(words_embedded, 1)
            # p holds one candidate per adjacent pair: c1 and c2 stack the vectors of
            # several words at once, so all candidate parents p (and the corresponding
            # errors J) are computed in a single shot rather than one pair at a time
            c1 = words_embedded[:, 0:-1]  # drop the last word
            c2 = words_embedded[:, 1:]    # drop the first word

            freq1 = freq[0:-1]
            freq2 = freq[1:]

            p = tanh(np.dot(W1, c1) + np.dot(W2, c2) + np.reshape(b1, [self.d, 1]) * ([1] * (size2 - 1)))
            p_norm1 = p / np.sqrt(sum(p ** 2))

            y1_unnormalized = tanh(np.dot(W3, p_norm1) + np.reshape(b2, [self.d, 1]) * ([1] * (size2 - 1)))
            y2_unnormalized = tanh(np.dot(W4, p_norm1) + np.reshape(b3, [self.d, 1]) * ([1] * (size2 - 1)))

            y1 = y1_unnormalized / np.sqrt(sum(y1_unnormalized ** 2))
            y2 = y2_unnormalized / np.sqrt(sum(y2_unnormalized ** 2))

            y1c1 = self.alpha * (y1 - c1)
            y2c2 = self.alpha * (y2 - c2)

            # Eq. (4) in the paper: reconstruction error
            J = 1.0 / 2.0 * sum((y1c1) * (y1 - c1) + (y2c2) * (y2 - c2))

            # finding the pair with smallest reconstruction error for constructing sentree
            J_min = min(J)
            J_minpos = np.argmin(J)

            # only internal nodes have reconstructions, so node_y1c1 is filled starting at index sl + j
            sentree.node_y1c1[:, sl + j] = y1c1[:, J_minpos]
            sentree.node_y2c2[:, sl + j] = y2c2[:, J_minpos]
            sentree.nodeDelta_out1[:, sl + j] = np.dot(norm1tanh_prime(y1_unnormalized[:, J_minpos]), y1c1[:, J_minpos])
            sentree.nodeDelta_out2[:, sl + j] = np.dot(norm1tanh_prime(y2_unnormalized[:, J_minpos]), y2c2[:, J_minpos])

            # once a pair is selected, remove its columns from words_embedded
            # and insert the merged node in their place
            words_embedded = np.delete(words_embedded, J_minpos + 1, 1)
            words_embedded[:, J_minpos] = p_norm1[:, J_minpos]

            sentree.nodeFeatures[:, sl + j] = p_norm1[:, J_minpos]
            sentree.nodeFeatures_unnormalized[:, sl + j] = p[:, J_minpos]
            sentree.nodeScores[sl + j] = J_min

            # pp stores the parent index: both children point to the same parent
            sentree.pp[collapsed_sentence[J_minpos]] = sl + j
            sentree.pp[collapsed_sentence[J_minpos + 1]] = sl + j

            sentree.kids[sl + j, :] = [collapsed_sentence[J_minpos], collapsed_sentence[J_minpos + 1]]
            sentree.numkids[sl + j] = sentree.numkids[sentree.kids[sl + j, 0]] + sentree.numkids[sentree.kids[sl + j, 1]]

            freq = np.delete(freq, J_minpos + 1)
            freq[J_minpos] = (sentree.numkids[sentree.kids[sl + j, 0]] * freq1[J_minpos] +
                              sentree.numkids[sentree.kids[sl + j, 1]] * freq2[J_minpos]) / \
                             (sentree.numkids[sentree.kids[sl + j, 0]] + sentree.numkids[sentree.kids[sl + j, 1]])

            collapsed_sentence = np.delete(collapsed_sentence, J_minpos + 1)
            collapsed_sentence[J_minpos] = sl + j

        # debug output: parent pointers and children of the constructed tree
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        print(sentree.pp)
        print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
        print(sentree.kids)

    return sentree
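# The else-branch above builds the tree greedily: score every adjacent pair, merge the
# pair with the smallest reconstruction error (Eq. (4)), and repeat until one node is
# left. A minimal NumPy sketch of one such merge step, reusing the W1..W4/b1..b3
# naming from forwardProp; the helper name and the alpha default are illustrative.
import numpy as np

def greedy_merge_step(words_embedded, W1, W2, W3, W4, b1, b2, b3, alpha=0.2):
    """Return the embedding matrix after one greedy merge, plus the merged
    position and its reconstruction error."""
    c1, c2 = words_embedded[:, :-1], words_embedded[:, 1:]      # all adjacent pairs at once
    p = np.tanh(W1.dot(c1) + W2.dot(c2) + b1[:, None])          # candidate parents, Eq. (2)
    p_norm = p / np.sqrt((p ** 2).sum(axis=0))
    y1 = np.tanh(W3.dot(p_norm) + b2[:, None])                  # reconstructions of the children
    y2 = np.tanh(W4.dot(p_norm) + b3[:, None])
    y1 = y1 / np.sqrt((y1 ** 2).sum(axis=0))
    y2 = y2 / np.sqrt((y2 ** 2).sum(axis=0))
    J = 0.5 * alpha * (((y1 - c1) ** 2).sum(axis=0) + ((y2 - c2) ** 2).sum(axis=0))
    j = int(np.argmin(J))                                       # cheapest pair to merge
    merged = np.delete(words_embedded, j + 1, axis=1)
    merged[:, j] = p_norm[:, j]                                 # parent replaces the pair
    return merged, j, J[j]

# toy run: 4 words of dimension 3 shrink to 3 columns after one merge
rng = np.random.RandomState(0)
d = 3
emb, j, err = greedy_merge_step(rng.randn(d, 4), rng.randn(d, d), rng.randn(d, d),
                                rng.randn(d, d), rng.randn(d, d),
                                np.zeros(d), np.zeros(d), np.zeros(d))
print(emb.shape, j, err)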
def forward(self, X):
    # Z = relu(X.dot(self.W1) + self.b1)
    Z = tanh(X.dot(self.W1) + self.b1)
    return softmax(Z.dot(self.W2) + self.b2), Z
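# A small usage sketch for the two-layer forward above: turning the softmax output
# into class predictions and a mean cross-entropy. `model` is assumed to be an object
# exposing forward(X) as defined above; the helper names are illustrative.
import numpy as np

def predict(model, X):
    probs, _ = model.forward(X)
    return np.argmax(probs, axis=1)

def cross_entropy(probs, targets):
    # mean negative log-likelihood of integer targets under the softmax rows
    return -np.mean(np.log(probs[np.arange(len(targets)), targets] + 1e-12))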
def forwardProp(self, allKids, words_embedded, updateWlab, label, theta, freq):
    (W1, W2, W3, W4, Wlab, b1, b2, b3, blab, WL) = self.getParams(theta)
    sl = np.size(words_embedded, 1)
    sentree = rnntree.rnntree(self.d, sl, words_embedded)
    collapsed_sentence = range(sl)
    if updateWlab:
        temp_label = np.zeros(self.cat)
        temp_label[label - 1] = 1.0
        nodeUnder = np.ones([2 * sl - 1, 1])

        # calculate n1, n2 and n1+n2 for each node in the sentree and store in nodeUnder
        for i in range(sl, 2 * sl - 1):
            kids = allKids[i]
            n1 = nodeUnder[kids[0]]
            n2 = nodeUnder[kids[1]]
            nodeUnder[i] = n1 + n2

        cat_size = self.cat
        sentree.catDelta = np.zeros([cat_size, 2 * sl - 1])
        sentree.catDelta_out = np.zeros([self.d, 2 * sl - 1])

        # classifier on single words
        for i in range(sl):
            sm = softmax(np.dot(Wlab, words_embedded[:, i]) + blab)
            lbl_sm = (1 - self.alpha) * (temp_label - sm)
            sentree.nodeScores[i] = 1.0 / 2.0 * (np.dot(lbl_sm, (temp_label - sm)))
            sentree.catDelta[:, i] = -np.dot(lbl_sm, softmax_prime(sm))

        # sm = sigmoid(self.Wlab*words_embedded + self.blab)
        # lbl_sm = (1-self.alpha)*(label[:,np.ones(sl,1)] - sm)
        # sentree.nodeScores[:sl] = 1/2*(lbl_sm.*(label(:,ones(sl,1)) - sm))
        # sentree.catDelta[:, :sl] = -(lbl_sm).*sigmoid_prime(sm)

        for i in range(sl, 2 * sl - 1):
            kids = allKids[i]
            c1 = sentree.nodeFeatures[:, kids[0]]
            c2 = sentree.nodeFeatures[:, kids[1]]

            # Eq. [2] in the paper: p = f(W[1][c1 c2] + b[1])
            p = tanh(np.dot(W1, c1) + np.dot(W2, c2) + b1)
            # See last paragraph in Section 2.3
            p_norm1 = p / norm(p)

            # Eq. (7) in the paper (for special case of 1d label)
            # sm = sigmoid(np.dot(Wlab,p_norm1) + blab)
            sm = softmax(np.dot(Wlab, p_norm1) + blab)
            beta = 0.5
            # lbl_sm = beta * (1.0-self.alpha)*(label - sm)
            lbl_sm = beta * (1.0 - self.alpha) * (temp_label - sm)
            # sentree.catDelta[:, i] = -softmax_prime(sm)[:,label-1]
            # J = -(1.0-self.alpha)*np.log(sm[label-1])
            # sentree.catDelta[:, i] = -np.dot(lbl_sm,sigmoid_prime(sm))
            sentree.catDelta[:, i] = -np.dot(lbl_sm, softmax_prime(sm))
            # J = 1.0/2.0*(np.dot(lbl_sm,(label - sm)))
            J = 1.0 / 2.0 * (np.dot(lbl_sm, (temp_label - sm)))

            sentree.nodeFeatures[:, i] = p_norm1
            sentree.nodeFeatures_unnormalized[:, i] = p
            sentree.nodeScores[i] = J

        sentree.numkids = nodeUnder
        sentree.kids = allKids
    else:
        # Reconstruction Error
        for j in range(sl - 1):
            size2 = np.size(words_embedded, 1)
            c1 = words_embedded[:, 0:-1]
            c2 = words_embedded[:, 1:]
            freq1 = freq[0:-1]
            freq2 = freq[1:]

            p = tanh(np.dot(W1, c1) + np.dot(W2, c2) +
                     np.reshape(b1, [self.d, 1]) * ([1] * (size2 - 1)))
            p_norm1 = p / np.sqrt(sum(p ** 2))

            y1_unnormalized = tanh(np.dot(W3, p_norm1) +
                                   np.reshape(b2, [self.d, 1]) * ([1] * (size2 - 1)))
            y2_unnormalized = tanh(np.dot(W4, p_norm1) +
                                   np.reshape(b3, [self.d, 1]) * ([1] * (size2 - 1)))
            y1 = y1_unnormalized / np.sqrt(sum(y1_unnormalized ** 2))
            y2 = y2_unnormalized / np.sqrt(sum(y2_unnormalized ** 2))

            y1c1 = self.alpha * (y1 - c1)
            y2c2 = self.alpha * (y2 - c2)

            # Eq. (4) in the paper: reconstruction error
            J = 1.0 / 2.0 * sum((y1c1) * (y1 - c1) + (y2c2) * (y2 - c2))

            # finding the pair with smallest reconstruction error for constructing sentree
            J_min = min(J)
            J_minpos = np.argmin(J)

            sentree.node_y1c1[:, sl + j] = y1c1[:, J_minpos]
            sentree.node_y2c2[:, sl + j] = y2c2[:, J_minpos]
            sentree.nodeDelta_out1[:, sl + j] = np.dot(
                norm1tanh_prime(y1_unnormalized[:, J_minpos]), y1c1[:, J_minpos])
            sentree.nodeDelta_out2[:, sl + j] = np.dot(
                norm1tanh_prime(y2_unnormalized[:, J_minpos]), y2c2[:, J_minpos])

            words_embedded = np.delete(words_embedded, J_minpos + 1, 1)
            words_embedded[:, J_minpos] = p_norm1[:, J_minpos]

            sentree.nodeFeatures[:, sl + j] = p_norm1[:, J_minpos]
            sentree.nodeFeatures_unnormalized[:, sl + j] = p[:, J_minpos]
            sentree.nodeScores[sl + j] = J_min

            sentree.pp[collapsed_sentence[J_minpos]] = sl + j
            sentree.pp[collapsed_sentence[J_minpos + 1]] = sl + j

            sentree.kids[sl + j, :] = [collapsed_sentence[J_minpos],
                                       collapsed_sentence[J_minpos + 1]]
            sentree.numkids[sl + j] = (sentree.numkids[sentree.kids[sl + j, 0]] +
                                       sentree.numkids[sentree.kids[sl + j, 1]])

            freq = np.delete(freq, J_minpos + 1)
            freq[J_minpos] = (
                sentree.numkids[sentree.kids[sl + j, 0]] * freq1[J_minpos] +
                sentree.numkids[sentree.kids[sl + j, 1]] * freq2[J_minpos]
            ) / (sentree.numkids[sentree.kids[sl + j, 0]] +
                 sentree.numkids[sentree.kids[sl + j, 1]])

            collapsed_sentence = np.delete(collapsed_sentence, J_minpos + 1)
            collapsed_sentence[J_minpos] = sl + j

    return sentree
def tanh(x):
    # thin element-wise wrapper; `sp` is assumed to be scipy imported elsewhere
    # (np.tanh is equivalent and avoids the long-deprecated scipy alias)
    return sp.tanh(x)
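# The backward passes in this file need the derivative of tanh, usually expressed in
# terms of the forward output y = tanh(x) (cf. the norm1tanh_prime calls above).
# A minimal sketch with a finite-difference check; `tanh_prime` is illustrative,
# not part of the original code.
import numpy as np

def tanh_prime(y):
    # d/dx tanh(x) = 1 - tanh(x)**2, written in terms of y = tanh(x)
    return 1.0 - y ** 2

# quick check against a central finite difference at x = 0.3
x, eps = 0.3, 1e-6
assert abs(tanh_prime(np.tanh(x)) - (np.tanh(x + eps) - np.tanh(x - eps)) / (2 * eps)) < 1e-8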