def build_sampler(tparams, options, trng): x = tensor.matrix('x', dtype='int64') xr = x[::-1] n_timesteps = x.shape[0] n_samples = x.shape[1] # word embedding (source) emb = tparams['Wemb'][x.flatten()] emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) embr = tparams['Wemb'][xr.flatten()] embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) # encoder proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder') projr = get_layer(options['encoder'])[1](tparams, embr, options, prefix='encoder_r') ctx = concatenate([proj[0],projr[0][::-1]], axis=proj[0].ndim-1) ctx_mean = ctx.mean(0) #ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2) init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh') print 'Building f_init...', outs = [init_state, ctx] f_init = theano.function([x], outs, name='f_init', profile=profile) print 'Done' # x: 1 x 1 y = tensor.vector('y_sampler', dtype='int64') init_state = tensor.matrix('init_state', dtype='float32') lmy=y.dimshuffle((1,0)) # inputx=tensor.imatrix('x') lmmodel = NeuralLM(options['n_words'], test_data=None, input_tensor=lmy) lmmodel.stack(LSTM(hidden_size=options['lmdim'], output_type="sequence", persistent_state=True, batch_size=options['batch_size'], reset_state_for_input=0), FullOutputLayer(options['n_words'])) lmhidden =lmmodel._hidden_outputs[1].dimshuffle((1,0,2)) # if it's the first word, emb should be all zero emb = tensor.switch(y[:,None] < 0, tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), tparams['Wemb_dec'][y]) proj = get_layer(options['decoder'])[1](tparams, emb, options, prefix='decoder', mask=None, context=ctx, one_step=True, init_state=init_state) next_state = proj[0] ctxs = proj[1] logit_lstm = get_layer('ff')[1](tparams, next_state, options, prefix='ff_logit_lstm', activ='linear') logit_prev = get_layer('ff')[1](tparams, emb, options, prefix='ff_logit_prev', activ='linear') logit_ctx = get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') logit_lm = get_layer('ff')[1](tparams, lmhidden, options, prefix='ff_logit_lm', activ='linear') logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx+ logit_lm) logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear') next_probs = tensor.nnet.softmax(logit) next_sample = trng.multinomial(pvals=next_probs).argmax(1) # next word probability print 'Building f_next..', inps = [y, ctx, init_state] outs = [next_probs, next_sample, next_state] f_next = theano.function(inps, outs, name='f_next', profile=profile) print 'Done' return f_init, f_next
def build_model(tparams, options): opt_ret = dict() trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.)) # description string: #words x #samples x = tensor.matrix('x', dtype='int64') x_mask = tensor.matrix('x_mask', dtype='float32') y = tensor.matrix('y', dtype='int64', ) y_mask = tensor.matrix('y_mask', dtype='float32') xr = x[::-1] xr_mask = x_mask[::-1] n_timesteps = x.shape[0] n_timesteps_trg = y.shape[0] n_samples = x.shape[1] # (maxlen_x, n_samples) # shares=n_timesteps_trg/options['history_len'] # lmy=y.dimshuffle(1,0).reshape([shares, n_samples, options['history_len']]) lmy=y.dimshuffle((1,0)) # inputx=tensor.imatrix('x') lmmodel = NeuralLM(options['n_words'], test_data=None, input_tensor=lmy) lmmodel.stack(LSTM(hidden_size=options['lmdim'], output_type="sequence", persistent_state=True, batch_size=options['batch_size'], reset_state_for_input=0), FullOutputLayer(options['n_words'])) lmhidden =lmmodel._hidden_outputs[1].dimshuffle((1,0,2)) # lmhid=[] # # for i in range(shares): # lmhid.append(theano.clone(hiddenoutput, replace={inputx: lmy[i]}, strict=False)) # # lmhidden=tensor.concatenate(lmhid, axis=0) emb = tparams['Wemb'][x.flatten()] emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder', mask=x_mask) embr = tparams['Wemb'][xr.flatten()] embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) projr = get_layer(options['encoder'])[1](tparams, embr, options, prefix='encoder_r', mask=xr_mask) ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1) ctx_mean = (ctx * x_mask[:,:,None]).sum(0) / x_mask.sum(0)[:,None] #ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2) # initial decoder state init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh') # word embedding (target) emb = tparams['Wemb_dec'][y.flatten()] emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) emb_shifted = tensor.zeros_like(emb) emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) emb = emb_shifted # decoder proj = get_layer(options['decoder'])[1](tparams, emb, options, prefix='decoder', mask=y_mask, context=ctx, context_mask=x_mask, one_step=False, init_state=init_state) proj_h = proj[0] ctxs = proj[1] opt_ret['dec_alphas'] = proj[2] # compute word probabilities logit_lstm = get_layer('ff')[1](tparams, proj_h, options, prefix='ff_logit_lstm', activ='linear') logit_prev = get_layer('ff')[1](tparams, emb, options, prefix='ff_logit_prev', activ='linear') logit_ctx = get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') logit_lm = get_layer('ff')[1](tparams, lmhidden, options, prefix='ff_logit_lm', activ='linear') logit = tensor.tanh(logit_lstm+logit_prev+logit_ctx+ logit_lm) logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear') logit_shp = logit.shape probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) # cost y_flat = y.flatten() y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat cost = -tensor.log(probs.flatten()[y_flat_idx]) cost = cost.reshape([y.shape[0],y.shape[1]]) cost = (cost * y_mask).sum(0) return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, lmmodel