Example #1
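Both snippets below are excerpts from a Theano-based attentional encoder-decoder that fuses an external neural language model into the decoder's output layer. They rely on module-level context that is not shown in the source; a minimal sketch of what they assume is given here, with the project-specific helpers only named, since their import paths do not appear in the excerpts:

import numpy
import theano
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

profile = False  # module-level flag passed to theano.function below

# get_layer, concatenate, NeuralLM, LSTM and FullOutputLayer are helpers defined
# elsewhere in the original codebase (and its LM toolkit); tparams is an
# OrderedDict of named shared variables, options a dict of hyperparameters.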
def build_sampler(tparams, options, trng):
    x = tensor.matrix('x', dtype='int64')
    xr = x[::-1]
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # word embedding (source)
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    embr = tparams['Wemb'][xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])

    # encoder
    proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder')
    projr = get_layer(options['encoder'])[1](tparams, embr, options, prefix='encoder_r')
    ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1)
    ctx_mean = ctx.mean(0)
    #ctx_mean = concatenate([proj[0][-1],projr[0][-1]], axis=proj[0].ndim-2)
    init_state = get_layer('ff')[1](tparams, ctx_mean, options, 
                                    prefix='ff_state', activ='tanh')

    print 'Building f_init...',
    outs = [init_state, ctx]
    f_init = theano.function([x], outs, name='f_init', profile=profile)
    print 'Done'

    # x: 1 x 1
    y = tensor.vector('y_sampler', dtype='int64')
    init_state = tensor.matrix('init_state', dtype='float32')

    # the external LM consumes batch-major input (n_samples x n_timesteps), as
    # in build_model below; at sampling time y is a vector with one word per
    # sample, so give it a singleton time axis
    lmy = y.dimshuffle((0, 'x'))
    lmmodel = NeuralLM(options['n_words'], test_data=None, input_tensor=lmy)
    lmmodel.stack(LSTM(hidden_size=options['lmdim'], output_type="sequence",
                       persistent_state=True, batch_size=options['batch_size'],
                       reset_state_for_input=0),
                  FullOutputLayer(options['n_words']))

    # LM hidden state for the single time step: (n_samples, lmdim), so it lines
    # up with next_state and emb in the logit sum below
    lmhidden = lmmodel._hidden_outputs[1].dimshuffle((1, 0, 2))[0]

    # if it's the first word, emb should be all zero
    emb = tensor.switch(y[:,None] < 0, 
                        tensor.alloc(0., 1, tparams['Wemb_dec'].shape[1]), 
                        tparams['Wemb_dec'][y])
    proj = get_layer(options['decoder'])[1](tparams, emb, options, 
                                            prefix='decoder', 
                                            mask=None, context=ctx, 
                                            one_step=True, 
                                            init_state=init_state)
    next_state = proj[0]
    ctxs = proj[1]

    logit_lstm = get_layer('ff')[1](tparams, next_state, options, 
                                    prefix='ff_logit_lstm', activ='linear')
    logit_prev = get_layer('ff')[1](tparams, emb, options, 
                                    prefix='ff_logit_prev', activ='linear')
    logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 
                                    prefix='ff_logit_ctx', activ='linear')
    logit_lm = get_layer('ff')[1](tparams, lmhidden, options,
                                    prefix='ff_logit_lm', activ='linear')

    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx + logit_lm)

    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    print 'Building f_next..', 
    inps = [y, ctx, init_state]
    outs = [next_probs, next_sample, next_state]
    f_next = theano.function(inps, outs, name='f_next', profile=profile)
    print 'Done'

    return f_init, f_next
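f_init encodes the source once, and f_next is then called one target word at a time, feeding next_state back in as init_state. A minimal sketch of a sampling loop over these two compiled functions, under the assumptions noted in the comments (the driver function itself is illustrative, not part of the source):

import numpy

def sample_once(f_init, f_next, x, maxlen=30, stochastic=True):
    # x: int64 matrix of shape (n_timesteps, 1), a single source sentence
    init_state, ctx = f_init(x)
    sample = []
    # -1 signals the first step, where the decoder uses the all-zero embedding
    next_w = -1 * numpy.ones((1,)).astype('int64')
    for _ in range(maxlen):
        next_p, next_sample, init_state = f_next(next_w, ctx, init_state)
        next_w = next_sample if stochastic else next_p.argmax(1)
        next_w = next_w.astype('int64')
        sample.append(next_w[0])
        if next_w[0] == 0:  # assumes index 0 is the end-of-sentence token
            break
    return sample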
Example #2
def build_model(tparams, options):
    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')

    xr = x[::-1]
    xr_mask = x_mask[::-1]

    n_timesteps = x.shape[0]
    n_timesteps_trg = y.shape[0]
    n_samples = x.shape[1]
    # (maxlen_x, n_samples)
    # shares=n_timesteps_trg/options['history_len']

    # lmy=y.dimshuffle(1,0).reshape([shares, n_samples, options['history_len']])
    # feed the target sequence to the external LM in batch-major order
    # (n_samples x n_timesteps_trg)
    lmy = y.dimshuffle((1, 0))
    lmmodel = NeuralLM(options['n_words'], test_data=None, input_tensor=lmy)
    lmmodel.stack(LSTM(hidden_size=options['lmdim'], output_type="sequence",
                       persistent_state=True, batch_size=options['batch_size'],
                       reset_state_for_input=0),
                  FullOutputLayer(options['n_words']))

    # LM hidden states back to time-major: (n_timesteps_trg, n_samples, lmdim)
    lmhidden = lmmodel._hidden_outputs[1].dimshuffle((1, 0, 2))

    # lmhid=[]
    #
    # for i in range(shares):
    #     lmhid.append(theano.clone(hiddenoutput, replace={inputx: lmy[i]}, strict=False))
    #
    # lmhidden=tensor.concatenate(lmhid, axis=0)

    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    proj = get_layer(options['encoder'])[1](tparams, emb, options,
                                            prefix='encoder',
                                            mask=x_mask)
    embr = tparams['Wemb'][xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
    projr = get_layer(options['encoder'])[1](tparams, embr, options,
                                             prefix='encoder_r',
                                             mask=xr_mask)
    ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim-1)
    ctx_mean = (ctx * x_mask[:,:,None]).sum(0) / x_mask.sum(0)[:,None]
    #ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2)

    # initial decoder state
    init_state = get_layer('ff')[1](tparams, ctx_mean, options, 
                                    prefix='ff_state', activ='tanh')

    # word embedding (target)
    emb = tparams['Wemb_dec'][y.flatten()]
    emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted
    # decoder
    proj = get_layer(options['decoder'])[1](tparams, emb, options, 
                                            prefix='decoder', 
                                            mask=y_mask, context=ctx, 
                                            context_mask=x_mask,
                                            one_step=False, 
                                            init_state=init_state)
    proj_h = proj[0]
    ctxs = proj[1]
    opt_ret['dec_alphas'] = proj[2]

    # compute word probabilities
    logit_lstm = get_layer('ff')[1](tparams, proj_h, options, 
                                    prefix='ff_logit_lstm', activ='linear')
    logit_prev = get_layer('ff')[1](tparams, emb, options, 
                                    prefix='ff_logit_prev', activ='linear')
    logit_ctx = get_layer('ff')[1](tparams, ctxs, options, 
                                    prefix='ff_logit_ctx', activ='linear')
    logit_lm = get_layer('ff')[1](tparams, lmhidden, options,
                                    prefix='ff_logit_lm', activ='linear')

    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx + logit_lm)
    logit = get_layer('ff')[1](tparams, logit, options, 
                               prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], 
                                               logit_shp[2]]))

    # cost
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx])
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)

    return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, lmmodel
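The returned cost is a per-sentence negative log-likelihood vector of length n_samples. A minimal sketch of how it could be compiled and optimized, assuming plain SGD over tparams (the optimizer and the function names below are illustrative, not taken from the source):

trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, lmmodel = build_model(tparams, options)
inps = [x, x_mask, y, y_mask]

# per-sentence log-probabilities, e.g. for validation
f_log_probs = theano.function(inps, cost, profile=profile)

# mini-batch mean cost and its gradients w.r.t. the translation model parameters
cost_mean = cost.mean()
params = list(tparams.values())
grads = tensor.grad(cost_mean, wrt=params)

# plain SGD update with a symbolic learning rate
lrate = tensor.scalar('lrate')
updates = [(p, p - lrate * g) for p, g in zip(params, grads)]
f_update = theano.function(inps + [lrate], cost_mean, updates=updates, profile=profile)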