def build_sampler(tparams, options, use_noise, trng, sampling=True):
    """ Builds a sampler used for generating from the model

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    use_noise : boolean
        If true, add noise to the sampling
    trng : random number generator
    sampling : boolean
        [If it is true, when using stochastic attention, follows
        the learning rule described in section 4 at the bottom left of page 5]

    Returns
    -------
    f_init : theano function
        Input: annotation, Output: initial lstm state and memory
        (also performs transformation on ctx0 if using lstm_encoder)
    f_next : theano function
        Takes the previous word/state/memory + ctx0 and runs one step
        through the lstm (used for beam search)
    """
    # context: #annotations x dim
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams, ctx, options,
                                       prefix='encoder')[0]
        ctx_rev = get_layer('lstm')[1](tparams, ctx[::-1, :], options,
                                       prefix='encoder_rev')[0][::-1, :]
        ctx = tensor.concatenate((ctx_fwd, ctx_rev), axis=1)

    # initial state/cell
    ctx_mean = ctx.mean(0)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams, ctx_mean, options,
                                      prefix='ff_init_%d' % lidx,
                                      activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)

    init_state = [get_layer('ff')[1](tparams, ctx_mean, options,
                                     prefix='ff_state', activ='tanh')]
    init_memory = [get_layer('ff')[1](tparams, ctx_mean, options,
                                      prefix='ff_memory', activ='tanh')]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(get_layer('ff')[1](tparams, ctx_mean, options,
                                                 prefix='ff_state_%d' % lidx,
                                                 activ='tanh'))
            init_memory.append(get_layer('ff')[1](tparams, ctx_mean, options,
                                                  prefix='ff_memory_%d' % lidx,
                                                  activ='tanh'))

    print 'Building f_init...',
    f_init = theano.function([ctx], [ctx] + init_state + init_memory,
                             name='f_init', profile=False,
                             allow_input_downcast=True)
    print 'Done'

    # build f_next
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    x = tensor.vector('x_sampler', dtype='int64')
    init_state = [tensor.matrix('init_state', dtype='float32')]
    init_memory = [tensor.matrix('init_memory', dtype='float32')]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(tensor.matrix('init_state', dtype='float32'))
            init_memory.append(tensor.matrix('init_memory', dtype='float32'))

    # for the first word (which is coded with -1), emb should be all zero
    emb = tensor.switch(x[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][x])

    proj = get_layer('lstm_cond')[1](tparams, emb, options,
                                     prefix='decoder',
                                     mask=None, context=ctx,
                                     one_step=True,
                                     init_state=init_state[0],
                                     init_memory=init_memory[0],
                                     trng=trng,
                                     use_noise=use_noise,
                                     sampling=sampling)
    next_state, next_memory, ctxs = [proj[0]], [proj[1]], [proj[4]]
    proj_h = proj[0]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            proj = get_layer('lstm_cond')[1](tparams, proj_h, options,
                                             prefix='decoder_%d' % lidx,
                                             context=ctx,
                                             one_step=True,
                                             init_state=init_state[lidx],
                                             init_memory=init_memory[lidx],
                                             trng=trng,
                                             use_noise=use_noise,
                                             sampling=sampling)
            next_state.append(proj[0])
            next_memory.append(proj[1])
            ctxs.append(proj[4])
            proj_h = proj[0]

    if options['use_dropout']:
        proj_h = dropout_layer(proj[0], use_noise, trng)
    else:
        proj_h = proj[0]

    logit = get_layer('ff')[1](tparams, proj_h, options,
                               prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs[-1], options,
                                    prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    f_next = theano.function([x, ctx] + init_state + init_memory,
                             [next_probs, next_sample] + next_state + next_memory,
                             name='f_next', profile=False,
                             allow_input_downcast=True)

    return f_init, f_next
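# Illustrative sketch (not part of the original file): how the f_init/f_next pair
# returned by build_sampler above could drive a simple sampling loop for a single
# image. `ctx0` is assumed to be a numpy array of annotations (#annotations x dim),
# `options`/`tparams` the dictionaries already built elsewhere; `maxlen` and the
# assumption that word index 0 is <eos> are hypothetical here.
def _sampling_loop_example(f_init, f_next, ctx0, options, maxlen=30, greedy=True):
    n_lstm = options['n_layers_lstm']
    # f_init returns [transformed ctx] + per-layer initial states + memories (1-D),
    # which f_next expects as matrices of shape (1, dim)
    rval = f_init(ctx0)
    ctx = rval[0]
    states = [r.reshape((1, r.shape[0])) for r in rval[1:1 + n_lstm]]
    memories = [r.reshape((1, r.shape[0])) for r in rval[1 + n_lstm:]]
    sample = []
    next_w = numpy.array([-1], dtype='int64')  # -1 -> zero embedding (<bos>)
    for _ in range(maxlen):
        outs = f_next(*([next_w, ctx] + states + memories))
        next_p, next_sample = outs[0], outs[1]
        states = outs[2:2 + n_lstm]
        memories = outs[2 + n_lstm:]
        next_w = next_p.argmax(1) if greedy else next_sample
        sample.append(next_w[0])
        if next_w[0] == 0:  # assuming index 0 is <eos>
            break
    return sample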
def build_model(tparams, options, sampling=True):
    """ Builds the entire computational graph used for training

    Basically does a forward pass through the data and calculates the
    cost function

    [This function builds the model described in Section 3.1.2 onwards,
    as the convolutional features are precomputed; some extra features
    which were not used are also implemented here.]

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    sampling : boolean
        [If it is true, when using stochastic attention, follows
        the learning rule described in section 4 at the bottom left of page 5]

    Returns
    -------
    trng : theano random number generator
        Used for dropout, stochastic attention, etc
    use_noise : theano shared variable
        flag that toggles noise on and off
    [x, mask, ctx] : theano variables
        Represent the captions, binary mask, and annotations
        for a single batch (see dimensions below)
    alphas : theano variables
        Attention weights
    alpha_sample : theano variable
        Sampled attention weights used in REINFORCE for stochastic
        attention [see the learning rule in eq (12)]
    cost : theano variable
        negative log likelihood
    opt_outs : OrderedDict
        extra outputs required depending on configuration in options
    """
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples,
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x #annotations x dim
    ctx = tensor.tensor3('ctx', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # index into the word embedding matrix, shift it forward in time
    emb = tparams['Wemb'][x.flatten()].reshape(
        [n_timesteps, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams, ctx.dimshuffle(1, 0, 2),
                                       options,
                                       prefix='encoder')[0].dimshuffle(1, 0, 2)
        ctx_rev = get_layer('lstm')[1](tparams,
                                       ctx.dimshuffle(1, 0, 2)[:, ::-1, :],
                                       options,
                                       prefix='encoder_rev')[0][:, ::-1, :].dimshuffle(1, 0, 2)
        ctx0 = tensor.concatenate((ctx_fwd, ctx_rev), axis=2)
    else:
        ctx0 = ctx

    # initial state/cell [top right on page 4]
    ctx_mean = ctx0.mean(1)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams, ctx_mean, options,
                                      prefix='ff_init_%d' % lidx,
                                      activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)

    init_state = get_layer('ff')[1](tparams, ctx_mean, options,
                                    prefix='ff_state', activ='tanh')
    init_memory = get_layer('ff')[1](tparams, ctx_mean, options,
                                     prefix='ff_memory', activ='tanh')

    # lstm decoder
    # [equation (1), (2), (3) in section 3.1.2]
    attn_updates = []
    proj, updates = get_layer('lstm_cond')[1](tparams, emb, options,
                                              prefix='decoder',
                                              mask=mask, context=ctx0,
                                              one_step=False,
                                              init_state=init_state,
                                              init_memory=init_memory,
                                              trng=trng,
                                              use_noise=use_noise,
                                              sampling=sampling)
    attn_updates += updates
    proj_h = proj[0]

    # optional deep attention
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state = get_layer('ff')[1](tparams, ctx_mean, options,
                                            prefix='ff_state_%d' % lidx,
                                            activ='tanh')
            init_memory = get_layer('ff')[1](tparams, ctx_mean, options,
                                             prefix='ff_memory_%d' % lidx,
                                             activ='tanh')
            proj, updates = get_layer('lstm_cond')[1](tparams, proj_h, options,
                                                      prefix='decoder_%d' % lidx,
                                                      mask=mask, context=ctx0,
                                                      one_step=False,
                                                      init_state=init_state,
                                                      init_memory=init_memory,
                                                      trng=trng,
                                                      use_noise=use_noise,
                                                      sampling=sampling)
            attn_updates += updates
            proj_h = proj[0]

    alphas = proj[2]
    alpha_sample = proj[3]
    ctxs = proj[4]

    # [beta value explained in note 4.2.1 "doubly stochastic attention"]
    if options['selector']:
        sels = proj[5]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)

    # compute word probabilities
    # [equation (7)]
    logit = get_layer('ff')[1](tparams, proj_h, options,
                               prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs, options,
                                    prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)

    # compute softmax
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # Index into the computed probability to give the log likelihood
    x_flat = x.flatten()
    p_flat = probs.flatten()
    cost = -tensor.log(p_flat[tensor.arange(x_flat.shape[0]) * probs.shape[1]
                              + x_flat] + 1e-8)
    cost = cost.reshape([x.shape[0], x.shape[1]])
    masked_cost = cost * mask
    cost = (masked_cost).sum(0)

    # optional outputs
    opt_outs = dict()
    if options['selector']:
        opt_outs['selector'] = sels
    if options['attn_type'] == 'stochastic':
        opt_outs['masked_cost'] = masked_cost  # need this for reinforce later
        opt_outs['attn_updates'] = attn_updates  # this is to update the rng

    return trng, use_noise, [x, mask, ctx], alphas, alpha_sample, cost, opt_outs
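# Illustrative sketch (assumption, not the original training code): how the graph
# returned by build_model above could be compiled. It assumes deterministic
# attention, where the cost can be differentiated directly; with stochastic
# attention, opt_outs['masked_cost'] and opt_outs['attn_updates'] must instead be
# fed into the REINFORCE-style update of eq (12) by the caller.
def _compile_cost_example(tparams, options):
    trng, use_noise, inps, alphas, alpha_sample, cost, opt_outs = \
        build_model(tparams, options)
    cost = cost.mean()
    # per-batch mean negative log likelihood, callable on numeric inputs
    f_cost = theano.function(inps, cost,
                             profile=False, allow_input_downcast=True)
    # symbolic gradients w.r.t. all model parameters (deterministic attention)
    grads = tensor.grad(cost, wrt=list(tparams.values()))
    return f_cost, grads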
def build_sampler(tparams, options, use_noise, trng):
    """ Builds a sampler used for generating from the model

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    use_noise : boolean
        If true, add noise to the sampling
    trng : random number generator

    Returns
    -------
    f_init : theano function
        Input: annotation, Output: initial lstm state and memory
        (also performs transformation on ctx0 if using lstm_encoder)
    f_next : theano function
        Takes the previous word/state/memory + ctx0 and runs one step
        through the lstm (used for beam search)
    """
    # context: #annotations x dim
    if options['with_glove']:
        ctx = tensor.matrix('ctx_sampler', dtype='float32')
        new_ctx = ctx
    else:
        ctx = tensor.vector('ctx_sampler', dtype='int32')
        new_ctx = tparams['VCemb'][ctx]

    if options['lstm_encoder']:
        ctx0, _ = get_layer('lstm_cond_nox')[1](tparams, options,
                                                prefix='encoder',
                                                context=new_ctx)
    else:
        ctx0 = new_ctx

    # initial state/cell
    cnn_features = tensor.vector('x_feats', dtype='float32')
    init_state, init_memory = [], []
    for lidx in range(options['n_layers_lstm']):
        init_state_prefix = 'CNNTrans_%d' % lidx if lidx > 0 else 'CNNTrans'
        init_memory_prefix = 'CNN_memory_%d' % lidx if lidx > 0 else 'CNN_memory'
        init_state.append(get_layer('ff')[1](tparams, cnn_features, options,
                                             prefix=init_state_prefix,
                                             activ='tanh'))
        init_memory.append(get_layer('ff')[1](tparams, cnn_features, options,
                                              prefix=init_memory_prefix,
                                              activ='tanh'))

    print 'Building f_init...',
    f_init = theano.function([ctx, cnn_features],
                             [ctx0] + init_state + init_memory,
                             name='f_init', profile=False,
                             allow_input_downcast=True)
    print 'Done'

    # build f_next
    x = tensor.vector('x_sampler', dtype='int64')
    init_state = []
    init_memory = []
    for lidx in range(options['n_layers_lstm']):
        init_state.append(tensor.matrix('init_state', dtype='float32'))
        init_memory.append(tensor.matrix('init_memory', dtype='float32'))

    # for the first word (which is coded with -1), emb should be all zero
    emb = tensor.switch(x[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][x])

    next_state, next_memory, ctxs = [], [], []
    for lidx in range(options['n_layers_lstm']):
        decoder_prefix = 'decoder_%d' % lidx if lidx > 0 else 'decoder'
        inps = proj_h if lidx > 0 else emb
        proj = get_layer('lstm_cond')[1](tparams, inps, options,
                                         prefix=decoder_prefix,
                                         context=ctx0,
                                         one_step=True,
                                         init_state=init_state[lidx],
                                         init_memory=init_memory[lidx],
                                         trng=trng,
                                         use_noise=use_noise)
        next_state.append(proj[0])
        next_memory.append(proj[1])
        ctxs.append(proj[4])
        next_alpha = proj[2]
        proj_h = proj[0]

    if options['use_dropout']:
        proj_h = dropout_layer(proj[0], use_noise, trng)
    else:
        proj_h = proj[0]

    logit = get_layer('ff')[1](tparams, proj_h, options,
                               prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs[-1], options,
                                    prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    f_next = theano.function([x, ctx0] + init_state + init_memory,
                             [next_probs, next_sample, next_alpha] + next_state + next_memory,
                             name='f_next', profile=False,
                             allow_input_downcast=True)

    return f_init, f_next
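# Illustrative sketch (assumption): one decoding step with the sampler built above.
# It differs from the earlier build_sampler in two ways: f_init takes the
# visual-word context plus the fc7 `cnn_features` vector, and f_next additionally
# returns the attention weights before the per-layer states/memories. `ctx_in` and
# `feats` are assumed to be numpy arrays prepared by the caller.
def _one_step_example(f_init, f_next, ctx_in, feats, options):
    n_lstm = options['n_layers_lstm']
    rval = f_init(ctx_in, feats)  # [ctx0] + per-layer states + memories (1-D)
    ctx0_val = rval[0]
    states = [r.reshape((1, r.shape[0])) for r in rval[1:1 + n_lstm]]
    memories = [r.reshape((1, r.shape[0])) for r in rval[1 + n_lstm:]]
    next_w = numpy.array([-1], dtype='int64')  # -1 -> zero embedding (<bos>)
    outs = f_next(*([next_w, ctx0_val] + states + memories))
    probs, sample, alpha = outs[0], outs[1], outs[2]  # alpha: attention over visual words
    states = outs[3:3 + n_lstm]
    memories = outs[3 + n_lstm:]
    return probs, sample, alpha, states, memories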
def build_model(tparams, options):
    """ Builds the entire computational graph used for training

    Basically does a forward pass through the data and calculates the
    cost function

    [This function builds the model described in Section 3.1.2 onwards,
    as the convolutional features are precomputed; some extra features
    which were not used are also implemented here.]

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters

    Returns
    -------
    trng : theano random number generator
        Used for dropout, etc
    use_noise : theano shared variable
        flag that toggles noise on and off
    [x, mask, ctx, cnn_features] : theano variables
        Represent the captions, binary mask, and annotations
        for a single batch (see dimensions below)
    alphas : list of theano variables
        Attention weights (also the encoder attention weights
        when lstm_encoder is set)
    cost : theano variable
        negative log likelihood
    opt_outs : OrderedDict
        extra outputs required depending on configuration in options
    """
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples,
    x = tensor.matrix('x', dtype='int64')
    # mask: #samples,
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x #visual_words x dim
    if options['with_glove']:
        ctx = tensor.tensor3('ctx', dtype='float32')
        new_ctx = ctx
    else:
        ctx = tensor.matrix('ctx', dtype='int32')
        new_ctx = tparams['VCemb'][ctx]
    # fc7 features: #samples x dim
    cnn_features = tensor.matrix('cnn_feats', dtype='float32')

    # index into the word embedding matrix, shift it forward in time,
    # the first element is zero
    # Time step x S x D
    emb = tparams['Wemb'][x.flatten()].reshape(
        [x.shape[0], x.shape[1], options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    # forward-backward lstm encoder
    if options['lstm_encoder']:
        rval, encoder_alphas = get_layer('lstm_cond_nox')[1](tparams, options,
                                                             prefix='encoder',
                                                             context=new_ctx)
        ctx0 = rval.dimshuffle(1, 0, 2)
    else:
        ctx0 = new_ctx

    for lidx in range(options['n_layers_lstm']):
        init_state_prefix = 'CNNTrans_%d' % lidx if lidx > 0 else 'CNNTrans'
        init_memory_prefix = 'CNN_memory_%d' % lidx if lidx > 0 else 'CNN_memory'
        lstm_prefix = 'decoder_%d' % lidx if lidx > 0 else 'decoder'
        lstm_inps = proj_h if lidx > 0 else emb
        init_state = get_layer('ff')[1](tparams, cnn_features, options,
                                        prefix=init_state_prefix, activ='tanh')
        init_memory = get_layer('ff')[1](tparams, cnn_features, options,
                                         prefix=init_memory_prefix, activ='tanh')
        attn_updates = []
        proj, updates = get_layer('lstm_cond')[1](tparams, lstm_inps, options,
                                                  prefix=lstm_prefix,
                                                  mask=mask, context=ctx0,
                                                  one_step=False,
                                                  init_state=init_state,
                                                  init_memory=init_memory,
                                                  trng=trng,
                                                  use_noise=use_noise)
        attn_updates += updates
        proj_h = proj[0]

    alphas = proj[2]
    ctxs = proj[4]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)

    # compute word probabilities
    # [equation (7)]
    logit = get_layer('ff')[1](tparams, proj_h, options,
                               prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs, options,
                                    prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)

    # compute softmax
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # Index into the computed probability to give the log likelihood
    x_flat = x.flatten()
    p_flat = probs.flatten()
    cost = -tensor.log(p_flat[tensor.arange(x_flat.shape[0]) * probs.shape[1]
                              + x_flat] + 1e-8)
    cost = cost.reshape([x.shape[0], x.shape[1]])
    masked_cost = cost * mask
    # align_cost = (-standard_aligns*alphas).sum(2)
    cost = masked_cost.sum(0)

    # optional outputs
    opt_outs = dict()

    if options['lstm_encoder']:
        return trng, use_noise, [x, mask, ctx, cnn_features], \
            [alphas, encoder_alphas], cost, opt_outs
    else:
        return trng, use_noise, [x, mask, ctx, cnn_features], \
            [alphas], cost, opt_outs
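# Illustrative sketch (assumption): compiling the graph from this build_model
# variant. Note it takes four inputs (captions, mask, visual-word context and fc7
# features) and returns a list of attention weights rather than alpha_sample,
# since attention here is deterministic.
def _compile_model_example(tparams, options):
    trng, use_noise, inps, alphas_list, cost, opt_outs = \
        build_model(tparams, options)
    use_noise.set_value(numpy.float32(1.))  # enable dropout noise for training
    cost = cost.mean()
    f_cost = theano.function(inps, cost,
                             profile=False, allow_input_downcast=True)
    grads = tensor.grad(cost, wrt=list(tparams.values()))
    return f_cost, grads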