def rmsprop(self, lr, tparams, grads, inp_list, cost, params):
    """Compile the pair of Theano functions that drive RMSprop.

    Args:
        lr: Symbolic learning rate; the single input of ``f_update``.
        tparams: Mapping of parameter name to Theano shared variable.
        grads: Symbolic gradients, paired positionally with ``tparams``.
        inp_list: Symbolic inputs of ``f_grad_shared``.
        cost: Symbolic cost returned by ``f_grad_shared``.
        params: Dict read for 'grad_clip', 'decay_rate' and 'smooth_eps'.

    Returns:
        Tuple ``(f_grad_shared, f_update, zipped_grads, running_grads2,
        updir)``: the two compiled functions followed by the lists of
        shared variables holding the latest gradients, the running mean of
        squared gradients and the last update direction.
    """
    grad_clip = params['grad_clip']
    rho = params['decay_rate']
    eps = params['smooth_eps']

    def _zeros_like_params(name_fmt):
        # One zero-valued shared variable per parameter, named after it.
        return [theano.shared(p.get_value() * numpy_floatX(0.),
                              name=name_fmt % k)
                for k, p in tparams.items()]

    zipped_grads = _zeros_like_params('%s_grad')
    running_grads2 = _zeros_like_params('%s_rgrad2')

    # Clipping (when enabled) only affects the squared-gradient statistic;
    # the gradients stored in zipped_grads stay unclipped.
    if grad_clip > 0:
        squared = [tensor.clip(g, -grad_clip, grad_clip) ** 2 for g in grads]
    else:
        squared = [g ** 2 for g in grads]

    zgup = list(zip(zipped_grads, grads))
    rg2up = [(rg2, rho * rg2 + (1 - rho) * sq)
             for rg2, sq in zip(running_grads2, squared)]

    f_grad_shared = theano.function(inp_list, cost,
                                    updates=zgup + rg2up,
                                    name='rmsprop_f_grad_shared')

    updir = _zeros_like_params('%s_updir')
    steps = [-lr * zg / (tensor.sqrt(rg2) + eps)
             for zg, rg2 in zip(zipped_grads, running_grads2)]
    updir_new = list(zip(updir, steps))
    param_up = [(p, p + step) for p, step in zip(tparams.values(), steps)]

    f_update = theano.function([lr], [],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update, zipped_grads, running_grads2, updir
def lstm_enc_layer(self, tparams, state_below, prefix='lstm'):
    """Unroll a stacked LSTM encoder over ``state_below`` with theano.scan.

    Args:
        tparams: Mapping of parameter name to Theano shared variable; read
            for the ``W_inp``, ``W_hid``, ``b`` and per-layer
            ``W_hid_<k>`` / ``W_inp_<k>`` entries under ``prefix``.
        state_below: Symbolic input sequence. Axis 0 is iterated by scan;
            when it is 3-D, axis 1 is used as the number of samples.
        prefix: Parameter-name prefix passed to ``_p``.

    Returns:
        ``(rval, updates)`` exactly as returned by ``theano.scan``. Each
        step emits the per-layer hidden states concatenated with the top
        layer output (``hidden_depth + 1`` chunks) and the concatenated
        cell states (``hidden_depth`` chunks).
    """
    depth = self.hidden_depth
    width = self.hidden_size
    n_steps = state_below.shape[0]
    batch = state_below.shape[1] if state_below.ndim == 3 else 1

    sigmoid = tensor.nnet.sigmoid

    def _step(x_in, h_, c_):
        # The input projection (including the bias) is precomputed outside
        # the scan, so only the recurrent term is added here.
        preact = tensor.dot(sliceT(h_, 0, width),
                            tparams[_p(prefix, 'W_hid')]) + x_in

        hidden, cells, outs = [], [], []
        for di in range(depth):
            in_gate = sigmoid(sliceT(preact, 0, width))
            forget_gate = sigmoid(sliceT(preact, 1, width))
            out_gate = sigmoid(sliceT(preact, 2, width))
            candidate = tensor.tanh(sliceT(preact, 3, width))

            cell = forget_gate * sliceT(c_, di, width) + in_gate * candidate
            hid = out_gate * tensor.tanh(cell)

            layer_out = hid
            if self.en_residual_conn and di > 0:
                # Residual connection: add the output of the layer below.
                layer_out = layer_out + outs[di - 1]
                print("Connecting residual at %d" % (di))

            cells.append(cell)
            hidden.append(hid)
            outs.append(layer_out)

            if di < (depth - 1):
                # Pre-activation of the next layer: its own recurrent term
                # plus the (possibly residual) output of this layer.
                preact = tensor.dot(sliceT(h_, di+1, width), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                    tensor.dot(layer_out, tparams[_p(prefix, ('W_inp_' + str(di+1)))])

        c_out = tensor.concatenate(cells, axis=1)
        h_out = tensor.concatenate(hidden + [outs[-1]], axis=1)
        return h_out, c_out

    projected = (tensor.dot(state_below, tparams[_p(prefix, 'W_inp')]) +
                 tparams[_p(prefix, 'b')])

    init_h = tensor.alloc(numpy_floatX(0.), batch, (depth + 1) * width)
    init_c = tensor.alloc(numpy_floatX(0.), batch, depth * width)

    rval, updates = theano.scan(_step,
                                sequences=[projected],
                                outputs_info=[init_h, init_c],
                                name=_p(prefix, '_layers'),
                                n_steps=n_steps)
    return rval, updates
def lstm_layer(self, tparams, state_below, aux_input, use_noise, options, prefix='lstm', mask=None):
    """Unroll a stacked LSTM over ``state_below`` with theano.scan.

    Args:
        tparams: Mapping of parameter name to Theano shared variable.
        state_below: Symbolic input sequence; axis 0 is iterated by scan
            and, when it is 3-D, axis 1 gives the number of samples.
        aux_input: Auxiliary input handed to every step as a non-sequence;
            replaced by an empty list when ``options['en_aux_inp']`` is 0.
        use_noise: Not read by this function.
        options: Dict read for 'hidden_depth' (default 1), 'hidden_size'
            and 'en_aux_inp' (default 0).
        prefix: Parameter-name prefix passed to ``_p``.
        mask: Required. It is fed to scan as a sequence, but the step
            function does not apply it.

    Returns:
        ``(rval, updates)`` as returned by ``theano.scan``; each step emits
        the concatenated per-layer hidden states and cell states.
    """
    n_steps = state_below.shape[0]
    depth = options.get('hidden_depth', 1)
    width = options['hidden_size']
    batch = state_below.shape[1] if state_below.ndim == 3 else 1

    assert mask is not None

    sigmoid = tensor.nnet.sigmoid

    def _step(m_, x_, h_, c_, xAux):
        # x_ already contains the input projection and the bias.
        preact = tensor.dot(sliceT(h_, 0, width),
                            tparams[_p(prefix, 'W_hid')]) + x_
        if options.get('en_aux_inp', 0):
            preact = preact + tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

        hidden, cells = [], []
        for di in range(depth):
            in_gate = sigmoid(sliceT(preact, 0, width))
            forget_gate = sigmoid(sliceT(preact, 1, width))
            out_gate = sigmoid(sliceT(preact, 2, width))
            candidate = tensor.tanh(sliceT(preact, 3, width))

            cell = forget_gate * sliceT(c_, di, width) + in_gate * candidate
            hid = out_gate * tensor.tanh(cell)
            cells.append(cell)
            hidden.append(hid)

            if di < (depth - 1):
                # Feed this layer's hidden state into the next layer.
                preact = tensor.dot(sliceT(h_, di+1, width), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                    tensor.dot(hid, tparams[_p(prefix, ('W_inp_' + str(di+1)))])

        c_out = tensor.concatenate(cells, axis=1)
        h_out = tensor.concatenate(hidden, axis=1)
        return h_out, c_out

    projected = (tensor.dot(state_below, tparams[_p(prefix, 'W_inp')]) +
                 tparams[_p(prefix, 'b')])

    if options.get('en_aux_inp', 0) == 0:
        aux_input = []

    initial_states = [
        tensor.alloc(numpy_floatX(0.), batch, depth * width),
        tensor.alloc(numpy_floatX(0.), batch, depth * width),
    ]
    rval, updates = theano.scan(_step,
                                sequences=[mask, projected],
                                outputs_info=initial_states,
                                non_sequences=[aux_input],
                                name=_p(prefix, '_layers'),
                                n_steps=n_steps)
    return rval, updates
def _FindB_best(lPLcl, lPprev, dVLcl):
    """Select the ``beam_size`` highest-scoring extensions of one beam.

    Args:
        lPLcl: Log-probabilities of the candidate extensions of this beam.
        lPprev: Score accumulated by this beam so far.
        dVLcl: Flag for this beam; zero selects the "extend" branch
            (presumably it marks a finished beam when non-zero -- confirm
            against the caller).

    Returns:
        ``(lProbBest, xWIdxBest)``: the scores and indices of the kept
        candidates.
    """
    top_idx = tensor.argsort(-lPLcl)[:beam_size]
    top_scores = lPLcl[top_idx]

    # Fallback for the non-zero flag: slot 0 keeps the previous score and
    # every other slot gets a large negative value.
    frozen = tensor.fill(top_scores, numpy_floatX(-10000.))
    frozen = tensor.set_subtensor(frozen[0], lPprev)

    flag_is_zero = tensor.eq(dVLcl, tensor.zeros_like(dVLcl))
    lProbBest = ifelse(flag_is_zero, top_scores + lPprev, frozen)
    xWIdxBest = ifelse(flag_is_zero, top_idx, tensor.zeros_like(top_idx))
    return lProbBest, xWIdxBest
def rmsprop(self, lr, tparams, grads, inp_list, cost, params):
    """Build the RMSprop gradient and update functions.

    ``f_grad_shared`` evaluates ``cost`` and, through its updates, stores
    the current gradients and refreshes the running average of squared
    gradients. ``f_update`` takes the learning rate and applies the scaled
    step to every parameter.

    Args:
        lr: Symbolic learning rate (input of ``f_update``).
        tparams: Mapping of parameter name to Theano shared variable.
        grads: Symbolic gradients in the same order as ``tparams``.
        inp_list: Symbolic inputs of ``f_grad_shared``.
        cost: Symbolic cost, output of ``f_grad_shared``.
        params: Dict with 'grad_clip', 'decay_rate' and 'smooth_eps'.

    Returns:
        ``(f_grad_shared, f_update, zipped_grads, running_grads2, updir)``.
    """
    clip = params['grad_clip']
    decay_rate = params['decay_rate']
    smooth_eps = params['smooth_eps']

    # Per-parameter accumulators: last gradient and mean squared gradient.
    zipped_grads = []
    running_grads2 = []
    for pname, pvar in tparams.items():
        zipped_grads.append(theano.shared(
            pvar.get_value() * numpy_floatX(0.), name='%s_grad' % pname))
    for pname, pvar in tparams.items():
        running_grads2.append(theano.shared(
            pvar.get_value() * numpy_floatX(0.), name='%s_rgrad2' % pname))

    zgup = []
    rg2up = []
    for zg, rg2, g in zip(zipped_grads, running_grads2, grads):
        zgup.append((zg, g))
        # Only the squared-gradient statistic sees the clipped gradient.
        if clip > 0:
            g_sq = tensor.clip(g, -clip, clip) ** 2
        else:
            g_sq = g ** 2
        rg2up.append((rg2, decay_rate * rg2 + (1 - decay_rate) * g_sq))

    f_grad_shared = theano.function(inp_list,
                                    cost,
                                    updates=zgup + rg2up,
                                    name='rmsprop_f_grad_shared')

    updir = []
    for pname, pvar in tparams.items():
        updir.append(theano.shared(
            pvar.get_value() * numpy_floatX(0.), name='%s_updir' % pname))

    updir_new = []
    for ud, zg, rg2 in zip(updir, zipped_grads, running_grads2):
        updir_new.append((ud, -lr * zg / (tensor.sqrt(rg2) + smooth_eps)))

    param_up = []
    for pvar, (_, step) in zip(tparams.values(), updir_new):
        param_up.append((pvar, pvar + step))

    f_update = theano.function([lr], [],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update, zipped_grads, running_grads2, updir
def _FindB_best(lPLcl, lPprev, dVLcl):
    """Return the scores and indices of the best extensions of one beam.

    The ``beam_size`` largest entries of ``lPLcl`` are kept. When
    ``dVLcl`` is zero, their scores are offset by ``lPprev``. Otherwise
    index 0 is returned for every slot, slot 0 carries ``lPprev`` and the
    remaining slots are set to -10000.
    """
    best = tensor.argsort(-lPLcl)
    best = best[:beam_size]
    best_scores = lPLcl[best]

    def _flag_is_zero():
        return tensor.eq(dVLcl, tensor.zeros_like(dVLcl))

    placeholder = tensor.set_subtensor(
        tensor.fill(best_scores, numpy_floatX(-10000.))[0], lPprev)

    lProbBest = ifelse(_flag_is_zero(), best_scores + lPprev, placeholder)
    xWIdxBest = ifelse(_flag_is_zero(), best, tensor.zeros_like(best))
    return lProbBest, xWIdxBest
def _stepP(x_, h_, c_, lP_, dV_, xAux):
    """Advance the stacked LSTM by one step and greedily pick the next word.

    Args:
        x_: Input for this step (projected here with ``W_inp`` and ``b``).
        h_, c_: Previous hidden / cell states, one chunk per layer as read
            by ``sliceT``.
        lP_: Log-probability accumulated so far.
        dV_: Previous done flags; not read by this step.
        xAux: Auxiliary input, used only when ``options['en_aux_inp']``.

    Returns:
        ``([xW, h, c, xWlogProb, doneVec, xWIdx, p], until)`` where
        ``until`` stops the enclosing scan once every entry of ``doneVec``
        is set (the chosen word index is 0).
    """
    sigmoid = tensor.nnet.sigmoid

    preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
    preact = preact + (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])
    if options.get('en_aux_inp', 0):
        preact = preact + tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

    hidden, cells = [], []
    for di in range(h_depth):
        in_gate = sigmoid(sliceT(preact, 0, h_sz))
        forget_gate = sigmoid(sliceT(preact, 1, h_sz))
        out_gate = sigmoid(sliceT(preact, 2, h_sz))
        candidate = tensor.tanh(sliceT(preact, 3, h_sz))

        cell = forget_gate * sliceT(c_, di, h_sz) + in_gate * candidate
        hid = out_gate * tensor.tanh(cell)
        cells.append(cell)
        hidden.append(hid)

        if di < (h_depth - 1):
            preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                tensor.dot(hid, tparams[_p(prefix, ('W_inp_' + str(di+1)))])

    c = tensor.concatenate(cells, axis=1)
    h = tensor.concatenate(hidden, axis=1)

    # Word distribution from the top layer, sharpened or flattened by the
    # optional smoothing factor before the softmax.
    logits = tensor.dot(hidden[-1], tparams['Wd']) + tparams['bd']
    smooth_factor = tensor.as_tensor_variable(
        numpy_floatX(options.get('softmax_smooth_factor', 1.0)), name='sm_f')
    p = tensor.nnet.softmax(logits * smooth_factor)

    lProb = tensor.log(p + 1e-20).flatten()
    xWIdx = tensor.argmax(lProb, keepdims=True)
    xWlogProb = lProb[xWIdx] + lP_

    if options.get('softmax_propogate', 0) == 0:
        # Feed back the embedding of the chosen word.
        xW = tparams['Wemb'][xWIdx.flatten()]
    else:
        # Feed back the probability-weighted mix of all embeddings.
        xW = p.dot(tparams['Wemb'])

    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
    outputs = [xW, h, c, xWlogProb, doneVec, xWIdx, p]
    return outputs, theano.scan_module.until(doneVec.all())
def rmsprop(self, lr, tparams, grads, inp_list, cost, params, prior_updates=None, w_clip=None):
    """Compile the RMSprop gradient and update functions.

    ``f_grad_shared`` evaluates ``cost``, stores the raw gradients and
    refreshes the running average of squared gradients (plus any
    ``prior_updates``). ``f_update`` takes the learning rate and applies
    the scaled step, optionally clipping the resulting weights.

    Args:
        lr: Symbolic learning rate; the single input of ``f_update``.
        tparams: Mapping of parameter name to Theano shared variable.
        grads: Symbolic gradients, paired positionally with ``tparams``.
        inp_list: Symbolic inputs of ``f_grad_shared``.
        cost: Symbolic cost returned by ``f_grad_shared``.
        params: Dict read for 'grad_clip', 'decay_rate' and 'smooth_eps'.
        prior_updates: Optional extra updates executed by
            ``f_grad_shared``; either a sequence of ``(shared, expr)``
            pairs or a dict-like object such as the updates returned by
            ``theano.scan``. ``None`` (default) means no extra updates.
        w_clip: When not ``None``, every updated parameter is clipped to
            ``[-w_clip, w_clip]``.

    Returns:
        ``(f_grad_shared, f_update, zipped_grads, running_grads2, updir)``.
    """
    clip = params['grad_clip']
    decay_rate = tensor.constant(params['decay_rate'], dtype=theano.config.floatX)
    smooth_eps = tensor.constant(params['smooth_eps'], dtype=theano.config.floatX)

    # Normalise the optional updates to a fresh list of pairs. A None
    # default avoids sharing one mutable list between calls, and dict-like
    # updates are accepted in addition to plain lists.
    if prior_updates is None:
        extra_updates = []
    elif hasattr(prior_updates, 'items'):
        extra_updates = list(prior_updates.items())
    else:
        extra_updates = list(prior_updates)

    zipped_grads = [theano.shared(np.zeros_like(p.get_value()), name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_grads2 = [theano.shared(np.zeros_like(p.get_value()), name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]

    # Only the squared-gradient statistic sees the clipped gradient. The
    # outer clip keeps the running average non-negative so that the sqrt
    # in the update stays well defined.
    if clip > 0.0:
        rg2up = [(rg2, tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (tensor.clip(g, -clip, clip) ** 2), 0.0, np.inf))
                 for rg2, g in zip(running_grads2, grads)]
    else:
        rg2up = [(rg2, tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (g ** 2), 0.0, np.inf))
                 for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp_list, cost,
                                    updates=zgup + rg2up + extra_updates,
                                    name='rmsprop_f_grad_shared')

    # Zero-initialise with zeros_like, as for the other accumulators;
    # multiplying the parameter by 0 would turn inf/NaN weights into NaN.
    updir = [theano.shared(np.zeros_like(p.get_value()), name='%s_updir' % k)
             for k, p in tparams.items()]
    updir_new = [(ud, -lr * zg / (tensor.sqrt(rg2) + smooth_eps))
                 for ud, zg, rg2 in zip(updir, zipped_grads, running_grads2)]

    if w_clip is not None:
        print('clipping weights with %.2f in RMS-PROP' % (w_clip))
        param_up = [(p, tensor.clip(p + udn[1], -w_clip, w_clip))
                    for p, udn in zip(tparams.values(), updir_new)]
    else:
        param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]

    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update, zipped_grads, running_grads2, updir
def _stepP(x_, h_, c_, lP_, dV_, xAux):
    """Run one greedy decoding step of the stacked LSTM.

    The step projects ``x_``, updates every layer, turns the top layer
    into a softmax over words and selects the arg-max word. ``dV_`` is
    accepted for scan compatibility but is not read.

    Returns:
        ``([xW, h, c, xWlogProb, doneVec, xWIdx, p], until)``; ``until``
        terminates the enclosing scan when all of ``doneVec`` is true.
    """
    def _w(name):
        # Look up a parameter that lives under this layer's prefix.
        return tparams[_p(prefix, name)]

    def _gates(pre):
        # Split a pre-activation into input / forget / output gates and the
        # candidate cell value (chunks 0..3 as read by sliceT).
        return (tensor.nnet.sigmoid(sliceT(pre, 0, h_sz)),
                tensor.nnet.sigmoid(sliceT(pre, 1, h_sz)),
                tensor.nnet.sigmoid(sliceT(pre, 2, h_sz)),
                tensor.tanh(sliceT(pre, 3, h_sz)))

    preact = tensor.dot(sliceT(h_, 0, h_sz), _w('W_hid'))
    preact = preact + (tensor.dot(x_, _w('W_inp')) + _w('b'))
    if options.get('en_aux_inp', 0):
        preact = preact + tensor.dot(xAux, _w('W_aux'))

    hL = [None] * h_depth
    cL = [None] * h_depth
    for layer in range(h_depth):
        i_gate, f_gate, o_gate, c_tilde = _gates(preact)
        cL[layer] = f_gate * sliceT(c_, layer, h_sz) + i_gate * c_tilde
        hL[layer] = o_gate * tensor.tanh(cL[layer])
        if layer < (h_depth - 1):
            nxt = str(layer+1)
            preact = tensor.dot(sliceT(h_, layer+1, h_sz), _w('W_hid_' + nxt)) + \
                tensor.dot(hL[layer], _w('W_inp_' + nxt))

    c = tensor.concatenate(cL, axis=1)
    h = tensor.concatenate(hL, axis=1)

    smooth_factor = tensor.as_tensor_variable(
        numpy_floatX(options.get('softmax_smooth_factor', 1.0)), name='sm_f')
    p = tensor.nnet.softmax(
        (tensor.dot(hL[-1], tparams['Wd']) + tparams['bd']) * smooth_factor)

    lProb = tensor.log(p + 1e-20).flatten()
    xWIdx = tensor.argmax(lProb, keepdims=True)
    xWlogProb = lProb[xWIdx] + lP_

    # Next input: embedding of the chosen word, or the expected embedding
    # under p when 'softmax_propogate' is enabled.
    if options.get('softmax_propogate', 0) == 0:
        xW = tparams['Wemb'][xWIdx.flatten()]
    else:
        xW = p.dot(tparams['Wemb'])

    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
    stop = theano.scan_module.until(doneVec.all())
    return [xW, h, c, xWlogProb, doneVec, xWIdx, p], stop
def _stepP(x_, h_, c_, lP_, dV_, xAux):
    """One decoding step of the stacked LSTM with optional beam search.

    Args:
        x_: Input for this step, one row per beam.
        h_, c_: Previous hidden / cell states, one chunk per layer as read
            by ``sliceT``.
        lP_: Log-probability accumulated by every beam.
        dV_: Per-beam flags passed to ``_FindB_best``; a zero flag lets
            the beam be extended.
        xAux: Auxiliary input, used only when ``options['en_aux_inp']``.

    Returns:
        ``([xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], until)`` where
        ``xCandIdx`` holds, for every kept candidate, the index of the
        beam it extends and ``until`` stops the enclosing scan once every
        entry of ``doneVec`` is set.
    """
    preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
    preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
               tparams[_p(prefix, 'b')])
    if options.get('en_aux_inp', 0):
        preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

    hL = [[]] * h_depth
    cL = [[]] * h_depth
    for di in range(h_depth):
        i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
        f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
        o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
        cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
        cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
        hL[di] = o * tensor.tanh(cL[di])
        if di < (h_depth - 1):
            preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                tensor.dot(hL[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

    c = tensor.concatenate(cL, axis=1)
    h = tensor.concatenate(hL, axis=1)

    if options.get('class_out_factoring', 0) == 1:
        # Factored output: pick the most likely class first, then compute
        # the word distribution within that class.
        pC = tensor.dot(hL[-1], tparams['WdCls']) + tparams['bdCls']
        pCSft = tensor.nnet.softmax(pC)
        xCIdx = tensor.argmax(pCSft)
        # FIX: use the top layer's hidden state (hL[-1]), as pC does. The
        # previous h[-1] indexed the last *row* of the concatenated state
        # matrix, which has the wrong width as soon as h_depth > 1.
        pW = tensor.dot(hL[-1], tparams['Wd'][:, xCIdx, :]) + tparams['bd'][:, xCIdx, :]
        smooth_factor = tensor.as_tensor_variable(
            numpy_floatX(options.get('softmax_smooth_factor', 1.0)), name='sm_f')
        pWSft = tensor.nnet.softmax(pW * smooth_factor)
        lProb = tensor.log(pWSft + 1e-20) + tensor.log(pCSft[0, xCIdx] + 1e-20)
    else:
        p = tensor.dot(hL[-1], tparams['Wd']) + tparams['bd']
        smooth_factor = tensor.as_tensor_variable(
            numpy_floatX(options.get('softmax_smooth_factor', 1.0)), name='sm_f')
        p = tensor.nnet.softmax(p * smooth_factor)
        lProb = tensor.log(p + 1e-20)

    if beam_size > 1:
        def _FindB_best(lPLcl, lPprev, dVLcl):
            # Best beam_size extensions of a single beam. A beam whose flag
            # is non-zero keeps only slot 0 with its old score.
            srtLcl = tensor.argsort(-lPLcl)
            srtLcl = srtLcl[:beam_size]
            deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
            deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
            lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                               lPLcl[srtLcl] + lPprev, deltaVec)
            xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                               srtLcl, tensor.zeros_like(srtLcl))
            return lProbBest, xWIdxBest

        rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                          sequences=[lProb, lP_, dV_],
                                          name=_p(prefix, 'FindBest'),
                                          n_steps=x_.shape[0])
        xWIdxBest = rvalLcl[1]
        lProbBest = rvalLcl[0]

        xWIdxBest = xWIdxBest.flatten()
        lProb = lProbBest.flatten()

        # Now sort and find the best among these best extensions for the
        # current beams.
        srtIdx = tensor.argsort(-lProb)
        srtIdx = srtIdx[:beam_size]
        # Each beam contributed beam_size candidates, so floor division
        # recovers the index of the beam a candidate came from.
        xCandIdx = srtIdx // beam_size

        h = h.take(xCandIdx.flatten(), axis=0)
        c = c.take(xCandIdx.flatten(), axis=0)

        xWlogProb = lProb[srtIdx]
        xWIdx = xWIdxBest[srtIdx]
    else:
        xCandIdx = tensor.as_tensor_variable([0])
        lProb = lProb.flatten()
        xWIdx = tensor.argmax(lProb, keepdims=True)
        xWlogProb = lProb[xWIdx] + lP_
        if options.get('class_out_factoring', 0) == 1:
            # Offset the within-class index by the value stored for the
            # chosen class in column 0 of options['ixtoclsinfo'].
            clsoffset = tensor.as_tensor_variable(options['ixtoclsinfo'][:, 0])
            xWIdx += clsoffset[xCIdx]
        h = h.take(xCandIdx.flatten(), axis=0)
        c = c.take(xCandIdx.flatten(), axis=0)

    if options.get('softmax_propogate', 0) == 0:
        xW = tparams['Wemb'][xWIdx.flatten()]
    else:
        # NOTE(review): p is only defined when class_out_factoring is
        # off; combining both options fails here -- confirm intended use.
        xW = p.dot(tparams['Wemb'])
    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

    return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], theano.scan_module.until(doneVec.all())
def _stepP(*in_list):
    """One beam-search step for an ensemble of ``nmodels`` single-layer LSTMs.

    ``in_list`` is laid out as ``nmodels`` step inputs, then ``nmodels``
    hidden states, then ``nmodels`` cell states, followed by the
    accumulated beam log-probabilities and the per-beam flags. The word
    distribution is the ``comb_weight``-weighted sum of the individual
    softmax outputs.

    Returns:
        ``(out_list, until)`` where ``out_list`` holds the next inputs,
        hidden states and cell states of every model followed by
        ``[xWlogProb, doneVec, xWIdx, xCandIdx]``.
    """
    x_inp = in_list[:nmodels]
    h_inp = in_list[nmodels:2 * nmodels]
    c_inp = in_list[2 * nmodels:3 * nmodels]
    lP_ = in_list[3 * nmodels]
    dV_ = in_list[3 * nmodels + 1]

    # Placeholder; replaced by the first model's contribution below.
    p_comb = tensor.alloc(numpy_floatX(0.), options[0]['output_size'])

    sigmoid = tensor.nnet.sigmoid
    cells = []
    hiddens = []
    for i in range(nmodels):
        mdl = tparams[i]
        preact = tensor.dot(h_inp[i], mdl[_p(prefix, 'W_hid')])
        preact = preact + (tensor.dot(x_inp[i], mdl[_p(prefix, 'W_inp')]) +
                           mdl[_p(prefix, 'b')])
        if options[i].get('en_aux_inp', 0):
            preact = preact + tensor.dot(aux_input2[i],
                                         mdl[_p(prefix, 'W_aux')])

        in_gate = sigmoid(sliceT(preact, 0, options[i]['hidden_size']))
        forget_gate = sigmoid(sliceT(preact, 1, options[i]['hidden_size']))
        out_gate = sigmoid(sliceT(preact, 2, options[i]['hidden_size']))
        candidate = tensor.tanh(sliceT(preact, 3, options[i]['hidden_size']))

        cells.append(forget_gate * c_inp[i] + in_gate * candidate)
        hiddens.append(out_gate * tensor.tanh(cells[i]))

        logits = tensor.dot(hiddens[i], mdl['Wd']) + mdl['bd']
        weighted = mdl['comb_weight'] * tensor.nnet.softmax(logits)
        p_comb = weighted if i == 0 else p_comb + weighted

    lProb = tensor.log(p_comb + 1e-20)

    def _FindB_best(lPLcl, lPprev, dVLcl):
        # Best beam_size extensions of one beam; when the flag is non-zero
        # only slot 0 survives, carrying the beam's previous score.
        top = tensor.argsort(-lPLcl)[:beam_size]
        frozen = tensor.fill(lPLcl[top], numpy_floatX(-10000.))
        frozen = tensor.set_subtensor(frozen[0], lPprev)
        flag_is_zero = tensor.eq(dVLcl, tensor.zeros_like(dVLcl))
        lProbBest = ifelse(flag_is_zero, lPLcl[top] + lPprev, frozen)
        xWIdxBest = ifelse(flag_is_zero, top, tensor.zeros_like(top))
        return lProbBest, xWIdxBest

    per_beam, _ = theano.scan(_FindB_best,
                              sequences=[lProb, lP_, dV_],
                              name=_p(prefix, 'FindBest'),
                              n_steps=x_inp[0].shape[0])
    cand_scores = per_beam[0].flatten()
    cand_words = per_beam[1].flatten()

    # Keep the overall best beam_size candidates across all beams.
    keep = tensor.argsort(-cand_scores)[:beam_size]
    xWlogProb = cand_scores[keep]
    xWIdx = cand_words[keep]
    # Every beam contributed beam_size candidates, so floor division
    # recovers the beam each kept candidate extends.
    xCandIdx = keep // beam_size

    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

    x_out = [tparams[i]['Wemb'][xWIdx.flatten()] for i in range(nmodels)]
    h_out = [hiddens[i].take(xCandIdx.flatten(), axis=0)
             for i in range(nmodels)]
    c_out = [cells[i].take(xCandIdx.flatten(), axis=0)
             for i in range(nmodels)]

    out_list = x_out + h_out + c_out + [xWlogProb, doneVec, xWIdx, xCandIdx]
    return out_list, theano.scan_module.until(doneVec.all())
def build_eval_other_sent(self, tparams, options, model_npy):
    """Build the Theano functions that score given sentences.

    The image embedding is prepended to the word embeddings, the sequence
    is run through ``basic_lstm_layer`` and the masked negative
    log-likelihood of the words is computed.

    Args:
        tparams: Mapping of parameter name to Theano shared variable.
        options: Model options; read for the encoding sizes,
            'output_size', 'batch_size', 'generator', 'hidden_size' and
            the optional 'hidden_depth', 'swap_aux' and 'en_aux_inp'.
        model_npy: Passed with ``self.model_th`` to ``zipp`` (presumably
            loads the numpy weights into the shared variables -- confirm).

    Returns:
        ``(use_noise, inp_list, f_pred_prob_other, cost, pW,
        updatesLSTM)``. The compiled functions are also stored on ``self``
        as ``f_pred_prob_other`` and ``f_eval_other``.
    """
    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)

    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]
    n_out_samps = (n_timesteps - 1) * n_samples

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
        xAuxEmb = xAux

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    rval, updatesLSTM = basic_lstm_layer(tparams,
                                         emb[:n_timesteps, :, :],
                                         xAuxEmb,
                                         use_noise,
                                         options,
                                         prefix=options['generator'])

    # The output of the first step (the image step) is dropped, and the
    # targets and mask are shifted by one position to match.
    p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
               options['hidden_size'])

    pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
        [n_out_samps, options['output_size']])
    pWSft = tensor.nnet.softmax(pW)

    totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]

    # Masked negative natural-log likelihood, divided by the configured
    # batch size.
    tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).sum()
    cost = tot_cost / options['batch_size']

    inp_list = [xW, mask, xI]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    # FIX: p does not depend on `mask`, so compiling with the shared input
    # list fails under Theano's default on_unused_input='raise'. Unused
    # inputs are now ignored so both functions keep the same signature.
    self.f_pred_prob_other = theano.function(inp_list,
                                             p,
                                             name='f_pred_prob',
                                             updates=updatesLSTM,
                                             on_unused_input='ignore')

    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
def lstm_advers_gen_layer(self, tparams, Xi, aux_input, options, beam_size, prefix='lstm'):
    """Greedily decode a word sequence and return the per-step softmax.

    The image feature ``Xi`` is fed first, then the embedding at row 0 of
    ``Wemb``, and the remaining steps are unrolled with ``theano.scan``.

    Args:
        tparams: Mapping of parameter name to Theano shared variable.
        Xi: Input for the first (image) step.
        aux_input: Auxiliary input; replaced by an empty list when
            ``options['en_aux_inp']`` is 0.
        options: Dict read for 'maxlen' (default 15), 'hidden_depth'
            (default 1), 'hidden_size', 'en_aux_inp',
            'softmax_smooth_factor' and 'softmax_propogate'.
        beam_size: Length of the log-probability and flag vectors.
        prefix: Parameter-name prefix passed to ``_p``.

    Returns:
        ``(final_log_prob, word_indices, word_probs, updates)``; the
        second and third entries stack the first decoded step on top of
        the scan outputs.
    """
    max_steps = options.get('maxlen', 15)
    n_samples = 1
    h_depth = options.get('hidden_depth', 1)
    h_sz = options['hidden_size']

    sigmoid = tensor.nnet.sigmoid

    # ---------------------- STEP FUNCTION ---------------------- #
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact = preact + (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                           tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp', 0):
            preact = preact + tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

        hidden, cells = [], []
        for di in range(h_depth):
            in_gate = sigmoid(sliceT(preact, 0, h_sz))
            forget_gate = sigmoid(sliceT(preact, 1, h_sz))
            out_gate = sigmoid(sliceT(preact, 2, h_sz))
            candidate = tensor.tanh(sliceT(preact, 3, h_sz))

            cell = forget_gate * sliceT(c_, di, h_sz) + in_gate * candidate
            hid = out_gate * tensor.tanh(cell)
            cells.append(cell)
            hidden.append(hid)

            if di < (h_depth - 1):
                preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                    tensor.dot(hid, tparams[_p(prefix, ('W_inp_' + str(di+1)))])

        c_new = tensor.concatenate(cells, axis=1)
        h_new = tensor.concatenate(hidden, axis=1)

        logits = tensor.dot(hidden[-1], tparams['Wd']) + tparams['bd']
        smooth_factor = tensor.as_tensor_variable(
            numpy_floatX(options.get('softmax_smooth_factor', 1.0)),
            name='sm_f')
        probs = tensor.nnet.softmax(logits * smooth_factor)

        log_probs = tensor.log(probs + 1e-20).flatten()
        word_idx = tensor.argmax(log_probs, keepdims=True)
        word_lp = log_probs[word_idx] + lP_

        # Next input: the chosen word's embedding, or the expected
        # embedding under the softmax when 'softmax_propogate' is set.
        if options.get('softmax_propogate', 0) == 0:
            next_x = tparams['Wemb'][word_idx.flatten()]
        else:
            next_x = probs.dot(tparams['Wemb'])

        done = tensor.eq(word_idx, tensor.zeros_like(word_idx))
        return ([next_x, h_new, c_new, word_lp, done, word_idx, probs],
                theano.scan_module.until(done.all()))
    # ------------------- END of STEP FUNCTION -------------------- #

    if options.get('en_aux_inp', 0) == 0:
        aux_input = []

    h0 = tensor.alloc(numpy_floatX(0.), n_samples, h_sz * h_depth)
    c0 = tensor.alloc(numpy_floatX(0.), n_samples, h_sz * h_depth)
    lP0 = tensor.alloc(numpy_floatX(0.), beam_size)
    dV0 = tensor.alloc(np.int8(0.), beam_size)

    # Propagate the image feature vector; only the states are kept.
    [_, h1, c1, _, _, _, _], _ = _stepP(Xi, h0, c0, lP0, dV0, aux_input)

    # First word step, driven by the embedding at index 0.
    xWStart = tparams['Wemb'][0, :]
    [xW, h, c, lP, dV, idx0, p0], _ = _stepP(xWStart, h1, c1, lP0, dV0,
                                             aux_input)

    # Remaining steps are unrolled by scan.
    rval, updates = theano.scan(_stepP,
                                outputs_info=[xW, h, c, lP, dV, None, None],
                                non_sequences=[aux_input],
                                name=_p(prefix, 'predict_layers'),
                                n_steps=max_steps - 1)

    word_indices = tensor.concatenate(
        [idx0.reshape([1, beam_size]), rval[5]], axis=0)
    word_probs = tensor.concatenate(
        [tensor.shape_padleft(p0, n_ones=1), rval[6]], axis=0)
    return rval[3][-1], word_indices, word_probs, updates
def lstm_predict_layer(self, tparams, Xi, aux_input, options, beam_size, prefix='lstm'):
    """Decode a word sequence with greedy or beam search.

    The image feature ``Xi`` is fed first, then the embedding at row 0 of
    ``Wemb``, and the remaining steps are unrolled with ``theano.scan``.

    Args:
        tparams: Mapping of parameter name to Theano shared variable.
        Xi: Input for the first (image) step.
        aux_input: Auxiliary input; replaced by an empty list when
            ``options['en_aux_inp']`` is 0 and repeated ``beam_size``
            times along axis 0 for the scan when it is 1.
        options: Dict read for 'maxlen' (default 30, also used when the
            stored value is None), 'hidden_depth', 'hidden_size',
            'en_aux_inp', 'en_residual_conn' (default 1),
            'class_out_factoring', 'softmax_smooth_factor',
            'softmax_propogate' and 'ixtoclsinfo'.
        beam_size: Number of beams; values above 1 enable beam search.
        prefix: Parameter-name prefix passed to ``_p``.

    Returns:
        ``(final_log_probs, word_indices, beam_back_pointers,
        input_embeddings, updates)``; entries two to four stack the first
        decoded step on top of the corresponding scan outputs.
    """
    max_steps = options.get('maxlen', 30)
    if max_steps is None:
        max_steps = 30
    n_samples = 1
    h_depth = options.get('hidden_depth', 1)
    h_sz = options['hidden_size']

    sigmoid = tensor.nnet.sigmoid

    # ---------------------- STEP FUNCTION ---------------------- #
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact = preact + (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                           tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp', 0):
            preact = preact + tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

        hidden, cells, outs = [], [], []
        for di in range(h_depth):
            in_gate = sigmoid(sliceT(preact, 0, h_sz))
            forget_gate = sigmoid(sliceT(preact, 1, h_sz))
            out_gate = sigmoid(sliceT(preact, 2, h_sz))
            candidate = tensor.tanh(sliceT(preact, 3, h_sz))

            cell = forget_gate * sliceT(c_, di, h_sz) + in_gate * candidate
            hid = out_gate * tensor.tanh(cell)

            layer_out = hid
            if options.get('en_residual_conn', 1) and di > 0:
                # Residual connection: add the output of the layer below.
                layer_out = layer_out + outs[di - 1]
                print("Connecting residual at %d" % (di))

            cells.append(cell)
            hidden.append(hid)
            outs.append(layer_out)

            if di < (h_depth - 1):
                preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                    tensor.dot(layer_out, tparams[_p(prefix, ('W_inp_' + str(di+1)))])

        c_cat = tensor.concatenate(cells, axis=1)
        h_cat = tensor.concatenate(hidden, axis=1)

        factored = options.get('class_out_factoring', 0) == 1
        if factored:
            # Pick the most likely class, then score words inside it.
            cls_logits = tensor.dot(outs[-1], tparams['WdCls']) + tparams['bdCls']
            cls_probs = tensor.nnet.softmax(cls_logits)
            xCIdx = tensor.argmax(cls_probs)
            word_logits = tensor.dot(
                outs[-1],
                tparams['Wd'][:, xCIdx, :]) + tparams['bd'][:, xCIdx, :]
            smooth_factor = tensor.as_tensor_variable(
                numpy_floatX(options.get('softmax_smooth_factor', 1.0)),
                name='sm_f')
            word_probs = tensor.nnet.softmax(word_logits * smooth_factor)
            lProb = tensor.log(word_probs + 1e-20) + tensor.log(
                cls_probs[0, xCIdx] + 1e-20)
        else:
            logits = tensor.dot(outs[-1], tparams['Wd']) + tparams['bd']
            smooth_factor = tensor.as_tensor_variable(
                numpy_floatX(options.get('softmax_smooth_factor', 1.0)),
                name='sm_f')
            probs = tensor.nnet.softmax(logits * smooth_factor)
            lProb = tensor.log(probs + 1e-20)

        if beam_size > 1:
            def _FindB_best(lPLcl, lPprev, dVLcl):
                # Best beam_size extensions of one beam; when the flag is
                # non-zero only slot 0 survives with the previous score.
                top = tensor.argsort(-lPLcl)[:beam_size]
                frozen = tensor.fill(lPLcl[top], numpy_floatX(-10000.))
                frozen = tensor.set_subtensor(frozen[0], lPprev)
                flag_is_zero = tensor.eq(dVLcl, tensor.zeros_like(dVLcl))
                lProbBest = ifelse(flag_is_zero, lPLcl[top] + lPprev, frozen)
                xWIdxBest = ifelse(flag_is_zero, top, tensor.zeros_like(top))
                return lProbBest, xWIdxBest

            per_beam, _ = theano.scan(_FindB_best,
                                      sequences=[lProb, lP_, dV_],
                                      name=_p(prefix, 'FindBest'),
                                      n_steps=x_.shape[0])
            cand_words = per_beam[1].flatten()
            cand_scores = per_beam[0].flatten()

            # Keep the overall best beam_size candidates; floor division
            # maps each one back to the beam it extends.
            keep = tensor.argsort(-cand_scores)[:beam_size]
            xCandIdx = keep // beam_size

            h_cat = h_cat.take(xCandIdx.flatten(), axis=0)
            c_cat = c_cat.take(xCandIdx.flatten(), axis=0)

            xWlogProb = cand_scores[keep]
            xWIdx = cand_words[keep]
        else:
            xCandIdx = tensor.as_tensor_variable([0])
            flat_lp = lProb.flatten()
            xWIdx = tensor.argmax(flat_lp, keepdims=True)
            xWlogProb = flat_lp[xWIdx] + lP_
            if options.get('class_out_factoring', 0) == 1:
                # Offset the within-class index by the value stored for
                # the chosen class in column 0 of options['ixtoclsinfo'].
                clsoffset = tensor.as_tensor_variable(
                    options['ixtoclsinfo'][:, 0])
                xWIdx = xWIdx + clsoffset[xCIdx]
            h_cat = h_cat.take(xCandIdx.flatten(), axis=0)
            c_cat = c_cat.take(xCandIdx.flatten(), axis=0)

        if options.get('softmax_propogate', 0) == 0:
            next_x = tparams['Wemb'][xWIdx.flatten()]
        else:
            next_x = probs.dot(tparams['Wemb'])
        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

        return ([next_x, h_cat, c_cat, xWlogProb, doneVec, xWIdx, xCandIdx],
                theano.scan_module.until(doneVec.all()))
    # ------------------- END of STEP FUNCTION -------------------- #

    if options.get('en_aux_inp', 0) == 0:
        aux_input = []

    h0 = tensor.alloc(numpy_floatX(0.), beam_size, h_sz * h_depth)
    c0 = tensor.alloc(numpy_floatX(0.), beam_size, h_sz * h_depth)
    lP0 = tensor.alloc(numpy_floatX(0.), beam_size)
    dV0 = tensor.alloc(np.int8(0.), beam_size)

    # Propagate the image feature vector through a single row of state;
    # only the resulting states are kept.
    [_, h1, c1, _, _, _, _], _ = _stepP(Xi, h0[:1, :], c0[:1, :], lP0, dV0,
                                        aux_input)

    # First word step, driven by the embedding at index 0.
    xWStart = tparams['Wemb'][[0]]
    [xW, h, c, lP, dV, idx0, cand0], _ = _stepP(xWStart, h1[:1, :],
                                                c1[:1, :], lP0, dV0,
                                                aux_input)

    if options.get('en_aux_inp', 0) == 1:
        aux_input = tensor.extra_ops.repeat(aux_input, beam_size, axis=0)

    # Remaining steps are unrolled by scan.
    rval, updates = theano.scan(_stepP,
                                outputs_info=[xW, h, c, lP, dV, None, None],
                                non_sequences=[aux_input],
                                name=_p(prefix, 'predict_layers'),
                                n_steps=max_steps)

    word_indices = tensor.concatenate(
        [idx0.reshape([1, beam_size]), rval[5]], axis=0)
    back_pointers = tensor.concatenate(
        [cand0.reshape([1, beam_size]), rval[6]], axis=0)
    inputs_fed = tensor.concatenate(
        [tensor.shape_padleft(xW, n_ones=1), rval[0]], axis=0)
    return rval[3][-1], word_indices, back_pointers, inputs_fed, updates
def main(params):
    """Train the evaluator model described by ``params`` and checkpoint it.

    Steps, in order: build the vocabulary from the training sentences, build
    the Theano model / cost / solver functions, then run the training loop.
    Roughly once a minute a JSON status report is written, and every
    ``params['eval_period']`` epochs the model is evaluated on the 'val'
    split; a checkpoint is pickled when validation perplexity improves.

    Args:
        params: dict of hyper-parameters and paths. It is mutated in place
            ('mode', 'vocabulary_size', 'image_feat_size', 'aux_inp_size',
            'learning_rate' and 'lr_decay_st_epoch' are written).

    NOTE(review): ``checkpoint_init``, ``model_init_from`` and ``rg_init`` are
    read when ``params['checkpoint_file_name']`` is set but are not assigned
    anywhere in this function -- presumably module-level globals populated
    before ``main`` is called; confirm.
    """
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(params)
    # Initialize the optimizer
    solver = Solver(params['solver'])
    params['image_feat_size'] = dp.img_feat_size
    params['aux_inp_size'] = dp.aux_inp_size

    # stores various misc items that need to be passed around the framework
    misc = {}

    # go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)

    if params['fine_tune'] == 1:
        params['mode'] = ('multi_choice_mode'
                          if params['mc_mode'] == 1 else 'multimodal_lstm')
        if params['checkpoint_file_name'] is not None:
            # Re-use the vocabulary the checkpointed model was trained with.
            misc['wordtoix'] = checkpoint_init['wordtoix']
            misc['ixtoword'] = checkpoint_init['ixtoword']
        batch_size = 1
        num_sentences_total = dp.getSplitSize('train', ofwhat='images')
    else:
        params['mode'] = 'batchtrain'
        batch_size = params['batch_size']
        num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')

    params['vocabulary_size'] = len(misc['wordtoix'])
    pos_samp = np.arange(batch_size, dtype=np.int32)

    # This initializes the model parameters and does matrix initializations
    evalModel = decodeEvaluator(params)
    model, misc['update'], misc['regularize'] = (
        evalModel.model_th, evalModel.updateP, evalModel.regularize)

    # ----------------- If we are using feature encoders ---------------------
    if params['use_encoder_for'] & 1:
        imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'],
                                              params['sent_encoding_size'],
                                              params,
                                              mdl_prefix='img_enc_',
                                              features=dp.features.T)
        mdlLen = len(model.keys())
        model.update(imgFeatEncoder.model_th)
        # Internal invariant: encoder parameter names must not collide with
        # (and silently overwrite) the evaluator's parameter names.
        assert (len(model.keys()) ==
                (mdlLen + len(imgFeatEncoder.model_th.keys())))
        misc['regularize'].extend(imgFeatEncoder.regularize)
        (imgenc_use_dropout, imgFeatEnc_inp, xI,
         updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
    else:
        xI = None
        imgFeatEnc_inp = []

    # Define the computational graph for relating the input image features
    # and word indices to the log probability cost function.
    (use_dropout, inp_list_eval, miscOuts, cost, predTh,
     model) = evalModel.build_model(model, params, xI=xI,
                                    prior_inp_list=imgFeatEnc_inp)
    inp_list = imgFeatEnc_inp + inp_list_eval

    # Compile an evaluation function. Doesn't include gradients.
    # To be used for validation set evaluation.
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Add the regularization cost. Since this is specific to training and
    # doesn't get included when we evaluate the cost on test or validation
    # data, we leave it here outside the model definition.
    if params['regc'] > 0.:
        reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        for p in misc['regularize']:
            reg_cost += (model[p]**2).sum()
        # 0.5 * regc * sum_p ||W_p||^2: the scaling is applied once, after
        # the sum over all regularised parameters.
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Now let's build a gradient computation graph and the solver's update
    # mechanism.
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    if params['sim_minibatch'] > 0:
        f_grad_accum, f_clr, ag = solver.accumGrads(model, grads, inp_list,
                                                    cost,
                                                    params['sim_minibatch'])
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, ag, inp_list, cost, params)
    else:
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, grads, inp_list, cost, params)

    print('model init done.')
    print('model has keys: ' + ', '.join(model.keys()))

    # calculate how many iterations we need. One epoch is considered once
    # going through all the sentences and not images.
    # Hence in case of coco/flickr this will be 5 * no of images.
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    inner_loop = params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1
    max_iters = max_iters / inner_loop
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs / inner_loop))
    top_val_ppl2 = -1
    # initially size of dictionary of confusion
    smooth_train_cost = len(misc['ixtoword'])
    smooth_error_rate = 100.
    error_rate = 0.
    prev_it = -1
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []

    # Initialize the model parameters from the checkpoint file if we are
    # resuming training.
    if params['checkpoint_file_name'] is not None:
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (
            checkpoint_init['epoch'], checkpoint_init['perplexity']))
    elif params['init_from_imagernn'] is not None:
        # Initialize word vecs and image emb from generative model file.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point 'init_from_imagernn' at trusted model files.
        with open(params['init_from_imagernn'], 'rb') as init_file:
            rnnCv = pickle.load(init_file)
        model['Wemb'].set_value(rnnCv['model']['Wemb'])
        model['WIemb'].set_value(rnnCv['model']['WIemb_aux'])
        misc['wordtoix'] = rnnCv['wordtoix']
        misc['ixtoword'] = rnnCv['ixtoword']
        print(
            "\n Initialized Word embedding and Image embeddings from gen mode %s"
            % (params['init_from_imagernn']))

    write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']

    use_dropout.set_value(1.)
    #################### Main Loop ############################################
    for it in xrange(max_iters):
        t0 = time.time()
        if params['use_encoder_for'] & 1:
            imgenc_use_dropout.set_value(float(params['use_dropout']))

        # fetch a batch of data
        cost_inner = np.zeros((inner_loop, ), dtype=np.float32)
        if params['sim_minibatch'] > 0:
            # Simulated large minibatch: accumulate gradients over
            # `inner_loop` sampled batches before the single update below.
            for i_l in xrange(inner_loop):
                batch, pos_samp_sent = dp.sampPosNegSentSamps(
                    params['batch_size'], params['mode'], thresh=0.3)
                eval_inp_list, lenS = prepare_data(
                    batch,
                    misc['wordtoix'],
                    maxlen=params['maxlen'],
                    pos_samp=pos_samp,
                    prep_for=params['eval_model'],
                    use_enc_for=params['use_encoder_for'])
                if params['fine_tune'] == 1:
                    eval_inp_list.append(pos_samp_sent)
                cost_inner[i_l] = f_grad_accum(*eval_inp_list)
        else:
            batch, pos_samp_sent = dp.sampPosNegSentSamps(
                params['batch_size'], params['mode'], thresh=0.3)
            enc_inp_list = prepare_seq_features(
                batch,
                use_enc_for=params['use_encoder_for'],
                use_shared_mem=params['use_shared_mem_enc'])
            eval_inp_list, lenS = prepare_data(
                batch,
                misc['wordtoix'],
                maxlen=params['maxlen'],
                pos_samp=pos_samp,
                prep_for=params['eval_model'],
                use_enc_for=params['use_encoder_for'])
            if params['fine_tune'] == 1:
                eval_inp_list.append(pos_samp_sent)
            real_inp_list = enc_inp_list + eval_inp_list

        # evaluate cost + gradient, then perform the parameter update.
        # NOTE(review): `real_inp_list` is only assigned in the
        # non-accumulating branch above, so with sim_minibatch > 0 this line
        # depends on a name that was never set -- confirm intended inputs.
        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # Reset accumulated gradients to 0
        if params['sim_minibatch'] > 0:
            f_clr()

        # print training statistics
        epoch = it * inner_loop * 1.0 / num_iters_one_epoch
        total_cost = (np.e**(-cost[0]) + (np.e**(-cost_inner)).sum() *
                      (params['sim_minibatch'] > 0)) / (
                          1 + params['sim_minibatch'])
        if it == 0:
            smooth_train_cost = total_cost
        else:
            smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost
        error_rate += 100.0 * float((cost[2] < 0.).sum()) / batch_size
        margin_strength = cost[2].sum()
        # Exponentially smoothed error rate, seeded with the first sample.
        curr_error_rate = 100.0 * (float(cost[1]) / batch_size)
        if it > 0:
            smooth_error_rate = (0.99 * smooth_error_rate +
                                 0.01 * curr_error_rate)
        else:
            smooth_error_rate = curr_error_rate

        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:
            # every now and then lets write a report
            print(('%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.4f, Error '
                   'rate is %.3f%%, Margin %.2f, negMarg=%.2f') % (
                       it, max_iters, dt, epoch, smooth_train_cost,
                       smooth_error_rate, margin_strength,
                       error_rate / (it - prev_it)))
            error_rate = 0.
            prev_it = it
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            # Status reporting is best-effort: a failure here must not stop
            # training, so the error is printed and the loop continues.
            try:
                with open(status_file, 'w') as status_fh:
                    json.dump(json_worker_status, status_fh)
            except Exception as e:  # todo be more clever here
                print('tried to write worker status into %s but got error:' %
                      (status_file, ))
                print(e)

        ## perform perplexity evaluation on the validation set and save a
        ## model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0 and
                it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            use_dropout.set_value(0.)
            if params['use_encoder_for'] & 1:
                imgenc_use_dropout.set_value(0.)

            # perform the evaluation on VAL set
            val_ppl2 = eval_split_theano('val', dp, model, params, misc,
                                         f_eval)

            # Decay the learning rate once per epoch after the start epoch.
            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = (params['learning_rate'] *
                                           params['lr_decay'])
                params['lr_decay_st_epoch'] += 1

            print('validation perplexity = %f, lr = %f' %
                  (val_ppl2, params['learning_rate']))
            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if (val_ppl2 < write_checkpoint_ppl_threshold or
                        write_checkpoint_ppl_threshold < 0):
                    # if we beat a previous record or if this is the first
                    # time AND we also beat the user-defined threshold or it
                    # doesnt exist
                    top_val_ppl2 = val_ppl2
                    filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (
                        params['eval_model'], params['dataset'], host,
                        params['fappend'], smooth_error_rate, val_ppl2)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['wordtoix'] = misc['wordtoix']
                    checkpoint['ixtoword'] = misc['ixtoword']
                    # Best-effort as well: report the failure, keep training.
                    try:
                        with open(filepath, "wb") as ckpt_fh:
                            pickle.dump(checkpoint, ckpt_fh)
                        print('saved checkpoint in %s' % (filepath, ))
                    except Exception as e:  # todo be more clever here
                        print(
                            'tried to write checkpoint into %s but got error: '
                            % (filepath, ))
                        print(e)

            # Re-enable dropout for the following training iterations.
            use_dropout.set_value(1.)
def lstm_predict_layer(self, tparams, Xi, aux_input, options, beam_size,
                       prefix='lstm'):
    """Build the symbolic beam-search decoding graph for a single-layer LSTM.

    The image feature ``Xi`` is fed first, then the start-word embedding
    (row 0 of ``tparams['Wemb']``), and then a ``theano.scan`` loop runs for
    at most 30 steps, stopping early once every beam has emitted word
    index 0.

    Args:
        tparams: mapping from parameter name to Theano shared variable.
        Xi: symbolic image embedding fed as the first LSTM input.
        aux_input: symbolic auxiliary input; ignored (replaced by an empty
            list) when ``options['en_aux_inp']`` is 0 or missing.
        options: model options; reads 'hidden_size' and 'en_aux_inp'.
        beam_size: number of hypotheses kept at every step.
        prefix: parameter-name prefix of the LSTM weights.

    Returns:
        A 3-tuple of symbolic tensors: the accumulated log-probabilities of
        the beams at the last executed step, the chosen word indices per
        step (first row comes from the start-word step), and for each step
        the index of the beam each hypothesis was extended from.
    """
    nMaxsteps = 30
    use_aux = options.get('en_aux_inp', 0) != 0

    # ---------------------- STEP FUNCTION ---------------------- #
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        preact = tensor.dot(h_, tparams[_p(prefix, 'W_hid')])
        preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp', 0):
            preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

        # Gates are laid out as [input, forget, output, cell-candidate].
        i = tensor.nnet.sigmoid(sliceT(preact, 0, options['hidden_size']))
        f = tensor.nnet.sigmoid(sliceT(preact, 1, options['hidden_size']))
        o = tensor.nnet.sigmoid(sliceT(preact, 2, options['hidden_size']))
        c = tensor.tanh(sliceT(preact, 3, options['hidden_size']))
        c = f * c_ + i * c
        h = o * tensor.tanh(c)

        p = tensor.dot(h, tparams['Wd']) + tparams['bd']
        p = tensor.nnet.softmax(p)
        lProb = tensor.log(p + 1e-20)

        def _FindB_best(lPLcl, lPprev, dVLcl):
            # Top `beam_size` extensions of one beam.
            srtLcl = tensor.argsort(-lPLcl)
            srtLcl = srtLcl[:beam_size]
            # For a finished beam: keep its score in slot 0 and push the
            # remaining slots far down so they are never selected.
            deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
            deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
            lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                               lPLcl[srtLcl] + lPprev, deltaVec)
            xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                               srtLcl, tensor.zeros_like(srtLcl))
            return lProbBest, xWIdxBest

        rvalLcl, _ = theano.scan(_FindB_best,
                                 sequences=[lProb, lP_, dV_],
                                 name=_p(prefix, 'FindBest'),
                                 n_steps=x_.shape[0])
        xWIdxBest = rvalLcl[1]
        lProbBest = rvalLcl[0]

        xWIdxBest = xWIdxBest.flatten()
        lProb = lProbBest.flatten()

        # Now sort and find the best among these best extensions for the
        # current beams
        srtIdx = tensor.argsort(-lProb)
        srtIdx = srtIdx[:beam_size]
        xWlogProb = lProb[srtIdx]

        xWIdx = xWIdxBest[srtIdx]
        xCandIdx = srtIdx // beam_size  # Floor division

        xW = tparams['Wemb'][xWIdx.flatten()]
        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
        # Carry over the state of the beam each survivor was extended from.
        h = h.take(xCandIdx.flatten(), axis=0)
        c = c.take(xCandIdx.flatten(), axis=0)

        return ([xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx],
                theano.scan_module.until(doneVec.all()))

    # ------------------- END of STEP FUNCTION  -------------------- #

    if not use_aux:
        aux_input = []

    hidden_size = options['hidden_size']

    h = tensor.alloc(numpy_floatX(0.), beam_size, hidden_size)
    c = tensor.alloc(numpy_floatX(0.), beam_size, hidden_size)

    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)

    # Propogate the image feature vector
    [xW, h, c, _, _, _, _], _ = _stepP(Xi, h[:1, :], c[:1, :], lP, dV,
                                       aux_input)

    xWStart = tparams['Wemb'][[0]]
    [xW, h, c, lP, dV, idx0, cand0], _ = _stepP(xWStart, h[:1, :], c[:1, :],
                                                lP, dV, aux_input)

    # Replicate the auxiliary input once per beam. Only done when it is a
    # real tensor: with aux input disabled `aux_input` is a plain empty list,
    # which must not be handed to `repeat`.
    if use_aux:
        aux_input = tensor.extra_ops.repeat(aux_input, beam_size, axis=0)

    # Now lets do the loop.
    rval, updates = theano.scan(_stepP,
                                outputs_info=[xW, h, c, lP, dV, None, None],
                                non_sequences=[aux_input],
                                name=_p(prefix, 'predict_layers'),
                                n_steps=nMaxsteps)

    return (rval[3][-1],
            tensor.concatenate([idx0.reshape([1, beam_size]), rval[5]],
                               axis=0),
            tensor.concatenate([cand0.reshape([1, beam_size]), rval[6]],
                               axis=0))
def lstm_advers_gen_layer(self, tparams, xI, xAux, options, prefix='lstm'):
    """Build the symbolic sampling graph of the (multi-layer) LSTM generator.

    Each input row is replicated ``n_gen_samples`` times, the image embedding
    is fed as the first input, and a ``theano.scan`` loop then generates up
    to ``maxlen`` words, stopping early once every sample has produced word
    index 0.

    Args:
        tparams: mapping from parameter name to Theano shared variable.
        xI: symbolic image embeddings, one row per batch item.
        xAux: symbolic auxiliary input; projected through ``<prefix>_W_aux``.
        options: model options ('maxlen', 'n_gen_samples', 'hidden_depth',
            'hidden_size', 'gen_input_noise', 'en_residual_conn',
            'use_gumbel_mse', 'use_gumbel_hard', 'greedy', 'computelogprob',
            'gen_use_rand_init', 'vocabulary_size' are read).
        prefix: parameter-name prefix of the LSTM weights.

    Returns:
        ``(final_scores, word_idx, word_probs, updates, seq_lengths)``:
        the accumulated scores at the last step, the chosen word indices and
        the per-step word distributions from scan, scan's update dictionary,
        and the (gradient-disconnected) length of each generated sequence.
    """
    nBatchSamps = xI.shape[0]
    nMaxsteps = options.get('maxlen', 15)
    if nMaxsteps is None:
        nMaxsteps = 30
    n_samp = options.get('n_gen_samples', 1)

    h_depth = options.get('hidden_depth', 1)
    h_sz = options['hidden_size']

    # ---------------------- STEP FUNCTION ---------------------- #
    def _stepP(U, xW_, h_, c_, lP_, dV_, xAux, xNoise):
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact += (tensor.dot(xW_, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])
        preact += xAux
        if options.get('gen_input_noise', 0):
            preact += xNoise

        hL = [[]] * h_depth
        cL = [[]] * h_depth
        outp = [[]] * h_depth
        for di in xrange(h_depth):
            i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
            cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
            cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
            hL[di] = o * tensor.tanh(cL[di])
            outp[di] = hL[di]
            if options.get('en_residual_conn', 1):
                if (di > 0):
                    # Residual connection from the layer below.
                    outp[di] += outp[di - 1]
                    print("Connecting residual at %d" % (di))
            if di < (h_depth - 1):
                # Pre-activation of the next layer: its own previous hidden
                # state plus this layer's output.
                preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                        tensor.dot(outp[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

        c = tensor.concatenate(cL, axis=1)
        h = tensor.concatenate(hL, axis=1)

        logits = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd']
        if options.get('use_gumbel_mse', 0) == 0 or options.get('greedy', 0):
            p = tensor.nnet.softmax(logits)
        else:
            p = gumbel_softmax_sample(self.trng,
                                      logits * self.softmax_smooth_factor,
                                      self.gumb_temp, U,
                                      options.get('use_gumbel_hard', False))

        if options.get('computelogprob', 0):
            lProb = tensor.log(
                tensor.nnet.softmax(logits * self.softmax_smooth_factor) +
                1e-20)
        else:
            lProb = logits

        # Finished samples (dV_ set) are forced to index 0 and stop
        # accumulating score.
        xWIdx = ~dV_ * tensor.argmax(p, axis=-1)
        xWlogProb = ~dV_ * lProb[tensor.arange(nBatchSamps * n_samp),
                                 xWIdx] + lP_

        if options.get('use_gumbel_hard', 0) and options.get(
                'use_gumbel_mse', 0) and not options.get('greedy', 0):
            xW = p.dot(tparams['Wemb'])
        else:
            xW = theano.gradient.disconnected_grad(
                tparams['Wemb'][xWIdx.flatten()].reshape(
                    [xWIdx.shape[0], -1]))

        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

        return ([xW, h, c, xWlogProb, doneVec, xWIdx, p],
                theano.scan_module.until(doneVec.all()))

    # ------------------- END of STEP FUNCTION  -------------------- #

    # One row of U is consumed by the image step below and `nMaxsteps` rows
    # by the scan loop, so both variants need nMaxsteps + 1 rows; scan raises
    # if a sequence is shorter than the requested n_steps.
    if options.get('use_gumbel_mse', 0) == 0:
        U = self.trng.uniform((nMaxsteps + 1, 1),
                              low=0.,
                              high=1.,
                              dtype=theano.config.floatX)
    else:
        U = self.trng.uniform((nMaxsteps + 1, nBatchSamps * n_samp,
                               options['vocabulary_size']),
                              low=0.,
                              high=1.,
                              dtype=theano.config.floatX)

    xI = tensor.extra_ops.repeat(xI, n_samp, axis=0)
    xAux = tensor.extra_ops.repeat(
        tensor.dot(xAux, tparams[_p(prefix, 'W_aux')]), n_samp, axis=0)

    if options.get('gen_input_noise', 0):
        xNoise = tensor.dot(
            self.trng.normal([nBatchSamps * n_samp, self.noise_dim]),
            tparams[_p(prefix, 'W_noise')])
    else:
        xNoise = []

    if options.get('gen_use_rand_init',
                   0) and not options.get('gen_input_noise', 0):
        h = tensor.unbroadcast(
            self.trng.uniform([nBatchSamps * n_samp, h_sz * h_depth],
                              low=-0.1,
                              high=0.1), 0, 1)
        c = tensor.unbroadcast(
            self.trng.uniform([nBatchSamps * n_samp, h_sz * h_depth],
                              low=-0.1,
                              high=0.1), 0, 1)
    else:
        h = tensor.zeros([nBatchSamps * n_samp, h_sz * h_depth])
        c = tensor.zeros([nBatchSamps * n_samp, h_sz * h_depth])

    lP = tensor.alloc(numpy_floatX(0.), nBatchSamps * n_samp)
    dV = tensor.alloc(np.bool_(0.), nBatchSamps * n_samp)

    # Propogate the image feature vector
    [_, h, c, _, _, _, _], _ = _stepP(U[0, :], xI, h, c, lP, dV, xAux,
                                      xNoise)

    xWStart = tensor.unbroadcast(
        tensor.tile(tparams['Wemb'][[0]], [nBatchSamps * n_samp, 1]), 0, 1)

    # Now lets do the loop.
    rval, updates = theano.scan(
        _stepP,
        sequences=[U[1:, :]],
        outputs_info=[xWStart, h, c, lP, dV, None, None],
        non_sequences=[xAux, xNoise],
        name=_p(prefix, 'adv_predict_layers'),
        n_steps=nMaxsteps)

    # Length = index of the first "done" step + 1; the appended row of ones
    # makes never-finished samples report the full number of executed steps.
    seq_lengths = theano.gradient.disconnected_grad(
        tensor.argmax(tensor.concatenate(
            [rval[4][:-1, :],
             tensor.ones((1, nBatchSamps * n_samp))],
            axis=0),
                      axis=0) + 1)

    return rval[3][-1], rval[5], rval[6], updates, seq_lengths
def build_eval_other_sent(self, tparams, options, model_npy):
    """Load ``model_npy`` into the model and compile per-sentence evaluators.

    Compiles and stores two Theano functions on ``self``:
    ``f_pred_prob_other`` (word softmax given ``xW, xI, xAux``) and
    ``f_eval_other`` (masked negative log-likelihood summed over time, one
    value per sample column).

    Returns:
        ``(use_noise, inp_list, f_pred_prob_other, cost, pW, updatesLSTM)``.
    """
    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # ---- symbolic inputs -------------------------------------------------
    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    steps = xW.shape[0]
    batch = xW.shape[1]
    flat_count = (steps - 1) * batch

    word_emb = tparams['Wemb'][xW.flatten()].reshape(
        [steps, batch, options['word_encoding_size']])

    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    # Optionally project the auxiliary input before handing it to the LSTM.
    xAuxEmb = xAux
    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']

    img_proj = tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']
    img_emb = img_proj.reshape([1, batch, options['image_encoding_size']])

    # The image embedding is the first time step, followed by the words.
    lstm_inp = tensor.concatenate([img_emb, word_emb], axis=0)

    rval, updatesLSTM = basic_lstm_layer(tparams,
                                         lstm_inp[:steps, :, :],
                                         xAuxEmb,
                                         use_noise,
                                         options,
                                         prefix=options['generator'])

    # Drop the first output (made for the image input).
    p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
               options['hidden_size'])

    row_idx = tensor.arange(flat_count)
    if options.get('class_out_factoring', 0) == 0:
        # Plain softmax over the whole output vocabulary.
        scores = tensor.dot(p, tparams['Wd']) + tparams['bd']
        pW = scores.reshape([flat_count, options['output_size']])
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[row_idx, xW[1:, :].flatten()]
        out_list = [pWSft, totProb, p]
    else:
        # Class-factored output: P(word) = P(word | class) * P(class).
        ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
        xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
        centered = (p.reshape([1, flat_count, options['hidden_size']]) -
                    tparams['WdCls'][:, xC].T)
        pW = ((tparams['Wd'][:, xC, :].T * (centered)).sum(axis=-1).T +
              tparams['bd'][:, xC, :])
        pWSft = tensor.nnet.softmax(pW[0, :, :])

        pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
            [flat_count, options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)

        in_class_idx = ixtoclsinfo_t[xW[1:, :].flatten(), 3]
        totProb = pWSft[row_idx, in_class_idx] * pCSft[row_idx, xC]

    # Masked negative log-likelihood, summed over time for every sample.
    masked_nll = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten())
    tot_cost = masked_nll.reshape([steps - 1, batch])
    cost = tot_cost.sum(axis=0)

    inp_list = [xW, mask, xI]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    self.f_pred_prob_other = theano.function([xW, xI, xAux],
                                             pWSft,
                                             name='f_pred_prob',
                                             updates=updatesLSTM)
    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
def __init__(self, params):
    """Allocate the generator's parameters and record which ones to train.

    Builds an ordered dict of numpy arrays (embeddings, LSTM weights for
    every layer, decoder weights, optional auxiliary / noise projections),
    converts it with ``self.init_tparams`` into ``self.model_th`` and stores
    the names of trainable parameters in ``self.update_list`` and of
    L2-regularised parameters in ``self.regularize``.

    Args:
        params: dict of model options. Sizes default to 128 (encodings,
            hidden) or -1 (vocabulary, output, feature sizes) when missing.
            ``params['ixtoclsinfo']`` and ``params['nClasses']`` are required
            when 'class_out_factoring' is enabled.
    """
    image_encoding_size = params.get('image_encoding_size', 128)
    word_encoding_size = params.get('word_encoding_size', 128)

    hidden_size = params.get('hidden_size', 128)
    hidden_depth = params.get('hidden_depth', 1)
    vocabulary_size = params.get('vocabulary_size', -1)
    output_size = params.get('output_size', -1)
    # size of CNN vectors hardcoded here
    image_feat_size = params.get('image_feat_size', -1)
    aux_inp_size = params.get('aux_inp_size', -1)

    model = OrderedDict()
    # Recurrent weights: take x_t, h_{t-1}, and bias unit
    # and produce the 3 gates and the input to cell signal
    use_feat_enc = params.get('use_encoder_for', 0)

    # The image embedding is only owned by this model when no external
    # feature encoder (bit 0 of 'use_encoder_for') provides it.
    if not (use_feat_enc & 1):
        model['WIemb'] = initwTh(image_feat_size,
                                 word_encoding_size)  # image encoder
        model['b_Img'] = np.zeros(
            (word_encoding_size)).astype(config.floatX)

    model['Wemb'] = initwTh(vocabulary_size,
                            word_encoding_size)  # word encoder
    model['lstm_W_hid'] = initwTh(hidden_size, 4 * hidden_size)
    model['lstm_W_inp'] = initwTh(word_encoding_size, 4 * hidden_size)

    for i in xrange(1, hidden_depth):
        model['lstm_W_hid_' + str(i)] = initwTh(hidden_size, 4 * hidden_size)
        model['lstm_W_inp_' + str(i)] = initwTh(hidden_size, 4 * hidden_size)

    model['lstm_b'] = np.zeros((4 * hidden_size, )).astype(config.floatX)

    # Decoder weights (e.g. mapping to vocabulary)
    if params.get('class_out_factoring', 0) == 0:
        model['Wd'] = initwTh(hidden_size, output_size)  # decoder
        model['bd'] = np.zeros((output_size, )).astype(config.floatX)
    else:
        clsinfo = params['ixtoclsinfo']
        self.clsinfo = clsinfo
        clsSizes = clsinfo[:, 2] - clsinfo[:, 1]
        self.clsSize = np.zeros(params['nClasses'])
        self.clsOffset = np.zeros(params['nClasses'], dtype=np.int32)
        self.clsSize[clsinfo[:, 0]] = clsSizes
        self.clsOffset[clsinfo[:, 0]] = np.int32(clsinfo[:, 1])
        max_cls_size = np.max(clsSizes)
        self.max_cls_size = max_cls_size
        Wd = np.zeros(
            (params['hidden_size'], params['nClasses'], max_cls_size),
            dtype=config.floatX)
        model['bd'] = np.zeros((1, params['nClasses'], max_cls_size),
                               dtype=config.floatX)
        for cix in clsinfo[:, 0]:
            Wd[:, cix, :clsSizes[cix]] = initwTh(params['hidden_size'],
                                                 clsSizes[cix])
            # Padding slots beyond the class size get a large negative bias
            # so that they receive ~zero probability after the softmax.
            model['bd'][0, cix, clsSizes[cix]:] = -100
        model['Wd'] = Wd

    update_list = [
        'lstm_W_hid', 'lstm_W_inp', 'lstm_b', 'Wd', 'bd', 'Wemb'
    ]
    self.regularize = ['lstm_W_hid', 'lstm_W_inp', 'Wd', 'Wemb']

    if not (use_feat_enc & 1):
        update_list.extend(['WIemb', 'b_Img'])
        self.regularize.extend(['WIemb'])

    if params.get('class_out_factoring', 0) == 1:
        model['WdCls'] = initwTh(hidden_size, params['nClasses'])  # decoder
        model['bdCls'] = np.zeros(
            (params['nClasses'], )).astype(config.floatX)
        update_list.extend(['WdCls', 'bdCls'])
        self.regularize.extend(['WdCls'])

    # Register BOTH weight matrices of every deeper layer, mirroring what is
    # done for the first layer ('lstm_W_hid' / 'lstm_W_inp') above.
    for i in xrange(1, hidden_depth):
        for layer_param in ('lstm_W_hid_' + str(i), 'lstm_W_inp_' + str(i)):
            update_list.append(layer_param)
            self.regularize.append(layer_param)

    if params.get('en_aux_inp', 0):
        if params.get('swap_aux', 1) == 1:
            if not (use_feat_enc & 2) or params.get(
                    'encode_gt_sentences', 0):
                model['WIemb_aux'] = initwTh(
                    aux_inp_size, image_encoding_size)  # image encoder
                model['b_Img_aux'] = np.zeros(
                    (image_encoding_size)).astype(config.floatX)
                update_list.append('WIemb_aux')
                self.regularize.append('WIemb_aux')
                update_list.append('b_Img_aux')
            # The LSTM sees the embedded auxiliary input.
            model['lstm_W_aux'] = initwTh(image_encoding_size,
                                          4 * hidden_size, 0.00005)
        else:
            # The LSTM sees the raw auxiliary input.
            model['lstm_W_aux'] = initwTh(aux_inp_size, 4 * hidden_size,
                                          0.001)
        update_list.append('lstm_W_aux')
        self.regularize.append('lstm_W_aux')

    if params.get('gen_input_noise', 0):
        self.noise_dim = params.get('gen_inp_noise_dim', 50)
        # NOTE(review): 'lstm_W_noise' is not added to update_list or
        # regularize -- confirm whether that is intentional.
        model['lstm_W_noise'] = initwTh(self.noise_dim, 4 * hidden_size,
                                        0.001)

    self.model_th = self.init_tparams(model)
    del model
    if params.get('use_gumbel_mse', 0):
        self.usegumbel = theano.shared(1)
        self.gumb_temp = theano.shared(
            numpy_floatX(params.get('gumbel_temp_init', 0.5)))
        self.softmax_smooth_factor = theano.shared(
            numpy_floatX(params.get('softmax_smooth_factor', 1.0)))
    else:
        self.usegumbel = theano.shared(0)
    self.update_list = update_list
def build_model(self, tparams, options, xI=None, xAux=None, attn_nw=None):
    """Build the symbolic training graph of the caption generator.

    Args:
        tparams: mapping from parameter name to Theano shared variable.
        options: model options dict.
        xI: optional pre-computed image embedding. When ``None`` a new
            symbolic input matrix 'xI' is created and embedded through
            ``WIemb`` / ``b_Img``.
        xAux: optional pre-computed auxiliary input. When ``None`` a new
            symbolic input 'xAux' is created (a 3-tensor if ``attn_nw`` is
            given, a matrix otherwise).
        attn_nw: optional attention network, passed on to
            ``basic_lstm_layer``.

    Returns:
        ``(use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM)``
        where ``cost`` is ``[nll / n_samples, log2-perplexity sum]`` and
        ``inp_list`` holds the symbolic inputs created here.
    """
    self.trng = RandomStreams(int(time.time()))

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])

    # Identity checks on purpose: `== None` on tensor-like objects does not
    # reliably mean "argument was not supplied".
    if xI is None:
        xI = tensor.matrix('xI', dtype=config.floatX)
        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img'])
        xI_is_inp = True
    else:
        embImg = xI
        xI_is_inp = False

    if xAux is None:
        if attn_nw is None:
            xAux = tensor.matrix('xAux', dtype=config.floatX)
        else:
            xAux = tensor.tensor3('xAux', dtype=config.floatX)
        if (options.get('swap_aux', 1)) and (attn_nw is None):
            xAuxEmb = tensor.dot(
                xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
        else:
            xAuxEmb = xAux
        xA_is_inp = True
    else:
        xA_is_inp = False
        if options.get('encode_gt_sentences', 0):
            xAuxEmb = tensor.dot(
                xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
        else:
            xAuxEmb = xAux

    embImg = embImg.reshape([1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    # This is implementation of input dropout !!
    if options['use_dropout']:
        emb = dropout_layer(emb,
                            use_noise,
                            self.trng,
                            options['drop_prob_encoder'],
                            shp=emb.shape)
        if (options.get('en_aux_inp', 0)) and (attn_nw is None):
            xAuxEmb = dropout_layer(xAuxEmb,
                                    use_noise,
                                    self.trng,
                                    options['drop_prob_aux'],
                                    shp=xAuxEmb.shape)

    # Implement scheduled sampling!
    if options.get('sched_sampling_mode', None) is not None:
        curr_epoch = tensor.scalar(name='curr_epoch', dtype=config.floatX)
        # Assign the probabilies according to the scheduling mode
        if options['sched_sampling_mode'] == 'linear':
            prob = tensor.maximum(
                options['sslin_min'], options['sched_sampling_const'] -
                options['sslin_slope'] * curr_epoch)
        elif options['sched_sampling_mode'] == 'exp':
            raise ValueError(
                'ERROR: %s --> This solver type is not yet supported' %
                (options['sched_sampling_mode']))
        elif options['sched_sampling_mode'] == 'invsig':
            raise ValueError(
                'ERROR: %s --> This solver type is not yet supported' %
                (options['sched_sampling_mode']))
        else:
            raise ValueError(
                'ERROR: %s --> This scheduling type is unknown' %
                (options['sched_sampling_mode']))

        # Now to build the mask. We don't want to do this coin toss when
        # feeding in image feature and the start symbol
        sched_mask = self.trng.binomial((n_timesteps - 2, n_samples),
                                        p=prob,
                                        n=1,
                                        dtype='int64')
        sched_mask = tensor.concatenate(
            [sched_mask, tensor.alloc(1, 2, n_samples)], axis=0)
    else:
        sched_mask = []

    ##########################################################################
    # This implements core lstm
    rval, updatesLSTM = basic_lstm_layer(tparams,
                                         emb[:n_timesteps, :, :],
                                         xAuxEmb,
                                         use_noise,
                                         options,
                                         prefix=options['generator'],
                                         sched_prob_mask=sched_mask,
                                         attn_nw=attn_nw)
    ##########################################################################

    # NOTE1: we are leaving out the first prediction, which was made for the
    # image and is meaningless.
    if options['use_dropout']:
        # XXX : Size given to dropout is missing one dimension. This keeps
        # the dropped units consistent across time!?.
        # ### Is this a good bug ?
        p = dropout_layer(
            sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size']), use_noise, self.trng,
            options['drop_prob_decoder'],
            (n_samples, options['hidden_size']))
    else:
        p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size'])

    if options.get('class_out_factoring', 0) == 1:
        # Input to the class classifier: optionally taken from a different
        # slice of the LSTM output than the word classifier input `p`.
        if options.get('cls_diff_layer', 0) == 1:
            pC_inp = dropout_layer(
                sliceT(rval[0][1:, :, :],
                       options.get('hidden_depth', 1) - 2,
                       options['hidden_size']), use_noise, self.trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
        else:
            pC_inp = p

    n_out_samps = (n_timesteps - 1) * n_samples
    if options.get('class_out_factoring', 0) == 0:
        pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
            [n_out_samps, options['output_size']])
        if options.get('use_gumbel_mse', 0) == 0:
            pWSft = tensor.nnet.softmax(pW)
        else:
            w_out = ifelse(
                self.usegumbel,
                gumbel_softmax_sample(self.trng,
                                      pW,
                                      self.gumb_temp,
                                      hard=options.get(
                                          'use_gumbel_hard', False)),
                tensor.nnet.softmax(pW))
            # This is not exactly right, but just testing
            pWSft = w_out

        totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]
        out_list = [pWSft, totProb, pW]
    else:
        # Class-factored output: P(word) = P(word | class) * P(class).
        ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
        xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
        if options.get('cls_zmean', 1):
            pW = ((tparams['Wd'][:, xC, :].T *
                   ((p.reshape([1, n_out_samps, options['hidden_size']]) -
                     tparams['WdCls'][:, xC].T))).sum(axis=-1).T +
                  tparams['bd'][:, xC, :])
        else:
            pW = ((tparams['Wd'][:, xC, :].T *
                   (p.reshape([1, n_out_samps, options['hidden_size']]))
                   ).sum(axis=-1).T + tparams['bd'][:, xC, :])
        pWSft = tensor.nnet.softmax(pW[0, :, :])

        pC = (tensor.dot(pC_inp, tparams['WdCls']) +
              tparams['bdCls']).reshape([n_out_samps, options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)

        totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                  pCSft[tensor.arange(n_out_samps), xC]
        out_list = [pWSft, pCSft, totProb, p]

    tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).sum()
    tot_pplx = -(tensor.log2(totProb + 1e-10) * mask[1:, :].flatten()).sum()
    cost = [
        tot_cost / tensor.cast(n_samples, dtype=config.floatX), tot_pplx
    ]

    # Only variables created in this method are listed as inputs.
    inp_list = [xW, mask]
    if xI_is_inp:
        inp_list.append(xI)

    if options.get('en_aux_inp', 0) and xA_is_inp:
        inp_list.append(xAux)

    if options.get('sched_sampling_mode', None) is not None:
        inp_list.append(curr_epoch)

    f_pred_prob = theano.function([xW, xI, xAux],
                                  out_list,
                                  name='f_pred_prob',
                                  updates=updatesLSTM)

    return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
def build_model(self, tparams, options):
    """Assemble the symbolic training graph of the caption generator.

    Creates the input variables (word indices, mask, image features,
    auxiliary features), embeds them, optionally applies input dropout and
    scheduled sampling, runs ``basic_lstm_layer`` and turns the decoder
    output into per-token probabilities and the summed costs.

    Returns ``(use_noise, inp_list, f_pred_prob, cost, out_list,
    updatesLSTM)``. ``cost`` is ``[tot_cost / batch_size, tot_pplx]`` and
    ``f_pred_prob`` is an empty-list placeholder (no function is compiled
    here).
    """
    rng_stream = RandomStreams(1234)
    # Shared scalar handed to every dropout_layer call below.
    use_noise = theano.shared(numpy_floatX(0.))

    # ---- symbolic inputs -------------------------------------------------
    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    # ---- embeddings ------------------------------------------------------
    word_emb = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
        xAuxEmb = xAux
    img_proj = tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']
    img_emb = img_proj.reshape([1, n_samples, options['image_encoding_size']])
    # The image embedding is fed as the very first time step.
    emb = tensor.concatenate([img_emb, word_emb], axis=0)

    # ---- input dropout ---------------------------------------------------
    if options['use_dropout']:
        emb = dropout_layer(emb, use_noise, rng_stream,
                            options['drop_prob_encoder'], shp=emb.shape)
        if options.get('en_aux_inp', 0):
            xAuxEmb = dropout_layer(xAuxEmb, use_noise, rng_stream,
                                    options['drop_prob_aux'],
                                    shp=xAuxEmb.shape)

    # ---- scheduled sampling ----------------------------------------------
    sched_mode = options.get('sched_sampling_mode', None)
    if sched_mode is not None:
        curr_epoch = tensor.scalar(name='curr_epoch', dtype=config.floatX)
        # Pick the sampling probability according to the scheduling mode.
        if sched_mode == 'linear':
            prob = tensor.maximum(
                options['sslin_min'],
                options['sched_sampling_const'] - options['sslin_slope'] * curr_epoch)
        elif sched_mode in ('exp', 'invsig'):
            raise ValueError('ERROR: %s --> This solver type is not yet supported' % (sched_mode))
        else:
            raise ValueError('ERROR: %s --> This scheduling type is unknown' % (sched_mode))
        # Build the mask. No coin toss is wanted when feeding in the image
        # feature and the start symbol, hence the two extra rows of ones.
        coin_toss = rng_stream.binomial((n_timesteps - 2, n_samples), p=prob,
                                        n=1, dtype='int64')
        sched_mask = tensor.concatenate(
            [coin_toss, tensor.alloc(1, 2, n_samples)], axis=0)
    else:
        sched_mask = []

    # ---- core LSTM -------------------------------------------------------
    rval, updatesLSTM = basic_lstm_layer(tparams, emb[:n_timesteps, :, :],
                                         xAuxEmb, use_noise, options,
                                         prefix=options['generator'],
                                         sched_prob_mask=sched_mask)

    # NOTE1: the first prediction is left out; it was made for the image
    # and is meaningless.
    top_hidden = sliceT(rval[0][1:, :, :],
                        options.get('hidden_depth', 1) - 1,
                        options['hidden_size'])
    if options['use_dropout']:
        # XXX: the size given to dropout is missing the time dimension, which
        # keeps the dropped units consistent across time. Is this a good bug?
        p = dropout_layer(top_hidden, use_noise, rng_stream,
                          options['drop_prob_decoder'],
                          (n_samples, options['hidden_size']))
    else:
        p = top_hidden

    # ---- output probabilities --------------------------------------------
    n_out_samps = (n_timesteps - 1) * n_samples
    row_idx = tensor.arange(n_out_samps)
    targets = xW[1:, :].flatten()
    if options.get('class_out_factoring', 0) == 0:
        pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
            [n_out_samps, options['output_size']])
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[row_idx, targets]
        out_list = [pWSft, totProb, p]
    else:
        # Class-factored softmax: P(word) = P(word | class) * P(class).
        ixtoclsinfo_t = tensor.as_tensor_variable(options['ixtoclsinfo'])
        xC = ixtoclsinfo_t[targets, 0]
        flat_p = p.reshape([1, n_out_samps, options['hidden_size']])
        pW = ((tparams['Wd'][:, xC, :].T * flat_p).sum(axis=-1).T
              + tparams['bd'][:, xC, :])
        pWSft = tensor.nnet.softmax(pW[0, :, :])
        pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
            [n_out_samps, options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)
        totProb = pWSft[row_idx, ixtoclsinfo_t[targets, 3]] * pCSft[row_idx, xC]
        out_list = [pWSft, pCSft, totProb, p]

    # ---- costs -----------------------------------------------------------
    flat_mask = mask[1:, :].flatten()
    tot_cost = -(tensor.log(totProb + 1e-10) * flat_mask).sum()
    tot_pplx = -(tensor.log2(totProb + 1e-10) * flat_mask).sum()
    cost = [tot_cost / options['batch_size'], tot_pplx]

    inp_list = [xW, mask, xI]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)
    if sched_mode is not None:
        inp_list.append(curr_epoch)

    # Placeholder only; a prediction function is not compiled here.
    f_pred_prob = []
    return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
def _stepP(*in_list):
    """Advance every model of the ensemble by one beam-search step.

    ``in_list`` is laid out positionally as ``nmodels`` inputs, ``nmodels``
    hidden states, ``nmodels`` cell states, then the running log-probs and
    the done flags. The per-model softmax outputs are mixed with each
    model's ``comb_weight``, the ``beam_size`` best extensions are kept, and
    the new inputs/states for those candidates are returned together with a
    scan stopping condition that fires once every beam is done.
    """
    x_inp = list(in_list[:nmodels])
    h_inp = list(in_list[nmodels:2 * nmodels])
    c_inp = list(in_list[2 * nmodels:3 * nmodels])
    lP_ = in_list[3 * nmodels]
    dV_ = in_list[3 * nmodels + 1]

    p_comb = tensor.alloc(numpy_floatX(0.), options[0]['output_size'])
    cf = []
    h = []
    for mi in xrange(nmodels):
        weights = tparams[mi]
        hid_sz = options[mi]['hidden_size']

        preact = tensor.dot(h_inp[mi], weights[_p(prefix, 'W_hid')])
        preact += (tensor.dot(x_inp[mi], weights[_p(prefix, 'W_inp')]) +
                   weights[_p(prefix, 'b')])
        if options[mi].get('en_aux_inp', 0):
            preact += tensor.dot(aux_input2[mi], weights[_p(prefix, 'W_aux')])

        in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, hid_sz))
        forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, hid_sz))
        out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, hid_sz))
        candidate = tensor.tanh(sliceT(preact, 3, hid_sz))

        cell = forget_gate * c_inp[mi] + in_gate * candidate
        hidden = out_gate * tensor.tanh(cell)
        cf.append(cell)
        h.append(hidden)

        # Weighted mixture of the per-model word distributions.
        scores = tensor.dot(hidden, weights['Wd']) + weights['bd']
        weighted = weights['comb_weight'] * tensor.nnet.softmax(scores)
        if mi == 0:
            p_comb = weighted
        else:
            p_comb = p_comb + weighted

    lProb = tensor.log(p_comb + 1e-20)

    def _FindB_best(lPLcl, lPprev, dVLcl):
        # Top beam_size extensions of a single beam. A beam whose done flag
        # is non-zero only carries its previous score forward in slot 0 with
        # word index 0; its other slots get a large negative score.
        top_idx = tensor.argsort(-lPLcl)[:beam_size]
        still_open = tensor.eq(dVLcl, tensor.zeros_like(dVLcl))
        frozen = tensor.fill(lPLcl[top_idx], numpy_floatX(-10000.))
        frozen = tensor.set_subtensor(frozen[0], lPprev)
        best_lp = ifelse(still_open, lPLcl[top_idx] + lPprev, frozen)
        best_idx = ifelse(still_open, top_idx, tensor.zeros_like(top_idx))
        return best_lp, best_idx

    rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                      sequences=[lProb, lP_, dV_],
                                      name=_p(prefix, 'FindBest'),
                                      n_steps=x_inp[0].shape[0])
    lProb = rvalLcl[0].flatten()
    xWIdxBest = rvalLcl[1].flatten()

    # Now sort and find the best among these best extensions for the
    # current beams.
    srtIdx = tensor.argsort(-lProb)[:beam_size]
    xWlogProb = lProb[srtIdx]
    xWIdx = xWIdxBest[srtIdx]
    xCandIdx = srtIdx // beam_size  # floor division: originating beam
    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

    word_ids = xWIdx.flatten()
    cand_ids = xCandIdx.flatten()
    out_list = [tparams[mi]['Wemb'][word_ids] for mi in xrange(nmodels)]
    out_list.extend(h[mi].take(cand_ids, axis=0) for mi in xrange(nmodels))
    out_list.extend(cf[mi].take(cand_ids, axis=0) for mi in xrange(nmodels))
    out_list.extend([xWlogProb, doneVec, xWIdx, xCandIdx])
    return out_list, theano.scan_module.until(doneVec.all())
def lstm_layer(self, tparams, state_below, aux_input, use_noise, options,
               prefix='lstm', mask=None):
    """Run a (possibly stacked) LSTM over ``state_below`` with theano.scan.

    ``state_below`` is projected through ``W_inp`` and ``b`` once up front;
    the scan step adds the recurrent term (and the auxiliary-input term when
    ``options['en_aux_inp']`` is set) and evaluates ``hidden_depth`` layers.
    ``mask`` is required and is passed to the step as a sequence, although
    the step does not use its value. Returns ``(rval, updates)`` from scan,
    where ``rval`` holds the concatenated hidden and cell states.
    """
    nsteps = state_below.shape[0]
    h_depth = options.get('hidden_depth', 1)
    h_sz = options['hidden_size']
    n_samples = state_below.shape[1] if state_below.ndim == 3 else 1

    assert mask is not None

    def _step(m_, x_, h_prev, c_prev, xAux):
        gates = tensor.dot(sliceT(h_prev, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        gates += x_
        if options.get('en_aux_inp', 0):
            gates += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

        hid_layers = []
        cell_layers = []
        for layer in xrange(h_depth):
            in_gate = tensor.nnet.sigmoid(sliceT(gates, 0, h_sz))
            forget_gate = tensor.nnet.sigmoid(sliceT(gates, 1, h_sz))
            out_gate = tensor.nnet.sigmoid(sliceT(gates, 2, h_sz))
            candidate = tensor.tanh(sliceT(gates, 3, h_sz))

            cell = forget_gate * sliceT(c_prev, layer, h_sz) + in_gate * candidate
            hidden = out_gate * tensor.tanh(cell)
            cell_layers.append(cell)
            hid_layers.append(hidden)

            # Pre-activation of the next layer: its own previous hidden
            # state plus this layer's fresh output.
            if layer < (h_depth - 1):
                nxt = str(layer + 1)
                gates = tensor.dot(sliceT(h_prev, layer + 1, h_sz),
                                   tparams[_p(prefix, ('W_hid_' + nxt))]) + \
                        tensor.dot(hidden,
                                   tparams[_p(prefix, ('W_inp_' + nxt))])

        c_out = tensor.concatenate(cell_layers, axis=1)
        h_out = tensor.concatenate(hid_layers, axis=1)
        return h_out, c_out

    # The input projection is shared by all time steps, so do it once.
    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])

    if options.get('en_aux_inp', 0) == 0:
        aux_input = []

    zero_h = tensor.alloc(numpy_floatX(0.), n_samples, h_depth * h_sz)
    zero_c = tensor.alloc(numpy_floatX(0.), n_samples, h_depth * h_sz)
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[zero_h, zero_c],
                                non_sequences=[aux_input],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval, updates
def build_eval_other_sent(self, tparams, options, model_npy):
    """Build functions that score given sentences under the generator.

    ``model_npy`` is first passed to ``zipp`` together with
    ``self.model_th`` (presumably loading the numpy weights into the shared
    variables -- confirm against ``zipp``). The method then builds the
    embedding + LSTM + output-projection graph and compiles two functions
    stored on ``self``:

    * ``self.f_pred_prob_other`` -- returns the output-layer pre-softmax
      scores ``p`` with the first (image) time step dropped.
    * ``self.f_eval_other`` -- returns the masked log-probability
      accumulated over all time steps.

    Returns ``(use_noise, inp_list, self.f_pred_prob_other, cost, p,
    updatesLSTM)``.
    """
    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    rval, updatesLSTM = self.lstm_layer(tparams, emb[:n_timesteps, :, :],
                                        xAux, use_noise, options,
                                        prefix=options['generator'],
                                        mask=mask)
    p = rval[0]
    p = tensor.dot(p, tparams['Wd']) + tparams['bd']
    # NOTE1: the first prediction is left out; it was made for the image
    # and is meaningless.
    p = p[1:, :, :]

    def accumCost(pred, xW, m, c_sum, ppl_sum):
        # Per time step: add the masked natural-log probability of the
        # target word to c_sum and the negated masked log2 probability to
        # ppl_sum.
        pred = tensor.nnet.softmax(pred)
        c_sum += (tensor.log(pred[tensor.arange(n_samples), xW] + 1e-20) * m)
        ppl_sum += -(
            tensor.log2(pred[tensor.arange(n_samples), xW] + 1e-10) * m)
        return c_sum, ppl_sum

    sums, upd = theano.scan(fn=accumCost,
                            outputs_info=[
                                tensor.alloc(numpy_floatX(0.), 1, n_samples),
                                tensor.alloc(numpy_floatX(0.), 1, n_samples)
                            ],
                            sequences=[p, xW[1:, :], mask[1:, :]])

    # `cost` is the accumulated natural-log probability after the last time
    # step (sums[0]); the log2 accumulator (sums[1]) is computed by the scan
    # but not returned.
    cost = sums[0][-1]

    inp_list = [xW, xI, mask]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    # Compile the prediction function once. An identical function used to be
    # compiled into an unused local first, which only added compile time.
    self.f_pred_prob_other = theano.function(inp_list, p, name='f_pred_prob',
                                             updates=updatesLSTM)
    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, p, updatesLSTM
def main(params):
    """Train the decode-evaluator model and periodically checkpoint it.

    ``params`` is the run configuration dict; it is read throughout and also
    mutated (``image_feat_size``, ``use_dropout``, ``mode``,
    ``vocabulary_size``, ``learning_rate``, ``lr_decay_st_epoch`` and, when
    fine-tuning from a checkpoint, ``batch_size``).

    The function builds the vocabulary and model graph, compiles the
    evaluation and solver functions, then runs the training loop. About once
    a minute it writes a JSON status file; every ``eval_period`` epochs (and
    on the last iteration) it evaluates on the validation split, applies
    learning-rate decay and pickles a checkpoint.

    NOTE(review): ``checkpoint_init``, ``model_init_from`` and ``rg_init``
    are not defined in this function; they are presumably module-level
    globals set by the launcher when resuming -- confirm.
    """
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # tags the status file and checkpoints

    # fetch the data provider
    dp = getDataProvider(params)
    # Initialize the optimizer
    solver = Solver(params['solver'])
    params['image_feat_size'] = dp.img_feat_size

    misc = {}  # misc items that need to be passed around the framework

    # Go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold times.
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)
    params['use_dropout'] = 1

    if params['fine_tune'] == 1:
        # NOTE(review): both arms of this conditional yield the same value;
        # kept as-is because the intended alternative is not visible here.
        params['mode'] = 'multimodal_lstm' if params['multimodal_lstm'] == 0 else 'multimodal_lstm'
        if params['checkpoint_file_name'] is not None:
            params['batch_size'] = dp.dataset['batchsize']
            misc['wordtoix'] = checkpoint_init['wordtoix']
            misc['ixtoword'] = checkpoint_init['ixtoword']
        batch_size = 1
        num_sentences_total = dp.getSplitSize('train', ofwhat='images')
    else:
        params['mode'] = 'batchtrain'
        batch_size = params['batch_size']
        num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')

    params['vocabulary_size'] = len(misc['wordtoix'])
    pos_samp = np.arange(batch_size, dtype=np.int32)

    # This initializes the model parameters and does matrix initializations
    evalModel = decodeEvaluator(params)
    model, misc['update'], misc['regularize'] = (evalModel.model_th,
                                                 evalModel.updateP,
                                                 evalModel.regularize)

    # Define the computational graph relating the input image features and
    # word indices to the log probability cost function.
    (use_dropout, inp_list, miscOuts, cost, predTh,
     model) = evalModel.build_model(model, params)

    # Add the regularization cost. It is specific to training and is not
    # included when evaluating on test or validation data, so it lives here
    # outside the model definition.
    if params['regc'] > 0.:
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p] ** 2).sum()
        # The 0.5 * regc factor is applied once, after all squared norms
        # have been summed.
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Compile an evaluation function (no gradients), used for validation.
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now build the gradient computation graph and the solver updates.
    grads = tensor.grad(cost, wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    if params['sim_minibatch'] > 0:
        f_grad_accum, f_clr, ag = solver.accumGrads(model, grads, inp_list,
                                                    cost,
                                                    params['sim_minibatch'])
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, ag, inp_list, cost, params)
    else:
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, grads, inp_list, cost, params)

    print('model init done.')
    print('model has keys: ' + ', '.join(model.keys()))

    # One epoch is one pass over all the sentences, not images; for
    # coco/flickr this is 5 * number of images. Floor division keeps the
    # iteration counts integral for xrange().
    num_iters_one_epoch = num_sentences_total // batch_size
    max_iters = max_epochs * num_iters_one_epoch
    inner_loop = params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1
    max_iters = max_iters // inner_loop
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs / inner_loop))
    top_val_ppl2 = -1
    smooth_train_cost = len(misc['ixtoword'])  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []

    len_hist = defaultdict(int)

    # Initialize the model parameters from the checkpoint file if we are
    # resuming training.
    if params['checkpoint_file_name'] is not None:
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (
            checkpoint_init['epoch'], checkpoint_init['perplexity']))
    elif params['init_from_imagernn'] is not None:
        # Initialize word vecs and image emb from generative model file.
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # point this option at trusted model files.
        with open(params['init_from_imagernn'], 'rb') as rnn_file:
            rnnCv = pickle.load(rnn_file)
        model['Wemb'].set_value(rnnCv['model']['Wemb'])
        model['WIemb'].set_value(rnnCv['model']['WIemb_aux'])
        misc['wordtoix'] = rnnCv['wordtoix']
        misc['ixtoword'] = rnnCv['ixtoword']
        print("\n Initialized Word embedding and Image embeddings from gen mode %s" % (params['init_from_imagernn']))

    def _next_inputs():
        # Sample one positive/negative batch and convert it to the list of
        # arrays expected by the compiled functions.
        batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'],
                                                      params['mode'],
                                                      thresh=0.3)
        real_inp_list, lenS = prepare_data(batch, misc['wordtoix'],
                                           maxlen=params['maxlen'],
                                           pos_samp=pos_samp,
                                           prep_for=params['eval_model'])
        if params['fine_tune'] == 1:
            real_inp_list.append(pos_samp_sent)
        return real_inp_list

    # Enable dropout for training.
    use_dropout.set_value(1.)

    #################### Main Loop ############################################
    for it in xrange(max_iters):
        t0 = time.time()

        # fetch a batch of data
        cost_inner = np.zeros((inner_loop,), dtype=np.float32)
        if params['sim_minibatch'] > 0:
            # Simulated large minibatch: accumulate gradients over several
            # small batches before the solver step.
            for i_l in xrange(inner_loop):
                real_inp_list = _next_inputs()
                cost_inner[i_l] = f_grad_accum(*real_inp_list)
        else:
            real_inp_list = _next_inputs()

        batch_cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # Reset accumulated gradients to 0
        if params['sim_minibatch'] > 0:
            f_clr()

        # training statistics
        epoch = it * inner_loop * 1.0 / num_iters_one_epoch
        total_cost = (np.e**-batch_cost +
                      (np.e**(-cost_inner)).sum() * (params['sim_minibatch'] > 0)
                      ) / (1 + params['sim_minibatch'])
        if it == 0:
            smooth_train_cost = total_cost
        else:
            smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost

        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then write a report
            print('%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.3f' % (
                it, max_iters, dt, epoch, smooth_train_cost))
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(params['worker_status_output_directory'],
                                       host + '_status.json')
            try:
                with open(status_file, 'w') as status_fh:
                    json.dump(json_worker_status, status_fh)
            except Exception as e:  # best effort: a failed report must not stop training
                print('tried to write worker status into %s but got error:' % (status_file, ))
                print(e)

        # Evaluate on the validation set and save a model checkpoint.
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
            # Disable dropout for validation.
            use_dropout.set_value(0.)

            val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval)

            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params['lr_decay']
                params['lr_decay_st_epoch'] += 1

            print('validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate']))
            if params['sample_by_len'] == 1:
                print(len_hist)

            write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
            # NOTE: top_val_ppl2 is never updated in this driver (the
            # assignment was disabled), so it stays negative and every
            # evaluation that passes the user threshold writes a checkpoint.
            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (
                        params['eval_model'], params['dataset'], host,
                        params['fappend'], val_ppl2, smooth_train_cost)
                    filepath = os.path.join(params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['wordtoix'] = misc['wordtoix']
                    checkpoint['ixtoword'] = misc['ixtoword']
                    try:
                        with open(filepath, "wb") as ckpt_fh:
                            pickle.dump(checkpoint, ckpt_fh)
                        print('saved checkpoint in %s' % (filepath, ))
                    except Exception as e:  # best effort: keep training on failure
                        print('tried to write checkpoint into %s but got error: ' % (filepath, ))
                        print(e)

            # Re-enable dropout for the following training iterations.
            use_dropout.set_value(1.)
def build_model(self, tparams, options):
    """Build the symbolic graph of the feature-sequence encoder.

    The input ``xI`` is either a float tensor3 of features
    (``self.use_shared_features == False``) or an int64 matrix of indices
    into ``self.features``. The (optionally dropped-out) sequence is run
    through ``self.lstm_enc_layer``; for each sample the encoder state at
    time index ``samp_lens`` is selected as the encoding. With
    ``self.encoder == 'bilstm'`` the last state of a reversed pass is added,
    and with ``options['encoder_add_mean']`` the time-summed states divided
    by ``samp_lens`` are added as well.

    Returns ``(self.use_noise, inp_list, feat_enc, updatesLSTM)`` where
    ``inp_list`` is ``[xI, samp_lens]``.
    """
    trng = RandomStreams(1234)

    # Used for dropout.
    self.use_noise = theano.shared(numpy_floatX(0.))

    if self.use_shared_features == False:
        xI = tensor.tensor3('xI', dtype=config.floatX)
        xIemb = xI
        n_timesteps = xI.shape[0]
        n_samples = xI.shape[1]
    else:
        # xI holds indices; look the features up in the shared table.
        xI = tensor.matrix('xI', dtype='int64')
        n_timesteps = xI.shape[0]
        n_samples = xI.shape[1]
        xIemb = self.features[xI.flatten(), :].reshape(
            [n_timesteps, n_samples, self.image_feat_size])

    samp_lens = tensor.vector('sL', dtype='int64')

    # Input dropout. `emb` must be bound on both paths: previously it was
    # only assigned when dropout was enabled, so disabling dropout raised a
    # NameError on the next statement.
    if options['use_dropout']:
        emb = dropout_layer(xIemb, self.use_noise, trng,
                            options['drop_prob_encoder'], shp=xIemb.shape)
    else:
        emb = xIemb

    # Core (forward) LSTM.
    rval, updatesLSTM = self.lstm_enc_layer(tparams, emb,
                                            prefix=self.mp + 'lstm')

    # Core reverse LSTM.
    # NOTE(review): the scan updates of the reverse pass are not returned,
    # and this call passes fewer arguments than the other visible call to
    # basic_lstm_layer -- confirm against its definition.
    if self.encoder == 'bilstm':
        rev_rval, rev_updatesLSTM = basic_lstm_layer(
            tparams, emb[::-1, :, :], prefix=self.mp + 'rev_lstm')

    # Per sample, pick the state at time index samp_lens and take slice
    # number `hidden_depth` of it (presumably the final-layer output that
    # lstm_enc_layer appends after the per-layer states -- confirm sliceT).
    p = sliceT(rval[0][samp_lens, tensor.arange(n_samples), :],
               self.hidden_depth, self.hidden_size)

    if self.encoder == 'bilstm':
        rev_p = sliceT(rev_rval[0][-1, :, :], self.hidden_depth,
                       self.hidden_size)
        feat_enc = p + rev_p
    else:
        feat_enc = p

    if options.get('encoder_add_mean', 0):
        feat_enc = feat_enc + (
            sliceT(rval[0], self.hidden_depth, self.hidden_size).sum(axis=0)
            / samp_lens[:, None])

    inp_list = [xI, samp_lens]

    return self.use_noise, inp_list, feat_enc, updatesLSTM
def lstm_advers_gen_layer(self, tparams, Xi, aux_input, options, beam_size,
                          prefix='lstm'):
    """Greedy symbolic decoder that also exposes the per-step softmax.

    The step function is applied once to the image embedding ``Xi``, once
    to the start-word embedding (row 0 of ``Wemb``), and then looped with
    theano.scan for at most ``maxlen - 1`` further steps, stopping early
    when every emitted word index is 0.

    Returns ``(final_log_prob, word_indices, word_distributions, updates)``.
    """
    nMaxsteps = options.get('maxlen', 15)
    n_samples = 1
    h_depth = options.get('hidden_depth', 1)
    h_sz = options['hidden_size']

    # ---------------------- STEP FUNCTION ---------------------- #
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        gates = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        gates += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                  tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp', 0):
            gates += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

        hid_layers = []
        cell_layers = []
        for layer in xrange(h_depth):
            in_gate = tensor.nnet.sigmoid(sliceT(gates, 0, h_sz))
            forget_gate = tensor.nnet.sigmoid(sliceT(gates, 1, h_sz))
            out_gate = tensor.nnet.sigmoid(sliceT(gates, 2, h_sz))
            candidate = tensor.tanh(sliceT(gates, 3, h_sz))

            cell = forget_gate * sliceT(c_, layer, h_sz) + in_gate * candidate
            hidden = out_gate * tensor.tanh(cell)
            cell_layers.append(cell)
            hid_layers.append(hidden)

            # Pre-activation feeding the next stacked layer.
            if layer < (h_depth - 1):
                nxt = str(layer + 1)
                gates = tensor.dot(sliceT(h_, layer + 1, h_sz),
                                   tparams[_p(prefix, ('W_hid_' + nxt))]) + \
                        tensor.dot(hidden,
                                   tparams[_p(prefix, ('W_inp_' + nxt))])

        c = tensor.concatenate(cell_layers, axis=1)
        h = tensor.concatenate(hid_layers, axis=1)

        # Word distribution from the top layer, with an optional scale
        # factor applied to the scores before the softmax.
        scores = tensor.dot(hid_layers[-1], tparams['Wd']) + tparams['bd']
        smooth_factor = tensor.as_tensor_variable(
            numpy_floatX(options.get('softmax_smooth_factor', 1.0)),
            name='sm_f')
        p = tensor.nnet.softmax(scores * smooth_factor)
        lProb = tensor.log(p + 1e-20).flatten()

        xWIdx = tensor.argmax(lProb, keepdims=True)
        xWlogProb = lProb[xWIdx] + lP_

        # Next input: embedding of the argmax word, or the
        # probability-weighted embedding when 'softmax_propogate' is set.
        if options.get('softmax_propogate', 0) == 0:
            xW = tparams['Wemb'][xWIdx.flatten()]
        else:
            xW = p.dot(tparams['Wemb'])

        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
        step_out = [xW, h, c, xWlogProb, doneVec, xWIdx, p]
        return step_out, theano.scan_module.until(doneVec.all())
    # ------------------- END of STEP FUNCTION -------------------- #

    if options.get('en_aux_inp', 0) == 0:
        aux_input = []

    state_width = h_sz * h_depth
    h = tensor.alloc(numpy_floatX(0.), n_samples, state_width)
    c = tensor.alloc(numpy_floatX(0.), n_samples, state_width)
    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)

    # Propagate the image feature vector.
    first_out, _ = _stepP(Xi, h, c, lP, dV, aux_input)
    xW, h, c = first_out[0], first_out[1], first_out[2]

    # Feed the start symbol.
    xWStart = tparams['Wemb'][0, :]
    second_out, _ = _stepP(xWStart, h, c, lP, dV, aux_input)
    xW, h, c, lP, dV, idx0, p0 = second_out

    # Now do the loop.
    rval, updates = theano.scan(_stepP,
                                outputs_info=[xW, h, c, lP, dV, None, None],
                                non_sequences=[aux_input],
                                name=_p(prefix, 'predict_layers'),
                                n_steps=nMaxsteps - 1)

    word_ids = tensor.concatenate([idx0.reshape([1, beam_size]), rval[5]],
                                  axis=0)
    word_probs = tensor.concatenate(
        [tensor.shape_padleft(p0, n_ones=1), rval[6]], axis=0)
    return rval[3][-1], word_ids, word_probs, updates
def main(params):
    """Train the LSTM caption generator and checkpoint on validation gains.

    ``params`` is the run configuration dict; it is read throughout and also
    mutated (``aux_inp_size``, ``image_feat_size``, ``vocabulary_size``,
    ``output_size``, ``use_dropout``, ``learning_rate``,
    ``lr_decay_st_epoch``).

    The function builds the vocabulary and model graph, compiles the
    evaluation and rmsprop functions, then runs the training loop. About
    once a minute it writes a JSON status file; every ``eval_period`` epochs
    (and on the last iteration) it evaluates on the validation split,
    applies learning-rate decay and pickles a checkpoint when the validation
    perplexity improves.

    NOTE(review): ``checkpoint_init``, ``model_init_from``, ``rg_init`` and
    ``sentTagMap`` are not defined in this function; they are presumably
    module-level globals set by the launcher -- confirm.
    """
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # tags the status file and checkpoints

    # fetch the data provider
    dp = getDataProvider(params)

    params['aux_inp_size'] = dp.aux_inp_size
    params['image_feat_size'] = dp.img_feat_size

    print('Image feature size is %d, and aux input size is %d' % (
        params['image_feat_size'], params['aux_inp_size']))

    misc = {}  # misc items that need to be passed around the framework

    # Go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold times.
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)

    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though
    params['use_dropout'] = 1

    # This initializes the model parameters and does matrix initializations
    lstmGenerator = LSTMGenerator(params)
    model, misc['update'], misc['regularize'] = (lstmGenerator.model_th,
                                                 lstmGenerator.update,
                                                 lstmGenerator.regularize)

    # Force overwrite here: the bias to the softmax is initialized to
    # reflect word frequencies. This is a bit of a hack.
    model['bd'].set_value(bias_init_vector.astype(config.floatX))

    # Define the computational graph relating the input image features and
    # word indices to the log probability cost function.
    (use_dropout, inp_list, f_pred_prob, cost, predTh,
     updatesLSTM) = lstmGenerator.build_model(model, params)

    # Add the regularization cost. It is specific to training and is not
    # included when evaluating on test or validation data, so it lives here
    # outside the model definition.
    if params['regc'] > 0.:
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p]**2).sum()
        # The 0.5 * regc factor is applied once, after all squared norms
        # have been summed.
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Compile an evaluation function (no gradients), used for validation.
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now build the gradient computation graph and rmsprop update mechanism.
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    f_grad_shared, f_update, zg, rg, ud = lstmGenerator.rmsprop(
        lr, model, grads, inp_list, cost, params)

    print('model init done.')
    print('model has keys: ' + ', '.join(model.keys()))

    # One epoch is one pass over all the sentences, not images; for
    # coco/flickr this is 5 * number of images. Floor division keeps the
    # iteration counts integral for xrange().
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total // batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs))
    top_val_ppl2 = -1
    smooth_train_ppl2 = len(misc['ixtoword'])  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []

    len_hist = defaultdict(int)

    # Initialize the model parameters from the checkpoint file if we are
    # resuming training. Note this driver encodes "no checkpoint" as the
    # string 'None'.
    if params['checkpoint_file_name'] != 'None':
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (
            checkpoint_init['epoch'], checkpoint_init['perplexity']))

    for it in xrange(max_iters):
        t0 = time.time()

        # fetch a batch of data
        if params['sample_by_len'] == 0:
            batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
        else:
            batch, l = dp.getRandBatchByLen(batch_size)
            len_hist[l] += 1

        if params['use_pos_tag'] != 'None':
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], None,
                                               sentTagMap, misc['ixtoword'])
        else:
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'])

        # Enable dropout for training.
        use_dropout.set_value(1.)

        # evaluate cost, gradient and perform parameter update
        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # training statistics
        train_ppl2 = (2**(cost[1] / lenS))
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out
        else:
            # exponentially decaying moving average
            smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2

        epoch = it * 1.0 / num_iters_one_epoch
        total_cost = cost[0]

        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then write a report
            print('%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' % (
                it, max_iters, dt, epoch, total_cost, smooth_train_ppl2))
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['smooth_train_ppl2'] = smooth_train_ppl2
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            jstatus['train_ppl2'] = train_ppl2
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            try:
                with open(status_file, 'w') as status_fh:
                    json.dump(json_worker_status, status_fh)
            except Exception as e:  # best effort: a failed report must not stop training
                print('tried to write worker status into %s but got error:' % (status_file, ))
                print(e)

        # Evaluate on the validation set and save a model checkpoint if it
        # is good.
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
            # Disable dropout for validation; it is switched back on at the
            # top of the next training iteration.
            use_dropout.set_value(0.)

            val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval)

            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params['lr_decay']
                params['lr_decay_st_epoch'] += 1

            print('validation perplexity = %f, lr = %f' % (
                val_ppl2, params['learning_rate']))
            if params['sample_by_len'] == 1:
                print(len_hist)

            write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    # We beat the previous record (or this is the first
                    # evaluation) AND we beat the user-defined threshold
                    # (or none is set).
                    top_val_ppl2 = val_ppl2
                    filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (
                        params['dataset'], host, params['fappend'], val_ppl2)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['wordtoix'] = misc['wordtoix']
                    checkpoint['ixtoword'] = misc['ixtoword']
                    try:
                        with open(filepath, "wb") as ckpt_fh:
                            pickle.dump(checkpoint, ckpt_fh)
                        print('saved checkpoint in %s' % (filepath, ))
                    except Exception as e:  # best effort: keep training on failure
                        print('tried to write checkpoint into %s but got error: ' % (filepath, ))
                        print(e)
def main(params):
    """Train the LSTM generator configured by ``params`` and checkpoint it.

    Builds the vocabulary and the Theano model, compiles the evaluation and
    rmsprop update functions, then runs the training loop. Roughly once a
    minute a JSON status report is written to
    ``params['worker_status_output_directory']``; every evaluation period the
    validation perplexity is computed and, when it beats the best value so
    far (and the optional user threshold), a pickle checkpoint is written to
    ``params['checkpoint_output_directory']``.

    ``params`` is mutated in place: ``aux_inp_size``, ``image_feat_size``,
    ``vocabulary_size``, ``output_size`` and ``use_dropout`` are set, and
    ``learning_rate`` / ``lr_decay_st_epoch`` are updated as the learning
    rate decays.

    Returns nothing.
    """
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(params)

    params['aux_inp_size'] = dp.aux_inp_size
    params['image_feat_size'] = dp.img_feat_size

    print('Image feature size is %d, and aux input size is %d' % (
        params['image_feat_size'], params['aux_inp_size']))

    misc = {}  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)
    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though
    params['use_dropout'] = 1

    # This initializes the model parameters and does matrix initializations
    lstmGenerator = LSTMGenerator(params)
    model, misc['update'], misc['regularize'] = (
        lstmGenerator.model_th, lstmGenerator.update, lstmGenerator.regularize)

    # force overwrite here. The bias to the softmax is initialized to reflect
    # word frequencies. This is a bit of a hack, not happy about it
    model['bd'].set_value(bias_init_vector.astype(config.floatX))

    # Define the computational graph relating the input image features and
    # word indices to the log probability cost function.
    (use_dropout, inp_list, f_pred_prob, cost, predTh,
     updatesLSTM) = lstmGenerator.build_model(model, params)

    # Add the regularization cost. Since this is specific to training and is
    # not included when we evaluate the cost on test or validation data, it
    # lives here outside the model definition.
    if params['regc'] > 0.:
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p] ** 2).sum()
        # NOTE(review): the scale is applied once to the summed squares
        # (standard L2 penalty); confirm this matches the intended placement.
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Compile an evaluation function. Doesn't include gradients.
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    f_grad_shared, f_update, zg, rg, ud = lstmGenerator.rmsprop(
        lr, model, grads, inp_list, cost, params)

    print('model init done.')
    print('model has keys: ' + ', '.join(model.keys()))

    # calculate how many iterations we need. One epoch is one pass through all
    # the sentences (not images), hence for coco/flickr this is 5 * no. of images
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    # explicit floor division: the iteration count must stay an integer
    num_iters_one_epoch = num_sentences_total // batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs))
    top_val_ppl2 = -1
    smooth_train_ppl2 = len(misc['ixtoword'])  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []

    len_hist = defaultdict(int)

    # Initialize the model parameters from the checkpoint file if we are
    # resuming training.
    # NOTE(review): model_init_from, rg_init and checkpoint_init are not
    # defined in this function; presumably module-level globals -- confirm.
    if params['checkpoint_file_name'] != 'None':
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (
            checkpoint_init['epoch'], checkpoint_init['perplexity']))

    for it in xrange(max_iters):
        t0 = time.time()
        # fetch a batch of data
        if params['sample_by_len'] == 0:
            batch = [dp.sampleImageSentencePair() for _ in xrange(batch_size)]
        else:
            batch, l = dp.getRandBatchByLen(batch_size)
            len_hist[l] += 1

        if params['use_pos_tag'] != 'None':
            # NOTE(review): sentTagMap is not defined in this function;
            # presumably a module-level global -- confirm.
            real_inp_list, lenS = prepare_data(
                batch, misc['wordtoix'], None, sentTagMap, misc['ixtoword'])
        else:
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'])

        # Enable using dropout in training
        use_dropout.set_value(1.)

        # evaluate cost, gradient and perform parameter update
        batch_cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # training statistics
        train_ppl2 = (2 ** (batch_cost[1] / lenS))
        # smooth exponentially decaying moving average
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out
        epoch = it * 1.0 / num_iters_one_epoch
        total_cost = batch_cost[0]

        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            print('%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' % (
                it, max_iters, dt, epoch, total_cost, smooth_train_ppl2))
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['smooth_train_ppl2'] = smooth_train_ppl2
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            jstatus['train_ppl2'] = train_ppl2
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            try:
                # 'with' guarantees the handle is closed even if dump() raises
                with open(status_file, 'w') as status_fh:
                    json.dump(json_worker_status, status_fh)
            except Exception as e:  # todo be more clever here
                print('tried to write worker status into %s but got error:' % (
                    status_file, ))
                print(e)

        # perform perplexity evaluation on the validation set and save a
        # model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0 and
                it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            use_dropout.set_value(0.)

            # perform the evaluation on VAL set
            val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval)

            # Decay the learning rate once the decay start epoch is reached,
            # then push the threshold one epoch further.
            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = (params['learning_rate'] *
                                           params['lr_decay'])
                params['lr_decay_st_epoch'] += 1

            print('validation perplexity = %f, lr = %f' % (
                val_ppl2, params['learning_rate']))
            if params['sample_by_len'] == 1:
                print(len_hist)

            write_checkpoint_ppl_threshold = params[
                'write_checkpoint_ppl_threshold']
            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if (val_ppl2 < write_checkpoint_ppl_threshold or
                        write_checkpoint_ppl_threshold < 0):
                    # if we beat a previous record or if this is the first time
                    # AND we also beat the user-defined threshold or it doesnt exist
                    top_val_ppl2 = val_ppl2
                    filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (
                        params['dataset'], host, params['fappend'], val_ppl2)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['wordtoix'] = misc['wordtoix']
                    checkpoint['ixtoword'] = misc['ixtoword']
                    try:
                        # close the checkpoint file deterministically
                        with open(filepath, "wb") as ckpt_fh:
                            pickle.dump(checkpoint, ckpt_fh)
                        print('saved checkpoint in %s' % (filepath, ))
                    except Exception as e:  # todo be more clever here
                        print('tried to write checkpoint into %s but got error: ' % (
                            filepath, ))
                        print(e)
def build_model(self, tparams, options):
    """Build the training graph that combines a forward and a reversed LSTM.

    Symbolic inputs created here: ``xW`` (int64 word-index matrix), ``mask``,
    ``xI`` (image features) and ``xAux`` (auxiliary features).

    Returns a tuple ``(use_noise, inp_list, f_pred_prob, cost, out_list,
    updatesLSTM)`` where ``use_noise`` is the shared dropout switch,
    ``inp_list`` the symbolic inputs, ``f_pred_prob`` a 3-element list whose
    middle entry is the compiled per-sentence log-probability function,
    ``cost`` is ``[natural-log cost / batch_size, log2 cost]`` and
    ``out_list`` holds the softmax outputs, target probabilities and ``p``.
    """
    trng = RandomStreams(1234)

    # Shared switch handed to every dropout layer.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    # Word embeddings in original and time-reversed order.
    word_emb_shape = [n_timesteps, n_samples, options['word_encoding_size']]
    embW = tparams['Wemb'][xW.flatten()].reshape(word_emb_shape)
    embW_rev = tparams['Wemb'][xW[::-1, :].flatten()].reshape(word_emb_shape)

    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    # Optionally project the auxiliary input before it is fed to the LSTMs.
    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
        xAuxEmb = xAux

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)
    # In the reversed sequence the image embedding overwrites the slot at
    # index (argmax of the reversed mask) - 1 for every sample.
    emb_rev = tensor.set_subtensor(
        embW_rev[mask[::-1, :].argmax(axis=0) - 1,
                 tensor.arange(n_samples), :],
        embImg[0, :, :])

    # Input dropout.
    if options['use_dropout']:
        emb = dropout_layer(emb, use_noise, trng,
                            options['drop_prob_encoder'], shp=emb.shape)
        if options.get('en_aux_inp', 0):
            xAuxEmb = dropout_layer(xAuxEmb, use_noise, trng,
                                    options['drop_prob_aux'],
                                    shp=xAuxEmb.shape)

    # Core forward LSTM.
    rval, updatesLSTM = basic_lstm_layer(tparams, emb[:n_timesteps, :, :],
                                         xAuxEmb, use_noise, options,
                                         prefix='lstm', sched_prob_mask=[])
    # Core reverse LSTM.
    rev_rval, rev_updatesLSTM = basic_lstm_layer(
        tparams, emb_rev[:n_timesteps, :, :], xAuxEmb, use_noise, options,
        prefix='rev_lstm', sched_prob_mask=[])

    # NOTE1: the first forward prediction, made for the image, is left out.
    depth = options.get('hidden_depth', 1)
    hid = options['hidden_size']
    p = sliceT(rval[0][1:, :, :], depth, hid)
    rev_p = sliceT(rev_rval[0][:, :, :], depth, hid)
    if options['use_dropout']:
        # XXX : Size given to dropout is missing one dimension. This keeps the
        # dropped units consistent across time!?. Is this a good bug ?
        drop_shape = (n_samples, hid)
        p = dropout_layer(p, use_noise, trng, options['drop_prob_decoder'],
                          drop_shape)
        rev_p = dropout_layer(rev_p, use_noise, trng,
                              options['drop_prob_decoder'], drop_shape)

    n_out_samps = (n_timesteps - 2) * n_samples
    sample_rows = tensor.arange(n_out_samps)
    if options.get('class_out_factoring', 0) == 0:
        summed_states = p[:-1, :, :] + rev_p[::-1, :, :][2:, :, :]
        pW = (tensor.dot(summed_states, tparams['Wd']) +
              tparams['bd']).reshape([n_out_samps, options['output_size']])
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[sample_rows, xW[1:-1, :].flatten()]
        out_list = [pWSft, totProb, p]
    else:
        # Class-factored output: P(word) = P(word | class) * P(class).
        ixtoclsinfo_t = tensor.as_tensor_variable(options['ixtoclsinfo'])
        target_words = xW[1:, :].flatten()
        xC = ixtoclsinfo_t[target_words, 0]
        pW = ((tparams['Wd'][:, xC, :].T *
               (p.reshape([1, n_out_samps, options['hidden_size']]))).sum(
                   axis=-1).T + tparams['bd'][:, xC, :])
        pWSft = tensor.nnet.softmax(pW[0, :, :])
        pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
            [n_out_samps, options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)
        totProb = (pWSft[sample_rows, ixtoclsinfo_t[target_words, 3]] *
                   pCSft[sample_rows, xC])
        out_list = [pWSft, pCSft, totProb, p]

    # XXX : THIS IS VERY FISHY, CHECK THE MASK INDEXING AGAIN
    flat_mask = mask[1:-1, :].flatten()
    probs_valid = tensor.log(totProb + 1e-10) * flat_mask
    tot_cost = -(probs_valid.sum())
    tot_pplx = -(tensor.log2(totProb + 1e-10) * flat_mask).sum()
    cost = [tot_cost / options['batch_size'], tot_pplx]

    inp_list = [xW, mask, xI]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)
    if options.get('sched_sampling_mode', None) is not None:
        # NOTE(review): curr_epoch is not defined in this function;
        # presumably expected from an enclosing/module scope -- confirm.
        inp_list.append(curr_epoch)

    per_sent_prob = probs_valid.reshape(
        [n_timesteps - 2, n_samples]).sum(axis=0)
    f_per_sentLogP = theano.function(inp_list, per_sent_prob,
                                     name='f_pred_logprob',
                                     updates=updatesLSTM)
    f_pred_prob = ['', f_per_sentLogP, '']

    return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
def build_model(self, tparams, options):
    """Build the symbolic training graph of the LSTM generator.

    Symbolic inputs created here: ``xW`` (int64 word-index matrix), ``xI``
    (image features), ``mask`` and ``xAux`` (auxiliary features).

    Returns ``(use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM)``:
    the shared dropout switch, the list of symbolic inputs, a compiled
    function returning ``p``, the cost list, the pre-softmax outputs ``p``
    (first time step dropped) and the updates returned by the LSTM layer.
    ``cost[0]`` is the summed negative natural-log probability divided by
    ``options['batch_size']``; ``cost[1]`` is the summed negative log2
    probability.
    """
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    # Input dropout. The dropped-out auxiliary tensor gets its own name so
    # that `xAux` stays the raw placeholder: the original rebound `xAux` to
    # the dropout output and then listed it in `inp_list`, so the compiled
    # functions no longer took the raw placeholder as their input.
    aux_for_lstm = xAux
    if options['use_dropout']:
        emb = self.dropout_layer(emb, use_noise, trng,
                                 options['drop_prob_encoder'], shp=emb.shape)
        if options.get('en_aux_inp', 0):
            aux_for_lstm = self.dropout_layer(xAux, use_noise, trng,
                                              options['drop_prob_aux'],
                                              shp=xAux.shape)

    # This implements core lstm
    rval, updatesLSTM = self.lstm_layer(tparams, emb[:n_timesteps, :, :],
                                        aux_for_lstm, use_noise, options,
                                        prefix=options['generator'],
                                        mask=mask)

    # Take the hidden-state slice at index hidden_depth - 1 as the decoder
    # input.
    p = sliceT(rval[0], options.get('hidden_depth', 1) - 1,
               options['hidden_size'])
    if options['use_dropout']:
        p = self.dropout_layer(p, use_noise, trng,
                               options['drop_prob_decoder'],
                               (n_samples, options['hidden_size']))

    p = tensor.dot(p, tparams['Wd']) + tparams['bd']
    # NOTE1: we are leaving out the first prediction, which was made for the
    # image and is meaningless.
    p = p[1:, :, :]

    def accumCost(pred, xW, m, c_sum, ppl_sum):
        """Scan step: add one time step's masked log-losses to the sums."""
        pred = tensor.nnet.softmax(pred)
        target_prob = pred[tensor.arange(n_samples), xW]
        c_sum += -(tensor.log(target_prob + 1e-10) * m).sum()
        ppl_sum += -(tensor.log2(target_prob + 1e-10) * m).sum()
        return c_sum, ppl_sum

    sums, upd = theano.scan(
        fn=accumCost,
        outputs_info=[tensor.as_tensor_variable(numpy_floatX(0.)),
                      tensor.as_tensor_variable(numpy_floatX(0.))],
        sequences=[p, xW[1:, :], mask[1:, :]])

    # cost[0]: negative natural-log likelihood per batch_size,
    # cost[1]: negative log2 likelihood (used for perplexity).
    cost = [sums[0][-1] / options['batch_size'], sums[1][-1]]

    inp_list = [xW, xI, mask]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    f_pred_prob = theano.function(inp_list, p, name='f_pred_prob',
                                  updates=updatesLSTM)

    return use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM
def _stepP(x_, h_, c_, lP_, dV_, xAux):
    """One prediction step of the stacked LSTM followed by word selection.

    Relies on ``tparams``, ``prefix``, ``options``, ``h_sz``, ``h_depth`` and
    ``beam_size`` from the enclosing scope.

    Returns ``[xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx]`` together with
    a ``theano.scan_module.until`` condition that is true once every entry of
    ``doneVec`` is set, where ``doneVec`` marks selected word index 0.
    """
    # Pre-activation of the first layer.
    preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
    preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
               tparams[_p(prefix, 'b')])
    if options.get('en_aux_inp', 0):
        preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

    hL = [None] * h_depth
    cL = [None] * h_depth
    outp = [None] * h_depth

    for di in xrange(h_depth):
        in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
        forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
        out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
        candidate = tensor.tanh(sliceT(preact, 3, h_sz))
        cL[di] = forget_gate * sliceT(c_, di, h_sz) + in_gate * candidate
        hL[di] = out_gate * tensor.tanh(cL[di])
        outp[di] = hL[di]
        if options.get('en_residual_conn', 1) and di > 0:
            # Residual connection from the layer below.
            outp[di] += outp[di - 1]
            print("Connecting residual at %d" % (di))
        if di < (h_depth - 1):
            # Pre-activation of the next layer.
            preact = tensor.dot(sliceT(h_, di + 1, h_sz),
                                tparams[_p(prefix, ('W_hid_' + str(di + 1)))]) + \
                tensor.dot(outp[di],
                           tparams[_p(prefix, ('W_inp_' + str(di + 1)))])

    c = tensor.concatenate(cL, axis=1)
    h = tensor.concatenate(hL, axis=1)

    smooth_factor = tensor.as_tensor_variable(
        numpy_floatX(options.get('softmax_smooth_factor', 1.0)), name='sm_f')
    if options.get('class_out_factoring', 0) == 1:
        # Pick the most probable class, then score words of that class.
        pC = tensor.dot(outp[-1], tparams['WdCls']) + tparams['bdCls']
        pCSft = tensor.nnet.softmax(pC)
        xCIdx = tensor.argmax(pCSft)
        pW = tensor.dot(outp[-1],
                        tparams['Wd'][:, xCIdx, :]) + tparams['bd'][:, xCIdx, :]
        pWSft = tensor.nnet.softmax(pW * smooth_factor)
        lProb = tensor.log(pWSft + 1e-20) + tensor.log(pCSft[0, xCIdx] + 1e-20)
    else:
        p = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd']
        p = tensor.nnet.softmax(p * smooth_factor)
        lProb = tensor.log(p + 1e-20)

    if beam_size > 1:
        def _FindB_best(lPLcl, lPprev, dVLcl):
            """Top ``beam_size`` extensions of one beam.

            If the beam's done flag is zero the candidates are its best words
            scored with the running log-prob added; otherwise the first
            candidate carries the running log-prob, the rest get -10000, and
            all word indices are zero.
            """
            top_idx = tensor.argsort(-lPLcl)[:beam_size]
            deltaVec = tensor.fill(lPLcl[top_idx], numpy_floatX(-10000.))
            deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
            lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                               lPLcl[top_idx] + lPprev, deltaVec)
            xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                               top_idx, tensor.zeros_like(top_idx))
            return lProbBest, xWIdxBest

        rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                          sequences=[lProb, lP_, dV_],
                                          name=_p(prefix, 'FindBest'),
                                          n_steps=x_.shape[0])
        lProbBest, xWIdxBest = rvalLcl[0], rvalLcl[1]
        xWIdxBest = xWIdxBest.flatten()
        lProb = lProbBest.flatten()

        # Now sort and find the best among these best extensions for the
        # current beams
        srtIdx = tensor.argsort(-lProb)[:beam_size]
        xCandIdx = srtIdx // beam_size  # Floor division
        h = h.take(xCandIdx.flatten(), axis=0)
        c = c.take(xCandIdx.flatten(), axis=0)
        xWlogProb = lProb[srtIdx]
        xWIdx = xWIdxBest[srtIdx]
    else:
        # Greedy decoding: keep the single most probable word.
        xCandIdx = tensor.as_tensor_variable([0])
        lProb = lProb.flatten()
        xWIdx = tensor.argmax(lProb, keepdims=True)
        xWlogProb = lProb[xWIdx] + lP_
        if options.get('class_out_factoring', 0) == 1:
            clsoffset = tensor.as_tensor_variable(options['ixtoclsinfo'][:, 0])
            xWIdx += clsoffset[xCIdx]
        h = h.take(xCandIdx.flatten(), axis=0)
        c = c.take(xCandIdx.flatten(), axis=0)

    if options.get('softmax_propogate', 0) == 0:
        xW = tparams['Wemb'][xWIdx.flatten()]
    else:
        # NOTE(review): `p` is only bound on the non class-factored path
        # above -- confirm the two options are never combined.
        xW = p.dot(tparams['Wemb'])

    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

    return ([xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx],
            theano.scan_module.until(doneVec.all()))
def build_model(self, tparams, options):
    """Build the symbolic training graph of the LSTM generator.

    Symbolic inputs created here: ``xW`` (int64 word-index matrix), ``xI``
    (image features), ``mask`` and ``xAux`` (auxiliary features).

    Returns ``(use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM)``:
    the shared dropout switch, the list of symbolic inputs, a compiled
    function returning ``p``, the cost list, the pre-softmax outputs ``p``
    (first time step dropped) and the updates returned by the LSTM layer.
    ``cost[0]`` is the summed negative natural-log probability divided by
    ``options['batch_size']``; ``cost[1]`` is the summed negative log2
    probability.
    """
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    # Input dropout. `xAux` must remain the raw placeholder because it is
    # listed in `inp_list` below; the original rebound it to the dropout
    # output, so the compiled functions no longer took the raw placeholder
    # as their input. The dropped-out tensor therefore gets its own name.
    lstm_aux = xAux
    if options['use_dropout']:
        emb = self.dropout_layer(emb,
                                 use_noise,
                                 trng,
                                 options['drop_prob_encoder'],
                                 shp=emb.shape)
        if options.get('en_aux_inp', 0):
            lstm_aux = self.dropout_layer(xAux,
                                          use_noise,
                                          trng,
                                          options['drop_prob_aux'],
                                          shp=xAux.shape)

    # This implements core lstm
    rval, updatesLSTM = self.lstm_layer(tparams,
                                        emb[:n_timesteps, :, :],
                                        lstm_aux,
                                        use_noise,
                                        options,
                                        prefix=options['generator'],
                                        mask=mask)

    # Take the hidden-state slice at index hidden_depth - 1 as the decoder
    # input.
    p = sliceT(rval[0], options.get('hidden_depth', 1) - 1,
               options['hidden_size'])
    if options['use_dropout']:
        p = self.dropout_layer(p, use_noise, trng,
                               options['drop_prob_decoder'],
                               (n_samples, options['hidden_size']))

    p = tensor.dot(p, tparams['Wd']) + tparams['bd']
    # NOTE1: we are leaving out the first prediction, which was made for the
    # image and is meaningless.
    p = p[1:, :, :]

    def accumCost(pred, xW, m, c_sum, ppl_sum):
        """Scan step: add one time step's masked log-losses to the sums."""
        pred = tensor.nnet.softmax(pred)
        target_prob = pred[tensor.arange(n_samples), xW]
        c_sum += -(tensor.log(target_prob + 1e-10) * m).sum()
        ppl_sum += -(tensor.log2(target_prob + 1e-10) * m).sum()
        return c_sum, ppl_sum

    sums, upd = theano.scan(fn=accumCost,
                            outputs_info=[
                                tensor.as_tensor_variable(numpy_floatX(0.)),
                                tensor.as_tensor_variable(numpy_floatX(0.))
                            ],
                            sequences=[p, xW[1:, :], mask[1:, :]])

    # cost[0]: negative natural-log likelihood per batch_size,
    # cost[1]: negative log2 likelihood (used for perplexity).
    cost = [sums[0][-1] / options['batch_size'], sums[1][-1]]

    inp_list = [xW, xI, mask]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    f_pred_prob = theano.function(inp_list,
                                  p,
                                  name='f_pred_prob',
                                  updates=updatesLSTM)

    return use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM
def lstm_predict_layer(self, tparams, Xi, aux_input, options, beam_size, prefix='lstm'):
    """Build the beam-search decoding graph for a single-layer LSTM.

    The image embedding ``Xi`` is propagated first, then the embedding at
    index 0 of ``Wemb``, and finally a scan of at most 30 steps extends the
    ``beam_size`` beams until every beam has selected word index 0.

    Returns a 3-tuple: the log-probabilities from the last scan step, the
    selected word indices for every step (first step prepended), and the
    matching beam-candidate indices.
    """
    nMaxsteps = 30
    hidden_size = options['hidden_size']

    def _FindB_best(lPLcl, lPprev, dVLcl):
        """Top ``beam_size`` extensions of one beam.

        If the beam's done flag is zero the candidates are its best words
        scored with the running log-prob added; otherwise the first candidate
        carries the running log-prob, the rest get -10000, and all word
        indices are zero.
        """
        top_idx = tensor.argsort(-lPLcl)[:beam_size]
        deltaVec = tensor.fill(lPLcl[top_idx], numpy_floatX(-10000.))
        deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
        lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                           lPLcl[top_idx] + lPprev, deltaVec)
        xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                           top_idx, tensor.zeros_like(top_idx))
        return lProbBest, xWIdxBest

    # ---------------------- STEP FUNCTION ---------------------- #
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        """One LSTM step plus selection of the next ``beam_size`` beams."""
        preact = tensor.dot(h_, tparams[_p(prefix, 'W_hid')])
        preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp', 0):
            preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

        in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, options['hidden_size']))
        forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, options['hidden_size']))
        out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, options['hidden_size']))
        candidate = tensor.tanh(sliceT(preact, 3, options['hidden_size']))

        c = forget_gate * c_ + in_gate * candidate
        h = out_gate * tensor.tanh(c)

        p = tensor.nnet.softmax(tensor.dot(h, tparams['Wd']) + tparams['bd'])
        lProb = tensor.log(p + 1e-20)

        rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                          sequences=[lProb, lP_, dV_],
                                          name=_p(prefix, 'FindBest'),
                                          n_steps=x_.shape[0])
        lProbBest, xWIdxBest = rvalLcl[0], rvalLcl[1]
        xWIdxBest = xWIdxBest.flatten()
        lProb = lProbBest.flatten()

        # Now sort and find the best among these best extensions for the
        # current beams
        srtIdx = tensor.argsort(-lProb)[:beam_size]
        xWlogProb = lProb[srtIdx]
        xWIdx = xWIdxBest[srtIdx]
        xCandIdx = srtIdx // beam_size  # Floor division

        xW = tparams['Wemb'][xWIdx.flatten()]
        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
        h = h.take(xCandIdx.flatten(), axis=0)
        c = c.take(xCandIdx.flatten(), axis=0)

        return ([xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx],
                theano.scan_module.until(doneVec.all()))
    # ------------------- END of STEP FUNCTION -------------------- #

    if options.get('en_aux_inp', 0) == 0:
        aux_input = []

    h = tensor.alloc(numpy_floatX(0.), beam_size, hidden_size)
    c = tensor.alloc(numpy_floatX(0.), beam_size, hidden_size)
    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)

    # Propogate the image feature vector
    first_out, _ = _stepP(Xi, h[:1, :], c[:1, :], lP, dV, aux_input)
    h, c = first_out[1], first_out[2]

    # Then feed the embedding stored at row 0 of Wemb.
    xWStart = tparams['Wemb'][[0]]
    [xW, h, c, lP, dV, idx0, cand0], _ = _stepP(xWStart, h[:1, :], c[:1, :],
                                                lP, dV, aux_input)

    aux_input = tensor.extra_ops.repeat(aux_input, beam_size, axis=0)

    # Now lets do the loop.
    rval, updates = theano.scan(_stepP,
                                outputs_info=[xW, h, c, lP, dV, None, None],
                                non_sequences=[aux_input],
                                name=_p(prefix, 'predict_layers'),
                                n_steps=nMaxsteps)

    word_idx = tensor.concatenate([idx0.reshape([1, beam_size]), rval[5]],
                                  axis=0)
    cand_idx = tensor.concatenate([cand0.reshape([1, beam_size]), rval[6]],
                                  axis=0)
    return rval[3][-1], word_idx, cand_idx
def build_eval_other_sent(self, tparams, options, model_npy):
    """Build functions that score given sentences, without dropout layers.

    Loads ``model_npy`` into ``self.model_th`` via ``zipp``, builds the
    LSTM graph and stores two compiled functions on the instance:
    ``self.f_pred_prob_other`` (returns the pre-softmax outputs ``p``) and
    ``self.f_eval_other`` (returns the per-sample summed natural-log
    probability).

    Returns ``(use_noise, inp_list, self.f_pred_prob_other, cost, p,
    updatesLSTM)``.
    """
    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    rval, updatesLSTM = self.lstm_layer(tparams, emb[:n_timesteps, :, :],
                                        xAux, use_noise, options,
                                        prefix=options['generator'],
                                        mask=mask)
    p = rval[0]

    p = tensor.dot(p, tparams['Wd']) + tparams['bd']
    # NOTE1: we are leaving out the first prediction, which was made for the
    # image and is meaningless.
    p = p[1:, :, :]

    def accumCost(pred, xW, m, c_sum, ppl_sum):
        """Scan step: accumulate per-sample masked log-probabilities."""
        pred = tensor.nnet.softmax(pred)
        # c_sum keeps the (non-negated) natural-log probability per sample;
        # ppl_sum keeps the negated log2 probability per sample.
        c_sum += (tensor.log(pred[tensor.arange(n_samples), xW] + 1e-20) * m)
        ppl_sum += -(tensor.log2(pred[tensor.arange(n_samples), xW] + 1e-10) * m)
        return c_sum, ppl_sum

    sums, upd = theano.scan(
        fn=accumCost,
        outputs_info=[tensor.alloc(numpy_floatX(0.), 1, n_samples),
                      tensor.alloc(numpy_floatX(0.), 1, n_samples)],
        sequences=[p, xW[1:, :], mask[1:, :]])

    # Per-sample natural-log probability after the last time step.
    cost = sums[0][-1]

    inp_list = [xW, xI, mask]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    # Compile the prediction function once; the original compiled the exact
    # same function twice and discarded the first copy.
    self.f_pred_prob_other = theano.function(inp_list, p, name='f_pred_prob',
                                             updates=updatesLSTM)
    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, p, updatesLSTM
def lstm_multi_model_pred(self, tparams, Xi, aux_input, options, beam_size, nmodels, prefix='lstm'):
    """Build the beam-search decoding graph for an ensemble of LSTMs.

    ``tparams``, ``Xi``, ``aux_input`` and ``options`` are indexed per model
    (``nmodels`` entries). At every step the models' softmax outputs are
    combined as a sum weighted by each model's ``comb_weight``, and the best
    ``beam_size`` extensions are kept.

    Returns a 4-tuple: the log-probabilities from the last scan step, the
    selected word indices for every step (first step prepended), the matching
    beam-candidate indices, and the log-probabilities of all scan steps.
    """
    nMaxsteps = 30

    def _FindB_best(lPLcl, lPprev, dVLcl):
        """Top ``beam_size`` extensions of one beam.

        If the beam's done flag is zero the candidates are its best words
        scored with the running log-prob added; otherwise the first candidate
        carries the running log-prob, the rest get -10000, and all word
        indices are zero.
        """
        top_idx = tensor.argsort(-lPLcl)[:beam_size]
        deltaVec = tensor.fill(lPLcl[top_idx], numpy_floatX(-10000.))
        deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
        lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                           lPLcl[top_idx] + lPprev, deltaVec)
        xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                           top_idx, tensor.zeros_like(top_idx))
        return lProbBest, xWIdxBest

    # ---------------------- STEP FUNCTION ---------------------- #
    def _stepP(*in_list):
        """One ensemble step.

        ``in_list`` is laid out as: nmodels inputs, nmodels hidden states,
        nmodels cell states, running log-probs, done flags. Reads
        ``aux_input2`` from the enclosing scope at call time.
        """
        x_inp = list(in_list[:nmodels])
        h_inp = list(in_list[nmodels:2 * nmodels])
        c_inp = list(in_list[2 * nmodels:3 * nmodels])
        lP_ = in_list[3 * nmodels]
        dV_ = in_list[3 * nmodels + 1]

        p_comb = tensor.alloc(numpy_floatX(0.), options[0]['output_size'])
        cf = []
        h = []
        for i in xrange(nmodels):
            hid = options[i]['hidden_size']
            preact = tensor.dot(h_inp[i], tparams[i][_p(prefix, 'W_hid')])
            preact += (tensor.dot(x_inp[i], tparams[i][_p(prefix, 'W_inp')]) +
                       tparams[i][_p(prefix, 'b')])
            if options[i].get('en_aux_inp', 0):
                preact += tensor.dot(aux_input2[i],
                                     tparams[i][_p(prefix, 'W_aux')])

            in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, hid))
            forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, hid))
            out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, hid))
            candidate = tensor.tanh(sliceT(preact, 3, hid))

            cf.append(forget_gate * c_inp[i] + in_gate * candidate)
            h.append(out_gate * tensor.tanh(cf[i]))

            # Weighted contribution of this model to the combined softmax.
            p = tensor.dot(h[i], tparams[i]['Wd']) + tparams[i]['bd']
            weighted = tparams[i]['comb_weight'] * tensor.nnet.softmax(p)
            p_comb = weighted if i == 0 else p_comb + weighted

        lProb = tensor.log(p_comb + 1e-20)

        rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                          sequences=[lProb, lP_, dV_],
                                          name=_p(prefix, 'FindBest'),
                                          n_steps=x_inp[0].shape[0])
        lProbBest, xWIdxBest = rvalLcl[0], rvalLcl[1]
        xWIdxBest = xWIdxBest.flatten()
        lProb = lProbBest.flatten()

        # Now sort and find the best among these best extensions for the
        # current beams
        srtIdx = tensor.argsort(-lProb)[:beam_size]
        xWlogProb = lProb[srtIdx]
        xWIdx = xWIdxBest[srtIdx]
        xCandIdx = srtIdx // beam_size  # Floor division

        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

        x_out = []
        h_out = []
        c_out = []
        for i in xrange(nmodels):
            x_out.append(tparams[i]['Wemb'][xWIdx.flatten()])
            h_out.append(h[i].take(xCandIdx.flatten(), axis=0))
            c_out.append(cf[i].take(xCandIdx.flatten(), axis=0))

        out_list = x_out + h_out + c_out + [xWlogProb, doneVec, xWIdx, xCandIdx]
        return out_list, theano.scan_module.until(doneVec.all())
    # ------------------- END of STEP FUNCTION -------------------- #

    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)

    # Zero initial states and the image embedding of every model.
    h_inp = []
    c_inp = []
    x_inp = []
    for i in xrange(nmodels):
        hidden_size = options[i]['hidden_size']
        h_inp.append(theano.shared(np.zeros((1, hidden_size), dtype='float32')))
        c_inp.append(theano.shared(np.zeros((1, hidden_size), dtype='float32')))
        x_inp.append(Xi[i])

    # The first two steps use the un-repeated auxiliary inputs.
    aux_input2 = aux_input

    # Propogate the image feature vector
    out_list, _ = _stepP(*(x_inp + h_inp + c_inp + [lP, dV]))

    # Second step: feed row 0 of each model's Wemb, keeping only the first
    # row of every state.
    x_inp = []
    for i in xrange(nmodels):
        x_inp.append(tparams[i]['Wemb'][[0]])
        h_inp[i] = out_list[nmodels + i][:1, :]
        c_inp[i] = out_list[2 * nmodels + i][:1, :]

    out_list, _ = _stepP(*(x_inp + h_inp + c_inp + [lP, dV]))

    # From here on each beam needs its own copy of the auxiliary input;
    # rebinding aux_input2 changes what _stepP sees inside the scan.
    aux_input2 = []
    for i in xrange(nmodels):
        x_inp[i] = out_list[i]
        h_inp[i] = out_list[nmodels + i]
        c_inp[i] = out_list[2 * nmodels + i]
        aux_input2.append(
            tensor.extra_ops.repeat(aux_input[i], beam_size, axis=0))

    lP = out_list[3 * nmodels]
    dV = out_list[3 * nmodels + 1]
    idx0 = out_list[3 * nmodels + 2]
    cand0 = out_list[3 * nmodels + 3]

    # Now lets do the loop. The trailing None entries mark the word-index and
    # candidate-index outputs as non-recurrent.
    rval, updates = theano.scan(
        _stepP,
        outputs_info=x_inp + h_inp + c_inp + [lP, dV, None, None],
        name=_p(prefix, 'predict_layers'),
        n_steps=nMaxsteps)

    word_idx = tensor.concatenate(
        [idx0.reshape([1, beam_size]), rval[3 * nmodels + 2]], axis=0)
    cand_idx = tensor.concatenate(
        [cand0.reshape([1, beam_size]), rval[3 * nmodels + 3]], axis=0)
    return rval[3 * nmodels][-1], word_idx, cand_idx, rval[3 * nmodels]
def lstm_multi_model_pred(self, tparams, Xi, aux_input, options, beam_size, nmodels, prefix='lstm'):
    """Build the symbolic beam-search decoding graph for an ensemble of LSTMs.

    Every model runs its own LSTM step; the per-model softmax outputs are
    scaled by that model's ``comb_weight`` and summed, and the beam search
    operates on the log of that mixture.

    Parameters
    ----------
    tparams : indexable by model number; each entry maps parameter names
        (``Wemb``, ``Wd``, ``bd``, ``comb_weight`` and the ``prefix``-ed LSTM
        weights) to Theano variables.
    Xi : indexable by model number; the first input fed to each model
        (the original comment calls it the image feature vector).
    aux_input : indexable by model number; only read for models whose
        options enable ``en_aux_inp``.
    options : indexable by model number; each entry supplies ``hidden_size``
        (and ``output_size`` for entry 0).
    beam_size : number of hypotheses kept after every step.
    nmodels : number of models in the ensemble.
    prefix : parameter-name prefix passed to ``_p``.

    Returns
    -------
    tuple of four symbolic variables:
        the beam log-probabilities from the last scan step, the selected
        word indices per step, the parent-beam indices per step (both with
        the start-token step prepended), and the beam log-probabilities of
        every scan step.

    The updates returned by ``theano.scan`` are not propagated to the caller.
    """
    max_steps = 30

    # ---------------------- STEP FUNCTION ---------------------- #
    def _stepP(*in_list):
        # Flat layout: [x_0..x_{n-1}, h_0..h_{n-1}, c_0..c_{n-1}, logP, done]
        x_prev = in_list[:nmodels]
        h_prev = in_list[nmodels:2 * nmodels]
        c_prev = in_list[2 * nmodels:3 * nmodels]
        beam_lp = in_list[3 * nmodels]
        beam_done = in_list[3 * nmodels + 1]

        p_mix = tensor.alloc(numpy_floatX(0.), options[0]['output_size'])
        cells = []
        hiddens = []
        for m in xrange(nmodels):
            mdl = tparams[m]
            n_hid = options[m]['hidden_size']

            gates = tensor.dot(h_prev[m], mdl[_p(prefix, 'W_hid')])
            gates += (tensor.dot(x_prev[m], mdl[_p(prefix, 'W_inp')]) +
                      mdl[_p(prefix, 'b')])
            if options[m].get('en_aux_inp', 0):
                # aux_per_model is looked up at call time, so the priming
                # calls and the scan see different values (see below).
                gates += tensor.dot(aux_per_model[m], mdl[_p(prefix, 'W_aux')])

            gate_in = tensor.nnet.sigmoid(sliceT(gates, 0, n_hid))
            gate_forget = tensor.nnet.sigmoid(sliceT(gates, 1, n_hid))
            gate_out = tensor.nnet.sigmoid(sliceT(gates, 2, n_hid))
            cell_cand = tensor.tanh(sliceT(gates, 3, n_hid))

            cells.append(gate_forget * c_prev[m] + gate_in * cell_cand)
            hiddens.append(gate_out * tensor.tanh(cells[m]))

            logits = tensor.dot(hiddens[m], mdl['Wd']) + mdl['bd']
            weighted = mdl['comb_weight'] * tensor.nnet.softmax(logits)
            if m == 0:
                p_mix = weighted
            else:
                p_mix += weighted

        log_probs = tensor.log(p_mix + 1e-20)

        def _FindB_best(lp_row, lp_prev, done_flag):
            # Best `beam_size` extensions of a single beam.
            top_words = tensor.argsort(-lp_row)[:beam_size]
            still_open = tensor.eq(done_flag, tensor.zeros_like(done_flag))
            # A finished beam keeps its score in slot 0 only; the other slots
            # get a very low score so they lose the global sort.
            frozen = tensor.fill(lp_row[top_words], numpy_floatX(-10000.))
            frozen = tensor.set_subtensor(frozen[0], lp_prev)
            best_lp = ifelse(still_open, lp_row[top_words] + lp_prev, frozen)
            best_idx = ifelse(still_open, top_words,
                              tensor.zeros_like(top_words))
            return best_lp, best_idx

        per_beam, _ = theano.scan(_FindB_best,
                                  sequences=[log_probs, beam_lp, beam_done],
                                  name=_p(prefix, 'FindBest'),
                                  n_steps=x_prev[0].shape[0])
        cand_word = per_beam[1].flatten()
        cand_lp = per_beam[0].flatten()

        # Now sort and find the best among these best extensions for the
        # current beams
        keep = tensor.argsort(-cand_lp)[:beam_size]
        xWlogProb = cand_lp[keep]
        xWIdx = cand_word[keep]
        xCandIdx = keep // beam_size  # Floor division -> parent beam

        # A beam is flagged done when its selected word index is 0.
        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

        x_out = [tparams[m]['Wemb'][xWIdx.flatten()] for m in xrange(nmodels)]
        h_out = [hiddens[m].take(xCandIdx.flatten(), axis=0)
                 for m in xrange(nmodels)]
        c_out = [cells[m].take(xCandIdx.flatten(), axis=0)
                 for m in xrange(nmodels)]

        out_list = x_out + h_out + c_out + [xWlogProb, doneVec, xWIdx, xCandIdx]
        return out_list, theano.scan_module.until(doneVec.all())
    # ------------------- END of STEP FUNCTION -------------------- #

    def _pack_state(xs, hs, cs, log_p, done):
        # Flatten the per-model state into the argument layout of _stepP.
        return list(xs) + list(hs) + list(cs) + [log_p, done]

    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)

    h_inp = []
    c_inp = []
    for m in xrange(nmodels):
        state_shape = (1, options[m]['hidden_size'])
        h_inp.append(theano.shared(np.zeros(state_shape, dtype='float32')))
        c_inp.append(theano.shared(np.zeros(state_shape, dtype='float32')))
    x_inp = [Xi[m] for m in xrange(nmodels)]

    # The two priming calls below use the un-repeated auxiliary inputs.
    aux_per_model = aux_input

    # Propagate the image feature vector
    out_list, _ = _stepP(*_pack_state(x_inp, h_inp, c_inp, lP, dV))

    # Feed the embedding of word index 0, starting from the first row of the
    # state produced by the previous call.
    x_inp = [tparams[m]['Wemb'][[0]] for m in xrange(nmodels)]
    h_inp = [out_list[nmodels + m][:1, :] for m in xrange(nmodels)]
    c_inp = [out_list[2 * nmodels + m][:1, :] for m in xrange(nmodels)]

    out_list, _ = _stepP(*_pack_state(x_inp, h_inp, c_inp, lP, dV))

    # From here on every beam needs its own copy of the auxiliary input.
    aux_per_model = [tensor.extra_ops.repeat(aux_input[m], beam_size, axis=0)
                     for m in xrange(nmodels)]
    base = 3 * nmodels
    x_inp = out_list[:nmodels]
    h_inp = out_list[nmodels:2 * nmodels]
    c_inp = out_list[2 * nmodels:base]
    lP = out_list[base]
    dV = out_list[base + 1]
    idx0 = out_list[base + 2]
    cand0 = out_list[base + 3]

    # Now lets do the loop. Word and candidate indices are outputs only
    # (no recurrence), hence the two trailing None entries.
    rval, _ = theano.scan(_stepP,
                          outputs_info=_pack_state(x_inp, h_inp, c_inp, lP, dV) + [None, None],
                          name=_p(prefix, 'predict_layers'),
                          n_steps=max_steps)

    word_idx = tensor.concatenate(
        [idx0.reshape([1, beam_size]), rval[base + 2]], axis=0)
    cand_idx = tensor.concatenate(
        [cand0.reshape([1, beam_size]), rval[base + 3]], axis=0)
    return rval[base][-1], word_idx, cand_idx, rval[base]
def main(params):
    """Train the caption generator described by `params`.

    Steps, in order: load the data provider, build (or restore) the
    vocabulary, construct the generator plus the optional feature encoders
    and attention network, compile the evaluation / gradient / update
    functions, then run the training loop with periodic status reports,
    validation and checkpointing.

    `params` is a dict of hyper-parameters and paths; it is mutated in place
    (sizes derived from the data, learning-rate decay, ...) and stored in
    every checkpoint.

    NOTE(review): `checkpoint_init`, `model_init_from`, `rg_init` and
    `sentTagMap` are read but never assigned in this function; presumably
    they are module-level globals set by the caller -- confirm.
    """
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    #--------------- Init data provider and load data+features ---------------#
    # fetch the data provider
    dp = getDataProvider(params)
    if params['encode_gt_sentences']:
        params['aux_inp_size'] = (params['featenc_hidden_size'] *
                                  params['n_encgt_sent'])
    else:
        params['aux_inp_size'] = dp.aux_inp_size
        params['featenc_hidden_size'] = params['aux_inp_size']

    params['image_feat_size'] = dp.img_feat_size
    print('Image feature size is %d, and aux input size is %d' % (
        params['image_feat_size'], params['aux_inp_size']))

    #--------------- Preprocess sentences and build Vocabulary ---------------#
    misc = {}  # stores various misc items that need to be passed around the framework
    # go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold number of times
    if params['checkpoint_file_name'] == 'None':
        if params['class_out_factoring'] == 0:
            (misc['wordtoix'], misc['ixtoword'],
             bias_init_vector) = preProBuildWordVocab(
                 dp.iterSentences('train'), word_count_threshold)
        else:
            ([misc['wordtoix'], misc['classes']],
             [misc['ixtoword'], misc['clstotree'], misc['ixtoclsinfo']],
             [bias_init_vector, bias_init_inter_class]) = preProBuildWordVocab(
                 dp.iterSentences('train'), word_count_threshold, params)
            params['nClasses'] = bias_init_inter_class.shape[0]
            params['ixtoclsinfo'] = misc['ixtoclsinfo']
    else:
        misc = checkpoint_init['misc']
        params['nClasses'] = checkpoint_init['params']['nClasses']
        if 'ixtoclsinfo' in misc:
            params['ixtoclsinfo'] = misc['ixtoclsinfo']

    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though
    print('%d %d' % (len(misc['wordtoix']), len(misc['ixtoword'])))

    #---------- Initialize the solver/generator and build forward path -------#
    # Initialize the optimizer
    solver = Solver(params['solver'])
    # This initializes the model parameters and does matrix initializations
    lstmGenerator = decodeGenerator(params)
    model = lstmGenerator.model_th
    misc['update'] = lstmGenerator.update_list
    misc['regularize'] = lstmGenerator.regularize

    # force overwrite here. The bias to the softmax is initialized to reflect
    # word frequencies. This is a bit of a hack
    if params['checkpoint_file_name'] == 'None':
        model['bd'].set_value(bias_init_vector.astype(config.floatX))
        if params['class_out_factoring'] == 1:
            model['bdCls'].set_value(
                bias_init_inter_class.astype(config.floatX))

    #----------------- If we are using feature encoders ----------------------#
    # This mode can now also be used for encoding GT sentences.
    # The encoder dropout switches stay None unless the encoder is really
    # built; the training loop only toggles the switches that exist.
    imgenc_use_dropout = None
    auxenc_use_dropout = None

    if params['use_encoder_for'] & 1:
        if params['encode_gt_sentences']:
            # No image encoder is built in this mode.
            xI = tensor.zeros((batch_size, params['image_encoding_size']))
            imgFeatEnc_inp = []
        else:
            imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'],
                                                  params['word_encoding_size'],
                                                  params,
                                                  mdl_prefix='img_enc_',
                                                  features=dp.features.T)
            mdlLen = len(model.keys())
            model.update(imgFeatEncoder.model_th)
            # Guard against parameter-name collisions between sub-models.
            assert (len(model.keys()) ==
                    (mdlLen + len(imgFeatEncoder.model_th.keys())))
            misc['update'].extend(imgFeatEncoder.update_list)
            misc['regularize'].extend(imgFeatEncoder.regularize)
            (imgenc_use_dropout, imgFeatEnc_inp, xI,
             updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
    else:
        xI = None
        imgFeatEnc_inp = []

    if params['use_encoder_for'] & 2:
        if params['encode_gt_sentences']:
            aux_enc_inp = model['Wemb']
        else:
            aux_enc_inp = dp.aux_inputs.T
        hid_size = params['featenc_hidden_size']
        auxFeatEncoder = RecurrentFeatEncoder(hid_size,
                                              params['image_encoding_size'],
                                              params,
                                              mdl_prefix='aux_enc_',
                                              features=aux_enc_inp)
        mdlLen = len(model.keys())
        model.update(auxFeatEncoder.model_th)
        assert (len(model.keys()) ==
                (mdlLen + len(auxFeatEncoder.model_th.keys())))
        misc['update'].extend(auxFeatEncoder.update_list)
        misc['regularize'].extend(auxFeatEncoder.regularize)
        (auxenc_use_dropout, auxFeatEnc_inp, xAux,
         updatesLSTMAuxFeat) = auxFeatEncoder.build_model(model, params)

        if params['encode_gt_sentences']:
            # Reshape it size(batch_size, n_gt, hidden_size)
            xAux = xAux.reshape(
                (-1, params['n_encgt_sent'], params['featenc_hidden_size']))
            # Convert it to size (batch_size, n_gt*hidden_size)
            xAux = xAux.flatten(2)
    else:
        auxFeatEnc_inp = []
        xAux = None

    #----------------- Initialize the Attention Network ----------------------#
    if params['use_attn'] is not None:
        attnModel = AttentionNetwork(params['image_feat_size'],
                                     params['hidden_size'],
                                     params,
                                     mdl_prefix='attn_mlp_')
        mdlLen = len(model.keys())
        model.update(attnModel.model_th)
        assert (len(model.keys()) ==
                (mdlLen + len(attnModel.model_th.keys())))
        misc['update'].extend(attnModel.update_list)
        misc['regularize'].extend(attnModel.regularize)
        attn_nw_func = attnModel.build_model
    else:
        attn_nw_func = None

    #----------------- Build the language model graph ------------------------#
    # Define the computational graph for relating the input image features and
    # word indices to the log probability cost function.
    (use_dropout, inp_list_gen, f_pred_prob, cost, predTh,
     updatesLSTM) = lstmGenerator.build_model(model, params, xI, xAux,
                                              attn_nw=attn_nw_func)

    inp_list = imgFeatEnc_inp + auxFeatEnc_inp + inp_list_gen

    # Dropout switches toggled together between training and validation.
    dropout_switches = [use_dropout]
    if imgenc_use_dropout is not None:
        dropout_switches.append(imgenc_use_dropout)
    if auxenc_use_dropout is not None:
        dropout_switches.append(auxenc_use_dropout)

    #----------------- Cost function and gradient computations setup ---------#
    costGrad = cost[0]
    # Add the regularization cost. Since this is specific to training and
    # doesn't get included when we evaluate the cost on test or validation
    # data, we leave it here outside the model definition
    if params['regc'] > 0.:
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p] ** 2).sum()
        # Scale the summed squared norms once: 0.5 * regc * sum ||p||^2
        reg_cost *= 0.5 * reg_c
        costGrad = costGrad + (reg_cost / params['batch_size'])

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(costGrad, wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
        lr, model, grads, inp_list, cost, params)

    print('model init done.')
    print('model has keys: ' + ', '.join(model.keys()))

    #---------- Initialize the prediction path if needed by evaluator --------#
    evalKwargs = {
        'eval_metric': params['eval_metric'],
        'f_gen': lstmGenerator.predict,
        'beamsize': params['eval_beamsize']
    }
    if params['eval_metric'] != 'perplex':
        lstmGenerator.prepPredictor(None, params, params['eval_beamsize'])
        refToks, scr_info = eval_prep_refs('val', dp, params['eval_metric'])
        evalKwargs['refToks'] = refToks
        evalKwargs['scr_info'] = scr_info
        valMetOp = operator.gt  # higher score is better
    else:
        valMetOp = operator.lt  # lower perplexity is better

    if params['met_to_track'] != []:
        trackMetargs = {
            'eval_metric': params['met_to_track'],
            'f_gen': lstmGenerator.predict,
            'beamsize': params['eval_beamsize']
        }
        lstmGenerator.prepPredictor(None, params, params['eval_beamsize'])
        refToks, scr_info = eval_prep_refs('val', dp, params['met_to_track'])
        trackMetargs['refToks'] = refToks
        trackMetargs['scr_info'] = scr_info

    #----------------- Iterations and Logging initializations ----------------#
    # calculate how many iterations we need. One epoch is considered once going
    # through all the sentences and not images. Hence in case of coco/flickr
    # this will be 5 * no of images
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total // batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs))
    top_val_sc = -1  # negative means "no checkpoint written yet"
    smooth_train_ppl2 = len(misc['ixtoword'])  # initially size of dictionary of confusion
    val_sc = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {'history': []}

    len_hist = defaultdict(int)

    # Initialize tracking the perplexity of train and val, with iters.
    train_perplex = []
    val_perplex = []
    trackSc_array = []

    #----------------- Load previously saved model ---------------------------#
    # Initialize the model parameters from the checkpoint file if we are
    # resuming training
    if params['checkpoint_file_name'] != 'None':
        zipp(model_init_from, model)
        if params['restore_grads'] == 1:
            zipp(rg_init, rg)
        # Copy trackers from previous checkpoint
        if 'trackers' in checkpoint_init:
            train_perplex = checkpoint_init['trackers']['train_perplex']
            val_perplex = checkpoint_init['trackers']['val_perplex']
            trackSc_array = checkpoint_init['trackers'].get('trackScores', [])
        print('\nContinuing training from previous model\n. '
              'Already run for %0.2f epochs with validation perplx at %0.3f\n'
              % (checkpoint_init['epoch'], checkpoint_init['perplexity']))

    #----------------- MAIN LOOP ---------------------------------------------#
    for it in xrange(max_iters):
        t0 = time.time()
        # Enable using dropout in training
        for switch in dropout_switches:
            switch.set_value(float(params['use_dropout']))

        epoch = it * 1.0 / num_iters_one_epoch

        #----------------- Prepare batch -------------------------------------#
        # fetch a batch of data
        if params['sample_by_len'] == 0:
            batch = [dp.sampleImageSentencePair() for _ in xrange(batch_size)]
        else:
            batch, batch_len = dp.getRandBatchByLen(batch_size)
            len_hist[batch_len] += 1

        enc_inp_list = prepare_seq_features(
            batch,
            use_enc_for=params['use_encoder_for'],
            maxlen=params['maxlen'],
            use_shared_mem=params['use_shared_mem_enc'],
            enc_gt_sent=params['encode_gt_sentences'],
            n_enc_sent=params['n_encgt_sent'],
            wordtoix=misc['wordtoix'])

        if params['use_pos_tag'] != 'None':
            gen_inp_list, lenS = prepare_data(
                batch, misc['wordtoix'], params['maxlen'], sentTagMap,
                misc['ixtoword'],
                rev_sents=params['reverse_sentence'],
                use_enc_for=params['use_encoder_for'],
                use_unk_token=params['use_unk_token'])
        else:
            gen_inp_list, lenS = prepare_data(
                batch, misc['wordtoix'], params['maxlen'],
                rev_sents=params['reverse_sentence'],
                use_enc_for=params['use_encoder_for'],
                use_unk_token=params['use_unk_token'])

        if params['sched_sampling_mode'] is not None:
            gen_inp_list.append(epoch)

        real_inp_list = enc_inp_list + gen_inp_list

        #----------------- Compute cost and apply gradients ------------------#
        # evaluate cost, gradient and perform parameter update
        batch_cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # training statistics: smooth exponentially decaying moving averages,
        # seeded with the first observed value
        train_ppl2 = (2 ** (batch_cost[1] / lenS))
        total_cost = batch_cost[0]
        if it == 0:
            smooth_train_ppl2 = train_ppl2
            smooth_cost = total_cost
        else:
            smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
        smooth_cost = 0.99 * smooth_cost + 0.01 * total_cost

        #----------------- Write a report into a json file -------------------#
        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            print('%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f'
                  % (it, max_iters, dt, epoch, smooth_cost, smooth_train_ppl2))
            last_status_write_time = tnow
            jstatus = {
                'time': datetime.datetime.now().isoformat(),
                'iter': (it, max_iters),
                'epoch': (epoch, max_epochs),
                'time_per_batch': dt,
                'smooth_train_ppl2': smooth_train_ppl2,
                'val_sc': val_sc,  # just write the last available one
                'val_metric': params['eval_metric'],
                'train_ppl2': train_ppl2,
            }
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            # Best effort: a failed status write must not stop training.
            try:
                with open(status_file, 'w') as status_fh:
                    json.dump(json_worker_status, status_fh)
            except Exception as e:  # todo be more clever here
                print('tried to write worker status into %s but got error:' % (
                    status_file, ))
                print(e)

        #----------------- VALIDATION ----------------------------------------#
        # perform evaluation on the validation set and save a model checkpoint
        # if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0 and
                it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            for switch in dropout_switches:
                switch.set_value(0.)

            # perform the evaluation on VAL set
            val_sc = eval_split_theano('val', dp, model, params, misc, f_eval,
                                       **evalKwargs)
            val_sc = val_sc[0]
            val_perplex.append((it, val_sc))
            train_perplex.append((it, smooth_train_ppl2))

            if params['met_to_track'] != []:
                track_sc = eval_split_theano('val', dp, model, params, misc,
                                             f_eval, **trackMetargs)
                trackSc_array.append((it, {
                    evm: track_sc[i]
                    for i, evm in enumerate(params['met_to_track'])
                }))

            # Decay the learning rate once per epoch after the start epoch.
            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = (params['learning_rate'] *
                                           params['lr_decay'])
                params['lr_decay_st_epoch'] += 1

            print('validation %s = %f, lr = %f' % (
                params['eval_metric'], val_sc, params['learning_rate']))

            #----------------- SAVE THE MODEL --------------------------------#
            write_checkpoint_ppl_threshold = params[
                'write_checkpoint_ppl_threshold']
            beats_record = valMetOp(val_sc, top_val_sc) or top_val_sc < 0
            beats_threshold = (
                valMetOp(val_sc, write_checkpoint_ppl_threshold) or
                write_checkpoint_ppl_threshold < 0)
            if beats_record and beats_threshold:
                # if we beat a previous record or if this is the first time
                # AND we also beat the user-defined threshold or it doesnt exist
                top_val_sc = val_sc
                filename = 'model_checkpoint_%s_%s_%s_%s%.2f.p' % (
                    params['dataset'], host, params['fappend'],
                    params['eval_metric'][:3], val_sc)
                filepath = os.path.join(
                    params['checkpoint_output_directory'], filename)
                checkpoint = {
                    'it': it,
                    'epoch': epoch,
                    'model': unzip(model),
                    'rgrads': unzip(rg),
                    'params': params,
                    'perplexity': val_sc,
                    'misc': misc,
                    'trackers': {
                        'train_perplex': train_perplex,
                        'val_perplex': val_perplex,
                        'trackScores': trackSc_array
                    },
                }
                # Best effort: a failed checkpoint write must not stop training.
                try:
                    with open(filepath, 'wb') as ckpt_fh:
                        pickle.dump(checkpoint, ckpt_fh)
                    print('saved checkpoint in %s' % (filepath, ))
                except Exception as e:  # todo be more clever here
                    print('tried to write checkpoint into %s but got error: ' % (
                        filepath, ))
                    print(e)
def lstm_predict_layer(self, tparams, Xi, aux_input, options, beam_size, prefix='lstm'):
    """Build the symbolic decoding graph (greedy or beam search) for one LSTM.

    Parameters
    ----------
    tparams : mapping from parameter name to Theano variable.
    Xi : first input fed to the LSTM (the original comment calls it the
        image feature vector).
    aux_input : auxiliary input added to the gate pre-activation at every
        step; ignored unless ``options['en_aux_inp']`` is set.
    options : dict; reads ``hidden_size`` and the optional keys ``maxlen``,
        ``hidden_depth``, ``en_aux_inp``, ``class_out_factoring``,
        ``softmax_smooth_factor``, ``softmax_propogate`` and ``ixtoclsinfo``.
    beam_size : number of hypotheses kept; 1 selects greedy arg-max decoding.
    prefix : parameter-name prefix passed to ``_p``.

    Returns
    -------
    tuple:
        log-probabilities from the last scan step, selected word indices per
        step, parent-beam indices per step, the word inputs fed at each step
        (each with the start-token step prepended), and the scan updates.
    """
    nMaxsteps = options.get('maxlen', 30)
    if nMaxsteps is None:
        nMaxsteps = 30
    h_depth = options.get('hidden_depth', 1)
    h_sz = options['hidden_size']

    # ---------------------- STEP FUNCTION ---------------------- #
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp', 0):
            preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

        # Per-layer hidden and cell states; layer di reads its previous state
        # from slice di of h_ / c_.
        hL = [None] * h_depth
        cL = [None] * h_depth
        for di in xrange(h_depth):
            i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
            cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
            cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
            hL[di] = o * tensor.tanh(cL[di])
            if di < (h_depth - 1):
                # Pre-activation of the next layer: its own previous hidden
                # state plus the output of the current layer.
                preact = tensor.dot(sliceT(h_, di + 1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di + 1)))]) + \
                    tensor.dot(hL[di], tparams[_p(prefix, ('W_inp_' + str(di + 1)))])

        c = tensor.concatenate(cL, axis=1)
        h = tensor.concatenate(hL, axis=1)

        smooth_factor = tensor.as_tensor_variable(
            numpy_floatX(options.get('softmax_smooth_factor', 1.0)),
            name='sm_f')
        if options.get('class_out_factoring', 0) == 1:
            pC = tensor.dot(hL[-1], tparams['WdCls']) + tparams['bdCls']
            pCSft = tensor.nnet.softmax(pC)
            xCIdx = tensor.argmax(pCSft)
            # Use the top layer's hidden state, as for the class logits above
            # and the non-factored branch below. `h[-1]` would be the last
            # *row* of the concatenated all-layer state, whose width is
            # h_sz * h_depth and does not match Wd when h_depth > 1.
            pW = tensor.dot(hL[-1], tparams['Wd'][:, xCIdx, :]) + tparams['bd'][:, xCIdx, :]
            pWSft = tensor.nnet.softmax(pW * smooth_factor)
            lProb = tensor.log(pWSft + 1e-20) + tensor.log(pCSft[0, xCIdx] + 1e-20)
        else:
            p = tensor.dot(hL[-1], tparams['Wd']) + tparams['bd']
            p = tensor.nnet.softmax(p * smooth_factor)
            lProb = tensor.log(p + 1e-20)

        if beam_size > 1:
            def _FindB_best(lPLcl, lPprev, dVLcl):
                # Best `beam_size` extensions of a single beam.
                srtLcl = tensor.argsort(-lPLcl)
                srtLcl = srtLcl[:beam_size]
                # A finished beam keeps its score in slot 0 only; the other
                # slots get a very low score so they lose the global sort.
                deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
                deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                                   lPLcl[srtLcl] + lPprev, deltaVec)
                xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                                   srtLcl, tensor.zeros_like(srtLcl))
                return lProbBest, xWIdxBest

            rvalLcl, _ = theano.scan(_FindB_best,
                                     sequences=[lProb, lP_, dV_],
                                     name=_p(prefix, 'FindBest'),
                                     n_steps=x_.shape[0])
            xWIdxBest = rvalLcl[1]
            lProbBest = rvalLcl[0]

            xWIdxBest = xWIdxBest.flatten()
            lProb = lProbBest.flatten()

            # Now sort and find the best among these best extensions for the
            # current beams
            srtIdx = tensor.argsort(-lProb)
            srtIdx = srtIdx[:beam_size]
            xCandIdx = srtIdx // beam_size  # Floor division -> parent beam
            h = h.take(xCandIdx.flatten(), axis=0)
            c = c.take(xCandIdx.flatten(), axis=0)
            xWlogProb = lProb[srtIdx]
            xWIdx = xWIdxBest[srtIdx]
        else:
            # Greedy decoding: single hypothesis, arg-max word.
            xCandIdx = tensor.as_tensor_variable([0])
            lProb = lProb.flatten()
            xWIdx = tensor.argmax(lProb, keepdims=True)
            xWlogProb = lProb[xWIdx] + lP_
            if options.get('class_out_factoring', 0) == 1:
                # Shift the within-class index by the class offset taken from
                # column 0 of options['ixtoclsinfo'].
                clsoffset = tensor.as_tensor_variable(options['ixtoclsinfo'][:, 0])
                xWIdx += clsoffset[xCIdx]
            h = h.take(xCandIdx.flatten(), axis=0)
            c = c.take(xCandIdx.flatten(), axis=0)

        if options.get('softmax_propogate', 0) == 0:
            xW = tparams['Wemb'][xWIdx.flatten()]
        else:
            # NOTE(review): `p` is only defined in the non-factored branch, so
            # this path looks incompatible with class_out_factoring -- confirm.
            xW = p.dot(tparams['Wemb'])
        # A hypothesis is flagged done when its selected word index is 0.
        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

        return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], theano.scan_module.until(doneVec.all())
    # ------------------- END of STEP FUNCTION -------------------- #

    if options.get('en_aux_inp', 0) == 0:
        aux_input = []

    h = tensor.alloc(numpy_floatX(0.), beam_size, h_sz * h_depth)
    c = tensor.alloc(numpy_floatX(0.), beam_size, h_sz * h_depth)

    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)

    # Propagate the image feature vector
    [xW, h, c, _, _, _, _], _ = _stepP(Xi, h[:1, :], c[:1, :], lP, dV, aux_input)

    # Then feed the embedding of word index 0 from the first state row.
    xWStart = tparams['Wemb'][[0]]
    [xW, h, c, lP, dV, idx0, cand0], _ = _stepP(xWStart, h[:1, :], c[:1, :], lP, dV, aux_input)

    if options.get('en_aux_inp', 0) == 1:
        # From here on every beam needs its own copy of the auxiliary input.
        aux_input = tensor.extra_ops.repeat(aux_input, beam_size, axis=0)

    # Now lets do the loop. Word and candidate indices are outputs only
    # (no recurrence), hence the two trailing None entries.
    rval, updates = theano.scan(_stepP,
                                outputs_info=[xW, h, c, lP, dV, None, None],
                                non_sequences=[aux_input],
                                name=_p(prefix, 'predict_layers'),
                                n_steps=nMaxsteps)

    return (rval[3][-1],
            tensor.concatenate([idx0.reshape([1, beam_size]), rval[5]], axis=0),
            tensor.concatenate([cand0.reshape([1, beam_size]), rval[6]], axis=0),
            tensor.concatenate([tensor.shape_padleft(xW, n_ones=1), rval[0]], axis=0),
            updates)