def _stepP(x_, h_, c_, lP_, dV_, xAux):
    """One beam-search decoding step for a single-layer LSTM.

    Runs the LSTM cell on the current beam inputs, scores the next word with
    a softmax over ``tparams['Wd']``/``tparams['bd']`` and keeps the
    ``beam_size`` best extensions over all beams.  Word index 0 acts as the
    "done" symbol: a beam whose ``dV_`` entry is non-zero may only be
    extended with index 0 and keeps its previous score.

    Args:
        x_: input embeddings of the beams (presumably one row per beam --
            TODO confirm against the caller).
        h_: previous hidden state.
        c_: previous cell state.
        lP_: accumulated log-probability of every beam.
        dV_: per-beam done flag; zero means the beam is still alive.
        xAux: auxiliary input, only used when ``options['en_aux_inp']`` is set.

    Returns:
        ``[xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx]`` together with a
        scan ``until`` condition that fires once every kept beam emitted
        index 0.
    """
    # Gate pre-activations: recurrent + input (+ optional auxiliary) terms.
    preact = tensor.dot(h_, tparams[_p(prefix, 'W_hid')])
    preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
               tparams[_p(prefix, 'b')])
    if options.get('en_aux_inp', 0):
        preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

    n_hid = options['hidden_size']
    in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, n_hid))
    forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, n_hid))
    out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, n_hid))
    candidate = tensor.tanh(sliceT(preact, 3, n_hid))

    c = forget_gate * c_ + in_gate * candidate
    h = out_gate * tensor.tanh(c)

    # Log-probabilities of the next word for every beam.
    logits = tensor.dot(h, tparams['Wd']) + tparams['bd']
    lProb = tensor.log(tensor.nnet.softmax(logits) + 1e-20)

    def _best_of_beam(row_lp, prev_lp, done_flag):
        # Top `beam_size` continuations of a single beam.
        top_idx = tensor.argsort(-row_lp)[:beam_size]
        alive = tensor.eq(done_flag, tensor.zeros_like(done_flag))
        # A finished beam keeps its score in slot 0; the other slots get a
        # large negative score so they are never selected.
        frozen = tensor.fill(row_lp[top_idx], numpy_floatX(-10000.))
        frozen = tensor.set_subtensor(frozen[0], prev_lp)
        best_lp = ifelse(alive, row_lp[top_idx] + prev_lp, frozen)
        best_idx = ifelse(alive, top_idx, tensor.zeros_like(top_idx))
        return best_lp, best_idx

    scan_out, _ = theano.scan(_best_of_beam,
                              sequences=[lProb, lP_, dV_],
                              name=_p(prefix, 'FindBest'),
                              n_steps=x_.shape[0])
    cand_word = scan_out[1].flatten()
    cand_lp = scan_out[0].flatten()

    # Now sort and find the best among these best extensions for the
    # current beams.
    srtIdx = tensor.argsort(-cand_lp)[:beam_size]
    xWlogProb = cand_lp[srtIdx]
    xWIdx = cand_word[srtIdx]
    # Every beam contributed `beam_size` candidates, so floor division of the
    # flat index recovers the beam each winner came from.
    xCandIdx = srtIdx // beam_size

    xW = tparams['Wemb'][xWIdx.flatten()]
    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

    # Re-order the states so they follow the surviving beams.
    h = h.take(xCandIdx.flatten(), axis=0)
    c = c.take(xCandIdx.flatten(), axis=0)

    stop = theano.scan_module.until(doneVec.all())
    return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], stop
def _stepP(x_, h_, c_, lP_, dV_, xAux):
    """Single-layer LSTM step used inside the beam-search scan.

    Computes the new hidden/cell state for every beam, turns the hidden
    state into word log-probabilities and selects the ``beam_size`` best
    (beam, word) pairs.  A beam flagged as done in ``dV_`` (non-zero) is only
    continued with word index 0 and its score is carried over unchanged.

    Args:
        x_: input embeddings for the current beams.
        h_: hidden state from the previous step.
        c_: cell state from the previous step.
        lP_: running log-probability per beam.
        dV_: done flag per beam (zero = still decoding).
        xAux: auxiliary input; read only if ``options['en_aux_inp']`` is set.

    Returns:
        A pair: the list ``[xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx]``
        and a ``theano.scan_module.until`` condition that is true when all
        selected words are index 0.
    """
    w_hid = tparams[_p(prefix, 'W_hid')]
    preact = tensor.dot(h_, w_hid)
    preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
               tparams[_p(prefix, 'b')])
    if options.get('en_aux_inp', 0):
        preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

    hid_sz = options['hidden_size']
    gate_i = tensor.nnet.sigmoid(sliceT(preact, 0, hid_sz))
    gate_f = tensor.nnet.sigmoid(sliceT(preact, 1, hid_sz))
    gate_o = tensor.nnet.sigmoid(sliceT(preact, 2, hid_sz))
    c_new = tensor.tanh(sliceT(preact, 3, hid_sz))

    c = gate_f * c_ + gate_i * c_new
    h = gate_o * tensor.tanh(c)

    word_prob = tensor.nnet.softmax(tensor.dot(h, tparams['Wd']) + tparams['bd'])
    lProb = tensor.log(word_prob + 1e-20)

    def _expand_one_beam(lp_row, lp_prev, done):
        # Indices of the `beam_size` most likely next words for this beam.
        best_words = tensor.argsort(-lp_row)
        best_words = best_words[:beam_size]
        still_running = tensor.eq(done, tensor.zeros_like(done))
        # Scores used for a finished beam: previous score in slot 0, a large
        # negative value everywhere else.
        done_scores = tensor.set_subtensor(
            tensor.fill(lp_row[best_words], numpy_floatX(-10000.))[0], lp_prev)
        scores = ifelse(still_running, lp_row[best_words] + lp_prev, done_scores)
        words = ifelse(still_running, best_words, tensor.zeros_like(best_words))
        return scores, words

    per_beam, _unused_updates = theano.scan(
        _expand_one_beam,
        sequences=[lProb, lP_, dV_],
        name=_p(prefix, 'FindBest'),
        n_steps=x_.shape[0])
    all_words = per_beam[1].flatten()
    all_scores = per_beam[0].flatten()

    # Now sort and find the best among these best extensions for the
    # current beams.
    srtIdx = tensor.argsort(-all_scores)
    srtIdx = srtIdx[:beam_size]
    xWlogProb = all_scores[srtIdx]
    xWIdx = all_words[srtIdx]
    xCandIdx = srtIdx // beam_size  # Floor division -> originating beam

    xW = tparams['Wemb'][xWIdx.flatten()]
    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
    h = h.take(xCandIdx.flatten(), axis=0)
    c = c.take(xCandIdx.flatten(), axis=0)

    outputs = [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx]
    return outputs, theano.scan_module.until(doneVec.all())
def _step(x_in, h_, c_):
    """One time step of a stacked LSTM with optional residual outputs.

    ``h_`` and ``c_`` hold the states of all layers side by side; layer
    ``di`` is read with ``sliceT(., di, h_sz)``.  When
    ``self.en_residual_conn`` is set, the output of every layer above the
    first is the sum of its hidden state and the output of the layer below.

    Args:
        x_in: input contribution to the first layer's gate pre-activations
            (presumably already projected by the caller -- TODO confirm).
        h_: previous hidden states of all layers.
        c_: previous cell states of all layers.

    Returns:
        ``(h_out, c_out)``: ``h_out`` concatenates the hidden state of every
        layer followed by the final layer output; ``c_out`` concatenates the
        cell states of every layer.
    """
    preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
    preact += x_in
    # The bias tparams[_p(prefix, 'b')] is not added in this step (the
    # addition is disabled in the original code).

    hiddens = []
    cells = []
    outputs = []
    for di in xrange(h_depth):
        in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
        forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
        out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
        candidate = tensor.tanh(sliceT(preact, 3, h_sz))

        cell = forget_gate * sliceT(c_, di, h_sz) + in_gate * candidate
        hidden = out_gate * tensor.tanh(cell)

        layer_out = hidden
        if self.en_residual_conn and di > 0:
            # Residual connection from the output of the layer below.
            layer_out += outputs[di - 1]
            print("Connecting residual at %d" % (di))

        cells.append(cell)
        hiddens.append(hidden)
        outputs.append(layer_out)

        if di < (h_depth - 1):
            # Pre-activations of the next layer: its own recurrent state
            # plus the output of the current layer.
            nxt = di + 1
            preact = tensor.dot(sliceT(h_, nxt, h_sz),
                                tparams[_p(prefix, 'W_hid_%d' % nxt)]) + \
                tensor.dot(layer_out, tparams[_p(prefix, 'W_inp_%d' % nxt)])

    c_out = tensor.concatenate(cells, axis=1)
    h_out = tensor.concatenate(hiddens + [outputs[-1]], axis=1)
    return h_out, c_out
def _step(m_, x_, h_, c_, xAux):
    """One time step of a stacked LSTM.

    The states of all layers are stored side by side in ``h_`` / ``c_`` and
    read per layer with ``sliceT(., di, h_sz)``.  Each layer above the first
    is driven by its own previous hidden state and the new hidden state of
    the layer below.

    Args:
        m_: unused by this step (presumably the sequence mask -- TODO confirm).
        x_: input contribution to the first layer's gate pre-activations.
        h_: previous hidden states of all layers.
        c_: previous cell states of all layers.
        xAux: auxiliary input, used only if ``options['en_aux_inp']`` is set.

    Returns:
        ``(h_out, c_out)``: per-layer hidden and cell states concatenated
        along axis 1.
    """
    preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
    preact += x_
    if options.get('en_aux_inp', 0):
        preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])
    # The bias tparams[_p(prefix, 'b')] is not added in this step (the
    # addition is disabled in the original code).

    hiddens = []
    cells = []
    for di in xrange(h_depth):
        in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
        forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
        out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
        candidate = tensor.tanh(sliceT(preact, 3, h_sz))

        cell = forget_gate * sliceT(c_, di, h_sz) + in_gate * candidate
        hidden = out_gate * tensor.tanh(cell)
        cells.append(cell)
        hiddens.append(hidden)

        if di < (h_depth - 1):
            nxt = di + 1
            preact = tensor.dot(sliceT(h_, nxt, h_sz),
                                tparams[_p(prefix, 'W_hid_%d' % nxt)]) + \
                tensor.dot(hidden, tparams[_p(prefix, 'W_inp_%d' % nxt)])

    c_out = tensor.concatenate(cells, axis=1)
    h_out = tensor.concatenate(hiddens, axis=1)
    return h_out, c_out
def _step(m_, x_, h_, c_, xAux):
    """Advance a multi-layer LSTM by one time step.

    Layer ``di`` reads its previous state from ``sliceT(h_, di, h_sz)`` and
    ``sliceT(c_, di, h_sz)``.  The first layer is fed by ``x_`` (plus the
    auxiliary projection when enabled); every further layer is fed by the
    freshly computed hidden state of the layer beneath it.

    Args:
        m_: not referenced in the body.
        x_: input term added to the first layer's pre-activations.
        h_: hidden states of all layers from the previous step.
        c_: cell states of all layers from the previous step.
        xAux: auxiliary input; read only when ``options['en_aux_inp']`` is set.

    Returns:
        Tuple ``(h_out, c_out)`` with the new hidden / cell states of all
        layers concatenated on axis 1.
    """
    gates_in = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
    gates_in += x_
    if options.get('en_aux_inp', 0):
        gates_in += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])
    # Note: tparams[_p(prefix, 'b')] is not added here; that line is
    # disabled in the original code.

    new_h = []
    new_c = []
    last_layer = h_depth - 1
    for di in xrange(h_depth):
        gate_i = tensor.nnet.sigmoid(sliceT(gates_in, 0, h_sz))
        gate_f = tensor.nnet.sigmoid(sliceT(gates_in, 1, h_sz))
        gate_o = tensor.nnet.sigmoid(sliceT(gates_in, 2, h_sz))
        proposal = tensor.tanh(sliceT(gates_in, 3, h_sz))

        new_c.append(gate_f * sliceT(c_, di, h_sz) + gate_i * proposal)
        new_h.append(gate_o * tensor.tanh(new_c[di]))

        if di < last_layer:
            upper = str(di + 1)
            recurrent = tensor.dot(sliceT(h_, di + 1, h_sz),
                                   tparams[_p(prefix, ('W_hid_' + upper))])
            from_below = tensor.dot(new_h[di],
                                    tparams[_p(prefix, ('W_inp_' + upper))])
            gates_in = recurrent + from_below

    c_out = tensor.concatenate(new_c, axis=1)
    h_out = tensor.concatenate(new_h, axis=1)
    return h_out, c_out
def _stepP(x_, h_, c_, lP_, dV_, xAux):
    """Greedy decoding step for a stacked LSTM.

    Updates every layer, computes the (optionally sharpened/smoothed)
    softmax over words from the top layer and picks the arg-max word.  Word
    index 0 marks the end: the returned ``until`` condition is true when the
    chosen word is index 0.

    Args:
        x_: input embedding for this step.
        h_: previous hidden states of all layers (side by side).
        c_: previous cell states of all layers (side by side).
        lP_: accumulated log-probability so far.
        dV_: not referenced in the body.
        xAux: auxiliary input; used only if ``options['en_aux_inp']`` is set.

    Returns:
        ``[xW, h, c, xWlogProb, doneVec, xWIdx, p]`` and the scan ``until``
        condition; ``p`` is the softmax output of this step.
    """
    preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
    preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
               tparams[_p(prefix, 'b')])
    if options.get('en_aux_inp', 0):
        preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

    hL = []
    cL = []
    for di in xrange(h_depth):
        in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
        forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
        out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
        candidate = tensor.tanh(sliceT(preact, 3, h_sz))

        cL.append(forget_gate * sliceT(c_, di, h_sz) + in_gate * candidate)
        hL.append(out_gate * tensor.tanh(cL[di]))

        if di < (h_depth - 1):
            nxt = di + 1
            preact = tensor.dot(sliceT(h_, nxt, h_sz),
                                tparams[_p(prefix, 'W_hid_%d' % nxt)]) + \
                tensor.dot(hL[di], tparams[_p(prefix, 'W_inp_%d' % nxt)])

    c = tensor.concatenate(cL, axis=1)
    h = tensor.concatenate(hL, axis=1)

    # Word distribution from the top layer; the logits are scaled by the
    # configured smoothing factor before the softmax.
    logits = tensor.dot(hL[-1], tparams['Wd']) + tparams['bd']
    smooth_factor = tensor.as_tensor_variable(
        numpy_floatX(options.get('softmax_smooth_factor', 1.0)), name='sm_f')
    p = tensor.nnet.softmax(logits * smooth_factor)
    lProb = tensor.log(p + 1e-20).flatten()

    xWIdx = tensor.argmax(lProb, keepdims=True)
    xWlogProb = lProb[xWIdx] + lP_

    if options.get('softmax_propogate', 0) == 0:
        # Feed back the embedding of the chosen word.
        xW = tparams['Wemb'][xWIdx.flatten()]
    else:
        # Feed back the probability-weighted mix of all embeddings.
        xW = p.dot(tparams['Wemb'])

    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
    stop = theano.scan_module.until(doneVec.all())
    return [xW, h, c, xWlogProb, doneVec, xWIdx, p], stop
def _stepP(x_, h_, c_, lP_, dV_, xAux):
    """Stacked-LSTM prediction step that always follows the arg-max word.

    Args:
        x_: embedding fed into the first layer at this step.
        h_: hidden states of all layers from the previous step.
        c_: cell states of all layers from the previous step.
        lP_: log-probability accumulated before this step.
        dV_: unused in the body.
        xAux: auxiliary input; read only when ``options['en_aux_inp']`` is set.

    Returns:
        ``([xW, h, c, xWlogProb, doneVec, xWIdx, p], until_condition)``.
        ``p`` is this step's softmax output, ``xWIdx`` the selected word and
        ``doneVec`` flags ``xWIdx == 0``; the ``until`` condition stops the
        enclosing scan when that flag is set everywhere.
    """
    gates_in = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
    gates_in += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                 tparams[_p(prefix, 'b')])
    if options.get('en_aux_inp', 0):
        gates_in += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

    hL = []
    cL = []
    top = h_depth - 1
    for di in xrange(h_depth):
        gate_i = tensor.nnet.sigmoid(sliceT(gates_in, 0, h_sz))
        gate_f = tensor.nnet.sigmoid(sliceT(gates_in, 1, h_sz))
        gate_o = tensor.nnet.sigmoid(sliceT(gates_in, 2, h_sz))
        proposal = tensor.tanh(sliceT(gates_in, 3, h_sz))

        new_cell = gate_f * sliceT(c_, di, h_sz) + gate_i * proposal
        new_hidden = gate_o * tensor.tanh(new_cell)
        cL.append(new_cell)
        hL.append(new_hidden)

        if di < top:
            upper = str(di + 1)
            gates_in = tensor.dot(sliceT(h_, di + 1, h_sz),
                                  tparams[_p(prefix, ('W_hid_' + upper))]) + \
                tensor.dot(new_hidden, tparams[_p(prefix, ('W_inp_' + upper))])

    c = tensor.concatenate(cL, axis=1)
    h = tensor.concatenate(hL, axis=1)

    p = tensor.dot(hL[-1], tparams['Wd']) + tparams['bd']
    smooth_factor = tensor.as_tensor_variable(
        numpy_floatX(options.get('softmax_smooth_factor', 1.0)), name='sm_f')
    p = tensor.nnet.softmax(p * smooth_factor)

    lProb = tensor.log(p + 1e-20)
    lProb = lProb.flatten()
    xWIdx = tensor.argmax(lProb, keepdims=True)
    xWlogProb = lProb[xWIdx] + lP_

    use_hard_choice = options.get('softmax_propogate', 0) == 0
    if use_hard_choice:
        xW = tparams['Wemb'][xWIdx.flatten()]
    else:
        # Propagate the whole distribution as a soft embedding.
        xW = p.dot(tparams['Wemb'])

    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
    outputs = [xW, h, c, xWlogProb, doneVec, xWIdx, p]
    return outputs, theano.scan_module.until(doneVec.all())
def build_model(self, tparams, options):
    """Build the symbolic training graph of the caption generator.

    The image embedding is prepended to the word embeddings, the sequence is
    run through ``self.lstm_layer`` and the top-layer output is projected to
    word scores.  The cost is accumulated over time with ``theano.scan``.

    Args:
        tparams: dict of Theano shared parameters ('Wemb', 'WIemb', 'b_Img',
            'Wd', 'bd', plus the LSTM parameters read by ``self.lstm_layer``).
        options: configuration dict; the keys read here are visible in the
            body ('word_encoding_size', 'image_encoding_size', 'use_dropout',
            'drop_prob_*', 'en_aux_inp', 'generator', 'hidden_depth',
            'hidden_size', 'batch_size').

    Returns:
        ``(use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM)`` where
        ``inp_list`` is ``[xW, xI, mask]`` (plus ``xAux`` when auxiliary input
        is enabled), ``cost[0]`` is the masked negative natural-log
        likelihood divided by ``options['batch_size']`` and ``cost[1]`` is
        the masked negative log2 likelihood.
    """
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)
    # `xAux` must stay bound to the symbolic *input*: it is appended to
    # `inp_list` below.  The value actually fed to the LSTM lives in
    # `xAuxIn`.  Previously the dropout output was assigned back to `xAux`,
    # so the function input became the dropout result and the aux dropout
    # was cut out of every compiled function.
    xAuxIn = xAux

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    # This is implementation of input dropout !!
    if options['use_dropout']:
        emb = self.dropout_layer(emb, use_noise, trng,
                                 options['drop_prob_encoder'], shp=emb.shape)
        if options.get('en_aux_inp', 0):
            xAuxIn = self.dropout_layer(xAux, use_noise, trng,
                                        options['drop_prob_aux'],
                                        shp=xAux.shape)

    # This implements core lstm
    rval, updatesLSTM = self.lstm_layer(tparams, emb[:n_timesteps, :, :],
                                        xAuxIn, use_noise, options,
                                        prefix=options['generator'], mask=mask)

    # Output of the top layer, optionally with decoder dropout.
    top = sliceT(rval[0], options.get('hidden_depth', 1) - 1,
                 options['hidden_size'])
    if options['use_dropout']:
        p = self.dropout_layer(top, use_noise, trng,
                               options['drop_prob_decoder'],
                               (n_samples, options['hidden_size']))
    else:
        p = top

    p = tensor.dot(p, tparams['Wd']) + tparams['bd']
    # NOTE1: we are leaving out the first prediction, which was made for the
    # image and is meaningless.
    p = p[1:, :, :]

    def accumCost(pred, xW, m, c_sum, ppl_sum):
        # Running masked sums of -ln(prob) and -log2(prob) of the target word.
        pred = tensor.nnet.softmax(pred)
        target_prob = pred[tensor.arange(n_samples), xW]
        c_sum += -(tensor.log(target_prob + 1e-10) * m).sum()
        ppl_sum += -(tensor.log2(target_prob + 1e-10) * m).sum()
        return c_sum, ppl_sum

    sums, _ = theano.scan(
        fn=accumCost,
        outputs_info=[tensor.as_tensor_variable(numpy_floatX(0.)),
                      tensor.as_tensor_variable(numpy_floatX(0.))],
        sequences=[p, xW[1:, :], mask[1:, :]])

    # cost[0] is the negative log-likelihood (natural log) per batch item,
    # cost[1] the negative log2-likelihood used for perplexity.
    cost = [sums[0][-1] / options['batch_size'], sums[1][-1]]

    inp_list = [xW, xI, mask]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    f_pred_prob = theano.function(inp_list, p, name='f_pred_prob',
                                  updates=updatesLSTM)

    return use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM
def _stepP(x_, h_, c_, lP_, dV_, xAux):
    """Decoding step for a stacked LSTM with residual outputs.

    Supports beam search (``beam_size > 1``) and greedy decoding, and an
    optional class-factored output layer
    (``options['class_out_factoring'] == 1``).  Word index 0 marks a finished
    sequence.

    Args:
        x_: input embeddings of the current beams.
        h_: previous hidden states of all layers (side by side).
        c_: previous cell states of all layers (side by side).
        lP_: accumulated log-probability per beam.
        dV_: per-beam done flag (zero = still decoding).
        xAux: auxiliary input; used only if ``options['en_aux_inp']`` is set.

    Returns:
        ``[xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx]`` and a scan
        ``until`` condition that is true once all selected words are index 0.
    """
    preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
    preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
               tparams[_p(prefix, 'b')])
    if options.get('en_aux_inp', 0):
        preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

    hL = []
    cL = []
    outp = []
    for di in xrange(h_depth):
        in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
        forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
        out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
        candidate = tensor.tanh(sliceT(preact, 3, h_sz))

        cell = forget_gate * sliceT(c_, di, h_sz) + in_gate * candidate
        hidden = out_gate * tensor.tanh(cell)

        layer_out = hidden
        if options.get('en_residual_conn', 1) and di > 0:
            # Residual connection from the output of the layer below.
            layer_out += outp[di - 1]
            print("Connecting residual at %d" % (di))

        cL.append(cell)
        hL.append(hidden)
        outp.append(layer_out)

        if di < (h_depth - 1):
            nxt = di + 1
            preact = tensor.dot(sliceT(h_, nxt, h_sz),
                                tparams[_p(prefix, 'W_hid_%d' % nxt)]) + \
                tensor.dot(layer_out, tparams[_p(prefix, 'W_inp_%d' % nxt)])

    c = tensor.concatenate(cL, axis=1)
    h = tensor.concatenate(hL, axis=1)

    factored = options.get('class_out_factoring', 0) == 1
    if factored:
        # Pick the most likely class, then score only the words of that class.
        pC = tensor.dot(outp[-1], tparams['WdCls']) + tparams['bdCls']
        pCSft = tensor.nnet.softmax(pC)
        xCIdx = tensor.argmax(pCSft)
        pW = tensor.dot(outp[-1], tparams['Wd'][:, xCIdx, :]) + \
            tparams['bd'][:, xCIdx, :]
        smooth_factor = tensor.as_tensor_variable(
            numpy_floatX(options.get('softmax_smooth_factor', 1.0)),
            name='sm_f')
        pWSft = tensor.nnet.softmax(pW * smooth_factor)
        lProb = tensor.log(pWSft + 1e-20) + \
            tensor.log(pCSft[0, xCIdx] + 1e-20)
    else:
        p = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd']
        smooth_factor = tensor.as_tensor_variable(
            numpy_floatX(options.get('softmax_smooth_factor', 1.0)),
            name='sm_f')
        p = tensor.nnet.softmax(p * smooth_factor)
        lProb = tensor.log(p + 1e-20)

    if beam_size > 1:
        def _best_of_beam(row_lp, prev_lp, done_flag):
            # Top `beam_size` continuations of one beam; a finished beam is
            # only continued with index 0 and keeps its previous score.
            top_idx = tensor.argsort(-row_lp)[:beam_size]
            alive = tensor.eq(done_flag, tensor.zeros_like(done_flag))
            frozen = tensor.fill(row_lp[top_idx], numpy_floatX(-10000.))
            frozen = tensor.set_subtensor(frozen[0], prev_lp)
            best_lp = ifelse(alive, row_lp[top_idx] + prev_lp, frozen)
            best_idx = ifelse(alive, top_idx, tensor.zeros_like(top_idx))
            return best_lp, best_idx

        scan_out, _ = theano.scan(_best_of_beam,
                                  sequences=[lProb, lP_, dV_],
                                  name=_p(prefix, 'FindBest'),
                                  n_steps=x_.shape[0])
        cand_word = scan_out[1].flatten()
        lProb = scan_out[0].flatten()

        # Now sort and find the best among these best extensions for the
        # current beams.
        srtIdx = tensor.argsort(-lProb)[:beam_size]
        xCandIdx = srtIdx // beam_size  # Floor division -> originating beam
        h = h.take(xCandIdx.flatten(), axis=0)
        c = c.take(xCandIdx.flatten(), axis=0)
        xWlogProb = lProb[srtIdx]
        xWIdx = cand_word[srtIdx]
    else:
        xCandIdx = tensor.as_tensor_variable([0])
        lProb = lProb.flatten()
        xWIdx = tensor.argmax(lProb, keepdims=True)
        xWlogProb = lProb[xWIdx] + lP_
        if factored:
            # Shift the within-class index by the class offset read from
            # column 0 of options['ixtoclsinfo'].
            clsoffset = tensor.as_tensor_variable(
                options['ixtoclsinfo'][:, 0])
            xWIdx += clsoffset[xCIdx]
        h = h.take(xCandIdx.flatten(), axis=0)
        c = c.take(xCandIdx.flatten(), axis=0)

    if options.get('softmax_propogate', 0) == 0:
        xW = tparams['Wemb'][xWIdx.flatten()]
    else:
        # NOTE(review): `p` is only defined on the non-factored path, so this
        # branch looks unusable together with class_out_factoring -- confirm.
        xW = p.dot(tparams['Wemb'])

    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
    stop = theano.scan_module.until(doneVec.all())
    return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], stop
def build_model(self, tparams, options):
    """Build the training graph of the bidirectional caption model.

    A forward LSTM (prefix ``'lstm'``) reads the image embedding followed by
    the words; a reverse LSTM (prefix ``'rev_lstm'``) reads the time-reversed
    word embeddings with the image embedding written into one position per
    sample.  Their outputs are summed and projected to word scores.

    Args:
        tparams: dict of Theano shared parameters.
        options: configuration dict (keys read are visible in the body).

    Returns:
        ``(use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM)``.
        ``f_pred_prob`` is ``['', f_per_sentLogP, '']`` where
        ``f_per_sentLogP`` returns the masked log-probability summed per
        sentence; ``cost`` is ``[nll / batch_size, log2-based sum]``.
    """
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    emb_shape = [n_timesteps, n_samples, options['word_encoding_size']]
    embW = tparams['Wemb'][xW.flatten()].reshape(emb_shape)
    embW_rev = tparams['Wemb'][xW[::-1, :].flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])

    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + \
            tparams['b_Img_aux']
    else:
        xAuxEmb = xAux

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)
    # In the reversed sequence the image embedding is written at the step
    # just before the first non-zero entry of the reversed mask.
    rev_img_pos = mask[::-1, :].argmax(axis=0) - 1
    emb_rev = tensor.set_subtensor(
        embW_rev[rev_img_pos, tensor.arange(n_samples), :], embImg[0, :, :])

    # This is implementation of input dropout !!
    if options['use_dropout']:
        emb = dropout_layer(emb, use_noise, trng,
                            options['drop_prob_encoder'], shp=emb.shape)
        if options.get('en_aux_inp', 0):
            xAuxEmb = dropout_layer(xAuxEmb, use_noise, trng,
                                    options['drop_prob_aux'],
                                    shp=xAuxEmb.shape)

    # This implements core lstm
    rval, updatesLSTM = basic_lstm_layer(
        tparams, emb[:n_timesteps, :, :], xAuxEmb, use_noise, options,
        prefix='lstm', sched_prob_mask=[])

    # This implements core reverse lstm
    rev_rval, rev_updatesLSTM = basic_lstm_layer(
        tparams, emb_rev[:n_timesteps, :, :], xAuxEmb, use_noise, options,
        prefix='rev_lstm', sched_prob_mask=[])

    # NOTE1: we are leaving out the first prediction, which was made for the
    # image and is meaningless.
    if options['use_dropout']:
        # XXX : Size given to dropout is missing one dimension. This keeps
        # the dropped units consistent across time!?.
        # ### Is this a good bug ?
        p = dropout_layer(
            sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size']),
            use_noise, trng, options['drop_prob_decoder'],
            (n_samples, options['hidden_size']))
        rev_p = dropout_layer(
            sliceT(rev_rval[0][:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size']),
            use_noise, trng, options['drop_prob_decoder'],
            (n_samples, options['hidden_size']))
    else:
        p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size'])
        rev_p = sliceT(rev_rval[0][:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size'])

    n_out_samps = (n_timesteps - 2) * n_samples
    if options.get('class_out_factoring', 0) == 0:
        # Sum of forward and (re-reversed, shifted) backward outputs.
        combined = p[:-1, :, :] + rev_p[::-1, :, :][2:, :, :]
        pW = (tensor.dot(combined, tparams['Wd']) + tparams['bd']).reshape(
            [n_out_samps, options['output_size']])
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[tensor.arange(n_out_samps), xW[1:-1, :].flatten()]
        out_list = [pWSft, totProb, p]
    else:
        # NOTE(review): this branch indexes with xW[1:, :] while
        # n_out_samps is based on n_timesteps - 2 -- confirm the shapes.
        ixtoclsinfo_t = tensor.as_tensor_variable(options['ixtoclsinfo'])
        xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
        pW = ((tparams['Wd'][:, xC, :].T *
               (p.reshape([1, n_out_samps, options['hidden_size']]))).sum(
                   axis=-1).T + tparams['bd'][:, xC, :])
        pWSft = tensor.nnet.softmax(pW[0, :, :])
        pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
            [n_out_samps, options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)
        totProb = pWSft[tensor.arange(n_out_samps),
                        ixtoclsinfo_t[xW[1:, :].flatten(), 3]] * \
            pCSft[tensor.arange(n_out_samps), xC]
        out_list = [pWSft, pCSft, totProb, p]

    # XXX : THIS IS VERY FISHY, CHECK THE MASK INDEXING AGAIN
    probs_valid = tensor.log(totProb + 1e-10) * mask[1:-1, :].flatten()
    tot_cost = -(probs_valid.sum())
    tot_pplx = -(tensor.log2(totProb + 1e-10) *
                 mask[1:-1, :].flatten()).sum()
    cost = [tot_cost / options['batch_size'], tot_pplx]

    inp_list = [xW, mask, xI]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    if options.get('sched_sampling_mode', None) != None:
        # NOTE(review): `curr_epoch` is not defined anywhere in this
        # function, so this line raises NameError when a scheduled-sampling
        # mode is configured -- confirm intended handling.
        inp_list.append(curr_epoch)

    per_sent_prob = probs_valid.reshape(
        [n_timesteps - 2, n_samples]).sum(axis=0)
    f_per_sentLogP = theano.function(inp_list, per_sent_prob,
                                     name='f_pred_logprob',
                                     updates=updatesLSTM)
    f_pred_prob = ['', f_per_sentLogP, '']

    return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
def _stepP(x_, h_, c_, lP_, dV_, xAux):
    """Decoding step for a stacked LSTM (beam search or greedy).

    Supports an optional class-factored output layer
    (``options['class_out_factoring'] == 1``).  Word index 0 marks a finished
    sequence.

    Args:
        x_: input embeddings of the current beams.
        h_: previous hidden states of all layers (side by side).
        c_: previous cell states of all layers (side by side).
        lP_: accumulated log-probability per beam.
        dV_: per-beam done flag (zero = still decoding).
        xAux: auxiliary input; used only if ``options['en_aux_inp']`` is set.

    Returns:
        ``[xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx]`` and a scan
        ``until`` condition that is true once all selected words are index 0.
    """
    preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
    preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
               tparams[_p(prefix, 'b')])
    if options.get('en_aux_inp', 0):
        preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

    hL = [[]] * h_depth
    cL = [[]] * h_depth
    for di in xrange(h_depth):
        i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
        f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
        o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
        cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
        cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
        hL[di] = o * tensor.tanh(cL[di])
        if di < (h_depth - 1):
            # Next layer: its own recurrent state plus this layer's output.
            preact = tensor.dot(sliceT(h_, di + 1, h_sz),
                                tparams[_p(prefix, ('W_hid_' + str(di + 1)))]) + \
                tensor.dot(hL[di],
                           tparams[_p(prefix, ('W_inp_' + str(di + 1)))])

    c = tensor.concatenate(cL, axis=1)
    h = tensor.concatenate(hL, axis=1)

    if options.get('class_out_factoring', 0) == 1:
        pC = tensor.dot(hL[-1], tparams['WdCls']) + tparams['bdCls']
        pCSft = tensor.nnet.softmax(pC)
        xCIdx = tensor.argmax(pCSft)
        # The word scores must come from the top-layer output `hL[-1]`, the
        # same tensor used for the class scores above.  The previous code
        # used `h[-1]`, i.e. the last *row* of the concatenated state tensor.
        pW = tensor.dot(hL[-1], tparams['Wd'][:, xCIdx, :]) + \
            tparams['bd'][:, xCIdx, :]
        smooth_factor = tensor.as_tensor_variable(
            numpy_floatX(options.get('softmax_smooth_factor', 1.0)),
            name='sm_f')
        pWSft = tensor.nnet.softmax(pW * smooth_factor)
        lProb = tensor.log(pWSft + 1e-20) + \
            tensor.log(pCSft[0, xCIdx] + 1e-20)
    else:
        p = tensor.dot(hL[-1], tparams['Wd']) + tparams['bd']
        smooth_factor = tensor.as_tensor_variable(
            numpy_floatX(options.get('softmax_smooth_factor', 1.0)),
            name='sm_f')
        p = tensor.nnet.softmax(p * smooth_factor)
        lProb = tensor.log(p + 1e-20)

    if beam_size > 1:
        def _FindB_best(lPLcl, lPprev, dVLcl):
            # Top `beam_size` continuations of one beam; a finished beam is
            # only continued with index 0 and keeps its previous score.
            srtLcl = tensor.argsort(-lPLcl)
            srtLcl = srtLcl[:beam_size]
            deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
            deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
            lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                               lPLcl[srtLcl] + lPprev, deltaVec)
            xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                               srtLcl, tensor.zeros_like(srtLcl))
            return lProbBest, xWIdxBest

        rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                          sequences=[lProb, lP_, dV_],
                                          name=_p(prefix, 'FindBest'),
                                          n_steps=x_.shape[0])
        xWIdxBest = rvalLcl[1]
        lProbBest = rvalLcl[0]

        xWIdxBest = xWIdxBest.flatten()
        lProb = lProbBest.flatten()

        # Now sort and find the best among these best extensions for the
        # current beams.
        srtIdx = tensor.argsort(-lProb)
        srtIdx = srtIdx[:beam_size]
        xCandIdx = srtIdx // beam_size  # Floor division -> originating beam
        h = h.take(xCandIdx.flatten(), axis=0)
        c = c.take(xCandIdx.flatten(), axis=0)
        xWlogProb = lProb[srtIdx]
        xWIdx = xWIdxBest[srtIdx]
    else:
        xCandIdx = tensor.as_tensor_variable([0])
        lProb = lProb.flatten()
        xWIdx = tensor.argmax(lProb, keepdims=True)
        xWlogProb = lProb[xWIdx] + lP_
        if options.get('class_out_factoring', 0) == 1:
            # Shift the within-class index by the class offset read from
            # column 0 of options['ixtoclsinfo'].
            clsoffset = tensor.as_tensor_variable(
                options['ixtoclsinfo'][:, 0])
            xWIdx += clsoffset[xCIdx]
        h = h.take(xCandIdx.flatten(), axis=0)
        c = c.take(xCandIdx.flatten(), axis=0)

    if options.get('softmax_propogate', 0) == 0:
        xW = tparams['Wemb'][xWIdx.flatten()]
    else:
        # NOTE(review): `p` is only defined on the non-factored path, so this
        # branch looks unusable together with class_out_factoring -- confirm.
        xW = p.dot(tparams['Wemb'])

    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

    return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], \
        theano.scan_module.until(doneVec.all())
def build_model(self, tparams, options):
    """Build the symbolic graph of the feature-sequence encoder.

    The input is either a float 3-tensor of features
    (``self.use_shared_features == False``) or an int64 matrix of indices
    into ``self.features``.  The sequence is encoded with
    ``self.lstm_enc_layer`` and, for ``self.encoder == 'bilstm'``, also with
    a reverse LSTM whose last output is added.

    Args:
        tparams: dict of Theano shared parameters.
        options: configuration dict; reads 'use_dropout',
            'drop_prob_encoder' and 'encoder_add_mean'.

    Returns:
        ``(self.use_noise, inp_list, feat_enc, updatesLSTM)`` with
        ``inp_list == [xI, samp_lens]``.
    """
    trng = RandomStreams(1234)

    # Used for dropout.
    self.use_noise = theano.shared(numpy_floatX(0.))

    if self.use_shared_features == False:
        xI = tensor.tensor3('xI', dtype=config.floatX)
        xIemb = xI
        n_timesteps = xI.shape[0]
        n_samples = xI.shape[1]
    else:
        xI = tensor.matrix('xI', dtype='int64')
        n_timesteps = xI.shape[0]
        n_samples = xI.shape[1]
        # Look the features up by index and restore the (time, sample) layout.
        xIemb = self.features[xI.flatten(), :].reshape(
            [n_timesteps, n_samples, self.image_feat_size])

    samp_lens = tensor.vector('sL', dtype='int64')

    # This is implementation of input dropout !!
    if options['use_dropout']:
        emb = dropout_layer(xIemb, self.use_noise, trng,
                            options['drop_prob_encoder'], shp=xIemb.shape)
    else:
        # Without dropout the encoder consumes the raw embeddings.
        # Previously `emb` was left undefined on this path, which raised
        # NameError at the first use below.
        emb = xIemb

    # This implements core lstm
    rval, updatesLSTM = self.lstm_enc_layer(tparams, emb,
                                            prefix=self.mp + 'lstm')

    # This implements core reverse lstm
    if self.encoder == 'bilstm':
        # NOTE(review): the updates of the reverse layer are not returned to
        # the caller -- confirm they are always empty.
        rev_rval, rev_updatesLSTM = basic_lstm_layer(
            tparams, emb[::-1, :, :], prefix=self.mp + 'rev_lstm')

    # Forward encoding: for every sample, the output at time index
    # `samp_lens`.
    p = sliceT(rval[0][samp_lens, tensor.arange(n_samples), :],
               self.hidden_depth, self.hidden_size)

    if self.encoder == 'bilstm':
        rev_p = sliceT(rev_rval[0][-1, :, :], self.hidden_depth,
                       self.hidden_size)

    feat_enc = p + rev_p if self.encoder == 'bilstm' else p

    if options.get('encoder_add_mean', 0):
        # Add the outputs summed over time divided by the sample length.
        feat_enc = feat_enc + (
            sliceT(rval[0], self.hidden_depth, self.hidden_size).sum(axis=0) /
            samp_lens[:, None])

    inp_list = [xI, samp_lens]

    return self.use_noise, inp_list, feat_enc, updatesLSTM
def _stepP(*in_list):
    """One beam-search step for an ensemble of single-layer LSTMs.

    ``in_list`` is laid out as ``nmodels`` inputs, ``nmodels`` hidden states,
    ``nmodels`` cell states, then the accumulated log-probabilities and the
    done flags.  Each model's softmax is weighted by its ``'comb_weight'``
    parameter and the weighted sum is used to rank the next words.  Word
    index 0 marks a finished beam.

    Returns:
        A flat list ``x_out + h_out + c_out + [xWlogProb, doneVec, xWIdx,
        xCandIdx]`` (one x/h/c entry per model) and a scan ``until``
        condition that is true once every kept beam emitted index 0.
    """
    # Unpack the positional scan arguments.
    x_inp = list(in_list[:nmodels])
    h_inp = list(in_list[nmodels:2 * nmodels])
    c_inp = list(in_list[2 * nmodels:3 * nmodels])
    lP_ = in_list[3 * nmodels]
    dV_ = in_list[3 * nmodels + 1]

    p_comb = tensor.alloc(numpy_floatX(0.), options[0]['output_size'])
    cf = []
    h = []
    for i in xrange(nmodels):
        params = tparams[i]
        preact = tensor.dot(h_inp[i], params[_p(prefix, 'W_hid')])
        preact += (tensor.dot(x_inp[i], params[_p(prefix, 'W_inp')]) +
                   params[_p(prefix, 'b')])
        if options[i].get('en_aux_inp', 0):
            preact += tensor.dot(aux_input2[i], params[_p(prefix, 'W_aux')])

        n_hid = options[i]['hidden_size']
        in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, n_hid))
        forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, n_hid))
        out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, n_hid))
        candidate = tensor.tanh(sliceT(preact, 3, n_hid))

        cf.append(forget_gate * c_inp[i] + in_gate * candidate)
        h.append(out_gate * tensor.tanh(cf[i]))

        # Weighted mixture of the per-model word distributions.
        logits = tensor.dot(h[i], params['Wd']) + params['bd']
        if i == 0:
            p_comb = params['comb_weight'] * tensor.nnet.softmax(logits)
        else:
            p_comb += params['comb_weight'] * tensor.nnet.softmax(logits)

    lProb = tensor.log(p_comb + 1e-20)

    def _best_of_beam(row_lp, prev_lp, done_flag):
        # Top `beam_size` continuations of one beam; a finished beam is only
        # continued with index 0 and keeps its previous score.
        top_idx = tensor.argsort(-row_lp)[:beam_size]
        alive = tensor.eq(done_flag, tensor.zeros_like(done_flag))
        frozen = tensor.fill(row_lp[top_idx], numpy_floatX(-10000.))
        frozen = tensor.set_subtensor(frozen[0], prev_lp)
        best_lp = ifelse(alive, row_lp[top_idx] + prev_lp, frozen)
        best_idx = ifelse(alive, top_idx, tensor.zeros_like(top_idx))
        return best_lp, best_idx

    scan_out, _ = theano.scan(_best_of_beam,
                              sequences=[lProb, lP_, dV_],
                              name=_p(prefix, 'FindBest'),
                              n_steps=x_inp[0].shape[0])
    cand_word = scan_out[1].flatten()
    cand_lp = scan_out[0].flatten()

    # Now sort and find the best among these best extensions for the
    # current beams.
    srtIdx = tensor.argsort(-cand_lp)[:beam_size]
    xWlogProb = cand_lp[srtIdx]
    xWIdx = cand_word[srtIdx]
    xCandIdx = srtIdx // beam_size  # Floor division -> originating beam

    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

    # Per-model next inputs and states, re-ordered to the surviving beams.
    x_out = []
    h_out = []
    c_out = []
    for i in xrange(nmodels):
        x_out.append(tparams[i]['Wemb'][xWIdx.flatten()])
        h_out.append(h[i].take(xCandIdx.flatten(), axis=0))
        c_out.append(cf[i].take(xCandIdx.flatten(), axis=0))

    out_list = x_out + h_out + c_out
    out_list.extend([xWlogProb, doneVec, xWIdx, xCandIdx])

    return out_list, theano.scan_module.until(doneVec.all())
def build_model(self, tparams, options):
    """Build the training graph of the caption generator.

    Supports an optional embedded auxiliary input (``'swap_aux'``),
    input/decoder dropout, scheduled sampling (only the ``'linear'`` mode is
    implemented) and a class-factored output layer.

    Args:
        tparams: dict of Theano shared parameters.
        options: configuration dict (keys read are visible in the body).

    Returns:
        ``(use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM)``.
        ``inp_list`` is ``[xW, mask, xI]`` plus ``xAux`` and ``curr_epoch``
        when the corresponding options are enabled; ``f_pred_prob`` is an
        empty list; ``cost`` is ``[nll / batch_size, log2-based sum]``.

    Raises:
        ValueError: if ``options['sched_sampling_mode']`` is set to anything
            other than ``'linear'``.
    """
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + \
            tparams['b_Img_aux']
    else:
        xAuxEmb = xAux

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    # This is implementation of input dropout !!
    if options['use_dropout']:
        emb = dropout_layer(emb, use_noise, trng,
                            options['drop_prob_encoder'], shp=emb.shape)
        if options.get('en_aux_inp', 0):
            xAuxEmb = dropout_layer(xAuxEmb, use_noise, trng,
                                    options['drop_prob_aux'],
                                    shp=xAuxEmb.shape)

    # Implement scheduled sampling!
    if options.get('sched_sampling_mode', None) != None:
        curr_epoch = tensor.scalar(name='curr_epoch', dtype=config.floatX)
        sched_mode = options['sched_sampling_mode']
        # Assign the probabilities according to the scheduling mode.
        if sched_mode == 'linear':
            prob = tensor.maximum(
                options['sslin_min'],
                options['sched_sampling_const'] -
                options['sslin_slope'] * curr_epoch)
        elif sched_mode in ('exp', 'invsig'):
            raise ValueError(
                'ERROR: %s --> This solver type is not yet supported' % (
                    options['sched_sampling_mode']))
        else:
            raise ValueError(
                'ERROR: %s --> This scheduling type is unknown' % (
                    options['sched_sampling_mode']))

        # Now to build the mask. We don't want to do this coin toss when
        # feeding in image feature and the start symbol, so two rows of
        # ones are appended to the sampled mask.
        sched_mask = trng.binomial((n_timesteps - 2, n_samples), p=prob,
                                   n=1, dtype='int64')
        sched_mask = tensor.concatenate(
            [sched_mask, tensor.alloc(1, 2, n_samples)], axis=0)
    else:
        sched_mask = []

    # This implements core lstm
    rval, updatesLSTM = basic_lstm_layer(
        tparams, emb[:n_timesteps, :, :], xAuxEmb, use_noise, options,
        prefix=options['generator'], sched_prob_mask=sched_mask)

    # NOTE1: we are leaving out the first prediction, which was made for the
    # image and is meaningless.
    if options['use_dropout']:
        # XXX : Size given to dropout is missing one dimension. This keeps
        # the dropped units consistent across time!?.
        # ### Is this a good bug ?
        p = dropout_layer(
            sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1) - 1,
                   options['hidden_size']),
            use_noise, trng, options['drop_prob_decoder'],
            (n_samples, options['hidden_size']))
    else:
        p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1) - 1,
                   options['hidden_size'])

    n_out_samps = (n_timesteps - 1) * n_samples
    if options.get('class_out_factoring', 0) == 0:
        pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
            [n_out_samps, options['output_size']])
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]
        out_list = [pWSft, totProb, p]
    else:
        # Factored output: P(word) = P(word | class) * P(class); class id
        # and within-class index are read from columns 0 and 3 of
        # options['ixtoclsinfo'].
        ixtoclsinfo_t = tensor.as_tensor_variable(options['ixtoclsinfo'])
        xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
        pW = ((tparams['Wd'][:, xC, :].T *
               (p.reshape([1, n_out_samps, options['hidden_size']]))).sum(
                   axis=-1).T + tparams['bd'][:, xC, :])
        pWSft = tensor.nnet.softmax(pW[0, :, :])
        pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
            [n_out_samps, options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)
        totProb = pWSft[tensor.arange(n_out_samps),
                        ixtoclsinfo_t[xW[1:, :].flatten(), 3]] * \
            pCSft[tensor.arange(n_out_samps), xC]
        out_list = [pWSft, pCSft, totProb, p]

    tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).sum()
    tot_pplx = -(tensor.log2(totProb + 1e-10) * mask[1:, :].flatten()).sum()
    cost = [tot_cost / options['batch_size'], tot_pplx]

    inp_list = [xW, mask, xI]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)
    if options.get('sched_sampling_mode', None) != None:
        inp_list.append(curr_epoch)

    # No prediction function is compiled in this variant.
    f_pred_prob = []

    return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
def build_eval_other_sent(self, tparams, options, model_npy):
    """Build the graph and compiled functions that score given sentences.

    The word indices in ``xW`` are embedded, an image embedding computed
    from ``xI`` is prepended as the first time step, and the sequence is run
    through ``basic_lstm_layer``.  The softmax probability assigned to each
    word of ``xW[1:]`` is turned into a masked negative log-likelihood.

    Side effects:
        Calls ``zipp(model_npy, self.model_th)`` and stores two compiled
        functions on the instance: ``self.f_pred_prob_other`` (returns the
        sliced LSTM output ``p``) and ``self.f_eval_other`` (returns
        ``cost``).

    Args:
        tparams: Mapping from parameter name to Theano variable.  Reads
            'Wemb', 'WIemb', 'b_Img', 'Wd', 'bd' and, when
            ``options['swap_aux']`` is truthy, 'WIemb_aux' and 'b_Img_aux'.
        options: Configuration dict.  Reads 'word_encoding_size',
            'image_encoding_size', 'generator', 'hidden_size',
            'output_size', 'batch_size' and the optional keys 'swap_aux',
            'hidden_depth' and 'en_aux_inp'.
        model_npy: Forwarded to ``zipp`` together with ``self.model_th``
            (presumably the parameter values to load -- confirm against
            ``zipp``).

    Returns:
        Tuple ``(use_noise, inp_list, self.f_pred_prob_other, cost, pW,
        updatesLSTM)``.
    """
    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]
    n_out_samps = (n_timesteps - 1) * n_samples

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    # Optionally project the auxiliary input before handing it to the LSTM.
    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
        xAuxEmb = xAux

    # The image embedding becomes the first time step of the input sequence.
    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    rval, updatesLSTM = basic_lstm_layer(tparams,
                                         emb[:n_timesteps, :, :],
                                         xAuxEmb,
                                         use_noise,
                                         options,
                                         prefix=options['generator'])

    # NOTE1: we are leaving out the first prediction, which was made for the
    # image and is meaningless.
    p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
               options['hidden_size'])

    pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
        [n_out_samps, options['output_size']])
    pWSft = tensor.nnet.softmax(pW)
    totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]

    # cost is a scalar: the masked negative natural-log likelihood summed
    # over all positions, divided by options['batch_size'].
    tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).sum()
    cost = tot_cost / options['batch_size']

    inp_list = [xW, mask, xI]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    # `p` never depends on `mask`, yet `mask` is part of inp_list.  Theano's
    # default on_unused_input='raise' rejects that, so relax the check while
    # keeping the compiled function's call signature unchanged.
    self.f_pred_prob_other = theano.function(inp_list,
                                             p,
                                             name='f_pred_prob',
                                             updates=updatesLSTM,
                                             on_unused_input='ignore')
    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
def build_eval_other_sent(self, tparams, options, model_npy):
    """Build the graph and compiled functions that score given sentences.

    The word indices in ``xW`` are embedded, an image embedding computed
    from ``xI`` is prepended as the first time step, and the sequence is run
    through ``basic_lstm_layer``.  Depending on
    ``options['class_out_factoring']`` the probability of each word of
    ``xW[1:]`` comes either from one softmax over the whole output layer or
    from the product of a class softmax and a within-class word softmax.

    Side effects:
        Calls ``zipp(model_npy, self.model_th)`` and stores two compiled
        functions on the instance: ``self.f_pred_prob_other`` (inputs
        ``[xW, xI, xAux]``, returns ``pWSft``) and ``self.f_eval_other``
        (inputs ``inp_list``, returns ``cost``).

    Args:
        tparams: Mapping from parameter name to Theano variable.  Reads
            'Wemb', 'WIemb', 'b_Img', 'Wd', 'bd'; 'WIemb_aux' and
            'b_Img_aux' when ``options['swap_aux']`` is truthy; 'WdCls' and
            'bdCls' in the class-factored branch.
        options: Configuration dict.  Reads 'word_encoding_size',
            'image_encoding_size', 'generator', 'hidden_size' and the
            optional keys 'swap_aux', 'hidden_depth', 'class_out_factoring'
            and 'en_aux_inp'; 'output_size' in the plain branch and
            'nClasses' in the class-factored branch.
        model_npy: Forwarded to ``zipp`` together with ``self.model_th``
            (presumably the parameter values to load -- confirm against
            ``zipp``).

    Returns:
        Tuple ``(use_noise, inp_list, self.f_pred_prob_other, cost, pW,
        updatesLSTM)``.  ``cost`` holds one value per column of ``xW``: the
        masked negative natural-log likelihood summed over time steps.
    """
    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]
    n_out_samps = (n_timesteps - 1) * n_samples

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    # Optionally project the auxiliary input before handing it to the LSTM.
    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
        xAuxEmb = xAux

    # The image embedding becomes the first time step of the input sequence.
    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    rval, updatesLSTM = basic_lstm_layer(tparams,
                                         emb[:n_timesteps, :, :],
                                         xAuxEmb,
                                         use_noise,
                                         options,
                                         prefix=options['generator'])

    # Drop the first step of the LSTM output (the one produced for the
    # image input) so predictions line up with the targets xW[1:].
    p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
               options['hidden_size'])

    targets = xW[1:, :].flatten()
    rows = tensor.arange(n_out_samps)

    if options.get('class_out_factoring', 0) == 0:
        # Single softmax over the full output layer.
        pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
            [n_out_samps, options['output_size']])
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[rows, targets]
    else:
        # Class-factored output: P(word) = P(class) * P(word | class).
        # Column 0 of self.clsinfo is used as the class index of each
        # target word and column 3 as the index into the per-class softmax.
        ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
        xC = ixtoclsinfo_t[targets, 0]
        p_flat = p.reshape([1, n_out_samps, options['hidden_size']])
        pW = ((tparams['Wd'][:, xC, :].T *
               (p_flat - tparams['WdCls'][:, xC].T)).sum(axis=-1).T +
              tparams['bd'][:, xC, :])
        pWSft = tensor.nnet.softmax(pW[0, :, :])

        pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
            [n_out_samps, options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)

        totProb = pWSft[rows, ixtoclsinfo_t[targets, 3]] * pCSft[rows, xC]

    # Per-position masked NLL, folded back to (time, sample) and summed over
    # time so that cost has one entry per sample.
    tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()
                 ).reshape([n_timesteps - 1, n_samples])
    cost = tot_cost.sum(axis=0)

    inp_list = [xW, mask, xI]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    # xAux is always listed as an input here but is only put in inp_list
    # when 'en_aux_inp' is set, so it may be absent from the graph.  Theano's
    # default on_unused_input='raise' would then abort compilation; relax the
    # check and keep the (xW, xI, xAux) call signature.
    self.f_pred_prob_other = theano.function([xW, xI, xAux],
                                             pWSft,
                                             name='f_pred_prob',
                                             updates=updatesLSTM,
                                             on_unused_input='ignore')
    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
def _stepP(U, xW_, h_, c_, lP_, dV_, xAux, xNoise):
    """Run one step of the stacked LSTM and pick the next word.

    Relies on names from the enclosing scope (``tparams``, ``options``,
    ``prefix``, ``h_sz``, ``h_depth``, ``nBatchSamps``, ``n_samp``, ``self``).

    Args:
        U: Passed through to ``gumbel_softmax_sample`` in the Gumbel branch.
        xW_: Word input for this step.
        h_, c_: Previous hidden / cell states; layer ``k`` is read with
            ``sliceT(., k, h_sz)``.
        lP_: Running log-probability, added to this step's score.
        dV_: Done flags from the previous step; ``~dV_`` multiplies the
            chosen index and its score.
        xAux: Added to the first layer's pre-activation as is.
        xNoise: Added to the pre-activation when
            ``options['gen_input_noise']`` is truthy.

    Returns:
        ``([xW, h, c, xWlogProb, doneVec, xWIdx, p], until)`` where ``until``
        is a ``theano.scan_module.until`` condition on all of ``doneVec``.
    """
    # First-layer pre-activation: recurrent term + input term + aux input.
    input_term = (tensor.dot(xW_, tparams[_p(prefix, 'W_inp')]) +
                  tparams[_p(prefix, 'b')])
    preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
    preact = preact + input_term
    preact = preact + xAux
    if options.get('gen_input_noise', 0):
        preact = preact + xNoise

    hidden_states = []
    cell_states = []
    layer_outs = []
    for layer in xrange(h_depth):
        in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
        forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
        out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
        candidate = tensor.tanh(sliceT(preact, 3, h_sz))

        cell = forget_gate * sliceT(c_, layer, h_sz) + in_gate * candidate
        hidden = out_gate * tensor.tanh(cell)

        # Residual connection: add the output of the layer below.
        layer_out = hidden
        if options.get('en_residual_conn', 1) and layer > 0:
            layer_out = layer_out + layer_outs[layer - 1]
            print("Connecting residual at %d" % (layer))

        cell_states.append(cell)
        hidden_states.append(hidden)
        layer_outs.append(layer_out)

        # Pre-activation for the next layer, fed by this layer's output.
        if layer < (h_depth - 1):
            nxt = str(layer + 1)
            preact = tensor.dot(
                sliceT(h_, layer + 1, h_sz),
                tparams[_p(prefix, ('W_hid_' + nxt))]) + tensor.dot(
                    layer_out, tparams[_p(prefix, ('W_inp_' + nxt))])

    c = tensor.concatenate(cell_states, axis=1)
    h = tensor.concatenate(hidden_states, axis=1)

    logits = tensor.dot(layer_outs[-1], tparams['Wd']) + tparams['bd']

    use_gumbel = options.get('use_gumbel_mse', 0)
    greedy = options.get('greedy', 0)
    if use_gumbel == 0 or greedy:
        p = tensor.nnet.softmax(logits)
    else:
        p = gumbel_softmax_sample(self.trng,
                                  logits * self.softmax_smooth_factor,
                                  self.gumb_temp, U,
                                  options.get('use_gumbel_hard', False))

    # Score used for the running log-probability: smoothed log-softmax when
    # requested, raw logits otherwise.
    if options.get('computelogprob', 0):
        smoothed = tensor.nnet.softmax(logits * self.softmax_smooth_factor)
        lProb = tensor.log(smoothed + 1e-20)
    else:
        lProb = logits

    not_done = ~dV_
    xWIdx = not_done * tensor.argmax(p, axis=-1)
    xWlogProb = not_done * lProb[tensor.arange(nBatchSamps * n_samp),
                                 xWIdx] + lP_

    # Next input: soft embedding of p in the hard-Gumbel case, otherwise the
    # embedding row of the chosen word with the gradient disconnected.
    if options.get('use_gumbel_hard', 0) and options.get(
            'use_gumbel_mse', 0) and not options.get('greedy', 0):
        xW = p.dot(tparams['Wemb'])
    else:
        picked = tparams['Wemb'][xWIdx.flatten()].reshape(
            [xWIdx.shape[0], -1])
        xW = theano.gradient.disconnected_grad(picked)

    # Index 0 marks a finished sequence; stop once every entry is done.
    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
    stop_cond = theano.scan_module.until(doneVec.all())
    return [xW, h, c, xWlogProb, doneVec, xWIdx, p], stop_cond
def build_model(self, tparams, options, xI=None, xAux=None, attn_nw=None):
    """Build the training graph of the caption generator.

    Word indices ``xW`` are embedded, the image embedding is prepended as the
    first time step, the sequence goes through ``basic_lstm_layer`` and the
    probability of every word of ``xW[1:]`` is turned into a masked cost.

    Side effects:
        Sets ``self.trng`` to a ``RandomStreams`` seeded from the clock.
        Reads ``self.usegumbel`` and ``self.gumb_temp`` in the Gumbel branch
        and ``self.clsinfo`` in the class-factored branch.

    Args:
        tparams: Mapping from parameter name to Theano variable.
        options: Configuration dict (embedding sizes, dropout settings,
            'generator' prefix, scheduled-sampling and output-layer flags).
        xI: Optional image embedding.  When ``None`` an input matrix 'xI' is
            created, projected with 'WIemb'/'b_Img' and added to
            ``inp_list``; otherwise the value is used as the embedding as is.
        xAux: Optional auxiliary input.  When ``None`` an input 'xAux' is
            created (a matrix, or a 3-d tensor when ``attn_nw`` is given);
            otherwise the value is used, projected with
            'WIemb_aux'/'b_Img_aux' if ``options['encode_gt_sentences']``.
        attn_nw: Forwarded to ``basic_lstm_layer``.  When given, the
            'swap_aux' projection and the aux-input dropout are skipped.

    Returns:
        Tuple ``(use_noise, inp_list, f_pred_prob, cost, out_list,
        updatesLSTM)``.  ``cost`` is ``[nll / n_samples, log2_nll]`` where
        ``nll`` uses the natural log.

    Raises:
        ValueError: If ``options['sched_sampling_mode']`` is set to anything
            other than 'linear'.
    """
    self.trng = RandomStreams(int(time.time()))

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])

    # Image input: either a fresh placeholder that is projected here, or an
    # embedding supplied by the caller.
    xI_is_inp = xI is None
    if xI_is_inp:
        xI = tensor.matrix('xI', dtype=config.floatX)
        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img'])
    else:
        embImg = xI

    # Auxiliary input: same idea as for the image input.
    xA_is_inp = xAux is None
    if xA_is_inp:
        if attn_nw is None:
            xAux = tensor.matrix('xAux', dtype=config.floatX)
        else:
            xAux = tensor.tensor3('xAux', dtype=config.floatX)
        project_aux = options.get('swap_aux', 1) and attn_nw is None
    else:
        project_aux = options.get('encode_gt_sentences', 0)
    if project_aux:
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
        xAuxEmb = xAux

    embImg = embImg.reshape([1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    # This is the implementation of input dropout.
    if options['use_dropout']:
        emb = dropout_layer(emb,
                            use_noise,
                            self.trng,
                            options['drop_prob_encoder'],
                            shp=emb.shape)
        if options.get('en_aux_inp', 0) and attn_nw is None:
            xAuxEmb = dropout_layer(xAuxEmb,
                                    use_noise,
                                    self.trng,
                                    options['drop_prob_aux'],
                                    shp=xAuxEmb.shape)

    # Scheduled sampling.
    sched_mode = options.get('sched_sampling_mode', None)
    if sched_mode is not None:
        curr_epoch = tensor.scalar(name='curr_epoch', dtype=config.floatX)
        # Assign the probabilities according to the scheduling mode.
        if sched_mode == 'linear':
            prob = tensor.maximum(
                options['sslin_min'], options['sched_sampling_const'] -
                options['sslin_slope'] * curr_epoch)
        elif sched_mode in ('exp', 'invsig'):
            raise ValueError(
                'ERROR: %s --> This solver type is not yet supported' %
                (sched_mode))
        else:
            raise ValueError(
                'ERROR: %s --> This scheduling type is unknown' %
                (sched_mode))
        # Now build the mask.  We don't want to do this coin toss when
        # feeding in the image feature and the start symbol, hence the two
        # rows of ones appended to the sampled mask.
        sched_mask = self.trng.binomial((n_timesteps - 2, n_samples),
                                        p=prob,
                                        n=1,
                                        dtype='int64')
        sched_mask = tensor.concatenate(
            [sched_mask, tensor.alloc(1, 2, n_samples)], axis=0)
    else:
        sched_mask = []

    # This implements the core LSTM.
    rval, updatesLSTM = basic_lstm_layer(tparams,
                                         emb[:n_timesteps, :, :],
                                         xAuxEmb,
                                         use_noise,
                                         options,
                                         prefix=options['generator'],
                                         sched_prob_mask=sched_mask,
                                         attn_nw=attn_nw)

    # NOTE1: we are leaving out the first prediction, which was made for the
    # image and is meaningless.
    lstm_out = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                      options['hidden_size'])
    if options['use_dropout']:
        # XXX: the size given to dropout is missing one dimension.  This
        # keeps the dropped units consistent across time (original authors
        # left this as an open question).
        p = dropout_layer(lstm_out, use_noise, self.trng,
                          options['drop_prob_decoder'],
                          (n_samples, options['hidden_size']))
    else:
        p = lstm_out

    # Input of the class softmax: optionally a different slice of the LSTM
    # output than the one feeding the word softmax.
    # NOTE(review): pC_inp is only bound when class_out_factoring == 1, while
    # the class-factored branch below runs for any non-zero value -- confirm
    # the option is only ever 0 or 1.
    if options.get('class_out_factoring', 0) == 1:
        if options.get('cls_diff_layer', 0) == 1:
            pC_inp = dropout_layer(
                sliceT(rval[0][1:, :, :],
                       options.get('hidden_depth', 1) - 2,
                       options['hidden_size']), use_noise, self.trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
        else:
            pC_inp = p

    n_out_samps = (n_timesteps - 1) * n_samples
    targets = xW[1:, :].flatten()
    rows = tensor.arange(n_out_samps)

    if options.get('class_out_factoring', 0) == 0:
        pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
            [n_out_samps, options['output_size']])
        if options.get('use_gumbel_mse', 0) == 0:
            pWSft = tensor.nnet.softmax(pW)
        else:
            # This is not exactly right, but just testing (original note).
            pWSft = ifelse(
                self.usegumbel,
                gumbel_softmax_sample(self.trng,
                                      pW,
                                      self.gumb_temp,
                                      hard=options.get(
                                          'use_gumbel_hard', False)),
                tensor.nnet.softmax(pW))
        totProb = pWSft[rows, targets]
        out_list = [pWSft, totProb, pW]
    else:
        # Class-factored output: P(word) = P(class) * P(word | class).
        ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
        xC = ixtoclsinfo_t[targets, 0]
        p_flat = p.reshape([1, n_out_samps, options['hidden_size']])
        if options.get('cls_zmean', 1):
            p_flat = p_flat - tparams['WdCls'][:, xC].T
        pW = ((tparams['Wd'][:, xC, :].T * p_flat).sum(axis=-1).T +
              tparams['bd'][:, xC, :])
        pWSft = tensor.nnet.softmax(pW[0, :, :])

        pC = (tensor.dot(pC_inp, tparams['WdCls']) +
              tparams['bdCls']).reshape([n_out_samps, options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)

        totProb = pWSft[rows, ixtoclsinfo_t[targets, 3]] * pCSft[rows, xC]
        out_list = [pWSft, pCSft, totProb, p]

    flat_mask = mask[1:, :].flatten()
    tot_cost = -(tensor.log(totProb + 1e-10) * flat_mask).sum()
    tot_pplx = -(tensor.log2(totProb + 1e-10) * flat_mask).sum()
    cost = [
        tot_cost / tensor.cast(n_samples, dtype=config.floatX), tot_pplx
    ]

    inp_list = [xW, mask]
    if xI_is_inp:
        inp_list.append(xI)
    if options.get('en_aux_inp', 0) and xA_is_inp:
        inp_list.append(xAux)
    if sched_mode is not None:
        inp_list.append(curr_epoch)

    # xAux is always listed as an input here although it is left out of
    # inp_list when 'en_aux_inp' is off, so it may be absent from the graph;
    # don't let Theano's default on_unused_input='raise' abort compilation.
    # NOTE(review): curr_epoch is not an input of f_pred_prob; if the
    # scheduled-sampling mask reaches out_list this compile step would fail
    # with a missing input -- confirm against basic_lstm_layer.
    f_pred_prob = theano.function([xW, xI, xAux],
                                  out_list,
                                  name='f_pred_prob',
                                  updates=updatesLSTM,
                                  on_unused_input='ignore')

    return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
def _stepP(*in_list):
    """Advance the multi-model (ensemble) beam search by one step.

    Relies on names from the enclosing scope (``nmodels``, ``tparams``,
    ``options``, ``prefix``, ``aux_input2``, ``beam_size``).

    Args:
        *in_list: ``nmodels`` word inputs, then ``nmodels`` hidden states,
            then ``nmodels`` cell states, then the running log-probabilities
            and finally the done flags.

    Returns:
        ``(out_list, until)`` where ``out_list`` holds the per-model next
        word embeddings, hidden states and cell states followed by
        ``[xWlogProb, doneVec, xWIdx, xCandIdx]``, and ``until`` is a
        ``theano.scan_module.until`` condition on all of ``doneVec``.
    """
    # Unpack the flat argument list into per-model groups.
    x_inp = [in_list[k] for k in xrange(nmodels)]
    h_inp = [in_list[nmodels + k] for k in xrange(nmodels)]
    c_inp = [in_list[2 * nmodels + k] for k in xrange(nmodels)]
    lP_ = in_list[3 * nmodels]
    dV_ = in_list[3 * nmodels + 1]

    p_comb = tensor.alloc(numpy_floatX(0.), options[0]['output_size'])
    new_cells = []
    new_hiddens = []

    # One LSTM step per model; the word distributions are mixed with each
    # model's 'comb_weight'.
    for k in xrange(nmodels):
        params_k = tparams[k]
        opts_k = options[k]
        n_hid = opts_k['hidden_size']

        preact = tensor.dot(h_inp[k], params_k[_p(prefix, 'W_hid')])
        preact = preact + (
            tensor.dot(x_inp[k], params_k[_p(prefix, 'W_inp')]) +
            params_k[_p(prefix, 'b')])
        if opts_k.get('en_aux_inp', 0):
            preact = preact + tensor.dot(aux_input2[k],
                                         params_k[_p(prefix, 'W_aux')])

        in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, n_hid))
        forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, n_hid))
        out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, n_hid))
        candidate = tensor.tanh(sliceT(preact, 3, n_hid))

        new_cells.append(forget_gate * c_inp[k] + in_gate * candidate)
        new_hiddens.append(out_gate * tensor.tanh(new_cells[k]))

        scores = tensor.dot(new_hiddens[k], params_k['Wd']) + params_k['bd']
        weighted = params_k['comb_weight'] * tensor.nnet.softmax(scores)
        p_comb = weighted if k == 0 else p_comb + weighted

    lProb = tensor.log(p_comb + 1e-20)

    def _FindB_best(lPLcl, lPprev, dVLcl):
        """Return the top ``beam_size`` extensions of a single beam."""
        top_idx = tensor.argsort(-lPLcl)[:beam_size]
        # A finished beam keeps its score in slot 0 and gets -10000
        # everywhere else, with word index 0 for every slot.
        frozen = tensor.fill(lPLcl[top_idx], numpy_floatX(-10000.))
        frozen = tensor.set_subtensor(frozen[0], lPprev)
        still_open = tensor.eq(dVLcl, tensor.zeros_like(dVLcl))
        best_scores = ifelse(still_open, lPLcl[top_idx] + lPprev, frozen)
        best_words = ifelse(still_open, top_idx,
                            tensor.zeros_like(top_idx))
        return best_scores, best_words

    rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                      sequences=[lProb, lP_, dV_],
                                      name=_p(prefix, 'FindBest'),
                                      n_steps=x_inp[0].shape[0])
    lProb = rvalLcl[0].flatten()
    xWIdxBest = rvalLcl[1].flatten()

    # Now sort and find the best among these best extensions for the
    # current beams.
    srtIdx = tensor.argsort(-lProb)[:beam_size]
    xWlogProb = lProb[srtIdx]
    xWIdx = xWIdxBest[srtIdx]
    xCandIdx = srtIdx // beam_size  # Floor division

    doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

    # Re-index every model's state by the surviving candidates.
    word_ids = xWIdx.flatten()
    cand_ids = xCandIdx.flatten()
    out_list = [tparams[k]['Wemb'][word_ids] for k in xrange(nmodels)]
    out_list += [new_hiddens[k].take(cand_ids, axis=0)
                 for k in xrange(nmodels)]
    out_list += [new_cells[k].take(cand_ids, axis=0)
                 for k in xrange(nmodels)]
    out_list += [xWlogProb, doneVec, xWIdx, xCandIdx]

    return out_list, theano.scan_module.until(doneVec.all())
def build_model(self, tparams, options):
    """Build the training graph of the caption generator.

    Word indices ``xW`` are embedded, the image embedding computed from
    ``xI`` is prepended as the first time step, the sequence goes through
    ``self.lstm_layer`` and a scan accumulates the masked negative
    log-likelihood of the words ``xW[1:]``.

    Args:
        tparams: Mapping from parameter name to Theano variable.  Reads
            'Wemb', 'WIemb', 'b_Img', 'Wd' and 'bd'.
        options: Configuration dict.  Reads 'word_encoding_size',
            'image_encoding_size', 'use_dropout', 'generator',
            'hidden_size', 'batch_size', the dropout probabilities
            'drop_prob_encoder' / 'drop_prob_aux' / 'drop_prob_decoder' and
            the optional keys 'en_aux_inp' and 'hidden_depth'.

    Returns:
        Tuple ``(use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM)``.
        ``cost[0]`` is the summed masked negative natural-log likelihood
        divided by ``options['batch_size']``; ``cost[1]`` is the same sum
        taken with log base 2.
    """
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    # The image embedding becomes the first time step of the input sequence.
    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    # This is the implementation of input dropout.  The dropped auxiliary
    # input gets its own name: rebinding `xAux` (as the code used to do) put
    # the dropout output, not the input matrix, into inp_list, so functions
    # compiled from inp_list took the post-dropout expression as their input.
    xAuxIn = xAux
    if options['use_dropout']:
        emb = self.dropout_layer(emb,
                                 use_noise,
                                 trng,
                                 options['drop_prob_encoder'],
                                 shp=emb.shape)
        if options.get('en_aux_inp', 0):
            xAuxIn = self.dropout_layer(xAux,
                                        use_noise,
                                        trng,
                                        options['drop_prob_aux'],
                                        shp=xAux.shape)

    # This implements the core LSTM.
    rval, updatesLSTM = self.lstm_layer(tparams,
                                        emb[:n_timesteps, :, :],
                                        xAuxIn,
                                        use_noise,
                                        options,
                                        prefix=options['generator'],
                                        mask=mask)

    top_hidden = sliceT(rval[0],
                        options.get('hidden_depth', 1) - 1,
                        options['hidden_size'])
    if options['use_dropout']:
        p = self.dropout_layer(top_hidden, use_noise, trng,
                               options['drop_prob_decoder'],
                               (n_samples, options['hidden_size']))
    else:
        p = top_hidden

    p = tensor.dot(p, tparams['Wd']) + tparams['bd']

    # NOTE1: we are leaving out the first prediction, which was made for the
    # image and is meaningless.
    p = p[1:, :, :]

    def accumCost(pred, xW_t, m, c_sum, ppl_sum):
        """Add one time step's masked -log / -log2 word probabilities."""
        pred = tensor.nnet.softmax(pred)
        word_prob = pred[tensor.arange(n_samples), xW_t]
        c_sum = c_sum + -(tensor.log(word_prob + 1e-10) * m).sum()
        ppl_sum = ppl_sum + -(tensor.log2(word_prob + 1e-10) * m).sum()
        return c_sum, ppl_sum

    sums, _ = theano.scan(fn=accumCost,
                          outputs_info=[
                              tensor.as_tensor_variable(numpy_floatX(0.)),
                              tensor.as_tensor_variable(numpy_floatX(0.))
                          ],
                          sequences=[p, xW[1:, :], mask[1:, :]])

    # cost[0]: negative log-likelihood (natural log) divided by the batch
    # size; cost[1]: negative log2-likelihood, the quantity perplexity is
    # derived from.
    cost = [sums[0][-1] / options['batch_size'], sums[1][-1]]

    inp_list = [xW, xI, mask]
    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    f_pred_prob = theano.function(inp_list,
                                  p,
                                  name='f_pred_prob',
                                  updates=updatesLSTM)

    return use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM