def _stepP(x_, h_, c_, lP_, dV_, xAux):
            preact = tensor.dot(h_, tparams[_p(prefix, 'W_hid')])
            preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])
            if options.get('en_aux_inp', 0):
                preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            i = tensor.nnet.sigmoid(sliceT(preact, 0, options['hidden_size']))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, options['hidden_size']))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, options['hidden_size']))
            c = tensor.tanh(sliceT(preact, 3, options['hidden_size']))

            c = f * c_ + i * c

            h = o * tensor.tanh(c)

            p = tensor.dot(h, tparams['Wd']) + tparams['bd']
            p = tensor.nnet.softmax(p)
            lProb = tensor.log(p + 1e-20)

            def _FindB_best(lPLcl, lPprev, dVLcl):
                srtLcl = tensor.argsort(-lPLcl)
                srtLcl = srtLcl[:beam_size]
                deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
                deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                                   lPLcl[srtLcl] + lPprev, deltaVec)
                xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                                   srtLcl, tensor.zeros_like(srtLcl))
                return lProbBest, xWIdxBest

            rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                              sequences=[lProb, lP_, dV_],
                                              name=_p(prefix, 'FindBest'),
                                              n_steps=x_.shape[0])
            xWIdxBest = rvalLcl[1]
            lProbBest = rvalLcl[0]

            xWIdxBest = xWIdxBest.flatten()
            lProb = lProbBest.flatten()

            # Now sort and find the best among these best extensions for the current beams
            srtIdx = tensor.argsort(-lProb)
            srtIdx = srtIdx[:beam_size]
            xWlogProb = lProb[srtIdx]

            xWIdx = xWIdxBest[srtIdx]
            xCandIdx = srtIdx // beam_size  # Floor division

            xW = tparams['Wemb'][xWIdx.flatten()]
            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
            h = h.take(xCandIdx.flatten(), axis=0)
            c = c.take(xCandIdx.flatten(), axis=0)

            return [xW, h, c, xWlogProb, doneVec, xWIdx,
                    xCandIdx], theano.scan_module.until(doneVec.all())
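# The step functions in this file rely on a few small helpers defined elsewhere
# in the project: _p builds parameter names, sliceT cuts one block out of a
# concatenated tensor (e.g. the i/f/o/c blocks of the LSTM pre-activation), and
# numpy_floatX casts scalars to Theano's float type. Minimal sketches of what
# they are assumed to do, modelled on the Theano LSTM tutorial; the project's
# actual definitions may differ.
import numpy
import theano

def _p(prefix, name):
    # Parameter key such as 'lstm_W_hid' from a layer prefix and a name.
    return '%s_%s' % (prefix, name)

def sliceT(x, n, dim):
    # Return the n-th block of width `dim` along the last axis.
    if x.ndim == 3:
        return x[:, :, n * dim:(n + 1) * dim]
    if x.ndim == 2:
        return x[:, n * dim:(n + 1) * dim]
    return x[n * dim:(n + 1) * dim]

def numpy_floatX(data):
    # Cast a Python scalar to theano.config.floatX.
    return numpy.asarray(data, dtype=theano.config.floatX)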
        def _step(x_in, h_, c_):
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += x_in

            #  preact += tparams[_p(prefix, 'b')]
            h = [[]] * h_depth
            c = [[]] * h_depth
            outp = [[]] * h_depth

            for di in xrange(h_depth):
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                c[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                c[di] = f * sliceT(c_, di, h_sz) + i * c[di]
                h[di] = o * tensor.tanh(c[di])
                outp[di] = h[di]
                if self.en_residual_conn:
                    if (di > 0):
                        outp[di] += outp[di - 1]
                        print "Connecting residual at %d" % (di)
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(outp[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            c_out = tensor.concatenate(c, axis=1)
            h_out = tensor.concatenate(h + [outp[-1]], axis=1)

            return h_out, c_out
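# Hedged sketch (not project code): how a step function of this shape is
# typically unrolled over time with theano.scan. All sizes and names below are
# illustrative toys, using the same i/f/o/c slicing convention as above.
import numpy as np
import theano
import theano.tensor as tensor

hid, n_steps, n_batch = 4, 5, 2
floatX = theano.config.floatX
W_hid = theano.shared(0.1 * np.random.randn(hid, 4 * hid).astype(floatX))
x_seq = tensor.tensor3('x_seq')  # (time, batch, 4*hid) input pre-activations

def toy_step(x_t, h_, c_):
    # Single-layer LSTM step: gates from the concatenated pre-activation.
    preact = tensor.dot(h_, W_hid) + x_t
    i = tensor.nnet.sigmoid(preact[:, 0 * hid:1 * hid])
    f = tensor.nnet.sigmoid(preact[:, 1 * hid:2 * hid])
    o = tensor.nnet.sigmoid(preact[:, 2 * hid:3 * hid])
    g = tensor.tanh(preact[:, 3 * hid:4 * hid])
    c = f * c_ + i * g
    h = o * tensor.tanh(c)
    return h, c

zeros = tensor.alloc(np.asarray(0., dtype=floatX), n_batch, hid)
(h_seq, c_seq), scan_updates = theano.scan(toy_step, sequences=[x_seq],
                                           outputs_info=[zeros, zeros],
                                           n_steps=n_steps)
f_run = theano.function([x_seq], h_seq)
print(f_run(np.zeros((n_steps, n_batch, 4 * hid), dtype=floatX)).shape)  # (5, 2, 4)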
        def _step(m_, x_, h_, c_, xAux):
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += x_
            if options.get('en_aux_inp', 0):
                preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            #  preact += tparams[_p(prefix, 'b')]
            h = [[]] * h_depth
            c = [[]] * h_depth

            for di in xrange(h_depth):
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                c[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                c[di] = f * sliceT(c_, di, h_sz) + i * c[di]
                h[di] = o * tensor.tanh(c[di])
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(h[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            c_out = tensor.concatenate(c, axis=1)
            h_out = tensor.concatenate(h, axis=1)

            return h_out, c_out
        def _stepP(x_, h_, c_, lP_, dV_, xAux):
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])
            if options.get('en_aux_inp', 0):
                preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            hL = [[]] * h_depth
            cL = [[]] * h_depth
            for di in xrange(h_depth):
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
                hL[di] = o * tensor.tanh(cL[di])
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(hL[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            c = tensor.concatenate(cL, axis=1)
            h = tensor.concatenate(hL, axis=1)

            p = tensor.dot(hL[-1], tparams['Wd']) + tparams['bd']
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(
                options.get('softmax_smooth_factor', 1.0)),
                                                      name='sm_f')
            p = tensor.nnet.softmax(p * smooth_factor)
            lProb = tensor.log(p + 1e-20)

            #xCandIdx = tensor.as_tensor_variable([0])
            lProb = lProb.flatten()
            xWIdx = tensor.argmax(lProb, keepdims=True)
            xWlogProb = lProb[xWIdx] + lP_
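            # Feed back either the embedding of the argmax word or, when
            # 'softmax_propogate' is set, the expected embedding p.dot(Wemb)
            # under the predicted distribution.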

            if options.get('softmax_propogate', 0) == 0:
                xW = tparams['Wemb'][xWIdx.flatten()]
            else:
                xW = p.dot(tparams['Wemb'])
            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            return [xW, h, c, xWlogProb, doneVec, xWIdx,
                    p], theano.scan_module.until(doneVec.all())
  def build_model(self, tparams, options):
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')

    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape([n_timesteps,
                                                n_samples,
                                                options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape([1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0) 

    # This implements input dropout
    if options['use_dropout']:
        emb = self.dropout_layer(emb, use_noise, trng, options['drop_prob_encoder'], shp = emb.shape)
        if options.get('en_aux_inp',0):
            xAux = self.dropout_layer(xAux, use_noise, trng, options['drop_prob_aux'], shp = xAux.shape)

    # This implements core lstm
    rval, updatesLSTM = self.lstm_layer(tparams, emb[:n_timesteps,:,:], xAux, use_noise, options, prefix=options['generator'],
                                mask=mask)
    if options['use_dropout']:
        p = self.dropout_layer(sliceT(rval[0],options.get('hidden_depth',1)-1,options['hidden_size']), use_noise, trng,
            options['drop_prob_decoder'], (n_samples,options['hidden_size']))
    else:
        p = sliceT(rval[0],options.get('hidden_depth',1)-1,options['hidden_size'])


    p = tensor.dot(p,tparams['Wd']) + tparams['bd']

    #pred = tensor.nnet.softmax(p)

    #pred = rval[2]

    #pred = pred[1:,:,:]
    p = p[1:,:,:]

    def accumCost(pred, xW, m, c_sum, ppl_sum):
        pred = tensor.nnet.softmax(pred)
        c_sum += -(tensor.log(pred[tensor.arange(n_samples), xW]+1e-10) * m).sum()
        ppl_sum += -(tensor.log2(pred[tensor.arange(n_samples), xW]+1e-10) * m).sum()
        return c_sum, ppl_sum
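    # accumCost above adds up, per timestep, the masked negative log-likelihood
    # (natural log) and its base-2 counterpart used for perplexity; it is
    # scanned over all timesteps below.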

    sums, upd = theano.scan(fn=accumCost, 
                                outputs_info=[tensor.as_tensor_variable(numpy_floatX(0.)), 
                                              tensor.as_tensor_variable(numpy_floatX(0.))],
                                sequences = [p, xW[1:,:], mask[1:,:]])

    # NOTE1: we are leaving out the first prediction, which was made for the image
    # and is meaningless. Here cost[0] contains the negative log-likelihood (natural log,
    # normalized by the batch size) and cost[1] the log2 sum used for perplexity.
    cost = [sums[0][-1]/options['batch_size'], sums[1][-1]]

    inp_list = [xW, xI, mask]

    if options.get('en_aux_inp',0):
        inp_list.append(xAux)

    f_pred_prob = theano.function(inp_list, p, name='f_pred_prob', updates=updatesLSTM)


    return use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM 
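# build_model above relies on a dropout_layer helper defined elsewhere in the
# project. A minimal sketch of one common Theano formulation (an assumption;
# the actual helper may differ, e.g. by rescaling at training time instead of
# scaling at test time):
import theano.tensor as tensor

def dropout_layer(state_before, use_noise, trng, prob, shp):
    # When use_noise is 1 (training), multiply by a Bernoulli(1 - prob) mask;
    # when it is 0 (testing), scale activations by the keep probability.
    proj = tensor.switch(
        use_noise,
        state_before * trng.binomial(shp, p=1. - prob, n=1,
                                     dtype=state_before.dtype),
        state_before * (1. - prob))
    return proj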
        def _stepP(x_, h_, c_, lP_, dV_, xAux):
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])
            if options.get('en_aux_inp', 0):
                preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            hL = [[]] * h_depth
            cL = [[]] * h_depth
            outp = [[]] * h_depth
            for di in xrange(h_depth):
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
                hL[di] = o * tensor.tanh(cL[di])
                outp[di] = hL[di]
                if options.get('en_residual_conn', 1):
                    if (di > 0):
                        outp[di] += outp[di - 1]
                        print "Connecting residual at %d" % (di)
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(outp[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            c = tensor.concatenate(cL, axis=1)
            h = tensor.concatenate(hL, axis=1)

            if options.get('class_out_factoring', 0) == 1:
                pC = tensor.dot(outp[-1], tparams['WdCls']) + tparams['bdCls']
                pCSft = tensor.nnet.softmax(pC)
                xCIdx = tensor.argmax(pCSft)
                pW = tensor.dot(
                    outp[-1],
                    tparams['Wd'][:, xCIdx, :]) + tparams['bd'][:, xCIdx, :]
                smooth_factor = tensor.as_tensor_variable(numpy_floatX(
                    options.get('softmax_smooth_factor', 1.0)),
                                                          name='sm_f')
                pWSft = tensor.nnet.softmax(pW * smooth_factor)
                lProb = tensor.log(pWSft +
                                   1e-20) + tensor.log(pCSft[0, xCIdx] + 1e-20)
            else:
                p = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd']
                smooth_factor = tensor.as_tensor_variable(numpy_floatX(
                    options.get('softmax_smooth_factor', 1.0)),
                                                          name='sm_f')
                p = tensor.nnet.softmax(p * smooth_factor)
                lProb = tensor.log(p + 1e-20)

            if beam_size > 1:

                def _FindB_best(lPLcl, lPprev, dVLcl):
                    srtLcl = tensor.argsort(-lPLcl)
                    srtLcl = srtLcl[:beam_size]
                    deltaVec = tensor.fill(lPLcl[srtLcl],
                                           numpy_floatX(-10000.))
                    deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                    lProbBest = ifelse(
                        tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                        lPLcl[srtLcl] + lPprev, deltaVec)
                    xWIdxBest = ifelse(
                        tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), srtLcl,
                        tensor.zeros_like(srtLcl))
                    return lProbBest, xWIdxBest

                rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                                  sequences=[lProb, lP_, dV_],
                                                  name=_p(prefix, 'FindBest'),
                                                  n_steps=x_.shape[0])
                xWIdxBest = rvalLcl[1]
                lProbBest = rvalLcl[0]

                xWIdxBest = xWIdxBest.flatten()
                lProb = lProbBest.flatten()
                # Now sort and find the best among these best extensions for the current beams
                srtIdx = tensor.argsort(-lProb)
                srtIdx = srtIdx[:beam_size]
                xCandIdx = srtIdx // beam_size  # Floor division
                h = h.take(xCandIdx.flatten(), axis=0)
                c = c.take(xCandIdx.flatten(), axis=0)
                xWlogProb = lProb[srtIdx]
                xWIdx = xWIdxBest[srtIdx]
            else:
                xCandIdx = tensor.as_tensor_variable([0])
                lProb = lProb.flatten()
                xWIdx = tensor.argmax(lProb, keepdims=True)
                xWlogProb = lProb[xWIdx] + lP_
                if options.get('class_out_factoring', 0) == 1:
                    clsoffset = tensor.as_tensor_variable(
                        options['ixtoclsinfo'][:, 0])
                    xWIdx += clsoffset[xCIdx]
                h = h.take(xCandIdx.flatten(), axis=0)
                c = c.take(xCandIdx.flatten(), axis=0)

            if options.get('softmax_propogate', 0) == 0:
                xW = tparams['Wemb'][xWIdx.flatten()]
            else:
                xW = p.dot(tparams['Wemb'])
            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            return [xW, h, c, xWlogProb, doneVec, xWIdx,
                    xCandIdx], theano.scan_module.until(doneVec.all())
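# Tiny numpy illustration (not project code) of the re-ranking done in the
# beam_size > 1 branch above: each live beam proposes its beam_size best
# extensions; after flattening, srtIdx // beam_size recovers the parent beam
# and xWIdxBest[srtIdx] the word chosen for each surviving candidate.
import numpy as np

beam_size = 3
lProbBest = np.array([[-1.0, -2.5, -3.0],   # per-beam scores of its top-3 words
                      [-0.5, -2.0, -4.0],
                      [-1.5, -1.6, -1.7]])
xWIdxBest = np.array([[11, 42, 7],          # word ids proposed by each beam
                      [3, 11, 99],
                      [42, 8, 5]])

lProb = lProbBest.flatten()
srtIdx = np.argsort(-lProb)[:beam_size]
print(srtIdx // beam_size)              # parent beam of each surviving candidate
print(xWIdxBest.flatten()[srtIdx])      # word id picked for each candidate
print(lProb[srtIdx])                    # their accumulated log probabilities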
    def build_model(self, tparams, options):
        trng = RandomStreams(1234)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')

        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])

        embW_rev = tparams['Wemb'][xW[::-1, :].flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        if options.get('swap_aux', 0):
            xAuxEmb = tensor.dot(xAux,
                                 tparams['WIemb_aux']) + tparams['b_Img_aux']
        else:
            xAuxEmb = xAux

        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        emb_rev = tensor.set_subtensor(
            embW_rev[mask[::-1, :].argmax(axis=0) - 1,
                     tensor.arange(n_samples), :], embImg[0, :, :])

        # This implements input dropout
        if options['use_dropout']:
            emb = dropout_layer(emb,
                                use_noise,
                                trng,
                                options['drop_prob_encoder'],
                                shp=emb.shape)
            if options.get('en_aux_inp', 0):
                xAuxEmb = dropout_layer(xAuxEmb,
                                        use_noise,
                                        trng,
                                        options['drop_prob_aux'],
                                        shp=xAuxEmb.shape)

        #############################################################################################################################
        # This implements core lstm
        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix='lstm',
                                             sched_prob_mask=[])
        #############################################################################################################################
        # This implements core reverse lstm
        rev_rval, rev_updatesLSTM = basic_lstm_layer(
            tparams,
            emb_rev[:n_timesteps, :, :],
            xAuxEmb,
            use_noise,
            options,
            prefix='rev_lstm',
            sched_prob_mask=[])
        #############################################################################################################################

        # NOTE1: we are leaving out the first prediction, which was made for the image and is meaningless.
        if options['use_dropout']:
            # XXX : Size given to dropout is missing one dimension. This keeps the dropped units consistent across time!?.
            # ###   Is this a good bug ?
            p = dropout_layer(
                sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size']), use_noise, trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
            rev_p = dropout_layer(
                sliceT(rev_rval[0][:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size']), use_noise, trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
        else:
            p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size'])
            rev_p = sliceT(rev_rval[0][:, :, :],
                           options.get('hidden_depth',
                                       1), options['hidden_size'])

        n_out_samps = (n_timesteps - 2) * n_samples
        if options.get('class_out_factoring', 0) == 0:
            pW = (tensor.dot(p[:-1, :, :] + rev_p[::-1, :, :][2:, :, :],
                             tparams['Wd']) + tparams['bd']).reshape(
                                 [n_out_samps, options['output_size']])
            pWSft = tensor.nnet.softmax(pW)
            totProb = pWSft[tensor.arange(n_out_samps), xW[1:-1, :].flatten()]
            out_list = [pWSft, totProb, p]
        else:
            ixtoclsinfo_t = tensor.as_tensor_variable(options['ixtoclsinfo'])
            xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
            pW = ((tparams['Wd'][:, xC, :].T *
                   (p.reshape([1, n_out_samps, options['hidden_size']]))).sum(
                       axis=-1).T + tparams['bd'][:, xC, :])
            pWSft = tensor.nnet.softmax(pW[0, :, :])
            pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
                [n_out_samps, options['nClasses']])
            pCSft = tensor.nnet.softmax(pC)

            totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                      pCSft[tensor.arange(n_out_samps), xC]
            out_list = [pWSft, pCSft, totProb, p]

        # XXX : THIS IS VERY FISHY, CHECK THE MASK INDEXING AGAIN
        probs_valid = tensor.log(totProb + 1e-10) * mask[1:-1, :].flatten()
        tot_cost = -(probs_valid.sum())
        tot_pplx = -(tensor.log2(totProb + 1e-10) *
                     mask[1:-1, :].flatten()).sum()
        cost = [tot_cost / options['batch_size'], tot_pplx]

        inp_list = [xW, mask, xI]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        if options.get('sched_sampling_mode', None) != None:
            inp_list.append(curr_epoch)

        per_sent_prob = probs_valid.reshape([n_timesteps - 2,
                                             n_samples]).sum(axis=0)
        f_per_sentLogP = theano.function(inp_list,
                                         per_sent_prob,
                                         name='f_pred_logprob',
                                         updates=updatesLSTM)
        f_pred_prob = ['', f_per_sentLogP, '']

        return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp',0):
            preact += tensor.dot(xAux,tparams[_p(prefix,'W_aux')])
  
        hL = [[]]*h_depth 
        cL = [[]]*h_depth 
        for di in xrange(h_depth):
            i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
            cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
            cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
            hL[di] = o * tensor.tanh(cL[di])
            if di < (h_depth - 1):
                preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                        tensor.dot(hL[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])
        
        c = tensor.concatenate(cL,axis=1)
        h = tensor.concatenate(hL,axis=1)
  
        if options.get('class_out_factoring',0) == 1:
            pC    = tensor.dot(hL[-1],tparams['WdCls']) + tparams['bdCls']
            pCSft = tensor.nnet.softmax(pC)
            xCIdx =  tensor.argmax(pCSft)
            pW = tensor.dot(hL[-1], tparams['Wd'][:,xCIdx,:]) + tparams['bd'][:,xCIdx,:]  # top layer's hidden state, not a row of the concatenated h
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f')
            pWSft = tensor.nnet.softmax(pW*smooth_factor)
            lProb = tensor.log(pWSft + 1e-20) + tensor.log(pCSft[0,xCIdx] + 1e-20)
        else:
            p = tensor.dot(hL[-1],tparams['Wd']) + tparams['bd']
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f')
            p = tensor.nnet.softmax(p*smooth_factor)
            lProb = tensor.log(p + 1e-20)

        if beam_size > 1:
            def _FindB_best(lPLcl, lPprev, dVLcl):
                srtLcl = tensor.argsort(-lPLcl)
                srtLcl = srtLcl[:beam_size]
                deltaVec = tensor.fill( lPLcl[srtLcl], numpy_floatX(-10000.))
                deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                lProbBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), lPLcl[srtLcl] + lPprev, deltaVec)
                xWIdxBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), srtLcl, tensor.zeros_like(srtLcl)) 
                return lProbBest, xWIdxBest 
  
            rvalLcl, updatesLcl = theano.scan(_FindB_best, sequences = [lProb, lP_, dV_], name=_p(prefix, 'FindBest'), n_steps=x_.shape[0])
            xWIdxBest = rvalLcl[1]
            lProbBest = rvalLcl[0]
  
            xWIdxBest = xWIdxBest.flatten()
            lProb = lProbBest.flatten()
            # Now sort and find the best among these best extensions for the current beams
            srtIdx = tensor.argsort(-lProb)
            srtIdx = srtIdx[:beam_size]
            xCandIdx = srtIdx // beam_size # Floor division 
            h = h.take(xCandIdx.flatten(),axis=0)
            c = c.take(xCandIdx.flatten(),axis=0)
            xWlogProb = lProb[srtIdx]
            xWIdx = xWIdxBest[srtIdx]
        else:
            xCandIdx = tensor.as_tensor_variable([0]) 
            lProb = lProb.flatten()
            xWIdx =  tensor.argmax(lProb,keepdims=True)
            xWlogProb = lProb[xWIdx] + lP_
            if options.get('class_out_factoring',0) == 1:
                clsoffset = tensor.as_tensor_variable(options['ixtoclsinfo'][:,0])
                xWIdx += clsoffset[xCIdx]
            h = h.take(xCandIdx.flatten(),axis=0)
            c = c.take(xCandIdx.flatten(),axis=0)
        
        if options.get('softmax_propogate',0) == 0:
            xW = tparams['Wemb'][xWIdx.flatten()]
        else:
            xW = p.dot(tparams['Wemb'])
        doneVec = tensor.eq(xWIdx,tensor.zeros_like(xWIdx))
  
        return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], theano.scan_module.until(doneVec.all())
    def build_model(self, tparams, options):
        trng = RandomStreams(1234)

        # Used for dropout.
        self.use_noise = theano.shared(numpy_floatX(0.))

        if self.use_shared_features == False:
            xI = tensor.tensor3('xI', dtype=config.floatX)
            xIemb = xI
            n_timesteps = xI.shape[0]
            n_samples = xI.shape[1]
        else:
            xI = tensor.matrix('xI', dtype='int64')
            n_timesteps = xI.shape[0]
            n_samples = xI.shape[1]
            #feats = tensor.concatenate([self.features,tensor.alloc(numpy_floatX(0.),self.image_feat_size,1)],axis=1).T
            xIemb = self.features[xI.flatten(), :].reshape(
                [n_timesteps, n_samples, self.image_feat_size])

        samp_lens = tensor.vector('sL', dtype='int64')

        # This implements input dropout
        if options['use_dropout']:
            emb = dropout_layer(xIemb,
                                self.use_noise,
                                trng,
                                options['drop_prob_encoder'],
                                shp=xIemb.shape)
        else:
            # Without dropout, pass the embeddings through unchanged so that
            # `emb` is defined on both code paths (it feeds the LSTM below).
            emb = xIemb

        #############################################################################################################################
        # This implements core lstm
        rval, updatesLSTM = self.lstm_enc_layer(tparams,
                                                emb,
                                                prefix=self.mp + 'lstm')
        #############################################################################################################################
        # This implements core reverse lstm
        if self.encoder == 'bilstm':
            rev_rval, rev_updatesLSTM = basic_lstm_layer(tparams,
                                                         emb[::-1, :, :],
                                                         prefix=self.mp +
                                                         'rev_lstm')
        #############################################################################################################################

        # NOTE1: we are leaving out the first prediction, which was made for the image and is meaningless.
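        # rval[0][samp_lens, tensor.arange(n_samples), :] picks, for each sample,
        # the hidden state at that sample's last timestep (samp_lens holds the
        # per-sample sequence lengths).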
        p = sliceT(rval[0][samp_lens, tensor.arange(n_samples), :],
                   self.hidden_depth, self.hidden_size)

        if self.encoder == 'bilstm':
            rev_p = sliceT(rev_rval[0][-1, :, :], self.hidden_depth,
                           self.hidden_size)

        feat_enc = p + rev_p if self.encoder == 'bilstm' else p

        if options.get('encoder_add_mean', 0):
            feat_enc = feat_enc + (sliceT(rval[0], self.hidden_depth,
                                          self.hidden_size).sum(axis=0) /
                                   samp_lens[:, None])

        inp_list = [xI, samp_lens]

        return self.use_noise, inp_list, feat_enc, updatesLSTM
    def _stepP(*in_list):
        x_inp = []
        h_inp = []
        c_inp = []
        for i in xrange(nmodels):
            x_inp.append(in_list[i])
            h_inp.append(in_list[nmodels+i])
            c_inp.append(in_list[2*nmodels+i])
        lP_ = in_list[3*nmodels]
        dV_ = in_list[3*nmodels+1]

        p_comb = tensor.alloc(numpy_floatX(0.), options[0]['output_size'])
        cf = []
        h = []
        xW = []
        for i in xrange(nmodels):
            preact = tensor.dot(h_inp[i], tparams[i][_p(prefix, 'W_hid')])
            preact += (tensor.dot(x_inp[i], tparams[i][_p(prefix, 'W_inp')]) +
                       tparams[i][_p(prefix, 'b')])
            if options[i].get('en_aux_inp',0):
                preact += tensor.dot(aux_input2[i],tparams[i][_p(prefix,'W_aux')])
  
            inp = tensor.nnet.sigmoid(sliceT(preact, 0, options[i]['hidden_size']))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, options[i]['hidden_size']))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, options[i]['hidden_size']))
            c = tensor.tanh(sliceT(preact, 3, options[i]['hidden_size']))
  
            cf.append(f * c_inp[i] + inp * c)
  
            h.append(o * tensor.tanh(cf[i]))
            p = tensor.dot(h[i],tparams[i]['Wd']) + tparams[i]['bd']
            if i == 0:
                p_comb = tparams[i]['comb_weight']*tensor.nnet.softmax(p)
            else:    
                p_comb += tparams[i]['comb_weight']*tensor.nnet.softmax(p)
        
        lProb = tensor.log(p_comb + 1e-20)
        def _FindB_best(lPLcl, lPprev, dVLcl):
            srtLcl = tensor.argsort(-lPLcl)
            srtLcl = srtLcl[:beam_size]
            deltaVec = tensor.fill( lPLcl[srtLcl], numpy_floatX(-10000.))
            deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
            lProbBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), lPLcl[srtLcl] + lPprev, deltaVec)
            xWIdxBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), srtLcl, tensor.zeros_like(srtLcl)) 
            return lProbBest, xWIdxBest 
  
        rvalLcl, updatesLcl = theano.scan(_FindB_best, sequences = [lProb, lP_, dV_], name=_p(prefix, 'FindBest'), n_steps=x_inp[0].shape[0])
        xWIdxBest = rvalLcl[1]
        lProbBest = rvalLcl[0]
  
        xWIdxBest = xWIdxBest.flatten()
        lProb = lProbBest.flatten()
  
        # Now sort and find the best among these best extensions for the current beams
        srtIdx = tensor.argsort(-lProb)
        srtIdx = srtIdx[:beam_size]
        xWlogProb = lProb[srtIdx]
  
        xWIdx = xWIdxBest[srtIdx]
        xCandIdx = srtIdx // beam_size # Floor division 
  
        doneVec = tensor.eq(xWIdx,tensor.zeros_like(xWIdx))
        
        x_out = []
        h_out = []
        c_out = []
        for i in xrange(nmodels):
            x_out.append(tparams[i]['Wemb'][xWIdx.flatten()])
            h_out.append(h[i].take(xCandIdx.flatten(),axis=0))
            c_out.append(cf[i].take(xCandIdx.flatten(),axis=0))

        out_list = []
        out_list.extend(x_out)
        out_list.extend(h_out)
        out_list.extend(c_out)
        out_list.extend([xWlogProb, doneVec, xWIdx, xCandIdx])
  
        return out_list, theano.scan_module.until(doneVec.all())
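# Small numpy illustration (not project code) of how the ensemble step above
# combines models: each model's softmax is weighted by its 'comb_weight' and
# the beam search then operates on the log of the mixed distribution. The
# weights and scores used here are made-up values.
import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

comb_weight = [0.6, 0.4]                    # hypothetical per-model weights
logits = [np.array([[2.0, 0.5, -1.0]]),     # model 0 scores for 3 words
          np.array([[0.2, 1.5, 0.1]])]      # model 1 scores for 3 words

p_comb = sum(w * softmax(l) for w, l in zip(comb_weight, logits))
lProb = np.log(p_comb + 1e-20)
print(p_comb, p_comb.sum())                 # the mixture still sums to 1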
  def build_model(self, tparams, options):
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')

    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape([n_timesteps,
                                                n_samples,
                                                options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    if options.get('swap_aux',0):
       xAuxEmb = tensor.dot(xAux,tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
       xAuxEmb = xAux


    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape([1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0) 

    # This implements input dropout
    if options['use_dropout']:
        emb = dropout_layer(emb, use_noise, trng, options['drop_prob_encoder'], shp = emb.shape)
        if options.get('en_aux_inp',0):
            xAuxEmb = dropout_layer(xAuxEmb, use_noise, trng, options['drop_prob_aux'], shp = xAuxEmb.shape)

    # Implement scheduled sampling!
    if options.get('sched_sampling_mode',None) != None: 
        curr_epoch = tensor.scalar(name='curr_epoch',dtype=config.floatX)        
        
        # Assign the probabilities according to the scheduling mode
        if options['sched_sampling_mode'] == 'linear':
            prob = tensor.maximum(options['sslin_min'],options['sched_sampling_const'] - options['sslin_slope'] * curr_epoch)
        elif options['sched_sampling_mode'] == 'exp': 
            raise ValueError('ERROR: %s --> This solver type is not yet supported'%(options['sched_sampling_mode']))
        elif options['sched_sampling_mode'] == 'invsig': 
            raise ValueError('ERROR: %s --> This solver type is not yet supported'%(options['sched_sampling_mode']))
        else:
            raise ValueError('ERROR: %s --> This scheduling type is unknown'%(options['sched_sampling_mode']))
        
        # Now build the mask. We don't want to do this coin toss when
        # feeding in the image feature and the start symbol
        sched_mask = trng.binomial((n_timesteps - 2, n_samples), p=prob, n=1, dtype='int64')
        sched_mask = tensor.concatenate([sched_mask, tensor.alloc(1, 2, n_samples)],axis=0)
    else: 
        sched_mask = []
        

    #############################################################################################################################
    # This implements core lstm
    rval, updatesLSTM = basic_lstm_layer(tparams, emb[:n_timesteps,:,:], xAuxEmb, use_noise, options,
                                         prefix=options['generator'], sched_prob_mask = sched_mask)
    #############################################################################################################################
    
    
    # NOTE1: we are leaving out the first prediction, which was made for the image and is meaningless. 
    if options['use_dropout']:
        # XXX : Size given to dropout is missing one dimension. This keeps the dropped units consistent across time!?.
        # ###   Is this a good bug ? 
        p = dropout_layer(sliceT(rval[0][1:,:,:],options.get('hidden_depth',1)-1,options['hidden_size']), use_noise, trng,
            options['drop_prob_decoder'], (n_samples,options['hidden_size']))
    else:
        p = sliceT(rval[0][1:,:,:],options.get('hidden_depth',1)-1,options['hidden_size'])

    n_out_samps = (n_timesteps-1) * n_samples 
    if options.get('class_out_factoring',0) == 0:
        pW = (tensor.dot(p,tparams['Wd']) + tparams['bd']).reshape([n_out_samps,options['output_size']])
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[tensor.arange(n_out_samps), xW[1:,:].flatten()]
        out_list = [pWSft, totProb, p]
    else:
        ixtoclsinfo_t = tensor.as_tensor_variable(options['ixtoclsinfo'])
        xC = ixtoclsinfo_t[xW[1:,:].flatten(),0]
        pW = ((tparams['Wd'][:,xC,:].T*(p.reshape([1,n_out_samps,options['hidden_size']]))).sum(axis=-1).T 
             + tparams['bd'][:,xC,:])
        pWSft   = tensor.nnet.softmax(pW[0,:,:])
        pC    = (tensor.dot(p,tparams['WdCls']) + tparams['bdCls']).reshape([n_out_samps,options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)
        
        totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                  pCSft[tensor.arange(n_out_samps), xC]
        out_list = [pWSft, pCSft, totProb, p]
    
    tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:,:].flatten()).sum()
    tot_pplx = -(tensor.log2(totProb + 1e-10) * mask[1:,:].flatten()).sum()
    cost = [tot_cost/options['batch_size'], tot_pplx]

    inp_list = [xW, mask, xI]

    if options.get('en_aux_inp',0):
        inp_list.append(xAux)

    if options.get('sched_sampling_mode',None) != None:
        inp_list.append(curr_epoch)

    f_pred_prob = [] 
    #theano.function(inp_list, p, name='f_pred_prob', updates=updatesLSTM)


    return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
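# Hedged usage sketch (not the project's solver): the cost returned by
# build_model can be compiled into a simple SGD training function. `tparams`
# is assumed to be an OrderedDict of Theano shared variables, as elsewhere in
# this code; the actual project uses its own solver utilities instead.
import theano
import theano.tensor as tensor

def compile_sgd_trainer(tparams, inp_list, cost, updatesLSTM):
    # Gradients are taken on the normalized negative log-likelihood, cost[0].
    grads = tensor.grad(cost[0], wrt=list(tparams.values()))
    lr = tensor.scalar(name='lr')
    sgd_updates = [(p, p - lr * g) for p, g in zip(tparams.values(), grads)]
    f_train = theano.function(inp_list + [lr], cost,
                              updates=list(updatesLSTM.items()) + sgd_updates,
                              name='f_train')
    return f_train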
    def build_eval_other_sent(self, tparams, options, model_npy):

        zipp(model_npy, self.model_th)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')
        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]
        n_out_samps = (n_timesteps - 1) * n_samples

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        if options.get('swap_aux', 0):
            xAuxEmb = tensor.dot(xAux,
                                 tparams['WIemb_aux']) + tparams['b_Img_aux']
        else:
            xAuxEmb = xAux

        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix=options['generator'])
        p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size'])

        pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
            [n_out_samps, options['output_size']])

        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]

        #    #pred = tensor.nnet.softmax(p)
        #
        #    #pred = rval[2]
        #
        #    #pred = pred[1:,:,:]
        #
        #    def accumCost(pred,xW,m,c_sum,ppl_sum):
        #        pred = tensor.nnet.softmax(pred)
        #        c_sum += (tensor.log(pred[tensor.arange(n_samples), xW]+1e-20) * m)
        #        ppl_sum += -(tensor.log2(pred[tensor.arange(n_samples), xW]+1e-10) * m)
        #        return c_sum, ppl_sum
        #
        #    sums, upd = theano.scan(fn=accumCost,
        #                                outputs_info=[tensor.alloc(numpy_floatX(0.), 1,n_samples),
        #                                              tensor.alloc(numpy_floatX(0.), 1,n_samples)],
        #                                sequences = [p, xW[1:,:], mask[1:,:]])
        # NOTE1: we are leaving out the first prediction, which was made for the image
        # and is meaningless. Here cost is the negative log-likelihood (natural log),
        # normalized by the batch size.
        tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).sum()
        cost = tot_cost / options['batch_size']

        inp_list = [xW, mask, xI]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        self.f_pred_prob_other = theano.function(inp_list,
                                                 p,
                                                 name='f_pred_prob',
                                                 updates=updatesLSTM)
        #f_pred = theano.function([xW, mask], pred.argmax(axis=1), name='f_pred')

        #cost = -tensor.log(pred[tensor.arange(n_timesteps),tensor.arange(n_samples), xW] + 1e-8).mean()

        self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

        return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
# Example 16
    def build_eval_other_sent(self, tparams, options, model_npy):

        zipp(model_npy, self.model_th)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')
        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]
        n_out_samps = (n_timesteps - 1) * n_samples

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        if options.get('swap_aux', 0):
            xAuxEmb = tensor.dot(xAux,
                                 tparams['WIemb_aux']) + tparams['b_Img_aux']
        else:
            xAuxEmb = xAux

        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix=options['generator'])
        p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size'])

        if options.get('class_out_factoring', 0) == 0:
            pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
                [n_out_samps, options['output_size']])
            pWSft = tensor.nnet.softmax(pW)
            totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]
            out_list = [pWSft, totProb, p]
        else:
            ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
            xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
            pW = ((tparams['Wd'][:, xC, :].T *
                   ((p.reshape([1, n_out_samps, options['hidden_size']]) -
                     tparams['WdCls'][:, xC].T))).sum(axis=-1).T +
                  tparams['bd'][:, xC, :])
            pWSft = tensor.nnet.softmax(pW[0, :, :])
            pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
                [n_out_samps, options['nClasses']])
            pCSft = tensor.nnet.softmax(pC)

            totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                      pCSft[tensor.arange(n_out_samps), xC]

        tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()
                     ).reshape([n_timesteps - 1, n_samples])
        cost = tot_cost.sum(axis=0)

        inp_list = [xW, mask, xI]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        self.f_pred_prob_other = theano.function([xW, xI, xAux],
                                                 pWSft,
                                                 name='f_pred_prob',
                                                 updates=updatesLSTM)
        #f_pred = theano.function([xW, mask], pred.argmax(axis=1), name='f_pred')

        #cost = -tensor.log(pred[tensor.arange(n_timesteps),tensor.arange(n_samples), xW] + 1e-8).mean()

        self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

        return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
# Example 17
        def _stepP(U, xW_, h_, c_, lP_, dV_, xAux, xNoise):
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += (tensor.dot(xW_, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])
            preact += xAux
            if options.get('gen_input_noise', 0):
                preact += xNoise

            hL = [[]] * h_depth
            cL = [[]] * h_depth
            outp = [[]] * h_depth
            for di in xrange(h_depth):
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
                hL[di] = o * tensor.tanh(cL[di])
                outp[di] = hL[di]
                if options.get('en_residual_conn', 1):
                    if (di > 0):
                        outp[di] += outp[di - 1]
                        print "Connecting residual at %d" % (di)
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(outp[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            c = tensor.concatenate(cL, axis=1)
            h = tensor.concatenate(hL, axis=1)

            logits = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd']
            #p = tensor.dot(outp[-1],l2norm(tparams['Wd'],axis=0))# + tparams['bd']
            if options.get('use_gumbel_mse', 0) == 0 or options.get(
                    'greedy', 0):
                p = tensor.nnet.softmax(logits)
            else:
                p = gumbel_softmax_sample(
                    self.trng, logits * self.softmax_smooth_factor,
                    self.gumb_temp, U, options.get('use_gumbel_hard', False))

            if options.get('computelogprob', 0):
                lProb = tensor.log(
                    tensor.nnet.softmax(logits * self.softmax_smooth_factor) +
                    1e-20)
            else:
                lProb = logits

            # Idx of the correct word should come from the
            xWIdx = ~dV_ * tensor.argmax(p, axis=-1)

            xWlogProb = ~dV_ * lProb[tensor.arange(nBatchSamps * n_samp),
                                     xWIdx] + lP_
            #xW = tparams['Wemb'][xWIdx.flatten()]
            if options.get('use_gumbel_hard', 0) and options.get(
                    'use_gumbel_mse', 0) and not options.get('greedy', 0):
                xW = p.dot(tparams['Wemb'])
            else:
                xW = theano.gradient.disconnected_grad(
                    tparams['Wemb'][xWIdx.flatten()].reshape(
                        [xWIdx.shape[0], -1]))

            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            return [xW, h, c, xWlogProb, doneVec, xWIdx,
                    p], theano.scan_module.until(doneVec.all())
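# The sampler step above calls gumbel_softmax_sample, defined elsewhere in the
# project. A sketch of the standard Gumbel-softmax relaxation it is assumed to
# implement (argument order taken from the call site above; the real helper,
# in particular its hard/straight-through variant, may differ):
import theano
import theano.tensor as tensor

def gumbel_softmax_sample(trng, logits, temperature, U=None, hard=False):
    # Gumbel noise g = -log(-log(u)) with u ~ Uniform(0, 1); U may be supplied
    # (e.g. pre-drawn once per scan step) or drawn here.
    if U is None:
        U = trng.uniform(logits.shape, low=0., high=1., dtype=logits.dtype)
    g = -tensor.log(-tensor.log(U + 1e-20) + 1e-20)
    y = tensor.nnet.softmax((logits + g) / temperature)
    if hard:
        # Straight-through: the forward pass uses the one-hot argmax while
        # gradients flow through the soft sample y.
        y_hard = tensor.eq(y, y.max(axis=-1, keepdims=True)).astype(y.dtype)
        y = theano.gradient.disconnected_grad(y_hard - y) + y
    return y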
# Example 18
    def build_model(self, tparams, options, xI=None, xAux=None, attn_nw=None):
        self.trng = RandomStreams(int(time.time()))

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')

        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        if xI is None:
            xI = tensor.matrix('xI', dtype=config.floatX)
            embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img'])
            xI_is_inp = True
        else:
            embImg = xI
            xI_is_inp = False

        if xAux is None:
            xAux = tensor.matrix(
                'xAux',
                dtype=config.floatX) if attn_nw is None else tensor.tensor3(
                    'xAux', dtype=config.floatX)
            if (options.get('swap_aux', 1)) and (attn_nw is None):
                xAuxEmb = tensor.dot(
                    xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
            else:
                xAuxEmb = xAux
            xA_is_inp = True
        else:
            xA_is_inp = False
            if options.get('encode_gt_sentences', 0):
                xAuxEmb = tensor.dot(
                    xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
            else:
                xAuxEmb = xAux

        embImg = embImg.reshape([1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        # This implements input dropout
        if options['use_dropout']:
            emb = dropout_layer(emb,
                                use_noise,
                                self.trng,
                                options['drop_prob_encoder'],
                                shp=emb.shape)
            if (options.get('en_aux_inp', 0)) and (attn_nw is None):
                xAuxEmb = dropout_layer(xAuxEmb,
                                        use_noise,
                                        self.trng,
                                        options['drop_prob_aux'],
                                        shp=xAuxEmb.shape)

        # Implement scheduled sampling!
        if options.get('sched_sampling_mode', None) != None:
            curr_epoch = tensor.scalar(name='curr_epoch', dtype=config.floatX)

            # Assign the probabilities according to the scheduling mode
            if options['sched_sampling_mode'] == 'linear':
                prob = tensor.maximum(
                    options['sslin_min'], options['sched_sampling_const'] -
                    options['sslin_slope'] * curr_epoch)
            elif options['sched_sampling_mode'] == 'exp':
                raise ValueError(
                    'ERROR: %s --> This solver type is not yet supported' %
                    (options['sched_sampling_mode']))
            elif options['sched_sampling_mode'] == 'invsig':
                raise ValueError(
                    'ERROR: %s --> This solver type is not yet supported' %
                    (options['sched_sampling_mode']))
            else:
                raise ValueError(
                    'ERROR: %s --> This scheduling type is unknown' %
                    (options['sched_sampling_mode']))

            # Now build the mask. We don't want to do this coin toss when
            # feeding in the image feature and the start symbol
            sched_mask = self.trng.binomial((n_timesteps - 2, n_samples),
                                            p=prob,
                                            n=1,
                                            dtype='int64')
            sched_mask = tensor.concatenate(
                [sched_mask, tensor.alloc(1, 2, n_samples)], axis=0)
        else:
            sched_mask = []

        #############################################################################################################################
        # This implements core lstm
        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix=options['generator'],
                                             sched_prob_mask=sched_mask,
                                             attn_nw=attn_nw)
        #############################################################################################################################

        # NOTE1: we are leaving out the first prediction, which was made for the image and is meaningless.
        if options['use_dropout']:
            # XXX: The shape given to dropout is missing the time dimension, so the
            # same units are dropped at every timestep. Is this a good bug?
            p = dropout_layer(
                sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size']), use_noise, self.trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
        else:
            p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size'])

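        # Class-factored softmax: the word probability is factored as
        # p(word) = p(class) * p(word | class). 'pC_inp' feeds the class
        # predictor and can optionally come from a lower hidden layer.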
        if options.get('class_out_factoring', 0) == 1:
            if options.get('cls_diff_layer', 0) == 1:
                pC_inp = dropout_layer(
                    sliceT(rval[0][1:, :, :],
                           options.get('hidden_depth', 1) - 2,
                           options['hidden_size']), use_noise, self.trng,
                    options['drop_prob_decoder'],
                    (n_samples, options['hidden_size']))
            else:
                pC_inp = p

        n_out_samps = (n_timesteps - 1) * n_samples
        if options.get('class_out_factoring', 0) == 0:
            pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
                [n_out_samps, options['output_size']])
            if options.get('use_gumbel_mse', 0) == 0:
                pWSft = tensor.nnet.softmax(pW)
            else:
                w_out = ifelse(
                    self.usegumbel,
                    gumbel_softmax_sample(self.trng,
                                          pW,
                                          self.gumb_temp,
                                          hard=options.get(
                                              'use_gumbel_hard', False)),
                    tensor.nnet.softmax(pW))
                # This is not exactly right, but just testing
                pWSft = w_out

            totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]
            out_list = [pWSft, totProb, pW]
        else:
            ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
            xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
            if options.get('cls_zmean', 1):
                pW = ((tparams['Wd'][:, xC, :].T *
                       ((p.reshape([1, n_out_samps, options['hidden_size']]) -
                         tparams['WdCls'][:, xC].T))).sum(axis=-1).T +
                      tparams['bd'][:, xC, :])
            else:
                pW = ((tparams['Wd'][:, xC, :].T *
                       (p.reshape([1, n_out_samps, options['hidden_size']]))
                       ).sum(axis=-1).T + tparams['bd'][:, xC, :])
            pWSft = tensor.nnet.softmax(pW[0, :, :])

            pC = (tensor.dot(pC_inp, tparams['WdCls']) +
                  tparams['bdCls']).reshape([n_out_samps, options['nClasses']])
            pCSft = tensor.nnet.softmax(pC)

            totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                      pCSft[tensor.arange(n_out_samps), xC]
            out_list = [pWSft, pCSft, totProb, p]

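        # Masked negative log-likelihood summed over all target words (natural log),
        # plus the corresponding log2 sum used to report perplexity.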
        tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).sum()
        tot_pplx = -(tensor.log2(totProb + 1e-10) *
                     mask[1:, :].flatten()).sum()
        cost = [
            tot_cost / tensor.cast(n_samples, dtype=config.floatX), tot_pplx
        ]

        inp_list = [xW, mask]
        if xI_is_inp:
            inp_list.append(xI)

        if options.get('en_aux_inp', 0) and xA_is_inp:
            inp_list.append(xAux)

        if options.get('sched_sampling_mode', None) is not None:
            inp_list.append(curr_epoch)

        f_pred_prob = theano.function([xW, xI, xAux],
                                      out_list,
                                      name='f_pred_prob',
                                      updates=updatesLSTM)

        return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
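        # Beam-search step for an ensemble of models: each model keeps its own
        # input/hidden/cell state, and the per-model softmax outputs are blended
        # with their 'comb_weight' before selecting the best beam extensions.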
        def _stepP(*in_list):
            x_inp = []
            h_inp = []
            c_inp = []
            for i in xrange(nmodels):
                x_inp.append(in_list[i])
                h_inp.append(in_list[nmodels + i])
                c_inp.append(in_list[2 * nmodels + i])
            lP_ = in_list[3 * nmodels]
            dV_ = in_list[3 * nmodels + 1]

            p_comb = tensor.alloc(numpy_floatX(0.), options[0]['output_size'])
            cf = []
            h = []
            xW = []
            for i in xrange(nmodels):
                preact = tensor.dot(h_inp[i], tparams[i][_p(prefix, 'W_hid')])
                preact += (
                    tensor.dot(x_inp[i], tparams[i][_p(prefix, 'W_inp')]) +
                    tparams[i][_p(prefix, 'b')])
                if options[i].get('en_aux_inp', 0):
                    preact += tensor.dot(aux_input2[i],
                                         tparams[i][_p(prefix, 'W_aux')])

                inp = tensor.nnet.sigmoid(
                    sliceT(preact, 0, options[i]['hidden_size']))
                f = tensor.nnet.sigmoid(
                    sliceT(preact, 1, options[i]['hidden_size']))
                o = tensor.nnet.sigmoid(
                    sliceT(preact, 2, options[i]['hidden_size']))
                c = tensor.tanh(sliceT(preact, 3, options[i]['hidden_size']))

                cf.append(f * c_inp[i] + inp * c)

                h.append(o * tensor.tanh(cf[i]))
                p = tensor.dot(h[i], tparams[i]['Wd']) + tparams[i]['bd']
                if i == 0:
                    p_comb = tparams[i]['comb_weight'] * tensor.nnet.softmax(p)
                else:
                    p_comb += tparams[i]['comb_weight'] * tensor.nnet.softmax(
                        p)

            lProb = tensor.log(p_comb + 1e-20)

            def _FindB_best(lPLcl, lPprev, dVLcl):
                srtLcl = tensor.argsort(-lPLcl)
                srtLcl = srtLcl[:beam_size]
                deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
                deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                                   lPLcl[srtLcl] + lPprev, deltaVec)
                xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                                   srtLcl, tensor.zeros_like(srtLcl))
                return lProbBest, xWIdxBest

            rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                              sequences=[lProb, lP_, dV_],
                                              name=_p(prefix, 'FindBest'),
                                              n_steps=x_inp[0].shape[0])
            xWIdxBest = rvalLcl[1]
            lProbBest = rvalLcl[0]

            xWIdxBest = xWIdxBest.flatten()
            lProb = lProbBest.flatten()

            # Now sort and find the best among these best extensions for the current beams
            srtIdx = tensor.argsort(-lProb)
            srtIdx = srtIdx[:beam_size]
            xWlogProb = lProb[srtIdx]

            xWIdx = xWIdxBest[srtIdx]
            xCandIdx = srtIdx // beam_size  # Floor division
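            # srtIdx indexes the flattened (beam, candidate) score matrix, so integer
            # division by beam_size recovers which previous beam each survivor extends.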

            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            x_out = []
            h_out = []
            c_out = []
            for i in xrange(nmodels):
                x_out.append(tparams[i]['Wemb'][xWIdx.flatten()])
                h_out.append(h[i].take(xCandIdx.flatten(), axis=0))
                c_out.append(cf[i].take(xCandIdx.flatten(), axis=0))

            out_list = []
            out_list.extend(x_out)
            out_list.extend(h_out)
            out_list.extend(c_out)
            out_list.extend([xWlogProb, doneVec, xWIdx, xCandIdx])

            return out_list, theano.scan_module.until(doneVec.all())
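    # Builds the training-time graph: embeds the words and image, runs the LSTM
    # generator, and accumulates the masked cross-entropy cost over timesteps.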
    def build_model(self, tparams, options):
        trng = RandomStreams(1234)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')

        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        # Apply dropout to the input embeddings (word + image).
        if options['use_dropout']:
            emb = self.dropout_layer(emb,
                                     use_noise,
                                     trng,
                                     options['drop_prob_encoder'],
                                     shp=emb.shape)
            if options.get('en_aux_inp', 0):
                xAux = self.dropout_layer(xAux,
                                          use_noise,
                                          trng,
                                          options['drop_prob_aux'],
                                          shp=xAux.shape)

        # This implements core lstm
        rval, updatesLSTM = self.lstm_layer(tparams,
                                            emb[:n_timesteps, :, :],
                                            xAux,
                                            use_noise,
                                            options,
                                            prefix=options['generator'],
                                            mask=mask)
        if options['use_dropout']:
            p = self.dropout_layer(
                sliceT(rval[0],
                       options.get('hidden_depth', 1) - 1,
                       options['hidden_size']), use_noise, trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
        else:
            p = sliceT(rval[0],
                       options.get('hidden_depth', 1) - 1,
                       options['hidden_size'])

        p = tensor.dot(p, tparams['Wd']) + tparams['bd']

        #pred = tensor.nnet.softmax(p)

        #pred = rval[2]

        #pred = pred[1:,:,:]
        p = p[1:, :, :]

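        # Accumulate the masked cross-entropy (natural log) and log2 perplexity
        # contributions one timestep at a time via theano.scan.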
        def accumCost(pred, xW, m, c_sum, ppl_sum):
            pred = tensor.nnet.softmax(pred)
            c_sum += -(tensor.log(pred[tensor.arange(n_samples), xW] + 1e-10) *
                       m).sum()
            ppl_sum += -(tensor.log2(pred[tensor.arange(n_samples), xW] +
                                     1e-10) * m).sum()
            return c_sum, ppl_sum

        sums, upd = theano.scan(fn=accumCost,
                                outputs_info=[
                                    tensor.as_tensor_variable(
                                        numpy_floatX(0.)),
                                    tensor.as_tensor_variable(numpy_floatX(0.))
                                ],
                                sequences=[p, xW[1:, :], mask[1:, :]])

        # NOTE1: we are leaving out the first prediction, which was made for the image
        # and is meaningless. Here cost[0] contains the negative log-likelihood (natural
        # log, normalized by batch size) and cost[1] the perplexity sum (log2).
        cost = [sums[0][-1] / options['batch_size'], sums[1][-1]]

        inp_list = [xW, xI, mask]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        f_pred_prob = theano.function(inp_list,
                                      p,
                                      name='f_pred_prob',
                                      updates=updatesLSTM)

        return use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM