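# ---------------------------------------------------------------------------
# Helper functions assumed by the layers below.  _p, numpy_floatX and sliceT
# are defined elsewhere in this repository; the sketch here only illustrates
# the behaviour their call sites rely on and is not the authoritative code.
# ---------------------------------------------------------------------------
import numpy as np
import theano

def _p(pp, name):
    # Prefix a parameter name, e.g. _p('lstm', 'W_hid') -> 'lstm_W_hid'.
    return '%s_%s' % (pp, name)

def numpy_floatX(data):
    # Cast a scalar or array to theano's configured float type.
    return np.asarray(data, dtype=theano.config.floatX)

def sliceT(x, n, dim):
    # Return the n-th block of `dim` units: gate n of a 4*dim pre-activation,
    # or layer n of a stacked hidden/cell state.
    if x.ndim == 3:
        return x[:, :, n * dim:(n + 1) * dim]
    if x.ndim == 2:
        return x[:, n * dim:(n + 1) * dim]
    return x[n * dim:(n + 1) * dim]
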
    def lstm_enc_layer(self, tparams, state_below, prefix='lstm'):
        nsteps = state_below.shape[0]
        h_depth = self.hidden_depth
        h_sz = self.hidden_size

        if state_below.ndim == 3:
            n_samples = state_below.shape[1]
        else:
            n_samples = 1

        def _step(x_in, h_, c_):
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += x_in

            #  preact += tparams[_p(prefix, 'b')]
            h = [[]] * h_depth
            c = [[]] * h_depth
            outp = [[]] * h_depth

            for di in xrange(h_depth):
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                c[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                c[di] = f * sliceT(c_, di, h_sz) + i * c[di]
                h[di] = o * tensor.tanh(c[di])
                outp[di] = h[di]
                if self.en_residual_conn:
                    if (di > 0):
                        outp[di] += outp[di - 1]
                        print "Connecting residual at %d" % (di)
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(outp[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            c_out = tensor.concatenate(c, axis=1)
            h_out = tensor.concatenate(h + [outp[-1]], axis=1)

            return h_out, c_out
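
        # Note: each step's h_out stacks the h_depth per-layer hidden states plus,
        # as a final block, outp[-1] (the residual-summed top output); this is why
        # the hidden state in outputs_info below is allocated (h_depth + 1) * h_sz wide.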

        state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])

        rval, updates = theano.scan(_step,
                                    sequences=[state_below],
                                    outputs_info=[
                                        tensor.alloc(numpy_floatX(0.),
                                                     n_samples,
                                                     (h_depth + 1) * h_sz),
                                        tensor.alloc(numpy_floatX(0.),
                                                     n_samples,
                                                     h_depth * h_sz),
                                    ],
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps)
        return rval, updates
  def lstm_layer(self, tparams, state_below, aux_input, use_noise, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    h_depth = options.get('hidden_depth',1)
    h_sz = options['hidden_size']
    
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    assert mask is not None

    def _step(m_, x_, h_, c_, xAux):
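        # NOTE: the mask m_ comes in from the scan sequences but is not applied
        # inside this step; any masking of h/c is presumably handled by the caller.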
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact += x_
        if options.get('en_aux_inp',0):
            preact += tensor.dot(xAux,tparams[_p(prefix,'W_aux')])

        #  preact += tparams[_p(prefix, 'b')]
        h = [[]]*h_depth 
        c = [[]]*h_depth 
        
        for di in xrange(h_depth):
            i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
            c[di] = tensor.tanh(sliceT(preact, 3, h_sz))
            c[di] = f * sliceT(c_, di, h_sz) + i * c[di]
            h[di] = o * tensor.tanh(c[di])
            if di < (h_depth - 1):
                preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                        tensor.dot(h[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])
        
        c_out = tensor.concatenate(c,axis=1)
        h_out = tensor.concatenate(h,axis=1)

        return h_out, c_out

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W_inp')]) + tparams[_p(prefix, 'b')])
    
    if options.get('en_aux_inp',0) == 0:
       aux_input = [] 

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           h_depth*h_sz),
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           h_depth*h_sz),
                                              #tensor.alloc(numpy_floatX(0.),n_samples,options['output_size'])],
                                              ],
                                non_sequences = [aux_input] ,
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval, updates
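
# ---------------------------------------------------------------------------
# Usage sketch (not part of this file): the parameter shapes that lstm_layer
# above expects, inferred from the dot products in its step function.  Key
# names follow the _p(prefix, name) convention; sizes and the initializer are
# illustrative assumptions only.
# ---------------------------------------------------------------------------
import numpy as np
import theano

def init_lstm_tparams(word_dim, h_sz, aux_dim, h_depth=1, prefix='lstm'):
    rng = np.random.RandomState(0)
    shared = lambda *shp: theano.shared(
        (0.01 * rng.randn(*shp)).astype(theano.config.floatX))
    tparams = {
        prefix + '_W_inp': shared(word_dim, 4 * h_sz),  # input-to-gates projection
        prefix + '_W_hid': shared(h_sz, 4 * h_sz),      # recurrent projection, layer 0
        prefix + '_b':     shared(4 * h_sz),            # gate bias
        prefix + '_W_aux': shared(aux_dim, 4 * h_sz),   # auxiliary (e.g. image) projection
    }
    for d in range(1, h_depth):
        # extra projections feeding layer d from its own recurrent state and
        # from the output of layer d-1
        tparams[prefix + '_W_hid_' + str(d)] = shared(h_sz, 4 * h_sz)
        tparams[prefix + '_W_inp_' + str(d)] = shared(h_sz, 4 * h_sz)
    return tparams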
 def lstm_predict_layer(self, tparams, Xi, aux_input, options, beam_size, prefix='lstm'):
   nMaxsteps = 30 
   n_samples = 1 
 
   # ----------------------  STEP FUNCTION  ---------------------- #
   def _stepP(x_, h_, c_, lP_, dV_, xAux):
       preact = tensor.dot(h_, tparams[_p(prefix, 'W_hid')])
       preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                  tparams[_p(prefix, 'b')])
       if options.get('en_aux_inp',0):
           preact += tensor.dot(xAux,tparams[_p(prefix,'W_aux')])
 
       i = tensor.nnet.sigmoid(sliceT(preact, 0, options['hidden_size']))
       f = tensor.nnet.sigmoid(sliceT(preact, 1, options['hidden_size']))
       o = tensor.nnet.sigmoid(sliceT(preact, 2, options['hidden_size']))
       c = tensor.tanh(sliceT(preact, 3, options['hidden_size']))
 
       c = f * c_ + i * c
 
       h = o * tensor.tanh(c)
 
       p = tensor.dot(h,tparams['Wd']) + tparams['bd']
       p = tensor.nnet.softmax(p)
       lProb = tensor.log(p + 1e-20)
 
       def _FindB_best(lPLcl, lPprev, dVLcl):
           srtLcl = tensor.argsort(-lPLcl)
           srtLcl = srtLcl[:beam_size]
           deltaVec = tensor.fill( lPLcl[srtLcl], numpy_floatX(-10000.))
           deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
           lProbBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), lPLcl[srtLcl] + lPprev, deltaVec)
           xWIdxBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), srtLcl, tensor.zeros_like(srtLcl)) 
           return lProbBest, xWIdxBest 
 
       rvalLcl, updatesLcl = theano.scan(_FindB_best, sequences = [lProb, lP_, dV_], name=_p(prefix, 'FindBest'), n_steps=x_.shape[0])
       xWIdxBest = rvalLcl[1]
       lProbBest = rvalLcl[0]
 
       xWIdxBest = xWIdxBest.flatten()
       lProb = lProbBest.flatten()
 
       # Now sort and find the best among these best extensions for the current beams
       srtIdx = tensor.argsort(-lProb)
       srtIdx = srtIdx[:beam_size]
       xWlogProb = lProb[srtIdx]
 
       xWIdx = xWIdxBest[srtIdx]
       xCandIdx = srtIdx // beam_size # Floor division 
 
       xW = tparams['Wemb'][xWIdx.flatten()]
       doneVec = tensor.eq(xWIdx,tensor.zeros_like(xWIdx))
       h = h.take(xCandIdx.flatten(),axis=0)
       c = c.take(xCandIdx.flatten(),axis=0)
 
       return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], theano.scan_module.until(doneVec.all())
   # ------------------- END of STEP FUNCTION  -------------------- #
   
   if options.get('en_aux_inp',0) == 0:
      aux_input = [] 
 
   hidden_size = options['hidden_size']
 
 
   h = tensor.alloc(numpy_floatX(0.),beam_size,hidden_size)
   c = tensor.alloc(numpy_floatX(0.),beam_size,hidden_size)
 
    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)
 
    # Propagate the image feature vector
   [xW, h, c, _, _, _, _], _ = _stepP(Xi, h[:1,:], c[:1,:], lP, dV,aux_input) 
   
   xWStart = tparams['Wemb'][[0]]
   [xW, h, c, lP, dV, idx0, cand0], _ = _stepP(xWStart, h[:1,:], c[:1,:], lP, dV, aux_input) 
   
    # Repeat the aux input across the beams only when it is actually used
    if options.get('en_aux_inp',0) == 1:
        aux_input = tensor.extra_ops.repeat(aux_input,beam_size,axis=0)

    # Now let's do the loop.
   rval, updates = theano.scan(_stepP, outputs_info=[xW, h, c, lP, dV, None, None], non_sequences = [aux_input], name=_p(prefix, 'predict_layers'), n_steps=nMaxsteps)
 
   return rval[3][-1], tensor.concatenate([idx0.reshape([1,beam_size]), rval[5]],axis=0), tensor.concatenate([cand0.reshape([1,beam_size]), rval[6]],axis=0)
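
# ---------------------------------------------------------------------------
# Decoding sketch (not part of this file): lstm_predict_layer above returns,
# besides the final beam log-probabilities, a word-index matrix and a
# parent-beam ("candidate") index matrix, with one row per decoding step and
# one column per beam.  A caller is assumed to backtrack through the parent
# indices to recover one token sequence per beam, roughly as follows.
# ---------------------------------------------------------------------------
import numpy as np

def backtrack_beams(word_idx, cand_idx):
    # word_idx, cand_idx: integer arrays of shape (nsteps, beam_size)
    nsteps, beam_size = word_idx.shape
    captions = []
    for b in range(beam_size):
        seq, cur = [], b
        for t in range(nsteps - 1, -1, -1):   # walk backwards through time
            seq.append(int(word_idx[t, cur]))
            cur = int(cand_idx[t, cur])       # move to the parent beam
        captions.append(seq[::-1])            # forward order; index 0 marks end-of-sentence
    return captions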
    def lstm_advers_gen_layer(self,
                              tparams,
                              Xi,
                              aux_input,
                              options,
                              beam_size,
                              prefix='lstm'):
        nMaxsteps = options.get('maxlen', 15)
        n_samples = 1
        h_depth = options.get('hidden_depth', 1)
        h_sz = options['hidden_size']

        # ----------------------  STEP FUNCTION  ---------------------- #
        def _stepP(x_, h_, c_, lP_, dV_, xAux):
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])
            if options.get('en_aux_inp', 0):
                preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            hL = [[]] * h_depth
            cL = [[]] * h_depth
            for di in xrange(h_depth):
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
                hL[di] = o * tensor.tanh(cL[di])
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(hL[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            c = tensor.concatenate(cL, axis=1)
            h = tensor.concatenate(hL, axis=1)

            p = tensor.dot(hL[-1], tparams['Wd']) + tparams['bd']
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(
                options.get('softmax_smooth_factor', 1.0)),
                                                      name='sm_f')
            p = tensor.nnet.softmax(p * smooth_factor)
            lProb = tensor.log(p + 1e-20)

            #xCandIdx = tensor.as_tensor_variable([0])
            lProb = lProb.flatten()
            xWIdx = tensor.argmax(lProb, keepdims=True)
            xWlogProb = lProb[xWIdx] + lP_

            if options.get('softmax_propogate', 0) == 0:
                xW = tparams['Wemb'][xWIdx.flatten()]
            else:
                xW = p.dot(tparams['Wemb'])
            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            return [xW, h, c, xWlogProb, doneVec, xWIdx,
                    p], theano.scan_module.until(doneVec.all())

        # ------------------- END of STEP FUNCTION  -------------------- #

        if options.get('en_aux_inp', 0) == 0:
            aux_input = []

        h = tensor.alloc(numpy_floatX(0.), n_samples, h_sz * h_depth)
        c = tensor.alloc(numpy_floatX(0.), n_samples, h_sz * h_depth)

        lP = tensor.alloc(numpy_floatX(0.), beam_size)
        dV = tensor.alloc(np.int8(0.), beam_size)

        # Propagate the image feature vector
        [xW, h, c, _, _, _, _], _ = _stepP(Xi, h, c, lP, dV, aux_input)

        xWStart = tparams['Wemb'][0, :]
        [xW, h, c, lP, dV, idx0, p0], _ = _stepP(xWStart, h, c, lP, dV,
                                                 aux_input)

        #if options.get('en_aux_inp',0) == 1:
        #    aux_input = tensor.extra_ops.repeat(aux_input,beam_size,axis=0)

        # Now let's do the loop.
        rval, updates = theano.scan(
            _stepP,
            outputs_info=[xW, h, c, lP, dV, None, None],
            non_sequences=[aux_input],
            name=_p(prefix, 'predict_layers'),
            n_steps=nMaxsteps - 1)

        return rval[3][-1], tensor.concatenate(
            [idx0.reshape([1, beam_size]), rval[5]],
            axis=0), tensor.concatenate(
                [tensor.shape_padleft(p0, n_ones=1), rval[6]], axis=0), updates
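
# ---------------------------------------------------------------------------
# Side note (illustrative, not part of this file): in lstm_advers_gen_layer the
# logits are multiplied by options['softmax_smooth_factor'] before the softmax.
# A factor > 1 sharpens the distribution towards its argmax, a factor < 1
# flattens it, as the small NumPy example below shows.
# ---------------------------------------------------------------------------
import numpy as np

def smoothed_softmax(logits, smooth_factor=1.0):
    z = logits * smooth_factor
    z = z - z.max()          # subtract the max for numerical stability
    e = np.exp(z)
    return e / e.sum()

# e.g. smoothed_softmax(np.array([2.0, 1.0, 0.1]), 3.0) is much more peaked
# than smoothed_softmax(np.array([2.0, 1.0, 0.1]), 0.5)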
    def lstm_predict_layer(self,
                           tparams,
                           Xi,
                           aux_input,
                           options,
                           beam_size,
                           prefix='lstm'):

        nMaxsteps = options.get('maxlen', 30)

        if nMaxsteps is None:
            nMaxsteps = 30
        n_samples = 1
        h_depth = options.get('hidden_depth', 1)
        h_sz = options['hidden_size']

        # ----------------------  STEP FUNCTION  ---------------------- #
        def _stepP(x_, h_, c_, lP_, dV_, xAux):
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])
            if options.get('en_aux_inp', 0):
                preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            hL = [[]] * h_depth
            cL = [[]] * h_depth
            outp = [[]] * h_depth
            for di in xrange(h_depth):
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
                hL[di] = o * tensor.tanh(cL[di])
                outp[di] = hL[di]
                if options.get('en_residual_conn', 1):
                    if (di > 0):
                        outp[di] += outp[di - 1]
                        print "Connecting residual at %d" % (di)
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(outp[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            c = tensor.concatenate(cL, axis=1)
            h = tensor.concatenate(hL, axis=1)

            if options.get('class_out_factoring', 0) == 1:
                pC = tensor.dot(outp[-1], tparams['WdCls']) + tparams['bdCls']
                pCSft = tensor.nnet.softmax(pC)
                xCIdx = tensor.argmax(pCSft)
                pW = tensor.dot(
                    outp[-1],
                    tparams['Wd'][:, xCIdx, :]) + tparams['bd'][:, xCIdx, :]
                smooth_factor = tensor.as_tensor_variable(numpy_floatX(
                    options.get('softmax_smooth_factor', 1.0)),
                                                          name='sm_f')
                pWSft = tensor.nnet.softmax(pW * smooth_factor)
                lProb = tensor.log(pWSft +
                                   1e-20) + tensor.log(pCSft[0, xCIdx] + 1e-20)
            else:
                p = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd']
                smooth_factor = tensor.as_tensor_variable(numpy_floatX(
                    options.get('softmax_smooth_factor', 1.0)),
                                                          name='sm_f')
                p = tensor.nnet.softmax(p * smooth_factor)
                lProb = tensor.log(p + 1e-20)

            if beam_size > 1:

                def _FindB_best(lPLcl, lPprev, dVLcl):
                    srtLcl = tensor.argsort(-lPLcl)
                    srtLcl = srtLcl[:beam_size]
                    deltaVec = tensor.fill(lPLcl[srtLcl],
                                           numpy_floatX(-10000.))
                    deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                    lProbBest = ifelse(
                        tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                        lPLcl[srtLcl] + lPprev, deltaVec)
                    xWIdxBest = ifelse(
                        tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), srtLcl,
                        tensor.zeros_like(srtLcl))
                    return lProbBest, xWIdxBest

                rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                                  sequences=[lProb, lP_, dV_],
                                                  name=_p(prefix, 'FindBest'),
                                                  n_steps=x_.shape[0])
                xWIdxBest = rvalLcl[1]
                lProbBest = rvalLcl[0]

                xWIdxBest = xWIdxBest.flatten()
                lProb = lProbBest.flatten()
                # Now sort and find the best among these best extensions for the current beams
                srtIdx = tensor.argsort(-lProb)
                srtIdx = srtIdx[:beam_size]
                xCandIdx = srtIdx // beam_size  # Floor division
                h = h.take(xCandIdx.flatten(), axis=0)
                c = c.take(xCandIdx.flatten(), axis=0)
                xWlogProb = lProb[srtIdx]
                xWIdx = xWIdxBest[srtIdx]
            else:
                xCandIdx = tensor.as_tensor_variable([0])
                lProb = lProb.flatten()
                xWIdx = tensor.argmax(lProb, keepdims=True)
                xWlogProb = lProb[xWIdx] + lP_
                if options.get('class_out_factoring', 0) == 1:
                    clsoffset = tensor.as_tensor_variable(
                        options['ixtoclsinfo'][:, 0])
                    xWIdx += clsoffset[xCIdx]
                h = h.take(xCandIdx.flatten(), axis=0)
                c = c.take(xCandIdx.flatten(), axis=0)

            if options.get('softmax_propogate', 0) == 0:
                xW = tparams['Wemb'][xWIdx.flatten()]
            else:
                xW = p.dot(tparams['Wemb'])
            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            return [xW, h, c, xWlogProb, doneVec, xWIdx,
                    xCandIdx], theano.scan_module.until(doneVec.all())

        # ------------------- END of STEP FUNCTION  -------------------- #

        if options.get('en_aux_inp', 0) == 0:
            aux_input = []

        h = tensor.alloc(numpy_floatX(0.), beam_size, h_sz * h_depth)
        c = tensor.alloc(numpy_floatX(0.), beam_size, h_sz * h_depth)

        lP = tensor.alloc(numpy_floatX(0.), beam_size)
        dV = tensor.alloc(np.int8(0.), beam_size)

        # Propagate the image feature vector
        [xW, h, c, _, _, _, _], _ = _stepP(Xi, h[:1, :], c[:1, :], lP, dV,
                                           aux_input)

        xWStart = tparams['Wemb'][[0]]
        [xW, h, c, lP, dV, idx0,
         cand0], _ = _stepP(xWStart, h[:1, :], c[:1, :], lP, dV, aux_input)

        if options.get('en_aux_inp', 0) == 1:
            aux_input = tensor.extra_ops.repeat(aux_input, beam_size, axis=0)

        # Now let's do the loop.
        rval, updates = theano.scan(
            _stepP,
            outputs_info=[xW, h, c, lP, dV, None, None],
            non_sequences=[aux_input],
            name=_p(prefix, 'predict_layers'),
            n_steps=nMaxsteps)

        return rval[3][-1], tensor.concatenate(
            [idx0.reshape([1, beam_size]), rval[5]],
            axis=0), tensor.concatenate(
                [cand0.reshape([1, beam_size]), rval[6]],
                axis=0), tensor.concatenate(
                    [tensor.shape_padleft(xW, n_ones=1), rval[0]],
                    axis=0), updates
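
# ---------------------------------------------------------------------------
# Sketch of the 'class_out_factoring' path used above (an illustrative NumPy
# re-statement with assumed shapes, not part of this file): the word
# distribution is factored as p(word) = p(class) * p(word | class), using
# WdCls/bdCls for the class softmax and one slice of Wd/bd per class for the
# within-class softmax; ixtoclsinfo[:, 0] is used as a per-class offset into
# the global vocabulary.
# ---------------------------------------------------------------------------
import numpy as np

def class_factored_argmax(h_top, WdCls, bdCls, Wd, bd, cls_offset):
    # h_top: (h_sz,), WdCls: (h_sz, n_cls), Wd: (h_sz, n_cls, n_words_per_cls),
    # bd: (n_cls, n_words_per_cls), cls_offset: (n_cls,)
    zC = h_top.dot(WdCls) + bdCls
    pC = np.exp(zC - zC.max()); pC /= pC.sum()        # class probabilities
    cls = int(pC.argmax())                            # greedy class choice
    zW = h_top.dot(Wd[:, cls, :]) + bd[cls]
    pW = np.exp(zW - zW.max()); pW /= pW.sum()        # word probs within the class
    w = int(pW.argmax())
    logprob = np.log(pW[w] + 1e-20) + np.log(pC[cls] + 1e-20)
    return w + int(cls_offset[cls]), logprob          # global word index, its log-prob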
  def lstm_predict_layer(self, tparams, Xi, aux_input, options, beam_size, prefix='lstm'):
    
    nMaxsteps = options.get('maxlen',30)
    
    if nMaxsteps is None: 
        nMaxsteps = 30
    n_samples = 1 
    h_depth = options.get('hidden_depth',1)
    h_sz = options['hidden_size']
  
    # ----------------------  STEP FUNCTION  ---------------------- #
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp',0):
            preact += tensor.dot(xAux,tparams[_p(prefix,'W_aux')])
  
        hL = [[]]*h_depth 
        cL = [[]]*h_depth 
        for di in xrange(h_depth):
            i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
            cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
            cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
            hL[di] = o * tensor.tanh(cL[di])
            if di < (h_depth - 1):
                preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                        tensor.dot(hL[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])
        
        c = tensor.concatenate(cL,axis=1)
        h = tensor.concatenate(hL,axis=1)
  
        if options.get('class_out_factoring',0) == 1:
            pC    = tensor.dot(hL[-1],tparams['WdCls']) + tparams['bdCls']
            pCSft = tensor.nnet.softmax(pC)
            xCIdx =  tensor.argmax(pCSft)
            # use the top layer's hidden state; h is the concatenated multi-layer state
            pW = tensor.dot(hL[-1],tparams['Wd'][:,xCIdx,:]) + tparams['bd'][:,xCIdx,:]
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f')
            pWSft = tensor.nnet.softmax(pW*smooth_factor)
            lProb = tensor.log(pWSft + 1e-20) + tensor.log(pCSft[0,xCIdx] + 1e-20)
        else:
            p = tensor.dot(hL[-1],tparams['Wd']) + tparams['bd']
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f')
            p = tensor.nnet.softmax(p*smooth_factor)
            lProb = tensor.log(p + 1e-20)

        if beam_size > 1:
            def _FindB_best(lPLcl, lPprev, dVLcl):
                srtLcl = tensor.argsort(-lPLcl)
                srtLcl = srtLcl[:beam_size]
                deltaVec = tensor.fill( lPLcl[srtLcl], numpy_floatX(-10000.))
                deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                lProbBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), lPLcl[srtLcl] + lPprev, deltaVec)
                xWIdxBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), srtLcl, tensor.zeros_like(srtLcl)) 
                return lProbBest, xWIdxBest 
  
            rvalLcl, updatesLcl = theano.scan(_FindB_best, sequences = [lProb, lP_, dV_], name=_p(prefix, 'FindBest'), n_steps=x_.shape[0])
            xWIdxBest = rvalLcl[1]
            lProbBest = rvalLcl[0]
  
            xWIdxBest = xWIdxBest.flatten()
            lProb = lProbBest.flatten()
            # Now sort and find the best among these best extensions for the current beams
            srtIdx = tensor.argsort(-lProb)
            srtIdx = srtIdx[:beam_size]
            xCandIdx = srtIdx // beam_size # Floor division 
            h = h.take(xCandIdx.flatten(),axis=0)
            c = c.take(xCandIdx.flatten(),axis=0)
            xWlogProb = lProb[srtIdx]
            xWIdx = xWIdxBest[srtIdx]
        else:
            xCandIdx = tensor.as_tensor_variable([0]) 
            lProb = lProb.flatten()
            xWIdx =  tensor.argmax(lProb,keepdims=True)
            xWlogProb = lProb[xWIdx] + lP_
            if options.get('class_out_factoring',0) == 1:
                clsoffset = tensor.as_tensor_variable(options['ixtoclsinfo'][:,0])
                xWIdx += clsoffset[xCIdx]
            h = h.take(xCandIdx.flatten(),axis=0)
            c = c.take(xCandIdx.flatten(),axis=0)
        
        if options.get('softmax_propogate',0) == 0:
            xW = tparams['Wemb'][xWIdx.flatten()]
        else:
            xW = p.dot(tparams['Wemb'])
        doneVec = tensor.eq(xWIdx,tensor.zeros_like(xWIdx))
  
        return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], theano.scan_module.until(doneVec.all())
    # ------------------- END of STEP FUNCTION  -------------------- #
    
    if options.get('en_aux_inp',0) == 0:
       aux_input = [] 
  
    h = tensor.alloc(numpy_floatX(0.),beam_size,h_sz*h_depth)
    c = tensor.alloc(numpy_floatX(0.),beam_size,h_sz*h_depth)
  
    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)

    # Propagate the image feature vector
    [xW, h, c, _, _, _, _], _ = _stepP(Xi, h[:1,:], c[:1,:], lP, dV,aux_input) 
    
    xWStart = tparams['Wemb'][[0]]
    [xW, h, c, lP, dV, idx0, cand0], _ = _stepP(xWStart, h[:1,:], c[:1,:], lP, dV, aux_input) 
    
    if options.get('en_aux_inp',0) == 1:
        aux_input = tensor.extra_ops.repeat(aux_input,beam_size,axis=0)
  
    # Now let's do the loop.
    rval, updates = theano.scan(_stepP, outputs_info=[xW, h, c, lP, dV, None, None], non_sequences = [aux_input], name=_p(prefix, 'predict_layers'), n_steps=nMaxsteps)
  
    return rval[3][-1], tensor.concatenate([idx0.reshape([1,beam_size]), rval[5]],axis=0), tensor.concatenate([cand0.reshape([1,beam_size]), rval[6]],axis=0), tensor.concatenate([tensor.shape_padleft(xW,n_ones=1),rval[0]],axis=0), updates
  def lstm_multi_model_pred(self,tparams, Xi, aux_input, options, beam_size, nmodels, prefix='lstm'):
    nMaxsteps = 30 
  
    # ----------------------  STEP FUNCTION  ---------------------- #
    def _stepP(*in_list):
        x_inp = []
        h_inp = []
        c_inp = []
        for i in xrange(nmodels):
            x_inp.append(in_list[i])
            h_inp.append(in_list[nmodels+i])
            c_inp.append(in_list[2*nmodels+i])
        lP_ = in_list[3*nmodels]
        dV_ = in_list[3*nmodels+1]

        p_comb = tensor.alloc(numpy_floatX(0.), options[0]['output_size'])
        cf = []
        h = []
        xW = []
        for i in xrange(nmodels):
            preact = tensor.dot(h_inp[i], tparams[i][_p(prefix, 'W_hid')])
            preact += (tensor.dot(x_inp[i], tparams[i][_p(prefix, 'W_inp')]) +
                       tparams[i][_p(prefix, 'b')])
            if options[i].get('en_aux_inp',0):
                preact += tensor.dot(aux_input2[i],tparams[i][_p(prefix,'W_aux')])
  
            inp = tensor.nnet.sigmoid(sliceT(preact, 0, options[i]['hidden_size']))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, options[i]['hidden_size']))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, options[i]['hidden_size']))
            c = tensor.tanh(sliceT(preact, 3, options[i]['hidden_size']))
  
            cf.append(f * c_inp[i] + inp * c)
  
            h.append(o * tensor.tanh(cf[i]))
            p = tensor.dot(h[i],tparams[i]['Wd']) + tparams[i]['bd']
            if i == 0:
                p_comb = tparams[i]['comb_weight']*tensor.nnet.softmax(p)
            else:    
                p_comb += tparams[i]['comb_weight']*tensor.nnet.softmax(p)
        
        lProb = tensor.log(p_comb + 1e-20)
        def _FindB_best(lPLcl, lPprev, dVLcl):
            srtLcl = tensor.argsort(-lPLcl)
            srtLcl = srtLcl[:beam_size]
            deltaVec = tensor.fill( lPLcl[srtLcl], numpy_floatX(-10000.))
            deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
            lProbBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), lPLcl[srtLcl] + lPprev, deltaVec)
            xWIdxBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), srtLcl, tensor.zeros_like(srtLcl)) 
            return lProbBest, xWIdxBest 
  
        rvalLcl, updatesLcl = theano.scan(_FindB_best, sequences = [lProb, lP_, dV_], name=_p(prefix, 'FindBest'), n_steps=x_inp[0].shape[0])
        xWIdxBest = rvalLcl[1]
        lProbBest = rvalLcl[0]
  
        xWIdxBest = xWIdxBest.flatten()
        lProb = lProbBest.flatten()
  
        # Now sort and find the best among these best extensions for the current beams
        srtIdx = tensor.argsort(-lProb)
        srtIdx = srtIdx[:beam_size]
        xWlogProb = lProb[srtIdx]
  
        xWIdx = xWIdxBest[srtIdx]
        xCandIdx = srtIdx // beam_size # Floor division 
  
        doneVec = tensor.eq(xWIdx,tensor.zeros_like(xWIdx))
        
        x_out = []
        h_out = []
        c_out = []
        for i in xrange(nmodels):
            x_out.append(tparams[i]['Wemb'][xWIdx.flatten()])
            h_out.append(h[i].take(xCandIdx.flatten(),axis=0))
            c_out.append(cf[i].take(xCandIdx.flatten(),axis=0))

        out_list = []
        out_list.extend(x_out)
        out_list.extend(h_out)
        out_list.extend(c_out)
        out_list.extend([xWlogProb, doneVec, xWIdx, xCandIdx])
  
        return out_list, theano.scan_module.until(doneVec.all())
    # ------------------- END of STEP FUNCTION  -------------------- #
    
    #Xi = tensor.extra_ops.repeat(Xi,beam_size,axis=0)
  
    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)
    
    h_inp = []
    c_inp = []
    x_inp = []
    for i in xrange(nmodels):
      hidden_size = options[i]['hidden_size']
      h = theano.shared(np.zeros((1,hidden_size),dtype='float32'))
      c = theano.shared(np.zeros((1,hidden_size),dtype='float32'))
      h_inp.append(h)
      c_inp.append(c)
      x_inp.append(Xi[i])
    
    aux_input2 = aux_input
    
    in_list = []
    in_list.extend(x_inp); in_list.extend(h_inp); in_list.extend(c_inp) 
    in_list.append(lP); in_list.append(dV) 

    
    # Propagate the image feature vector
    out_list,_ = _stepP(*in_list) 

    for i in xrange(nmodels):
        h_inp[i] = out_list[nmodels + i]
        c_inp[i] = out_list[2*nmodels + i]
    
    x_inp = []
    for i in xrange(nmodels):
      x_inp.append(tparams[i]['Wemb'][[0]])
      h_inp[i] = h_inp[i][:1,:]
      c_inp[i] = c_inp[i][:1,:]
      #if options[i].get('en_aux_inp',0):
      #  aux_input2.append(aux_input[i])
    
    in_list = []
    in_list.extend(x_inp); in_list.extend(h_inp); in_list.extend(c_inp) 
    in_list.append(lP); in_list.append(dV) 
  
    out_list, _ = _stepP(*in_list)
    aux_input2 = []
    for i in xrange(nmodels):
        x_inp[i] = out_list[i]
        h_inp[i] = out_list[nmodels + i]
        c_inp[i] = out_list[2*nmodels + i]
        aux_input2.append(tensor.extra_ops.repeat(aux_input[i],beam_size,axis=0))
    lP = out_list[3*nmodels]
    dV = out_list[3*nmodels+1]
    idx0 = out_list[3*nmodels+2]
    cand0 = out_list[3*nmodels+3]
    
    in_list = []
    in_list.extend(x_inp); in_list.extend(h_inp); in_list.extend(c_inp) 
    in_list.append(lP); in_list.append(dV)
    in_list.append(None)
    in_list.append(None)
    
    # Now let's do the loop.
    rval, updates = theano.scan(_stepP, outputs_info=in_list, name=_p(prefix, 'predict_layers'), n_steps=nMaxsteps)
  
    return rval[3*nmodels][-1], tensor.concatenate([idx0.reshape([1,beam_size]), rval[3*nmodels+2]],axis=0), tensor.concatenate([cand0.reshape([1,beam_size]), rval[3*nmodels+3]],axis=0), rval[3*nmodels] 
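
# ---------------------------------------------------------------------------
# Sketch (illustrative, not part of this file) of the ensemble step used in
# lstm_multi_model_pred above: each model's softmax output is mixed with its
# 'comb_weight' and the beam search then runs on the log of the mixture.
# ---------------------------------------------------------------------------
import numpy as np

def combine_model_probs(logits_list, comb_weights):
    # logits_list: one (vocab,) logit vector per model; comb_weights: one scalar per model
    p_comb = np.zeros_like(logits_list[0], dtype=np.float64)
    for logits, w in zip(logits_list, comb_weights):
        e = np.exp(logits - logits.max())
        p_comb += w * (e / e.sum())          # weighted mixture of the softmaxes
    return np.log(p_comb + 1e-20)            # combined log-probabilities for the beam search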
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp',0):
            preact += tensor.dot(xAux,tparams[_p(prefix,'W_aux')])
  
        hL = [[]]*h_depth 
        cL = [[]]*h_depth 
        for di in xrange(h_depth):
            i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
            cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
            cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
            hL[di] = o * tensor.tanh(cL[di])
            if di < (h_depth - 1):
                preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                        tensor.dot(hL[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])
        
        c = tensor.concatenate(cL,axis=1)
        h = tensor.concatenate(hL,axis=1)
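        # With class-factored output (class_out_factoring), first pick the most
        # likely word class, then score only the words belonging to that class.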
  
        if options.get('class_out_factoring',0) == 1:
            pC    = tensor.dot(hL[-1],tparams['WdCls']) + tparams['bdCls']
            pCSft = tensor.nnet.softmax(pC)
            xCIdx =  tensor.argmax(pCSft)
            pW = tensor.dot(hL[-1],tparams['Wd'][:,xCIdx,:]) + tparams['bd'][:,xCIdx,:]
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f')
            pWSft = tensor.nnet.softmax(pW*smooth_factor)
            lProb = tensor.log(pWSft + 1e-20) + tensor.log(pCSft[0,xCIdx] + 1e-20)
        else:
            p = tensor.dot(hL[-1],tparams['Wd']) + tparams['bd']
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f')
            p = tensor.nnet.softmax(p*smooth_factor)
            lProb = tensor.log(p + 1e-20)

        if beam_size > 1:
            def _FindB_best(lPLcl, lPprev, dVLcl):
                srtLcl = tensor.argsort(-lPLcl)
                srtLcl = srtLcl[:beam_size]
                deltaVec = tensor.fill( lPLcl[srtLcl], numpy_floatX(-10000.))
                deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                lProbBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), lPLcl[srtLcl] + lPprev, deltaVec)
                xWIdxBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), srtLcl, tensor.zeros_like(srtLcl)) 
                return lProbBest, xWIdxBest 
  
            rvalLcl, updatesLcl = theano.scan(_FindB_best, sequences = [lProb, lP_, dV_], name=_p(prefix, 'FindBest'), n_steps=x_.shape[0])
            xWIdxBest = rvalLcl[1]
            lProbBest = rvalLcl[0]
  
            xWIdxBest = xWIdxBest.flatten()
            lProb = lProbBest.flatten()
            # Now sort and find the best among these best extensions for the current beams
            srtIdx = tensor.argsort(-lProb)
            srtIdx = srtIdx[:beam_size]
            xCandIdx = srtIdx // beam_size # Floor division 
            h = h.take(xCandIdx.flatten(),axis=0)
            c = c.take(xCandIdx.flatten(),axis=0)
            xWlogProb = lProb[srtIdx]
            xWIdx = xWIdxBest[srtIdx]
        else:
            xCandIdx = tensor.as_tensor_variable([0]) 
            lProb = lProb.flatten()
            xWIdx =  tensor.argmax(lProb,keepdims=True)
            xWlogProb = lProb[xWIdx] + lP_
            if options.get('class_out_factoring',0) == 1:
                clsoffset = tensor.as_tensor_variable(options['ixtoclsinfo'][:,0])
                xWIdx += clsoffset[xCIdx]
            h = h.take(xCandIdx.flatten(),axis=0)
            c = c.take(xCandIdx.flatten(),axis=0)
        
        if options.get('softmax_propogate',0) == 0:
            xW = tparams['Wemb'][xWIdx.flatten()]
        else:
            xW = p.dot(tparams['Wemb'])
        doneVec = tensor.eq(xWIdx,tensor.zeros_like(xWIdx))
  
        return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], theano.scan_module.until(doneVec.all())
  def lstm_advers_gen_layer(self, tparams, Xi, aux_input, options, beam_size, prefix='lstm'):
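    """Greedy decoder used by the adversarial generator.

    Feeds the image feature Xi once, then unrolls _stepP with theano.scan,
    taking the argmax word at every step until token 0 is produced or
    nMaxsteps-1 further steps have run. lP and dV are sized by beam_size, but
    this variant performs no beam search.
    """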
    nMaxsteps = options.get('maxlen',15)
    n_samples = 1 
    h_depth = options.get('hidden_depth',1)
    h_sz = options['hidden_size']
  
    # ----------------------  STEP FUNCTION  ---------------------- #
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp',0):
            preact += tensor.dot(xAux,tparams[_p(prefix,'W_aux')])
  
        hL = [[]]*h_depth 
        cL = [[]]*h_depth 
        for di in xrange(h_depth):
            i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
            cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
            cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
            hL[di] = o * tensor.tanh(cL[di])
            if di < (h_depth - 1):
                preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                        tensor.dot(hL[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])
        
        c = tensor.concatenate(cL,axis=1)
        h = tensor.concatenate(hL,axis=1)
  
        p = tensor.dot(hL[-1],tparams['Wd']) + tparams['bd']
        smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f')
        p = tensor.nnet.softmax(p*smooth_factor)
        lProb = tensor.log(p + 1e-20)

        #xCandIdx = tensor.as_tensor_variable([0]) 
        lProb = lProb.flatten()
        xWIdx =  tensor.argmax(lProb,keepdims=True)
        xWlogProb = lProb[xWIdx] + lP_
        
        if options.get('softmax_propogate',0) == 0:
            xW = tparams['Wemb'][xWIdx.flatten()]
        else:
            xW = p.dot(tparams['Wemb'])
        doneVec = tensor.eq(xWIdx,tensor.zeros_like(xWIdx))
  
        return [xW, h, c, xWlogProb, doneVec, xWIdx, p], theano.scan_module.until(doneVec.all())
    # ------------------- END of STEP FUNCTION  -------------------- #
    
    if options.get('en_aux_inp',0) == 0:
       aux_input = [] 
  
    h = tensor.alloc(numpy_floatX(0.),n_samples,h_sz*h_depth)
    c = tensor.alloc(numpy_floatX(0.),n_samples,h_sz*h_depth)
  
    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)
  
    # Propagate the image feature vector
    [xW, h, c, _, _, _, _], _ = _stepP(Xi, h, c, lP, dV,aux_input) 
    
    xWStart = tparams['Wemb'][0,:]
    [xW, h, c, lP, dV, idx0, p0], _ = _stepP(xWStart, h, c, lP, dV, aux_input) 
    
    #if options.get('en_aux_inp',0) == 1:
    #    aux_input = tensor.extra_ops.repeat(aux_input,beam_size,axis=0)
  
    # Now run the main decoding loop.
    rval, updates = theano.scan(_stepP, outputs_info=[xW, h, c, lP, dV, None, None], non_sequences = [aux_input], name=_p(prefix, 'predict_layers'), n_steps=nMaxsteps-1)
  
    return rval[3][-1], tensor.concatenate([idx0.reshape([1,beam_size]), rval[5]],axis=0), tensor.concatenate([tensor.shape_padleft(p0,n_ones=1),rval[6]],axis=0), updates
    def lstm_multi_model_pred(self,
                              tparams,
                              Xi,
                              aux_input,
                              options,
                              beam_size,
                              nmodels,
                              prefix='lstm'):
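        """Beam-search decoding with an ensemble of nmodels single-layer LSTMs.

        At every step each model's softmax is weighted by its 'comb_weight' and
        the combined distribution is used to extend the beams; the hidden and
        cell states of every model are re-indexed by the surviving beam
        candidates (xCandIdx).
        """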
        nMaxsteps = 30

        # ----------------------  STEP FUNCTION  ---------------------- #
        def _stepP(*in_list):
            x_inp = []
            h_inp = []
            c_inp = []
            for i in xrange(nmodels):
                x_inp.append(in_list[i])
                h_inp.append(in_list[nmodels + i])
                c_inp.append(in_list[2 * nmodels + i])
            lP_ = in_list[3 * nmodels]
            dV_ = in_list[3 * nmodels + 1]

            p_comb = tensor.alloc(numpy_floatX(0.), options[0]['output_size'])
            cf = []
            h = []
            xW = []
            for i in xrange(nmodels):
                preact = tensor.dot(h_inp[i], tparams[i][_p(prefix, 'W_hid')])
                preact += (
                    tensor.dot(x_inp[i], tparams[i][_p(prefix, 'W_inp')]) +
                    tparams[i][_p(prefix, 'b')])
                if options[i].get('en_aux_inp', 0):
                    preact += tensor.dot(aux_input2[i],
                                         tparams[i][_p(prefix, 'W_aux')])

                inp = tensor.nnet.sigmoid(
                    sliceT(preact, 0, options[i]['hidden_size']))
                f = tensor.nnet.sigmoid(
                    sliceT(preact, 1, options[i]['hidden_size']))
                o = tensor.nnet.sigmoid(
                    sliceT(preact, 2, options[i]['hidden_size']))
                c = tensor.tanh(sliceT(preact, 3, options[i]['hidden_size']))

                cf.append(f * c_inp[i] + inp * c)

                h.append(o * tensor.tanh(cf[i]))
                p = tensor.dot(h[i], tparams[i]['Wd']) + tparams[i]['bd']
                if i == 0:
                    p_comb = tparams[i]['comb_weight'] * tensor.nnet.softmax(p)
                else:
                    p_comb += tparams[i]['comb_weight'] * tensor.nnet.softmax(
                        p)

            lProb = tensor.log(p_comb + 1e-20)
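            # For each live beam, keep its beam_size best extensions; beams that
            # have already finished (dVLcl != 0) carry only their previous score
            # in slot 0 and are padded with -10000 so they are never re-expanded.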

            def _FindB_best(lPLcl, lPprev, dVLcl):
                srtLcl = tensor.argsort(-lPLcl)
                srtLcl = srtLcl[:beam_size]
                deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
                deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                                   lPLcl[srtLcl] + lPprev, deltaVec)
                xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                                   srtLcl, tensor.zeros_like(srtLcl))
                return lProbBest, xWIdxBest

            rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                              sequences=[lProb, lP_, dV_],
                                              name=_p(prefix, 'FindBest'),
                                              n_steps=x_inp[0].shape[0])
            xWIdxBest = rvalLcl[1]
            lProbBest = rvalLcl[0]

            xWIdxBest = xWIdxBest.flatten()
            lProb = lProbBest.flatten()

            # Now sort and find the best among these best extensions for the current beams
            srtIdx = tensor.argsort(-lProb)
            srtIdx = srtIdx[:beam_size]
            xWlogProb = lProb[srtIdx]

            xWIdx = xWIdxBest[srtIdx]
            xCandIdx = srtIdx // beam_size  # Floor division

            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            x_out = []
            h_out = []
            c_out = []
            for i in xrange(nmodels):
                x_out.append(tparams[i]['Wemb'][xWIdx.flatten()])
                h_out.append(h[i].take(xCandIdx.flatten(), axis=0))
                c_out.append(cf[i].take(xCandIdx.flatten(), axis=0))

            out_list = []
            out_list.extend(x_out)
            out_list.extend(h_out)
            out_list.extend(c_out)
            out_list.extend([xWlogProb, doneVec, xWIdx, xCandIdx])

            return out_list, theano.scan_module.until(doneVec.all())

        # ------------------- END of STEP FUNCTION  -------------------- #

        #Xi = tensor.extra_ops.repeat(Xi,beam_size,axis=0)

        lP = tensor.alloc(numpy_floatX(0.), beam_size)
        dV = tensor.alloc(np.int8(0.), beam_size)

        h_inp = []
        c_inp = []
        x_inp = []
        for i in xrange(nmodels):
            hidden_size = options[i]['hidden_size']
            h = theano.shared(np.zeros((1, hidden_size), dtype='float32'))
            c = theano.shared(np.zeros((1, hidden_size), dtype='float32'))
            h_inp.append(h)
            c_inp.append(c)
            x_inp.append(Xi[i])

        aux_input2 = aux_input

        in_list = []
        in_list.extend(x_inp)
        in_list.extend(h_inp)
        in_list.extend(c_inp)
        in_list.append(lP)
        in_list.append(dV)

        # Propagate the image feature vector
        out_list, _ = _stepP(*in_list)

        for i in xrange(nmodels):
            h_inp[i] = out_list[nmodels + i]
            c_inp[i] = out_list[2 * nmodels + i]

        x_inp = []
        for i in xrange(nmodels):
            x_inp.append(tparams[i]['Wemb'][[0]])
            h_inp[i] = h_inp[i][:1, :]
            c_inp[i] = c_inp[i][:1, :]
            #if options[i].get('en_aux_inp',0):
            #  aux_input2.append(aux_input[i])

        in_list = []
        in_list.extend(x_inp)
        in_list.extend(h_inp)
        in_list.extend(c_inp)
        in_list.append(lP)
        in_list.append(dV)

        out_list, _ = _stepP(*in_list)
        aux_input2 = []
        for i in xrange(nmodels):
            x_inp[i] = out_list[i]
            h_inp[i] = out_list[nmodels + i]
            c_inp[i] = out_list[2 * nmodels + i]
            aux_input2.append(
                tensor.extra_ops.repeat(aux_input[i], beam_size, axis=0))
        lP = out_list[3 * nmodels]
        dV = out_list[3 * nmodels + 1]
        idx0 = out_list[3 * nmodels + 2]
        cand0 = out_list[3 * nmodels + 3]

        in_list = []
        in_list.extend(x_inp)
        in_list.extend(h_inp)
        in_list.extend(c_inp)
        in_list.append(lP)
        in_list.append(dV)
        in_list.append(None)
        in_list.append(None)

        # Now run the main decoding loop.
        rval, updates = theano.scan(_stepP,
                                    outputs_info=in_list,
                                    name=_p(prefix, 'predict_layers'),
                                    n_steps=nMaxsteps)

        return rval[3 * nmodels][-1], tensor.concatenate(
            [idx0.reshape([1, beam_size]), rval[3 * nmodels + 2]],
            axis=0), tensor.concatenate(
                [cand0.reshape([1, beam_size]), rval[3 * nmodels + 3]],
                axis=0), rval[3 * nmodels]
    def lstm_advers_gen_layer(self, tparams, xI, xAux, options, prefix='lstm'):
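        """Sample captions from the adversarial generator.

        The image feature xI (and its W_aux projection) is repeated
        n_gen_samples times per batch entry, optional input noise is added to
        the pre-activations, and the LSTM is unrolled with theano.scan. With
        use_gumbel_mse (and not 'greedy') the per-step distributions come from
        gumbel_softmax_sample, keeping the word choice differentiable;
        otherwise a plain softmax argmax is used.
        """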
        nBatchSamps = xI.shape[0]
        nMaxsteps = options.get('maxlen', 15)
        if nMaxsteps is None:
            nMaxsteps = 30
        n_samp = options.get('n_gen_samples', 1)

        h_depth = options.get('hidden_depth', 1)
        h_sz = options['hidden_size']

        # ----------------------  STEP FUNCTION  ---------------------- #
        def _stepP(U, xW_, h_, c_, lP_, dV_, xAux, xNoise):
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += (tensor.dot(xW_, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])
            preact += xAux
            if options.get('gen_input_noise', 0):
                preact += xNoise

            hL = [[]] * h_depth
            cL = [[]] * h_depth
            outp = [[]] * h_depth
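            # sliceT picks consecutive h_sz-wide column blocks of preact: input,
            # forget and output gates plus the candidate cell; for deeper layers
            # preact is recomputed below from the previous depth's output.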
            for di in xrange(h_depth):
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
                hL[di] = o * tensor.tanh(cL[di])
                outp[di] = hL[di]
                if options.get('en_residual_conn', 1):
                    if (di > 0):
                        outp[di] += outp[di - 1]
                        print "Connecting residual at %d" % (di)
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(outp[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            c = tensor.concatenate(cL, axis=1)
            h = tensor.concatenate(hL, axis=1)

            logits = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd']
            #p = tensor.dot(outp[-1],l2norm(tparams['Wd'],axis=0))# + tparams['bd']
            if options.get('use_gumbel_mse', 0) == 0 or options.get(
                    'greedy', 0):
                p = tensor.nnet.softmax(logits)
            else:
                p = gumbel_softmax_sample(
                    self.trng, logits * self.softmax_smooth_factor,
                    self.gumb_temp, U, options.get('use_gumbel_hard', False))

            if options.get('computelogprob', 0):
                lProb = tensor.log(
                    tensor.nnet.softmax(logits * self.softmax_smooth_factor) +
                    1e-20)
            else:
                lProb = logits

            # Greedy choice of the next word; ~dV_ forces it to token 0 for
            # samples that have already finished.
            xWIdx = ~dV_ * tensor.argmax(p, axis=-1)

            xWlogProb = ~dV_ * lProb[tensor.arange(nBatchSamps * n_samp),
                                     xWIdx] + lP_
            #xW = tparams['Wemb'][xWIdx.flatten()]
            if options.get('use_gumbel_hard', 0) and options.get(
                    'use_gumbel_mse', 0) and not options.get('greedy', 0):
                xW = p.dot(tparams['Wemb'])
            else:
                xW = theano.gradient.disconnected_grad(
                    tparams['Wemb'][xWIdx.flatten()].reshape(
                        [xWIdx.shape[0], -1]))

            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            return [xW, h, c, xWlogProb, doneVec, xWIdx,
                    p], theano.scan_module.until(doneVec.all())

        # ------------------- END of STEP FUNCTION  -------------------- #
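        # Pre-draw the uniform noise consumed by gumbel_softmax_sample (one slice
        # per time step); when Gumbel sampling is disabled only a dummy
        # (nMaxsteps x 1) matrix is drawn so the scan's sequence keeps the same form.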

        if options.get('use_gumbel_mse', 0) == 0:
            U = self.trng.uniform((nMaxsteps, 1),
                                  low=0.,
                                  high=1.,
                                  dtype=theano.config.floatX)
        else:
            U = self.trng.uniform((nMaxsteps + 1, nBatchSamps * n_samp,
                                   options['vocabulary_size']),
                                  low=0.,
                                  high=1.,
                                  dtype=theano.config.floatX)

        xI = tensor.extra_ops.repeat(xI, n_samp, axis=0)
        xAux = tensor.extra_ops.repeat(tensor.dot(xAux,
                                                  tparams[_p(prefix,
                                                             'W_aux')]),
                                       n_samp,
                                       axis=0)

        if options.get('gen_input_noise', 0):
            xNoise = tensor.dot(
                self.trng.normal([nBatchSamps * n_samp, self.noise_dim]),
                tparams[_p(prefix, 'W_noise')])
        else:
            xNoise = []

        if options.get('gen_use_rand_init',
                       0) and not options.get('gen_input_noise', 0):
            h = tensor.unbroadcast(
                self.trng.uniform([nBatchSamps * n_samp, h_sz * h_depth],
                                  low=-0.1,
                                  high=0.1), 0, 1)
            c = tensor.unbroadcast(
                self.trng.uniform([nBatchSamps * n_samp, h_sz * h_depth],
                                  low=-0.1,
                                  high=0.1), 0, 1)
        else:
            h = tensor.zeros([nBatchSamps * n_samp, h_sz * h_depth])
            c = tensor.zeros([nBatchSamps * n_samp, h_sz * h_depth])

        lP = tensor.alloc(numpy_floatX(0.), nBatchSamps * n_samp)
        dV = tensor.alloc(np.bool_(0.), nBatchSamps * n_samp)

        # Propagate the image feature vector
        [_, h, c, _, _, _, _], _ = _stepP(U[0, :], xI, h, c, lP, dV, xAux,
                                          xNoise)

        xWStart = tensor.unbroadcast(
            tensor.tile(tparams['Wemb'][[0]], [nBatchSamps * n_samp, 1]), 0, 1)

        # Now run the main sampling loop.
        rval, updates = theano.scan(
            _stepP,
            sequences=[U[1:, :]],
            outputs_info=[xWStart, h, c, lP, dV, None, None],
            non_sequences=[xAux, xNoise],
            name=_p(prefix, 'adv_predict_layers'),
            n_steps=nMaxsteps)
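        # Per-sample length = first time step whose done flag is set (a sentinel
        # row of ones guarantees one exists), plus one; disconnected_grad keeps
        # gradients from flowing through this bookkeeping.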

        seq_lengths = theano.gradient.disconnected_grad(
            tensor.argmax(tensor.concatenate(
                [rval[4][:-1, :],
                 tensor.ones((1, nBatchSamps * n_samp))],
                axis=0),
                          axis=0) + 1)

        return rval[3][-1], rval[5], rval[6], updates, seq_lengths
    def lstm_layer(self,
                   tparams,
                   state_below,
                   aux_input,
                   use_noise,
                   options,
                   prefix='lstm',
                   mask=None):
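        """Stacked LSTM unrolled over time with theano.scan.

        state_below holds the input sequence (steps x samples x input dim) and
        is projected through W_inp plus bias before the scan; aux_input, when
        en_aux_inp is set, is an extra per-sample feature added to the
        pre-activations at every step. As written, the mask m_ is passed to
        _step but not applied inside it.
        """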
        nsteps = state_below.shape[0]
        h_depth = options.get('hidden_depth', 1)
        h_sz = options['hidden_size']

        if state_below.ndim == 3:
            n_samples = state_below.shape[1]
        else:
            n_samples = 1

        assert mask is not None

        def _step(m_, x_, h_, c_, xAux):
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += x_
            if options.get('en_aux_inp', 0):
                preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            #  preact += tparams[_p(prefix, 'b')]
            h = [[]] * h_depth
            c = [[]] * h_depth

            for di in xrange(h_depth):
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                c[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                c[di] = f * sliceT(c_, di, h_sz) + i * c[di]
                h[di] = o * tensor.tanh(c[di])
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(h[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            c_out = tensor.concatenate(c, axis=1)
            h_out = tensor.concatenate(h, axis=1)

            return h_out, c_out

        state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])

        if options.get('en_aux_inp', 0) == 0:
            aux_input = []

        rval, updates = theano.scan(
            _step,
            sequences=[mask, state_below],
            outputs_info=[
                tensor.alloc(numpy_floatX(0.), n_samples, h_depth * h_sz),
                tensor.alloc(numpy_floatX(0.), n_samples, h_depth * h_sz),
                #tensor.alloc(numpy_floatX(0.),n_samples,options['output_size'])],
            ],
            non_sequences=[aux_input],
            name=_p(prefix, '_layers'),
            n_steps=nsteps)
        return rval, updates