def rmsprop(self, lr, tparams, grads, inp_list, cost, params):
    """Build the Theano functions for one RMSprop optimisation step.

    Args:
        lr: symbolic learning rate; the only input of the update function.
        tparams: mapping of parameter name to Theano shared variable. It is
            walked with ``iteritems()``/``values()`` and paired positionally
            with ``grads``.
        grads: symbolic gradients, one per entry of ``tparams``.
        inp_list: symbolic inputs needed to evaluate ``cost``.
        cost: symbolic cost, returned by the gradient function.
        params: dict providing 'grad_clip', 'decay_rate' and 'smooth_eps'.

    Returns:
        Tuple ``(f_grad_shared, f_update, zipped_grads, running_grads2,
        updir)``. ``f_grad_shared`` returns the cost and stores the gradients
        plus the running average of their squares; ``f_update`` moves every
        parameter by ``-lr * grad / (sqrt(running average) + smooth_eps)``.
    """
    clip = params['grad_clip']
    decay_rate = params['decay_rate']
    smooth_eps = params['smooth_eps']

    def _zero_shared(name_fmt):
        # One zero-valued shared variable per parameter, shaped like it.
        return [theano.shared(p.get_value() * numpy_floatX(0.),
                              name=name_fmt % k)
                for k, p in tparams.iteritems()]

    zipped_grads = _zero_shared('%s_grad')
    running_grads2 = _zero_shared('%s_rgrad2')

    # Clipping (when enabled) only affects the squared-gradient statistic;
    # the gradient stored in zipped_grads stays unclipped.
    if clip > 0:
        squared = [tensor.clip(g, -clip, clip) ** 2 for g in grads]
    else:
        squared = [g ** 2 for g in grads]

    store_grads = list(zip(zipped_grads, grads))
    store_rms = [(acc, decay_rate * acc + (1 - decay_rate) * sq)
                 for acc, sq in zip(running_grads2, squared)]

    f_grad_shared = theano.function(inp_list, cost,
                                    updates=store_grads + store_rms,
                                    name='rmsprop_f_grad_shared')

    updir = _zero_shared('%s_updir')
    steps = [-lr * zg / (tensor.sqrt(acc) + smooth_eps)
             for zg, acc in zip(zipped_grads, running_grads2)]
    store_steps = list(zip(updir, steps))
    move_params = [(p, p + step)
                   for p, step in zip(tparams.values(), steps)]

    f_update = theano.function([lr], [],
                               updates=store_steps + move_params,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update, zipped_grads, running_grads2, updir
    def lstm_enc_layer(self, tparams, state_below, prefix='lstm'):
        """Build the scan that runs a stacked LSTM encoder over a sequence.

        Args:
            tparams: mapping of parameter name to Theano variable; the keys
                are formed with ``_p(prefix, ...)``.
            state_below: symbolic input whose first axis is iterated by scan.
                When it is 3-dimensional its second axis gives the number of
                samples, otherwise one sample is used.
            prefix: name prefix of this layer's parameters.

        Returns:
            ``(rval, updates)`` as produced by ``theano.scan``; ``rval`` holds
            the per-step ``h_out`` and ``c_out`` of the step function.
        """
        nsteps = state_below.shape[0]
        depth = self.hidden_depth
        width = self.hidden_size
        n_samples = state_below.shape[1] if state_below.ndim == 3 else 1

        def _step(x_in, h_, c_):
            # First-layer gates: recurrent term plus the projected input
            # (the bias is already folded into the projection below).
            gates = tensor.dot(sliceT(h_, 0, width),
                               tparams[_p(prefix, 'W_hid')]) + x_in

            hiddens = []
            cells = []
            outputs = []
            for layer in xrange(depth):
                in_gate = tensor.nnet.sigmoid(sliceT(gates, 0, width))
                forget_gate = tensor.nnet.sigmoid(sliceT(gates, 1, width))
                out_gate = tensor.nnet.sigmoid(sliceT(gates, 2, width))
                candidate = tensor.tanh(sliceT(gates, 3, width))

                cell = (forget_gate * sliceT(c_, layer, width) +
                        in_gate * candidate)
                hidden = out_gate * tensor.tanh(cell)

                # Layer output is the hidden state, plus the output of the
                # layer below when residual connections are enabled.
                layer_out = hidden
                if self.en_residual_conn and layer > 0:
                    layer_out = layer_out + outputs[-1]
                    print("Connecting residual at %d" % (layer))

                cells.append(cell)
                hiddens.append(hidden)
                outputs.append(layer_out)

                if layer < (depth - 1):
                    nxt = str(layer + 1)
                    gates = (tensor.dot(sliceT(h_, layer + 1, width),
                                        tparams[_p(prefix, 'W_hid_' + nxt)]) +
                             tensor.dot(layer_out,
                                        tparams[_p(prefix, 'W_inp_' + nxt)]))

            c_out = tensor.concatenate(cells, axis=1)
            # The top layer's output is appended after the per-layer hiddens.
            h_out = tensor.concatenate(hiddens + [outputs[-1]], axis=1)
            return h_out, c_out

        # Project the whole input sequence once, outside the recurrence.
        state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])

        h_init = tensor.alloc(numpy_floatX(0.), n_samples,
                              (depth + 1) * width)
        c_init = tensor.alloc(numpy_floatX(0.), n_samples, depth * width)

        rval, updates = theano.scan(_step,
                                    sequences=[state_below],
                                    outputs_info=[h_init, c_init],
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps)
        return rval, updates
  def lstm_layer(self, tparams, state_below, aux_input, use_noise, options, prefix='lstm', mask=None):
    """Build the scan that runs a stacked LSTM over an input sequence.

    Args:
        tparams: mapping of parameter name to Theano variable; keys are
            formed with ``_p(prefix, ...)``.
        state_below: symbolic input iterated along its first axis. When it is
            3-dimensional its second axis gives the number of samples.
        aux_input: auxiliary input handed to every step; replaced by an empty
            list when ``options['en_aux_inp']`` is unset or 0.
        use_noise: not read by this function.
        options: dict read for 'hidden_depth' (default 1), 'hidden_size' and
            'en_aux_inp'.
        prefix: name prefix of this layer's parameters.
        mask: required (asserted); passed to scan as a sequence, although the
            step function does not read it.

    Returns:
        ``(rval, updates)`` as produced by ``theano.scan``.
    """
    nsteps = state_below.shape[0]
    depth = options.get('hidden_depth', 1)
    width = options['hidden_size']
    n_samples = state_below.shape[1] if state_below.ndim == 3 else 1

    assert mask is not None

    def _step(m_, x_, h_, c_, xAux):
        # First-layer gates: recurrent term, projected input and, when
        # enabled, the projected auxiliary input.
        gates = tensor.dot(sliceT(h_, 0, width),
                           tparams[_p(prefix, 'W_hid')]) + x_
        if options.get('en_aux_inp', 0):
            gates = gates + tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

        hiddens = []
        cells = []
        for layer in xrange(depth):
            in_gate = tensor.nnet.sigmoid(sliceT(gates, 0, width))
            forget_gate = tensor.nnet.sigmoid(sliceT(gates, 1, width))
            out_gate = tensor.nnet.sigmoid(sliceT(gates, 2, width))
            candidate = tensor.tanh(sliceT(gates, 3, width))

            cell = forget_gate * sliceT(c_, layer, width) + in_gate * candidate
            hidden = out_gate * tensor.tanh(cell)
            cells.append(cell)
            hiddens.append(hidden)

            if layer < (depth - 1):
                nxt = str(layer + 1)
                gates = (tensor.dot(sliceT(h_, layer + 1, width),
                                    tparams[_p(prefix, 'W_hid_' + nxt)]) +
                         tensor.dot(hidden,
                                    tparams[_p(prefix, 'W_inp_' + nxt)]))

        c_out = tensor.concatenate(cells, axis=1)
        h_out = tensor.concatenate(hiddens, axis=1)
        return h_out, c_out

    # Project the whole input sequence once, outside the recurrence.
    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])

    if options.get('en_aux_inp', 0) == 0:
        aux_input = []

    h_init = tensor.alloc(numpy_floatX(0.), n_samples, depth * width)
    c_init = tensor.alloc(numpy_floatX(0.), n_samples, depth * width)

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[h_init, c_init],
                                non_sequences=[aux_input],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval, updates
 def _FindB_best(lPLcl, lPprev, dVLcl):
     """Pick the ``beam_size`` best extensions of a single beam.

     ``lPLcl`` holds the log-probabilities of the candidate extensions,
     ``lPprev`` the beam's accumulated log-probability and ``dVLcl`` its done
     flag. While the flag equals zero the top candidates and their summed
     scores are returned. Otherwise the scores are ``lPprev`` followed by
     -10000 fillers and every returned index is 0.
     """
     top_idx = tensor.argsort(-lPLcl)[:beam_size]
     top_scores = lPLcl[top_idx]

     # Finished beam: keep one entry carrying the old score, bury the rest.
     frozen = tensor.fill(top_scores, numpy_floatX(-10000.))
     frozen = tensor.set_subtensor(frozen[0], lPprev)

     still_open = tensor.eq(dVLcl, tensor.zeros_like(dVLcl))
     lProbBest = ifelse(still_open, top_scores + lPprev, frozen)
     xWIdxBest = ifelse(still_open, top_idx, tensor.zeros_like(top_idx))
     return lProbBest, xWIdxBest
    def rmsprop(self, lr, tparams, grads, inp_list, cost, params):
        """Build the Theano functions for one RMSprop optimisation step.

        Args:
            lr: symbolic learning rate; the only input of the update function.
            tparams: mapping of parameter name to Theano shared variable. It
                is walked with ``iteritems()``/``values()`` and paired
                positionally with ``grads``.
            grads: symbolic gradients, one per entry of ``tparams``.
            inp_list: symbolic inputs needed to evaluate ``cost``.
            cost: symbolic cost, returned by the gradient function.
            params: dict providing 'grad_clip', 'decay_rate' and 'smooth_eps'.

        Returns:
            Tuple ``(f_grad_shared, f_update, zipped_grads, running_grads2,
            updir)``. ``f_grad_shared`` returns the cost and stores the
            gradients plus the running average of their squares; ``f_update``
            moves every parameter by
            ``-lr * grad / (sqrt(running average) + smooth_eps)``.
        """
        clip = params['grad_clip']
        decay_rate = params['decay_rate']
        smooth_eps = params['smooth_eps']

        def _zero_shared(name_fmt):
            # One zero-valued shared variable per parameter, shaped like it.
            return [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name=name_fmt % k)
                    for k, p in tparams.iteritems()]

        zipped_grads = _zero_shared('%s_grad')
        running_grads2 = _zero_shared('%s_rgrad2')

        # Clipping (when enabled) only affects the squared-gradient
        # statistic; the gradient stored in zipped_grads stays unclipped.
        if clip > 0:
            squared = [tensor.clip(g, -clip, clip) ** 2 for g in grads]
        else:
            squared = [g ** 2 for g in grads]

        store_grads = list(zip(zipped_grads, grads))
        store_rms = [(acc, decay_rate * acc + (1 - decay_rate) * sq)
                     for acc, sq in zip(running_grads2, squared)]

        f_grad_shared = theano.function(inp_list, cost,
                                        updates=store_grads + store_rms,
                                        name='rmsprop_f_grad_shared')

        updir = _zero_shared('%s_updir')
        steps = [-lr * zg / (tensor.sqrt(acc) + smooth_eps)
                 for zg, acc in zip(zipped_grads, running_grads2)]
        store_steps = list(zip(updir, steps))
        move_params = [(p, p + step)
                       for p, step in zip(tparams.values(), steps)]

        f_update = theano.function([lr], [],
                                   updates=store_steps + move_params,
                                   on_unused_input='ignore',
                                   name='rmsprop_f_update')

        return f_grad_shared, f_update, zipped_grads, running_grads2, updir
 def _FindB_best(lPLcl, lPprev, dVLcl):
     """Pick the ``beam_size`` best extensions of a single beam.

     ``lPLcl`` holds the log-probabilities of the candidate extensions,
     ``lPprev`` the beam's accumulated log-probability and ``dVLcl`` its done
     flag. While the flag equals zero the top candidates and their summed
     scores are returned. Otherwise the scores are ``lPprev`` followed by
     -10000 fillers and every returned index is 0.
     """
     top_idx = tensor.argsort(-lPLcl)[:beam_size]
     top_scores = lPLcl[top_idx]

     # Finished beam: keep one entry carrying the old score, bury the rest.
     frozen = tensor.fill(top_scores, numpy_floatX(-10000.))
     frozen = tensor.set_subtensor(frozen[0], lPprev)

     still_open = tensor.eq(dVLcl, tensor.zeros_like(dVLcl))
     lProbBest = ifelse(still_open, top_scores + lPprev, frozen)
     xWIdxBest = ifelse(still_open, top_idx, tensor.zeros_like(top_idx))
     return lProbBest, xWIdxBest
        def _stepP(x_, h_, c_, lP_, dV_, xAux):
            """Advance the stacked LSTM one step and greedily pick a word.

            Gate pre-activations are built from the previous hidden state
            ``h_``, the input ``x_`` and, when ``en_aux_inp`` is set, ``xAux``.
            Every layer is then evaluated and the argmax of the smoothed
            softmax over the top layer is chosen. ``dV_`` is accepted but not
            read here.

            Returns:
                ``[xW, h, c, xWlogProb, doneVec, xWIdx, p]`` plus a scan
                ``until`` condition that fires once every chosen index is 0.
            """
            recurrent = tensor.dot(sliceT(h_, 0, h_sz),
                                   tparams[_p(prefix, 'W_hid')])
            gates = recurrent + (
                tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                tparams[_p(prefix, 'b')])
            if options.get('en_aux_inp', 0):
                gates = gates + tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            hiddens = []
            cells = []
            for layer in xrange(h_depth):
                in_gate = tensor.nnet.sigmoid(sliceT(gates, 0, h_sz))
                forget_gate = tensor.nnet.sigmoid(sliceT(gates, 1, h_sz))
                out_gate = tensor.nnet.sigmoid(sliceT(gates, 2, h_sz))
                candidate = tensor.tanh(sliceT(gates, 3, h_sz))

                cell = (forget_gate * sliceT(c_, layer, h_sz) +
                        in_gate * candidate)
                hidden = out_gate * tensor.tanh(cell)
                cells.append(cell)
                hiddens.append(hidden)

                if layer < (h_depth - 1):
                    nxt = str(layer + 1)
                    gates = (tensor.dot(sliceT(h_, layer + 1, h_sz),
                                        tparams[_p(prefix, 'W_hid_' + nxt)]) +
                             tensor.dot(hidden,
                                        tparams[_p(prefix, 'W_inp_' + nxt)]))

            c = tensor.concatenate(cells, axis=1)
            h = tensor.concatenate(hiddens, axis=1)

            # Word distribution from the top layer, sharpened or flattened by
            # the optional smoothing factor.
            logits = tensor.dot(hiddens[-1], tparams['Wd']) + tparams['bd']
            smooth_factor = tensor.as_tensor_variable(
                numpy_floatX(options.get('softmax_smooth_factor', 1.0)),
                name='sm_f')
            p = tensor.nnet.softmax(logits * smooth_factor)
            lProb = tensor.log(p + 1e-20).flatten()

            xWIdx = tensor.argmax(lProb, keepdims=True)
            xWlogProb = lProb[xWIdx] + lP_

            # Next input: embedding of the chosen word, or the softmax-weighted
            # mix of all embeddings when 'softmax_propogate' is enabled.
            if options.get('softmax_propogate', 0) == 0:
                xW = tparams['Wemb'][xWIdx.flatten()]
            else:
                xW = p.dot(tparams['Wemb'])

            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
            stop = theano.scan_module.until(doneVec.all())
            return [xW, h, c, xWlogProb, doneVec, xWIdx, p], stop
Example #8
0
  def rmsprop(self, lr, tparams, grads, inp_list, cost, params, prior_updates=None, w_clip=None):
    """Build the Theano functions for one RMSprop step, with optional extras.

    Args:
        lr: symbolic learning rate; the only input of the update function.
        tparams: mapping of parameter name to Theano shared variable. It is
            walked with ``iteritems()``/``values()`` and paired positionally
            with ``grads``.
        grads: symbolic gradients, one per entry of ``tparams``.
        inp_list: symbolic inputs needed to evaluate ``cost``.
        cost: symbolic cost, returned by the gradient function.
        params: dict providing 'grad_clip', 'decay_rate' and 'smooth_eps'.
        prior_updates: optional list of extra ``(shared, expression)`` updates
            appended to the gradient function's updates. ``None`` (the
            default) means no extra updates.
        w_clip: when not ``None``, every updated parameter is clipped to
            ``[-w_clip, w_clip]``.

    Returns:
        Tuple ``(f_grad_shared, f_update, zipped_grads, running_grads2,
        updir)``.
    """
    # Use a fresh list per call instead of a shared mutable default argument.
    extra_updates = [] if prior_updates is None else prior_updates

    clip = params['grad_clip']
    decay_rate = tensor.constant(params['decay_rate'], dtype=theano.config.floatX)
    smooth_eps = tensor.constant(params['smooth_eps'], dtype=theano.config.floatX)
    zipped_grads = [theano.shared(np.zeros_like(p.get_value()),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(np.zeros_like(p.get_value()),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]
    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    # The running mean square is clamped at zero from below. Gradient clipping
    # (when enabled) only enters this statistic, not the stored gradient.
    if clip > 0.0:
        rg2up = [(rg2, tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (tensor.clip(g, -clip, clip) ** 2), 0.0, np.inf))
                 for rg2, g in zip(running_grads2, grads)]
    else:
        rg2up = [(rg2, tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (g ** 2), 0.0, np.inf))
                 for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp_list, cost,
                                    updates=zgup + rg2up + extra_updates,
                                    name='rmsprop_f_grad_shared')

    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                           name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, -lr * zg / (tensor.sqrt(rg2) + smooth_eps))
                 for ud, zg, rg2 in zip(updir, zipped_grads,
                                        running_grads2)]
    if w_clip is not None:
        print('clipping weights with %.2f in RMS-PROP' % (w_clip))
        param_up = [(p, tensor.clip(p + udn[1], -w_clip, w_clip))
                    for p, udn in zip(tparams.values(), updir_new)]
    else:
        param_up = [(p, p + udn[1])
                    for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update, zipped_grads, running_grads2, updir
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        """Advance the stacked LSTM one step and greedily pick a word.

        Gate pre-activations are built from the previous hidden state ``h_``,
        the input ``x_`` and, when ``en_aux_inp`` is set, ``xAux``. Every layer
        is then evaluated and the argmax of the smoothed softmax over the top
        layer is chosen. ``dV_`` is accepted but not read here.

        Returns:
            ``[xW, h, c, xWlogProb, doneVec, xWIdx, p]`` plus a scan ``until``
            condition that fires once every chosen index is 0.
        """
        recurrent = tensor.dot(sliceT(h_, 0, h_sz),
                               tparams[_p(prefix, 'W_hid')])
        gates = recurrent + (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                             tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp', 0):
            gates = gates + tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

        hiddens = []
        cells = []
        for layer in xrange(h_depth):
            in_gate = tensor.nnet.sigmoid(sliceT(gates, 0, h_sz))
            forget_gate = tensor.nnet.sigmoid(sliceT(gates, 1, h_sz))
            out_gate = tensor.nnet.sigmoid(sliceT(gates, 2, h_sz))
            candidate = tensor.tanh(sliceT(gates, 3, h_sz))

            cell = forget_gate * sliceT(c_, layer, h_sz) + in_gate * candidate
            hidden = out_gate * tensor.tanh(cell)
            cells.append(cell)
            hiddens.append(hidden)

            if layer < (h_depth - 1):
                nxt = str(layer + 1)
                gates = (tensor.dot(sliceT(h_, layer + 1, h_sz),
                                    tparams[_p(prefix, 'W_hid_' + nxt)]) +
                         tensor.dot(hidden,
                                    tparams[_p(prefix, 'W_inp_' + nxt)]))

        c = tensor.concatenate(cells, axis=1)
        h = tensor.concatenate(hiddens, axis=1)

        # Word distribution from the top layer, sharpened or flattened by the
        # optional smoothing factor.
        logits = tensor.dot(hiddens[-1], tparams['Wd']) + tparams['bd']
        smooth_factor = tensor.as_tensor_variable(
            numpy_floatX(options.get('softmax_smooth_factor', 1.0)),
            name='sm_f')
        p = tensor.nnet.softmax(logits * smooth_factor)
        lProb = tensor.log(p + 1e-20).flatten()

        xWIdx = tensor.argmax(lProb, keepdims=True)
        xWlogProb = lProb[xWIdx] + lP_

        # Next input: embedding of the chosen word, or the softmax-weighted
        # mix of all embeddings when 'softmax_propogate' is enabled.
        if options.get('softmax_propogate', 0) == 0:
            xW = tparams['Wemb'][xWIdx.flatten()]
        else:
            xW = p.dot(tparams['Wemb'])

        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
        stop = theano.scan_module.until(doneVec.all())
        return [xW, h, c, xWlogProb, doneVec, xWIdx, p], stop
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        """Advance the stacked LSTM one step and choose the next word(s).

        Gate pre-activations are built from the previous hidden state ``h_``,
        the input ``x_`` and, when ``en_aux_inp`` is set, ``xAux``. The word
        log-probabilities come either from a plain softmax over the top layer
        or, with ``class_out_factoring``, from a class softmax followed by a
        word softmax restricted to the most likely class. With
        ``beam_size > 1`` the best extensions of every beam are ranked and the
        overall top ``beam_size`` kept; otherwise the single argmax is taken.

        Args:
            x_: input for this step.
            h_: previous hidden states of all layers, concatenated.
            c_: previous cell states of all layers, concatenated.
            lP_: accumulated log-probabilities so far.
            dV_: done flags; only read by the beam-search branch.
            xAux: auxiliary input, used when ``en_aux_inp`` is set.

        Returns:
            ``[xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx]`` plus a scan
            ``until`` condition that fires once every chosen index is 0.
        """
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp', 0):
            preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

        hL = [[]] * h_depth
        cL = [[]] * h_depth
        for di in xrange(h_depth):
            i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
            cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
            cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
            hL[di] = o * tensor.tanh(cL[di])
            if di < (h_depth - 1):
                preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                        tensor.dot(hL[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

        c = tensor.concatenate(cL, axis=1)
        h = tensor.concatenate(hL, axis=1)

        if options.get('class_out_factoring', 0) == 1:
            pC = tensor.dot(hL[-1], tparams['WdCls']) + tparams['bdCls']
            pCSft = tensor.nnet.softmax(pC)
            xCIdx = tensor.argmax(pCSft)
            # Use the top layer's hidden state (hL[-1]), as the class softmax
            # and the non-factored branch do. Indexing the concatenated `h`
            # with [-1] would select its last row instead.
            pW = tensor.dot(hL[-1], tparams['Wd'][:, xCIdx, :]) + tparams['bd'][:, xCIdx, :]
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor', 1.0)), name='sm_f')
            pWSft = tensor.nnet.softmax(pW * smooth_factor)
            # Joint log-probability: word given class, plus the class itself.
            lProb = tensor.log(pWSft + 1e-20) + tensor.log(pCSft[0, xCIdx] + 1e-20)
        else:
            p = tensor.dot(hL[-1], tparams['Wd']) + tparams['bd']
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor', 1.0)), name='sm_f')
            p = tensor.nnet.softmax(p * smooth_factor)
            lProb = tensor.log(p + 1e-20)

        if beam_size > 1:
            def _FindB_best(lPLcl, lPprev, dVLcl):
                # Top `beam_size` extensions of one beam. A finished beam
                # (non-zero done flag) yields its old score once, -10000
                # fillers elsewhere, and index 0 throughout.
                srtLcl = tensor.argsort(-lPLcl)
                srtLcl = srtLcl[:beam_size]
                deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
                deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), lPLcl[srtLcl] + lPprev, deltaVec)
                xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), srtLcl, tensor.zeros_like(srtLcl))
                return lProbBest, xWIdxBest

            rvalLcl, updatesLcl = theano.scan(_FindB_best, sequences=[lProb, lP_, dV_], name=_p(prefix, 'FindBest'), n_steps=x_.shape[0])
            xWIdxBest = rvalLcl[1]
            lProbBest = rvalLcl[0]

            xWIdxBest = xWIdxBest.flatten()
            lProb = lProbBest.flatten()
            # Now sort and find the best among these best extensions for the current beams
            srtIdx = tensor.argsort(-lProb)
            srtIdx = srtIdx[:beam_size]
            # Each beam contributed beam_size candidates, so floor division
            # recovers the index of the beam a candidate came from.
            xCandIdx = srtIdx // beam_size
            h = h.take(xCandIdx.flatten(), axis=0)
            c = c.take(xCandIdx.flatten(), axis=0)
            xWlogProb = lProb[srtIdx]
            xWIdx = xWIdxBest[srtIdx]
        else:
            xCandIdx = tensor.as_tensor_variable([0])
            lProb = lProb.flatten()
            xWIdx = tensor.argmax(lProb, keepdims=True)
            xWlogProb = lProb[xWIdx] + lP_
            if options.get('class_out_factoring', 0) == 1:
                # Shift the within-class index by the class's offset taken
                # from the first column of options['ixtoclsinfo'].
                clsoffset = tensor.as_tensor_variable(options['ixtoclsinfo'][:, 0])
                xWIdx += clsoffset[xCIdx]
            h = h.take(xCandIdx.flatten(), axis=0)
            c = c.take(xCandIdx.flatten(), axis=0)

        if options.get('softmax_propogate', 0) == 0:
            xW = tparams['Wemb'][xWIdx.flatten()]
        else:
            # NOTE(review): `p` is only assigned in the non-factored branch
            # above, so enabling both 'class_out_factoring' and
            # 'softmax_propogate' would hit an unbound name here.
            xW = p.dot(tparams['Wemb'])
        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

        return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], theano.scan_module.until(doneVec.all())
        def _stepP(*in_list):
            """Run one beam-search step over an ensemble of ``nmodels`` LSTMs.

            ``in_list`` is laid out as ``nmodels`` inputs, then ``nmodels``
            hidden states, then ``nmodels`` cell states, followed by the
            accumulated log-probabilities and the done flags. Each model's
            softmax is weighted by its ``comb_weight`` and the weighted sum is
            ranked to keep the best ``beam_size`` candidates.

            Returns:
                A flat list of the per-model next inputs, hidden states and
                cell states followed by ``xWlogProb, doneVec, xWIdx,
                xCandIdx``, plus a scan ``until`` condition that fires once
                every chosen index is 0.
            """
            x_inp = list(in_list[:nmodels])
            h_inp = list(in_list[nmodels:2 * nmodels])
            c_inp = list(in_list[2 * nmodels:3 * nmodels])
            lP_ = in_list[3 * nmodels]
            dV_ = in_list[3 * nmodels + 1]

            # Placeholder; overwritten by the first model in the loop below.
            p_comb = tensor.alloc(numpy_floatX(0.), options[0]['output_size'])
            cf = []
            h = []
            xW = []
            for m in xrange(nmodels):
                mdl = tparams[m]
                gates = tensor.dot(h_inp[m], mdl[_p(prefix, 'W_hid')])
                gates = gates + (
                    tensor.dot(x_inp[m], mdl[_p(prefix, 'W_inp')]) +
                    mdl[_p(prefix, 'b')])
                if options[m].get('en_aux_inp', 0):
                    gates = gates + tensor.dot(aux_input2[m],
                                               mdl[_p(prefix, 'W_aux')])

                in_gate = tensor.nnet.sigmoid(
                    sliceT(gates, 0, options[m]['hidden_size']))
                forget_gate = tensor.nnet.sigmoid(
                    sliceT(gates, 1, options[m]['hidden_size']))
                out_gate = tensor.nnet.sigmoid(
                    sliceT(gates, 2, options[m]['hidden_size']))
                candidate = tensor.tanh(
                    sliceT(gates, 3, options[m]['hidden_size']))

                cell = forget_gate * c_inp[m] + in_gate * candidate
                hidden = out_gate * tensor.tanh(cell)
                cf.append(cell)
                h.append(hidden)

                # Weighted sum of the models' word distributions.
                logits = tensor.dot(hidden, mdl['Wd']) + mdl['bd']
                if m == 0:
                    p_comb = mdl['comb_weight'] * tensor.nnet.softmax(logits)
                else:
                    p_comb = p_comb + (mdl['comb_weight'] *
                                       tensor.nnet.softmax(logits))

            lProb = tensor.log(p_comb + 1e-20)

            def _FindB_best(lPLcl, lPprev, dVLcl):
                # Top `beam_size` extensions of one beam. A finished beam
                # (non-zero done flag) yields its old score once, -10000
                # fillers elsewhere, and index 0 throughout.
                top_idx = tensor.argsort(-lPLcl)[:beam_size]
                top_scores = lPLcl[top_idx]
                frozen = tensor.fill(top_scores, numpy_floatX(-10000.))
                frozen = tensor.set_subtensor(frozen[0], lPprev)
                still_open = tensor.eq(dVLcl, tensor.zeros_like(dVLcl))
                best_scores = ifelse(still_open, top_scores + lPprev, frozen)
                best_idx = ifelse(still_open, top_idx,
                                  tensor.zeros_like(top_idx))
                return best_scores, best_idx

            rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                              sequences=[lProb, lP_, dV_],
                                              name=_p(prefix, 'FindBest'),
                                              n_steps=x_inp[0].shape[0])
            lProb = rvalLcl[0].flatten()
            xWIdxBest = rvalLcl[1].flatten()

            # Rank the per-beam winners against each other and keep the
            # overall best beam_size of them.
            srtIdx = tensor.argsort(-lProb)[:beam_size]
            xWlogProb = lProb[srtIdx]
            xWIdx = xWIdxBest[srtIdx]
            # Each beam contributed beam_size candidates, so floor division
            # recovers the index of the beam a candidate came from.
            xCandIdx = srtIdx // beam_size

            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            x_out = [tparams[m]['Wemb'][xWIdx.flatten()]
                     for m in xrange(nmodels)]
            h_out = [h[m].take(xCandIdx.flatten(), axis=0)
                     for m in xrange(nmodels)]
            c_out = [cf[m].take(xCandIdx.flatten(), axis=0)
                     for m in xrange(nmodels)]

            out_list = (x_out + h_out + c_out +
                        [xWlogProb, doneVec, xWIdx, xCandIdx])

            return out_list, theano.scan_module.until(doneVec.all())
    def build_eval_other_sent(self, tparams, options, model_npy):
        """Compile Theano functions that score given sentences for an image.

        ``model_npy`` and ``self.model_th`` are first handed to ``zipp``
        (presumably copying the numpy values into the shared variables; confirm
        against ``zipp``). The graph then embeds the image and the words, runs
        ``basic_lstm_layer`` over them and computes the masked negative
        log-likelihood of the words, divided by ``options['batch_size']``.

        Side effects:
            Sets ``self.f_pred_prob_other`` (returns the LSTM output slice
            ``p``) and ``self.f_eval_other`` (returns the cost).

        Args:
            tparams: mapping of parameter name to Theano variable; reads
                'Wemb', 'WIemb', 'b_Img', 'Wd', 'bd' and, when ``swap_aux`` is
                set, 'WIemb_aux' and 'b_Img_aux'.
            options: dict read for 'word_encoding_size',
                'image_encoding_size', 'hidden_size', 'hidden_depth',
                'output_size', 'batch_size', 'generator', 'swap_aux' and
                'en_aux_inp'.
            model_npy: parameter values passed to ``zipp``.

        Returns:
            ``(use_noise, inp_list, self.f_pred_prob_other, cost, pW,
            updatesLSTM)``.
        """
        zipp(model_npy, self.model_th)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')
        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]
        # The first step (driven by the image) is not scored.
        n_out_samps = (n_timesteps - 1) * n_samples

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        if options.get('swap_aux', 0):
            xAuxEmb = tensor.dot(xAux,
                                 tparams['WIemb_aux']) + tparams['b_Img_aux']
        else:
            xAuxEmb = xAux

        # The image embedding is prepended as the first "word" of the input.
        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix=options['generator'])
        p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size'])

        pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
            [n_out_samps, options['output_size']])

        # Probability the model assigns to each actual next word.
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]

        # The first prediction, made for the image, is left out. The cost is
        # the masked negative natural-log likelihood divided by the batch
        # size.
        tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).sum()
        cost = tot_cost / options['batch_size']

        inp_list = [xW, mask, xI]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        # `p` does not depend on `mask`, yet both functions share inp_list;
        # without on_unused_input='ignore' Theano's default setting rejects
        # the unused input at compile time.
        self.f_pred_prob_other = theano.function(inp_list,
                                                 p,
                                                 name='f_pred_prob',
                                                 updates=updatesLSTM,
                                                 on_unused_input='ignore')

        self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

        return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
    def lstm_advers_gen_layer(self,
                              tparams,
                              Xi,
                              aux_input,
                              options,
                              beam_size,
                              prefix='lstm'):
        """Build the greedy generation graph of the stacked LSTM.

        The step function is applied once to ``Xi``, once to row 0 of
        ``tparams['Wemb']`` as the start input, and then iterated by
        ``theano.scan`` for up to ``maxlen - 1`` further steps, stopping
        early once the chosen index is 0.

        Args:
            tparams: mapping of parameter name to Theano variable.
            Xi: input fed to the very first step.
            aux_input: auxiliary input; replaced by an empty list when
                ``options['en_aux_inp']`` is unset or 0.
            options: dict read for 'maxlen' (default 15), 'hidden_depth'
                (default 1), 'hidden_size', 'en_aux_inp',
                'softmax_smooth_factor' and 'softmax_propogate'.
            beam_size: length of the initial log-probability and done
                vectors, and second dimension the first index is reshaped to.
            prefix: name prefix of this layer's parameters.

        Returns:
            Tuple of the accumulated log-probability after the last scan
            step, the chosen indices of all steps, the softmax outputs of all
            steps, and the scan updates.
        """
        nMaxsteps = options.get('maxlen', 15)
        n_samples = 1
        h_depth = options.get('hidden_depth', 1)
        h_sz = options['hidden_size']

        # ----------------------  STEP FUNCTION  ---------------------- #
        def _stepP(x_, h_, c_, lP_, dV_, xAux):
            # dV_ is part of the scan signature but is not read here.
            recurrent = tensor.dot(sliceT(h_, 0, h_sz),
                                   tparams[_p(prefix, 'W_hid')])
            gates = recurrent + (
                tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                tparams[_p(prefix, 'b')])
            if options.get('en_aux_inp', 0):
                gates = gates + tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            hiddens = []
            cells = []
            for layer in xrange(h_depth):
                in_gate = tensor.nnet.sigmoid(sliceT(gates, 0, h_sz))
                forget_gate = tensor.nnet.sigmoid(sliceT(gates, 1, h_sz))
                out_gate = tensor.nnet.sigmoid(sliceT(gates, 2, h_sz))
                candidate = tensor.tanh(sliceT(gates, 3, h_sz))

                cell = (forget_gate * sliceT(c_, layer, h_sz) +
                        in_gate * candidate)
                hidden = out_gate * tensor.tanh(cell)
                cells.append(cell)
                hiddens.append(hidden)

                if layer < (h_depth - 1):
                    nxt = str(layer + 1)
                    gates = (tensor.dot(sliceT(h_, layer + 1, h_sz),
                                        tparams[_p(prefix, 'W_hid_' + nxt)]) +
                             tensor.dot(hidden,
                                        tparams[_p(prefix, 'W_inp_' + nxt)]))

            c = tensor.concatenate(cells, axis=1)
            h = tensor.concatenate(hiddens, axis=1)

            # Word distribution from the top layer, scaled by the optional
            # smoothing factor before the softmax.
            logits = tensor.dot(hiddens[-1], tparams['Wd']) + tparams['bd']
            smooth_factor = tensor.as_tensor_variable(
                numpy_floatX(options.get('softmax_smooth_factor', 1.0)),
                name='sm_f')
            p = tensor.nnet.softmax(logits * smooth_factor)
            lProb = tensor.log(p + 1e-20).flatten()

            xWIdx = tensor.argmax(lProb, keepdims=True)
            xWlogProb = lProb[xWIdx] + lP_

            # Next input: embedding of the chosen word, or the
            # softmax-weighted mix of all embeddings.
            if options.get('softmax_propogate', 0) == 0:
                xW = tparams['Wemb'][xWIdx.flatten()]
            else:
                xW = p.dot(tparams['Wemb'])

            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
            stop = theano.scan_module.until(doneVec.all())
            return [xW, h, c, xWlogProb, doneVec, xWIdx, p], stop

        # ------------------- END of STEP FUNCTION  -------------------- #

        if options.get('en_aux_inp', 0) == 0:
            aux_input = []

        state_width = h_sz * h_depth
        h = tensor.alloc(numpy_floatX(0.), n_samples, state_width)
        c = tensor.alloc(numpy_floatX(0.), n_samples, state_width)

        lP = tensor.alloc(numpy_floatX(0.), beam_size)
        dV = tensor.alloc(np.int8(0.), beam_size)

        # Propagate the image feature vector; only the states are kept.
        first_out, _ = _stepP(Xi, h, c, lP, dV, aux_input)
        xW, h, c = first_out[0], first_out[1], first_out[2]

        # Second step is driven by row 0 of the word embedding matrix.
        xWStart = tparams['Wemb'][0, :]
        [xW, h, c, lP, dV, idx0, p0], _ = _stepP(xWStart, h, c, lP, dV,
                                                 aux_input)

        # Remaining steps run inside scan; indices and distributions are
        # collected but not fed back (their outputs_info entries are None).
        rval, updates = theano.scan(
            _stepP,
            outputs_info=[xW, h, c, lP, dV, None, None],
            non_sequences=[aux_input],
            name=_p(prefix, 'predict_layers'),
            n_steps=nMaxsteps - 1)

        final_logprob = rval[3][-1]
        all_idx = tensor.concatenate(
            [idx0.reshape([1, beam_size]), rval[5]], axis=0)
        all_probs = tensor.concatenate(
            [tensor.shape_padleft(p0, n_ones=1), rval[6]], axis=0)
        return final_logprob, all_idx, all_probs, updates
    def lstm_predict_layer(self,
                           tparams,
                           Xi,
                           aux_input,
                           options,
                           beam_size,
                           prefix='lstm'):
        """Build the symbolic decoding graph of a (possibly stacked) LSTM.

        With ``beam_size > 1`` every step keeps the ``beam_size`` best
        extensions over all current beams; otherwise the arg-max word is
        taken.  The branch is chosen while the graph is built, so
        ``beam_size`` is compared as a plain Python value here.

        Args:
            tparams: parameter mapping.  Reads 'Wemb', 'Wd', 'bd', the keys
                built with ``_p(prefix, ...)`` ('W_hid', 'W_inp', 'b',
                optionally 'W_aux', 'W_hid_<k>', 'W_inp_<k>') and, with class
                factoring, 'WdCls' and 'bdCls'.
            Xi: input of the very first step (the in-code comment calls it
                the image feature vector).
            aux_input: auxiliary input added to the first-layer
                pre-activation; replaced by ``[]`` when
                ``options['en_aux_inp']`` is 0 or missing.
            options: dict of model options.  Keys read: 'maxlen',
                'hidden_depth', 'hidden_size', 'en_aux_inp',
                'en_residual_conn', 'class_out_factoring',
                'softmax_smooth_factor', 'softmax_propogate', 'ixtoclsinfo'.
            beam_size: number of hypotheses kept per step.
            prefix: name prefix handed to ``_p`` to build parameter keys.

        Returns:
            A 5-tuple ``(last_logprob, word_idx, cand_idx, word_emb,
            updates)``: the accumulated log-probabilities of the last scan
            step, the chosen word indices per step, the source-beam indices
            per step, the fed-back word inputs per step (each with the value
            from the start step prepended), and the scan updates.
        """

        nMaxsteps = options.get('maxlen', 30)

        # 'maxlen' may be present but explicitly set to None.
        if nMaxsteps is None:
            nMaxsteps = 30
        n_samples = 1  # NOTE(review): not used anywhere in this method.
        h_depth = options.get('hidden_depth', 1)
        h_sz = options['hidden_size']

        # ----------------------  STEP FUNCTION  ---------------------- #
        def _stepP(x_, h_, c_, lP_, dV_, xAux):
            # Inputs: word/image input, previous hidden and cell state (all
            # layers concatenated along axis 1), accumulated log-probs,
            # done flags and the auxiliary input.
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])
            if options.get('en_aux_inp', 0):
                preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            # Per-layer placeholders; every slot is overwritten in the loop.
            hL = [[]] * h_depth
            cL = [[]] * h_depth
            outp = [[]] * h_depth
            for di in xrange(h_depth):
                # Slices 0..3 of the pre-activation are the input, forget and
                # output gates and the candidate cell value.
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
                hL[di] = o * tensor.tanh(cL[di])
                outp[di] = hL[di]
                if options.get('en_residual_conn', 1):
                    if (di > 0):
                        # Residual connection: add the output of the layer
                        # below.  The message is printed while the graph is
                        # being built, not when it is executed.
                        outp[di] += outp[di - 1]
                        print "Connecting residual at %d" % (di)
                # Pre-activation of the next layer: its own previous hidden
                # state plus the output of the current layer.
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(outp[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            c = tensor.concatenate(cL, axis=1)
            h = tensor.concatenate(hL, axis=1)

            if options.get('class_out_factoring', 0) == 1:
                # Factored output: pick the arg-max class first, then score
                # words with that class's slice of 'Wd' / 'bd'.
                pC = tensor.dot(outp[-1], tparams['WdCls']) + tparams['bdCls']
                pCSft = tensor.nnet.softmax(pC)
                xCIdx = tensor.argmax(pCSft)
                pW = tensor.dot(
                    outp[-1],
                    tparams['Wd'][:, xCIdx, :]) + tparams['bd'][:, xCIdx, :]
                smooth_factor = tensor.as_tensor_variable(numpy_floatX(
                    options.get('softmax_smooth_factor', 1.0)),
                                                          name='sm_f')
                pWSft = tensor.nnet.softmax(pW * smooth_factor)
                lProb = tensor.log(pWSft +
                                   1e-20) + tensor.log(pCSft[0, xCIdx] + 1e-20)
            else:
                p = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd']
                smooth_factor = tensor.as_tensor_variable(numpy_floatX(
                    options.get('softmax_smooth_factor', 1.0)),
                                                          name='sm_f')
                p = tensor.nnet.softmax(p * smooth_factor)
                lProb = tensor.log(p + 1e-20)

            if beam_size > 1:

                def _FindB_best(lPLcl, lPprev, dVLcl):
                    # For one beam: its beam_size best next words.  A beam
                    # whose done flag is non-zero keeps its previous score in
                    # slot 0 (the other slots get -10000) and emits index 0.
                    srtLcl = tensor.argsort(-lPLcl)
                    srtLcl = srtLcl[:beam_size]
                    deltaVec = tensor.fill(lPLcl[srtLcl],
                                           numpy_floatX(-10000.))
                    deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                    lProbBest = ifelse(
                        tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                        lPLcl[srtLcl] + lPprev, deltaVec)
                    xWIdxBest = ifelse(
                        tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), srtLcl,
                        tensor.zeros_like(srtLcl))
                    return lProbBest, xWIdxBest

                rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                                  sequences=[lProb, lP_, dV_],
                                                  name=_p(prefix, 'FindBest'),
                                                  n_steps=x_.shape[0])
                xWIdxBest = rvalLcl[1]
                lProbBest = rvalLcl[0]

                xWIdxBest = xWIdxBest.flatten()
                lProb = lProbBest.flatten()
                # Now sort and find the best among these best extensions for the current beams
                srtIdx = tensor.argsort(-lProb)
                srtIdx = srtIdx[:beam_size]
                # Each beam contributed beam_size entries to the flattened
                # vector, so the quotient is the index of the source beam.
                xCandIdx = srtIdx // beam_size  # Floor division
                h = h.take(xCandIdx.flatten(), axis=0)
                c = c.take(xCandIdx.flatten(), axis=0)
                xWlogProb = lProb[srtIdx]
                xWIdx = xWIdxBest[srtIdx]
            else:
                # Greedy decoding: single hypothesis, take the arg-max word.
                xCandIdx = tensor.as_tensor_variable([0])
                lProb = lProb.flatten()
                xWIdx = tensor.argmax(lProb, keepdims=True)
                xWlogProb = lProb[xWIdx] + lP_
                if options.get('class_out_factoring', 0) == 1:
                    # Shift by the per-class offset read from column 0 of
                    # options['ixtoclsinfo'].
                    clsoffset = tensor.as_tensor_variable(
                        options['ixtoclsinfo'][:, 0])
                    xWIdx += clsoffset[xCIdx]
                h = h.take(xCandIdx.flatten(), axis=0)
                c = c.take(xCandIdx.flatten(), axis=0)

            if options.get('softmax_propogate', 0) == 0:
                xW = tparams['Wemb'][xWIdx.flatten()]
            else:
                # NOTE(review): `p` is only bound in the non-factored branch
                # above; with class_out_factoring == 1 this line would hit an
                # unbound name -- confirm the two options are never combined.
                xW = p.dot(tparams['Wemb'])
            # Word index 0 marks a finished hypothesis; the scan stops once
            # every hypothesis has produced it.
            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            return [xW, h, c, xWlogProb, doneVec, xWIdx,
                    xCandIdx], theano.scan_module.until(doneVec.all())

        # ------------------- END of STEP FUNCTION  -------------------- #

        if options.get('en_aux_inp', 0) == 0:
            aux_input = []

        # Initial states: zeros, one row per beam.
        h = tensor.alloc(numpy_floatX(0.), beam_size, h_sz * h_depth)
        c = tensor.alloc(numpy_floatX(0.), beam_size, h_sz * h_depth)

        lP = tensor.alloc(numpy_floatX(0.), beam_size)
        dV = tensor.alloc(np.int8(0.), beam_size)

        # Propagate the image feature vector (only the first state row is
        # used; the outputs other than h and c are discarded).
        [xW, h, c, _, _, _, _], _ = _stepP(Xi, h[:1, :], c[:1, :], lP, dV,
                                           aux_input)

        # Second warm-up step: feed the embedding of word index 0 as the
        # start input.
        xWStart = tparams['Wemb'][[0]]
        [xW, h, c, lP, dV, idx0,
         cand0], _ = _stepP(xWStart, h[:1, :], c[:1, :], lP, dV, aux_input)

        # From here on the step runs on beam_size rows, so the auxiliary
        # input is repeated once per beam.
        if options.get('en_aux_inp', 0) == 1:
            aux_input = tensor.extra_ops.repeat(aux_input, beam_size, axis=0)

        # Now lets do the loop.
        rval, updates = theano.scan(
            _stepP,
            outputs_info=[xW, h, c, lP, dV, None, None],
            non_sequences=[aux_input],
            name=_p(prefix, 'predict_layers'),
            n_steps=nMaxsteps)

        # Prepend the results of the start step to the per-step scan outputs.
        return rval[3][-1], tensor.concatenate(
            [idx0.reshape([1, beam_size]), rval[5]],
            axis=0), tensor.concatenate(
                [cand0.reshape([1, beam_size]), rval[6]],
                axis=0), tensor.concatenate(
                    [tensor.shape_padleft(xW, n_ones=1), rval[0]],
                    axis=0), updates
Example #15
0
def main(params):
    """Train the evaluator model configured by ``params``.

    Builds the Theano cost and gradient graphs, compiles the solver
    functions and runs the optimisation loop.  A JSON status report is
    written whenever more than 60 seconds have passed since the last one.
    Every ``eval_period`` epochs (and on the last iteration) the model is
    evaluated on the 'val' split; a checkpoint is pickled when the returned
    value is lower than the best seen so far and below the optional
    ``write_checkpoint_ppl_threshold``.

    Args:
        params: dict of run options.  It is modified in place: among others
            'image_feat_size', 'aux_inp_size', 'mode', 'vocabulary_size',
            'learning_rate' and 'lr_decay_st_epoch' are written.

    NOTE(review): ``checkpoint_init``, ``model_init_from`` and ``rg_init``
    are read but never assigned in this function; presumably they are
    module-level globals populated when a checkpoint is loaded -- confirm.
    """
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(params)
    # Initialize the optimizer
    solver = Solver(params['solver'])

    params['image_feat_size'] = dp.img_feat_size
    params['aux_inp_size'] = dp.aux_inp_size

    # stores various misc items that need to be passed around the framework
    misc = {}

    # go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold number of times
    misc['wordtoix'], misc[
        'ixtoword'], bias_init_vector = preProBuildWordVocab(
            dp.iterSentences('train'), word_count_threshold)

    if params['fine_tune'] == 1:
        params['mode'] = 'multi_choice_mode' if params[
            'mc_mode'] == 1 else 'multimodal_lstm'
        if params['checkpoint_file_name'] is not None:
            misc['wordtoix'] = checkpoint_init['wordtoix']
            misc['ixtoword'] = checkpoint_init['ixtoword']
        batch_size = 1
        num_sentences_total = dp.getSplitSize('train', ofwhat='images')
    else:
        params['mode'] = 'batchtrain'
        batch_size = params['batch_size']
        num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')

    params['vocabulary_size'] = len(misc['wordtoix'])
    pos_samp = np.arange(batch_size, dtype=np.int32)

    # This initializes the model parameters and does matrix initializations
    evalModel = decodeEvaluator(params)
    model, misc['update'], misc['regularize'] = (evalModel.model_th,
                                                 evalModel.updateP,
                                                 evalModel.regularize)

    #----------------- If we are using feature encoders -----------------------
    if params['use_encoder_for'] & 1:
        imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'],
                                              params['sent_encoding_size'],
                                              params,
                                              mdl_prefix='img_enc_',
                                              features=dp.features.T)
        mdlLen = len(model.keys())
        model.update(imgFeatEncoder.model_th)
        # The encoder parameters must not overwrite existing model keys.
        assert (len(model.keys()) == (mdlLen +
                                      len(imgFeatEncoder.model_th.keys())))
        misc['regularize'].extend(imgFeatEncoder.regularize)
        (imgenc_use_dropout, imgFeatEnc_inp, xI,
         updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
    else:
        xI = None
        imgFeatEnc_inp = []

    # Define the computational graph for relating the input image features
    # and word indices to the log probability cost function.
    (use_dropout, inp_list_eval, miscOuts, cost, predTh,
     model) = evalModel.build_model(model,
                                    params,
                                    xI=xI,
                                    prior_inp_list=imgFeatEnc_inp)

    inp_list = imgFeatEnc_inp + inp_list_eval

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Add the regularization cost. Since this is specific to training and
    # doesn't get included when we evaluate the cost on test or validation
    # data, we leave it here outside the model definition
    if params['regc'] > 0.:
        reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        for p in misc['regularize']:
            reg_cost += (model[p]**2).sum()
        # Scale once, after summing.  Scaling inside the loop (as before)
        # multiplied earlier terms by 0.5 * regc again for every further
        # parameter, so the penalty was not 0.5 * regc * sum(||w||^2).
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    if params['sim_minibatch'] > 0:
        f_grad_accum, f_clr, ag = solver.accumGrads(model, grads, inp_list,
                                                    cost,
                                                    params['sim_minibatch'])
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, ag, inp_list, cost, params)
    else:
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, grads, inp_list, cost, params)

    print('model init done.')
    print('model has keys: ' + ', '.join(model.keys()))

    # calculate how many iterations we need, One epoch is considered once
    # going through all the sentences and not images.
    # Hence in case of coco/flickr this will 5* no of images
    # (`//` keeps the counts integral regardless of the division mode.)
    num_iters_one_epoch = num_sentences_total // batch_size
    max_iters = max_epochs * num_iters_one_epoch
    inner_loop = params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1
    max_iters = max_iters // inner_loop
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs / inner_loop))
    top_val_ppl2 = -1
    smooth_train_cost = len(
        misc['ixtoword'])  # initially size of dictionary of confusion
    smooth_error_rate = 100.
    error_rate = 0.
    prev_it = -1
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []

    ## Initialize the model parameters from the checkpoint file if we are resuming training
    if params['checkpoint_file_name'] is not None:
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \
          checkpoint_init['perplexity']))
    elif params['init_from_imagernn'] is not None:
        # Initialize word vecs and image emb from generative model file.
        # NOTE: pickle executes arbitrary code on load; only point this
        # option at files from a trusted source.
        with open(params['init_from_imagernn'], 'rb') as rnn_file:
            rnnCv = pickle.load(rnn_file)
        model['Wemb'].set_value(rnnCv['model']['Wemb'])
        model['WIemb'].set_value(rnnCv['model']['WIemb_aux'])
        misc['wordtoix'] = rnnCv['wordtoix']
        misc['ixtoword'] = rnnCv['ixtoword']
        print(
            "\n Initialized Word embedding and Image embeddings from gen mode %s"
            % (params['init_from_imagernn']))

    write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']

    def _sample_inputs():
        """Draw one training batch and return the full function input list."""
        batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'],
                                                      params['mode'],
                                                      thresh=0.3)
        enc_inp_list = prepare_seq_features(
            batch,
            use_enc_for=params['use_encoder_for'],
            use_shared_mem=params['use_shared_mem_enc'])
        eval_inp_list, _ = prepare_data(
            batch,
            misc['wordtoix'],
            maxlen=params['maxlen'],
            pos_samp=pos_samp,
            prep_for=params['eval_model'],
            use_enc_for=params['use_encoder_for'])
        if params['fine_tune'] == 1:
            eval_inp_list.append(pos_samp_sent)
        # Same order as `inp_list`: encoder inputs first, then model inputs.
        return enc_inp_list + eval_inp_list

    # Enable using dropout in training
    use_dropout.set_value(1.)
    #################### Main Loop ############################################
    for it in xrange(max_iters):
        t0 = time.time()

        if params['use_encoder_for'] & 1:
            imgenc_use_dropout.set_value(float(params['use_dropout']))

        # fetch a batch of data
        cost_inner = np.zeros((inner_loop, ), dtype=np.float32)
        if params['sim_minibatch'] > 0:
            # Simulated mini-batch: accumulate gradients over several draws.
            # Previously the encoder inputs were never prepared on this path,
            # which left `enc_inp_list` unbound further down, and
            # f_grad_accum was called without the inputs listed first in
            # `inp_list`.
            for i_l in xrange(inner_loop):
                real_inp_list = _sample_inputs()
                cost_inner[i_l] = f_grad_accum(*real_inp_list)
        else:
            real_inp_list = _sample_inputs()

        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # Reset accumulated gradients to 0
        if params['sim_minibatch'] > 0:
            f_clr()

        # print training statistics
        epoch = it * inner_loop * 1.0 / num_iters_one_epoch
        total_cost = (np.e**(-cost[0]) + (np.e**(-cost_inner)).sum() *
                      (params['sim_minibatch'] > 0)) / (
                          1 + params['sim_minibatch'])
        if it == 0:
            smooth_train_cost = total_cost
        else:
            smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost
        error_rate += 100.0 * float((cost[2] < 0.).sum()) / batch_size

        margin_strength = cost[2].sum()
        smooth_error_rate = 0.99 * smooth_error_rate + 0.01 * 100.0 * (
            float(cost[1]) / batch_size) if it > 0 else 100.0 * (
                float(cost[1]) / batch_size)

        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            print('%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.4f, Error '\
                    'rate is %.3f%%, Margin %.2f, negMarg=%.2f' % (it, max_iters, dt, \
                    epoch, smooth_train_cost, smooth_error_rate,
                    margin_strength, error_rate/(it-prev_it)))
            error_rate = 0.
            prev_it = it
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            # Best effort: a failed status write must not stop training.
            try:
                with open(status_file, 'w') as status_fh:
                    json.dump(json_worker_status, status_fh)
            except Exception as e:  # todo be more clever here
                print('tried to write worker status into %s but got error:' % (
                    status_file, ))
                print(e)

        ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0
                and it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            use_dropout.set_value(0.)
            if params['use_encoder_for'] & 1:
                imgenc_use_dropout.set_value(0.)

            val_ppl2 = eval_split_theano(
                'val', dp, model, params, misc,
                f_eval)  # perform the evaluation on VAL set
            # Decay the learning rate once per evaluation after the start
            # epoch has been reached, then push the start epoch forward.
            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params[
                    'lr_decay']
                params['lr_decay_st_epoch'] += 1

            print('validation perplexity = %f, lr = %f' % (
                val_ppl2, params['learning_rate']))

            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    # if we beat a previous record or if this is the first time
                    # AND we also beat the user-defined threshold or it doesnt exist
                    top_val_ppl2 = val_ppl2
                    filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (
                        params['eval_model'], params['dataset'], host,
                        params['fappend'], smooth_error_rate, val_ppl2)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['wordtoix'] = misc['wordtoix']
                    checkpoint['ixtoword'] = misc['ixtoword']
                    # Best effort: keep training even if the write fails.
                    try:
                        with open(filepath, "wb") as ckpt_fh:
                            pickle.dump(checkpoint, ckpt_fh)
                        print('saved checkpoint in %s' % (filepath, ))
                    except Exception as e:  # todo be more clever here
                        print('tried to write checkpoint into %s but got error: ' % (
                            filepath, ))
                        print(e)

            # Back to training mode.
            use_dropout.set_value(1.)
    def lstm_predict_layer(self,
                           tparams,
                           Xi,
                           aux_input,
                           options,
                           beam_size,
                           prefix='lstm'):
        """Build the symbolic beam-search graph of a single-layer LSTM.

        Two warm-up steps are applied directly (first ``Xi``, then the
        embedding of word index 0); the remaining steps run inside a scan of
        at most 30 iterations that stops early once every beam has emitted
        word index 0.

        Args:
            tparams: parameter mapping; reads 'Wemb', 'Wd', 'bd' and the keys
                built with ``_p(prefix, ...)`` ('W_hid', 'W_inp', 'b' and,
                when enabled, 'W_aux').
            Xi: input of the first warm-up step.
            aux_input: auxiliary input; replaced by ``[]`` when
                ``options['en_aux_inp']`` is 0 or missing.
            options: dict; reads 'hidden_size' and 'en_aux_inp'.
            beam_size: number of hypotheses kept per step.
            prefix: name prefix handed to ``_p``.

        Returns:
            ``(last_logprob, word_idx, cand_idx)``: the accumulated
            log-probabilities of the last scan step, the word indices per
            step and the source-beam indices per step (the latter two with
            the start step prepended).
        """
        max_steps = 30
        hid_dim = options['hidden_size']
        aux_flag = options.get('en_aux_inp', 0)

        # ----------------------  STEP FUNCTION  ---------------------- #
        def _stepP(x_, h_, c_, lP_, dV_, xAux):
            gates = tensor.dot(h_, tparams[_p(prefix, 'W_hid')])
            gates += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                      tparams[_p(prefix, 'b')])
            if aux_flag:
                gates += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            # Slices 0..3: input, forget, output gate and cell candidate.
            in_gate = tensor.nnet.sigmoid(sliceT(gates, 0, hid_dim))
            forget_gate = tensor.nnet.sigmoid(sliceT(gates, 1, hid_dim))
            out_gate = tensor.nnet.sigmoid(sliceT(gates, 2, hid_dim))
            cell_cand = tensor.tanh(sliceT(gates, 3, hid_dim))

            cell = forget_gate * c_ + in_gate * cell_cand
            hidden = out_gate * tensor.tanh(cell)

            word_probs = tensor.nnet.softmax(
                tensor.dot(hidden, tparams['Wd']) + tparams['bd'])
            log_probs = tensor.log(word_probs + 1e-20)

            def _expand_beam(row_lp, prev_lp, done_flag):
                # beam_size best continuations of one beam.  A finished beam
                # keeps its old score in slot 0 and only emits index 0.
                top_idx = tensor.argsort(-row_lp)[:beam_size]
                top_lp = row_lp[top_idx]
                frozen = tensor.set_subtensor(
                    tensor.fill(top_lp, numpy_floatX(-10000.))[0], prev_lp)
                still_open = tensor.eq(done_flag,
                                       tensor.zeros_like(done_flag))
                best_lp = ifelse(still_open, top_lp + prev_lp, frozen)
                best_idx = ifelse(still_open, top_idx,
                                  tensor.zeros_like(top_idx))
                return best_lp, best_idx

            per_beam, _ = theano.scan(_expand_beam,
                                      sequences=[log_probs, lP_, dV_],
                                      name=_p(prefix, 'FindBest'),
                                      n_steps=x_.shape[0])
            flat_lp = per_beam[0].flatten()
            flat_idx = per_beam[1].flatten()

            # Keep the overall best beam_size extensions across all beams.
            keep = tensor.argsort(-flat_lp)[:beam_size]
            xWlogProb = flat_lp[keep]
            xWIdx = flat_idx[keep]
            # Every beam supplied beam_size candidates, so the quotient is
            # the index of the beam a candidate came from.
            xCandIdx = keep // beam_size

            xW = tparams['Wemb'][xWIdx.flatten()]
            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
            src_rows = xCandIdx.flatten()
            hidden = hidden.take(src_rows, axis=0)
            cell = cell.take(src_rows, axis=0)

            outputs = [xW, hidden, cell, xWlogProb, doneVec, xWIdx, xCandIdx]
            return outputs, theano.scan_module.until(doneVec.all())

        # ------------------- END of STEP FUNCTION  -------------------- #

        if aux_flag == 0:
            aux_input = []

        zero = numpy_floatX(0.)
        h0 = tensor.alloc(zero, beam_size, hid_dim)
        c0 = tensor.alloc(numpy_floatX(0.), beam_size, hid_dim)
        lp0 = tensor.alloc(numpy_floatX(0.), beam_size)
        done0 = tensor.alloc(np.int8(0.), beam_size)

        # Warm-up 1: push the image feature vector through a single row.
        img_out, _ = _stepP(Xi, h0[:1, :], c0[:1, :], lp0, done0, aux_input)
        h1, c1 = img_out[1], img_out[2]

        # Warm-up 2: feed the embedding of word index 0 as the start input.
        start_out, _ = _stepP(tparams['Wemb'][[0]], h1[:1, :], c1[:1, :],
                              lp0, done0, aux_input)
        xW, h, c, lP, dV, idx0, cand0 = start_out

        aux_input = tensor.extra_ops.repeat(aux_input, beam_size, axis=0)

        # Remaining steps run as a scan over the full beam.
        rval, updates = theano.scan(
            _stepP,
            outputs_info=[xW, h, c, lP, dV, None, None],
            non_sequences=[aux_input],
            name=_p(prefix, 'predict_layers'),
            n_steps=max_steps)

        word_idx = tensor.concatenate(
            [idx0.reshape([1, beam_size]), rval[5]], axis=0)
        cand_idx = tensor.concatenate(
            [cand0.reshape([1, beam_size]), rval[6]], axis=0)
        return rval[3][-1], word_idx, cand_idx
    def lstm_advers_gen_layer(self, tparams, xI, xAux, options, prefix='lstm'):
        """Build the symbolic graph that samples word sequences from the LSTM.

        One warm-up step feeds ``xI``; the scan then starts from the
        embedding of word index 0 and runs for at most ``nMaxsteps`` steps,
        stopping early once every sequence has produced word index 0.

        Args:
            tparams: parameter mapping; reads 'Wemb', 'Wd', 'bd' and the keys
                built with ``_p(prefix, ...)`` ('W_hid', 'W_inp', 'b',
                'W_aux', optionally 'W_noise', 'W_hid_<k>', 'W_inp_<k>').
            xI: input of the warm-up step; each row is repeated
                'n_gen_samples' times.
            xAux: auxiliary input; projected with 'W_aux', repeated the same
                way and added to the first-layer pre-activation every step.
            options: dict; keys read: 'maxlen', 'n_gen_samples',
                'hidden_depth', 'hidden_size', 'gen_input_noise',
                'en_residual_conn', 'use_gumbel_mse', 'greedy',
                'use_gumbel_hard', 'computelogprob', 'vocabulary_size',
                'gen_use_rand_init'.
            prefix: name prefix handed to ``_p``.

        Returns:
            ``(last_logprob, word_idx, probs, updates, seq_lengths)``: the
            accumulated score of the last step, the chosen word index per
            step, the per-step distribution ``p``, the scan updates, and the
            1-based position of the first step whose done flag is set (the
            final step if none is), taken outside the gradient.
        """
        nBatchSamps = xI.shape[0]
        nMaxsteps = options.get('maxlen', 15)
        # 'maxlen' may be present but explicitly set to None.
        if nMaxsteps is None:
            nMaxsteps = 30
        n_samp = options.get('n_gen_samples', 1)

        h_depth = options.get('hidden_depth', 1)
        h_sz = options['hidden_size']

        # ----------------------  STEP FUNCTION  ---------------------- #
        def _stepP(U, xW_, h_, c_, lP_, dV_, xAux, xNoise):
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += (tensor.dot(xW_, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])
            preact += xAux
            if options.get('gen_input_noise', 0):
                preact += xNoise

            # Per-layer placeholders; every slot is overwritten in the loop.
            hL = [[]] * h_depth
            cL = [[]] * h_depth
            outp = [[]] * h_depth
            for di in xrange(h_depth):
                # Slices 0..3: input, forget, output gate and cell candidate.
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
                hL[di] = o * tensor.tanh(cL[di])
                outp[di] = hL[di]
                if options.get('en_residual_conn', 1):
                    if (di > 0):
                        # Residual connection to the layer below; the message
                        # is emitted while the graph is being built.
                        outp[di] += outp[di - 1]
                        print("Connecting residual at %d" % (di))
                # Next layer: its own previous hidden state plus the output
                # of the current layer.
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(outp[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            c = tensor.concatenate(cL, axis=1)
            h = tensor.concatenate(hL, axis=1)

            logits = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd']
            # Plain softmax unless Gumbel sampling is on and not greedy.
            if options.get('use_gumbel_mse', 0) == 0 or options.get(
                    'greedy', 0):
                p = tensor.nnet.softmax(logits)
            else:
                p = gumbel_softmax_sample(
                    self.trng, logits * self.softmax_smooth_factor,
                    self.gumb_temp, U, options.get('use_gumbel_hard', False))

            if options.get('computelogprob', 0):
                lProb = tensor.log(
                    tensor.nnet.softmax(logits * self.softmax_smooth_factor) +
                    1e-20)
            else:
                lProb = logits

            # Finished sequences (done flag set) are forced to word index 0
            # and stop accumulating score.
            xWIdx = ~dV_ * tensor.argmax(p, axis=-1)

            xWlogProb = ~dV_ * lProb[tensor.arange(nBatchSamps * n_samp),
                                     xWIdx] + lP_
            if options.get('use_gumbel_hard', 0) and options.get(
                    'use_gumbel_mse', 0) and not options.get('greedy', 0):
                # Keep the path through the sampled distribution
                # differentiable by embedding `p` itself.
                xW = p.dot(tparams['Wemb'])
            else:
                xW = theano.gradient.disconnected_grad(
                    tparams['Wemb'][xWIdx.flatten()].reshape(
                        [xWIdx.shape[0], -1]))

            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            return [xW, h, c, xWlogProb, doneVec, xWIdx,
                    p], theano.scan_module.until(doneVec.all())

        # ------------------- END of STEP FUNCTION  -------------------- #

        # Row 0 of U feeds the warm-up step and rows 1.. feed the scan, which
        # is asked for nMaxsteps steps, so U needs nMaxsteps + 1 rows in both
        # branches.  The placeholder branch used to allocate only nMaxsteps
        # rows, leaving the scanned sequence one row short of n_steps.
        if options.get('use_gumbel_mse', 0) == 0:
            U = self.trng.uniform((nMaxsteps + 1, 1),
                                  low=0.,
                                  high=1.,
                                  dtype=theano.config.floatX)
        else:
            U = self.trng.uniform((nMaxsteps + 1, nBatchSamps * n_samp,
                                   options['vocabulary_size']),
                                  low=0.,
                                  high=1.,
                                  dtype=theano.config.floatX)

        # One copy of every input row per requested sample.
        xI = tensor.extra_ops.repeat(xI, n_samp, axis=0)
        xAux = tensor.extra_ops.repeat(tensor.dot(xAux,
                                                  tparams[_p(prefix,
                                                             'W_aux')]),
                                       n_samp,
                                       axis=0)

        if options.get('gen_input_noise', 0):
            xNoise = tensor.dot(
                self.trng.normal([nBatchSamps * n_samp, self.noise_dim]),
                tparams[_p(prefix, 'W_noise')])
        else:
            xNoise = []

        # Random initial state only when requested and input noise is off.
        if options.get('gen_use_rand_init',
                       0) and not options.get('gen_input_noise', 0):
            h = tensor.unbroadcast(
                self.trng.uniform([nBatchSamps * n_samp, h_sz * h_depth],
                                  low=-0.1,
                                  high=0.1), 0, 1)
            c = tensor.unbroadcast(
                self.trng.uniform([nBatchSamps * n_samp, h_sz * h_depth],
                                  low=-0.1,
                                  high=0.1), 0, 1)
        else:
            h = tensor.zeros([nBatchSamps * n_samp, h_sz * h_depth])
            c = tensor.zeros([nBatchSamps * n_samp, h_sz * h_depth])

        lP = tensor.alloc(numpy_floatX(0.), nBatchSamps * n_samp)
        dV = tensor.alloc(np.bool_(0.), nBatchSamps * n_samp)

        # Propagate the image feature vector; only h and c are kept.
        [_, h, c, _, _, _, _], _ = _stepP(U[0, :], xI, h, c, lP, dV, xAux,
                                          xNoise)

        # Start input: embedding of word index 0, one row per sequence.
        xWStart = tensor.unbroadcast(
            tensor.tile(tparams['Wemb'][[0]], [nBatchSamps * n_samp, 1]), 0, 1)

        # Now lets do the loop.
        rval, updates = theano.scan(
            _stepP,
            sequences=[U[1:, :]],
            outputs_info=[xWStart, h, c, lP, dV, None, None],
            non_sequences=[xAux, xNoise],
            name=_p(prefix, 'adv_predict_layers'),
            n_steps=nMaxsteps)

        # First step at which each sequence is done (+1 for a length); the
        # appended row of ones makes unfinished sequences count every step.
        seq_lengths = theano.gradient.disconnected_grad(
            tensor.argmax(tensor.concatenate(
                [rval[4][:-1, :],
                 tensor.ones((1, nBatchSamps * n_samp))],
                axis=0),
                          axis=0) + 1)

        return rval[3][-1], rval[5], rval[6], updates, seq_lengths
    def build_eval_other_sent(self, tparams, options, model_npy):
        """Build Theano functions that score given sentences under this model.

        ``model_npy`` and ``self.model_th`` are first handed to ``zipp``
        (presumably this loads the numpy weights into the shared variables --
        confirm against ``zipp``).  The graph then embeds the image and the
        words, runs ``basic_lstm_layer`` and turns the hidden states into the
        probability of each target word (every row of ``xW`` but the first).

        Args:
            tparams: mapping of parameter name to Theano variable.
            options: model configuration dict.
            model_npy: weights passed to ``zipp``.

        Returns:
            Tuple ``(use_noise, inp_list, self.f_pred_prob_other, cost, pW,
            updatesLSTM)``.  ``cost`` is the masked negative log-probability
            summed over time steps, one value per column of ``xW``.

        Side effects:
            Sets ``self.f_pred_prob_other`` and ``self.f_eval_other``.
        """
        zipp(model_npy, self.model_th)

        # Shared scalar handed to the LSTM layer as its dropout switch.
        use_noise = theano.shared(numpy_floatX(0.))

        # Symbolic inputs: word indices and the matching mask.
        xW = tensor.matrix('xW', dtype='int64')
        mask = tensor.matrix('mask', dtype=config.floatX)

        n_steps = xW.shape[0]
        n_batch = xW.shape[1]
        n_targets = (n_steps - 1) * n_batch
        # Target word for every prediction, and the row index to pick it with.
        targets = xW[1:, :].flatten()
        target_rows = tensor.arange(n_targets)

        word_emb = tparams['Wemb'][xW.flatten()].reshape(
            [n_steps, n_batch, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        # Optionally project the auxiliary input before it reaches the LSTM.
        aux_emb = xAux
        if options.get('swap_aux', 0):
            aux_emb = (tensor.dot(xAux, tparams['WIemb_aux']) +
                       tparams['b_Img_aux'])

        img_emb = tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']
        img_emb = img_emb.reshape(
            [1, n_batch, options['image_encoding_size']])

        # Image embedding goes first; the slice keeps the first n_steps rows.
        lstm_inp = tensor.concatenate([img_emb, word_emb], axis=0)
        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             lstm_inp[:n_steps, :, :],
                                             aux_emb,
                                             use_noise,
                                             options,
                                             prefix=options['generator'])
        # Skip the first output row, which corresponds to the image input.
        hid = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                     options['hidden_size'])

        if options.get('class_out_factoring', 0) == 0:
            # Plain softmax over the whole output vocabulary.
            pW = (tensor.dot(hid, tparams['Wd']) + tparams['bd']).reshape(
                [n_targets, options['output_size']])
            pWSft = tensor.nnet.softmax(pW)
            totProb = pWSft[target_rows, targets]
        else:
            # Class-factored output: P(word) = P(word | class) * P(class).
            cls_table = tensor.as_tensor_variable(self.clsinfo)
            xC = cls_table[targets, 0]
            cls_weights = tparams['Wd'][:, xC, :].T
            hid_shifted = (
                hid.reshape([1, n_targets, options['hidden_size']]) -
                tparams['WdCls'][:, xC].T)
            pW = ((cls_weights * hid_shifted).sum(axis=-1).T +
                  tparams['bd'][:, xC, :])
            pWSft = tensor.nnet.softmax(pW[0, :, :])

            pC = (tensor.dot(hid, tparams['WdCls']) +
                  tparams['bdCls']).reshape([n_targets, options['nClasses']])
            pCSft = tensor.nnet.softmax(pC)

            within_cls = pWSft[target_rows, cls_table[targets, 3]]
            totProb = within_cls * pCSft[target_rows, xC]

        masked_nll = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten())
        cost = masked_nll.reshape([n_steps - 1, n_batch]).sum(axis=0)

        inp_list = [xW, mask, xI]
        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        self.f_pred_prob_other = theano.function([xW, xI, xAux],
                                                 pWSft,
                                                 name='f_pred_prob',
                                                 updates=updatesLSTM)
        self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

        return (use_noise, inp_list, self.f_pred_prob_other, cost, pW,
                updatesLSTM)
    def __init__(self, params):
        """Create the initial weights and the update/regularize name lists.

        Numpy parameters are built for the image and word embeddings, the
        (optionally stacked) LSTM, the decoder and the optional auxiliary
        input and noise projections.  They are then converted with
        ``self.init_tparams`` and stored in ``self.model_th``.

        Args:
            params: dict-like configuration.  Sizes default to 128
                (``image_encoding_size``, ``word_encoding_size``,
                ``hidden_size``), ``hidden_depth`` defaults to 1 and the
                remaining sizes default to -1.  ``ixtoclsinfo`` and
                ``nClasses`` are required when ``class_out_factoring`` is
                non-zero.

        Side effects:
            Sets ``self.model_th``, ``self.update_list``, ``self.regularize``
            and ``self.usegumbel``; depending on the options also
            ``self.clsinfo``, ``self.clsSize``, ``self.clsOffset``,
            ``self.max_cls_size``, ``self.noise_dim``, ``self.gumb_temp`` and
            ``self.softmax_smooth_factor``.
        """
        image_encoding_size = params.get('image_encoding_size', 128)
        word_encoding_size = params.get('word_encoding_size', 128)

        hidden_size = params.get('hidden_size', 128)
        hidden_depth = params.get('hidden_depth', 1)
        vocabulary_size = params.get('vocabulary_size', -1)
        output_size = params.get('output_size', -1)
        image_feat_size = params.get('image_feat_size',
                                     -1)  # size of CNN vectors hardcoded here

        aux_inp_size = params.get('aux_inp_size', -1)

        model = OrderedDict()
        # Recurrent weights: take x_t, h_{t-1}, and bias unit
        # and produce the 3 gates and the input to cell signal

        # Bit flags: bit 0 skips the image embedding parameters, bit 1 is
        # consulted for the auxiliary embedding parameters further down.
        use_feat_enc = params.get('use_encoder_for', 0)

        if not (use_feat_enc & 1):
            model['WIemb'] = initwTh(image_feat_size,
                                     word_encoding_size)  # image encoder
            model['b_Img'] = np.zeros(
                (word_encoding_size)).astype(config.floatX)

        model['Wemb'] = initwTh(vocabulary_size,
                                word_encoding_size)  # word encoder
        model['lstm_W_hid'] = initwTh(hidden_size, 4 * hidden_size)
        model['lstm_W_inp'] = initwTh(word_encoding_size, 4 * hidden_size)

        # Extra layers of a stacked LSTM take the layer below as input.
        for i in xrange(1, hidden_depth):
            model['lstm_W_hid_' + str(i)] = initwTh(hidden_size,
                                                    4 * hidden_size)
            model['lstm_W_inp_' + str(i)] = initwTh(hidden_size,
                                                    4 * hidden_size)

        model['lstm_b'] = np.zeros((4 * hidden_size, )).astype(config.floatX)
        # Decoder weights (e.g. mapping to vocabulary)

        if params.get('class_out_factoring', 0) == 0:
            model['Wd'] = initwTh(hidden_size, output_size)  # decoder
            model['bd'] = np.zeros((output_size, )).astype(config.floatX)
        else:
            # Class-factored decoder: one weight slab per class, padded to
            # the size of the largest class.
            clsinfo = params['ixtoclsinfo']
            self.clsinfo = clsinfo
            clsSizes = clsinfo[:, 2] - clsinfo[:, 1]
            self.clsSize = np.zeros(params['nClasses'])
            self.clsOffset = np.zeros(params['nClasses'], dtype=np.int32)
            self.clsSize[clsinfo[:, 0]] = clsSizes
            self.clsOffset[clsinfo[:, 0]] = np.int32(clsinfo[:, 1])
            max_cls_size = np.max(clsSizes)
            self.max_cls_size = max_cls_size
            Wd = np.zeros(
                (params['hidden_size'], params['nClasses'], max_cls_size),
                dtype=config.floatX)
            model['bd'] = np.zeros((1, params['nClasses'], max_cls_size),
                                   dtype=config.floatX)
            # NOTE(review): clsSizes has one entry per row of clsinfo but is
            # indexed here by the class id in column 0 -- confirm that this
            # is the intended lookup.
            for cix in clsinfo[:, 0]:
                Wd[:, cix, :clsSizes[cix]] = initwTh(params['hidden_size'],
                                                     clsSizes[cix])
                # Large negative bias on the padding slots of this class.
                model['bd'][0, cix, clsSizes[cix]:] = -100
            model['Wd'] = Wd

        update_list = [
            'lstm_W_hid', 'lstm_W_inp', 'lstm_b', 'Wd', 'bd', 'Wemb'
        ]
        self.regularize = ['lstm_W_hid', 'lstm_W_inp', 'Wd', 'Wemb']
        if not (use_feat_enc & 1):
            update_list.extend(['WIemb', 'b_Img'])
            self.regularize.extend(['WIemb'])

        if params.get('class_out_factoring', 0) == 1:
            model['WdCls'] = initwTh(hidden_size,
                                     params['nClasses'])  # decoder
            model['bdCls'] = np.zeros(
                (params['nClasses'], )).astype(config.floatX)
            update_list.extend(['WdCls', 'bdCls'])
            self.regularize.extend(['WdCls'])

        # Register BOTH weight matrices of every stacked layer, mirroring how
        # 'lstm_W_hid' and 'lstm_W_inp' of the first layer are registered.
        for i in xrange(1, hidden_depth):
            for name in ('lstm_W_hid_' + str(i), 'lstm_W_inp_' + str(i)):
                update_list.append(name)
                self.regularize.append(name)

        if params.get('en_aux_inp', 0):
            if params.get('swap_aux', 1) == 1:
                if not (use_feat_enc & 2) or params.get(
                        'encode_gt_sentences', 0):
                    model['WIemb_aux'] = initwTh(
                        aux_inp_size, image_encoding_size)  # image encoder
                    model['b_Img_aux'] = np.zeros(
                        (image_encoding_size)).astype(config.floatX)
                    update_list.append('WIemb_aux')
                    self.regularize.append('WIemb_aux')
                    update_list.append('b_Img_aux')
                model['lstm_W_aux'] = initwTh(image_encoding_size,
                                              4 * hidden_size, 0.00005)
            else:
                model['lstm_W_aux'] = initwTh(aux_inp_size, 4 * hidden_size,
                                              0.001)
            update_list.append('lstm_W_aux')
            self.regularize.append('lstm_W_aux')

        if params.get('gen_input_noise', 0):
            # The noise projection is created but not added to update_list.
            self.noise_dim = params.get('gen_inp_noise_dim', 50)
            model['lstm_W_noise'] = initwTh(self.noise_dim, 4 * hidden_size,
                                            0.001)

        self.model_th = self.init_tparams(model)
        del model
        if params.get('use_gumbel_mse', 0):
            self.usegumbel = theano.shared(1)
            self.gumb_temp = theano.shared(
                numpy_floatX(params.get('gumbel_temp_init', 0.5)))
            #self.model_th['gumb_temp'] = self.gumb_temp
            self.softmax_smooth_factor = theano.shared(
                numpy_floatX(params.get('softmax_smooth_factor', 1.0)))
        else:
            self.usegumbel = theano.shared(0)
        self.update_list = update_list
    def build_model(self, tparams, options, xI=None, xAux=None, attn_nw=None):
        """Build the training graph of the generator.

        Args:
            tparams: mapping of parameter name to Theano variable.
            options: model configuration dict.
            xI: optional image embedding.  When ``None`` a symbolic matrix
                ``'xI'`` is created and projected through ``WIemb`` and
                ``b_Img``; otherwise the given value is used as the image
                embedding directly.
            xAux: optional auxiliary input.  When ``None`` a symbolic input
                ``'xAux'`` is created (a matrix, or a 3-tensor when
                ``attn_nw`` is given).
            attn_nw: optional object forwarded to ``basic_lstm_layer``.

        Returns:
            Tuple ``(use_noise, inp_list, f_pred_prob, cost, out_list,
            updatesLSTM)``.  ``cost`` is ``[masked negative log-likelihood
            divided by the number of columns of xW, masked sum of -log2
            probabilities]``.

        Raises:
            ValueError: if ``options['sched_sampling_mode']`` is set to
                anything other than ``'linear'``.

        Side effects:
            Sets ``self.trng`` to a new time-seeded ``RandomStreams``.
        """
        self.trng = RandomStreams(int(time.time()))

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')

        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        # Identity checks: comparing with "== None" is unreliable for
        # array-like arguments and is not the idiomatic test.
        if xI is None:
            xI = tensor.matrix('xI', dtype=config.floatX)
            embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img'])
            xI_is_inp = True
        else:
            embImg = xI
            xI_is_inp = False

        if xAux is None:
            if attn_nw is None:
                xAux = tensor.matrix('xAux', dtype=config.floatX)
            else:
                xAux = tensor.tensor3('xAux', dtype=config.floatX)
            if (options.get('swap_aux', 1)) and (attn_nw is None):
                xAuxEmb = tensor.dot(
                    xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
            else:
                xAuxEmb = xAux
            xA_is_inp = True
        else:
            xA_is_inp = False
            if options.get('encode_gt_sentences', 0):
                xAuxEmb = tensor.dot(
                    xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
            else:
                xAuxEmb = xAux

        # The image embedding becomes the first step of the input sequence.
        embImg = embImg.reshape([1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        # This is the implementation of input dropout.
        if options['use_dropout']:
            emb = dropout_layer(emb,
                                use_noise,
                                self.trng,
                                options['drop_prob_encoder'],
                                shp=emb.shape)
            if (options.get('en_aux_inp', 0)) and (attn_nw is None):
                xAuxEmb = dropout_layer(xAuxEmb,
                                        use_noise,
                                        self.trng,
                                        options['drop_prob_aux'],
                                        shp=xAuxEmb.shape)

        # Implement scheduled sampling.
        if options.get('sched_sampling_mode', None) is not None:
            curr_epoch = tensor.scalar(name='curr_epoch', dtype=config.floatX)

            # Assign the probabilities according to the scheduling mode
            if options['sched_sampling_mode'] == 'linear':
                prob = tensor.maximum(
                    options['sslin_min'], options['sched_sampling_const'] -
                    options['sslin_slope'] * curr_epoch)
            elif options['sched_sampling_mode'] == 'exp':
                raise ValueError(
                    'ERROR: %s --> This solver type is not yet supported' %
                    (options['sched_sampling_mode']))
            elif options['sched_sampling_mode'] == 'invsig':
                raise ValueError(
                    'ERROR: %s --> This solver type is not yet supported' %
                    (options['sched_sampling_mode']))
            else:
                raise ValueError(
                    'ERROR: %s --> This scheduling type is unknown' %
                    (options['sched_sampling_mode']))

            # Now to build the mask. We don't want to do this coin toss when
            # feeding in image feature and the start symbol
            sched_mask = self.trng.binomial((n_timesteps - 2, n_samples),
                                            p=prob,
                                            n=1,
                                            dtype='int64')
            sched_mask = tensor.concatenate(
                [sched_mask, tensor.alloc(1, 2, n_samples)], axis=0)
        else:
            sched_mask = []

        #############################################################################################################################
        # This implements core lstm
        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix=options['generator'],
                                             sched_prob_mask=sched_mask,
                                             attn_nw=attn_nw)
        #############################################################################################################################

        # NOTE1: we are leaving out the first prediction, which was made for the image and is meaningless.
        if options['use_dropout']:
            # XXX : Size given to dropout is missing one dimension. This keeps the dropped units consistent across time!?.
            # ###   Is this a good bug ?
            p = dropout_layer(
                sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size']), use_noise, self.trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
        else:
            p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size'])

        # Input of the class predictor; only needed for the factored output.
        if options.get('class_out_factoring', 0) == 1:
            if options.get('cls_diff_layer', 0) == 1:
                pC_inp = dropout_layer(
                    sliceT(rval[0][1:, :, :],
                           options.get('hidden_depth', 1) - 2,
                           options['hidden_size']), use_noise, self.trng,
                    options['drop_prob_decoder'],
                    (n_samples, options['hidden_size']))
            else:
                pC_inp = p

        n_out_samps = (n_timesteps - 1) * n_samples
        if options.get('class_out_factoring', 0) == 0:
            pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
                [n_out_samps, options['output_size']])
            if options.get('use_gumbel_mse', 0) == 0:
                pWSft = tensor.nnet.softmax(pW)
            else:
                # self.usegumbel switches between the Gumbel-softmax sample
                # and the plain softmax.
                w_out = ifelse(
                    self.usegumbel,
                    gumbel_softmax_sample(self.trng,
                                          pW,
                                          self.gumb_temp,
                                          hard=options.get(
                                              'use_gumbel_hard', False)),
                    tensor.nnet.softmax(pW))
                # This is not exactly right, but just testing
                pWSft = w_out

            totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]
            out_list = [pWSft, totProb, pW]
        else:
            # Class-factored output: P(word) = P(word | class) * P(class).
            ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
            xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
            if options.get('cls_zmean', 1):
                pW = ((tparams['Wd'][:, xC, :].T *
                       ((p.reshape([1, n_out_samps, options['hidden_size']]) -
                         tparams['WdCls'][:, xC].T))).sum(axis=-1).T +
                      tparams['bd'][:, xC, :])
            else:
                pW = ((tparams['Wd'][:, xC, :].T *
                       (p.reshape([1, n_out_samps, options['hidden_size']]))
                       ).sum(axis=-1).T + tparams['bd'][:, xC, :])
            pWSft = tensor.nnet.softmax(pW[0, :, :])

            pC = (tensor.dot(pC_inp, tparams['WdCls']) +
                  tparams['bdCls']).reshape([n_out_samps, options['nClasses']])
            pCSft = tensor.nnet.softmax(pC)

            totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                      pCSft[tensor.arange(n_out_samps), xC]
            out_list = [pWSft, pCSft, totProb, p]

        # The 1e-10 keeps the logarithms finite for zero probabilities.
        tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).sum()
        tot_pplx = -(tensor.log2(totProb + 1e-10) *
                     mask[1:, :].flatten()).sum()
        cost = [
            tot_cost / tensor.cast(n_samples, dtype=config.floatX), tot_pplx
        ]

        # Only variables created in this method are listed as inputs.
        inp_list = [xW, mask]
        if xI_is_inp:
            inp_list.append(xI)

        if options.get('en_aux_inp', 0) and xA_is_inp:
            inp_list.append(xAux)

        if options.get('sched_sampling_mode', None) is not None:
            inp_list.append(curr_epoch)

        f_pred_prob = theano.function([xW, xI, xAux],
                                      out_list,
                                      name='f_pred_prob',
                                      updates=updatesLSTM)

        return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
  def build_model(self, tparams, options):
    """Build the training graph relating image features and words to the cost.

    Args:
        tparams: mapping of parameter name to Theano variable.
        options: model configuration dict.

    Returns:
        Tuple ``(use_noise, inp_list, f_pred_prob, cost, out_list,
        updatesLSTM)``.  ``cost`` is ``[masked negative log-likelihood
        divided by options['batch_size'], masked sum of -log2
        probabilities]``.  ``f_pred_prob`` is an empty list because no
        prediction function is compiled here.

    Raises:
        ValueError: if ``options['sched_sampling_mode']`` is set to anything
            other than ``'linear'``.
    """
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')

    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape([n_timesteps,
                                                n_samples,
                                                options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    # Optionally project the auxiliary input before it reaches the LSTM.
    if options.get('swap_aux',0):
       xAuxEmb = tensor.dot(xAux,tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
       xAuxEmb = xAux

    # The image embedding becomes the first step of the input sequence.
    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape([1,n_samples,options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    # This is the implementation of input dropout.
    if options['use_dropout']:
        emb = dropout_layer(emb, use_noise, trng, options['drop_prob_encoder'], shp = emb.shape)
        if options.get('en_aux_inp',0):
            xAuxEmb = dropout_layer(xAuxEmb, use_noise, trng, options['drop_prob_aux'], shp = xAuxEmb.shape)

    # Implement scheduled sampling.
    if options.get('sched_sampling_mode',None) is not None:
        curr_epoch = tensor.scalar(name='curr_epoch',dtype=config.floatX)

        # Assign the probabilities according to the scheduling mode
        if options['sched_sampling_mode'] == 'linear':
            prob = tensor.maximum(options['sslin_min'],options['sched_sampling_const'] - options['sslin_slope'] * curr_epoch)
        elif options['sched_sampling_mode'] == 'exp':
            raise ValueError('ERROR: %s --> This solver type is not yet supported'%(options['sched_sampling_mode']))
        elif options['sched_sampling_mode'] == 'invsig':
            raise ValueError('ERROR: %s --> This solver type is not yet supported'%(options['sched_sampling_mode']))
        else:
            raise ValueError('ERROR: %s --> This scheduling type is unknown'%(options['sched_sampling_mode']))

        # Now to build the mask. We don't want to do this coin toss when
        # feeding in image feature and the start symbol
        sched_mask = trng.binomial((n_timesteps - 2, n_samples), p=prob, n=1, dtype='int64')
        sched_mask = tensor.concatenate([sched_mask, tensor.alloc(1, 2, n_samples)],axis=0)
    else:
        sched_mask = []

    #############################################################################################################################
    # This implements core lstm
    rval, updatesLSTM = basic_lstm_layer(tparams, emb[:n_timesteps,:,:], xAuxEmb, use_noise, options,
                                         prefix=options['generator'], sched_prob_mask = sched_mask)
    #############################################################################################################################

    # NOTE1: we are leaving out the first prediction, which was made for the image and is meaningless.
    if options['use_dropout']:
        # XXX : Size given to dropout is missing one dimension. This keeps the dropped units consistent across time!?.
        # ###   Is this a good bug ?
        p = dropout_layer(sliceT(rval[0][1:,:,:],options.get('hidden_depth',1)-1,options['hidden_size']), use_noise, trng,
            options['drop_prob_decoder'], (n_samples,options['hidden_size']))
    else:
        p = sliceT(rval[0][1:,:,:],options.get('hidden_depth',1)-1,options['hidden_size'])

    n_out_samps = (n_timesteps-1) * n_samples
    if options.get('class_out_factoring',0) == 0:
        pW = (tensor.dot(p,tparams['Wd']) + tparams['bd']).reshape([n_out_samps,options['output_size']])
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[tensor.arange(n_out_samps), xW[1:,:].flatten()]
        out_list = [pWSft, totProb, p]
    else:
        # Class-factored output: P(word) = P(word | class) * P(class).
        ixtoclsinfo_t = tensor.as_tensor_variable(options['ixtoclsinfo'])
        xC = ixtoclsinfo_t[xW[1:,:].flatten(),0]
        pW = ((tparams['Wd'][:,xC,:].T*(p.reshape([1,n_out_samps,options['hidden_size']]))).sum(axis=-1).T
             + tparams['bd'][:,xC,:])
        pWSft   = tensor.nnet.softmax(pW[0,:,:])
        pC    = (tensor.dot(p,tparams['WdCls']) + tparams['bdCls']).reshape([n_out_samps,options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)

        totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                  pCSft[tensor.arange(n_out_samps), xC]
        out_list = [pWSft, pCSft, totProb, p]

    # The 1e-10 keeps the logarithms finite for zero probabilities.
    tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:,:].flatten()).sum()
    tot_pplx = -(tensor.log2(totProb + 1e-10) * mask[1:,:].flatten()).sum()
    cost = [tot_cost/options['batch_size'], tot_pplx]

    inp_list = [xW, mask, xI]

    if options.get('en_aux_inp',0):
        inp_list.append(xAux)

    if options.get('sched_sampling_mode',None) is not None:
        inp_list.append(curr_epoch)

    # No prediction function is compiled; an empty placeholder is returned.
    f_pred_prob = []
    #theano.function(inp_list, p, name='f_pred_prob', updates=updatesLSTM)

    return use_noise, inp_list, f_pred_prob, cost, out_list , updatesLSTM
    def _stepP(*in_list):
        """Advance a beam search by one word using an ensemble of LSTMs.

        ``in_list`` holds, in order: ``nmodels`` word inputs, ``nmodels``
        hidden states, ``nmodels`` cell states, the running log-probability
        of every beam and the per-beam "done" flags.  ``nmodels``,
        ``tparams``, ``options``, ``prefix``, ``beam_size`` and
        ``aux_input2`` come from the enclosing scope.

        Returns:
            ``(outputs, until)`` where ``outputs`` lists the next word
            embeddings, hidden states and cell states of every model followed
            by ``[log-probabilities, done flags, word indices, source-beam
            indices]``, and ``until`` stops the scan once every beam is done.
        """
        word_in = list(in_list[:nmodels])
        hid_in = list(in_list[nmodels:2 * nmodels])
        cell_in = list(in_list[2 * nmodels:3 * nmodels])
        lP_ = in_list[3 * nmodels]
        dV_ = in_list[3 * nmodels + 1]

        # Placeholder; replaced by the first model's weighted softmax below.
        p_comb = tensor.alloc(numpy_floatX(0.), options[0]['output_size'])
        new_cells = []
        new_hids = []
        for k in xrange(nmodels):
            mdl = tparams[k]
            opt = options[k]

            preact = tensor.dot(hid_in[k], mdl[_p(prefix, 'W_hid')])
            preact += (tensor.dot(word_in[k], mdl[_p(prefix, 'W_inp')]) +
                       mdl[_p(prefix, 'b')])
            if opt.get('en_aux_inp', 0):
                preact += tensor.dot(aux_input2[k], mdl[_p(prefix, 'W_aux')])

            # Slices 0..3 of the pre-activation: input, forget and output
            # gates, then the candidate cell value.
            width = opt['hidden_size']
            in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, width))
            forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, width))
            out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, width))
            candidate = tensor.tanh(sliceT(preact, 3, width))

            cell = forget_gate * cell_in[k] + in_gate * candidate
            hid = out_gate * tensor.tanh(cell)
            new_cells.append(cell)
            new_hids.append(hid)

            # The ensemble distribution is the comb_weight-weighted sum of
            # the per-model softmax outputs.
            logits = tensor.dot(hid, mdl['Wd']) + mdl['bd']
            weighted = mdl['comb_weight'] * tensor.nnet.softmax(logits)
            p_comb = weighted if k == 0 else p_comb + weighted

        lProb = tensor.log(p_comb + 1e-20)

        def _beam_topk(lPLcl, lPprev, dVLcl):
            # Best beam_size extensions of a single beam.
            top = tensor.argsort(-lPLcl)[:beam_size]
            still_open = tensor.eq(dVLcl, tensor.zeros_like(dVLcl))
            # A finished beam keeps its score in slot 0 only; the remaining
            # slots get a very low score so they are never selected.
            frozen = tensor.fill(lPLcl[top], numpy_floatX(-10000.))
            frozen = tensor.set_subtensor(frozen[0], lPprev)
            best_lp = ifelse(still_open, lPLcl[top] + lPprev, frozen)
            best_ix = ifelse(still_open, top, tensor.zeros_like(top))
            return best_lp, best_ix

        per_beam, _ = theano.scan(_beam_topk,
                                  sequences=[lProb, lP_, dV_],
                                  name=_p(prefix, 'FindBest'),
                                  n_steps=word_in[0].shape[0])
        cand_lp = per_beam[0].flatten()
        cand_ix = per_beam[1].flatten()

        # Now sort and find the best among these best extensions for the
        # current beams.
        keep = tensor.argsort(-cand_lp)[:beam_size]
        xWlogProb = cand_lp[keep]
        xWIdx = cand_ix[keep]
        xCandIdx = keep // beam_size  # floor division: index of source beam

        # A beam is done once it has emitted word index 0.
        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

        next_words = [tparams[k]['Wemb'][xWIdx.flatten()]
                      for k in xrange(nmodels)]
        next_hids = [hid.take(xCandIdx.flatten(), axis=0)
                     for hid in new_hids]
        next_cells = [cell.take(xCandIdx.flatten(), axis=0)
                      for cell in new_cells]

        out_list = (next_words + next_hids + next_cells +
                    [xWlogProb, doneVec, xWIdx, xCandIdx])

        return out_list, theano.scan_module.until(doneVec.all())
    def lstm_layer(self,
                   tparams,
                   state_below,
                   aux_input,
                   use_noise,
                   options,
                   prefix='lstm',
                   mask=None):
        """Run a (possibly stacked) LSTM over ``state_below`` with ``scan``.

        Args:
            tparams: mapping of parameter name to Theano variable.
            state_below: input sequence; its first axis is scanned over.
            aux_input: auxiliary input given to every step; replaced by an
                empty list when ``options['en_aux_inp']`` is unset or 0.
            use_noise: not used by this method.
            options: model configuration dict.
            prefix: prefix of the parameter names looked up in ``tparams``.
            mask: required; scanned alongside the input, although the step
                function does not read it.

        Returns:
            ``(rval, updates)`` as returned by ``theano.scan``; ``rval``
            holds the concatenated hidden states and cell states of all
            layers for every step.
        """
        nsteps = state_below.shape[0]
        h_depth = options.get('hidden_depth', 1)
        h_sz = options['hidden_size']

        n_samples = state_below.shape[1] if state_below.ndim == 3 else 1

        assert mask is not None

        def _step(m_, x_, h_, c_, xAux):
            # Pre-activation of the first layer; x_ already contains the
            # input projection and the bias.
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += x_
            if options.get('en_aux_inp', 0):
                preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            #  preact += tparams[_p(prefix, 'b')]
            layer_h = []
            layer_c = []

            for di in xrange(h_depth):
                in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                candidate = tensor.tanh(sliceT(preact, 3, h_sz))

                cell = forget_gate * sliceT(c_, di, h_sz) + in_gate * candidate
                hid = out_gate * tensor.tanh(cell)
                layer_c.append(cell)
                layer_h.append(hid)

                if di < (h_depth - 1):
                    # The next layer combines its own previous hidden state
                    # with the fresh output of the current layer.
                    nxt = str(di + 1)
                    recurrent = tensor.dot(
                        sliceT(h_, di + 1, h_sz),
                        tparams[_p(prefix, ('W_hid_' + nxt))])
                    feed_fwd = tensor.dot(
                        hid, tparams[_p(prefix, ('W_inp_' + nxt))])
                    preact = recurrent + feed_fwd

            c_out = tensor.concatenate(layer_c, axis=1)
            h_out = tensor.concatenate(layer_h, axis=1)

            return h_out, c_out

        # Apply the input projection once for the whole sequence.
        projected = (tensor.dot(state_below, tparams[_p(prefix, 'W_inp')]) +
                     tparams[_p(prefix, 'b')])

        if options.get('en_aux_inp', 0) == 0:
            aux_input = []

        # Hidden and cell states of all layers start at zero.
        h_init = tensor.alloc(numpy_floatX(0.), n_samples, h_depth * h_sz)
        c_init = tensor.alloc(numpy_floatX(0.), n_samples, h_depth * h_sz)

        rval, updates = theano.scan(
            _step,
            sequences=[mask, projected],
            outputs_info=[h_init, c_init],
            non_sequences=[aux_input],
            name=_p(prefix, '_layers'),
            n_steps=nsteps)
        return rval, updates
    def build_eval_other_sent(self, tparams, options, model_npy):
        """Build Theano functions that score given sentences under this model.

        ``model_npy`` and ``self.model_th`` are first handed to ``zipp``
        (presumably this loads the numpy weights into the shared variables --
        confirm against ``zipp``).

        Args:
            tparams: mapping of parameter name to Theano variable.
            options: model configuration dict.
            model_npy: weights passed to ``zipp``.

        Returns:
            Tuple ``(use_noise, inp_list, self.f_pred_prob_other, cost, p,
            updatesLSTM)``.  ``cost`` is the masked natural-log probability
            of the target words accumulated over time (it is not negated);
            ``p`` holds the decoder pre-softmax scores without the first
            step.

        Side effects:
            Sets ``self.f_pred_prob_other`` and ``self.f_eval_other``.
        """
        zipp(model_npy, self.model_th)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')
        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        # The image embedding becomes the first step of the input sequence.
        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        rval, updatesLSTM = self.lstm_layer(tparams,
                                            emb[:n_timesteps, :, :],
                                            xAux,
                                            use_noise,
                                            options,
                                            prefix=options['generator'],
                                            mask=mask)
        p = rval[0]

        p = tensor.dot(p, tparams['Wd']) + tparams['bd']

        #pred = tensor.nnet.softmax(p)

        #pred = rval[2]

        #pred = pred[1:,:,:]
        # NOTE1: we are leaving out the first prediction, which was made for
        # the image and is meaningless.
        p = p[1:, :, :]

        def accumCost(pred, xW, m, c_sum, ppl_sum):
            # One time step: add the masked log-probability of the target
            # words to both running sums.
            pred = tensor.nnet.softmax(pred)
            c_sum += (tensor.log(pred[tensor.arange(n_samples), xW] + 1e-20) *
                      m)
            ppl_sum += -(
                tensor.log2(pred[tensor.arange(n_samples), xW] + 1e-10) * m)
            return c_sum, ppl_sum

        sums, _ = theano.scan(fn=accumCost,
                              outputs_info=[
                                  tensor.alloc(numpy_floatX(0.), 1,
                                               n_samples),
                                  tensor.alloc(numpy_floatX(0.), 1,
                                               n_samples)
                              ],
                              sequences=[p, xW[1:, :], mask[1:, :]])

        # sums[0] accumulates the natural-log probability and sums[1] the
        # negated log2 probability; only the final value of sums[0] is used.
        cost = sums[0][-1]

        inp_list = [xW, xI, mask]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        # Compiled once only: an identical second compilation used to be
        # stored in a local variable that was never read.
        self.f_pred_prob_other = theano.function(inp_list,
                                                 p,
                                                 name='f_pred_prob',
                                                 updates=updatesLSTM)
        #f_pred = theano.function([xW, mask], pred.argmax(axis=1), name='f_pred')

        #cost = -tensor.log(pred[tensor.arange(n_timesteps),tensor.arange(n_samples), xW] + 1e-8).mean()

        self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

        return use_noise, inp_list, self.f_pred_prob_other, cost, p, updatesLSTM
def main(params):
  """Train the evaluator model and periodically validate / checkpoint it.

  The function builds the vocabulary from the training sentences, constructs
  the ``decodeEvaluator`` model and its Theano cost graph, asks the ``Solver``
  for the gradient/update functions and then runs the optimisation loop.
  Roughly once a minute a JSON status report is written, and every
  ``params['eval_period']`` epochs the validation split is evaluated and a
  pickle checkpoint is written.

  Args:
    params: dict of run options. Keys read here include
      'word_count_threshold', 'max_epochs', 'solver', 'fine_tune',
      'multimodal_lstm', 'checkpoint_file_name', 'batch_size', 'regc',
      'sim_minibatch', 'eval_period', 'init_from_imagernn', 'maxlen',
      'eval_model', 'learning_rate', 'lr_decay', 'lr_decay_st_epoch',
      'sample_by_len', 'write_checkpoint_ppl_threshold', 'dataset',
      'fappend', 'worker_status_output_directory' and
      'checkpoint_output_directory'. The dict is mutated in place
      ('image_feat_size', 'use_dropout', 'mode', 'vocabulary_size',
      'learning_rate', 'lr_decay_st_epoch' and possibly 'batch_size').

  NOTE(review): ``checkpoint_init``, ``model_init_from`` and ``rg_init`` are
  not defined in this function; they are presumably module-level names set
  when a checkpoint is loaded -- confirm against the rest of the file.
  """
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname() # get computer hostname

  # fetch the data provider
  dp = getDataProvider(params)
  # Initialize the optimizer
  solver = Solver(params['solver'])

  params['image_feat_size'] = dp.img_feat_size

  misc = {} # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
  # at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold)
  params['use_dropout'] = 1

  if params['fine_tune'] == 1:
    # NOTE(review): both branches of this conditional yield the same string,
    # so 'multimodal_lstm' only has to be present in params; left untouched
    # because the intended alternative mode is not visible here.
    params['mode'] = 'multimodal_lstm' if params['multimodal_lstm'] == 0 else 'multimodal_lstm'
    if params['checkpoint_file_name'] is not None:
        params['batch_size'] = dp.dataset['batchsize']
        misc['wordtoix'] = checkpoint_init['wordtoix']
        misc['ixtoword'] = checkpoint_init['ixtoword']
    batch_size = 1
    num_sentences_total = dp.getSplitSize('train', ofwhat = 'images')
  else:
    params['mode'] = 'batchtrain'
    batch_size = params['batch_size']
    num_sentences_total = dp.getSplitSize('train', ofwhat = 'sentences')

  params['vocabulary_size'] = len(misc['wordtoix'])
  pos_samp = np.arange(batch_size,dtype=np.int32)

  # This initializes the model parameters and does matrix initializations
  evalModel = decodeEvaluator(params)
  model, misc['update'], misc['regularize'] = (evalModel.model_th, evalModel.updateP, evalModel.regularize)

  # Define the computational graph for relating the input image features and word indices to the
  # log probability cost function.
  (use_dropout, inp_list,
     miscOuts, cost, predTh, model) = evalModel.build_model(model, params)

  # Add the regularization cost. Since this is specific to training and doesn't get included when we
  # evaluate the cost on test or validation data, we leave it here outside the model definition
  if params['regc'] > 0.:
      reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
      reg_cost = 0.
      for p in misc['regularize']:
        reg_cost += (model[p] ** 2).sum()
      # Scale once, after summing over all regularized parameters. Doing the
      # scaling inside the loop re-multiplied the terms accumulated so far by
      # 0.5 * reg_c on every iteration.
      reg_cost *= 0.5 * reg_c
      # NOTE(review): cost is indexed here but passed whole to tensor.grad
      # below -- confirm what evalModel.build_model returns as cost.
      cost[0] += (reg_cost /params['batch_size'])

  # Compile an evaluation function.. Doesn't include gradients
  # To be used for validation set evaluation
  f_eval= theano.function(inp_list, cost, name='f_eval')

  # Now let's build a gradient computation graph and rmsprop update mechanism
  grads = tensor.grad(cost, wrt=model.values())
  lr = tensor.scalar(name='lr',dtype=config.floatX)
  if params['sim_minibatch'] > 0:
    f_grad_accum, f_clr, ag = solver.accumGrads(model,grads,inp_list,cost, params['sim_minibatch'])
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, ag,
                                      inp_list, cost, params)
  else:
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, grads,
                                      inp_list, cost, params)

  print('model init done.')
  print('model has keys: ' + ', '.join(model.keys()))

  # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images
  # Hence in case of coco/flickr this will 5* no of images
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  inner_loop =   params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1
  max_iters = max_iters / inner_loop
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs/ inner_loop))
  top_val_ppl2 = -1
  smooth_train_cost = len(misc['ixtoword']) # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0 # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []

  len_hist = defaultdict(int)

  ## Initialize the model parameters from the checkpoint file if we are resuming training
  if params['checkpoint_file_name'] is not None:
    zipp(model_init_from,model)
    zipp(rg_init,rg)
    print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \
      checkpoint_init['perplexity']))
  elif params['init_from_imagernn'] is not None:
    # Initialize word vecs and image emb from generative model file
    # NOTE: pickle.load executes arbitrary code from the file; only point
    # this option at trusted model files.
    with open(params['init_from_imagernn'], 'rb') as rnn_fh:
      rnnCv = pickle.load(rnn_fh)
    model['Wemb'].set_value(rnnCv['model']['Wemb'])
    model['WIemb'].set_value(rnnCv['model']['WIemb_aux'])
    misc['wordtoix'] = rnnCv['wordtoix']
    misc['ixtoword'] = rnnCv['ixtoword']
    print("\n Initialized Word embedding and Image embeddings from gen mode %s" % (params['init_from_imagernn']))

  def _next_batch_inputs():
    # Sample one batch and convert it into the input list of the compiled functions.
    batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'], params['mode'], thresh=0.3)
    real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'], pos_samp=pos_samp, prep_for=params['eval_model'])
    if params['fine_tune'] == 1:
      real_inp_list.append(pos_samp_sent)
    return real_inp_list

  # Enable using dropout in training
  use_dropout.set_value(1.)
  #################### Main Loop ############################################
  for it in xrange(max_iters):
    t0 = time.time()
    # fetch a batch of data; with simulated mini-batches, gradients of
    # inner_loop batches are accumulated first
    cost_inner = np.zeros((inner_loop,),dtype=np.float32)
    if params['sim_minibatch'] > 0:
        for i_l in xrange(inner_loop):
            real_inp_list = _next_batch_inputs()
            cost_inner[i_l] = f_grad_accum(*real_inp_list)
    else:
        real_inp_list = _next_batch_inputs()

    # evaluate cost, gradient and perform parameter update (in the simulated
    # mini-batch case this reuses the last inner batch as input)
    cost = f_grad_shared(*real_inp_list)
    f_update(params['learning_rate'])
    dt = time.time() - t0

    # Reset accumulated gradients to 0
    if params['sim_minibatch'] > 0:
        f_clr()

    # print training statistics
    epoch = it*inner_loop * 1.0 / num_iters_one_epoch
    total_cost = (np.e**-cost + (np.e**(-cost_inner)).sum()*(params['sim_minibatch'] > 0))/ (1 + params['sim_minibatch'])
    if it == 0:
      smooth_train_cost = total_cost
    else:
      smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost

    tnow = time.time()
    if tnow > last_status_write_time + 60*1: # every now and then lets write a report
      print('%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.3f' % (it, max_iters, dt,
            epoch, smooth_train_cost))
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['val_ppl2'] = val_ppl2 # just write the last available one
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      # Status reporting is best effort: a failed write must not stop training.
      try:
        with open(status_file, 'w') as status_fh:
          json.dump(json_worker_status, status_fh)
      except Exception as e: # todo be more clever here
        print('tried to write worker status into %s but got error:' % (status_file, ))
        print(e)

    ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it+1) == max_iters
    if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      # Disable using dropout in validation
      use_dropout.set_value(0.)

      val_ppl2 = eval_split_theano('val', dp, model, params, misc,f_eval) # perform the evaluation on VAL set
      # Once the decay start epoch is reached, decay the learning rate and
      # push the threshold one epoch further.
      if epoch - params['lr_decay_st_epoch'] >= 0:
        params['learning_rate'] = params['learning_rate'] * params['lr_decay']
        params['lr_decay_st_epoch'] += 1

      print('validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate']))
      if params['sample_by_len'] == 1:
        print(len_hist)

      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      # top_val_ppl2 is never updated in this function (it stays at -1), so
      # the first condition always holds and only the user threshold gates
      # checkpoint writing.
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (params['eval_model'], params['dataset'], host, params['fappend'],val_ppl2,smooth_train_cost)
          filepath = os.path.join(params['checkpoint_output_directory'], filename)
          model_npy = unzip(model)
          rgrads_npy = unzip(rg)
          checkpoint = {}
          checkpoint['it'] = it
          checkpoint['epoch'] = epoch
          checkpoint['model'] = model_npy
          checkpoint['rgrads'] = rgrads_npy
          checkpoint['params'] = params
          checkpoint['perplexity'] = val_ppl2
          checkpoint['wordtoix'] = misc['wordtoix']
          checkpoint['ixtoword'] = misc['ixtoword']
          # Checkpointing is best effort as well; report the error and go on.
          try:
            with open(filepath, "wb") as ckpt_fh:
              pickle.dump(checkpoint, ckpt_fh)
            print('saved checkpoint in %s' % (filepath, ))
          except Exception as e: # todo be more clever here
            print('tried to write checkpoint into %s but got error: ' % (filepath, ))
            print(e)

      # Re-enable dropout for the following training iterations
      use_dropout.set_value(1.)
    def build_model(self, tparams, options):
        """Build the symbolic graph that encodes a feature sequence with an LSTM.

        Args:
            tparams: model parameters, passed on to the LSTM layer builders.
            options: dict of model options. 'use_dropout' and
                'drop_prob_encoder' control input dropout;
                'encoder_add_mean' (optional) adds the length-normalised sum
                of the hidden states to the encoding.

        Returns:
            A tuple ``(use_noise, inp_list, feat_enc, updatesLSTM)`` where
            ``use_noise`` is the shared dropout switch (also stored on
            ``self.use_noise``), ``inp_list`` is ``[xI, samp_lens]``,
            ``feat_enc`` is the encoded feature expression and
            ``updatesLSTM`` are the updates returned by the forward LSTM.
        """
        trng = RandomStreams(1234)

        # Used for dropout.
        self.use_noise = theano.shared(numpy_floatX(0.))

        if self.use_shared_features == False:
            # Features are fed directly as a 3-D float tensor.
            xI = tensor.tensor3('xI', dtype=config.floatX)
            xIemb = xI
            n_timesteps = xI.shape[0]
            n_samples = xI.shape[1]
        else:
            # Features live in self.features; xI holds row indices into it.
            xI = tensor.matrix('xI', dtype='int64')
            n_timesteps = xI.shape[0]
            n_samples = xI.shape[1]
            xIemb = self.features[xI.flatten(), :].reshape(
                [n_timesteps, n_samples, self.image_feat_size])

        samp_lens = tensor.vector('sL', dtype='int64')

        # This is implementation of input dropout !!
        if options['use_dropout']:
            emb = dropout_layer(xIemb,
                                self.use_noise,
                                trng,
                                options['drop_prob_encoder'],
                                shp=xIemb.shape)
        else:
            # Without dropout the raw embedding is fed to the LSTM. Previously
            # `emb` was left undefined on this path, raising a NameError below.
            emb = xIemb

        #############################################################################################################################
        # This implements core lstm
        rval, updatesLSTM = self.lstm_enc_layer(tparams,
                                                emb,
                                                prefix=self.mp + 'lstm')
        #############################################################################################################################
        # This implements core reverse lstm
        if self.encoder == 'bilstm':
            rev_rval, rev_updatesLSTM = basic_lstm_layer(tparams,
                                                         emb[::-1, :, :],
                                                         prefix=self.mp +
                                                         'rev_lstm')
        #############################################################################################################################

        # For every sample i pick the LSTM output at time index samp_lens[i]
        # and slice it with sliceT (presumably selecting one layer's block of
        # the hidden state -- confirm against sliceT).
        p = sliceT(rval[0][samp_lens, tensor.arange(n_samples), :],
                   self.hidden_depth, self.hidden_size)

        if self.encoder == 'bilstm':
            # The reversed pass contributes its output at the last time step.
            rev_p = sliceT(rev_rval[0][-1, :, :], self.hidden_depth,
                           self.hidden_size)

        feat_enc = p + rev_p if self.encoder == 'bilstm' else p

        if options.get('encoder_add_mean', 0):
            # Add the sum over time of the sliced outputs divided by samp_lens.
            feat_enc = feat_enc + (sliceT(rval[0], self.hidden_depth,
                                          self.hidden_size).sum(axis=0) /
                                   samp_lens[:, None])

        inp_list = [xI, samp_lens]

        return self.use_noise, inp_list, feat_enc, updatesLSTM
  def lstm_advers_gen_layer(self, tparams, Xi, aux_input, options, beam_size, prefix='lstm'):
    """Build the symbolic greedy-decoding graph of the (multi-layer) LSTM generator.

    The image feature ``Xi`` is fed as the first input, followed by the
    embedding in row 0 of ``tparams['Wemb']``; after that ``theano.scan``
    feeds each step's output back in for at most ``maxlen - 1`` further
    steps, stopping early once index 0 is selected.

    Returns a 4-tuple: the accumulated log probability after the last step,
    the selected indices of all steps stacked along axis 0, the softmax
    outputs of all steps stacked along axis 0, and the scan updates.
    """
    max_steps = options.get('maxlen',15)
    n_samples = 1
    depth = options.get('hidden_depth',1)
    hid = options['hidden_size']
    aux_enabled = options.get('en_aux_inp',0)

    # ----------------------  STEP FUNCTION  ---------------------- #
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        # Pre-activation of the first layer: recurrent + input + bias (+ aux).
        recurrent_part = tensor.dot(sliceT(h_, 0, hid), tparams[_p(prefix, 'W_hid')])
        input_part = tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) + tparams[_p(prefix, 'b')]
        preact = recurrent_part + input_part
        if aux_enabled:
            preact = preact + tensor.dot(xAux,tparams[_p(prefix,'W_aux')])

        hidden_states = []
        cell_states = []
        for layer in xrange(depth):
            in_gate = tensor.nnet.sigmoid(sliceT(preact, 0, hid))
            forget_gate = tensor.nnet.sigmoid(sliceT(preact, 1, hid))
            out_gate = tensor.nnet.sigmoid(sliceT(preact, 2, hid))
            candidate = tensor.tanh(sliceT(preact, 3, hid))
            cell = forget_gate * sliceT(c_, layer, hid) + in_gate * candidate
            hidden = out_gate * tensor.tanh(cell)
            cell_states.append(cell)
            hidden_states.append(hidden)
            if layer < (depth - 1):
                # Pre-activation of the next layer from its own previous
                # hidden state and this layer's fresh output.
                nxt = str(layer+1)
                preact = tensor.dot(sliceT(h_, layer+1, hid), tparams[_p(prefix, ('W_hid_' + nxt))]) + \
                        tensor.dot(hidden, tparams[_p(prefix, ('W_inp_' + nxt))])

        c = tensor.concatenate(cell_states,axis=1)
        h = tensor.concatenate(hidden_states,axis=1)

        # Decode the top layer into a (smoothed) softmax over the vocabulary.
        logits = tensor.dot(hidden_states[-1],tparams['Wd']) + tparams['bd']
        smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f')
        p = tensor.nnet.softmax(logits*smooth_factor)
        lProb = tensor.log(p + 1e-20).flatten()

        # Greedy choice: take the most probable word and accumulate its log prob.
        xWIdx = tensor.argmax(lProb,keepdims=True)
        xWlogProb = lProb[xWIdx] + lP_

        if options.get('softmax_propogate',0) == 0:
            # Feed back the embedding of the chosen word ...
            xW = tparams['Wemb'][xWIdx.flatten()]
        else:
            # ... or the probability-weighted mix of all embeddings.
            xW = p.dot(tparams['Wemb'])
        doneVec = tensor.eq(xWIdx,tensor.zeros_like(xWIdx))

        outputs = [xW, h, c, xWlogProb, doneVec, xWIdx, p]
        return outputs, theano.scan_module.until(doneVec.all())
    # ------------------- END of STEP FUNCTION  -------------------- #

    if aux_enabled == 0:
       aux_input = []

    state_width = hid*depth
    h = tensor.alloc(numpy_floatX(0.),n_samples,state_width)
    c = tensor.alloc(numpy_floatX(0.),n_samples,state_width)

    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)

    # Propagate the image feature vector; only the states are kept from this step
    first_out, _ = _stepP(Xi, h, c, lP, dV,aux_input)
    xW, h, c = first_out[0], first_out[1], first_out[2]

    # Second step: feed the embedding stored in row 0 of Wemb
    xWStart = tparams['Wemb'][0,:]
    second_out, _ = _stepP(xWStart, h, c, lP, dV, aux_input)
    xW, h, c, lP, dV, idx0, p0 = second_out

    # Now lets do the loop.
    rval, updates = theano.scan(_stepP, outputs_info=[xW, h, c, lP, dV, None, None], non_sequences = [aux_input], name=_p(prefix, 'predict_layers'), n_steps=max_steps-1)

    # Prepend the outputs of the manually unrolled second step to the scan outputs
    all_idx = tensor.concatenate([idx0.reshape([1,beam_size]), rval[5]],axis=0)
    all_probs = tensor.concatenate([tensor.shape_padleft(p0,n_ones=1),rval[6]],axis=0)
    return rval[3][-1], all_idx, all_probs, updates
def main(params):
    """Train the LSTM caption generator and periodically validate / checkpoint it.

    The function builds the vocabulary from the training sentences,
    constructs the ``LSTMGenerator`` model and its Theano cost graph, builds
    the rmsprop gradient/update functions and then runs the optimisation
    loop. Roughly once a minute a JSON status report is written, and every
    ``params['eval_period']`` epochs the validation split is evaluated; a
    pickle checkpoint is written when the validation perplexity improves.

    Args:
        params: dict of run options. Keys read here include 'batch_size',
            'word_count_threshold', 'max_epochs', 'regc', 'eval_period',
            'checkpoint_file_name', 'sample_by_len', 'use_pos_tag',
            'learning_rate', 'lr_decay', 'lr_decay_st_epoch',
            'write_checkpoint_ppl_threshold', 'dataset', 'fappend',
            'worker_status_output_directory' and
            'checkpoint_output_directory'. The dict is mutated in place
            ('aux_inp_size', 'image_feat_size', 'vocabulary_size',
            'output_size', 'use_dropout', 'learning_rate',
            'lr_decay_st_epoch').

    NOTE(review): ``checkpoint_init``, ``model_init_from``, ``rg_init`` and
    ``sentTagMap`` are not defined in this function; they are presumably
    module-level names -- confirm against the rest of the file.
    """
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(params)

    params['aux_inp_size'] = dp.aux_inp_size
    params['image_feat_size'] = dp.img_feat_size

    print('Image feature size is %d, and aux input size is %d' % (
        params['image_feat_size'], params['aux_inp_size']))

    # stores various misc items that need to be passed around the framework
    misc = {}

    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)
    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though
    params['use_dropout'] = 1

    # This initializes the model parameters and does matrix initializations
    lstmGenerator = LSTMGenerator(params)
    model, misc['update'], misc['regularize'] = (lstmGenerator.model_th,
                                                 lstmGenerator.update,
                                                 lstmGenerator.regularize)

    # force overwrite here. The bias to the softmax is initialized to reflect word frequencies
    # This is a bit of a hack, not happy about it
    model['bd'].set_value(bias_init_vector.astype(config.floatX))

    # Define the computational graph for relating the input image features and word indices to the
    # log probability cost function.
    (use_dropout, inp_list, f_pred_prob, cost, predTh,
     updatesLSTM) = lstmGenerator.build_model(model, params)

    # Add the regularization cost. Since this is specific to training and doesn't get included when we
    # evaluate the cost on test or validation data, we leave it here outside the model definition
    if params['regc'] > 0.:
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p]**2).sum()
        # Scale once, after summing over all regularized parameters. Doing
        # the scaling inside the loop re-multiplied the terms accumulated so
        # far by 0.5 * reg_c on every iteration.
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    # rmsprop returns the gradient function, the update function and the
    # shared variables holding gradients, running squared gradients and
    # update directions.
    f_grad_shared, f_update, zg, rg, ud = lstmGenerator.rmsprop(
        lr, model, grads, inp_list, cost, params)

    print('model init done.')
    print('model has keys: ' + ', '.join(model.keys()))

    # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images
    # Hence in case of coco/flickr this will 5* no of images
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs))
    top_val_ppl2 = -1
    # initially size of dictionary of confusion
    smooth_train_ppl2 = len(misc['ixtoword'])
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []

    len_hist = defaultdict(int)

    ## Initialize the model parameters from the checkpoint file if we are resuming training
    # (the option is compared against the string 'None', not the None object)
    if params['checkpoint_file_name'] != 'None':
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \
          checkpoint_init['perplexity']))

    for it in xrange(max_iters):
        t0 = time.time()
        # fetch a batch of data
        if params['sample_by_len'] == 0:
            batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
        else:
            batch, l = dp.getRandBatchByLen(batch_size)
            len_hist[l] += 1

        if params['use_pos_tag'] != 'None':
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], None,
                                               sentTagMap, misc['ixtoword'])
        else:
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'])

        # Enable using dropout in training
        use_dropout.set_value(1.)

        # evaluate cost, gradient and perform parameter update
        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # print training statistics
        train_ppl2 = (2**(cost[1] / lenS))
        if it == 0:
            # start out where we start out
            smooth_train_ppl2 = train_ppl2
        else:
            # smooth exponentially decaying moving average
            smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
        epoch = it * 1.0 / num_iters_one_epoch
        total_cost = cost[0]

        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            print('%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' % (
                it, max_iters, dt, epoch, total_cost, smooth_train_ppl2))
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['smooth_train_ppl2'] = smooth_train_ppl2
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            jstatus['train_ppl2'] = train_ppl2
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            # Status reporting is best effort: a failed write must not stop training.
            try:
                with open(status_file, 'w') as status_fh:
                    json.dump(json_worker_status, status_fh)
            except Exception as e:  # todo be more clever here
                print('tried to write worker status into %s but got error:' % (
                    status_file, ))
                print(e)

        ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0
                and it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            use_dropout.set_value(0.)

            # perform the evaluation on VAL set
            val_ppl2 = eval_split_theano('val', dp, model, params, misc,
                                         f_eval)

            # Once the decay start epoch is reached, decay the learning rate
            # and push the threshold one epoch further.
            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params[
                    'lr_decay']
                params['lr_decay_st_epoch'] += 1

            print('validation perplexity = %f, lr = %f' % (
                val_ppl2, params['learning_rate']))
            if params['sample_by_len'] == 1:
                print(len_hist)

            write_checkpoint_ppl_threshold = params[
                'write_checkpoint_ppl_threshold']
            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    # if we beat a previous record or if this is the first time
                    # AND we also beat the user-defined threshold or it doesnt exist
                    top_val_ppl2 = val_ppl2
                    filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (
                        params['dataset'], host, params['fappend'], val_ppl2)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['wordtoix'] = misc['wordtoix']
                    checkpoint['ixtoword'] = misc['ixtoword']
                    # Checkpointing is best effort as well; report and go on.
                    try:
                        with open(filepath, "wb") as ckpt_fh:
                            pickle.dump(checkpoint, ckpt_fh)
                        print('saved checkpoint in %s' % (filepath, ))
                    except Exception as e:  # todo be more clever here
                        print('tried to write checkpoint into %s but got error: ' % (
                            filepath, ))
                        print(e)
def main(params):
  """Train the LSTM caption generator configured by ``params``.

  Builds the vocabulary from the training sentences, constructs the Theano
  evaluation / gradient / rmsprop-update functions and then runs the training
  loop for params['max_epochs'] epochs (one epoch = one pass over all training
  sentences).

  Side effects:
    * ``params`` is mutated in place: derived sizes are stored in it and
      params['learning_rate'] / params['lr_decay_st_epoch'] change as the
      learning rate decays.
    * At most about once a minute a JSON status report is written to
      params['worker_status_output_directory'].
    * After each validation pass, if validation perplexity is the best so far
      and beats params['write_checkpoint_ppl_threshold'] (a negative threshold
      disables that check), a pickled checkpoint is written to
      params['checkpoint_output_directory'].

  Failures while writing the status file or a checkpoint are printed and
  otherwise ignored so that training continues.
  """
  batch_size = params['batch_size']
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname()  # hostname tags the status and checkpoint files

  # fetch the data provider
  dp = getDataProvider(params)

  params['aux_inp_size'] = dp.aux_inp_size
  params['image_feat_size'] = dp.img_feat_size

  print('Image feature size is %d, and aux input size is %d' % (
      params['image_feat_size'], params['aux_inp_size']))

  misc = {}  # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
  # at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
      dp.iterSentences('train'), word_count_threshold)
  params['vocabulary_size'] = len(misc['wordtoix'])
  params['output_size'] = len(misc['ixtoword'])  # these should match though
  params['use_dropout'] = 1

  # This initializes the model parameters and does matrix initializations
  lstmGenerator = LSTMGenerator(params)
  model, misc['update'], misc['regularize'] = (
      lstmGenerator.model_th, lstmGenerator.update, lstmGenerator.regularize)

  # force overwrite here. The bias to the softmax is initialized to reflect word frequencies
  # This is a bit of a hack, not happy about it
  model['bd'].set_value(bias_init_vector.astype(config.floatX))

  # Define the computational graph for relating the input image features and word indices to the
  # log probability cost function.
  (use_dropout, inp_list,
   f_pred_prob, cost, predTh, updatesLSTM) = lstmGenerator.build_model(model, params)

  # Add the regularization cost. Since this is specific to training and doesn't get included when we
  # evaluate the cost on test or validation data, we leave it here outside the model definition
  if params['regc'] > 0.:
      reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
      reg_cost = 0.
      for p in misc['regularize']:
        reg_cost += (model[p] ** 2).sum()
      # Scale once, after summing: scaling inside the loop would multiply the
      # terms accumulated so far by 0.5 * regc again on every later iteration.
      reg_cost *= 0.5 * reg_c
      cost[0] += (reg_cost / params['batch_size'])

  # Compile an evaluation function.. Doesn't include gradients
  # To be used for validation set evaluation
  f_eval = theano.function(inp_list, cost, name='f_eval')

  # Now let's build a gradient computation graph and rmsprop update mechanism
  grads = tensor.grad(cost[0], wrt=model.values())
  lr = tensor.scalar(name='lr', dtype=config.floatX)
  f_grad_shared, f_update, zg, rg, ud = lstmGenerator.rmsprop(lr, model, grads,
                                                              inp_list, cost, params)

  print('model init done.')
  print('model has keys: ' + ', '.join(model.keys()))

  # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images
  # Hence in case of coco/flickr this will 5* no of images
  num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  top_val_ppl2 = -1  # negative means "no validation result recorded yet"
  smooth_train_ppl2 = len(misc['ixtoword'])  # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0  # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []

  len_hist = defaultdict(int)  # how often each batch length key was sampled

  ## Initialize the model parameters from the checkpoint file if we are resuming training
  # NOTE(review): model_init_from, rg_init and checkpoint_init are not defined
  # inside this function; presumably they are set at module level before
  # main() runs -- confirm, otherwise resuming raises NameError.
  if params['checkpoint_file_name'] != 'None':
    zipp(model_init_from, model)
    zipp(rg_init, rg)
    print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (
        checkpoint_init['epoch'], checkpoint_init['perplexity']))

  for it in xrange(max_iters):
    t0 = time.time()
    # fetch a batch of data
    if params['sample_by_len'] == 0:
        batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
    else:
        batch, l = dp.getRandBatchByLen(batch_size)
        len_hist[l] += 1

    # NOTE(review): sentTagMap is not defined inside this function either;
    # presumably a module-level name -- confirm.
    if params['use_pos_tag'] != 'None':
        real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], None, sentTagMap, misc['ixtoword'])
    else:
        real_inp_list, lenS = prepare_data(batch, misc['wordtoix'])

    # Enable using dropout in training
    use_dropout.set_value(1.)

    # evaluate cost, gradient and perform parameter update
    batch_cost = f_grad_shared(*real_inp_list)
    f_update(params['learning_rate'])
    dt = time.time() - t0

    # training statistics: batch_cost[1] is the summed -log2 probability
    train_ppl2 = (2 ** (batch_cost[1] / lenS))
    smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2  # smooth exponentially decaying moving average
    if it == 0:
      smooth_train_ppl2 = train_ppl2  # start out where we start out
    epoch = it * 1.0 / num_iters_one_epoch
    total_cost = batch_cost[0]

    tnow = time.time()
    if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
      print('%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' % (
          it, max_iters, dt, epoch, total_cost, smooth_train_ppl2))
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['smooth_train_ppl2'] = smooth_train_ppl2
      jstatus['val_ppl2'] = val_ppl2  # just write the last available one
      jstatus['train_ppl2'] = train_ppl2
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      try:
        # 'with' guarantees the handle is closed even if serialization fails
        with open(status_file, 'w') as status_fh:
          json.dump(json_worker_status, status_fh)
      except Exception as e:  # best effort: a failed status write must not stop training
        print('tried to write worker status into %s but got error:' % (status_file, ))
        print(e)

    ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it + 1) == max_iters
    if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      # Disable using dropout in validation
      use_dropout.set_value(0.)

      val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval)  # perform the evaluation on VAL set

      # Decay the learning rate once per elapsed epoch after lr_decay_st_epoch
      if epoch - params['lr_decay_st_epoch'] >= 0:
        params['learning_rate'] = params['learning_rate'] * params['lr_decay']
        params['lr_decay_st_epoch'] += 1

      print('validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate']))
      if params['sample_by_len'] == 1:
        print(len_hist)

      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      # we beat a previous record or this is the first evaluation ...
      is_new_best = val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0
      # ... AND we also beat the user-defined threshold or it doesnt exist
      beats_threshold = (val_ppl2 < write_checkpoint_ppl_threshold
                         or write_checkpoint_ppl_threshold < 0)
      if is_new_best and beats_threshold:
        top_val_ppl2 = val_ppl2
        filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (
            params['dataset'], host, params['fappend'], val_ppl2)
        filepath = os.path.join(params['checkpoint_output_directory'], filename)
        checkpoint = {}
        checkpoint['it'] = it
        checkpoint['epoch'] = epoch
        checkpoint['model'] = unzip(model)
        checkpoint['rgrads'] = unzip(rg)
        checkpoint['params'] = params
        checkpoint['perplexity'] = val_ppl2
        checkpoint['wordtoix'] = misc['wordtoix']
        checkpoint['ixtoword'] = misc['ixtoword']
        try:
          with open(filepath, "wb") as ckpt_fh:
            pickle.dump(checkpoint, ckpt_fh)
          print('saved checkpoint in %s' % (filepath, ))
        except Exception as e:  # best effort: keep training if the write fails
          print('tried to write checkpoint into %s but got error: ' % (filepath, ))
          print(e)
    def build_model(self, tparams, options):
        """Build the symbolic training graph that runs a forward and a
        time-reversed LSTM over each sentence and sums their hidden outputs.

        Symbolic inputs created here: ``xW`` (int64 word indices, indexed
        [timestep, sample]), ``mask`` (floatX, same layout), ``xI`` (image
        features) and ``xAux`` (auxiliary features).

        Returns a tuple
        ``(use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM)``:
          * use_noise   -- shared scalar passed to every dropout layer
          * inp_list    -- [xW, mask, xI], plus xAux when options['en_aux_inp']
          * f_pred_prob -- ['', f, ''] where f is a compiled function returning
                           the per-sentence sum of masked log-probabilities
          * cost        -- [negative log-likelihood / options['batch_size'],
                           summed negative log2-likelihood]
          * out_list    -- softmax outputs, target probabilities and the
                           forward hidden output
          * updatesLSTM -- updates returned by the forward LSTM layer
        """
        trng = RandomStreams(1234)

        # Shared switch handed to every dropout layer.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')
        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]

        # Word embeddings in sentence order and in reversed time order.
        word_emb_shape = [n_timesteps, n_samples, options['word_encoding_size']]
        embW = tparams['Wemb'][xW.flatten()].reshape(word_emb_shape)
        embW_rev = tparams['Wemb'][xW[::-1, :].flatten()].reshape(
            word_emb_shape)

        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        # Optionally project the auxiliary input; xAux itself stays the
        # graph input that goes into inp_list.
        if options.get('swap_aux', 0):
            xAuxEmb = (tensor.dot(xAux, tparams['WIemb_aux']) +
                       tparams['b_Img_aux'])
        else:
            xAuxEmb = xAux

        img_proj = tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']
        embImg = img_proj.reshape(
            [1, n_samples, options['image_encoding_size']])
        # Forward stream: image embedding first, then the words.
        emb = tensor.concatenate([embImg, embW], axis=0)

        # Reverse stream: write the image embedding into one time slot per
        # sample, chosen from the reversed mask.
        rev_slot = mask[::-1, :].argmax(axis=0) - 1
        emb_rev = tensor.set_subtensor(
            embW_rev[rev_slot, tensor.arange(n_samples), :],
            embImg[0, :, :])

        # Input dropout.
        if options['use_dropout']:
            emb = dropout_layer(emb, use_noise, trng,
                                options['drop_prob_encoder'],
                                shp=emb.shape)
            if options.get('en_aux_inp', 0):
                xAuxEmb = dropout_layer(xAuxEmb, use_noise, trng,
                                        options['drop_prob_aux'],
                                        shp=xAuxEmb.shape)

        # Core forward LSTM.
        rval, updatesLSTM = basic_lstm_layer(
            tparams, emb[:n_timesteps, :, :], xAuxEmb, use_noise, options,
            prefix='lstm', sched_prob_mask=[])
        # Core reverse LSTM (its updates are not used below).
        rev_rval, rev_updatesLSTM = basic_lstm_layer(
            tparams, emb_rev[:n_timesteps, :, :], xAuxEmb, use_noise, options,
            prefix='rev_lstm', sched_prob_mask=[])

        # NOTE1: we are leaving out the first prediction, which was made for
        # the image and is meaningless.
        depth = options.get('hidden_depth', 1)
        hidden_size = options['hidden_size']
        fwd_hidden = sliceT(rval[0][1:, :, :], depth, hidden_size)
        rev_hidden = sliceT(rev_rval[0][:, :, :], depth, hidden_size)
        if options['use_dropout']:
            # XXX : Size given to dropout is missing one dimension. This keeps
            # the dropped units consistent across time!?. Is this a good bug ?
            drop_shape = (n_samples, hidden_size)
            p = dropout_layer(fwd_hidden, use_noise, trng,
                              options['drop_prob_decoder'], drop_shape)
            rev_p = dropout_layer(rev_hidden, use_noise, trng,
                                  options['drop_prob_decoder'], drop_shape)
        else:
            p = fwd_hidden
            rev_p = rev_hidden

        n_out_samps = (n_timesteps - 2) * n_samples
        sample_rows = tensor.arange(n_out_samps)
        if options.get('class_out_factoring', 0) == 0:
            # Sum the forward states with the time-aligned reverse states and
            # decode them with one softmax over the whole vocabulary.
            combined = p[:-1, :, :] + rev_p[::-1, :, :][2:, :, :]
            logits = tensor.dot(combined, tparams['Wd']) + tparams['bd']
            pW = logits.reshape([n_out_samps, options['output_size']])
            pWSft = tensor.nnet.softmax(pW)
            totProb = pWSft[sample_rows, xW[1:-1, :].flatten()]
            out_list = [pWSft, totProb, p]
        else:
            # Class-factored output: P(word) = P(word | class) * P(class).
            ixtoclsinfo_t = tensor.as_tensor_variable(options['ixtoclsinfo'])
            next_words = xW[1:, :].flatten()
            xC = ixtoclsinfo_t[next_words, 0]
            p_flat = p.reshape([1, n_out_samps, hidden_size])
            pW = ((tparams['Wd'][:, xC, :].T * p_flat).sum(axis=-1).T +
                  tparams['bd'][:, xC, :])
            pWSft = tensor.nnet.softmax(pW[0, :, :])
            cls_logits = tensor.dot(p, tparams['WdCls']) + tparams['bdCls']
            pC = cls_logits.reshape([n_out_samps, options['nClasses']])
            pCSft = tensor.nnet.softmax(pC)

            word_in_cls = ixtoclsinfo_t[next_words, 3]
            totProb = pWSft[sample_rows, word_in_cls] * pCSft[sample_rows, xC]
            out_list = [pWSft, pCSft, totProb, p]

        # XXX : THIS IS VERY FISHY, CHECK THE MASK INDEXING AGAIN
        valid = mask[1:-1, :].flatten()
        probs_valid = tensor.log(totProb + 1e-10) * valid
        tot_cost = -(probs_valid.sum())
        tot_pplx = -(tensor.log2(totProb + 1e-10) * valid).sum()
        cost = [tot_cost / options['batch_size'], tot_pplx]

        inp_list = [xW, mask, xI]
        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)
        # NOTE(review): curr_epoch is not defined anywhere in this method;
        # enabling sched_sampling_mode looks like it raises NameError unless
        # a module-level curr_epoch exists -- confirm.
        if options.get('sched_sampling_mode', None) != None:
            inp_list.append(curr_epoch)

        # Per-sentence log-probability: sum the masked terms over time.
        per_sent_prob = probs_valid.reshape(
            [n_timesteps - 2, n_samples]).sum(axis=0)
        f_per_sentLogP = theano.function(inp_list, per_sent_prob,
                                         name='f_pred_logprob',
                                         updates=updatesLSTM)
        f_pred_prob = ['', f_per_sentLogP, '']

        return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
  def build_model(self, tparams, options):
    """Build the symbolic training graph for the single-direction LSTM.

    Symbolic inputs created here: ``xW`` (int64 word indices, indexed
    [timestep, sample]), ``xI`` (image features), ``mask`` (floatX, same
    layout as xW) and ``xAux`` (auxiliary features).

    Returns a tuple
    ``(use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM)``:
      * use_noise   -- shared scalar passed to every dropout layer
      * inp_list    -- [xW, xI, mask], plus xAux when options['en_aux_inp']
      * f_pred_prob -- compiled function returning ``p``
      * cost        -- [summed negative natural-log likelihood divided by
                       options['batch_size'], summed negative log2 likelihood]
      * p           -- pre-softmax decoder outputs with the first time step
                       (the one made for the image) dropped
      * updatesLSTM -- updates returned by self.lstm_layer
    """
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')

    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape([n_timesteps,
                                                n_samples,
                                                options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    # Image embedding is fed as the first time step, followed by the words.
    emb = tensor.concatenate([embImg, embW], axis=0)

    # Keep the xAux placeholder untouched and route the (possibly dropped-out)
    # tensor through a separate name, so that inp_list below receives the
    # graph input rather than the output of the dropout layer.
    aux_in = xAux

    #This is implementation of input dropout !!
    if options['use_dropout']:
        emb = self.dropout_layer(emb, use_noise, trng, options['drop_prob_encoder'], shp=emb.shape)
        if options.get('en_aux_inp', 0):
            aux_in = self.dropout_layer(xAux, use_noise, trng, options['drop_prob_aux'], shp=xAux.shape)

    # This implements core lstm
    rval, updatesLSTM = self.lstm_layer(tparams, emb[:n_timesteps, :, :], aux_in, use_noise, options,
                                        prefix=options['generator'], mask=mask)

    # Hidden output of the last layer (index hidden_depth - 1).
    top_hidden = sliceT(rval[0], options.get('hidden_depth', 1) - 1, options['hidden_size'])
    if options['use_dropout']:
        p = self.dropout_layer(top_hidden, use_noise, trng,
                               options['drop_prob_decoder'], (n_samples, options['hidden_size']))
    else:
        p = top_hidden

    p = tensor.dot(p, tparams['Wd']) + tparams['bd']

    # NOTE1: we are leaving out the first prediction, which was made for the image
    # and is meaningless.
    p = p[1:, :, :]

    def accumCost(pred, xW, m, c_sum, ppl_sum):
        # One scan step: add this time step's masked negative log-likelihood
        # (natural log and log2) to the running totals.
        pred = tensor.nnet.softmax(pred)
        target_prob = pred[tensor.arange(n_samples), xW]
        c_sum += -(tensor.log(target_prob + 1e-10) * m).sum()
        ppl_sum += -(tensor.log2(target_prob + 1e-10) * m).sum()
        return c_sum, ppl_sum

    sums, _ = theano.scan(fn=accumCost,
                          outputs_info=[tensor.as_tensor_variable(numpy_floatX(0.)),
                                        tensor.as_tensor_variable(numpy_floatX(0.))],
                          sequences=[p, xW[1:, :], mask[1:, :]])

    # cost[0] is the negative natural-log likelihood (tensor.log) averaged
    # over options['batch_size']; cost[1] is the negative log2 likelihood,
    # which callers turn into perplexity.
    cost = [sums[0][-1] / options['batch_size'], sums[1][-1]]

    inp_list = [xW, xI, mask]

    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    f_pred_prob = theano.function(inp_list, p, name='f_pred_prob', updates=updatesLSTM)

    return use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM
        def _stepP(x_, h_, c_, lP_, dV_, xAux):
            """One decoding step: advance the (possibly multi-layer) LSTM and
            pick the next word(s), by beam search when beam_size > 1 or
            greedily otherwise.

            Arguments: x_ is the current input embedding, h_ / c_ the previous
            hidden and cell states (all layers concatenated along axis 1),
            lP_ the accumulated log-probabilities, dV_ the "done" flags and
            xAux the auxiliary input. Uses tparams, options, prefix, h_sz,
            h_depth and beam_size from the enclosing scope.

            Returns ([xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], until)
            where ``until`` stops the enclosing scan once every entry of
            doneVec is set.
            """
            # Layer-0 pre-activation from the previous hidden state, the
            # input, the bias and (optionally) the auxiliary input.
            preact = tensor.dot(sliceT(h_, 0, h_sz),
                                tparams[_p(prefix, 'W_hid')])
            preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                       tparams[_p(prefix, 'b')])
            if options.get('en_aux_inp', 0):
                preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

            # Per-layer hidden state, cell state and layer output; every
            # entry is reassigned inside the loop below.
            hL = [[]] * h_depth
            cL = [[]] * h_depth
            outp = [[]] * h_depth
            for di in xrange(h_depth):
                # Slices 0..3 of the pre-activation are used as input gate,
                # forget gate, output gate and candidate cell value.
                i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
                cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
                cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
                hL[di] = o * tensor.tanh(cL[di])
                outp[di] = hL[di]
                # Residual connection (on by default): add the previous
                # layer's output to this layer's output.
                if options.get('en_residual_conn', 1):
                    if (di > 0):
                        outp[di] += outp[di - 1]
                        print "Connecting residual at %d" % (di)
                # Pre-activation for the next layer comes from that layer's
                # previous hidden state and this layer's output.
                if di < (h_depth - 1):
                    preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                            tensor.dot(outp[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])

            # Re-pack all layers' states along axis 1.
            c = tensor.concatenate(cL, axis=1)
            h = tensor.concatenate(hL, axis=1)

            if options.get('class_out_factoring', 0) == 1:
                # Factored output: take the argmax class, then score only the
                # words of that class; log-prob = log P(word|class) + log P(class).
                pC = tensor.dot(outp[-1], tparams['WdCls']) + tparams['bdCls']
                pCSft = tensor.nnet.softmax(pC)
                xCIdx = tensor.argmax(pCSft)
                pW = tensor.dot(
                    outp[-1],
                    tparams['Wd'][:, xCIdx, :]) + tparams['bd'][:, xCIdx, :]
                smooth_factor = tensor.as_tensor_variable(numpy_floatX(
                    options.get('softmax_smooth_factor', 1.0)),
                                                          name='sm_f')
                pWSft = tensor.nnet.softmax(pW * smooth_factor)
                lProb = tensor.log(pWSft +
                                   1e-20) + tensor.log(pCSft[0, xCIdx] + 1e-20)
            else:
                # Plain softmax over the vocabulary; the logits are scaled by
                # softmax_smooth_factor (default 1.0) first.
                p = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd']
                smooth_factor = tensor.as_tensor_variable(numpy_floatX(
                    options.get('softmax_smooth_factor', 1.0)),
                                                          name='sm_f')
                p = tensor.nnet.softmax(p * smooth_factor)
                lProb = tensor.log(p + 1e-20)

            if beam_size > 1:

                def _FindB_best(lPLcl, lPprev, dVLcl):
                    # For one beam: its beam_size best next words. A live
                    # beam (done flag == 0) gets previous score + word
                    # log-prob; a finished beam keeps its score in slot 0,
                    # -10000 elsewhere, and emits word index 0.
                    srtLcl = tensor.argsort(-lPLcl)
                    srtLcl = srtLcl[:beam_size]
                    deltaVec = tensor.fill(lPLcl[srtLcl],
                                           numpy_floatX(-10000.))
                    deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                    lProbBest = ifelse(
                        tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                        lPLcl[srtLcl] + lPprev, deltaVec)
                    xWIdxBest = ifelse(
                        tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), srtLcl,
                        tensor.zeros_like(srtLcl))
                    return lProbBest, xWIdxBest

                # Run the per-beam selection over all current beams.
                rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                                  sequences=[lProb, lP_, dV_],
                                                  name=_p(prefix, 'FindBest'),
                                                  n_steps=x_.shape[0])
                xWIdxBest = rvalLcl[1]
                lProbBest = rvalLcl[0]

                xWIdxBest = xWIdxBest.flatten()
                lProb = lProbBest.flatten()
                # Now sort and find the best among these best extensions for the current beams
                srtIdx = tensor.argsort(-lProb)
                srtIdx = srtIdx[:beam_size]
                # Each beam contributed beam_size candidates, so the floor
                # division recovers which beam a candidate came from.
                xCandIdx = srtIdx // beam_size  # Floor division
                h = h.take(xCandIdx.flatten(), axis=0)
                c = c.take(xCandIdx.flatten(), axis=0)
                xWlogProb = lProb[srtIdx]
                xWIdx = xWIdxBest[srtIdx]
            else:
                # Greedy decoding: single candidate, take the argmax word.
                xCandIdx = tensor.as_tensor_variable([0])
                lProb = lProb.flatten()
                xWIdx = tensor.argmax(lProb, keepdims=True)
                xWlogProb = lProb[xWIdx] + lP_
                if options.get('class_out_factoring', 0) == 1:
                    # Shift the within-class index by the class's offset taken
                    # from column 0 of ixtoclsinfo.
                    # NOTE(review): no such offset is applied in the
                    # beam_size > 1 branch -- confirm that is intended.
                    clsoffset = tensor.as_tensor_variable(
                        options['ixtoclsinfo'][:, 0])
                    xWIdx += clsoffset[xCIdx]
                h = h.take(xCandIdx.flatten(), axis=0)
                c = c.take(xCandIdx.flatten(), axis=0)

            # Next input: embedding of the chosen word, or the softmax output
            # projected through the embedding matrix.
            # NOTE(review): p is only assigned in the non-factored branch
            # above, so softmax_propogate together with class_out_factoring
            # looks like it would raise NameError -- confirm.
            if options.get('softmax_propogate', 0) == 0:
                xW = tparams['Wemb'][xWIdx.flatten()]
            else:
                xW = p.dot(tparams['Wemb'])
            # A candidate is done once it emits word index 0.
            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            return [xW, h, c, xWlogProb, doneVec, xWIdx,
                    xCandIdx], theano.scan_module.until(doneVec.all())
    def build_model(self, tparams, options):
        """Build the symbolic training graph for the single-direction LSTM.

        Symbolic inputs created here: ``xW`` (int64 word indices, indexed
        [timestep, sample]), ``xI`` (image features), ``mask`` (floatX, same
        layout as xW) and ``xAux`` (auxiliary features).

        Returns a tuple
        ``(use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM)``:
          * use_noise   -- shared scalar passed to every dropout layer
          * inp_list    -- [xW, xI, mask], plus xAux when
                           options['en_aux_inp'] is set
          * f_pred_prob -- compiled function returning ``p``
          * cost        -- [summed negative natural-log likelihood divided by
                           options['batch_size'], summed negative log2
                           likelihood]
          * p           -- pre-softmax decoder outputs with the first time
                           step (the one made for the image) dropped
          * updatesLSTM -- updates returned by self.lstm_layer
        """
        trng = RandomStreams(1234)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')

        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        # Image embedding is fed as the first time step, then the words.
        emb = tensor.concatenate([embImg, embW], axis=0)

        # The xAux placeholder must stay a graph input: the dropped-out
        # version gets its own name so inp_list below is handed the
        # placeholder and not the output of the dropout layer.
        aux_for_lstm = xAux

        #This is implementation of input dropout !!
        if options['use_dropout']:
            emb = self.dropout_layer(emb,
                                     use_noise,
                                     trng,
                                     options['drop_prob_encoder'],
                                     shp=emb.shape)
            if options.get('en_aux_inp', 0):
                aux_for_lstm = self.dropout_layer(xAux,
                                                  use_noise,
                                                  trng,
                                                  options['drop_prob_aux'],
                                                  shp=xAux.shape)

        # This implements core lstm
        rval, updatesLSTM = self.lstm_layer(tparams,
                                            emb[:n_timesteps, :, :],
                                            aux_for_lstm,
                                            use_noise,
                                            options,
                                            prefix=options['generator'],
                                            mask=mask)

        # Hidden output of the last layer (index hidden_depth - 1).
        last_layer_h = sliceT(rval[0],
                              options.get('hidden_depth', 1) - 1,
                              options['hidden_size'])
        if options['use_dropout']:
            p = self.dropout_layer(last_layer_h, use_noise, trng,
                                   options['drop_prob_decoder'],
                                   (n_samples, options['hidden_size']))
        else:
            p = last_layer_h

        p = tensor.dot(p, tparams['Wd']) + tparams['bd']

        # NOTE1: we are leaving out the first prediction, which was made for
        # the image and is meaningless.
        p = p[1:, :, :]

        def accumCost(pred, xW, m, c_sum, ppl_sum):
            # One scan step: add this time step's masked negative
            # log-likelihood (natural log and log2) to the running totals.
            pred = tensor.nnet.softmax(pred)
            target_prob = pred[tensor.arange(n_samples), xW]
            c_sum += -(tensor.log(target_prob + 1e-10) * m).sum()
            ppl_sum += -(tensor.log2(target_prob + 1e-10) * m).sum()
            return c_sum, ppl_sum

        sums, _ = theano.scan(fn=accumCost,
                              outputs_info=[
                                  tensor.as_tensor_variable(numpy_floatX(0.)),
                                  tensor.as_tensor_variable(numpy_floatX(0.))
                              ],
                              sequences=[p, xW[1:, :], mask[1:, :]])

        # cost[0] is the negative natural-log likelihood (tensor.log)
        # averaged over options['batch_size']; cost[1] is the negative log2
        # likelihood, which callers turn into perplexity.
        cost = [sums[0][-1] / options['batch_size'], sums[1][-1]]

        inp_list = [xW, xI, mask]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        f_pred_prob = theano.function(inp_list,
                                      p,
                                      name='f_pred_prob',
                                      updates=updatesLSTM)

        return use_noise, inp_list, f_pred_prob, cost, p, updatesLSTM
 def lstm_predict_layer(self, tparams, Xi, aux_input, options, beam_size, prefix='lstm'):
   """Build the symbolic beam-search decoder for a single-layer LSTM.

   The image embedding ``Xi`` is pushed through the LSTM first, then the
   embedding of word index 0 is fed as the start token, and finally the
   step function is scanned for up to 30 further steps (stopping early once
   every beam has emitted word index 0).

   Returns a 3-tuple: the beams' accumulated log-probabilities after the
   last step, the chosen word indices per step, and for each step the index
   of the beam every candidate was extended from.
   """
   max_steps = 30
   hidden_size = options['hidden_size']

   def _FindB_best(beam_lprobs, prev_score, done_flag):
       # For one beam: its beam_size best next words. A live beam
       # (done flag == 0) gets previous score + word log-prob; a finished
       # beam keeps its score in slot 0, -10000 elsewhere, and emits word 0.
       top_words = tensor.argsort(-beam_lprobs)[:beam_size]
       still_live = tensor.eq(done_flag, tensor.zeros_like(done_flag))
       frozen = tensor.fill(beam_lprobs[top_words], numpy_floatX(-10000.))
       frozen = tensor.set_subtensor(frozen[0], prev_score)
       best_scores = ifelse(still_live, beam_lprobs[top_words] + prev_score, frozen)
       best_words = ifelse(still_live, top_words, tensor.zeros_like(top_words))
       return best_scores, best_words

   # ----------------------  STEP FUNCTION  ---------------------- #
   def _stepP(x_, h_, c_, lP_, dV_, xAux):
       # LSTM pre-activation from previous hidden state, input and bias
       # (plus the auxiliary input when enabled).
       gates = tensor.dot(h_, tparams[_p(prefix, 'W_hid')])
       gates = gates + (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                        tparams[_p(prefix, 'b')])
       if options.get('en_aux_inp', 0):
           gates = gates + tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

       in_gate = tensor.nnet.sigmoid(sliceT(gates, 0, hidden_size))
       forget_gate = tensor.nnet.sigmoid(sliceT(gates, 1, hidden_size))
       out_gate = tensor.nnet.sigmoid(sliceT(gates, 2, hidden_size))
       cell_cand = tensor.tanh(sliceT(gates, 3, hidden_size))

       c = forget_gate * c_ + in_gate * cell_cand
       h = out_gate * tensor.tanh(c)

       # Log-probabilities over the vocabulary for every beam.
       word_probs = tensor.nnet.softmax(tensor.dot(h, tparams['Wd']) + tparams['bd'])
       lProb = tensor.log(word_probs + 1e-20)

       # Best extensions of each individual beam.
       per_beam, _ = theano.scan(_FindB_best,
                                 sequences=[lProb, lP_, dV_],
                                 name=_p(prefix, 'FindBest'),
                                 n_steps=x_.shape[0])
       cand_scores = per_beam[0].flatten()
       cand_words = per_beam[1].flatten()

       # Now sort and find the best among these best extensions for the current beams
       keep = tensor.argsort(-cand_scores)[:beam_size]
       xWlogProb = cand_scores[keep]
       xWIdx = cand_words[keep]
       # Each beam contributed beam_size candidates, so floor division
       # recovers the beam a surviving candidate came from.
       xCandIdx = keep // beam_size

       xW = tparams['Wemb'][xWIdx.flatten()]
       # A beam is done once it emits word index 0.
       doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))
       h = h.take(xCandIdx.flatten(), axis=0)
       c = c.take(xCandIdx.flatten(), axis=0)

       step_out = [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx]
       return step_out, theano.scan_module.until(doneVec.all())
   # ------------------- END of STEP FUNCTION  -------------------- #

   if options.get('en_aux_inp', 0) == 0:
      aux_input = []

   # Zero initial states, scores and done flags for every beam.
   h = tensor.alloc(numpy_floatX(0.), beam_size, hidden_size)
   c = tensor.alloc(numpy_floatX(0.), beam_size, hidden_size)
   lP = tensor.alloc(numpy_floatX(0.), beam_size)
   dV = tensor.alloc(np.int8(0.), beam_size)

   # Propogate the image feature vector; only the resulting states are kept.
   (_, h, c, _, _, _, _), _ = _stepP(Xi, h[:1, :], c[:1, :], lP, dV, aux_input)

   # Feed the start token (embedding of word index 0) to get the first words.
   xWStart = tparams['Wemb'][[0]]
   (xW, h, c, lP, dV, idx0, cand0), _ = _stepP(xWStart, h[:1, :], c[:1, :], lP, dV, aux_input)

   aux_input = tensor.extra_ops.repeat(aux_input, beam_size, axis=0)

   # Now lets do the loop.
   rval, _ = theano.scan(_stepP,
                         outputs_info=[xW, h, c, lP, dV, None, None],
                         non_sequences=[aux_input],
                         name=_p(prefix, 'predict_layers'),
                         n_steps=max_steps)

   final_scores = rval[3][-1]
   word_idx_steps = tensor.concatenate([idx0.reshape([1, beam_size]), rval[5]], axis=0)
   cand_idx_steps = tensor.concatenate([cand0.reshape([1, beam_size]), rval[6]], axis=0)
   return final_scores, word_idx_steps, cand_idx_steps
  def build_eval_other_sent(self, tparams, options, model_npy):
    """Build evaluation functions that score given sentences per sample.

    ``zipp(model_npy, self.model_th)`` is called first (presumably it copies
    the numpy parameters into the model's shared variables -- confirm against
    zipp). No dropout is applied in this graph.

    Stores two compiled functions on ``self``:
      * f_pred_prob_other -- returns the pre-softmax outputs ``p``
      * f_eval_other      -- returns ``cost``

    Returns ``(use_noise, inp_list, self.f_pred_prob_other, cost, p,
    updatesLSTM)`` where ``cost`` is, for each sample separately, the masked
    sum over time of the natural-log probability of the target words (not
    negated, not averaged), accumulated in a (1, n_samples) tensor.
    """
    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape([n_timesteps,
                                                n_samples,
                                                options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    # Image embedding is fed as the first time step, followed by the words.
    emb = tensor.concatenate([embImg, embW], axis=0)

    rval, updatesLSTM = self.lstm_layer(tparams, emb[:n_timesteps, :, :], xAux, use_noise, options,
                                        prefix=options['generator'], mask=mask)
    p = rval[0]

    p = tensor.dot(p, tparams['Wd']) + tparams['bd']

    # NOTE1: we are leaving out the first prediction, which was made for the image
    # and is meaningless.
    p = p[1:, :, :]

    def accumCost(pred, xW, m, c_sum, ppl_sum):
        # One scan step: accumulate, per sample, the masked log-probability
        # of the target word (natural log) and its negated log2 counterpart.
        pred = tensor.nnet.softmax(pred)
        c_sum += (tensor.log(pred[tensor.arange(n_samples), xW] + 1e-20) * m)
        ppl_sum += -(tensor.log2(pred[tensor.arange(n_samples), xW] + 1e-10) * m)
        return c_sum, ppl_sum

    sums, _ = theano.scan(fn=accumCost,
                          outputs_info=[tensor.alloc(numpy_floatX(0.), 1, n_samples),
                                        tensor.alloc(numpy_floatX(0.), 1, n_samples)],
                          sequences=[p, xW[1:, :], mask[1:, :]])

    # Only the natural-log accumulator is exposed; take its final value.
    cost = sums[0][-1]

    inp_list = [xW, xI, mask]

    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    # Compile each function exactly once; compiling the same prediction graph
    # a second time into a throwaway local only costs compile time.
    self.f_pred_prob_other = theano.function(inp_list, p, name='f_pred_prob', updates=updatesLSTM)
    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, p, updatesLSTM
    def lstm_multi_model_pred(self,
                              tparams,
                              Xi,
                              aux_input,
                              options,
                              beam_size,
                              nmodels,
                              prefix='lstm'):
        """Build the symbolic beam-search graph for an ensemble of LSTMs.

        Every model advances its own LSTM state; the per-model softmax
        outputs are combined as a sum weighted by each model's
        'comb_weight' parameter, and the beam is extended using the log
        of that combined distribution.

        Args:
            tparams: per-model parameter mappings, indexed by model number.
            Xi: per-model first-step inputs (the image feature embedding,
                according to the comment at the first step call).
            aux_input: per-model auxiliary inputs; only used by models
                whose options enable 'en_aux_inp'.
            options: per-model option mappings ('hidden_size' and
                optionally 'en_aux_inp' are read).
            beam_size: number of candidates kept after every step.
            nmodels: number of models in the ensemble (must be >= 1).
            prefix: parameter-name prefix passed to ``_p``.

        Returns:
            Tuple of four symbolic values: the beam log-probabilities after
            the last executed scan step, the chosen word indices for every
            step (first row is the step taken before the scan), the
            matching source-beam indices for every step, and the per-step
            beam log-probabilities produced by the scan.

        Raises:
            ValueError: if ``nmodels`` is smaller than 1.
        """
        if nmodels < 1:
            raise ValueError('lstm_multi_model_pred needs at least one model')

        nMaxsteps = 30

        def _pack(x_list, h_list, c_list, log_probs, done):
            # Flatten per-model inputs/states into the positional layout
            # that _stepP consumes: x's, h's, c's, log-probs, done flags.
            return list(x_list) + list(h_list) + list(c_list) + [log_probs,
                                                                 done]

        # ----------------------  STEP FUNCTION  ---------------------- #
        def _stepP(*in_list):
            # Unpack the flat argument list (see _pack for the layout).
            x_inp = list(in_list[:nmodels])
            h_inp = list(in_list[nmodels:2 * nmodels])
            c_inp = list(in_list[2 * nmodels:3 * nmodels])
            lP_ = in_list[3 * nmodels]
            dV_ = in_list[3 * nmodels + 1]

            cf = []
            h = []
            for i in xrange(nmodels):
                hid = options[i]['hidden_size']
                preact = tensor.dot(h_inp[i], tparams[i][_p(prefix, 'W_hid')])
                preact += (
                    tensor.dot(x_inp[i], tparams[i][_p(prefix, 'W_inp')]) +
                    tparams[i][_p(prefix, 'b')])
                if options[i].get('en_aux_inp', 0):
                    # aux_input2 is read from the enclosing scope at trace
                    # time; it is rebound before the main scan is built.
                    preact += tensor.dot(aux_input2[i],
                                         tparams[i][_p(prefix, 'W_aux')])

                # Gates are taken from slices 0..3 of the pre-activation
                # (presumably hidden_size-wide chunks -- confirm in sliceT).
                inp = tensor.nnet.sigmoid(sliceT(preact, 0, hid))
                f = tensor.nnet.sigmoid(sliceT(preact, 1, hid))
                o = tensor.nnet.sigmoid(sliceT(preact, 2, hid))
                c = tensor.tanh(sliceT(preact, 3, hid))

                cf.append(f * c_inp[i] + inp * c)
                h.append(o * tensor.tanh(cf[i]))

                # Weighted sum of the per-model output distributions.
                p = tensor.dot(h[i], tparams[i]['Wd']) + tparams[i]['bd']
                weighted = tparams[i]['comb_weight'] * tensor.nnet.softmax(p)
                if i == 0:
                    p_comb = weighted
                else:
                    p_comb = p_comb + weighted

            lProb = tensor.log(p_comb + 1e-20)

            def _FindB_best(lPLcl, lPprev, dVLcl):
                # Best beam_size extensions of a single beam entry.
                srtLcl = tensor.argsort(-lPLcl)
                srtLcl = srtLcl[:beam_size]
                # A finished entry (done flag != 0) keeps its previous score
                # in slot 0 with word index 0; its other slots get -10000 so
                # they are not selected.
                deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
                deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                                   lPLcl[srtLcl] + lPprev, deltaVec)
                xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                                   srtLcl, tensor.zeros_like(srtLcl))
                return lProbBest, xWIdxBest

            rvalLcl, _ = theano.scan(_FindB_best,
                                     sequences=[lProb, lP_, dV_],
                                     name=_p(prefix, 'FindBest'),
                                     n_steps=x_inp[0].shape[0])
            lProbBest = rvalLcl[0]
            xWIdxBest = rvalLcl[1]

            xWIdxBest = xWIdxBest.flatten()
            lProb = lProbBest.flatten()

            # Now sort and find the best among these best extensions for the
            # current beams
            srtIdx = tensor.argsort(-lProb)
            srtIdx = srtIdx[:beam_size]
            xWlogProb = lProb[srtIdx]

            xWIdx = xWIdxBest[srtIdx]
            xCandIdx = srtIdx // beam_size  # Floor division

            # Word index 0 marks a finished candidate.
            doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

            x_out = []
            h_out = []
            c_out = []
            for i in xrange(nmodels):
                x_out.append(tparams[i]['Wemb'][xWIdx.flatten()])
                # Carry over the state of the beam each winner came from.
                h_out.append(h[i].take(xCandIdx.flatten(), axis=0))
                c_out.append(cf[i].take(xCandIdx.flatten(), axis=0))

            out_list = x_out + h_out + c_out
            out_list.extend([xWlogProb, doneVec, xWIdx, xCandIdx])

            # Stop the enclosing scan early once every candidate is done.
            return out_list, theano.scan_module.until(doneVec.all())

        # ------------------- END of STEP FUNCTION  -------------------- #

        lP = tensor.alloc(numpy_floatX(0.), beam_size)
        dV = tensor.alloc(np.int8(0.), beam_size)

        # Zero initial hidden/cell state of shape (1, hidden_size) per model.
        # NOTE(review): dtype is hard-coded to float32 rather than
        # config.floatX -- confirm this is intended.
        h_inp = []
        c_inp = []
        for i in xrange(nmodels):
            hidden_size = options[i]['hidden_size']
            h_inp.append(
                theano.shared(np.zeros((1, hidden_size), dtype='float32')))
            c_inp.append(
                theano.shared(np.zeros((1, hidden_size), dtype='float32')))
        x_inp = [Xi[i] for i in xrange(nmodels)]

        # The first two steps are traced with the un-repeated auxiliary input.
        aux_input2 = aux_input

        # Propagate the image feature vector; only the states are kept.
        out_list, _ = _stepP(*_pack(x_inp, h_inp, c_inp, lP, dV))

        # Second step: feed the embedding of word index 0 together with the
        # first row of each state returned by the previous step.
        x_inp = [tparams[i]['Wemb'][[0]] for i in xrange(nmodels)]
        h_inp = [out_list[nmodels + i][:1, :] for i in xrange(nmodels)]
        c_inp = [out_list[2 * nmodels + i][:1, :] for i in xrange(nmodels)]

        out_list, _ = _stepP(*_pack(x_inp, h_inp, c_inp, lP, dV))

        # From here on _stepP must see the auxiliary inputs repeated
        # beam_size times along axis 0, so rebind the closure variable
        # before the scan traces _stepP again.
        aux_input2 = [
            tensor.extra_ops.repeat(aux_input[i], beam_size, axis=0)
            for i in xrange(nmodels)
        ]
        x_inp = [out_list[i] for i in xrange(nmodels)]
        h_inp = [out_list[nmodels + i] for i in xrange(nmodels)]
        c_inp = [out_list[2 * nmodels + i] for i in xrange(nmodels)]
        lP = out_list[3 * nmodels]
        dV = out_list[3 * nmodels + 1]
        idx0 = out_list[3 * nmodels + 2]
        cand0 = out_list[3 * nmodels + 3]

        # Word and candidate indices are outputs only (no recurrence), hence
        # the two trailing None entries in outputs_info.
        # Now lets do the loop.
        rval, _ = theano.scan(_stepP,
                              outputs_info=_pack(x_inp, h_inp, c_inp, lP, dV) +
                              [None, None],
                              name=_p(prefix, 'predict_layers'),
                              n_steps=nMaxsteps)

        return rval[3 * nmodels][-1], tensor.concatenate(
            [idx0.reshape([1, beam_size]), rval[3 * nmodels + 2]],
            axis=0), tensor.concatenate(
                [cand0.reshape([1, beam_size]), rval[3 * nmodels + 3]],
                axis=0), rval[3 * nmodels]
  def lstm_multi_model_pred(self,tparams, Xi, aux_input, options, beam_size, nmodels, prefix='lstm'):
    """Build the symbolic beam-search graph for an ensemble of LSTM models.

    Each model keeps its own LSTM state.  At every step the per-model
    softmax outputs are summed, each scaled by that model's 'comb_weight'
    parameter, and the log of the combined distribution drives the beam
    extension.

    Args:
        tparams: per-model parameter mappings, indexed by model number.
        Xi: per-model inputs for the very first step (the image feature
            embedding, according to the comment at the first step call).
        aux_input: per-model auxiliary inputs, used only when a model's
            options enable 'en_aux_inp'.
        options: per-model option mappings ('hidden_size' and optionally
            'en_aux_inp' are read).
        beam_size: number of candidates kept after each step.
        nmodels: number of ensemble members (must be >= 1).
        prefix: parameter-name prefix passed to ``_p``.

    Returns:
        Four symbolic values: beam log-probabilities after the last executed
        scan step; chosen word indices per step (row 0 comes from the step
        taken before the scan); the matching source-beam indices per step;
        and the per-step beam log-probabilities from the scan.

    Raises:
        ValueError: if ``nmodels`` is smaller than 1.
    """
    if nmodels < 1:
        raise ValueError('lstm_multi_model_pred needs at least one model')

    nMaxsteps = 30

    def _pack(x_list, h_list, c_list, log_probs, done):
        # Positional layout consumed by _stepP: x's, h's, c's, log-probs,
        # done flags.
        return list(x_list) + list(h_list) + list(c_list) + [log_probs, done]

    # ----------------------  STEP FUNCTION  ---------------------- #
    def _stepP(*in_list):
        # Split the flat argument list back into per-model pieces.
        x_inp = list(in_list[:nmodels])
        h_inp = list(in_list[nmodels:2*nmodels])
        c_inp = list(in_list[2*nmodels:3*nmodels])
        lP_ = in_list[3*nmodels]
        dV_ = in_list[3*nmodels+1]

        cf = []
        h = []
        for i in xrange(nmodels):
            hid = options[i]['hidden_size']
            preact = tensor.dot(h_inp[i], tparams[i][_p(prefix, 'W_hid')])
            preact += (tensor.dot(x_inp[i], tparams[i][_p(prefix, 'W_inp')]) +
                       tparams[i][_p(prefix, 'b')])
            if options[i].get('en_aux_inp',0):
                # aux_input2 comes from the enclosing scope and is looked up
                # when the step is traced; it is rebound before the main scan.
                preact += tensor.dot(aux_input2[i],tparams[i][_p(prefix,'W_aux')])

            # Gates come from slices 0..3 of the pre-activation (presumably
            # hidden_size-wide chunks -- confirm in sliceT).
            inp = tensor.nnet.sigmoid(sliceT(preact, 0, hid))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, hid))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, hid))
            c = tensor.tanh(sliceT(preact, 3, hid))

            cf.append(f * c_inp[i] + inp * c)
            h.append(o * tensor.tanh(cf[i]))

            # Accumulate the weighted mixture of output distributions.
            p = tensor.dot(h[i],tparams[i]['Wd']) + tparams[i]['bd']
            weighted = tparams[i]['comb_weight']*tensor.nnet.softmax(p)
            if i == 0:
                p_comb = weighted
            else:
                p_comb = p_comb + weighted

        lProb = tensor.log(p_comb + 1e-20)

        def _FindB_best(lPLcl, lPprev, dVLcl):
            # Top beam_size continuations of one beam entry.
            srtLcl = tensor.argsort(-lPLcl)
            srtLcl = srtLcl[:beam_size]
            # Finished entries (done flag != 0) keep their old score in slot
            # 0 with word index 0; the remaining slots are set to -10000 so
            # they are not picked.
            deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
            deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
            lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), lPLcl[srtLcl] + lPprev, deltaVec)
            xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), srtLcl, tensor.zeros_like(srtLcl))
            return lProbBest, xWIdxBest

        rvalLcl, _ = theano.scan(_FindB_best, sequences=[lProb, lP_, dV_],
                                 name=_p(prefix, 'FindBest'),
                                 n_steps=x_inp[0].shape[0])
        lProbBest = rvalLcl[0]
        xWIdxBest = rvalLcl[1]

        xWIdxBest = xWIdxBest.flatten()
        lProb = lProbBest.flatten()

        # Now sort and find the best among these best extensions for the current beams
        srtIdx = tensor.argsort(-lProb)
        srtIdx = srtIdx[:beam_size]
        xWlogProb = lProb[srtIdx]

        xWIdx = xWIdxBest[srtIdx]
        xCandIdx = srtIdx // beam_size # Floor division

        # A chosen word index of 0 flags the candidate as finished.
        doneVec = tensor.eq(xWIdx,tensor.zeros_like(xWIdx))

        x_out = []
        h_out = []
        c_out = []
        for i in xrange(nmodels):
            x_out.append(tparams[i]['Wemb'][xWIdx.flatten()])
            # Each winner inherits the state of the beam it extends.
            h_out.append(h[i].take(xCandIdx.flatten(),axis=0))
            c_out.append(cf[i].take(xCandIdx.flatten(),axis=0))

        out_list = x_out + h_out + c_out
        out_list.extend([xWlogProb, doneVec, xWIdx, xCandIdx])

        # Terminate the enclosing scan as soon as all candidates are done.
        return out_list, theano.scan_module.until(doneVec.all())
    # ------------------- END of STEP FUNCTION  -------------------- #

    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)

    # Zero initial hidden/cell state of shape (1, hidden_size) per model.
    # NOTE(review): dtype is hard-coded to float32 rather than config.floatX
    # -- confirm this is intended.
    h_inp = []
    c_inp = []
    for i in xrange(nmodels):
        hidden_size = options[i]['hidden_size']
        h_inp.append(theano.shared(np.zeros((1,hidden_size),dtype='float32')))
        c_inp.append(theano.shared(np.zeros((1,hidden_size),dtype='float32')))
    x_inp = [Xi[i] for i in xrange(nmodels)]

    # The two priming steps below are traced with the un-repeated aux input.
    aux_input2 = aux_input

    # Propagate the image feature vector; only the resulting states are used.
    out_list, _ = _stepP(*_pack(x_inp, h_inp, c_inp, lP, dV))

    # Next feed the embedding of word index 0, continuing from the first row
    # of every state produced by the previous step.
    x_inp = [tparams[i]['Wemb'][[0]] for i in xrange(nmodels)]
    h_inp = [out_list[nmodels + i][:1,:] for i in xrange(nmodels)]
    c_inp = [out_list[2*nmodels + i][:1,:] for i in xrange(nmodels)]

    out_list, _ = _stepP(*_pack(x_inp, h_inp, c_inp, lP, dV))

    # Inside the scan _stepP needs the auxiliary inputs repeated beam_size
    # times along axis 0, so the closure variable is rebound before the scan
    # traces _stepP again.
    aux_input2 = [tensor.extra_ops.repeat(aux_input[i],beam_size,axis=0)
                  for i in xrange(nmodels)]
    x_inp = [out_list[i] for i in xrange(nmodels)]
    h_inp = [out_list[nmodels + i] for i in xrange(nmodels)]
    c_inp = [out_list[2*nmodels + i] for i in xrange(nmodels)]
    lP = out_list[3*nmodels]
    dV = out_list[3*nmodels+1]
    idx0 = out_list[3*nmodels+2]
    cand0 = out_list[3*nmodels+3]

    # The word and candidate indices are pure outputs (not fed back), hence
    # the two trailing None entries in outputs_info.
    # Now lets do the loop.
    rval, _ = theano.scan(_stepP,
                          outputs_info=_pack(x_inp, h_inp, c_inp, lP, dV) + [None, None],
                          name=_p(prefix, 'predict_layers'),
                          n_steps=nMaxsteps)

    return rval[3*nmodels][-1], tensor.concatenate([idx0.reshape([1,beam_size]), rval[3*nmodels+2]],axis=0), tensor.concatenate([cand0.reshape([1,beam_size]), rval[3*nmodels+3]],axis=0), rval[3*nmodels]
Example #38
0
def main(params):
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    #--------------------------------- Init data provider and load data+features #---------------------------------#
    # fetch the data provider
    dp = getDataProvider(params)

    params['aux_inp_size'] = params['featenc_hidden_size'] * params[
        'n_encgt_sent'] if params['encode_gt_sentences'] else dp.aux_inp_size
    params['featenc_hidden_size'] = params['featenc_hidden_size'] if params[
        'encode_gt_sentences'] else params['aux_inp_size']

    params['image_feat_size'] = dp.img_feat_size
    print 'Image feature size is %d, and aux input size is %d' % (
        params['image_feat_size'], params['aux_inp_size'])

    #--------------------------------- Preprocess sentences and build Vocabulary #---------------------------------#
    misc = {
    }  # stores various misc items that need to be passed around the framework
    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times
    if params['checkpoint_file_name'] == 'None':
        if params['class_out_factoring'] == 0:
            misc['wordtoix'], misc[
                'ixtoword'], bias_init_vector = preProBuildWordVocab(
                    dp.iterSentences('train'), word_count_threshold)
        else:
            [misc['wordtoix'], misc['classes']
             ], [misc['ixtoword'], misc['clstotree'], misc['ixtoclsinfo']
                 ], [bias_init_vector, bias_init_inter_class
                     ] = preProBuildWordVocab(dp.iterSentences('train'),
                                              word_count_threshold, params)
            params['nClasses'] = bias_init_inter_class.shape[0]
            params['ixtoclsinfo'] = misc['ixtoclsinfo']
    else:
        misc = checkpoint_init['misc']
        params['nClasses'] = checkpoint_init['params']['nClasses']
        if 'ixtoclsinfo' in misc:
            params['ixtoclsinfo'] = misc['ixtoclsinfo']

    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though
    print len(misc['wordtoix']), len(misc['ixtoword'])

    #------------------------------ Initialize the solver/generator and build forward path #-----------------------#
    # Initialize the optimizer
    solver = Solver(params['solver'])
    # This initializes the model parameters and does matrix initializations
    lstmGenerator = decodeGenerator(params)
    model, misc['update'], misc['regularize'] = (lstmGenerator.model_th,
                                                 lstmGenerator.update_list,
                                                 lstmGenerator.regularize)

    # force overwrite here. The bias to the softmax is initialized to reflect word frequencies
    # This is a bit of a hack
    if params['checkpoint_file_name'] == 'None':
        model['bd'].set_value(bias_init_vector.astype(config.floatX))
        if params['class_out_factoring'] == 1:
            model['bdCls'].set_value(
                bias_init_inter_class.astype(config.floatX))

    #----------------- If we are using feature encoders -----------------------
    # This mode can now also be used for encoding GT sentences.
    if params['use_encoder_for'] & 1:
        if params['encode_gt_sentences']:
            xI = tensor.zeros((batch_size, params['image_encoding_size']))
            imgFeatEnc_inp = []
        else:
            imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'],
                                                  params['word_encoding_size'],
                                                  params,
                                                  mdl_prefix='img_enc_',
                                                  features=dp.features.T)
            mdlLen = len(model.keys())
            model.update(imgFeatEncoder.model_th)
            assert (len(model.keys()) == (mdlLen +
                                          len(imgFeatEncoder.model_th.keys())))
            misc['update'].extend(imgFeatEncoder.update_list)
            misc['regularize'].extend(imgFeatEncoder.regularize)
            (imgenc_use_dropout, imgFeatEnc_inp, xI,
             updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
    else:
        xI = None
        imgFeatEnc_inp = []

    if params['use_encoder_for'] & 2:
        aux_enc_inp = model['Wemb'] if params[
            'encode_gt_sentences'] else dp.aux_inputs.T
        hid_size = params['featenc_hidden_size']
        auxFeatEncoder = RecurrentFeatEncoder(hid_size,
                                              params['image_encoding_size'],
                                              params,
                                              mdl_prefix='aux_enc_',
                                              features=aux_enc_inp)
        mdlLen = len(model.keys())
        model.update(auxFeatEncoder.model_th)
        assert (len(model.keys()) == (mdlLen +
                                      len(auxFeatEncoder.model_th.keys())))
        misc['update'].extend(auxFeatEncoder.update_list)
        misc['regularize'].extend(auxFeatEncoder.regularize)
        (auxenc_use_dropout, auxFeatEnc_inp, xAux,
         updatesLSTMAuxFeat) = auxFeatEncoder.build_model(model, params)

        if params['encode_gt_sentences']:
            # Reshape it size(batch_size, n_gt, hidden_size)
            xAux = xAux.reshape(
                (-1, params['n_encgt_sent'], params['featenc_hidden_size']))
            # Convert it to size (batch_size, n_gt*hidden_size
            xAux = xAux.flatten(2)

    else:
        auxFeatEnc_inp = []
        xAux = None

    #--------------------------------- Initialize the Attention Network #-------------------------------#
    if params['use_attn'] != None:
        attnModel = AttentionNetwork(params['image_feat_size'],
                                     params['hidden_size'],
                                     params,
                                     mdl_prefix='attn_mlp_')
        mdlLen = len(model.keys())
        model.update(attnModel.model_th)
        assert (len(model.keys()) == (mdlLen + len(attnModel.model_th.keys())))
        misc['update'].extend(attnModel.update_list)
        misc['regularize'].extend(attnModel.regularize)
        attn_nw_func = attnModel.build_model
    else:
        attn_nw_func = None

    #--------------------------------- Build the language model graph #---------------------------------#
    # Define the computational graph for relating the input image features and word indices to the
    # log probability cost funtion.
    (use_dropout, inp_list_gen, f_pred_prob, cost, predTh,
     updatesLSTM) = lstmGenerator.build_model(model,
                                              params,
                                              xI,
                                              xAux,
                                              attn_nw=attn_nw_func)

    inp_list = imgFeatEnc_inp + auxFeatEnc_inp + inp_list_gen
    #--------------------------------- Cost function and gradient computations setup #---------------------------------#
    costGrad = cost[0]
    # Add class uncertainity to final cost
    #if params['class_out_factoring'] == 1:
    #  costGrad += cost[2]
    # Add the regularization cost. Since this is specific to trainig and doesn't get included when we
    # evaluate the cost on test or validation data, we leave it here outside the model definition
    if params['regc'] > 0.:
        reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p]**2).sum()
            reg_cost *= 0.5 * reg_c
        costGrad += (reg_cost / params['batch_size'])

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(costGrad, wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
        lr, model, grads, inp_list, cost, params)

    print 'model init done.'
    print 'model has keys: ' + ', '.join(model.keys())
    #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update'])
    #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize'])
    #print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

    #-------------------------------- Intialize the prediction path if needed by evaluator ----------------------------#
    evalKwargs = {
        'eval_metric': params['eval_metric'],
        'f_gen': lstmGenerator.predict,
        'beamsize': params['eval_beamsize']
    }
    if params['eval_metric'] != 'perplex':
        lstmGenerator.prepPredictor(None, params, params['eval_beamsize'])
        refToks, scr_info = eval_prep_refs('val', dp, params['eval_metric'])
        evalKwargs['refToks'] = refToks
        evalKwargs['scr_info'] = scr_info
        valMetOp = operator.gt
    else:
        valMetOp = operator.lt

    if params['met_to_track'] != []:
        trackMetargs = {
            'eval_metric': params['met_to_track'],
            'f_gen': lstmGenerator.predict,
            'beamsize': params['eval_beamsize']
        }
        lstmGenerator.prepPredictor(None, params, params['eval_beamsize'])
        refToks, scr_info = eval_prep_refs('val', dp, params['met_to_track'])
        trackMetargs['refToks'] = refToks
        trackMetargs['scr_info'] = scr_info

    #--------------------------------- Iterations and Logging intializations ------------------------------------------#
    # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images
    # Hence in case of coco/flickr this will 5* no of images
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs))
    top_val_sc = -1
    smooth_train_ppl2 = len(
        misc['ixtoword'])  # initially size of dictionary of confusion
    val_sc = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    #json_worker_status['params'] = params
    json_worker_status['history'] = []
    len_hist = defaultdict(int)

    #Initialize Tracking the perplexity of train and val, with iters.
    train_perplex = []
    val_perplex = []
    trackSc_array = []

    #-------------------------------------- Load previously saved model ------------------------------------------------#
    #- Initialize the model parameters from the checkpoint file if we are resuming training
    if params['checkpoint_file_name'] != 'None':
        zipp(model_init_from, model)
        if params['restore_grads'] == 1:
            zipp(rg_init, rg)
        #Copy trackers from previous checkpoint
        if 'trackers' in checkpoint_init:
            train_perplex = checkpoint_init['trackers']['train_perplex']
            val_perplex = checkpoint_init['trackers']['val_perplex']
            trackSc_array = checkpoint_init['trackers'].get('trackScores', [])
        print(
            """\nContinuing training from previous model\n. Already run for %0.2f epochs with
            validation perplx at %0.3f\n""" %
            (checkpoint_init['epoch'], checkpoint_init['perplexity']))

    #--------------------------------------  MAIN LOOP ----------------------------------------------------------------#
    for it in xrange(max_iters):
        t0 = time.time()
        # Enable using dropout in training
        use_dropout.set_value(float(params['use_dropout']))
        if params['use_encoder_for'] & 1:
            imgenc_use_dropout.set_value(float(params['use_dropout']))
        if params['use_encoder_for'] & 2:
            auxenc_use_dropout.set_value(float(params['use_dropout']))

        epoch = it * 1.0 / num_iters_one_epoch
        #-------------------------------------- Prepare batch-------------------------------------------#
        # fetch a batch of data
        if params['sample_by_len'] == 0:
            batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
        else:
            batch, l = dp.getRandBatchByLen(batch_size)
            len_hist[l] += 1

        enc_inp_list = prepare_seq_features(
            batch,
            use_enc_for=params['use_encoder_for'],
            maxlen=params['maxlen'],
            use_shared_mem=params['use_shared_mem_enc'],
            enc_gt_sent=params['encode_gt_sentences'],
            n_enc_sent=params['n_encgt_sent'],
            wordtoix=misc['wordtoix'])

        if params['use_pos_tag'] != 'None':
            gen_inp_list, lenS = prepare_data(
                batch,
                misc['wordtoix'],
                params['maxlen'],
                sentTagMap,
                misc['ixtoword'],
                rev_sents=params['reverse_sentence'],
                use_enc_for=params['use_encoder_for'],
                use_unk_token=params['use_unk_token'])
        else:
            gen_inp_list, lenS = prepare_data(
                batch,
                misc['wordtoix'],
                params['maxlen'],
                rev_sents=params['reverse_sentence'],
                use_enc_for=params['use_encoder_for'],
                use_unk_token=params['use_unk_token'])

        if params['sched_sampling_mode'] != None:
            gen_inp_list.append(epoch)

        real_inp_list = enc_inp_list + gen_inp_list

        #import ipdb; ipdb.set_trace()
        #---------------------------------- Compute cost and apply gradients ---------------------------#
        # evaluate cost, gradient and perform parameter update
        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # print training statistics
        train_ppl2 = (2**(cost[1] / lenS))  #step_struct['stats']['ppl2']
        # smooth exponentially decaying moving average
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out

        total_cost = cost[0]
        if it == 0: smooth_cost = total_cost  # start out where we start out
        smooth_cost = 0.99 * smooth_cost + 0.01 * total_cost

        #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
        #      % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \
        #         train_ppl2, smooth_train_ppl2)

        #---------------------------------- Write a report into a json file ---------------------------#
        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            print '%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' \
                    % (it, max_iters, dt, epoch, smooth_cost, smooth_train_ppl2)
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['smooth_train_ppl2'] = smooth_train_ppl2
            jstatus['val_sc'] = val_sc  # just write the last available one
            jstatus['val_metric'] = params[
                'eval_metric']  # just write the last available one
            jstatus['train_ppl2'] = train_ppl2
            #if params['class_out_factoring'] == 1:
            #  jstatus['class_cost'] = float(cost[2])
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            #import pdb; pdb.set_trace()
            try:
                json.dump(json_worker_status, open(status_file, 'w'))
            except Exception, e:  # todo be more clever here
                print 'tried to write worker status into %s but got error:' % (
                    status_file, )
                print e

        #--------------------------------- VALIDATION ---------------------------#
        #- perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0
                and it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            use_dropout.set_value(0.)
            if params['use_encoder_for'] & 1:
                imgenc_use_dropout.set_value(0.)
            if params['use_encoder_for'] & 2:
                auxenc_use_dropout.set_value(0.)

            # perform the evaluation on VAL set
            val_sc = eval_split_theano('val', dp, model, params, misc, f_eval,
                                       **evalKwargs)
            val_sc = val_sc[0]
            val_perplex.append((it, val_sc))
            train_perplex.append((it, smooth_train_ppl2))

            if params['met_to_track'] != []:
                track_sc = eval_split_theano('val', dp, model, params, misc,
                                             f_eval, **trackMetargs)
                trackSc_array.append((it, {
                    evm: track_sc[i]
                    for i, evm in enumerate(params['met_to_track'])
                }))

            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params[
                    'lr_decay']
                params['lr_decay_st_epoch'] += 1

            print 'validation %s = %f, lr = %f' % (
                params['eval_metric'], val_sc, params['learning_rate'])
            #if params['sample_by_len'] == 1:
            #  print len_hist

            #----------------------------- SAVE THE MODEL -------------------#
            write_checkpoint_ppl_threshold = params[
                'write_checkpoint_ppl_threshold']
            if valMetOp(val_sc, top_val_sc) or top_val_sc < 0:
                if valMetOp(val_sc, write_checkpoint_ppl_threshold
                            ) or write_checkpoint_ppl_threshold < 0:
                    # If we beat the previous record (or this is the first evaluation)
                    # AND we also beat the user-defined threshold (or none is set).
                    top_val_sc = val_sc
                    filename = 'model_checkpoint_%s_%s_%s_%s%.2f.p' % (
                        params['dataset'], host, params['fappend'],
                        params['eval_metric'][:3], val_sc)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_sc
                    checkpoint['misc'] = misc
                    checkpoint['trackers'] = {
                        'train_perplex': train_perplex,
                        'val_perplex': val_perplex,
                        'trackScores': trackSc_array
                    }
                    try:
                        pickle.dump(checkpoint, open(filepath, "wb"))
                        print 'saved checkpoint in %s' % (filepath, )
                    except Exception, e:  # todo be more clever here
                        print 'tried to write checkpoint into %s but got error: ' % (
                            filepath, )
                        print e
  def lstm_predict_layer(self, tparams, Xi, aux_input, options, beam_size, prefix='lstm'):
    """Build the symbolic Theano graph that decodes a word sequence from the LSTM.

    The decoder is first stepped once on ``Xi`` and once on the embedding in
    row 0 of ``tparams['Wemb']``; the remaining steps are unrolled with
    ``theano.scan`` for at most ``maxlen`` iterations, stopping early once
    every tracked hypothesis has emitted word index 0. With ``beam_size > 1``
    a beam search is performed, otherwise a greedy argmax is taken.

    Parameters
    ----------
    tparams : mapping of parameter name -> Theano variable. Keys read here:
        ``_p(prefix, 'W_hid')``, ``_p(prefix, 'W_inp')``, ``_p(prefix, 'b')``,
        optionally ``_p(prefix, 'W_aux')``, ``_p(prefix, 'W_hid_<k>')`` and
        ``_p(prefix, 'W_inp_<k>')`` for deeper layers, plus ``'Wd'``, ``'bd'``,
        ``'Wemb'`` and (class factoring only) ``'WdCls'``, ``'bdCls'``.
    Xi : input fed to the very first step (per the comment below, the image
        feature vector).
    aux_input : auxiliary input multiplied by ``W_aux`` at every step; only
        used when ``options['en_aux_inp']`` is truthy, otherwise replaced
        by an empty list.
    options : dict of model options. Keys read here: ``'maxlen'``,
        ``'hidden_depth'``, ``'hidden_size'``, ``'en_aux_inp'``,
        ``'class_out_factoring'``, ``'softmax_smooth_factor'``,
        ``'softmax_propogate'`` and ``'ixtoclsinfo'``.
    beam_size : Python int, number of hypotheses kept per step.
    prefix : name prefix used to look up the LSTM weights in ``tparams``.

    Returns
    -------
    A 5-tuple of symbolic values:
      * the accumulated log-probabilities (``xWlogProb``) from the last
        executed scan step,
      * the chosen word indices: those of the start-token step reshaped to
        ``(1, beam_size)``, concatenated with the per-step scan outputs,
      * the originating-candidate indices, assembled the same way,
      * the next-input embeddings: the one produced by the start-token step
        (with one leading axis added) concatenated with the scan outputs,
      * the ``updates`` returned by ``theano.scan``.
    """
    
    nMaxsteps = options.get('maxlen',30)
    
    # 'maxlen' may be present in options but explicitly set to None.
    if nMaxsteps is None: 
        nMaxsteps = 30
    # NOTE(review): n_samples is not referenced again in this method.
    n_samples = 1 
    h_depth = options.get('hidden_depth',1)
    h_sz = options['hidden_size']
  
    # ----------------------  STEP FUNCTION  ---------------------- #
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        """Advance the decoder by one step.

        Arguments are the previous step's input embedding ``x_``, hidden
        state ``h_``, cell state ``c_``, accumulated log-probabilities
        ``lP_``, done flags ``dV_`` and the auxiliary input ``xAux``.

        Returns ``[xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx]`` together
        with a scan ``until`` condition that fires once all entries of
        ``doneVec`` are set.
        """
        # Pre-activation of the first layer from its previous hidden state,
        # the current input and the bias.
        # (sliceT presumably selects the n-th block of width h_sz along the
        # last axis -- TODO confirm against its definition.)
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp',0):
            preact += tensor.dot(xAux,tparams[_p(prefix,'W_aux')])
  
        # Placeholders; every entry is overwritten in the loop below, so the
        # shared inner list created by the multiplication is harmless.
        hL = [[]]*h_depth 
        cL = [[]]*h_depth 
        for di in xrange(h_depth):
            # Input, forget and output gates come from blocks 0, 1, 2 of preact.
            i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
            # Block 3 is the candidate cell value; blend it with this layer's
            # previous cell state.
            cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
            cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
            hL[di] = o * tensor.tanh(cL[di])
            if di < (h_depth - 1):
                # Pre-activation of the next layer: its own previous hidden
                # state plus the output of the layer just computed.
                preact = tensor.dot(sliceT(h_, di+1, h_sz), tparams[_p(prefix, ('W_hid_' + str(di+1)))]) + \
                        tensor.dot(hL[di], tparams[_p(prefix, ('W_inp_' + str(di+1)))])
        
        # Per-layer states are stored side by side along axis 1.
        c = tensor.concatenate(cL,axis=1)
        h = tensor.concatenate(hL,axis=1)
  
        if options.get('class_out_factoring',0) == 1:
            # Class-factored output: pick the most likely class first, then
            # score words using that class's slice of Wd / bd.
            pC    = tensor.dot(hL[-1],tparams['WdCls']) + tparams['bdCls']
            pCSft = tensor.nnet.softmax(pC)
            xCIdx =  tensor.argmax(pCSft)
            # NOTE(review): h[-1] indexes the concatenated state tensor, while
            # the non-factored branch uses hL[-1] -- confirm this is intended.
            pW = tensor.dot(h[-1],tparams['Wd'][:,xCIdx,:]) + tparams['bd'][:,xCIdx,:]
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f')
            pWSft = tensor.nnet.softmax(pW*smooth_factor)
            # Word log-prob plus class log-prob; 1e-20 avoids log(0).
            lProb = tensor.log(pWSft + 1e-20) + tensor.log(pCSft[0,xCIdx] + 1e-20)
        else:
            # Plain softmax over the output of the top layer, with the logits
            # scaled by the smoothing factor.
            p = tensor.dot(hL[-1],tparams['Wd']) + tparams['bd']
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f')
            p = tensor.nnet.softmax(p*smooth_factor)
            lProb = tensor.log(p + 1e-20)

        if beam_size > 1:
            def _FindB_best(lPLcl, lPprev, dVLcl):
                # For one beam: the beam_size best next words by log-prob.
                srtLcl = tensor.argsort(-lPLcl)
                srtLcl = srtLcl[:beam_size]
                # Replacement scores for a finished beam: slot 0 keeps the
                # beam's accumulated score, the rest get -10000.
                deltaVec = tensor.fill( lPLcl[srtLcl], numpy_floatX(-10000.))
                deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                # dVLcl == 0 means the beam is still active: extend it with
                # the best words. Otherwise use deltaVec and word index 0.
                lProbBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), lPLcl[srtLcl] + lPprev, deltaVec)
                xWIdxBest = ifelse(tensor.eq( dVLcl, tensor.zeros_like(dVLcl)), srtLcl, tensor.zeros_like(srtLcl)) 
                return lProbBest, xWIdxBest 
  
            # One inner scan step per row of the current input x_.
            rvalLcl, updatesLcl = theano.scan(_FindB_best, sequences = [lProb, lP_, dV_], name=_p(prefix, 'FindBest'), n_steps=x_.shape[0])
            xWIdxBest = rvalLcl[1]
            lProbBest = rvalLcl[0]
  
            xWIdxBest = xWIdxBest.flatten()
            lProb = lProbBest.flatten()
            # Now sort and find the best among these best extensions for the current beams
            srtIdx = tensor.argsort(-lProb)
            srtIdx = srtIdx[:beam_size]
            # Each beam contributed beam_size consecutive entries to the
            # flattened vector, so this recovers the originating beam.
            xCandIdx = srtIdx // beam_size # Floor division 
            # Carry over the states of the beams that the survivors extend.
            h = h.take(xCandIdx.flatten(),axis=0)
            c = c.take(xCandIdx.flatten(),axis=0)
            xWlogProb = lProb[srtIdx]
            xWIdx = xWIdxBest[srtIdx]
        else:
            # Greedy decoding: a single candidate (index 0) and the argmax word.
            xCandIdx = tensor.as_tensor_variable([0]) 
            lProb = lProb.flatten()
            xWIdx =  tensor.argmax(lProb,keepdims=True)
            xWlogProb = lProb[xWIdx] + lP_
            if options.get('class_out_factoring',0) == 1:
                # Shift the chosen index by the selected class's entry in
                # column 0 of ixtoclsinfo.
                clsoffset = tensor.as_tensor_variable(options['ixtoclsinfo'][:,0])
                xWIdx += clsoffset[xCIdx]
            h = h.take(xCandIdx.flatten(),axis=0)
            c = c.take(xCandIdx.flatten(),axis=0)
        
        # Next input: the embedding of the chosen word(s), or the softmax
        # output multiplied by the embedding matrix.
        if options.get('softmax_propogate',0) == 0:
            xW = tparams['Wemb'][xWIdx.flatten()]
        else:
            # NOTE(review): p is only assigned in the non-factored branch
            # above; with class_out_factoring == 1 it would be unbound here --
            # confirm that option combination is never used.
            xW = p.dot(tparams['Wemb'])
        # A hypothesis counts as done once it emits word index 0.
        doneVec = tensor.eq(xWIdx,tensor.zeros_like(xWIdx))
  
        return [xW, h, c, xWlogProb, doneVec, xWIdx, xCandIdx], theano.scan_module.until(doneVec.all())
    # ------------------- END of STEP FUNCTION  -------------------- #
    
    # Without the auxiliary input enabled, pass an empty non-sequence list.
    if options.get('en_aux_inp',0) == 0:
       aux_input = [] 
  
    # Initial hidden/cell states (all layers side by side), accumulated
    # log-probabilities and done flags: all zeros, one entry per beam.
    h = tensor.alloc(numpy_floatX(0.),beam_size,h_sz*h_depth)
    c = tensor.alloc(numpy_floatX(0.),beam_size,h_sz*h_depth)
  
    lP = tensor.alloc(numpy_floatX(0.), beam_size);
    dV = tensor.alloc(np.int8(0.), beam_size);
  
    # Propagate the image feature vector
    # Only the first state row is passed in; the scores, done flags and
    # indices produced by this step are discarded.
    [xW, h, c, _, _, _, _], _ = _stepP(Xi, h[:1,:], c[:1,:], lP, dV,aux_input) 
    
    # Second step: feed the embedding stored in row 0 of Wemb, again using
    # only the first state row.
    xWStart = tparams['Wemb'][[0]]
    [xW, h, c, lP, dV, idx0, cand0], _ = _stepP(xWStart, h[:1,:], c[:1,:], lP, dV, aux_input) 
    
    # Repeat the auxiliary input beam_size times along axis 0 for the scan.
    if options.get('en_aux_inp',0) == 1:
        aux_input = tensor.extra_ops.repeat(aux_input,beam_size,axis=0)
  
    # Now lets do the loop.
    # xW, h, c, lP and dV are recurrent; the word and candidate indices are
    # output-only (None in outputs_info).
    rval, updates = theano.scan(_stepP, outputs_info=[xW, h, c, lP, dV, None, None], non_sequences = [aux_input], name=_p(prefix, 'predict_layers'), n_steps=nMaxsteps)
  
    # Prepend the results of the start-token step to the per-step scan outputs.
    return rval[3][-1], tensor.concatenate([idx0.reshape([1,beam_size]), rval[5]],axis=0), tensor.concatenate([cand0.reshape([1,beam_size]), rval[6]],axis=0), tensor.concatenate([tensor.shape_padleft(xW,n_ones=1),rval[0]],axis=0), updates