    def build_model(self, tparams, options):
        trng = RandomStreams(1234)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')

        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])

        embW_rev = tparams['Wemb'][xW[::-1, :].flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        if options.get('swap_aux', 0):
            xAuxEmb = tensor.dot(xAux,
                                 tparams['WIemb_aux']) + tparams['b_Img_aux']
        else:
            xAuxEmb = xAux

        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        emb_rev = tensor.set_subtensor(
            embW_rev[mask[::-1, :].argmax(axis=0) - 1,
                     tensor.arange(n_samples), :], embImg[0, :, :])
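        # The set_subtensor above appears to write the image embedding into the
        # slot immediately preceding each sample's (reversed) word sequence:
        # mask[::-1, :].argmax(axis=0) locates the first valid timestep of the
        # reversed mask, so index - 1 is the padding slot just before it. This
        # mirrors how embImg is prepended to the forward stream in `emb`.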

        # This implements input dropout.
        if options['use_dropout']:
            emb = dropout_layer(emb,
                                use_noise,
                                trng,
                                options['drop_prob_encoder'],
                                shp=emb.shape)
            if options.get('en_aux_inp', 0):
                xAuxEmb = dropout_layer(xAuxEmb,
                                        use_noise,
                                        trng,
                                        options['drop_prob_aux'],
                                        shp=xAuxEmb.shape)
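        # dropout_layer is defined elsewhere in this repo; it is assumed to be a
        # standard inverted-dropout helper: when use_noise is 1 it samples a
        # Bernoulli(1 - drop_prob) mask of shape `shp` via trng and rescales by
        # 1 / (1 - drop_prob), and when use_noise is 0 it returns the input
        # unchanged for evaluation.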

        #############################################################################################################################
        # This implements core lstm
        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix='lstm',
                                             sched_prob_mask=[])
        #############################################################################################################################
        # This implements core reverse lstm
        rev_rval, rev_updatesLSTM = basic_lstm_layer(
            tparams,
            emb_rev[:n_timesteps, :, :],
            xAuxEmb,
            use_noise,
            options,
            prefix='rev_lstm',
            sched_prob_mask=[])
        #############################################################################################################################

        # NOTE1: we are leaving out the first prediction, which was made for the image and is meaningless.
        if options['use_dropout']:
            # XXX : Size given to dropout is missing one dimension. This keeps the dropped units consistent across time!?.
            # ###   Is this a good bug ?
            p = dropout_layer(
                sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size']), use_noise, trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
            rev_p = dropout_layer(
                sliceT(rev_rval[0][:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size']), use_noise, trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
        else:
            p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size'])
            rev_p = sliceT(rev_rval[0][:, :, :],
                           options.get('hidden_depth',
                                       1), options['hidden_size'])
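        # sliceT is assumed to cut one hidden_size-wide block out of the stacked
        # per-layer hidden states returned by the LSTM scan (rval[0]); the slice
        # selected here is the layer whose hidden state feeds the word decoder
        # below. p covers the forward direction, rev_p the reverse direction.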

        n_out_samps = (n_timesteps - 2) * n_samples
        if options.get('class_out_factoring', 0) == 0:
            pW = (tensor.dot(p[:-1, :, :] + rev_p[::-1, :, :][2:, :, :],
                             tparams['Wd']) + tparams['bd']).reshape(
                                 [n_out_samps, options['output_size']])
            pWSft = tensor.nnet.softmax(pW)
            totProb = pWSft[tensor.arange(n_out_samps), xW[1:-1, :].flatten()]
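            # pWSft has shape (n_out_samps, output_size); the fancy index above
            # gathers, for each flattened (timestep, sample) position i with
            # ground-truth next-word index j = xW[1:-1, :].flatten()[i], the
            # probability totProb[i] = pWSft[i, j].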
            out_list = [pWSft, totProb, p]
        else:
            ixtoclsinfo_t = tensor.as_tensor_variable(options['ixtoclsinfo'])
            xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
            pW = ((tparams['Wd'][:, xC, :].T *
                   (p.reshape([1, n_out_samps, options['hidden_size']]))).sum(
                       axis=-1).T + tparams['bd'][:, xC, :])
            pWSft = tensor.nnet.softmax(pW[0, :, :])
            pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
                [n_out_samps, options['nClasses']])
            pCSft = tensor.nnet.softmax(pC)

            totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                      pCSft[tensor.arange(n_out_samps), xC]
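            # Class-factored softmax: the word probability is factored as
            # P(word) = P(word | class) * P(class). pWSft is the within-class
            # softmax (indexed by the word's in-class index, column 3 of
            # ixtoclsinfo) and pCSft is the softmax over the nClasses classes
            # (column 0 of ixtoclsinfo gives each word's class id xC).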
            out_list = [pWSft, pCSft, totProb, p]

        # XXX : THIS IS VERY FISHY, CHECK THE MASK INDEXING AGAIN
        probs_valid = tensor.log(totProb + 1e-10) * mask[1:-1, :].flatten()
        tot_cost = -(probs_valid.sum())
        tot_pplx = -(tensor.log2(totProb + 1e-10) *
                     mask[1:-1, :].flatten()).sum()
        cost = [tot_cost / options['batch_size'], tot_pplx]
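        # cost[0] is the masked negative log-likelihood normalized by the
        # configured batch size; cost[1] is the total log2 surprisal that the
        # caller can turn into a perplexity.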

        inp_list = [xW, mask, xI]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        if options.get('sched_sampling_mode', None) is not None:
            inp_list.append(curr_epoch)

        per_sent_prob = probs_valid.reshape([n_timesteps - 2,
                                             n_samples]).sum(axis=0)
        f_per_sentLogP = theano.function(inp_list,
                                         per_sent_prob,
                                         name='f_pred_logprob',
                                         updates=updatesLSTM)
        f_pred_prob = ['', f_per_sentLogP, '']

        return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
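    # A minimal sketch (not part of this repo, assuming a `tparams` OrderedDict
    # of shared parameters and plain SGD) of how the handles returned by
    # build_model might be wired into a Theano training step:
    #
    #   use_noise, inps, f_pred, cost, outs, upd = self.build_model(tparams, options)
    #   lr = tensor.scalar('lr', dtype=config.floatX)
    #   grads = tensor.grad(cost[0], wrt=list(tparams.values()))
    #   sgd_upd = [(prm, prm - lr * g) for prm, g in zip(tparams.values(), grads)]
    #   f_train = theano.function(inps + [lr], cost, updates=sgd_upd + list(upd.items()))
    #   use_noise.set_value(1.)  # enable dropout while training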
    def build_eval_other_sent(self, tparams, options, model_npy):

        zipp(model_npy, self.model_th)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')
        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]
        n_out_samps = (n_timesteps - 1) * n_samples

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        if options.get('swap_aux', 0):
            xAuxEmb = tensor.dot(xAux,
                                 tparams['WIemb_aux']) + tparams['b_Img_aux']
        else:
            xAuxEmb = xAux

        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix=options['generator'])
        p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size'])

        pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
            [n_out_samps, options['output_size']])

        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]

        # NOTE1: we are leaving out the first prediction, which was made for the image
        # and is meaningless. The cost below is the masked negative log-likelihood
        # (natural log) of the remaining words, normalized by the configured batch size.
        tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).sum()
        cost = tot_cost / options['batch_size']

        inp_list = [xW, mask, xI]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        self.f_pred_prob_other = theano.function(inp_list,
                                                 p,
                                                 name='f_pred_prob',
                                                 updates=updatesLSTM)
        #f_pred = theano.function([xW, mask], pred.argmax(axis=1), name='f_pred')

        #cost = -tensor.log(pred[tensor.arange(n_timesteps),tensor.arange(n_samples), xW] + 1e-8).mean()

        self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

        return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
    def build_model(self, tparams, options):
        trng = RandomStreams(1234)

        # Used for dropout.
        self.use_noise = theano.shared(numpy_floatX(0.))

        if not self.use_shared_features:
            xI = tensor.tensor3('xI', dtype=config.floatX)
            xIemb = xI
            n_timesteps = xI.shape[0]
            n_samples = xI.shape[1]
        else:
            xI = tensor.matrix('xI', dtype='int64')
            n_timesteps = xI.shape[0]
            n_samples = xI.shape[1]
            #feats = tensor.concatenate([self.features,tensor.alloc(numpy_floatX(0.),self.image_feat_size,1)],axis=1).T
            xIemb = self.features[xI.flatten(), :].reshape(
                [n_timesteps, n_samples, self.image_feat_size])

        samp_lens = tensor.vector('sL', dtype='int64')

        # This implements input dropout.
        if options['use_dropout']:
            emb = dropout_layer(xIemb,
                                self.use_noise,
                                trng,
                                options['drop_prob_encoder'],
                                shp=xIemb.shape)
        else:
            # Without dropout, pass the features through unchanged so that
            # `emb` is defined for the LSTM below.
            emb = xIemb

        #############################################################################################################################
        # This implements core lstm
        rval, updatesLSTM = self.lstm_enc_layer(tparams,
                                                emb,
                                                prefix=self.mp + 'lstm')
        #############################################################################################################################
        # This implements core reverse lstm
        if self.encoder == 'bilstm':
            rev_rval, rev_updatesLSTM = basic_lstm_layer(tparams,
                                                         emb[::-1, :, :],
                                                         prefix=self.mp +
                                                         'rev_lstm')
        #############################################################################################################################

        # Take the hidden state at each sample's last valid timestep (indexed by samp_lens).
        p = sliceT(rval[0][samp_lens, tensor.arange(n_samples), :],
                   self.hidden_depth, self.hidden_size)

        if self.encoder == 'bilstm':
            rev_p = sliceT(rev_rval[0][-1, :, :], self.hidden_depth,
                           self.hidden_size)

        feat_enc = p + rev_p if self.encoder == 'bilstm' else p
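        # feat_enc is the sequence encoding: the forward state at the last valid
        # step, plus (for a bilstm encoder) the final state of the reverse pass;
        # encoder_add_mean below optionally adds the time-averaged hidden states.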

        if options.get('encoder_add_mean', 0):
            feat_enc = feat_enc + (sliceT(rval[0], self.hidden_depth,
                                          self.hidden_size).sum(axis=0) /
                                   samp_lens[:, None])

        inp_list = [xI, samp_lens]

        return self.use_noise, inp_list, feat_enc, updatesLSTM
    def build_eval_other_sent(self, tparams, options, model_npy):

        zipp(model_npy, self.model_th)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')
        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]
        n_out_samps = (n_timesteps - 1) * n_samples

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        if options.get('swap_aux', 0):
            xAuxEmb = tensor.dot(xAux,
                                 tparams['WIemb_aux']) + tparams['b_Img_aux']
        else:
            xAuxEmb = xAux

        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix=options['generator'])
        p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size'])

        if options.get('class_out_factoring', 0) == 0:
            pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
                [n_out_samps, options['output_size']])
            pWSft = tensor.nnet.softmax(pW)
            totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]
            out_list = [pWSft, totProb, p]
        else:
            ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
            xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
            pW = ((tparams['Wd'][:, xC, :].T *
                   ((p.reshape([1, n_out_samps, options['hidden_size']]) -
                     tparams['WdCls'][:, xC].T))).sum(axis=-1).T +
                  tparams['bd'][:, xC, :])
            pWSft = tensor.nnet.softmax(pW[0, :, :])
            pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
                [n_out_samps, options['nClasses']])
            pCSft = tensor.nnet.softmax(pC)

            totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                      pCSft[tensor.arange(n_out_samps), xC]

        tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()
                     ).reshape([n_timesteps - 1, n_samples])
        cost = tot_cost.sum(axis=0)

        inp_list = [xW, mask, xI]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        self.f_pred_prob_other = theano.function([xW, xI, xAux],
                                                 pWSft,
                                                 name='f_pred_prob',
                                                 updates=updatesLSTM)
        #f_pred = theano.function([xW, mask], pred.argmax(axis=1), name='f_pred')

        #cost = -tensor.log(pred[tensor.arange(n_timesteps),tensor.arange(n_samples), xW] + 1e-8).mean()

        self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

        return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
    def build_model(self, tparams, options, xI=None, xAux=None, attn_nw=None):
        self.trng = RandomStreams(int(time.time()))

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')

        mask = tensor.matrix('mask', dtype=config.floatX)
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        if xI is None:
            xI = tensor.matrix('xI', dtype=config.floatX)
            embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img'])
            xI_is_inp = True
        else:
            embImg = xI
            xI_is_inp = False

        if xAux is None:
            xAux = tensor.matrix(
                'xAux',
                dtype=config.floatX) if attn_nw is None else tensor.tensor3(
                    'xAux', dtype=config.floatX)
            if options.get('swap_aux', 1) and attn_nw is None:
                xAuxEmb = tensor.dot(
                    xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
            else:
                xAuxEmb = xAux
            xA_is_inp = True
        else:
            xA_is_inp = False
            if options.get('encode_gt_sentences', 0):
                xAuxEmb = tensor.dot(
                    xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
            else:
                xAuxEmb = xAux

        embImg = embImg.reshape([1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        # This implements input dropout.
        if options['use_dropout']:
            emb = dropout_layer(emb,
                                use_noise,
                                self.trng,
                                options['drop_prob_encoder'],
                                shp=emb.shape)
            if options.get('en_aux_inp', 0) and attn_nw is None:
                xAuxEmb = dropout_layer(xAuxEmb,
                                        use_noise,
                                        self.trng,
                                        options['drop_prob_aux'],
                                        shp=xAuxEmb.shape)

        # Implement scheduled sampling!
        if options.get('sched_sampling_mode', None) is not None:
            curr_epoch = tensor.scalar(name='curr_epoch', dtype=config.floatX)

            # Assign the probabilities according to the scheduling mode
            if options['sched_sampling_mode'] == 'linear':
                prob = tensor.maximum(
                    options['sslin_min'], options['sched_sampling_const'] -
                    options['sslin_slope'] * curr_epoch)
            elif options['sched_sampling_mode'] == 'exp':
                raise ValueError(
                    'ERROR: %s --> This solver type is not yet supported' %
                    (options['sched_sampling_mode']))
            elif options['sched_sampling_mode'] == 'invsig':
                raise ValueError(
                    'ERROR: %s --> This solver type is not yet supported' %
                    (options['sched_sampling_mode']))
            else:
                raise ValueError(
                    'ERROR: %s --> This scheduling type is unknown' %
                    (options['sched_sampling_mode']))

            # Now to build the mask. We don't want to do this coin toss when
            # feeding in image feature and the start symbol
            sched_mask = self.trng.binomial((n_timesteps - 2, n_samples),
                                            p=prob,
                                            n=1,
                                            dtype='int64')
            sched_mask = tensor.concatenate(
                [sched_mask, tensor.alloc(1, 2, n_samples)], axis=0)
        else:
            sched_mask = []
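        # sched_mask has shape (n_timesteps, n_samples); each entry is a draw
        # from Bernoulli(prob), where (presumably) a 1 tells the LSTM layer to
        # feed the ground-truth token at that step and a 0 to feed its own
        # previous prediction. Two rows of ones are appended so that, per the
        # note above, the image feature and the start symbol are never replaced
        # by sampled words.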

        #############################################################################################################################
        # This implements core lstm
        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix=options['generator'],
                                             sched_prob_mask=sched_mask,
                                             attn_nw=attn_nw)
        #############################################################################################################################

        # NOTE1: we are leaving out the first prediction, which was made for the image and is meaningless.
        if options['use_dropout']:
            # XXX : Size given to dropout is missing one dimension. This keeps the dropped units consistent across time!?.
            # ###   Is this a good bug ?
            p = dropout_layer(
                sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size']), use_noise, self.trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
        else:
            p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size'])

        if options.get('class_out_factoring', 0) == 1:
            if options.get('cls_diff_layer', 0) == 1:
                pC_inp = dropout_layer(
                    sliceT(rval[0][1:, :, :],
                           options.get('hidden_depth', 1) - 2,
                           options['hidden_size']), use_noise, self.trng,
                    options['drop_prob_decoder'],
                    (n_samples, options['hidden_size']))
            else:
                pC_inp = p

        n_out_samps = (n_timesteps - 1) * n_samples
        if options.get('class_out_factoring', 0) == 0:
            pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
                [n_out_samps, options['output_size']])
            if options.get('use_gumbel_mse', 0) == 0:
                pWSft = tensor.nnet.softmax(pW)
            else:
                w_out = ifelse(
                    self.usegumbel,
                    gumbel_softmax_sample(self.trng,
                                          pW,
                                          self.gumb_temp,
                                          hard=options.get(
                                              'use_gumbel_hard', False)),
                    tensor.nnet.softmax(pW))
                # This is not exactly right, but just testing
                pWSft = w_out
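            # With use_gumbel_mse set, the output distribution is drawn through
            # the Gumbel-softmax relaxation (gumbel_softmax_sample); when
            # use_gumbel_hard is on this presumably yields straight-through
            # one-hot samples. The ifelse falls back to a plain softmax whenever
            # self.usegumbel is 0.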

            totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]
            out_list = [pWSft, totProb, pW]
        else:
            ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
            xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
            if options.get('cls_zmean', 1):
                pW = ((tparams['Wd'][:, xC, :].T *
                       ((p.reshape([1, n_out_samps, options['hidden_size']]) -
                         tparams['WdCls'][:, xC].T))).sum(axis=-1).T +
                      tparams['bd'][:, xC, :])
            else:
                pW = ((tparams['Wd'][:, xC, :].T *
                       (p.reshape([1, n_out_samps, options['hidden_size']]))
                       ).sum(axis=-1).T + tparams['bd'][:, xC, :])
            pWSft = tensor.nnet.softmax(pW[0, :, :])

            pC = (tensor.dot(pC_inp, tparams['WdCls']) +
                  tparams['bdCls']).reshape([n_out_samps, options['nClasses']])
            pCSft = tensor.nnet.softmax(pC)

            totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                      pCSft[tensor.arange(n_out_samps), xC]
            out_list = [pWSft, pCSft, totProb, p]

        tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).sum()
        tot_pplx = -(tensor.log2(totProb + 1e-10) *
                     mask[1:, :].flatten()).sum()
        cost = [
            tot_cost / tensor.cast(n_samples, dtype=config.floatX), tot_pplx
        ]

        inp_list = [xW, mask]
        if xI_is_inp:
            inp_list.append(xI)

        if options.get('en_aux_inp', 0) and xA_is_inp:
            inp_list.append(xAux)

        if options.get('sched_sampling_mode', None) is not None:
            inp_list.append(curr_epoch)

        f_pred_prob = theano.function([xW, xI, xAux],
                                      out_list,
                                      name='f_pred_prob',
                                      updates=updatesLSTM)

        return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM
  def build_model(self, tparams, options):
    trng = RandomStreams(1234)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')

    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape([n_timesteps,
                                                n_samples,
                                                options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
        xAuxEmb = xAux


    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape([1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    # This implements input dropout.
    if options['use_dropout']:
        emb = dropout_layer(emb, use_noise, trng, options['drop_prob_encoder'], shp = emb.shape)
        if options.get('en_aux_inp',0):
            xAuxEmb = dropout_layer(xAuxEmb, use_noise, trng, options['drop_prob_aux'], shp = xAuxEmb.shape)

    # Implement scheduled sampling!
    if options.get('sched_sampling_mode', None) is not None:
        curr_epoch = tensor.scalar(name='curr_epoch',dtype=config.floatX)        
        
        # Assign the probabilities according to the scheduling mode
        if options['sched_sampling_mode'] == 'linear':
            prob = tensor.maximum(options['sslin_min'],options['sched_sampling_const'] - options['sslin_slope'] * curr_epoch)
        elif options['sched_sampling_mode'] == 'exp': 
            raise ValueError('ERROR: %s --> This solver type is not yet supported'%(options['sched_sampling_mode']))
        elif options['sched_sampling_mode'] == 'invsig': 
            raise ValueError('ERROR: %s --> This solver type is not yet supported'%(options['sched_sampling_mode']))
        else:
            raise ValueError('ERROR: %s --> This scheduling type is unknown'%(options['sched_sampling_mode']))
        
        # Now to build the mask. We don't want to do this coin toss when 
        # feeding in image feature and the start symbol        
        sched_mask = trng.binomial((n_timesteps - 2, n_samples), p=prob, n=1, dtype='int64')
        sched_mask = tensor.concatenate([sched_mask, tensor.alloc(1, 2, n_samples)],axis=0)
    else: 
        sched_mask = []
        

    #############################################################################################################################
    # This implements core lstm
    rval, updatesLSTM = basic_lstm_layer(tparams, emb[:n_timesteps,:,:], xAuxEmb, use_noise, options,
                                         prefix=options['generator'], sched_prob_mask = sched_mask)
    #############################################################################################################################
    
    
    # NOTE1: we are leaving out the first prediction, which was made for the image and is meaningless. 
    if options['use_dropout']:
        # XXX : Size given to dropout is missing one dimension. This keeps the dropped units consistent across time!?.
        # ###   Is this a good bug ? 
        p = dropout_layer(sliceT(rval[0][1:,:,:],options.get('hidden_depth',1)-1,options['hidden_size']), use_noise, trng,
            options['drop_prob_decoder'], (n_samples,options['hidden_size']))
    else:
        p = sliceT(rval[0][1:,:,:],options.get('hidden_depth',1)-1,options['hidden_size'])

    n_out_samps = (n_timesteps-1) * n_samples 
    if options.get('class_out_factoring',0) == 0:
        pW = (tensor.dot(p,tparams['Wd']) + tparams['bd']).reshape([n_out_samps,options['output_size']])
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[tensor.arange(n_out_samps), xW[1:,:].flatten()]
        out_list = [pWSft, totProb, p]
    else:
        ixtoclsinfo_t = tensor.as_tensor_variable(options['ixtoclsinfo'])
        xC = ixtoclsinfo_t[xW[1:,:].flatten(),0]
        pW = ((tparams['Wd'][:,xC,:].T*(p.reshape([1,n_out_samps,options['hidden_size']]))).sum(axis=-1).T 
             + tparams['bd'][:,xC,:])
        pWSft   = tensor.nnet.softmax(pW[0,:,:])
        pC    = (tensor.dot(p,tparams['WdCls']) + tparams['bdCls']).reshape([n_out_samps,options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)
        
        totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                  pCSft[tensor.arange(n_out_samps), xC]
        out_list = [pWSft, pCSft, totProb, p]
    
    tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:,:].flatten()).sum()
    tot_pplx = -(tensor.log2(totProb + 1e-10) * mask[1:,:].flatten()).sum()
    cost = [tot_cost/options['batch_size'], tot_pplx]

    inp_list = [xW, mask, xI]

    if options.get('en_aux_inp',0):
        inp_list.append(xAux)

    if options.get('sched_sampling_mode', None) is not None:
        inp_list.append(curr_epoch)

    f_pred_prob = [] 
    #theano.function(inp_list, p, name='f_pred_prob', updates=updatesLSTM)


    return use_noise, inp_list, f_pred_prob, cost, out_list , updatesLSTM 
  def build_eval_other_sent(self, tparams, options, model_npy):

    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape([n_timesteps,
                                                n_samples,
                                                options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape([1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)


    rval, updatesLSTM = basic_lstm_layer(tparams, emb[:n_timesteps,:,:], xAux, use_noise, options, prefix=options['generator'])
    p = rval[0]

    p = tensor.dot(p,tparams['Wd']) + tparams['bd']

    #pred = tensor.nnet.softmax(p)

    #pred = rval[2]

    #pred = pred[1:,:,:]
    p = p[1:,:,:]

    def accumCost(pred,xW,m,c_sum,ppl_sum):
        pred = tensor.nnet.softmax(pred)
        c_sum += (tensor.log(pred[tensor.arange(n_samples), xW]+1e-20) * m)
        ppl_sum += -(tensor.log2(pred[tensor.arange(n_samples), xW]+1e-10) * m)
        return c_sum, ppl_sum

    sums, upd = theano.scan(fn=accumCost,
                            outputs_info=[tensor.alloc(numpy_floatX(0.), 1, n_samples),
                                          tensor.alloc(numpy_floatX(0.), 1, n_samples)],
                            sequences=[p, xW[1:, :], mask[1:, :]])
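    # The scan above walks the remaining timesteps; at each step accumCost adds
    # the masked log-probability of the ground-truth word to c_sum and the
    # masked log2 surprisal to ppl_sum, so sums[0] / sums[1] are the running
    # totals over time.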
    # NOTE1: we are leaving out the first prediction, which was made for the image
    # and is meaningless. Here `cost` is the final accumulated (natural-log) masked
    # log-probability from sums[0]; sums[1] holds the log2 accumulator used for
    # perplexity.
    cost = sums[0][-1]

    inp_list = [xW, mask, xI]

    if options.get('en_aux_inp',0):
        inp_list.append(xAux)

    self.f_pred_prob_other = theano.function(inp_list, p, name='f_pred_prob', updates=updatesLSTM)
    #f_pred = theano.function([xW, mask], pred.argmax(axis=1), name='f_pred')

    #cost = -tensor.log(pred[tensor.arange(n_timesteps),tensor.arange(n_samples), xW] + 1e-8).mean()

    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, p, updatesLSTM
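  # A minimal usage sketch (not part of this repo, assuming en_aux_inp is off
  # and a prepared batch of int64 word indices `xw`, a float mask `m`, and
  # image features `xi` shaped as above):
  #
  #   use_noise.set_value(0.)                      # disable dropout for eval
  #   total_logprob = self.f_eval_other(xw, m, xi)
  #   word_scores = self.f_pred_prob_other(xw, m, xi)  # unnormalized Wd outputs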