def auc_cost(self, y, kappa=0.9, tau=2):
    """Pairwise AUC surrogate: penalise positive/negative score pairs
    whose margin f_pos - f_neg falls below kappa, with a degree-tau hinge."""
    # y is a 0/1 indicator, so the products keep only the scores of the
    # positive (resp. negative) examples; nonzero_values extracts them
    f_pos = T.nonzero_values(y * self.p_y_given_x)
    f_neg = T.nonzero_values((1 - y) * self.p_y_given_x)
    # all pairwise margins f_pos[i] - f_neg[j]
    diff = f_pos.dimshuffle(0, 'x') - f_neg.dimshuffle('x', 0)
    # (kappa - diff)^tau for pairs below the margin, 0 otherwise
    r = (kappa - diff) ** tau * (diff < kappa)
    return T.mean(r)
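
# A self-contained sketch (assumed, not from the source) of the same pairwise
# AUC surrogate on free variables; `scores` stands in for self.p_y_given_x.
# Caveat: nonzero_values silently drops examples whose score is exactly 0.
import numpy as np
import theano
import theano.tensor as T

y = T.fvector('y')                  # 0/1 labels
scores = T.fvector('scores')        # predicted scores
kappa, tau = 0.9, 2
f_pos = T.nonzero_values(y * scores)          # scores of the positives
f_neg = T.nonzero_values((1 - y) * scores)    # scores of the negatives
diff = f_pos.dimshuffle(0, 'x') - f_neg.dimshuffle('x', 0)
auc_loss = T.mean((kappa - diff) ** tau * (diff < kappa))
f = theano.function([y, scores], auc_loss)

print(f(np.array([1, 0, 1, 0], dtype='float32'),
        np.array([0.9, 0.2, 0.8, 0.4], dtype='float32')))   # ~0.135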
        def _step2(diag_, state_, hs_, Cs_):
            """One decoder step (driven by theano.scan): embed the previous
            prediction, run the stacked decoder LSTMs, attend over the
            encoder states selected by diag_, and emit the next softmax."""
            hs, Cs = [], []
            # previous prediction -> token ids -> decoder embeddings
            token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
            msk_ = tensor.fill(
                tensor.zeros_like(token_idxs, dtype="float32"), 1)
            msk_ = msk_.dimshuffle('x', 0)
            state_below0 = self.de_lookuptable[token_idxs].reshape(
                (1, encoderInputs.shape[1], self.de_hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below0, msk_, hs_[i],
                                    Cs_[i])  # mind the mask
                hs.append(h[-1])
                Cs.append(C[-1])
                state_below0 = h

            hs = tensor.as_tensor_variable(hs)
            Cs = tensor.as_tensor_variable(Cs)
            state_below0 = state_below0.reshape(
                (encoderInputs.shape[1], self.de_hidden_size))

            # diag_ selects which encoder positions this step may attend to
            attn_index = tensor.nonzero(diag_, True)
            attn_value = tensor.nonzero_values(diag_)

            en_context = Encoder_shuffle[:, attn_index[0], :]
            attn_context = Encoder_shuffle_re[:, attn_index[0], :]

            attn_weight = tensor.batched_dot(attn_context, state_below0)
            attn_weight = tensor.nnet.softmax(attn_weight)
            #attn_weight *= (encoderMask.dimshuffle(1, 0))

            attn_weight *= (attn_value.dimshuffle('x', 0))
            ##attn_weight = attn_weight/(tensor.sum(attn_weight, axis=1).dimshuffle(0,'x'))
            # ctx_ : (batch, hidden), the attention-weighted encoder context
            ctx_ = tensor.sum(en_context * attn_weight[:, :, None], axis=1)

            state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)
            newpred = tensor.dot(state_below0,
                                 self.linear) + self.linear_bias[None, :]
            state_below = tensor.nnet.softmax(newpred)
            # the begin-symbol probability is 0
            extra_p = tensor.zeros_like(hs[:, :, 0])
            state_below = tensor.concatenate([state_below, extra_p.T], axis=1)

            return state_below, hs, Cs
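
# A self-contained sketch (not from the source) of the theano.scan pattern
# that drives a step function like _step2: `sequences` supplies one slice per
# time step and `outputs_info` threads the recurrent state between steps.
import numpy as np
import theano
import theano.tensor as tensor

def _toy_step(seq_t, state_tm1):
    # stand-in for _step2: fold this step's input into the carried state
    return seq_t + state_tm1

seq = tensor.fmatrix()              # (n_steps, batch)
state0 = tensor.fvector()           # initial recurrent state, (batch,)
states, _ = theano.scan(fn=_toy_step,
                        sequences=seq,
                        outputs_info=state0,
                        n_steps=seq.shape[0])
run = theano.function([seq, state0], states)
print(run(np.ones((3, 2), dtype='float32'),
          np.zeros(2, dtype='float32')))   # running sums: [[1 1] [2 2] [3 3]]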
if __name__ == '__main__':

    net = createNetwork()
    ## loading network parameters
    # params = pickle.load(open(loading_path, "rb"))
    # lasagne.layers.set_all_param_values(net['fc5'], params)
    # print("loading params successfully")
    ####
    input_X = T.tensor4('input_X')
    target_Y = T.vector("target_Y")
    action_input = T.matrix("action")    # one-hot mask of the action taken

    pred_Y = lasagne.layers.get_output(net['fc5'], inputs=input_X)
    Action_Y_index = T.argmax(pred_Y, axis=1)

    # action_input is one-hot, so nonzero_values picks out Q(s, a_taken)
    # for each row of the batch
    error_term = target_Y - T.nonzero_values(action_input * pred_Y)
    cost = T.mean(T.sqr(error_term))

    #scaled_error_term = lasagne.updates.norm_constraint(error_term, max_norm=1, norm_axes=0)
    #cost = T.mean(T.sqr(scaled_error_term))

    params = lasagne.layers.get_all_params(net['fc5'], trainable=True)
    updates = lasagne.updates.adam(cost, params, learning_rate=LEARNING_RATE,
                                   beta1=GRADIENT_MOMENTUM,
                                   beta2=SQUARED_GRADIENT_MOMENTUM,
                                   epsilon=MIN_SQUARED_GRADIENT)

    average_Q = T.mean(T.nonzero_values(action_input * pred_Y))
    Q_value = T.max(pred_Y, axis=1)

    train_fn = theano.function(inputs=[input_X, action_input, target_Y],
                               updates=updates, outputs=[average_Q, cost])
    action_index_fn = theano.function(inputs=[input_X], outputs=[Action_Y_index])
    Q_value_fn = theano.function(inputs=[input_X], outputs=[Q_value])
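
    # A minimal usage sketch (assumed, not from the source): one Q-learning
    # update with the functions compiled above. The batch size, frame shape,
    # and `n_actions` are assumptions, numpy is assumed imported as np, and
    # `targets` would hold r + gamma * max_a' Q(s', a') in a real loop.
    n_actions = 4
    states = np.random.randn(32, 4, 84, 84).astype('float32')
    actions_onehot = np.zeros((32, n_actions), dtype='float32')
    actions_onehot[np.arange(32), np.random.randint(n_actions, size=32)] = 1.0
    targets = np.random.randn(32).astype('float32')

    avg_q, loss = train_fn(states, actions_onehot, targets)
    greedy_actions, = action_index_fn(states)   # epsilon-greedy handled elsewhere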
Example #4
    def __init__(self, nh, nc, nf, mb):
        '''
        nh :: dimension of the hidden layer
        nc :: number of classes
        nf :: input feature size
        mb :: mini batch size
        '''
        # parameters of the model
        # first level: input-to-hidden weights
        self.wx_z = generate_weight(nf, nh, 'wx_z')

        # first level: recurrent hidden-to-hidden weights
        self.wh_z = generate_weight(nh, nh, 'wh_z')

        # first level: hidden bias
        self.bh_z = generate_weight(1, nh, 'bh_z')

        # first level: input-to-hidden weights
        self.wx_i = theano.shared(name='wx_i',
                                  value=0.2 * np.random.uniform(-1.0, 1.0,
                                                                   (nf, nh))
                                  .astype(theano.config.floatX))

        # first level: recurrent : hidden to hidden state
        self.wh_i = theano.shared(name='wh_i',
                                  value=0.2 * np.random.uniform(-1.0, 1.0,
                                                                   (nh, nh))
                                  .astype(theano.config.floatX))

        # first level: input to hidden bias
        self.bh_i = theano.shared(name='bh_i',
                                  value=np.zeros((1, nh),
                                                    dtype=theano.config.floatX))

        # first level: input-to-hidden weights
        self.wx_f = theano.shared(name='wx_f',
                                  value=0.2 * np.random.uniform(-1.0, 1.0,
                                                                   (nf, nh))
                                  .astype(theano.config.floatX))

        # first level: recurrent : hidden to hidden state
        self.wh_f = theano.shared(name='wh_f',
                                  value=0.2 * np.random.uniform(-1.0, 1.0,
                                                                   (nh, nh))
                                  .astype(theano.config.floatX))

        # first level: input to hidden bias
        self.bh_f = theano.shared(name='bh_f',
                                  value=np.zeros((1, nh),
                                                    dtype=theano.config.floatX))

        # first level: input-to-hidden weights
        self.wx_o = theano.shared(name='wx_o',
                                  value=0.2 * np.random.uniform(-1.0, 1.0,
                                                                   (nf, nh))
                                  .astype(theano.config.floatX))

        # first level: recurrent : hidden to hidden state
        self.wh_o = theano.shared(name='wh_o',
                                  value=0.2 * np.random.uniform(-1.0, 1.0,
                                                                   (nh, nh))
                                  .astype(theano.config.floatX))

        # first level: input to hidden bias
        self.bh_o = theano.shared(name='bh_o',
                                  value=np.zeros((1, nh),
                                                    dtype=theano.config.floatX))

        ## the peephole weights
        self.ph_o = theano.shared(name='ph_o',
                                  value=np.zeros((1, nh),
                                                    dtype=theano.config.floatX))
        self.ph_i = theano.shared(name='ph_i',
                                  value=np.zeros((1, nh),
                                                    dtype=theano.config.floatX))
        self.ph_f = theano.shared(name='ph_f',
                                  value=np.zeros((1, nh),
                                                    dtype=theano.config.floatX))

        # initial hidden state
        self.h0 = theano.shared(name='h0',
                                value=np.zeros((mb, nh),
                                                  dtype=theano.config.floatX))
        # initial cell state
        self.c0 = theano.shared(name='c0',
                                value=np.zeros((mb, nh),
                                                  dtype=theano.config.floatX))
        ## LAST Level
        # last level:  hidden to output
        self.w = theano.shared(name='w',
                               value=0.2 * np.random.uniform(-1.0, 1.0,
                                                                (nh, nc))
                               .astype(theano.config.floatX))

        # last level:  hidden to output
        self.b = theano.shared(name='b',
                               value=np.zeros((1, nc),
                                                 dtype=theano.config.floatX))

        # all-ones (mb, 1) column, broadcasts the row biases over the batch
        self.I_mb = theano.shared(name='I',
                                value=np.ones((mb, 1),
                                                  dtype=theano.config.floatX))

        # bundle
        self.params = [self.wx_z, self.wx_f, self.wx_i, self.wx_o,
                       self.wh_z, self.wh_f, self.wh_i, self.wh_o,
                       self.bh_z, self.bh_f, self.bh_i, self.bh_o,
                       self.ph_i, self.ph_o, self.ph_f,
                       self.w, self.b]

        lr = T.scalar('lr')

        idxs = T.tensor3()     # input; batched, so 3-D: (len, mb, nf)
        x = idxs.astype(theano.config.floatX)
        yinput = T.tensor3()   # one-hot labels: (len, mb, nc)
        y_sentence = yinput.astype(theano.config.floatX)
        # no batch:
        #idxs = T.imatrix()
        #y_sentence = T.ivector()

        def recurrence(x_t, h_tm1, c_tm1):
            z_t = T.tanh(T.dot(x_t, self.wx_z) + T.dot(h_tm1, self.wh_z)
                              + T.dot(self.I_mb, self.bh_z))
            i_t = T.nnet.sigmoid(T.dot(x_t, self.wx_i) + T.dot(h_tm1, self.wh_i)
                                 + T.dot(self.I_mb, self.ph_i) * c_tm1
                                 + T.dot(self.I_mb, self.bh_i))
            f_t = T.nnet.sigmoid(T.dot(x_t, self.wx_f) + T.dot(h_tm1, self.wh_f)
                                 + T.dot(self.I_mb, self.ph_f) * c_tm1
                                 + T.dot(self.I_mb, self.bh_f))
            c_t = z_t * i_t + c_tm1 * f_t
            o_t = T.nnet.sigmoid(T.dot(x_t, self.wx_o) + T.dot(h_tm1, self.wh_o)
                                 + T.dot(self.I_mb, self.ph_o) * c_t
                                 + T.dot(self.I_mb, self.bh_o))
            h_t = T.tanh(c_t) * o_t

            s_t = T.nnet.softmax(T.dot(h_t, self.w) + T.dot(self.I_mb, self.b))

            '''no batch, raw math equations'''
            '''
            z_t = T.tanh(T.dot(x_t, self.wx_z) + T.dot(h_tm1, self.wh_z) + self.bh_z)
            i_t = T.nnet.sigmoid(T.dot(x_t, self.wx_i) + T.dot(h_tm1, self.wh_i) + self.bh_i + self.ph_i * c_tm1)
            f_t = T.nnet.sigmoid(T.dot(x_t, self.wx_f) + T.dot(h_tm1, self.wh_f) + self.bh_f + self.ph_f * c_tm1)
            c_t = z_t * i_t + c_tm1 * f_t
            o_t = T.nnet.sigmoid(T.dot(x_t, self.wx_o) + T.dot(h_tm1, self.wh_o) + self.bh_o + self.ph_o * c_t)
            h_t = T.tanh(c_t) * o_t
            s_t = T.nnet.softmax(T.dot(h_t, self.w) + self.b)
            '''
            return [h_t, c_t, s_t]

        [h, c, s], _ = theano.scan(fn=recurrence,
                                   sequences=x,
                                   outputs_info=[self.h0, self.c0, None],
                                   n_steps=x.shape[0])

        p_y_given_x_sentence = s[:, :, :]      # shape (len, mb, nc)
        y_pred = T.argmax(p_y_given_x_sentence, axis=2)
        # no batch:
        #p_y_given_x_sentence = s[:, 0, :]
        #y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost and gradients and learning rate
        # y_sentence is one-hot, so the product keeps only the true-class
        # probabilities and nonzero_values extracts them for the NLL
        sentence_nll = -T.mean(T.log(T.nonzero_values(p_y_given_x_sentence * y_sentence))) * mb
        #no batch:
        #sentence_nll = -T.mean(T.log(p_y_given_x_sentence)[T.arange(x.shape[0]), y_sentence])

        sentence_gradients = T.grad(sentence_nll, self.params)
        sentence_updates = OrderedDict((p, p - lr * g)
                                       for p, g in
                                       zip(self.params, sentence_gradients))

        # theano functions to compile
        self.classify = theano.function(inputs=[idxs], outputs=y_pred)#, mode=profmode)
        self.sentence_train = theano.function(inputs=[idxs, y_sentence, lr],
                                              outputs=sentence_nll,
                                              updates=sentence_updates)

        # by default the optimizer is sgd
        self.optm = optimizers.sgd
        self.f_grad_shared, self.f_update = self.optm(
            lr, dict(zip([p.name for p in self.params], self.params)),
            sentence_gradients, x, y_sentence, sentence_nll)
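
# A minimal usage sketch (assumed, not from the source); the class name
# `LSTM` and all sizes are assumptions, and theano.config.floatX is assumed
# to be 'float32'. Inputs are (len, mb, nf); labels are one-hot (len, mb, nc),
# matching the shape comments above.
import numpy as np

nh, nc, nf, mb, seq_len = 50, 5, 20, 8, 12
model = LSTM(nh, nc, nf, mb)

x = np.random.randn(seq_len, mb, nf).astype('float32')
y = np.zeros((seq_len, mb, nc), dtype='float32')
y[np.arange(seq_len)[:, None], np.arange(mb)[None, :],
  np.random.randint(nc, size=(seq_len, mb))] = 1.0

nll = model.sentence_train(x, y, 0.1)     # one SGD step, returns the NLL
pred = model.classify(x)                  # (len, mb) predicted class ids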
Example #5
    def __init__(self, nh, nc, nf, mb):
        '''
        nh :: dimension of the hidden layer
        nc :: number of classes
        nf :: number of features
        mb :: batch size : mini batch
        '''
        # parameters of the model
        self.wx = theano.shared(name='wx',
                                value=0.2 * numpy.random.uniform(-1.0, 1.0,
                                                                 (nf, nh))
                                .astype(theano.config.floatX))
        self.wh = theano.shared(name='wh',
                                value=0.2 * numpy.random.uniform(-1.0, 1.0,
                                                                 (nh, nh))
                                .astype(theano.config.floatX))
        self.w = theano.shared(name='w',
                               value=0.2 * numpy.random.uniform(-1.0, 1.0,
                                                                (nh, nc))
                               .astype(theano.config.floatX))
        self.bh = theano.shared(name='bh',
                                value=numpy.zeros((nh, 1),
                                                  dtype=theano.config.floatX))
        self.b = theano.shared(name='b',
                               value=numpy.zeros((nc, 1),
                                                 dtype=theano.config.floatX))
        self.h0 = theano.shared(name='h0',
                                value=numpy.zeros((mb, nh),
                                                  dtype=theano.config.floatX))
        self.I_mb = theano.shared(name='I',
                                value=numpy.ones((mb, 1),
                                                  dtype=theano.config.floatX))

        # bundle
        self.params = [self.wx, self.wh, self.w,
                       self.bh, self.b]

        lr = T.scalar('lr')

        idxs = T.tensor3()     # input; batched, so 3-D: (len, mb, nf)
        x = idxs.astype(theano.config.floatX)
        yinput = T.tensor3()   # one-hot labels: (len, mb, nc)
        y_sentence = yinput.astype(theano.config.floatX)
        # no batch:
        #idxs = T.imatrix()
        #y_sentence = T.ivector('y_sentence')  # labels

        def recurrence(x_t, h_tm1):
            h_t = T.nnet.sigmoid(T.dot(x_t, self.wx)
                                 + T.dot(h_tm1, self.wh) + T.dot(self.I_mb, self.bh.T))
            s_t = T.nnet.softmax(T.dot(h_t, self.w) + T.dot(self.I_mb, self.b.T))
            # TODO: a sparse version should be much faster, since both the
            # cost and the gradient are sparse, e.g.
            #   Sparse.structured_dot(Sparse.csc_from_dense(x_t), self.wx)
            return [h_t, s_t]   # per step: h_t is (mb, nh), s_t is (mb, nc)

        [h, s], _ = theano.scan(fn=recurrence,
                                sequences=x,
                                outputs_info=[self.h0, None],
                                n_steps=x.shape[0])

        p_y_given_x_sentence = s[:, :, :]      # shape (len, mb, nc)
        y_pred = T.argmax(p_y_given_x_sentence, axis=2)
        # no batch:
        #p_y_given_x_sentence = s[:, 0, :]
        #y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost and gradients and learning rate
        # y is now a one-hot tensor (len, mb, nc) instead of an index vector
        # TODO: figure out the proper way to compute this cost; a plain mean
        # does not quite make sense here, hence the * mb rescaling
        #sentence_nll = -T.mean(T.log(p_y_given_x_sentence) * y_sentence) * mb * 5
        sentence_nll = -T.mean(T.log(T.nonzero_values(p_y_given_x_sentence * y_sentence))) * mb
        # non-batch version:
        #sentence_nll = -T.mean(T.log(p_y_given_x_sentence)[T.arange(x.shape[0]), y_sentence])

        sentence_gradients = T.grad(sentence_nll, self.params)

        sentence_updates = OrderedDict((p, p - lr * g)
                                       for p, g in
                                       zip(self.params, sentence_gradients))

        # theano functions to compile
        self.classify = theano.function(inputs=[idxs], outputs=y_pred)
        # this is not going to be used .....
        self.sentence_train = theano.function(inputs=[idxs, y_sentence, lr],
                                              outputs=sentence_nll,
                                              updates=sentence_updates)

        # rmsprop here (the other examples default to sgd)
        self.optm = optimizers.rmsprop
        self.f_grad_shared, self.f_update = self.optm(
            lr, dict(zip([p.name for p in self.params], self.params)),
            sentence_gradients, x, y_sentence, sentence_nll)
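
# A small self-contained check (assumed, not from the source) of the one-hot
# NLL trick used above: multiplying the softmax outputs by one-hot labels and
# taking nonzero_values leaves exactly the true-class probabilities. Caveat:
# a true-class probability of exactly 0 is silently dropped, which is what
# the + 1e-8 guard in the next example works around.
import numpy as np
import theano
import theano.tensor as T

p = T.fmatrix('p')            # (batch, classes) softmax outputs
y1 = T.fmatrix('y1')          # one-hot labels, same shape
nll = -T.mean(T.log(T.nonzero_values(p * y1)))
f = theano.function([p, y1], nll)

p0 = np.array([[0.7, 0.3], [0.2, 0.8]], dtype='float32')
y0 = np.array([[1.0, 0.0], [0.0, 1.0]], dtype='float32')
print(f(p0, y0))              # -(log 0.7 + log 0.8) / 2 ~ 0.290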
import numpy as np
import theano
import theano.tensor as T

# advanced-indexing demo: pick row b[i] from each a[i]
a = T.ftensor3()    # restored from context: a0 below is 3-D
b = T.ivector()

c = a[T.arange(b.shape[0]), b]

test = theano.function([a, b], c)

a0 = np.random.uniform(-0.01, 0.01, (3, 4, 5)).astype('float32')
b0 = np.random.randint(4, size=(3,)).astype('int32')
c0 = test(a0, b0)

# --- nonzero / nonzero_values demo ---
a = T.fvector()
#a = T.fmatrix()
#b = T.ftensor3()

c = T.nonzero(a, True)       # matrix of indices of the non-zero entries
d = T.nonzero_values(a)      # the non-zero entries themselves

test = theano.function([a], [c, d])

#a0 = np.random.uniform(-0.01, 0.01, (2, 3)).astype('float32')
#b0 = np.random.uniform(-0.01, 0.01, (2, 3, 3)).astype('float32')
a0 = np.array([1, 2, 0, 0]).astype('float32')

c0, d0 = test(a0)
print(c0, d0)                # -> [[0 1]] [1. 2.]
Example #7
    def __init__(self, nh_enc, nh_dec, nh_att, nx, ny, mb, lt, bidir, nonlstm_encode=False, restriction=None):
        '''
        nh_enc :: dimension of the hidden layer of the encoder
        nh_dec :: dimension of the hidden layer of the decoder
        nh_att :: dimension of the hidden layer of the attention
        ny :: number of classes
        nx :: input feature size
        mb :: mini batch size
        lt :: length of the input after padding (for attention)
        bidir :: 2 for a bidirectional encoder, 1 for unidirectional
        '''
        self.nh_enc = nh_enc
        self.nh_dec = nh_dec
        self.nh_att = nh_att
        self.nx = nx
        self.ny = ny
        self.lt = lt
        self.bidir = bidir

        # parameters of the model
        xhdim = nx + nh_enc * bidir
        # encoder forward
        # 1 level: one set of input (W), recurrent (H), and bias (b) weights per gate
        self.Wf_enc_z = generate_weight(nx, nh_enc, "Wf_enc_z")
        self.Wf_enc_i = generate_weight(nx, nh_enc, "Wf_enc_i")
        self.Wf_enc_f = generate_weight(nx, nh_enc, "Wf_enc_f")
        self.Wf_enc_o = generate_weight(nx, nh_enc, "Wf_enc_o")

        self.Hf_enc_z = generate_weight(nh_enc, nh_enc, "Hf_enc_z")
        self.Hf_enc_i = generate_weight(nh_enc, nh_enc, "Hf_enc_i")
        self.Hf_enc_f = generate_weight(nh_enc, nh_enc, "Hf_enc_f")
        self.Hf_enc_o = generate_weight(nh_enc, nh_enc, "Hf_enc_o")

        self.bf_enc_z = generate_weight(1, nh_enc, "bf_enc_z")
        self.bf_enc_i = generate_weight(1, nh_enc, "bf_enc_i")
        self.bf_enc_f = generate_weight(1, nh_enc, "bf_enc_f")
        self.bf_enc_o = generate_weight(1, nh_enc, "bf_enc_o")

        # encoder backward:
        self.Wb_enc_z = generate_weight(nx, nh_enc, "Wb_enc_z")
        self.Wb_enc_i = generate_weight(nx, nh_enc, "Wb_enc_i")
        self.Wb_enc_f = generate_weight(nx, nh_enc, "Wb_enc_f")
        self.Wb_enc_o = generate_weight(nx, nh_enc, "Wb_enc_o")

        self.Hb_enc_z = generate_weight(nh_enc, nh_enc, "Hb_enc_z")
        self.Hb_enc_i = generate_weight(nh_enc, nh_enc, "Hb_enc_i")
        self.Hb_enc_f = generate_weight(nh_enc, nh_enc, "Hb_enc_f")
        self.Hb_enc_o = generate_weight(nh_enc, nh_enc, "Hb_enc_o")

        self.bb_enc_z = generate_weight(1, nh_enc, "bb_enc_z")
        self.bb_enc_i = generate_weight(1, nh_enc, "bb_enc_i")
        self.bb_enc_f = generate_weight(1, nh_enc, "bb_enc_f")
        self.bb_enc_o = generate_weight(1, nh_enc, "bb_enc_o")

        ## attention level:
        self.UV_att = generate_weight(xhdim, nh_att, "UV_att")
        self.W_att = generate_weight(nh_dec, nh_att, "W_att")
        self.v_att = generate_weight(nh_att, 1, "v_att")

        # decoder level : input to hidden bias
        self.W_dec_z = generate_weight(xhdim, nh_dec, "W_dec_z")
        self.W_dec_i = generate_weight(xhdim, nh_dec, "W_dec_i")
        self.W_dec_f = generate_weight(xhdim, nh_dec, "W_dec_f")
        self.W_dec_o = generate_weight(xhdim, nh_dec, "W_dec_o")

        self.H_dec_z = generate_weight(nh_dec, nh_dec, "H_dec_z")
        self.H_dec_i = generate_weight(nh_dec, nh_dec, "H_dec_i")
        self.H_dec_f = generate_weight(nh_dec, nh_dec, "H_dec_f")
        self.H_dec_o = generate_weight(nh_dec, nh_dec, "H_dec_o")

        self.b_dec_z = generate_weight(1, nh_dec, "b_dec_z")
        self.b_dec_i = generate_weight(1, nh_dec, "b_dec_i")
        self.b_dec_f = generate_weight(1, nh_dec, "b_dec_f")
        self.b_dec_o = generate_weight(1, nh_dec, "b_dec_o")

        # E_* is extra in the decoder, for the previous output
        self.E_dec_z = generate_weight(ny, nh_dec, "E_dec_z")
        self.E_dec_i = generate_weight(ny, nh_dec, "E_dec_i")
        self.E_dec_f = generate_weight(ny, nh_dec, "E_dec_f")
        self.E_dec_o = generate_weight(ny, nh_dec, "E_dec_o")

        ## LAST Level
        # last level:  hidden to output
        self.W_y = generate_weight(xhdim, ny, "W_y")
        self.H_y = generate_weight(nh_dec, ny, "H_y")
        self.E_y = generate_weight(ny, ny, "E_y")
        self.b_y = generate_weight(1, ny, "b_y", 0.0)

        ## INTERMEDIATE value
        hf0 = theano.shared(name='hf0', value=np.zeros((mb, nh_enc), dtype=config.floatX)) # forward
        cf0 = theano.shared(name='cf0', value=np.zeros((mb, nh_enc), dtype=config.floatX))
        hb0 = theano.shared(name='hb0', value=np.zeros((mb, nh_enc), dtype=config.floatX)) # backward
        cb0 = theano.shared(name='cb0', value=np.zeros((mb, nh_enc), dtype=config.floatX))

        sd0 = theano.shared(name='sd0', value=np.zeros((mb, nh_dec), dtype=config.floatX))
        cd0 = theano.shared(name='cd0', value=np.zeros((mb, nh_dec), dtype=config.floatX))
        a = np.zeros((1, mb, ny), dtype=config.floatX)
        a[:, :, 0] = 1              # one-hot start (eos) symbol
        y0 = theano.shared(name='y0', value=a)


        # all-ones (mb, 1) column for broadcasting row biases over the batch
        # (deprecated: broadcasting should handle the matching automatically)
        I_mb = theano.shared(name='I', value=np.ones((mb, 1), dtype=config.floatX))

        WHb_f_enc = [self.Wf_enc_z, self.Hf_enc_z, self.bf_enc_z,
                       self.Wf_enc_i, self.Hf_enc_i, self.bf_enc_i,
                       self.Wf_enc_f, self.Hf_enc_f, self.bf_enc_f,
                       self.Wf_enc_o, self.Hf_enc_o, self.bf_enc_o]
        WHb_b_enc = [self.Wb_enc_z, self.Hb_enc_z, self.bb_enc_z,
                       self.Wb_enc_i, self.Hb_enc_i, self.bb_enc_i,
                       self.Wb_enc_f, self.Hb_enc_f, self.bb_enc_f,
                       self.Wb_enc_o, self.Hb_enc_o, self.bb_enc_o]
        WHEb_dec =  [self.W_dec_z, self.E_dec_z, self.H_dec_z, self.b_dec_z,
                       self.W_dec_i, self.E_dec_i, self.H_dec_i, self.b_dec_i,
                       self.W_dec_f, self.E_dec_f, self.H_dec_f, self.b_dec_f,
                       self.W_dec_o, self.E_dec_o, self.H_dec_o, self.b_dec_o]

        Wb_nonlstm_enc = [self.Wf_enc_z, self.bf_enc_z]

        # bundle; note that the peepholes were removed from this definition
        self.params = [self.UV_att, self.W_att, self.v_att,
                       self.W_y, self.H_y, self.b_y, self.E_y] + WHEb_dec

        if not nonlstm_encode:
            self.params += WHb_f_enc
            if bidir == 2:
                self.params += WHb_b_enc
        else:
            # special case, to test image captioning: twist the encoder
            # into a non-LSTM projection
            self.params += Wb_nonlstm_enc

        # Used for dropout.
        trng = RandomStreams(SEED)
        use_noise = theano.shared(numpy_floatX(0.))

        # symbolic inputs
        x_in = T.tensor3()     # batched input, so 3-D: (len, mb, nx)
        x = x_in.astype(config.floatX)

        y_in = T.tensor3()   # ground truth labels ,  len * mb * ny
        y_target = y_in.astype(config.floatX)

        y_decinput = T.concatenate([y0, y_target], axis=0)[:-1, :, :].astype(config.floatX)   # decoder input labels: shifted right by one, starting with eos


        lr = T.scalar('lr')

        def encode(x_t, h_tm1, c_tm1, W_enc_z, H_enc_z, b_enc_z, W_enc_i, H_enc_i, b_enc_i,
                                        W_enc_f, H_enc_f, b_enc_f, W_enc_o, H_enc_o, b_enc_o):
            g_t = T.tanh(T.dot(x_t, W_enc_z) + T.dot(h_tm1, H_enc_z) + T.dot(I_mb, b_enc_z))
            i_t = T.nnet.sigmoid(T.dot(x_t, W_enc_i) + T.dot(h_tm1, H_enc_i) + T.dot(I_mb, b_enc_i) ) # + T.dot(I_mb, ph_i.T) * c_tm1)
            f_t = T.nnet.sigmoid(T.dot(x_t, W_enc_f) + T.dot(h_tm1, H_enc_f) + T.dot(I_mb, b_enc_f) ) # + T.dot(I_mb, ph_f.T) * c_tm1
            c_t = g_t * i_t + c_tm1 * f_t
            o_t = T.nnet.sigmoid(T.dot(x_t, W_enc_o) + T.dot(h_tm1, H_enc_o) + T.dot(I_mb, b_enc_o) ) # + T.dot(I_mb, ph_o.T) * c_t
            h_t = T.tanh(c_t) * o_t
            return [h_t, c_t]

        def relu(x):
            # elementwise max(0, x)
            return theano.tensor.switch(x < 0, 0, x)

        if nonlstm_encode:
            hf = relu(T.dot(x, self.Wf_enc_z) + T.dot(I_mb, self.bf_enc_z))  # len * mb * nx
            xh = T.concatenate([x, hf], axis=2)  # since dim0 is the length of input,  so it is of len * batch * xh_dim
        else:
            [hf, cf], _ = theano.scan(fn=encode, sequences=x, outputs_info=[hf0, cf0],
                                  non_sequences=WHb_f_enc,
                                  n_steps=x.shape[0])
            xh = T.concatenate([x, hf], axis=2)  # since dim0 is the length of input,  so it is of len * batch * xh_dim

            if bidir == 2:
                [hb, cb], _ = theano.scan(fn=encode, sequences=x, outputs_info=[hb0, cb0],
                                      non_sequences=WHb_b_enc, go_backwards=True)

                xh = T.concatenate([x, hf, hb[::-1]], axis=2)  #same as above
                # note: scan is in input backward fashion, but output corresponding to an inverted order, thus use [::-1] to reverse it.

        # attention pre-computation, shared across all decoder steps:
        # (len, mb, xhdim) dot (xhdim, nh_att) -> (len, mb, nh_att)
        UVxh = T.dot(xh, self.UV_att)
        if restriction is not None:
            restriction_matrix = theano.shared(name="restriction", value=restriction).astype(config.floatX)

        def stable_softmax(yin):
            # numerically stable softmax over axis 1; written out by hand so
            # it also works on 3-D tensors, unlike T.nnet.softmax
            e_yin = T.exp(yin - yin.max(axis=1, keepdims=True))
            return e_yin / e_yin.sum(axis=1, keepdims=True)

        def stable_softmax_nonzero(yin, zerosout):
            # same, then zero out entries disallowed by the restriction mask
            e_yin = T.exp(yin - yin.max(axis=1, keepdims=True))
            return e_yin / e_yin.sum(axis=1, keepdims=True) * zerosout
            #return T.nnet.softmax(yin - yin.max(axis=1, keepdims=True)) * zerosout

        def decode(y_tm1, sd_tm1, cd_tm1, xh, UVxh, I_mb):
            # (mb, nh_att) broadcasts against UVxh of shape (len, mb, nh_att)
            beta_st = T.dot(sd_tm1, self.W_att) + UVxh
            # v_att is (nh_att, 1), so beta_t is (len, mb, 1)
            beta_t = T.dot(beta_st, self.v_att)
            alpha_t = stable_softmax(beta_t.dimshuffle(1,0,2))
            z_t = T.batched_dot(xh.dimshuffle(1, 2, 0), alpha_t).flatten(2)
            g_t = T.tanh(T.dot(z_t, self.W_dec_z) + T.dot(sd_tm1, self.H_dec_z) + T.dot(I_mb, self.b_dec_z)
                         + T.dot(y_tm1, self.E_dec_z))
            i_t = T.nnet.sigmoid(T.dot(z_t, self.W_dec_i) + T.dot(sd_tm1, self.H_dec_i) + T.dot(I_mb, self.b_dec_i)
                         + T.dot(y_tm1, self.E_dec_i))    # + T.dot(I_mb, ph_i.T) * c_tm1)
            f_t = T.nnet.sigmoid(T.dot(z_t, self.W_dec_f) + T.dot(sd_tm1, self.H_dec_f) + T.dot(I_mb, self.b_dec_f)
                         + T.dot(y_tm1, self.E_dec_f))   #+ T.dot(I_mb, ph_f.T) * c_tm1
            cd_t = g_t * i_t + cd_tm1 * f_t
            o_t = T.nnet.sigmoid(T.dot(z_t, self.W_dec_o) + T.dot(sd_tm1, self.H_dec_o) + T.dot(I_mb, self.b_dec_o)
                         + T.dot(y_tm1, self.E_dec_o))   # + T.dot(I_mb, ph_o.T) * c_t
            sd_t = T.tanh(cd_t) * o_t

            #sd_t = dropout(sd_t, use_noise, trng)
            if restriction is None:
                y_t = stable_softmax( ( T.dot(z_t, self.W_y) + T.dot(sd_t, self.H_y)
                    + T.dot(y_tm1, self.E_y) + T.dot(I_mb, self.b_y) ) )
            else:
                restriction_perbatch = restriction_matrix[T.argmax(y_tm1, axis=1)]
                y_t = stable_softmax_nonzero( (T.dot(z_t, self.W_y) + T.dot(sd_t, self.H_y)
                    + T.dot(y_tm1, self.E_y) + T.dot(I_mb, self.b_y)) , restriction_perbatch)

            return [sd_t, cd_t, y_t]

        [sd_dec, cd_dec, y_dec], _ = theano.scan(fn=decode,
                                   sequences=y_decinput, # dict(input=y_decinput, taps=[0]),
                                   outputs_info=[dict(initial=sd0, taps=[-1]), dict(initial=cd0, taps=[-1]), None ], #, dict(initial=y0, taps=[-1])],
                                   non_sequences=[xh, UVxh, I_mb],
                                   n_steps=y_decinput.shape[0])

        p_y_given_x_sentence = y_dec[:, :, :]      # shape (len, mb, ny)
        y_pred = T.argmax(p_y_given_x_sentence, axis=2)

        # cost and gradients and learning rate; the 1e-8 guards against log(0)
        sentence_cost = -T.mean(T.log(T.nonzero_values(p_y_given_x_sentence * y_target[:, :, :]) + np.float32(1e-8)))

        sentence_gradients = T.grad(sentence_cost, self.params)
        sentence_updates = OrderedDict((p, p - lr * g)
                                       for p, g in
                                       zip(self.params, sentence_gradients))

        # theano functions to compile
        self.classify = theano.function(inputs=[x_in, y_target], outputs=y_pred)
        self.sentence_train = theano.function(inputs=[x_in, y_target, lr],
                                              outputs=sentence_cost,
                                              updates=sentence_updates)

        self.only_encode = theano.function(inputs=[x_in], outputs=[xh, UVxh])
        self.only_decode_step = decode

        # by default the optimizer is sgd
        self.optm = optimizers.sgd
        self.f_grad_shared, self.f_update = self.optm(
            lr, dict(zip([p.name for p in self.params], self.params)),
            sentence_gradients, x, y_target, sentence_cost)
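
# A minimal usage sketch (assumed, not from the source); the class name
# `AttEncDec` and all sizes are assumptions, and floatX is assumed 'float32'.
# classify takes the targets too because the decoder is teacher-forced.
import numpy as np

model = AttEncDec(nh_enc=64, nh_dec=64, nh_att=32,
                  nx=20, ny=10, mb=4, lt=15, bidir=2)

x = np.random.randn(15, 4, 20).astype('float32')       # (lt, mb, nx)
y = np.zeros((7, 4, 10), dtype='float32')               # (out_len, mb, ny)
y[np.arange(7)[:, None], np.arange(4)[None, :],
  np.random.randint(10, size=(7, 4))] = 1.0             # one-hot targets

cost = model.sentence_train(x, y, 0.05)                 # one SGD step
pred = model.classify(x, y)                             # (out_len, mb) ids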
def test4(x3):
    # nonzero_values flattens x3 and returns a vector of its non-zero entries
    return TT.nonzero_values(x3)
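
# A self-contained check (assumed, not from the source) exercising test4:
# nonzero_values flattens its input and keeps only the non-zero entries.
import numpy as np
import theano
import theano.tensor as TT

x3 = TT.ftensor3()
f = theano.function([x3], test4(x3))
v = np.zeros((2, 2, 2), dtype='float32')
v[0, 1, 1] = 3.0
v[1, 0, 0] = 5.0
print(f(v))    # -> [3. 5.]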