def simple_upsample3d(inpt, up_factor):
    inpt = T.repeat(inpt, up_factor[0], axis=3)
    inpt = T.repeat(inpt, up_factor[1], axis=4)
    inpt = T.repeat(inpt, up_factor[2], axis=1)
    #rep = [1, up_factor[2], 1, up_factor[0], up_factor[1]]
    #inpt = T.tile(inpt, rep, ndim=5)
    return inpt
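
# Hedged usage sketch for simple_upsample3d (not from the original source): it
# assumes a 5D layout of (batch, depth, channels, height, width), which is what
# the repeated axes above imply (height/width on axes 3 and 4, depth on axis 1).
# Nearest-neighbour upsampling by a factor of 2 along every axis:
import numpy as np
import theano
import theano.tensor as T

inp = T.TensorType(theano.config.floatX, (False,) * 5)('inp')
up = simple_upsample3d(inp, (2, 2, 2))
f = theano.function([inp], up)
x = np.zeros((1, 4, 3, 8, 8), dtype=theano.config.floatX)
print(f(x).shape)  # expected: (1, 8, 3, 16, 16)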
    def run(self, h):
        channels = self.channels#images.shape[1]
        if not self.test:
            gx,gy,dx,dy,s2,g = self.get_params(h)
        else:
            gx,gy,dx,dy,s2,g = self.get_params_test(h)

        w = self.w_transform.run(h)

        w = w.reshape((self.batch_size*self.channels, self.N, self.N))


        muX = gx.dimshuffle([0,'x']) + dx.dimshuffle([0,'x']) * (T.arange(self.N).astype(theano.config.floatX) - self.N/2 - 0.5)
        muY = gy.dimshuffle([0,'x']) + dy.dimshuffle([0,'x']) * (T.arange(self.N).astype(theano.config.floatX) - self.N/2 - 0.5)

        a = T.arange(self.width).astype(theano.config.floatX)
        b = T.arange(self.height).astype(theano.config.floatX)

        Fx = T.exp(-(a-muX.dimshuffle([0,1,'x']))**2 / 2. / s2.dimshuffle([0,'x','x'])**2)
        Fy = T.exp(-(b-muY.dimshuffle([0,1,'x']))**2 / 2. / s2.dimshuffle([0,'x','x'])**2)

        Fx = Fx / (Fx.sum(axis=-1).dimshuffle([0,1,'x']) + 1e-4)
        Fy = Fy / (Fy.sum(axis=-1).dimshuffle([0,1,'x']) + 1e-4)

        self.Fx = T.repeat(Fx, channels, axis=0)
        self.Fy = T.repeat(Fy, channels, axis=0)

        self.fint = self.batched_dot(self.Fy.transpose((0,2,1)), w)

        self.fim = self.batched_dot(self.fint, self.Fx).reshape((self.batch_size, self.channels*self.width*self.height))

        return 1./g * self.fim, (gx, gy, dx, dy, self.fint)
Example #3
def neglog_2d(output, target):
    i = T.arange(target.shape[0]).reshape((target.shape[0], 1))
    i = T.repeat(i, target.shape[1], axis=1).flatten()
    j = T.arange(target.shape[1]).reshape((1, target.shape[1]))
    j = T.repeat(j, target.shape[0], axis=0).flatten()
    k = target.flatten()
    return -T.mean(T.log(output)[i, j, k])
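
# Hedged usage sketch (assumed shapes, not from the original source): `output`
# is a 3D tensor of per-position class probabilities with shape
# (rows, cols, n_classes) and `target` holds integer labels of shape
# (rows, cols); the i/j/k index vectors built above gather the probability of
# the correct class at every position before averaging the negative log.
output = T.tensor3('output')
target = T.imatrix('target')
loss = neglog_2d(output, target)   # scalar mean negative log-likelihood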
    def run(self, images, h):#, error_images, h):
        channels = self.channels#images.shape[1]
        if not self.test:
            gx,gy,dx,dy,s2,g = self.get_params(h)
        else:
            gx,gy,dx,dy,s2,g = self.get_params_test(h)

        # how to handle variable sized input images? (mask??)
        I = images.reshape((self.batch_size*self.channels, self.height, self.width))

        muX = gx.dimshuffle([0,'x']) + dx.dimshuffle([0,'x']) * (T.arange(self.N).astype(theano.config.floatX) - self.N/2 - 0.5)
        muY = gy.dimshuffle([0,'x']) + dy.dimshuffle([0,'x']) * (T.arange(self.N).astype(theano.config.floatX) - self.N/2 - 0.5)

        a = T.arange(self.width).astype(theano.config.floatX)
        b = T.arange(self.height).astype(theano.config.floatX)

        Fx = T.exp(-(a-muX.dimshuffle([0,1,'x']))**2 / 2. / s2.dimshuffle([0,'x','x'])**2)
        Fy = T.exp(-(b-muY.dimshuffle([0,1,'x']))**2 / 2. / s2.dimshuffle([0,'x','x'])**2)

        Fx = Fx / (Fx.sum(axis=-1).dimshuffle([0,1,'x']) + 1e-4)
        Fy = Fy / (Fy.sum(axis=-1).dimshuffle([0,1,'x']) + 1e-4)

        self.Fx = T.repeat(Fx, channels, axis=0)
        self.Fy = T.repeat(Fy, channels, axis=0)

        self.fint = self.batched_dot(self.Fy, I)
#        self.efint = T.dot(self.Fx, error_images)
        self.fim = self.batched_dot(self.fint, self.Fx.transpose([0,2,1])).reshape(
            (self.batch_size, self.channels*self.N*self.N))
#        self.feim = T.dot(self.efint, self.Fy.transpose([0,2,1])).reshape(
#            (self.batch_size, channels,self.N,self.N))
        return g * self.fim, (gx, gy, dx, dy, self.fint)  # or: T.concatenate([self.fim, self.feim], axis=1)
Example #5
    def output(self, train):
        X = self.get_input(train) # shape: (nb_samples, time (padded with zeros at the end), input_dim)
        # new shape: (time, nb_samples, input_dim) -> because theano.scan iterates over main dimension
        X = X.dimshuffle((1, 0, 2))

        xf = self.activation(T.dot(X, self.W_if) + self.b_if)
        xb = self.activation(T.dot(X, self.W_ib) + self.b_ib)
        b_o=self.b_o
        b_on = T.repeat(
            T.repeat(b_o.reshape((1, self.output_dim)), X.shape[0], axis=0).reshape((1, X.shape[0], self.output_dim)),
            X.shape[1], axis=0)
        # Iterate forward over the first dimension of the x array (=time).
        outputs_f, updates_f = theano.scan(
            self._step,  # this will be called with arguments (sequences[i], outputs[i-1], non_sequences[i])
            sequences=xf,  # tensors to iterate over, inputs to _step
            # initialization of the output. Input to _step with default tap=-1.
            outputs_info=alloc_zeros_matrix(X.shape[1], self.output_dim),
            non_sequences=[self.W_ff,self.b_f],  # static inputs to _step
            truncate_gradient=self.truncate_gradient
        )
        # Iterate backward over the first dimension of the x array (=time).
        outputs_b, updates_b = theano.scan(
            self._step,  # this will be called with arguments (sequences[i], outputs[i-1], non_sequences[i])
            sequences=xb,  # tensors to iterate over, inputs to _step
            # initialization of the output. Input to _step with default tap=-1.
            outputs_info=alloc_zeros_matrix(X.shape[1], self.output_dim),
            non_sequences=[self.W_bb,self.b_b],  # static inputs to _step
            truncate_gradient=self.truncate_gradient,
            go_backwards=True  # Iterate backwards through time
        )
        #return outputs_f.dimshuffle((1, 0, 2))
        if self.return_sequences:
            return T.add(
                T.tensordot(
                    T.add(outputs_f.dimshuffle((1, 0, 2)), outputs_b[::-1].dimshuffle((1, 0, 2))),
                    self.W_o, [[2], [0]]),
                b_on)
        return T.concatenate((outputs_f[-1], outputs_b[0]))
def keep_max(input, theta, k, sent_mask):
    sig_input = T.nnet.sigmoid(T.dot(input, theta))
    sent_mask = sent_mask.dimshuffle(0, 'x', 1, 'x')
    sig_input = sig_input * sent_mask
    #sig_input = T.dot(input, theta)
    if k == 0:
        result = input * T.addbroadcast(sig_input, 3)
        return result, sig_input

    # get the sorted idx
    sort_idx = T.argsort(sig_input, axis=2)
    k_max_ids = sort_idx[:,:,-k:,:]
    dim0, dim1, dim2, dim3 = k_max_ids.shape
    batchids = T.repeat(T.arange(dim0), dim1*dim2*dim3)
    mapids = T.repeat(T.arange(dim1), dim2*dim3).reshape((1, dim2*dim3))
    mapids = T.repeat(mapids, dim0, axis=0).flatten()
    rowids = k_max_ids.flatten()
    colids = T.arange(dim3).reshape((1, dim3))
    colids = T.repeat(colids, dim0*dim1*dim2, axis=0).flatten()
    sig_mask = T.zeros_like(sig_input)
    choosed = sig_input[batchids, mapids, rowids, colids]
    sig_mask = T.set_subtensor(sig_mask[batchids, mapids, rowids, colids], 1)
    input_mask = sig_mask * sig_input
    result = input * T.addbroadcast(input_mask, 3)
    return result, sig_input
Example #7
    def __theano__unpool(self, inp, us, dim=None, issequence=False):

        # Determine the dimensionality of convolution (2 or 3?)
        if dim is None:
            dim = 3 if not issequence and len(us) == 3 and inp.ndim == 5 else 2

        # Reshape 2D sequential data if required
        # Log input shape
        inpshape = inp.shape
        reallyissequential = issequence and inp.ndim == 5
        if issequence:
            if reallyissequential:
                # Reshape
                inp = inp.reshape((inpshape[0] * inpshape[1], inpshape[2], inpshape[3], inpshape[4]), ndim=4)
                us = us[0:2]
            else:
                warn("Expected 5D sequential output, but got 4D non-sequential instead.")

        if dim == 2:
            y = T.repeat(T.repeat(inp, us[0], axis=2), us[1], axis=3)
        elif dim == 3:
            y = T.repeat(T.repeat(T.repeat(inp, us[0], axis=3), us[1], axis=4), us[2], axis=1)
        else:
            raise NotImplementedError("Upsampling is implemented in 2D and 3D.")

        if issequence and reallyissequential:
            # Reshape back to sequential layout (spatial dims are now scaled by the upsampling factors)
            y = y.reshape((inpshape[0], inpshape[1], inpshape[2], us[0] * inpshape[3], us[1] * inpshape[4]), ndim=5)

        return y
def keep_max(input, theta, k):
    """
    :type input: theano.tensor.tensor4
    :param input: the input data

    :type theta: theano.tensor.matrix
    :param theta: the parameter of the sigmoid gating function

    :type k: int
    :param k: the number of top-scoring sentences to keep
    """
    sig_input = T.nnet.sigmoid(T.dot(input, theta))
    if k == 0: # using all the sentences
        result = input * T.addbroadcast(sig_input, 3)
        return result, sig_input

    # get the sorted idx
    sort_idx = T.argsort(sig_input, axis=2)
    k_max_ids = sort_idx[:,:,-k:,:]
    dim0, dim1, dim2, dim3 = k_max_ids.shape
    batchids = T.repeat(T.arange(dim0), dim1*dim2*dim3)
    mapids = T.repeat(T.arange(dim1), dim2*dim3).reshape((1, dim2*dim3))
    mapids = T.repeat(mapids, dim0, axis=0).flatten()
    rowids = k_max_ids.flatten()
    colids = T.arange(dim3).reshape((1, dim3))
    colids = T.repeat(colids, dim0*dim1*dim2, axis=0).flatten()
    # construct masked data
    sig_mask = T.zeros_like(sig_input)
    choosed = sig_input[batchids, mapids, rowids, colids]
    sig_mask = T.set_subtensor(sig_mask[batchids, mapids, rowids, colids], 1)

    input_mask = sig_mask * sig_input
    result = input * T.addbroadcast(input_mask, 3)
    return result, sig_input
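
# Hedged usage sketch (shapes assumed from the indexing above): `input` is a
# 4D tensor (batch, n_maps, n_sentences, feature_dim) and `theta` has shape
# (feature_dim, 1), so the sigmoid produces one gate value per sentence; only
# the k highest-scoring sentences in each map keep their activations.
doc = T.tensor4('doc')
theta = T.matrix('theta')              # (feature_dim, 1)
kept, gates = keep_max(doc, theta, k=3)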
 def initial_states(self, batch_size, *args, **kwargs):
     return [
         tensor.repeat(self.initial_state_[None, :], batch_size, 0),
         tensor.repeat(self.initial_cells[None, :], batch_size, 0),
         tensor.repeat(self.initial_location[None, :], batch_size, 0),
         tensor.repeat(self.initial_scale[None, :], batch_size, 0),
     ]
Example #10
 def create_prediction(self):  # method that performs one prediction pass
     gfs=self.gfs
     pm25in=self.pm25in
     # initial forward pass for the first step
     x=T.concatenate([gfs[:,0],gfs[:,1],gfs[:,2],pm25in[:,0],pm25in[:,1],self.cnt[:,:,0]],axis=1)
     if self.celltype==RNN:
         init_hiddens = [(T.repeat(T.shape_padleft(create_shared(layer.hidden_size, name="RNN.initial_hidden_state")),
                                   x.shape[0], axis=0)
                          if x.ndim > 1 else create_shared(layer.hidden_size, name="RNN.initial_hidden_state"))
                         if hasattr(layer, 'initial_hidden_state') else None
                         for layer in self.model.layers]
     if self.celltype==LSTM:
         init_hiddens = [(T.repeat(T.shape_padleft(create_shared(layer.hidden_size * 2, name="LSTM.initial_hidden_state")),
                                   x.shape[0], axis=0)
                          if x.ndim > 1 else create_shared(layer.hidden_size * 2, name="LSTM.initial_hidden_state"))
                         if hasattr(layer, 'initial_hidden_state') else None
                         for layer in self.model.layers]
     self.layerstatus=self.model.forward(x,init_hiddens)
     #results.shape?40*1
     self.results=self.layerstatus[-1]
     if self.steps > 1:
         self.layerstatus=self.model.forward(T.concatenate([gfs[:,1],gfs[:,2],gfs[:,3],pm25in[:,1],self.results,self.cnt[:,:,1]],axis=1),self.layerstatus)
         self.results=T.concatenate([self.results,self.layerstatus[-1]],axis=1)      
         # then forward-propagate the remaining steps-2 times
         for i in xrange(2,self.steps):
             self.layerstatus=self.model.forward(T.concatenate([gfs[:,i],gfs[:,i+1],gfs[:,i+2],T.shape_padright(self.results[:,i-2]),T.shape_padright(self.results[:,i-1]),self.cnt[:,:,i]],axis=1),self.layerstatus)
             #need T.shape_padright???
             self.results=T.concatenate([self.results,self.layerstatus[-1]],axis=1)
     return self.results
Example #11
    def fprop(self, X):
        w, z = X 
        batch_size, num_channel, height, width = self.glimpse_shape
        w = w.reshape((batch_size*num_channel, height, width))
       
        centey = z[:, 0]
        centex = z[:, 1]
        logdel = z[:, 2]
        logsig = z[:, 3]
        loggam = z[:, 4]

        centy = 0.5 * (self.input_shape[2] + 1) * (centey + 1)
        centx = 0.5 * (self.input_shape[3] + 1) * (centex + 1)
        delta = T.exp(logdel)
        delta = (max(self.input_shape[2], self.input_shape[3]) - 1) * delta /\
                 (max(self.glimpse_shape[2], self.glimpse_shape[3]) - 1)
        sigma = T.exp(0.5 * logsig)
        gamma = T.exp(loggam).dimshuffle(0, 'x')

        Fy, Fx = self.filter_bank(centx, centy, delta, sigma)
        if num_channel > 1:
            Fx = T.repeat(Fx, num_channel, axis=0)
            Fy = T.repeat(Fy, num_channel, axis=0)

        I = batched_dot(batched_dot(Fy.transpose(0, 2, 1), w), Fx)
        reshape_shape = (batch_size, num_channel*self.input_shape[2]*self.input_shape[3])
        return I.reshape(reshape_shape) / gamma
Example #12
    def fprop(self, X):
        x, x_hat, z = X
        batch_size, num_channel, height, width = self.input_shape
        x = x.reshape((batch_size*num_channel, height, width))
        x_hat = x_hat.reshape((batch_size*num_channel, height, width))

        centey = z[:, 0]
        centex = z[:, 1]
        logdel = z[:, 2]
        logsig = z[:, 3]
        loggam = z[:, 4]

        centy = 0.5 * (self.input_shape[2] + 1) * (centey + 1)
        centx = 0.5 * (self.input_shape[3] + 1) * (centex + 1)
        delta = T.exp(logdel)
        delta = (max(self.input_shape[2], self.input_shape[3]) - 1) * delta /\
                 (max(self.glimpse_shape[2], self.glimpse_shape[3]) - 1)
        sigma = T.exp(0.5 * logsig)
        gamma = T.exp(loggam).dimshuffle(0, 'x')

        Fy, Fx = self.filter_bank(centx, centy, delta, sigma)
        if num_channel > 1:
            Fx = T.repeat(Fx, num_channel, axis=0)
            Fy = T.repeat(Fy, num_channel, axis=0)

        x = batched_dot(batched_dot(Fy, x), Fx.transpose(0, 2, 1))
        x_hat = batched_dot(batched_dot(Fy, x_hat), Fx.transpose(0, 2, 1))
        reshape_shape = (batch_size,
                         num_channel*self.glimpse_shape[2]*self.glimpse_shape[3])
        return gamma * T.concatenate([x.reshape(reshape_shape), x_hat.reshape(reshape_shape)], axis=1)
Example #13
    def get_output(self, train=False):
        X = self.get_input(train)
        # mask = self.get_padded_shuffled_mask(train, X, pad=0)
        mask = self.get_input_mask(train=train)
        ind = T.switch(T.eq(mask[:, -1], 1.), mask.shape[-1], T.argmin(mask, axis=-1)).astype('int32').ravel()
        max_time = T.max(ind)
        X = X.dimshuffle((1, 0, 2))
        Y = T.dot(X, self.W) + self.b
        # h0 = T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
        h0 = T.repeat(self.h_m1, X.shape[1], axis=0)
        c0 = T.repeat(self.c_m1, X.shape[1], axis=0)

        [outputs, _], updates = theano.scan(
            self._step,
            sequences=Y,
            outputs_info=[h0, c0],
            non_sequences=[self.R], n_steps=max_time,
            truncate_gradient=self.truncate_gradient, strict=True,
            allow_gc=theano.config.scan.allow_gc)

        res = T.concatenate([h0.dimshuffle('x', 0, 1), outputs], axis=0).dimshuffle((1, 0, 2))
        if self.return_sequences:
            return res
        #return outputs[-1]
        return res[T.arange(mask.shape[0], dtype='int32'), ind]
    def construct_graph_ref(self, args, x, length, popstats=None):

        p = self.allocate_parameters(args)

        if args.baseline:
            def bn(x, gammas, betas):
                return x + betas
        else:
            def bn(x, gammas, betas):
                mean, var = x.mean(axis=0, keepdims=True), x.var(axis=0, keepdims=True)
                # if only
                mean.tag.batchstat, var.tag.batchstat = True, True
                #var = T.maximum(var, args.epsilon)
                var = var + args.epsilon
                return (x - mean) / T.sqrt(var) * gammas + betas

        def stepfn(x, dummy_h, dummy_c, h, c):
            # a_mean, b_mean, c_mean,
            # a_var, b_var, c_var):

            a_mean, b_mean, c_mean = 0, 0, 0
            a_var, b_var, c_var = 0, 0, 0

            atilde = T.dot(h, p.Wa)
            btilde = x
            a_normal = bn(atilde, p.a_gammas, p.ab_betas)
            b_normal = bn(btilde, p.b_gammas, 0)
            ab = a_normal + b_normal
            g, f, i, o = [fn(ab[:, j * args.num_hidden:(j + 1) * args.num_hidden])
                          for j, fn in enumerate([self.activation] + 3 * [T.nnet.sigmoid])]
            c = dummy_c + f * c + i * g
            c_normal = bn(c, p.c_gammas, p.c_betas)
            h = dummy_h + o * self.activation(c_normal)
            return h, c, atilde, btilde, c_normal



        xtilde = T.dot(x, p.Wx)

        if args.noise:
            # prime h with white noise
            Trng = MRG_RandomStreams()
            h_prime = Trng.normal((xtilde.shape[1], args.num_hidden), std=args.noise)
        elif args.summarize:
            # prime h with mean of example
            h_prime = x.mean(axis=[0, 2])[:, None]
        else:
            h_prime = 0

        dummy_states = dict(h=T.zeros((xtilde.shape[0], xtilde.shape[1], args.num_hidden)),
                            c=T.zeros((xtilde.shape[0], xtilde.shape[1], args.num_hidden)))

        [h, c, atilde, btilde, htilde], _ = theano.scan(
            stepfn,
            sequences=[xtilde, dummy_states["h"], dummy_states["c"]],
            outputs_info=[T.repeat(p.h0[None, :], xtilde.shape[1], axis=0) + h_prime,
                          T.repeat(p.c0[None, :], xtilde.shape[1], axis=0),
                          None, None, None])
        return dict(h=h, c=c,
                    atilde=atilde, btilde=btilde, htilde=htilde), [], dummy_states, popstats
Example #15
 def initial_state_with_taps(self, num=None):
     if num is not None:
         cell = T.repeat(self.default_cell, num, axis=0)
         output = T.repeat(self.default_output, num, axis=0)
     else:
         cell = self.default_cell
         output = self.default_output
     return dict(initial=output, taps=[-1]), dict(initial=cell, taps=[-1])
Example #16
def softmax(y):
    # numerically stable softmax over axis 2: subtract the per-row max before exponentiating
    y_max = T.max(y, axis=2)
    y_max_rep = y_max.reshape((y_max.shape[0], y_max.shape[1], 1))
    y_opt = y - T.repeat(y_max_rep, y.shape[2], axis=2)
    y_sum = T.sum(T.exp(y_opt), axis=2)
    y_reshape = y_sum.reshape((y_sum.shape[0], y_sum.shape[1], 1))
    a = T.exp(y_opt) / T.repeat(y_reshape, y.shape[2], axis=2)
    return a
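
# Hedged usage sketch: a numerically stabilised softmax over axis 2 of a 3D
# tensor, e.g. per-timestep class scores of shape (batch, time, n_classes).
scores = T.tensor3('scores')
probs = softmax(scores)   # same shape; entries along axis 2 sum to 1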
    def apply(self, x):

        # lazy hack
        h0 = self.parameters[0]
        c0 = self.parameters[1]
        Wa = self.parameters[2]
        Wx = self.parameters[3]
        if self.baseline:
            ab_betas = self.parameters[4]
            h_betas = self.parameters[5]
            a_gammas = None
            b_gammas = None
            h_gammas = None
        else:
            a_gammas = self.parameters[4]
            b_gammas = self.parameters[5]
            h_gammas = self.parameters[6]
            ab_betas = self.parameters[7]
            h_betas = self.parameters[8]

        xtilde = tensor.dot(x, Wx)

        if self.noise:
            # prime h with white noise
            Trng = MRG_RandomStreams()
            h_prime = Trng.normal((xtilde.shape[1], self.state_dim), std=self.noise)  # self.noise, not args.noise (args is not in scope here)
        #elif args.summarize:
        #    # prime h with summary of example
        #    Winit = theano.shared(orthogonal((nclasses, self.state_dim)), name="Winit")
        #    parameters.append(Winit)
        #    h_prime = tensor.dot(x, Winit).mean(axis=0)
        else:
            h_prime = 0

        dummy_states = dict(h=tensor.zeros((xtilde.shape[0], xtilde.shape[1], self.state_dim)),
                            c=tensor.zeros((xtilde.shape[0], xtilde.shape[1], self.state_dim)))

        def stepfn(xtilde, dummy_h, dummy_c, h, c):
            atilde = tensor.dot(h, Wa)
            btilde = xtilde
            a = self.bn(atilde, a_gammas, ab_betas)
            b = self.bn(btilde, b_gammas, 0)
            ab = a + b
            g, f, i, o = [fn(ab[:, j * self.state_dim:(j + 1) * self.state_dim])
                          for j, fn in enumerate([self.children[0].apply] + 3 * [tensor.nnet.sigmoid])]
            c = dummy_c + f * c + i * g
            htilde = c
            h = dummy_h + o * self.children[0].apply(self.bn(htilde, h_gammas, h_betas))
            return h, c, atilde, btilde, htilde

        [h, c, atilde, btilde, htilde], _ = theano.scan(
            stepfn,
            sequences=[xtilde, dummy_states["h"], dummy_states["c"]],
            outputs_info=[tensor.repeat(h0[None, :], xtilde.shape[1], axis=0) + h_prime,
                          tensor.repeat(c0[None, :], xtilde.shape[1], axis=0),
                          None, None, None])
        #return dict(h=h, c=c, atilde=atilde, btilde=btilde, htilde=htilde), dummy_states, parameters
        return h
Example #18
    def output(self, train):
        X = self.get_input(train)
        X = X.dimshuffle((1,0,2))


        if self.is_entity:
            Entity = X[-1:].dimshuffle(1,0,2)
            X = X[:-1]

        b_y = self.b_y
        b_yn = T.repeat(
            T.repeat(b_y.reshape((1, self.output_dim)), X.shape[0], axis=0).reshape((1, X.shape[0], self.output_dim)),
            X.shape[1], axis=0)

        xif = T.dot(X, self.W_if) + self.b_if
        xib = T.dot(X, self.W_ib) + self.b_ib

        xff = T.dot(X, self.W_ff) + self.b_ff
        xfb = T.dot(X, self.W_fb) + self.b_fb

        xcf = T.dot(X, self.W_cf) + self.b_cf
        xcb = T.dot(X, self.W_cb) + self.b_cb

        xof = T.dot(X, self.W_of) + self.b_of
        xob = T.dot(X, self.W_ob) + self.b_ob

        [outputs_f, memories_f], updates_f = theano.scan(
            self._step,
            sequences=[xif, xff, xof, xcf],
            outputs_info=[
                alloc_zeros_matrix(X.shape[1], self.output_dim),
                alloc_zeros_matrix(X.shape[1], self.output_dim)
            ],
            non_sequences=[self.U_if, self.U_ff, self.U_of, self.U_cf],
            truncate_gradient=self.truncate_gradient
        )
        [outputs_b, memories_b], updates_b = theano.scan(
            self._step,
            sequences=[xib, xfb, xob, xcb],
            outputs_info=[
                alloc_zeros_matrix(X.shape[1], self.output_dim),
                alloc_zeros_matrix(X.shape[1], self.output_dim)
            ],
            non_sequences=[self.U_ib, self.U_fb, self.U_ob, self.U_cb],
            truncate_gradient=self.truncate_gradient
        )
        if self.return_sequences:
            y = T.add(T.add(
                    T.tensordot(outputs_f.dimshuffle((1,0,2)), self.W_yf, [[2],[0]]),
                    T.tensordot(outputs_b[::-1].dimshuffle((1,0,2)), self.W_yb, [[2],[0]])),
                b_yn)
            # y = T.add(T.tensordot(
            #     T.add(outputs_f.dimshuffle((1, 0, 2)),
            #           outputs_b[::-1].dimshuffle((1,0,2))),
            #     self.W_y,[[2],[0]]),b_yn)
            if self.is_entity:
                return T.concatenate([y, Entity], axis=1)
            else:
                return y
        return T.concatenate((outputs_f[-1], outputs_b[0]))
    def apply(self, x):
        x_to_inter = T.concatenate([self.x_to_f,
                                    self.x_to_i,
                                    self.x_to_g,
                                    self.x_to_o],
                                   axis=1)

        h_to_inter = T.concatenate([self.h_to_f,
                                    self.h_to_i,
                                    self.h_to_g,
                                    self.h_to_o],
                                   axis=1)

        b_inter = T.concatenate([self.b_f,
                                 self.b_i,
                                 self.b_g,
                                 self.b_o])
        
        x_feat = x.dot(x_to_inter) + b_inter.dimshuffle('x', 'x', 0)
        x_feat = x_feat.dimshuffle(1, 0, 2)

        initial_h = T.repeat(self.h,
                             x.shape[0],
                             axis=0)

        initial_c = T.repeat(self.c,
                             x.shape[0],
                             axis=0)


        def step(x_feat, h, c, h_to_inter):
            
            intermediates = T.tanh(x_feat + h.dot(h_to_inter))
    
            i = intermediates[:, :self.num_hidden]
            o = intermediates[:, self.num_hidden:2 * self.num_hidden]
            f = intermediates[:, 2 * self.num_hidden:3 * self.num_hidden]
            g = intermediates[:, 3 * self.num_hidden:]
    
            i = T.nnet.sigmoid(i)
            o = T.nnet.sigmoid(o)
            f = T.nnet.sigmoid(f)
            g = T.tanh(g)
                
            new_c = f * c + i * g
            new_h = o * new_c
        
            return new_h, new_c
    
        outputs, _ = theano.scan(fn=step,
                                 sequences=[x_feat],                     
                                 outputs_info=[dict(initial=initial_h),
                                               dict(initial=initial_c)],
                                 non_sequences=[h_to_inter])

        _, states = outputs

        return states.dimshuffle(1, 0, 2)
Example #20
    def __init__(self, rng, x, n_in, n_h, p, training, rnn_batch_training=False):
        """ This is to initialise a standard RNN hidden unit

        :param rng: random state; a fixed seed gives reproducible results
        :param x: input data to current layer
        :param n_in: dimension of input data
        :param n_h: number of hidden units/blocks
        :param p: the probability of dropout
        :param training: a binary value to indicate training or testing (for dropout training)
        """
        self.input = x

        if p > 0.0:
            if training==1:
                srng = RandomStreams(seed=123456)
                self.input = T.switch(srng.binomial(size=x.shape,p=p), x, 0)
            else:
                self.input =  (1-p) * x #(1-p) *

        self.n_in = int(n_in)
        self.n_h  = int(n_h)

        self.rnn_batch_training = rnn_batch_training

        # random initialisation
        Wx_value = np.asarray(rng.normal(0.0, 1.0/np.sqrt(n_in), size=(n_in, n_h)), dtype=config.floatX)
        Wh_value = np.asarray(rng.normal(0.0, 1.0/np.sqrt(n_h), size=(n_h, n_h)), dtype=config.floatX)

        # Input gate weights
        self.W_xi = theano.shared(value=Wx_value, name='W_xi')
        self.W_hi = theano.shared(value=Wh_value, name='W_hi')

        # bias
        self.b_i = theano.shared(value=np.zeros((n_h, ), dtype=config.floatX), name='b_i')


        # initial value of hidden and cell state
        if self.rnn_batch_training:
            self.h0 = theano.shared(value=np.zeros((1, n_h), dtype = config.floatX), name = 'h0')
            self.c0 = theano.shared(value=np.zeros((1, n_h), dtype = config.floatX), name = 'c0')

            self.h0 = T.repeat(self.h0, x.shape[1], 0)
            self.c0 = T.repeat(self.c0, x.shape[1], 0)
        else:
            self.h0 = theano.shared(value=np.zeros((n_h, ), dtype = config.floatX), name = 'h0')
            self.c0 = theano.shared(value=np.zeros((n_h, ), dtype = config.floatX), name = 'c0')


        self.Wix = T.dot(self.input, self.W_xi)

        [self.h, self.c], _ = theano.scan(self.recurrent_as_activation_function, sequences = [self.Wix],
                                                                      outputs_info = [self.h0, self.c0])

        self.output = self.h

        self.params = [self.W_xi, self.W_hi, self.b_i]

        self.L2_cost = (self.W_xi ** 2).sum() + (self.W_hi ** 2).sum()
Example #21
 def seq_score(out_matrix,img_matrix):
     out_len=out_matrix.shape[0]
     img_len=img_matrix.shape[0]
     k_mat=T.repeat(T.arange(out_len).reshape((1,out_len)),img_len,axis=0)
     j_mat=T.repeat(T.arange(img_len).reshape((img_len,1)),out_len,axis=1)
     #entityscore=T.dot(entity,img_matrix.T)
     eye=T.eye(out_len,img_len)
     eye=eye/T.sum(eye)
     return T.sum(T.dot(out_matrix,img_matrix.T)*eye)
Example #22
def mmd_full(x_t, y_t, alpha=0.5):
    """ Implementation of the full kernel MMD statistic (gaussian kernel)"""
    N = x_t.shape[1]
    M = y_t.shape[1]

    term1 = T.mean(T.exp(-0.5 * (1 / alpha) * T.square(T.repeat(x_t, N) - T.tile(x_t, N))))
    term2 = T.mean(T.exp(-0.5 * (1 / alpha) * T.square(T.repeat(x_t, M) - T.tile(y_t, N))))
    term3 = T.mean(T.exp(-0.5 * (1 / alpha) * T.square(T.repeat(y_t, M) - T.tile(y_t, M))))
    return term1 - 2 * term2 + term3
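
# Hedged usage sketch (layout assumed from the .shape[1] reads above): the
# samples are expected as row vectors of shape (1, n_samples), so that
# T.repeat / T.tile enumerate all sample pairs for the Gaussian kernel terms.
x_t = T.matrix('x_t')   # (1, N) samples from P
y_t = T.matrix('y_t')   # (1, M) samples from Q
mmd = mmd_full(x_t, y_t, alpha=0.5)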
def get_input_vectors(shape, phases, scaling, offset):
    x = T.repeat(offset[0] + T.arange(shape[0]) / scaling, shape[1] * phases).reshape(
        (shape[0], shape[1], phases)) * T.pow(2, T.arange(phases))
    y = T.repeat(T.tile(offset[1] + T.arange(shape[1]) / scaling, shape[0]).reshape(
        (shape[0], shape[1], 1)), phases, axis=2) * T.pow(2, T.arange(phases))
    z = T.tile(offset[2] + 10 * T.arange(phases), shape[0] * shape[1]).reshape((shape[0], shape[1], phases, 1))
    x = x.reshape((shape[0], shape[1], phases, 1))
    y = y.reshape((shape[0], shape[1], phases, 1))
    return T.concatenate([x, y, z], axis=3).reshape((shape[0] * shape[1] * phases, 3)).astype('float32')
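
# Hedged usage sketch (argument types assumed): `shape` is (rows, cols),
# `phases` an int and `offset` a length-3 sequence; the result stacks an
# (x, y, z) coordinate triple for every grid cell and phase, giving a symbolic
# float32 matrix of shape (rows * cols * phases, 3).
coords = get_input_vectors((32, 32), phases=4, scaling=16.0, offset=(0.0, 0.0, 0.0))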
Example #24
    def get_output_for(self, input, **kwargs):
        mu = input[0]
        sigma = input[1]

        x_range = T.arange(0, self.max_support).dimshuffle('x', 0)
        mu = T.repeat(mu, self.max_support, axis=1)
        sigma = T.repeat(sigma, self.max_support, axis=1)
        x = (x_range - mu) / (sigma * T.sqrt(2.) + 1e-16)
        cdf = (T.erf(x) + 1.) / 2.
        return cdf
 def get_output_for(self, input, **kwargs):
     mu = input
     batch_size, num_latent = mu.shape
     shp = (batch_size, self.eq_samples, self.iw_samples, num_latent)
     mu_shp = mu.dimshuffle(0,'x','x',1)
     mu_shp = T.repeat(mu_shp, axis=1, repeats=self.eq_samples)
     mu_shp = T.repeat(mu_shp, axis=2, repeats=self.iw_samples)
     samples = self._srng.binomial(
         size=shp, p=mu_shp, dtype=theano.config.floatX)
     return samples.reshape((-1, num_latent))
Example #26
def micro_activate(x, w, b, act):
    if x.ndim>1:
        if act is None:
            return T.dot(x,w) + T.repeat(b, x.shape[0], axis=0)
        return act(T.dot(x, w) + T.repeat(b, x.shape[0], axis=0))
    else:
        if act is None:
            res = T.dot(w.T, x) + b
        else:
            res = act(T.dot(w.T, x) + b)
        return res.flatten()
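
# Hedged usage sketch: with a matrix input the bias is broadcast over rows via
# T.repeat; with a vector input the transposed weights are applied directly.
# `b` is assumed to have shape (1, n_out) in the matrix case.
X = T.matrix('X')                      # (n_samples, n_in)
W = T.matrix('W')                      # (n_in, n_out)
b = T.matrix('b')                      # (1, n_out)
hidden = micro_activate(X, W, b, T.nnet.sigmoid)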
Example #27
 def _prepare_outputs_info(self, x_dot_w):
     if self.learn_init_state:
         outputs_info = [
             T.repeat(self.init_c.dimshuffle('x', 0), x_dot_w.shape[1], axis=0),
             T.repeat(self.init_h.dimshuffle('x', 0), x_dot_w.shape[1], axis=0),
         ]
     else:
         outputs_info = [
             self.init_c,
             self.init_h
         ]
     return outputs_info
Example #28
def sqdist(a,b,data_num = 59000,dimen = 80):
    a = T.transpose(a)
    b = T.transpose(b)
    aa = T.reshape(T.sum(a ** 2, 0),(1,data_num))
    bb = T.reshape(T.sum(b ** 2, 0),(1,dimen))
    ab= T.dot(T.transpose(a),b)
    d = T.repeat(T.transpose(aa),bb.shape[1],axis=1) + T.repeat(bb,aa.shape[1],axis = 0) - 2*ab
    sigma = T.mean(d)
    d = T.exp(-d / (2 * sigma))
    mvec = T.reshape(T.mean(d, 0),(1,dimen))
    d = d - T.repeat(mvec, d.shape[0], axis=0)
    return d,sigma,mvec
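
# Hedged usage sketch (shapes inferred from the reshape calls above): `a` is a
# (data_num, feat) design matrix and `b` a (dimen, feat) matrix of reference
# points with the same feature dimension; the function returns an exponentiated,
# mean-centred distance kernel of shape (data_num, dimen) plus sigma and the
# column means.
a = T.matrix('a')   # (59000, feat) with the defaults
b = T.matrix('b')   # (80, feat) with the defaults
d, sigma, mvec = sqdist(a, b)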
    def __init__(self, dnodex,inputdim,dim):
        X = T.ivector()
        Y = T.ivector()
        Z = T.lscalar()
        NP = T.ivector()
        lambd = T.scalar()
        eta = T.scalar()
        temperature = T.scalar()
        num_input = inputdim
        self.umatrix = theano.shared(floatX(np.random.rand(dnodex.nuser, inputdim, inputdim)))
        self.pmatrix = theano.shared(floatX(np.random.rand(dnodex.npoi, inputdim)))
        self.p_l2_norm=(self.pmatrix**2).sum()
        self.u_l2_norm=(self.umatrix**2).sum()
        num_hidden = dim
        num_output = inputdim
        inputs = InputPLayer(self.pmatrix[X,:], self.umatrix[Z,:,:], name="inputs")
        lstm1 = LSTMLayer(num_input, num_hidden, input_layer=inputs, name="lstm1")
        #lstm2 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm1, name="lstm2")
        #lstm3 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm2, name="lstm3")
        softmax = SoftmaxPLayer(num_hidden, num_output, self.umatrix[Z,:,:], input_layer=lstm1, name="yhat", temperature=temperature)

        Y_hat = softmax.output()

        self.layers = inputs, lstm1,softmax
        params = get_params(self.layers)
        #caches = make_caches(params)

        tmp_u=T.mean(T.dot(self.pmatrix[X,:],self.umatrix[Z,:,:]),axis=0)
        tr=T.dot(tmp_u,(self.pmatrix[X,:]-self.pmatrix[NP,:]).transpose())
        pfp_loss1=sigmoid(tr)
        pfp_loss=pfp_loss1*(T.ones_like(pfp_loss1)-pfp_loss1)
        tmp_u1=T.reshape(T.repeat(tmp_u,X.shape[0]),(inputdim,X.shape[0])).T
        pfp_lossv=T.reshape(T.repeat(pfp_loss,inputdim),(inputdim,X.shape[0])).T
        cost = lambd*10*T.mean(T.nnet.categorical_crossentropy(Y_hat, T.dot(self.pmatrix[Y,:],self.umatrix[Z,:,:])))+lambd*self.p_l2_norm+lambd*self.u_l2_norm
    #    updates = PerSGD(cost,params,eta,X,Z,dnodex)#momentum(cost, params, caches, eta)
        updates = []
        grads = T.grad(cost=cost, wrt=params)
        updates.append([self.pmatrix,T.set_subtensor(self.pmatrix[X,:],self.pmatrix[X,:]-eta*grads[0])])
        updates.append([self.umatrix,T.set_subtensor(self.umatrix[Z,:,:],self.umatrix[Z,:,:]-eta*grads[1])])
        for p,g in zip(params[2:], grads[2:]):
            updates.append([p, p - eta * g])

        rlist=T.argsort(T.dot(tmp_u,self.pmatrix.T))[::-1]
        n_updates=[(self.pmatrix, T.set_subtensor(self.pmatrix[NP,:],self.pmatrix[NP,:]-eta*pfp_lossv*tmp_u1-eta*lambd*self.pmatrix[NP,:]))]
        p_updates=[(self.pmatrix, T.set_subtensor(self.pmatrix[X,:],self.pmatrix[X,:]+eta*pfp_lossv*tmp_u1-eta*lambd*self.pmatrix[X,:])),(self.umatrix, T.set_subtensor(self.umatrix[Z,:,:],self.umatrix[Z,:,:]+eta*T.mean(pfp_loss)*(T.reshape(tmp_u,(tmp_u.shape[0],1))*T.mean(self.pmatrix[X,:]-self.pmatrix[NP,:],axis=0)))-eta*lambd*self.umatrix[Z,:,:])]
        
        self.train = theano.function([X,Y,Z, eta, lambd, temperature], cost, updates=updates, allow_input_downcast=True)
        self.trainpos=theano.function([X,NP,Z,eta, lambd],tmp_u, updates=p_updates,allow_input_downcast=True)
        self.trainneg=theano.function([X,NP,Z,eta, lambd],T.mean(pfp_loss), updates=n_updates,allow_input_downcast=True)
        
        
        self.predict_pfp = theano.function([X,Z], rlist, allow_input_downcast=True)
Example #30
def log_prob_correct(mem, desired_output, cost_mask, max_int):
    """Compute log-probability of correctness over all registers."""
    cost = 0

    # Add epsilon to every log to avoid having inf in costs.
    epsilon = 1e-100

    samples = mem.shape[0]
    sample_idxs = repeat(shape_padright(arange(samples), 1), max_int, axis=1)
    cell_idxs = repeat(shape_padleft(arange(max_int), 1), samples, axis=0)
    vals = mem[sample_idxs, cell_idxs, desired_output]
    cost = (cost_mask * tensor.log(vals + epsilon)).sum(axis=1, keepdims=True)

    return cost
 def initial_states(self, batch_size, *args, **kwargs):
     return [
         tensor.repeat(self.parameters.initial_state[None, :], batch_size,
                       0)
     ]
Example #32
def MyRepeat(x, reps, axes):
    assert len(reps) == len(axes)
    y = x
    for r, a in zip(reps, axes):
        y = T.repeat(y, [r], axis=a)
    return y
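
# Hedged usage sketch: repeat a tensor along several axes in a single call.
x = T.matrix('x')
y = MyRepeat(x, reps=[2, 3], axes=[0, 1])   # rows doubled, then columns tripled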
 def training_cost_weighted(self, y, weights=None):
     """ Wrapper for standard name """
     loss = self.hinge_sq(y)
     weights = T.repeat(weights.dimshuffle('x', 0), y.shape[0], axis=0)
     factors = weights[T.arange(y.shape[0]), y]
     return T.sum(loss * factors)
Example #34
 def call(self, x, mask):
     Mean = x
     Std = T.repeat(T.exp(self.logstd)[None, :], Mean.shape[0], axis=0)
     return T.concatenate([Mean, Std], axis=1)
Example #35
def get_rupture_times_theano(slownesses, patch_size, nuc_x, nuc_y):
    """
    Does the same calculation as get_rupture_times_numpy
    just with symbolic variable input and output for theano graph
    implementation optimization.
    """
    [step_dip_max, step_str_max] = slownesses.shape
    StartTimes = tt.ones((step_dip_max, step_str_max)) * 1e8
    StartTimes = tt.set_subtensor(StartTimes[nuc_y, nuc_x], 0)

    # Stopping check var
    epsilon = theano.shared(0.1)
    err_val = theano.shared(1e6)

    # Iterator matrices
    dip1 = tt.repeat(tt.arange(step_dip_max), step_str_max)
    str1 = tt.tile(tt.arange(step_str_max), step_dip_max)

    dip2 = tt.repeat(tt.arange(step_dip_max), step_str_max)
    str2 = tt.tile(tt.arange(step_str_max - 1, -1, -1), step_dip_max)

    dip3 = tt.repeat(tt.arange(step_dip_max - 1, -1, -1), step_str_max)
    str3 = tt.tile(tt.arange(step_str_max - 1, -1, -1), step_dip_max)

    dip4 = tt.repeat(tt.arange(step_dip_max - 1, -1, -1), step_str_max)
    str4 = tt.tile(tt.arange(step_str_max), step_dip_max)

    DIP = tt.concatenate([dip1, dip2, dip3, dip4])
    STR = tt.concatenate([str1, str2, str3, str4])

    ### Upwind scheme ###
    def upwind(dip_ind, str_ind, StartTimes, slownesses, patch_size):
        [n_patch_dip, n_patch_str] = slownesses.shape
        zero = theano.shared(0)
        s1 = str_ind - 1
        d1 = dip_ind - 1
        s2 = str_ind + 1
        d2 = dip_ind + 1

        # if a < b return b
        checked_s1 = ifelse(tt.lt(s1, zero), zero, s1)
        checked_d1 = ifelse(tt.lt(d1, zero), zero, d1)

        # if a <= b return a - 1
        checked_s2 = ifelse(tt.le(n_patch_str, s2), n_patch_str - 1, s2)
        checked_d2 = ifelse(tt.le(n_patch_dip, d2), n_patch_dip - 1, d2)

        ST_xmin = tt.min(
            (StartTimes[checked_d1, str_ind], StartTimes[checked_d2, str_ind]))
        ST_ymin = tt.min(
            (StartTimes[dip_ind, checked_s1], StartTimes[dip_ind, checked_s2]))

        ### Eikonal equation solver ###
        # The unique solution to the equation
        # [(x-a)^+]^2 + [(x-b)^+]^2 = f^2 * h^2
        # where a = u_xmin, b = u_ymin, is
        #
        #         | min(a,b) + f*h,                           |a-b|>= f*h
        # xnew =  |
        #         |0.5 * [ a+b+sqrt( 2*f^2*h^2 - (a-b)^2 ) ], |a-b| < f*h
        start_new = ifelse(
            tt.le(slownesses[dip_ind, str_ind] * patch_size,
                tt.abs_(ST_xmin - ST_ymin)),
            tt.min((ST_xmin, ST_ymin)) + slownesses[dip_ind, str_ind] * \
                patch_size,
            (ST_xmin + ST_ymin + \
                tt.sqrt(2 * tt.pow(slownesses[dip_ind, str_ind], 2) * \
                                tt.pow(patch_size, 2) - \
                                tt.pow((ST_xmin - ST_ymin), 2)
                       )) / 2
                          )

        # if a < b return a
        output = ifelse(tt.lt(start_new, StartTimes[dip_ind, str_ind]),
                        start_new, StartTimes[dip_ind, str_ind])
        return tt.set_subtensor(
            StartTimes[dip_ind:dip_ind + 1, str_ind:str_ind + 1], output)

    def loop_upwind(StartTimes, PreviousTimes, err_val, iteration, epsilon):
        [results,
         updates] = theano.scan(fn=upwind,
                                sequences=[DIP, STR],
                                outputs_info=[StartTimes],
                                non_sequences=[slownesses, patch_size])

        StartTimes = results[-1]
        err_val = tt.sum(tt.sum(tt.pow((StartTimes - PreviousTimes), 2)))

        PreviousTimes = StartTimes.copy()
        return (StartTimes, PreviousTimes, err_val, iteration + 1), \
                theano.scan_module.until(err_val < epsilon)

    # while loop until err < epsilon
    iteration = theano.shared(0)
    PreviousTimes = StartTimes.copy()
    ([result, PreviousTimes, errs, Iteration], updates) = theano.scan(
        fn=loop_upwind,
        outputs_info=[StartTimes, PreviousTimes, err_val, iteration],
        non_sequences=[epsilon],
        n_steps=500)  # arbitrary set, stops after few iterations
    return result[-1]
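
# Hedged usage sketch (types assumed): a symbolic slowness grid and scalar
# patch size; nuc_x / nuc_y index the nucleation patch whose start time is 0.
slownesses = tt.matrix('slownesses')   # (n_patch_dip, n_patch_str)
patch_size = tt.scalar('patch_size')
start_times = get_rupture_times_theano(slownesses, patch_size, nuc_x=1, nuc_y=1)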
Example #36
 def _prepare_outputs_info(self, x_dot_w):
     outputs_info = [
         T.repeat(self.init_c.dimshuffle('x', 0), x_dot_w.shape[1], axis=0),
         T.repeat(self.init_h.dimshuffle('x', 0), x_dot_w.shape[1], axis=0),
     ]
     return outputs_info
Example #37
def matrixify(vector, n):
    # Cast n to int32 if necessary to prevent error on 32 bit systems
    return T.repeat(T.shape_padleft(vector),
                    n if (theano.configdefaults.local_bitwidth() == 64) else T.cast(n,'int32'),
                    axis=0)
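
# Hedged usage sketch: stack a vector into n identical rows, e.g. to broadcast
# an initial hidden state across a batch.
v = T.vector('v')
n = T.iscalar('n')
M = matrixify(v, n)   # shape (n, len(v))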
Example #38
def heaviside(x):
    return T.arange(0, 600).dimshuffle('x', 0) - T.repeat(x, 600, axis=1) >= 0
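
# Hedged usage sketch (shape assumed from the repeat above): `x` is a column of
# thresholds with shape (batch, 1); the result is a (batch, 600) boolean step
# mask that is True wherever the position index is >= the threshold.
thresholds = T.matrix('thresholds')   # (batch, 1)
mask = heaviside(thresholds)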
 def get_initial_hidden(self):
     return [T.repeat(self.hidden[None, :], self.batch_size, 0),
             T.repeat(self.cells[None, :], self.batch_size, 0)]
Example #40
    dis_layers[-1], {
        dis_in_x: T.concatenate([sym_x_l, sym_x_u_d], axis=0),
        dis_in_y: T.concatenate([sym_y, cla_out_y_d_hard], axis=0)
    },
    deterministic=False)
dis_out_p_g = ll.get_output(dis_layers[-1], {
    dis_in_x: gen_out_x,
    dis_in_y: sym_y_g
},
                            deterministic=False)

if objective_flag == 'integrate':
    # integrate
    dis_out_p_c = ll.get_output(
        dis_layers[-1], {
            dis_in_x: T.repeat(sym_x_u, num_classes, axis=0),
            dis_in_y: np.tile(np.arange(num_classes), batch_size_u_c)
        },
        deterministic=False)
elif objective_flag == 'argmax':
    # argmax approximation
    cla_out_y_hard = cla_out_y.argmax(axis=1)
    dis_out_p_c = ll.get_output(dis_layers[-1], {
        dis_in_x: sym_x_u,
        dis_in_y: cla_out_y_hard
    },
                                deterministic=False)
else:
    raise Exception('Unknown objective flags')

image = ll.get_output(gen_layers[-1], {
Example #41
def _interpolate(im, x, y, out_height, out_width, border_mode):
    # *_f are floats
    num_batch, height, width, channels = im.shape
    height_f = T.cast(height, theano.config.floatX)
    width_f = T.cast(width, theano.config.floatX)

    # clip coordinates to [-1, 1]
    if border_mode == 'nearest':
        x = T.clip(x, -1, 1)
        y = T.clip(y, -1, 1)
    # 0.9 1.0 1.1 -> 0.9 1.0 0.9
    elif border_mode == 'mirror':
        xa = T.mod(x + 1, 4) - 1
        ya = T.mod(y + 1, 4) - 1
        x = T.minimum(xa, 2 - xa)
        y = T.minimum(ya, 2 - ya)
    # 0.9 1.0 1.1 -> 0.9 1.0 -0.9
    elif border_mode == 'wrap':
        x = T.mod(x + 1, 2) - 1
        y = T.mod(y + 1, 2) - 1
    else:
        raise ValueError("border_mode must be one of "
                         "'nearest', 'mirror', 'wrap'")

    # scale coordinates from [-1, 1] to [0, width/height - 1]
    x = (x + 1) / 2 * (width_f - 1)
    y = (y + 1) / 2 * (height_f - 1)

    # obtain indices of the 2x2 pixel neighborhood surrounding the coordinates;
    # we need those in floatX for interpolation and in int64 for indexing. for
    # indexing, we need to take care they do not extend past the image.
    x0_f = T.floor(x)
    y0_f = T.floor(y)
    x1_f = x0_f + 1
    y1_f = y0_f + 1
    x0 = T.cast(x0_f, 'int64')
    y0 = T.cast(y0_f, 'int64')
    x1 = T.cast(T.minimum(x1_f, width_f - 1), 'int64')
    y1 = T.cast(T.minimum(y1_f, height_f - 1), 'int64')

    # The input is [num_batch, height, width, channels]. We do the lookup in
    # the flattened input, i.e [num_batch*height*width, channels]. We need
    # to offset all indices to match the flat version
    dim2 = width
    dim1 = width * height
    base = T.repeat(
        T.arange(num_batch, dtype='int64') * dim1, out_height * out_width)
    base_y0 = base + y0 * dim2
    base_y1 = base + y1 * dim2
    idx_a = base_y0 + x0
    idx_b = base_y1 + x0
    idx_c = base_y0 + x1
    idx_d = base_y1 + x1

    # use indices to lookup pixels for all samples
    im_flat = im.reshape((-1, channels))
    Ia = im_flat[idx_a]
    Ib = im_flat[idx_b]
    Ic = im_flat[idx_c]
    Id = im_flat[idx_d]

    # calculate interpolated values
    wa = ((x1_f - x) * (y1_f - y)).dimshuffle(0, 'x')
    wb = ((x1_f - x) * (y - y0_f)).dimshuffle(0, 'x')
    wc = ((x - x0_f) * (y1_f - y)).dimshuffle(0, 'x')
    wd = ((x - x0_f) * (y - y0_f)).dimshuffle(0, 'x')
    output = T.sum([wa * Ia, wb * Ib, wc * Ic, wd * Id], axis=0)
    return output
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3],
                    sent_len=40,
                    claim_len=40,
                    cand_size=10,
                    hidden_size=[300, 300],
                    max_pred_pick=5):

    model_options = locals().copy()
    print "model options", model_options

    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train(
        sent_len, claim_len, cand_size)
    train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)
    test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names, test_ground_names, test_labels, word2id = load_fever_dev(
        sent_len, claim_len, cand_size, word2id)
    test_3th_sents, test_3th_sent_masks, test_3th_sent_labels, test_3th_claims, test_3th_claim_mask, test_3th_labels, word2id = load_fever_dev_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)
    dev_sents, dev_sent_masks, dev_sent_labels, dev_claims, dev_claim_mask, dev_sent_names, dev_ground_names, dev_labels, word2id = load_fever_test(
        sent_len, claim_len, cand_size, word2id)
    dev_3th_sents, dev_3th_sent_masks, dev_3th_sent_labels, dev_3th_claims, dev_3th_claim_mask, dev_3th_labels, word2id = load_fever_test_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)

    train_sents = np.asarray(train_sents, dtype='int32')
    train_3th_sents = np.asarray(train_3th_sents, dtype='int32')
    joint_train_sents = np.concatenate((train_sents, train_3th_sents))
    test_sents = np.asarray(test_sents, dtype='int32')
    test_3th_sents = np.asarray(test_3th_sents, dtype='int32')
    joint_test_sents = np.concatenate((test_sents, test_3th_sents))
    dev_sents = np.asarray(dev_sents, dtype='int32')
    dev_3th_sents = np.asarray(dev_3th_sents, dtype='int32')
    joint_dev_sents = np.concatenate((dev_sents, dev_3th_sents))

    train_sent_masks = np.asarray(train_sent_masks, dtype=theano.config.floatX)
    train_3th_sent_masks = np.asarray(train_3th_sent_masks,
                                      dtype=theano.config.floatX)
    joint_train_sent_masks = np.concatenate(
        (train_sent_masks, train_3th_sent_masks))
    test_sent_masks = np.asarray(test_sent_masks, dtype=theano.config.floatX)
    test_3th_sent_masks = np.asarray(test_3th_sent_masks,
                                     dtype=theano.config.floatX)
    joint_test_sent_masks = np.concatenate(
        (test_sent_masks, test_3th_sent_masks))
    dev_sent_masks = np.asarray(dev_sent_masks, dtype=theano.config.floatX)
    dev_3th_sent_masks = np.asarray(dev_3th_sent_masks,
                                    dtype=theano.config.floatX)
    joint_dev_sent_masks = np.concatenate((dev_sent_masks, dev_3th_sent_masks))

    train_sent_labels = np.asarray(train_sent_labels, dtype='int32')
    train_3th_sent_labels = np.asarray(train_3th_sent_labels, dtype='int32')
    joint_train_sent_labels = np.concatenate(
        (train_sent_labels, train_3th_sent_labels))
    test_sent_labels = np.asarray(test_sent_labels, dtype='int32')
    test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32')
    joint_test_sent_labels = np.concatenate(
        (test_sent_labels, test_3th_sent_labels))
    dev_sent_labels = np.asarray(dev_sent_labels, dtype='int32')
    dev_3th_sent_labels = np.asarray(dev_3th_sent_labels, dtype='int32')
    joint_dev_sent_labels = np.concatenate(
        (dev_sent_labels, dev_3th_sent_labels))

    train_claims = np.asarray(train_claims, dtype='int32')
    train_3th_claims = np.asarray(train_3th_claims, dtype='int32')
    joint_train_claims = np.concatenate((train_claims, train_3th_claims))
    test_claims = np.asarray(test_claims, dtype='int32')
    test_3th_claims = np.asarray(test_3th_claims, dtype='int32')
    joint_test_claims = np.concatenate((test_claims, test_3th_claims))
    dev_claims = np.asarray(dev_claims, dtype='int32')
    dev_3th_claims = np.asarray(dev_3th_claims, dtype='int32')
    joint_dev_claims = np.concatenate((dev_claims, dev_3th_claims))

    train_claim_mask = np.asarray(train_claim_mask, dtype=theano.config.floatX)
    train_3th_claim_mask = np.asarray(train_3th_claim_mask,
                                      dtype=theano.config.floatX)
    joint_train_claim_mask = np.concatenate(
        (train_claim_mask, train_3th_claim_mask))
    test_claim_mask = np.asarray(test_claim_mask, dtype=theano.config.floatX)
    test_3th_claim_mask = np.asarray(test_3th_claim_mask,
                                     dtype=theano.config.floatX)
    joint_test_claim_mask = np.concatenate(
        (test_claim_mask, test_3th_claim_mask))
    dev_claim_mask = np.asarray(dev_claim_mask, dtype=theano.config.floatX)
    dev_3th_claim_mask = np.asarray(dev_3th_claim_mask,
                                    dtype=theano.config.floatX)
    joint_dev_claim_mask = np.concatenate((dev_claim_mask, dev_3th_claim_mask))

    train_labels = np.asarray(train_labels, dtype='int32')
    train_3th_labels = np.asarray(train_3th_labels, dtype='int32')
    joint_train_labels = np.concatenate((train_labels, train_3th_labels))
    test_labels = np.asarray(test_labels, dtype='int32')
    test_3th_labels = np.asarray(test_3th_labels, dtype='int32')
    joint_test_labels = np.concatenate((test_labels, test_3th_labels))
    dev_labels = np.asarray(dev_labels, dtype='int32')
    dev_3th_labels = np.asarray(dev_3th_labels, dtype='int32')
    joint_dev_labels = np.concatenate((dev_labels, dev_3th_labels))

    joint_train_size = len(joint_train_claims)
    joint_test_size = len(joint_test_claims)
    joint_dev_size = len(joint_dev_claims)
    train_size = len(train_claims)
    test_size = len(test_claims)
    dev_size = len(dev_claims)
    test_3th_size = len(test_3th_claims)
    dev_3th_size = len(dev_3th_claims)
    vocab_size = len(word2id) + 1
    print 'joint_train size: ', joint_train_size, ' joint_dev size: ', joint_test_size, ' joint_test size: ', joint_dev_size
    print 'train size: ', train_size, ' dev size: ', test_size, ' test size: ', dev_size
    print 'vocab size: ', vocab_size

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    sents_mask = T.ftensor3()
    sents_labels = T.imatrix()  #(batch, cand_size)
    claim_ids = T.imatrix()  #(batch, claim_len)
    claim_mask = T.fmatrix()

    joint_sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    joint_sents_mask = T.ftensor3()
    joint_sents_labels = T.imatrix()  #(batch, cand_size)
    joint_claim_ids = T.imatrix()  #(batch, claim_len)
    joint_claim_mask = T.fmatrix()
    joint_labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    embed_input_sents = init_embeddings[sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    task1_att_conv_W, task1_att_conv_b = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    task1_conv_W_context, task1_conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    att_conv_W, att_conv_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [
        conv_W, conv_b, task1_att_conv_W, task1_att_conv_b, att_conv_W,
        att_conv_b, task1_conv_W_context, conv_W_context
    ]

    conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_sents,
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out so that features at UNK positions are zeroed
    sent_embeddings = conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size
    batch_sent_emb = sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out so that features at UNK positions are zeroed
    claim_embeddings = conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1),
                               cand_size,
                               axis=1)
    '''
    attentive conv for task1
    '''
    task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        embed_input_sents,  #batch_size*cand_size, emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=task1_att_conv_W,
        b=task1_att_conv_b,
        W_context=task1_conv_W_context,
        b_context=task1_conv_b_context)
    task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r

    concate_claim_sent = T.concatenate([
        batch_claim_emb, batch_sent_emb,
        T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x')
    ],
                                       axis=2)
    concate_2_matrix = concate_claim_sent.reshape(
        (batch_size * cand_size, hidden_size[0] * 2 + 1))

    LR_input = T.concatenate([
        concate_2_matrix, task1_attentive_sent_embeddings_l,
        task1_attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2

    # LR_input = concate_2_matrix
    # LR_input_size = hidden_size[0]*2+1
    #scoring layer: maps a feature vector of size LR_input_size to a single evidence score
    U_a = create_ensemble_para(
        rng, 1, LR_input_size)  # weight vector of length LR_input_size
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para = [U_a]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))  #(batch_size*cand_size, 1)
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1)
    # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size)))
    '''
    maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix)
    '''
    # prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix)
    # loss = -T.mean(T.log(prob_pos))
    #f1 as loss
    batch_overlap = T.sum(sents_labels * inter_matrix, axis=1)
    batch_recall = batch_overlap / T.sum(sents_labels, axis=1)
    batch_precision = batch_overlap / T.sum(inter_matrix, axis=1)
    batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall +
                                                       batch_precision)
    loss = -T.mean(T.log(batch_f1))
    # loss = T.nnet.nnet.binary_crossentropy(inter_matrix, sents_labels).mean()
    '''
    training task2, predict 3 labels
    '''
    joint_embed_input_sents = init_embeddings[joint_sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    joint_embed_input_claim = init_embeddings[
        joint_claim_ids.flatten()].reshape(
            (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)
    joint_conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_sents,
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out features at UNK positions
    joint_sent_embeddings = joint_conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size): each sentence then has an embedding of length hidden_size
    joint_batch_sent_emb = joint_sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))
    joint_premise_emb = T.sum(joint_batch_sent_emb *
                              joint_sents_labels.dimshuffle(0, 1, 'x'),
                              axis=1)  #(batch, hidden_size)

    joint_conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_claim,
        mask_matrix=joint_claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out features at UNK positions
    joint_claim_embeddings = joint_conv_model_claims.maxpool_vec  #(batch_size, hidden_size): each claim then has an embedding of length hidden_size

    joint_premise_hypo_emb = T.concatenate(
        [joint_premise_emb, joint_claim_embeddings],
        axis=1)  #(batch, 2*hidden_size)
    '''
    attentive conv in task2
    '''
    joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    joint_sents_dot = T.batched_dot(
        joint_sents_tensor3, joint_sents_tensor3.dimshuffle(
            0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    joint_sents_dot_2_matrix = T.nnet.softmax(
        joint_sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    joint_sents_context = T.batched_dot(
        joint_sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        joint_sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    joint_add_sents_context = joint_embed_input_sents + joint_sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        joint_add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(joint_embed_input_claim, cand_size, axis=0),
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        mask_matrix_r=T.repeat(joint_claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    masked_sents_attconv = attentive_sent_embeddings_l * joint_sents_labels.dimshuffle(
        0, 1, 'x')
    masked_claim_attconv = attentive_sent_embeddings_r * joint_sents_labels.dimshuffle(
        0, 1, 'x')
    fine_max = T.concatenate([
        T.max(masked_sents_attconv, axis=1),
        T.max(masked_claim_attconv, axis=1)
    ],
                             axis=1)  #(batch, 2*hidden)
    # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)
    "Logistic Regression layer"
    joint_LR_input = T.concatenate([joint_premise_hypo_emb, fine_max], axis=1)
    joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0]

    joint_U_a = create_ensemble_para(rng, 3,
                                     joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3, ),
                                              dtype=theano.config.floatX),
                               name='LR_b',
                               borrow=True)  #bias for each target class
    joint_LR_para = [joint_U_a, joint_LR_b]

    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    joint_loss = joint_layer_LR.negative_log_likelihood(
        joint_labels
    )  #for classification tasks, negative log likelihood is the usual loss; the lower, the better.
    '''
    testing
    '''
    # binarize_prob = T.where( inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size

    masked_inter_matrix = inter_matrix * sents_labels  #(batch, cand_size)
    test_premise_emb = T.sum(batch_sent_emb *
                             masked_inter_matrix.dimshuffle(0, 1, 'x'),
                             axis=1)
    test_premise_hypo_emb = T.concatenate([test_premise_emb, claim_embeddings],
                                          axis=1)

    #fine-maxsum
    sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle(
        0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    sents_dot_2_matrix = T.nnet.softmax(
        sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    sents_context = T.batched_dot(
        sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    add_sents_context = embed_input_sents + sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([embed_input_sents, sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    test_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    # attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    # attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    test_attentive_sent_embeddings_l = test_attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    test_attentive_sent_embeddings_r = test_attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    test_masked_sents_attconv = test_attentive_sent_embeddings_l * masked_inter_matrix.dimshuffle(
        0, 1, 'x')
    test_masked_claim_attconv = test_attentive_sent_embeddings_r * masked_inter_matrix.dimshuffle(
        0, 1, 'x')
    test_fine_max = T.concatenate([
        T.max(test_masked_sents_attconv, axis=1),
        T.max(test_masked_claim_attconv, axis=1)
    ],
                                  axis=1)  #(batch, 2*hidden)
    # test_fine_sum = T.concatenate([T.sum(test_masked_sents_attconv, axis=1),T.sum(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)

    test_LR_input = T.concatenate([test_premise_hypo_emb, test_fine_max],
                                  axis=1)
    test_LR_input_size = joint_LR_input_size

    test_layer_LR = LogisticRegression(
        rng,
        input=test_LR_input,
        n_in=test_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector

    params = [init_embeddings] + NN_para + LR_para + joint_LR_para
    cost = loss + joint_loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_sents_ids, joint_sents_mask, joint_sents_labels, joint_claim_ids,
        joint_claim_mask, joint_labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    test_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_labels
    ], [
        inter_matrix,
        test_layer_LR.errors(joint_labels), test_layer_LR.y_pred
    ],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')
    dev_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_labels
    ], [
        inter_matrix,
        test_layer_LR.errors(joint_labels), test_layer_LR.y_pred
    ],
                                allow_input_downcast=True,
                                on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    joint_n_train_batches = joint_train_size / batch_size
    joint_train_batch_start = list(
        np.arange(joint_n_train_batches) *
        batch_size) + [joint_train_size - batch_size]
    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    n_test_3th_batches = test_3th_size / batch_size
    test_3th_batch_start = list(np.arange(n_test_3th_batches) *
                                batch_size) + [test_3th_size - batch_size]

    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_dev_3th_batches = dev_3th_size / batch_size
    dev_3th_batch_start = list(np.arange(n_dev_3th_batches) *
                               batch_size) + [dev_3th_size - batch_size]

    max_acc = 0.0
    max_test_f1 = 0.0
    max_test_acc = 0.0

    cost_i = 0.0
    joint_train_indices = range(joint_train_size)
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            joint_train_indices
        )  #shuffle the training set each new epoch; this usually helps performance but is not guaranteed
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for joint_batch_id in joint_train_batch_start:  #for each batch
            # iter counts how many batches have been run so far, accumulated across epochs
            iter = (epoch - 1) * joint_n_train_batches + iter_accu + 1
            iter_accu += 1
            joint_train_id_batch = joint_train_indices[
                joint_batch_id:joint_batch_id + batch_size]
            for i in range(3):
                batch_id = random.choice(train_batch_start)
                train_id_batch = train_indices[batch_id:batch_id + batch_size]
                cost_i += train_model(
                    train_sents[train_id_batch],
                    train_sent_masks[train_id_batch],
                    train_sent_labels[train_id_batch],
                    train_claims[train_id_batch],
                    train_claim_mask[train_id_batch],
                    #joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels
                    joint_train_sents[joint_train_id_batch],
                    joint_train_sent_masks[joint_train_id_batch],
                    joint_train_sent_labels[joint_train_id_batch],
                    joint_train_claims[joint_train_id_batch],
                    joint_train_claim_mask[joint_train_id_batch],
                    joint_train_labels[joint_train_id_batch])

            #every 100 batches, evaluate the model on all test data
            # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0):
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                f1_sum = 0.0
                error_sum = 0.0
                full_evi = 0
                predictions = []
                for test_batch_id in test_batch_start:  # for each test batch
                    batch_prob, error_i, pred_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_sent_masks[test_batch_id:test_batch_id +
                                        batch_size],
                        test_sent_labels[test_batch_id:test_batch_id +
                                         batch_size],
                        test_claims[test_batch_id:test_batch_id + batch_size],
                        test_claim_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_labels[test_batch_id:test_batch_id + batch_size])
                    error_sum += error_i
                    batch_sent_labels = test_sent_labels[
                        test_batch_id:test_batch_id + batch_size]
                    batch_sent_names = test_sent_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_names = test_ground_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_labels = test_labels[
                        test_batch_id:test_batch_id + batch_size]
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(
                            batch_ground_labels[i])
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        pred_sent_names = []
                        gold_sent_names = batch_ground_names[i]
                        zipped = [(batch_prob[i, k], batch_sent_labels[i][k],
                                   batch_sent_names[i][k])
                                  for k in range(cand_size)]
                        sorted_zip = sorted(zipped,
                                            key=lambda x: x[0],
                                            reverse=True)
                        for j in range(cand_size):
                            triple = sorted_zip[j]
                            if triple[1] == 1.0:
                                '''
                                ideally we should use the rank rather than a hard binary decision;
                                the threshold triple[0] > 0.5 controls recall and thus the strict_acc
                                '''
                                if triple[0] > 0.5:
                                    # pred_sent_names.append(batch_sent_names[i][j])
                                    pred_sent_names.append(triple[2])
                                # if len(pred_sent_names) == max_pred_pick:
                                #     break
                        instance_i['predicted_evidence'] = pred_sent_names
                        # print 'pred_sent_names:',pred_sent_names
                        # print 'gold_sent_names:',gold_sent_names
                        new_gold_names = []
                        for gold_name in gold_sent_names:
                            new_gold_names.append([None, None] + gold_name)
                        instance_i['evidence'] = [new_gold_names]
                        predictions.append(instance_i)

                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                # test_f1=f1_sum/(len(test_batch_start)*batch_size)

                for test_batch_id in test_3th_batch_start:  # for each test batch
                    _, error_i, pred_i = test_model(
                        test_3th_sents[test_batch_id:test_batch_id +
                                       batch_size],
                        test_3th_sent_masks[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_sent_labels[test_batch_id:test_batch_id +
                                             batch_size],
                        test_3th_claims[test_batch_id:test_batch_id +
                                        batch_size],
                        test_3th_claim_mask[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_labels[test_batch_id:test_batch_id +
                                        batch_size])
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(2)
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        instance_i['predicted_evidence'] = []
                        instance_i['evidence'] = []
                        predictions.append(instance_i)

                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                if f1 > max_test_f1 or strict_score > max_test_acc:
                    if f1 > max_test_f1:
                        max_test_f1 = f1
                    if strict_score > max_test_acc:
                        max_test_acc = strict_score
                    #test performance improved, so also evaluate on the dev set
                    print '....................\n'
                    f1_sum = 0.0
                    error_sum = 0.0
                    full_evi = 0
                    predictions = []
                    fine_grained_sent_predictions = {
                        1: [],
                        2: [],
                        3: [],
                        4: [],
                        5: []
                    }
                    fine_grained_page_predictions = {
                        1: [],
                        2: [],
                        3: [],
                        4: []
                    }
                    for dev_batch_id in dev_batch_start:  # for each dev batch
                        batch_prob, error_i, pred_i = dev_model(
                            dev_sents[dev_batch_id:dev_batch_id + batch_size],
                            dev_sent_masks[dev_batch_id:dev_batch_id +
                                           batch_size],
                            dev_sent_labels[dev_batch_id:dev_batch_id +
                                            batch_size],
                            dev_claims[dev_batch_id:dev_batch_id + batch_size],
                            dev_claim_mask[dev_batch_id:dev_batch_id +
                                           batch_size],
                            dev_labels[dev_batch_id:dev_batch_id + batch_size])
                        error_sum += error_i
                        batch_sent_labels = dev_sent_labels[
                            dev_batch_id:dev_batch_id + batch_size]
                        batch_sent_names = dev_sent_names[
                            dev_batch_id:dev_batch_id + batch_size]
                        batch_ground_names = dev_ground_names[
                            dev_batch_id:dev_batch_id + batch_size]
                        batch_ground_labels = dev_labels[
                            dev_batch_id:dev_batch_id + batch_size]
                        for i in range(batch_size):
                            instance_i = {}
                            instance_i['label'] = pred_id2label.get(
                                batch_ground_labels[i])
                            instance_i['predicted_label'] = pred_id2label.get(
                                pred_i[i])
                            pred_sent_names = []
                            gold_sent_names = batch_ground_names[i]
                            zipped = [(batch_prob[i,
                                                  k], batch_sent_labels[i][k],
                                       batch_sent_names[i][k])
                                      for k in range(cand_size)]
                            sorted_zip = sorted(zipped,
                                                key=lambda x: x[0],
                                                reverse=True)
                            for j in range(cand_size):
                                triple = sorted_zip[j]
                                if triple[1] == 1.0:
                                    '''
                                    ideally we should use the rank rather than a hard binary decision;
                                    the threshold triple[0] > 0.5 controls recall and thus the strict_acc
                                    '''
                                    if triple[0] > 0.5:
                                        # pred_sent_names.append(batch_sent_names[i][j])
                                        pred_sent_names.append(triple[2])
                                    # if len(pred_sent_names) == max_pred_pick:
                                    #     break
                            instance_i['predicted_evidence'] = pred_sent_names
                            # print 'pred_sent_names:',pred_sent_names
                            # print 'gold_sent_names:',gold_sent_names
                            new_gold_names = []
                            for gold_name in gold_sent_names:
                                new_gold_names.append([None, None] + gold_name)
                            instance_i['evidence'] = [new_gold_names]
                            predictions.append(instance_i)

                            evi_sent_size, evi_page_size = count_sent_page(
                                gold_sent_names)
                            fine_grained_sent_predictions.get(
                                evi_sent_size).append(instance_i)
                            fine_grained_page_predictions.get(
                                evi_page_size).append(instance_i)
                    strict_score, label_accuracy, precision, recall, f1 = fever_score(
                        predictions)
                    print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                    print '......sent...\n'
                    for i in range(1, 6):
                        predictions_i = fine_grained_sent_predictions.get(i)
                        if len(predictions_i) > 0:
                            strict_score, label_accuracy, precision, recall, f1 = fever_score(
                                predictions_i)
                            print i, '\tstrict, all, pre, rec, f1: ', strict_score, label_accuracy, precision, recall, f1
                        else:
                            print i, '\tstrict, all, pre, rec, f1: ', 0.0, 0.0, 0.0, 0.0, 0.0
                    print '......page...\n'
                    for i in range(1, 5):
                        predictions_i = fine_grained_page_predictions.get(i)
                        if len(predictions_i) > 0:
                            strict_score, label_accuracy, precision, recall, f1 = fever_score(
                                predictions_i)
                            print i, '\tstrict, all, pre, rec, f1: ', strict_score, label_accuracy, precision, recall, f1
                        else:
                            print i, '\tstrict, all, pre, rec, f1: ', 0.0, 0.0, 0.0, 0.0, 0.0

                    for dev_batch_id in dev_3th_batch_start:  # for each dev batch
                        _, error_i, pred_i = dev_model(
                            dev_3th_sents[dev_batch_id:dev_batch_id +
                                          batch_size],
                            dev_3th_sent_masks[dev_batch_id:dev_batch_id +
                                               batch_size],
                            dev_3th_sent_labels[dev_batch_id:dev_batch_id +
                                                batch_size],
                            dev_3th_claims[dev_batch_id:dev_batch_id +
                                           batch_size],
                            dev_3th_claim_mask[dev_batch_id:dev_batch_id +
                                               batch_size],
                            dev_3th_labels[dev_batch_id:dev_batch_id +
                                           batch_size])
                        for i in range(batch_size):
                            instance_i = {}
                            instance_i['label'] = pred_id2label.get(2)
                            instance_i['predicted_label'] = pred_id2label.get(
                                pred_i[i])
                            instance_i['predicted_evidence'] = []
                            instance_i['evidence'] = []
                            predictions.append(instance_i)

                    strict_score, label_accuracy, precision, recall, f1 = fever_score(
                        predictions)
                    print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_test_acc
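
# --- Added illustration (hedged): a plain-NumPy restatement of the soft-F1
# objective computed above (batch_overlap / batch_recall / batch_precision /
# batch_f1), so the tensor expressions can be checked by hand. The helper name
# soft_f1_loss and the demo shapes are hypothetical, not from the source; an
# eps term is added here for numerical safety, which the original omits.
import numpy as np

def soft_f1_loss(probs, gold, eps=1e-8):
    """probs, gold: (batch, cand_size) arrays; returns -mean(log soft-F1)."""
    overlap = np.sum(gold * probs, axis=1)                # soft true positives
    recall = overlap / (np.sum(gold, axis=1) + eps)       # soft recall
    precision = overlap / (np.sum(probs, axis=1) + eps)   # soft precision
    f1 = 2.0 * recall * precision / (recall + precision + eps)
    return -np.mean(np.log(f1 + eps))

_demo_rng = np.random.RandomState(0)
_demo_probs = _demo_rng.uniform(size=(4, 10)).astype('float32')
_demo_gold = (_demo_rng.uniform(size=(4, 10)) > 0.7).astype('float32')
print(soft_f1_loss(_demo_probs, _demo_gold))
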
def BasicTheano():

	#REPEAT vs TILE 
	x = theano.tensor.fmatrix("x")
	z = theano.tensor.repeat(x, 1, axis = 0)
	z_one_more = theano.tensor.repeat(z, 2, axis = 1)

	foo = theano.function([x], z)
	foo_one_more = theano.function([x], z_one_more)  # compile from the root input x; z is a derived variable

	a = np.array([[1, 2, 3]]).astype("float32")
	print('a.shape: ')
	print(a.shape)

	c = foo(a)
	c_one_more = foo_one_more(a)

	print("applying repeat along axis 0")
	print(c)
	print(c.shape)
	print("applying one more along axis 1")
	print(c_one_more)
	print(c_one_more.shape)

	z_tile = theano.tensor.tile(x, (3,2))
	
	foo_tile = theano.function([x], z_tile)

	c_tile = foo_tile(a)
	print("applying tile along axis 0")
	print(c_tile)
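
	# Added aside (not in the original demo): repeat copies each element
	# consecutively along one axis, while tile copies the whole array as a block.
	# A quick NumPy cross-check of the two Theano results computed above:
	assert np.array_equal(np.repeat(a, 2, axis=1), c_one_more)
	assert np.array_equal(np.tile(a, (3, 2)), c_tile)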


	#TRANSPOSE vs RESHAPE vs DIMSHUFFLE
	v = theano.tensor.ivector("v")
	u = theano.tensor.ivector("u")
	u_dot_v = theano.tensor.dot(u, theano.tensor.transpose(v))
	v_trans = theano.tensor.transpose(v)

	u_dot_v_no_transpose = theano.tensor.dot(u, v)

	foo_dot = theano.function([u, v], u_dot_v)
	foo_trans = theano.function([v], v_trans)
	foo_dot_no_transpose = theano.function([u, v], u_dot_v_no_transpose)

	v_value = np.array([1, 2, 3]).astype("int32")
	u_value = np.array([1, 2, 3]).astype("int32")

	foo_dot_value = foo_dot(u_value, v_value)
	foo_trans_value = foo_trans(v_value)
	foo_dot_no_transpose_value = foo_dot_no_transpose(u_value, v_value)

	print('dot product')
	print(foo_dot_value)

	print('dot product no transpose: ')
	print(foo_dot_no_transpose_value)

	print('transpose: ')
	print(foo_trans_value)
	print(foo_trans_value.shape)

	print('original shape')
	print(v_value.shape)

	print('v reshape')
	v_reshape = v.reshape((v.shape[0], 1))
	print(v_reshape.type)

	foo_reshape = theano.function([v], v_reshape)
	foo_reshape_value = foo_reshape(v_value)
	print(foo_reshape_value.shape)

	#SUM
	v_sum_0 = v.sum(axis = 0)
	foo_sum_0 = theano.function([v], v_sum_0)

	foo_sum_0_value = foo_sum_0(v_value)
	print(foo_sum_0_value)

	#v_sum_1 = v_reshape.sum(axis = 1)
	#foo_sum_1 = theano.function([v], v_sum_1)
	#foo_sum_1_value = foo_sum_0(v_value)
	#print(foo_sum_1_value)

	#test reshape
	y = theano.tensor.ftensor3("y")
	y_shape = y.shape
	y_reshape = y.reshape((y_shape[1], y_shape[2]))#, y_shape[0]

	function_reshape = theano.function([y], y_reshape)

	y_value = np.ones((1,3,2)).astype("float32")
	print(y_value.shape)

	y_reshape_value = function_reshape(y_value)
	print('y_reshape:')
	print(y_reshape_value)

	#reshape matrix to tensor3
	matrix = theano.tensor.fmatrix("matrix")
	print(matrix.type)
	mat_shape = matrix.shape
	mat_reshape = matrix.reshape((-1, mat_shape[0], mat_shape[1]))
	print(mat_reshape.type)

	mat_reshape_func= theano.function([matrix], mat_reshape)

	mat_value = np.ones((3,2)).astype("float32")
	mat_reshape_func_out = mat_reshape_func(mat_value)

	print(mat_value.shape)
	print("matrix to 3D tensor")
	print(mat_reshape_func_out.shape)
	print(mat_reshape_func_out)

	#creating a square matrix with the given vector as its diagonal
	given_vec = theano.tensor.fvector("given_vec")
	diag_mat = theano.tensor.nlinalg.AllocDiag()(given_vec)

	diag_function = theano.function([given_vec], diag_mat)

	given_vec_value = np.array([1, 2, 3]).astype("float32")
	diag_function_value = diag_function(given_vec_value)
	print("diagonal matrix is: ")
	print(diag_function_value)
	print(diag_function_value.shape)

	#multiply an element of vector (1*N) with a row/column of a matrix (N*D*1)
	multiply_vector_matrix = T.dot(diag_mat, y_reshape)

	result_function = theano.function([given_vec, y], multiply_vector_matrix)  # compile from the root inputs; diag_mat and y_reshape are derived variables
	output_value = result_function(given_vec_value, y_value)
	print(output_value.shape)
	print(output_value)
	
	#Reshape to convert tensor from matrix to vector/column/row/3D
	print("matrix to vector/row/column/3D")
	matrix_origin = theano.tensor.fmatrix('Mat')
	mat_2_vector  = matrix_origin.reshape((matrix_origin.shape[0]*matrix_origin.shape[1], ))   
	print(mat_2_vector.type)
	mat_2_row  = matrix_origin.reshape((1, matrix_origin.shape[0]*matrix_origin.shape[1]))   
	print(mat_2_row.type)
	mat_2_column     = matrix_origin.reshape((matrix_origin.shape[0]*matrix_origin.shape[1], 1))  
	print(mat_2_column.type)
	mat_2_3dtensor= matrix_origin.reshape((-1, matrix_origin.shape[0], matrix_origin.shape[1])) 
	print(mat_2_3dtensor.type)

	f = theano.function([matrix_origin], [mat_2_vector, mat_2_column, mat_2_row, mat_2_3dtensor])

	input_value = np.array([[1.,2.], [3.,4.]]).astype("float32")
	print(input_value.shape)

	for output in f(np.array([[1., 2.], [3., 4.]]).astype("float32")):
		
		print(output.shape)
		print(output)

	#REPEAT for 3D tensor
	print("repeat 3D tensor")
	h_t = theano.tensor.tensor3("h_t")
	axis_scalar = theano.tensor.dscalar("axis")
	h_t_repeat = theano.tensor.repeat(h_t, 3,axis= 0)

	repeat_func = theano.function([h_t], h_t_repeat)
	input_value = np.ones((1,3,2)).astype("float32")
	input_value[0, 1, 1] = 5.
	input_value[0, 2, 1] = 3.
	repeat_func_out = repeat_func(input_value)
	print(repeat_func_out.shape)
	print("input value:")
	print(input_value)
	print("element in output: ")
	print(repeat_func_out[0, :, :])
	print("out: ")
	print(repeat_func_out)

	#test (3,1, 2) -> (3, 3, 2)
	h_t_repeat_1 = theano.tensor.repeat(h_t, 3,axis= 1)

	repeat_func_1 = theano.function([h_t], h_t_repeat_1)
	print("repeat (312) to (332)")
	input_values_312 = np.ones((3,1, 2)).astype("float32")
	input_values_312[2, 0, 0] = 9.
	repeat_func_out_332 = repeat_func_1(input_values_312)
	print(repeat_func_out_332.shape)
	print("input value")
	print(input_values_312)
	print("out value")
	print(repeat_func_out_332)


	# print("repeat 2 times: ")
	# #Repeat 3D tensor 2 times with 2 different axes
	# b_value = np.ones((1, 1 ,5)).astype("float32")
	# b_value[0, 0, 2] = 3.
	# b_value[0, 0, 4] = 5.
	# print("1st time: ")
	# print(repeat_func(b_value))
	# print(repeat_func(b_value).shape)

	# h_t_repeat_2x = theano.tensor.repeat(h_t_repeat, 2, axis = 1)
	# repeat_func_2x = theano.function([h_t_repeat], h_t_repeat_2x)
	# print("2nd time")
	# print(repeat_func_2x(repeat_func(b_value)))
	# print(repeat_func_2x(repeat_func(b_value)).shape)

	#Theano tensor concatenate
	z_t = theano.tensor.tensor3("z_t")
	concat = theano.tensor.concatenate([h_t, z_t], axis = 0)

	concat_func = theano.function([h_t, z_t], concat)
	z_t_input = np.ones((3,2,1)).astype("float32")
	h_t_input = np.ones((3,2,1)).astype("float32")
	print("concat : ")
	print(concat_func(h_t_input, z_t_input))
	print(concat_func(h_t_input, z_t_input).shape)

	#T.arange, T.mean, T.log, T.neq
	print("T.arange function:")
	mat_y = theano.tensor.fmatrix("mat_y")
	colum_vector = mat_y[theano.tensor.arange(mat_y.shape[0]), :]

	t_arange_function = theano.function([mat_y], colum_vector)

	mat_y_value = np.random.randn(3,2).astype("float32")
	t_arange_out = t_arange_function(mat_y_value)
	print('input value:')
	print(mat_y_value)
	print('output value:')
	print(t_arange_out.shape)
	print(t_arange_out)


	#NUMPY example  A(N, M, K)  B(N, M)  -> C(N, M) = A[arange(N), arange(M), B] using B as indexing matrix
	A = np.arange(4*2*5).reshape(4,2,5)
	B = np.arange(4*2).reshape(4,2)%5

	# print('arange: ')
	# print(np.arange(A.shape[0])[:, np.newaxis])
	# print(np.arange(A.shape[1]))

	C = A[np.arange(A.shape[0])[:, np.newaxis], np.arange(A.shape[1]), B] #
	print(A)
	print(B)
	print(C)
	print(C.shape)

	#Theano tensor slicing and assigning
	print("slicing theano")
	x_vector = theano.tensor.vector()
	y_slicing = x_vector[0::2]
	print(y_slicing.eval({x_vector: np.array([1,2, 3, 4]).astype("float32")}))

	#Theano split----------------------------------------
	# print("split theano")
	# def split_half(x, axis = 0):
	# 	if theano.tensor.le(x.shape[axis], 1):
	# 		return x
	# 	size1 = x.shape[axis]/2
	# 	size2 = x.shape[axis] - size1
	# 	split_out = theano.tensor.split(x, [size1, size2], 2, axis = axis)
	# 	first_part= split_out[0]
	# 	second_part = split_out[1]
	# 	return (split_half(first_part), split_half(second_part))
	
	# def split_6_along_axis(x, axis = 0):
	# 	size = []
	# 	for i in range(6):
	# 		size.append(1)
	# 	return theano.tensor.split(x, size, 6, axis = axis)

	# split_x = theano.tensor.matrix("split_x")
	# axis_split = theano.tensor.lscalar()
	# split_y_first, split_y_second = split_half(split_x, axis= axis_split)
	# f_split = theano.function([split_x, axis_split], split_y_first, split_y_second)
	# print(f_split(np.arange(12).reshape(6, 2).astype("float32"), 0))
	# # print(split_y.eval({split_x: np.arange(12).reshape(6, 2).astype("float32"), axis_split: 0}))


	# split_y_individual = split_6_along_axis(split_x, axis = axis_split)
	# f_split_individual = theano.function([split_x, axis_split], split_y_individual)
	# print(f_split_individual(np.arange(12).reshape(6, 2).astype("float32"), 0))

	#T.dot between two 3D tensors-------------------------
	tensor_1 = theano.tensor.tensor3("tensor_1")
	tensor_2 = theano.tensor.tensor3("tensor_2")

	dot_2_tensors = theano.tensor.dot(tensor_1, tensor_2)
	dot_2_tensor_func = theano.function([tensor_1, tensor_2], dot_2_tensors)

	tensor_1_in = np.ones((3,2,2)).astype("float32")
	tensor_2_in = np.ones((2,2,3)).astype("float32") #2,1,3 -wrong

	out_dot_2_tensors = dot_2_tensor_func(tensor_1_in, tensor_2_in)

	print("dot between two 3D tensors")
	print(out_dot_2_tensors.shape)
	# print(out_dot_2_tensors)

	#Theano tensor identity_like
	print('tensor identity like')
	identity_3D = T.identity_like(tensor_1)

	identity_out = identity_3D.eval({tensor_1: tensor_1_in})
	print(identity_out.shape)
	print(identity_out)
	print(tensor_1_in.shape)
	print(tensor_1_in)

	#T.repeat itself
	# bi = T.tensor3("bi")

	# bi = T.repeat(bi, 3, axis = 0)
	# out = bi.eval({bi: np.ones((1,3,2)).astype("float32")})
	# print(out.shape)

	# out_func = theano.function([bi], bi)
	# print(out_func(np.ones((1,3,2)).astype("float32")).shape)

	#i_t = i_t + a_t - Add itself --------------------------
	print("adding itself")
	a_t = T.fmatrix("a_t")
	h_t = T.fmatrix("h_t")

	i_t = h_t + a_t
	i_t = i_t + a_t

	# function_itself = i_t.eval({i_t: np.ones((2,2)).astype("float32"), a_t: np.ones((2,2)).astype("float32")})
	function_itself = theano.function([h_t, a_t], i_t)
	function_itself_out = function_itself(np.zeros((2,2)).astype("float32"), np.ones((2,2)).astype("float32"))
	print(function_itself_out)
	# ------------------------------------------------------

	#shared variable repeat - It works
	shared_var = theano.shared(name = "shared",
		value = np.ones((1,3, 2)).astype("float32"),
		borrow = True)

	shared_var = T.repeat(shared_var, 3, axis = 0)

	shared_var_reshape_out = shared_var.eval()
	print(shared_var_reshape_out.shape)

	#Test Max, Min, and along axis
	print("Test max, min")
	value_mat = np.asarray([[1.0, 2.0],[3.0, 4.0]]).astype("float32")
	test_tensor = theano.tensor.fmatrix("tensor")
	c = test_tensor.min()
	function_min = theano.function([test_tensor], c)
	out = function_min(value_mat)
	print(out)
	c_along = test_tensor.min(axis = 1)
	function_min_along = theano.function([test_tensor], c_along)
	out = function_min_along(value_mat)
	print(out)

	#rescale 3D tensor values to range [0, 1]
	print("scaling value of a tensor to range [0, 1]")
	def rescale_step(input_tensor):
		min_value = input_tensor.min()
		max_value = input_tensor.max()
		out_rescale = (input_tensor - min_value)/(max_value- min_value)
		return out_rescale

	input_rescale = theano.tensor.tensor3("in_rescale", dtype = theano.config.floatX)
	output_rescale, updates = theano.scan(fn=rescale_step,
	                                   outputs_info=[],
	                                   sequences=[input_rescale],
	                                   non_sequences=[])

	rescale_func = theano.function([input_rescale], output_rescale)

	input_rescale_value = np.linspace(1, 30, num = 2*5*3, dtype = theano.config.floatX).reshape(2, 5, 3)
	out_rescale = rescale_func(input_rescale_value)
	print(out_rescale)
	print(out_rescale.shape)
	print("input value")
	print(input_rescale_value)
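
# --- Added demo (hedged; these variable names are new, not from the notes above):
# the DIMSHUFFLE header in BasicTheano is only exercised through transpose and
# reshape, so here is a minimal self-contained sketch of dimshuffle with 'x',
# the broadcasting pattern used heavily in the models above (e.g. turning an
# (n,) vector into an (n, 1) column that broadcasts against a matrix).
import numpy as np
import theano
import theano.tensor as T

demo_vec = T.fvector("demo_vec")
demo_mat = T.fmatrix("demo_mat")
# insert a broadcastable axis so row i of the matrix is scaled by demo_vec[i]
demo_scaled = demo_mat * demo_vec.dimshuffle(0, 'x')   # (rows, cols) * (rows, 1)
demo_scale_func = theano.function([demo_mat, demo_vec], demo_scaled)

print(demo_scale_func(np.ones((3, 4), dtype="float32"),
                      np.array([1., 2., 3.], dtype="float32")))
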
    def __init__(self,
                 z_n,
                 z_k,
                 encoder_net,
                 decoder_net,
                 opt,
                 iw=False,
                 iw_samples=10,
                 val_iw=False,
                 val_iw_samples=100,
                 regularizer=None,
                 initializer=uniform_initializer(0.05),
                 hard=True,
                 tau0=5.,
                 tau_min=0.25,
                 tau_decay=1e-6,
                 srng=RandomStreams(123),
                 eps=1e-9):
        self.z_n = z_n
        self.z_k = z_k
        self.encoder_net = encoder_net
        self.decoder_net = decoder_net
        self.srng = srng
        self.hard = hard
        self.iw = iw
        self.iw_samples = iw_samples
        self.val_iw = val_iw
        self.val_iw_samples = val_iw_samples
        self.ceps = T.constant(eps, name='epsilon', dtype='float32')
        # Temperature
        self.iteration = K.variable(0, dtype='int32', name='iteration')
        iter_updates = [(self.iteration, self.iteration + 1)]
        tau = T.constant(tau0, dtype='float32', name='tau0')
        if tau_decay > 0:
            tau_decay = T.constant(tau_decay,
                                   name='tau_decay',
                                   dtype='float32')
            tau_min = T.constant(tau_min, name='tau_min', dtype='float32')
            tau = tau / (1. + (tau_decay * self.iteration))
            tau = T.nnet.relu(tau - tau_min) + tau_min
        self.tau = tau

        # Prior
        self.z_prior = T.ones((z_n, z_k), dtype='float32') / z_k
        pz_params = []

        # Quantization
        span = (z_k - 1.) / 2.
        self.quant_np = (np.arange(z_k, dtype=np.float32) - span) / span
        self.quant = T.constant(self.quant_np, name='quant', dtype='float32')
        print("Quantization: {}".format(self.quant_np))

        # Input
        input_x = T.fmatrix(name='input_x')  # (n, input_units)
        rnd = srng.uniform(size=input_x.shape,
                           low=0.,
                           high=1.,
                           dtype='float32')
        input_x_binary = T.gt(input_x, rnd)  # (n, input_units)

        (train_loss, mean_nll_x, mean_kl, encode_updates,
         decode_updates) = self.calc_nll_tot(iw=iw,
                                             iw_samples=iw_samples,
                                             input_x_binary=input_x_binary,
                                             validation=False)

        val_loss, val_mean_nll_x, val_mean_kl, _1, _2 = self.calc_nll_tot(
            iw=val_iw,
            iw_samples=val_iw_samples,
            input_x_binary=input_x_binary,
            validation=True)

        # Validation function
        val_function = theano.function([input_x],
                                       [val_mean_nll_x, val_mean_kl, val_loss])
        val_headers = ['Val NLL X', 'KL', 'Val NLL']

        # Regularization
        self.params = pz_params + encoder_net.params + decoder_net.params
        reg_loss = T.constant(0.)
        if regularizer:
            for p in self.params:
                reg_loss += regularizer(p)

        # Training
        loss = train_loss + reg_loss
        train_updates = opt.get_updates(loss, self.params)
        all_updates = train_updates + iter_updates + decode_updates + encode_updates
        train_function = theano.function(
            [input_x], [mean_nll_x, mean_kl, reg_loss, loss, self.tau],
            updates=fix_updates(all_updates))
        train_headers = ['NLL X', 'KL', 'Reg', 'Loss', 'Tau']
        weights = (self.params + opt.weights + [self.iteration] +
                   encoder_net.non_trainable_weights +
                   decoder_net.non_trainable_weights)

        # Generation
        input_n = T.iscalar()
        logitrep = T.log(self.ceps + T.repeat(
            T.reshape(self.z_prior, (1, z_n, z_k)), repeats=input_n, axis=0))
        zsamp = sample_one_hot(logits=logitrep, srng=srng)
        zqsamp = T.dot(zsamp, self.quant)  # (n, z_n)
        xgen, _ = self.decode(zqsamp, validation=True)
        # rnd = srng.uniform(size=xgen.shape, low=0., high=1., dtype='float32')
        # xsamp = T.cast(T.gt(xgen, rnd), 'int32')
        generate_function = theano.function([input_n],
                                            xgen)  # xsamp for binarized
        self.sample_z_function = theano.function([input_n], zqsamp)

        # Decoding
        input_zq = T.fmatrix()
        xgen, _ = self.decode(input_zq, validation=True)
        self.decode_function = theano.function([input_zq], xgen)

        # Autoencode
        # rnd = srng.uniform(low=0., high=1., dtype='float32', size=val_xpred.shape)
        # xout = T.cast(T.gt(val_xpred, rnd), dtype='float32')
        pz, z, _ = self.encode(input_x_binary,
                               validation=True)  # (n, z_n, z_k)
        zq = T.dot(z, self.quant)
        xpred, _ = self.decode(zq, validation=True)  # (n, input_units)
        autoencode_function = theano.function(
            [input_x], [input_x_binary, xpred])  # xout for binarized

        super(GumbelQuantizedAutoencoder,
              self).__init__(train_headers=train_headers,
                             val_headers=val_headers,
                             train_function=train_function,
                             generate_function=generate_function,
                             val_function=val_function,
                             autoencode_function=autoencode_function,
                             weights=weights)
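
# --- Added sketch (hedged; the helper name and printed values are illustrative,
# not from the source): the Gumbel temperature schedule constructed above,
# tau(t) = max(tau0 / (1 + tau_decay * t), tau_min), restated in NumPy so the
# annealing curve is easy to inspect. Defaults mirror the constructor above.
import numpy as np

def gumbel_tau_schedule(t, tau0=5., tau_min=0.25, tau_decay=1e-6):
    tau = tau0 / (1. + tau_decay * np.asarray(t, dtype=np.float64))
    return np.maximum(tau, tau_min)

print(gumbel_tau_schedule([0, 1e6, 1e7, 1e8]))  # decays from tau0 toward tau_min
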
Example #45
    def initial_states(self, batch_size, *args, **kwargs):
        return [
            tensor.repeat(self.initial_state_[None, :], batch_size, 0),
            tensor.repeat(self.initial_cells[None, :], batch_size, 0)
        ]
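
# --- Added illustration (hedged; the shapes below are assumed for the demo, not
# taken from the source): what tensor.repeat(state[None, :], batch_size, 0) does,
# i.e. broadcast one learned state vector of shape (dim,) to a per-example batch
# of shape (batch_size, dim). Plain NumPy is used here for clarity.
import numpy as np

demo_initial_state = np.arange(4, dtype="float32")                      # assumed dim = 4
demo_batched_state = np.repeat(demo_initial_state[None, :], 3, axis=0)  # assumed batch = 3
print(demo_batched_state.shape)  # (3, 4): one copy of the initial state per example
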
    def __init__(self,
                 rng,
                 x,
                 n_in,
                 n_out,
                 p=0.0,
                 training=1,
                 rnn_batch_training=False):
        """ This is to initialise a standard RNN hidden unit

        :param rng: random state, fixed value for randome state for reproducible objective results
        :param x: input data to current layer
        :param n_in: dimension of input data
        :param n_out: dimension of output data
        :param p: the probability of dropout
        :param training: a binary value to indicate training or testing (for dropout training)
        """
        self.input = x

        if p > 0.0:
            if training == 1:
                srng = RandomStreams(seed=123456)
                self.input = T.switch(srng.binomial(size=x.shape, p=p), x, 0)
            else:
                self.input = (1 - p) * x  #(1-p) *

        self.n_in = int(n_in)
        self.n_out = int(n_out)

        self.rnn_batch_training = rnn_batch_training

        # random initialisation
        Wx_value = np.asarray(rng.normal(0.0,
                                         old_div(1.0, np.sqrt(n_in)),
                                         size=(n_in, n_out)),
                              dtype=config.floatX)
        Wy_value = np.asarray(np.zeros((n_out, n_out)), dtype=config.floatX)

        # Input gate weights
        self.W_xi = theano.shared(value=Wx_value, name='W_xi')
        self.W_yi = theano.shared(value=Wy_value, name='W_yi')

        # bias
        self.b_y = theano.shared(value=np.zeros((n_out, ),
                                                dtype=config.floatX),
                                 name='b_y')

        # initial value of output
        if self.rnn_batch_training:
            self.y0 = theano.shared(value=np.zeros((1, n_out),
                                                   dtype=config.floatX),
                                    name='y0')
            self.y0 = T.repeat(self.y0, x.shape[1], 0)
        else:
            self.y0 = theano.shared(value=np.zeros((n_out, ),
                                                   dtype=config.floatX),
                                    name='y0')

        self.Wix = T.dot(self.input, self.W_xi)

        self.y, _ = theano.scan(self.recurrent_as_activation_function,
                                sequences=self.Wix,
                                outputs_info=self.y0)

        self.output = self.y

        self.params = [self.W_xi, self.W_yi, self.b_y]
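
# --- Added sketch (hedged): the body of recurrent_as_activation_function is not
# included in this snippet, so the step below is an assumption about what the
# scan above computes (a sigmoid is guessed as the nonlinearity). It combines the
# precomputed input projection Wix_t with the previous output y_prev.
import numpy as np

def rnn_step_sketch(Wix_t, y_prev, W_yi, b_y):
    """Wix_t, y_prev, b_y: (n_out,); W_yi: (n_out, n_out). Returns y_t."""
    return 1.0 / (1.0 + np.exp(-(Wix_t + np.dot(y_prev, W_yi) + b_y)))

_n_out_demo = 3
print(rnn_step_sketch(np.ones(_n_out_demo, dtype="float32"),
                      np.zeros(_n_out_demo, dtype="float32"),
                      np.zeros((_n_out_demo, _n_out_demo), dtype="float32"),
                      np.zeros(_n_out_demo, dtype="float32")))
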
Example #47
    def __init__(self, data_dir, word2vec, word_vector_size, truncate_gradient, learning_rate, dim, cnn_dim, cnn_dim_fc, story_len,
                patches, mode, answer_module, memory_hops, batch_size, l2,
                normalize_attention, batch_norm, dropout, **kwargs):
        
        print "==> not used params in DMN class:", kwargs.keys()

        self.data_dir = data_dir
        self.learning_rate = learning_rate
        
        self.truncate_gradient = truncate_gradient
        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.cnn_dim = cnn_dim
        self.cnn_dim_fc = cnn_dim_fc
        self.story_len = story_len
        self.mode = mode
        self.patches = patches
        self.answer_module = answer_module
        self.memory_hops = memory_hops
        self.batch_size = batch_size
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.batch_norm = batch_norm
        self.dropout = dropout

        self.vocab, self.ivocab = self._load_vocab(self.data_dir)

        self.train_story = None
        self.test_story = None
        self.train_dict_story, self.train_lmdb_env_fc, self.train_lmdb_env_conv = self._process_input_sind(self.data_dir, 'train')
        self.test_dict_story, self.test_lmdb_env_fc, self.test_lmdb_env_conv = self._process_input_sind(self.data_dir, 'val')

        self.train_story = self.train_dict_story.keys()
        self.test_story = self.test_dict_story.keys()
        self.vocab_size = len(self.vocab)

        # Since this is pretty expensive, we will pass a story each time.
        # We assume that the input has been processed such that the sequences of patches
        # follow a snake-like path.

        self.input_var = T.tensor4('input_var') # (batch_size, seq_len, patches, cnn_dim)
        self.q_var = T.matrix('q_var') # Now, it's batch_size * image_size.
        self.answer_var = T.imatrix('answer_var') # answer of example in minibatch
        self.answer_mask = T.matrix('answer_mask')
        self.answer_inp_var = T.tensor3('answer_inp_var') # answer of example in minibatch
        
        print "==> building input module"
        self.W_inp_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim))
        #self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim,))
        # First, we embed the visual features before sending it to the bi-GRUs.

        inp_rhp = T.reshape(self.input_var, (self.batch_size* self.story_len* self.patches, self.cnn_dim))
        inp_rhp_dimshuffled = inp_rhp.dimshuffle(1,0)
        inp_rhp_emb = T.dot(self.W_inp_emb_in, inp_rhp_dimshuffled)
        inp_rhp_emb_dimshuffled = inp_rhp_emb.dimshuffle(1,0)
        inp_emb_raw = T.reshape(inp_rhp_emb_dimshuffled, (self.batch_size, self.story_len, self.patches, self.cnn_dim))
        inp_emb = T.tanh(inp_emb_raw) # Just follow the paper DMN for visual and textual QA.


        # Now, we use a bi-directional GRU to produce the input.
        # Forward GRU.
        self.inp_dim = self.dim/2 # since we have forward and backward
        self.W_inpf_res_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim))
        self.W_inpf_res_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim))
        self.b_inpf_res = nn_utils.constant_param(value=0.0, shape=(self.inp_dim,))
        
        self.W_inpf_upd_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim))
        self.W_inpf_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim))
        self.b_inpf_upd = nn_utils.constant_param(value=0.0, shape=(self.inp_dim,))
        
        self.W_inpf_hid_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim))
        self.W_inpf_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim))
        self.b_inpf_hid = nn_utils.constant_param(value=0.0, shape=(self.inp_dim,))
        # Backward GRU.
        self.W_inpb_res_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim))
        self.W_inpb_res_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim))
        self.b_inpb_res = nn_utils.constant_param(value=0.0, shape=(self.inp_dim,))
        
        self.W_inpb_upd_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim))
        self.W_inpb_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim))
        self.b_inpb_upd = nn_utils.constant_param(value=0.0, shape=(self.inp_dim,))
        
        self.W_inpb_hid_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim))
        self.W_inpb_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim))
        self.b_inpb_hid = nn_utils.constant_param(value=0.0, shape=(self.inp_dim,))

        # Now, we use the GRU to build the inputs.
        # Two levels of nested scan would get too complicated, so we loop over the batch in Python and scan over time instead.
        inp_dummy = theano.shared(np.zeros((self.inp_dim, self.story_len), dtype = floatX))
        for i in range(self.batch_size):
            if i == 0:
                inp_1st_f, _ = theano.scan(fn = self.input_gru_step_forward,
                                    sequences = inp_emb[i,:].dimshuffle(1,2,0),
                                    outputs_info=T.zeros_like(inp_dummy))

                inp_1st_b, _ = theano.scan(fn = self.input_gru_step_backward,
                                    sequences = inp_emb[i,:,::-1,:].dimshuffle(1,2,0),
                                    outputs_info=T.zeros_like(inp_dummy))
                # Now, combine them.
                inp_1st = T.concatenate([inp_1st_f.dimshuffle(2,0,1), inp_1st_b.dimshuffle(2,0,1)], axis = -1)
                self.inp_c = inp_1st.dimshuffle('x', 0, 1, 2)
            else:
                inp_f, _ = theano.scan(fn = self.input_gru_step_forward,
                                    sequences = inp_emb[i,:].dimshuffle(1,2,0),
                                    outputs_info=T.zeros_like(inp_dummy))

                inp_b, _ = theano.scan(fn = self.input_gru_step_backward,
                                    sequences = inp_emb[i,:,::-1,:].dimshuffle(1,2,0),
                                    outputs_info=T.zeros_like(inp_dummy))
                # Now, combine them.
                inp_fb = T.concatenate([inp_f.dimshuffle(2,0,1), inp_b.dimshuffle(2,0,1)], axis = -1)
                self.inp_c = T.concatenate([self.inp_c, inp_fb.dimshuffle('x', 0, 1, 2)], axis = 0)
        # Done: self.inp_c is batch_size x story_len x patches x dim.
        # Eventually, we can flatten it.
        # The last dimension is self.dim because the forward and backward outputs (each self.dim/2) are concatenated.
        inp_c_t = T.reshape(self.inp_c, (self.batch_size, self.story_len * self.patches, self.dim))
        inp_c_t_dimshuffled = inp_c_t.dimshuffle(0,'x', 1, 2)
        inp_batch = T.repeat(inp_c_t_dimshuffled, self.story_len, axis = 1)
        # Now it's ready for all the 5 images in the same story.
        # 50 * 980 * 512 
        self.inp_batch = T.reshape(inp_batch, (inp_batch.shape[0] * inp_batch.shape[1], inp_batch.shape[2], inp_batch.shape[3]))
        self.inp_batch_dimshuffled = self.inp_batch.dimshuffle(1,2,0) # 980 x 512 x 50
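        # Shape walk-through (illustrative, assuming batch_size=10, story_len=5,
        # patches=196, dim=512, matching the test shapes used further below):
        #   self.inp_c                  : (10, 5, 196, 512)
        #   inp_c_t                     : (10, 980, 512)
        #   inp_batch                   : (10, 5, 980, 512) -> reshaped to (50, 980, 512)
        #   self.inp_batch_dimshuffled  : (980, 512, 50)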
        
        
        # It's very simple now: the input module just needs to map from cnn_dim to dim.
        logging.info('self.cnn_dim = %d', self.cnn_dim)

        print "==> building question module"
        # Question embedding: map the cnn_dim_fc question feature into the model dimension.
        self.W_inp_emb_q = nn_utils.normal_param(std = 0.1, shape=(self.dim, self.cnn_dim_fc))
        self.b_inp_emb_q = nn_utils.normal_param(std = 0.1, shape=(self.dim,))
        q_var_shuffled = self.q_var.dimshuffle(1,0)

        inp_q = T.dot(self.W_inp_emb_q, q_var_shuffled) + self.b_inp_emb_q.dimshuffle(0,'x') # 512 x 50
        self.q_q = T.tanh(inp_q) # Since this is used to initialize the memory, we need to make it tanh.
        
        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim,))
        
        self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,))
        
        self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,))
        
        #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim,))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1,))
        

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            #m = printing.Print('mem')(memory[iter-1])
            current_episode = self.new_episode(memory[iter - 1])
            #current_episode = self.new_episode(m)
            #current_episode = printing.Print('current_episode')(current_episode)
            memory.append(self.GRU_update(memory[iter - 1], current_episode,
                                          self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, 
                                          self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
                                          self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid))                         
        
        last_mem_raw = memory[-1].dimshuffle((1, 0))
        
        net = layers.InputLayer(shape=(self.batch_size * self.story_len, self.dim), input_var=last_mem_raw)

        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net).dimshuffle((1, 0))

        logging.info('last_mem size')
        print last_mem.shape.eval({self.input_var: np.random.rand(10,5,196,512).astype('float32'),
            self.q_var: np.random.rand(50, 4096).astype('float32')})
       
        print "==> building answer module"

        answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1,2,0)
        # Now we need to map last_mem into a new space.
        self.W_mem_emb = nn_utils.normal_param(std = 0.1, shape = (self.dim, self.dim * 2))
        self.W_inp_emb = nn_utils.normal_param(std = 0.1, shape = (self.dim, self.vocab_size + 1))

        def _dot2(x, W):
            return  T.dot(W, x)

        answer_inp_var_shuffled_emb,_ = theano.scan(fn = _dot2, sequences = answer_inp_var_shuffled,
                non_sequences = self.W_inp_emb ) # seq x dim x batch
        
        # Embed the image (question) together with the final memory to seed the answer decoder.
        #q_q_shuffled = self.q_q.dimshuffle(1,0) # dim * batch.
        init_ans = T.concatenate([self.q_q, last_mem], axis = 0)

        mem_ans = T.dot(self.W_mem_emb, init_ans) # dim x batchsize.

        mem_ans_dim = mem_ans.dimshuffle('x',0,1)

        answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb], axis = 0)
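        # mem_ans acts as a pseudo start token (the "image" step) prepended to the word
        # embeddings; it is dropped again below together with the last step (results[1:-1]).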
        
        # Now we have both embeddings; we can feed them to the RNN.

        # We also need to map the input layer.

        dummy = theano.shared(np.zeros((self.dim, self.batch_size * self.story_len), dtype=floatX))

        self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size + 1, self.dim))
        
        self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,))
        
        self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,))
        
        self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,))

        logging.info('answer_inp size')

        #print answer_inp.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32')})
        
        #last_mem = printing.Print('prob_sm')(last_mem)
        results, _ = theano.scan(fn = self.answer_gru_step,
                sequences = answer_inp,
                outputs_info = [ dummy ])
        # Assume there is a start token 
        #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32'), 
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore')
        results = results[1:-1,:,:] # get rid of the last token as well as the first one (image)
        #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32'), 
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore')
            
        # Now, map the hidden states to vocabulary scores (the softmax is applied below).

        prob,_ = theano.scan(fn = lambda x, w: T.dot(w, x), sequences = results, non_sequences = self.W_a )

        prob_shuffled = prob.dimshuffle(2,0,1) # b * len * vocab


        logging.info("prob shape.")
        #print prob.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32'), 
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')})

        n = prob_shuffled.shape[0] * prob_shuffled.shape[1]
        prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2]))
        prob_sm = nn_utils.softmax_(prob_rhp)
        self.prediction = prob_sm

        mask =  T.reshape(self.answer_mask, (n,))
        lbl = T.reshape(self.answer_var, (n,))

        self.params = [self.W_inp_emb_in, #self.b_inp_emb_in, 
                  self.W_inpf_res_in, self.W_inpf_res_hid,self.b_inpf_res,
                  self.W_inpf_upd_in, self.W_inpf_upd_hid, self.b_inpf_upd,
                  self.W_inpf_hid_in, self.W_inpf_hid_hid, self.b_inpf_hid,
                  self.W_inpb_res_in, self.W_inpb_res_hid, self.b_inpb_res,
                  self.W_inpb_upd_in, self.W_inpb_upd_hid, self.b_inpb_upd,
                  self.W_inpb_hid_in, self.W_inpb_hid_hid, self.b_inpb_hid,
                  self.W_inp_emb_q, self.b_inp_emb_q,
                  self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, 
                  self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
                  self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b
                  self.W_1, self.W_2, self.b_1, self.b_2, self.W_a,
                  self.W_mem_emb, self.W_inp_emb,
                  self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, 
                  self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                  self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid,
                  ]            
                              
        print "==> building loss layer and computing updates"
        loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl)
        self.loss_ce = (mask * loss_vec ).sum() / mask.sum() 

        #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl)
            
        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0
        
        self.loss = self.loss_ce + self.loss_l2
            
        updates = lasagne.updates.adadelta(self.loss, self.params, learning_rate = self.learning_rate)
        #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001)
        
        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var], 
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)
        
        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var],
                                       outputs=[self.prediction, self.loss])
Example #48
    def construct_graph_popstats(self, args, x, drops, length, popstats=None):
        p = self.allocate_parameters(args)

        def stepfn(x, drops, dummy_h, dummy_c, pop_means_a, pop_means_b,
                   pop_means_c, pop_vars_a, pop_vars_b, pop_vars_c, h, c):

            atilde = T.dot(h, p.Wa)
            btilde = x
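            # atilde: recurrent contribution (h . Wa); btilde: the input contribution
            # (x here is already projected, xtilde = x . Wx, computed outside the scan).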
            if args.baseline:
                a_normal, a_mean, a_var = bn(atilde, 1.0, p.ab_betas,
                                             pop_means_a, pop_vars_a, args)
                b_normal, b_mean, b_var = bn(btilde, 1.0, 0, pop_means_b,
                                             pop_vars_b, args)
            else:
                a_normal, a_mean, a_var = bn(atilde, p.a_gammas, p.ab_betas,
                                             pop_means_a, pop_vars_a, args)
                b_normal, b_mean, b_var = bn(btilde, p.b_gammas, 0,
                                             pop_means_b, pop_vars_b, args)
            ab = a_normal + b_normal
            g, f, i, o = [
                fn(ab[:, j * args.num_hidden:(j + 1) * args.num_hidden])
                for j, fn in enumerate([self.activation] +
                                       3 * [T.nnet.sigmoid])
            ]

            if args.elephant:
                c_n = dummy_c + f * c + drops * (i * g)
            else:
                c_n = dummy_c + f * c + i * g
            if args.baseline:
                c_normal, c_mean, c_var = bn(c_n, 1.0, p.c_betas, pop_means_c,
                                             pop_vars_c, args)
            else:
                c_normal, c_mean, c_var = bn(c_n, p.c_gammas, p.c_betas,
                                             pop_means_c, pop_vars_c, args)
            h_n = dummy_h + o * self.activation(c_normal)

            ## Zoneout
            if args.zoneout:
                h = h_n * drops + (1 - drops) * h
                c = c_n * drops + (1 - drops) * c
            else:
                h = h_n
                c = c_n

            return (h, c, atilde, btilde, c_normal, a_mean, b_mean, c_mean,
                    a_var, b_var, c_var)

        xtilde = T.dot(x, p.Wx)
        if args.noise:
            # prime h with white noise
            Trng = MRG_RandomStreams()
            h_prime = Trng.normal((xtilde.shape[1], args.num_hidden),
                                  std=args.noise)
        elif args.summarize:
            # prime h with mean of example
            h_prime = x.mean(axis=[0, 2])[:, None]
        else:
            h_prime = 0

        dummy_states = dict(h=T.zeros(
            (xtilde.shape[0], xtilde.shape[1], args.num_hidden)),
                            c=T.zeros((xtilde.shape[0], xtilde.shape[1],
                                       args.num_hidden)))
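        # dummy_h / dummy_c are all-zero tensors passed into stepfn and added to the new
        # states; presumably this is the usual Theano trick for exposing per-timestep
        # gradients of h and c (differentiate with respect to the dummies).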

        if popstats is None:
            popstats = OrderedDict()
            for key, size in zip(
                    "abc",
                [4 * args.num_hidden, 4 * args.num_hidden, args.num_hidden]):
                for stat, init in zip("mean var".split(), [0, 1]):
                    name = "%s_%s" % (key, stat)
                    popstats[name] = theano.shared(init + np.zeros(
                        (
                            length,
                            size,
                        ), dtype=theano.config.floatX),
                                                   name=name)
        popstats_seq = [
            popstats['a_mean'], popstats['b_mean'], popstats['c_mean'],
            popstats['a_var'], popstats['b_var'], popstats['c_var']
        ]

        [
            h, c, atilde, btilde, htilde, batch_mean_a, batch_mean_b,
            batch_mean_c, batch_var_a, batch_var_b, batch_var_c
        ], _ = theano.scan(
            stepfn,
            sequences=[xtilde, drops, dummy_states["h"], dummy_states["c"]] +
            popstats_seq,
            outputs_info=[
                T.repeat(p.h0[None, :], xtilde.shape[1], axis=0) + h_prime,
                T.repeat(p.c0[None, :], xtilde.shape[1], axis=0), None, None,
                None, None, None, None, None, None, None
            ])

        batchstats = OrderedDict()
        batchstats['a_mean'] = batch_mean_a
        batchstats['b_mean'] = batch_mean_b
        batchstats['c_mean'] = batch_mean_c
        batchstats['a_var'] = batch_var_a
        batchstats['b_var'] = batch_var_b
        batchstats['c_var'] = batch_var_c

        updates = OrderedDict()
        if not args.use_population_statistics:
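            # Track population statistics as an exponential moving average of the
            # per-batch statistics (alpha = 1e-2).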
            alpha = 1e-2
            for key in "abc":
                for stat, init in zip("mean var".split(), [0, 1]):
                    name = "%s_%s" % (key, stat)
                    popstats[name].tag.estimand = batchstats[name]
                    updates[popstats[name]] = (alpha * batchstats[name] +
                                               (1 - alpha) * popstats[name])
        return dict(h=h, c=c, atilde=atilde, btilde=btilde,
                    htilde=htilde), updates, dummy_states, popstats
Example #49
def matrixify(vector, n):
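    # Stack `vector` n times along a new leading axis: shape (d,) -> (n, d).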
    return T.repeat(T.shape_padleft(vector), n, axis=0)
def onestep_attend_tell(x_t, pre_h, pre_c, pre_z, Wi, Wf, Wc, Wo, Ui, Uf, Uc, Uo, Zi, Zf, Zc, Zo, Zcontext, Hcontext, Va, bi, bf, bc, bo, image_feature_region, weight_y):
	#-------------------------------------------------
	# pre_h = T.tensor3(name = 'h0_initial', dtype = theano.config.floatX)
	# x_t   = T.tensor3(name ='x', dtype=theano.config.floatX)
	# pre_z = T.tensor3(name= 'z0_initial', dtype = theano.config.floatX)
	# Wi, Ui, Zi   =  T.fmatrices(3)
	# bi    = T.ftensor3("bi")
	#-------------------------------------------------

	i_t = T.dot(x_t, Wi) + T.dot(pre_h, Ui) + T.dot(pre_z, Zi)
	i_t_shape = T.shape(i_t)

	#------------------------------------------------------------------
	# i_t_test = i_t.eval({x_t: x_theano, pre_h: h0_theano, pre_z: z0_theano, Wi: Wx[:, :H], Ui: Wh[:, :H], Zi: Wz[:, :H]})
	# print(i_t_test.shape)
	# pdb.set_trace()    
	#------------------------------------------------------------------

	bi_reshape = T.repeat(bi, i_t_shape[0], 0)
	bi_reshape_2x = T.repeat(bi_reshape, i_t_shape[1], 1)

	# -----------------------------------------------------------------
	# bi_test = bi_reshape_2x.eval({bi: b_theano[:,:,:H], i_t: i_t.eval({i_t: np.zeros((1,2,4)).astype("float32")})})
	# print(bi_test.shape)
	# pdb.set_trace()
	# ------------------------------------------------------------------

	bf_reshape = T.repeat(bf, i_t_shape[0], 0)
	bf_reshape_2x = T.repeat(bf_reshape, i_t_shape[1], 1)

	bc_reshape = T.repeat(bc, i_t_shape[0], 0)
	bc_reshape_2x = T.repeat(bc_reshape, i_t_shape[1], 1)

	bo_reshape = T.repeat(bo, i_t_shape[0], 0)
	bo_reshape_2x = T.repeat(bo_reshape, i_t_shape[1], 1)

	i_t_new= sigmoid(i_t + bi_reshape_2x)
	

	# ------------------------------------------------------------------
	# i_t_new_eval = i_t_new.eval({i_t: np.zeros((1,2,4)).astype("float32"), bi: b_theano[:, : , :H]})
	# print(i_t_new_eval.shape)
	# pdb.set_trace()
	# --------------------------------------------------------------------
	

	f_t= sigmoid(T.dot(x_t, Wf) + T.dot(pre_h, Uf) + T.dot(pre_z, Zf) + bf_reshape_2x)
	# --------------------------------------------------------------------
	# f_t_eval = f_t.eval({x_t:x_theano, pre_h: h0_theano, pre_z: z0_theano,
	# 	Wf: Wx[:, H:2*H],
	# 	Uf: Wh[:, H:2*H],
	# 	Zf: Wz[:, H:2*H],
	# 	bf: b_theano[:, :, H:2*H],
	# 	i_t: np.zeros((1,2,4)).astype("float32")})

	# print(f_t_eval.shape)
	# pdb.set_trace()
	# --------------------------------------------------------------------

	o_t= sigmoid(T.dot(x_t, Wo) + T.dot(pre_h, Uo) + T.dot(pre_z, Zo) + bo_reshape_2x)

	c_th = tanh(T.dot(x_t, Wc)  + T.dot(pre_h, Uc) + T.dot(pre_z, Zc) + bc_reshape_2x)

	c_t = f_t*pre_c + i_t_new*c_th

	h_t = o_t*T.tanh(c_t) #shape (1, N, h_dim)

	# ------------------------------------------------------------------
	# ht_test = h_t.eval({x_t:x_theano, pre_h: h0_theano, pre_c: c0_theano, pre_z: z0_theano, 
	# 	Wi: Wx[:, :H], Wf: Wx[:, H:2*H], Wo: Wx[:, 2*H:3*H], Wc: Wx[:, 3*H:],
	# 	Ui: Wh[:, :H], Uf: Wh[:, H:2*H], Uo: Wh[:, 2*H:3*H], Uc: Wh[:, 3*H:],
	# 	Zi: Wz[:, :H], Zf: Wz[:, H:2*H], Zo: Wz[:, 2*H:3*H], Zc: Wz[:, 3*H:], 
	# 	bi: b_theano[:,:,:H], bf: b_theano[:, :, H:2*H], bo: b_theano[ :, :, 2*H:3*H], bc: b_theano[:,:, 3*H:]})
	# print(ht_test.shape)
	# pdb.set_trace()
	# ------------------------------------------------------------------

	h_t_context = T.repeat(h_t, image_feature_region.shape[1], axis = 0) #new shape (No_region, N, h_dim)
	image_feature_reshape = T.transpose(image_feature_region, (1, 0, 2))
	#compute non-linear correlation between h_t(current text) to image_feature_region (64 for 128*128 and 196 for 224*224)
	# pdb.set_trace()
	m_t = T.tanh(T.dot(h_t_context, Hcontext) + T.dot(image_feature_reshape, Zcontext)) #shape (No_region, N, context_dim)	

	# ------------------------------------------------------------------
	# N = 2 #number of sample
	# D = 5 #dimension of input
	# H = 4 #dimension of hidden
	# T_new = 1 #length of per each sample
	# context_dim = 3
	# K = 5

	# x = np.linspace(-0.4, 0.6, num=N*T_new*D, dtype = theano.config.floatX).reshape(T_new, N, D)
	# h0= np.linspace(-0.4, 0.8, num=N*H, dtype = theano.config.floatX).reshape(N, H)
	# Wx= np.linspace(-0.2, 0.9, num=4*D*H, dtype = theano.config.floatX).reshape(D, 4*H)
	# Wh= np.linspace(-0.3,0.6, num =4*H*H, dtype = theano.config.floatX).reshape(H,4*H)
	# b = np.linspace(0.0, 0.0, num = 4*H, dtype = theano.config.floatX)
	# Wz= np.linspace(-0.3, 0.6, num=4*H*context_dim, dtype = theano.config.floatX).reshape(context_dim, 4*H)
	# Hcontext_in = np.linspace(-0.2, 0.6, num=H*K, dtype = theano.config.floatX).reshape(H, K)
	# Zcontext_in = np.linspace(-0.2, 0.5, num=context_dim*K, dtype= theano.config.floatX).reshape(context_dim, K)
	# Va= np.linspace(0.1, 0.4, num=K, dtype = theano.config.floatX)
	# Va_reshape = Va.reshape(K,1)

	# image_feature_3D = np.linspace(-0.2, 0.5, num=10*N*context_dim, dtype = theano.config.floatX).reshape(N,10, context_dim)

	# h0_theano = h0.reshape(1, N, H)
	# # h0_symb   = theano.tensor.ftensor3("h_symb")
	# # lstm_theano_layer.h_m1.set_value(h0_theano)

	# c0_theano = np.zeros((1, N, H), dtype = theano.config.floatX)
	# # c0_symb   = theano.tensor.ftensor3("c_symb")
	# # lstm_theano_layer.c_m1.set_value(c0_theano)

	# z0_theano = np.zeros((1, N, context_dim), dtype = theano.config.floatX)

	# x_theano = x.reshape(T_new, N, D)
	# image_feature_input = image_feature_3D

	# weight_y_in_value = np.zeros(( 10, context_dim) , dtype= theano.config.floatX)
	# b_theano= b.reshape(1, 1, 4*H)

	# h_t_context_eval = m_t.eval({h_t: np.ones((1,2,4)).astype("float32"), image_feature_region: image_feature_input, Hcontext: Hcontext_in, Zcontext: Zcontext_in})
	# print(h_t_context_eval.shape)
	# pdb.set_trace()
	# ------------------------------------------------------------------

	
	e = T.dot(m_t, Va) #No_region, N, 1
	e_reshape = e.reshape((e.shape[0], T.prod(e.shape[1:])))
	
	# ------------------------------------------------------------------
	# Va_in= np.linspace(0.1, 0.4, num=5*1, dtype = theano.config.floatX).reshape(5,1)
	# Va_reshape = Va_in.reshape(5,1).astype("float32")
	# # print(Va_reshape)
	# e_val = e_reshape.eval({m_t: np.ones((10,2,5)).astype("float32"), Va: Va_reshape}) #np.ones((10,2,5)).astype("float32")
	# print(e_val.shape)
	# ------------------------------------------------------------------
	


	e_softmax = softmax_along_axis(e_reshape, axis = 0) #shape No_region, N
	
	# -------------------------------------------------------------------
	# pdb.set_trace()
	# e_softmax_eval = e_softmax.eval({e_reshape: np.random.randn(10,2).astype("float32")})
	# print(e_softmax_eval.shape)
	# -------------------------------------------------------------------

	e_t = T.transpose(e_softmax, (1,0)) #shape N, No_region
	e_t_r = e_t.reshape([-1, e_softmax.shape[0], e_softmax.shape[1]]) #3D tensor 1, N, No_region
	e_t_r_t = T.transpose(e_t_r, (1,0, 2)) # shape N, 1, No_region
	e_3D = T.repeat(e_t_r_t, e_t_r_t.shape[2], axis = 1) #shape N, No_region, No_region  image_feature_region.shape[1]
	e_3D_t = T.transpose(e_3D, (1,2,0)) #No_region, No_region, N

	# ---------------------------------------------------------------------
	# image_feature_3D = np.linspace(-0.2, 0.5, num=10*2*3, dtype = theano.config.floatX).reshape(2,10, 3)
	# e_3D_t_eval = e_3D_t.eval({e_softmax: np.random.randn(10,2).astype("float32")})
	# print(e_3D_t_eval.shape)
	# pdb.set_trace()
	# ---------------------------------------------------------------------

	identity_2D = T.identity_like(e_3D_t)# shape No_region, No_region
	identity_3D = identity_2D.reshape([-1, identity_2D.shape[0], identity_2D.shape[1]]) # shape 1, No_region, No_region
	identity_3D_t = T.repeat(identity_3D,  image_feature_region.shape[0], axis = 0)
	e_3D_diagonal = e_3D*identity_3D_t #diagonal tensor 3D  (N, No_region, No_region)
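	# e_3D_diagonal places each sample's attention weights on the diagonal, so the scan
	# below effectively multiplies every region's feature vector by its attention weight.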

	# ----------------------------------------------------------------------
	# image_feature_3D = np.linspace(-0.2, 0.5, num=10*2*3, dtype = theano.config.floatX).reshape(2,10, 3)
	
	# e_3D_diagonal_eval = e_3D_diagonal.eval({e_3D_t: np.ones((10, 10, 2)).astype("float32"), 
	# 	image_feature_region: image_feature_3D, 
	# 	e_3D: np.ones((2, 10, 10)).astype("float32")})
	
	# print(e_3D_diagonal_eval)
	# pdb.set_trace()
	# ----------------------------------------------------------------------

	# weight_y = T.fmatrix("weight_y")

	out_weight_y, updates = theano.scan(fn=onestep_weight_feature_multiply,
	                                   outputs_info=[weight_y],
	                                   sequences=[e_3D_diagonal, image_feature_region],
	                                   non_sequences=[])

	#out_weight_y shape (N, No_region, feature_dim)
	z_t = T.sum(out_weight_y, axis = 1) #shape (N, feature_dim)

	z_t_r = z_t.reshape((-1,z_t.shape[0],z_t.shape[1]))

	# ------------------------------------------------------------------------
	# pdb.set_trace()
	# image_feature_3D = np.linspace(-0.2, 0.5, num=10*2*3, dtype = theano.config.floatX).reshape(2,10, 3)
	# z_t_r_eval = z_t_r.eval({e_3D_diagonal: np.ones((2,10,10)).astype("float32"), 
	# 	image_feature_region: image_feature_3D, 
	# 	weight_y: np.zeros((10,3)).astype("float32")})
	# print(z_t_r_eval.shape)
	# pdb.set_trace()
	# -----------------------------------------------------------------------

	return [h_t, c_t, z_t_r]
    def initial_states(self, batch_size, *args, **kwargs):
        return tensor.repeat(
            tensor.ones(self.parameters[1][None, :].shape),
            batch_size,
            0)
Example #52
    def _get_initial_states(self, batch_size):
        init_h = T.repeat(self.init_h.dimshuffle('x', 0), batch_size, axis=0)
        init_o = apply_model(self.readout, init_h)
        return init_h, init_o
Example #53
    def __init__(self, x_h_0, v_h_0, t_h_0, x_t_0, v_t_0, a_t_0, t_t_0,
                 time_steps, exist, is_leader, x_goal, turn_vec_h, turn_vec_t,
                 n_steps, lr, game_params, arch_params, solver_params, params):

        self._init_layers(params, arch_params, game_params)

        self._connect(game_params, solver_params)

        def _dist_from_rail(pos, rail_center, rail_radius):
            d = tt.sqrt(((pos - rail_center)**2).sum())
            return tt.sum((d - rail_radius)**2)

        def _step_state(x_h_, v_h_, angle_, speed_, t_h_, turn_vec_h, x_t_,
                        v_t_, t_t_, turn_vec_t, ctrl, exist, time_step):

            a_t_e, v_t_e, x_t_e, t_t, t_h = step(x_h_, v_h_, t_h_, turn_vec_h,
                                                 x_t_, v_t_, t_t_, turn_vec_h,
                                                 exist, time_step)

            t_h = common.disconnected_grad(t_h)
            t_t = common.disconnected_grad(t_t)

            # approximate dynamics of the unobserved parts of the state
            a_t_a = tt.zeros(shape=(3, 2), dtype=np.float32)

            v_t_a = v_t_

            x_t_a = x_t_ + self.dt * v_t_a

            # difference in predictions
            n_v_t = v_t_e - v_t_a

            n_a_t = a_t_e - a_t_a

            n_x_t = x_t_e - x_t_a

            # disconnect the gradient of the noise signals
            n_v_t = common.disconnected_grad(n_v_t)

            n_a_t = common.disconnected_grad(n_a_t)

            n_x_t = common.disconnected_grad(n_x_t)

            # add the noise to the approximation
            a_t = a_t_a + n_a_t

            v_t = v_t_a + n_v_t

            x_t = x_t_a + n_x_t

            # update the observed part of the state
            delta_steer = ctrl[0]
            accel = ctrl[1]

            delta_steer = tt.clip(delta_steer, -np.pi / 4, np.pi / 4)

            angle = angle_ + delta_steer

            speed = speed_ + accel * self.dt

            speed = tt.clip(speed, 0, self.v_max)

            v_h_x = speed * tt.sin(angle)
            v_h_y = speed * tt.cos(angle)

            v_h = tt.stack([v_h_x, v_h_y])

            x_h = x_h_ + self.dt * v_h
            x_h = tt.clip(x_h, -self.bw, self.bw)

            return x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t

        def _recurrence(time_step, x_h_, v_h_, angle_, speed_, t_h_, x_t_,
                        v_t_, a_t_, t_t_, exist, is_leader, x_goal, turn_vec_h,
                        turn_vec_t):
            # state
            '''
            1. host
                1.1 position (2) - (x,y) coordinates in cross coordinate system
                1.2 speed (2) - (v_x,v_y)
                # 1.3 acceleration (2) - (a_x,a_y)
                # 1.4 waiting time (1) - start counting on full stop. stop counting when clearing the junction
                1.5 x_goal (2) - destination position (indicates different turns)
                total = 5
            2. right lane car
                2.1 position (2) - null value = (-1,-1)
                2.2 speed (2) - null value = (0,0)
                2.3 acceleration (2) - null value = (0,0)
                2.4 waiting time (1) - null value = 0
                total = 7
            3. front lane car
                3.1 position (2)
                3.2 speed (2)
                3.3 acceleration (2)
                3.4 waiting time (1)
                total = 7
            4. target 3
                4.1 position (2)
                4.2 speed (2)
                4.3 acceleration (2)
                4.4 waiting time (1)
                total = 7
            total = 26
            '''

            # host_state_vec = tt.concatenate([x_h_, v_h_, t_h_])
            ang_spd = tt.stack([angle_, speed_])
            host_state_vec = tt.concatenate([x_h_, ang_spd, x_goal])

            # target_state_vec = tt.concatenate([tt.flatten(x_t_), tt.flatten(v_t_), tt.flatten(a_t_), tt.flatten(t_t_)])
            target_state_vec = tt.concatenate([
                tt.flatten(x_t_),
                tt.flatten(v_t_),
                tt.flatten(a_t_), is_leader
            ])

            state = tt.concatenate([host_state_vec, target_state_vec])

            h0 = tt.dot(state, self.W_0) + self.b_0

            relu0 = tt.nnet.relu(h0)

            h1 = tt.dot(relu0, self.W_1) + self.b_1

            relu1 = tt.nnet.relu(h1)

            h2 = tt.dot(relu1, self.W_2) + self.b_2

            relu2 = tt.nnet.relu(h2)

            a_h = tt.dot(relu2, self.W_c)
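            # a_h is the 2-d control output [delta_steer, accel]; see _step_state above.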

            x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t = _step_state(
                x_h_, v_h_, angle_, speed_, t_h_, turn_vec_h, x_t_, v_t_, t_t_,
                turn_vec_t, a_h, exist, time_step)

            # cost:

            discount_factor = 0.99**time_step

            # 0. smooth driving policy
            cost_steer = discount_factor * a_h[0]**2
            cost_accel = discount_factor * a_h[1]**2

            # 1. forcing the host to move forward
            dist_from_goal = tt.mean((x_goal - x_h)**2)

            cost_progress = discount_factor * dist_from_goal

            # 2. keeping distance from in front vehicles
            d_t_h = x_t - x_h

            h_t_dists = (d_t_h**2).sum(axis=1)

            # v_h_norm = tt.sqrt((v_h**2).sum())
            # d_t_h_norm = tt.sqrt((d_t_h**2).sum(axis=1))
            #
            # denominator = v_h_norm * d_t_h_norm
            #
            # host_targets_orientation = tt.dot(d_t_h, v_h) / (denominator + 1e-3)
            #
            # in_fornt_targets = tt.nnet.sigmoid(5 * host_targets_orientation)
            #
            # close_targets = tt.sum(tt.abs_(d_t_h))
            #
            # cost_accident = tt.mean(in_fornt_targets * close_targets)

            cost_accident = tt.sum(
                tt.nnet.relu(self.require_distance - h_t_dists))

            # 3. rail divergence
            cost_right_rail = _dist_from_rail(
                x_h, self.right_rail_center,
                self.right_rail_radius) * turn_vec_h[0]
            cost_front_rail = (x_h[0] - self.lw / 2)**2 * turn_vec_h[1]
            cost_left_rail = _dist_from_rail(
                x_h, self.left_rail_center,
                self.left_rail_radius) * turn_vec_h[2]

            cost_rail = cost_right_rail + cost_left_rail + cost_front_rail

            return (x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t,
                    cost_steer, cost_accel, cost_progress, cost_accident,
                    cost_rail,
                    a_h), t.scan_module.until(dist_from_goal < 0.001)

        [
            x_h, v_h, angle, speed, t_h, x_t, v_t, a_t, t_t, costs_steer,
            costs_accel, costs_progress, costs_accident, costs_rail, a_hs
        ], scan_updates = t.scan(
            fn=_recurrence,
            sequences=time_steps,
            outputs_info=[
                x_h_0, v_h_0, 0., 0., t_h_0, x_t_0, v_t_0, a_t_0, t_t_0, None,
                None, None, None, None, None
            ],
            non_sequences=[exist, is_leader, x_goal, turn_vec_h, turn_vec_t],
            n_steps=n_steps,
            name='scan_func')

        # 3. right of way cost term

        T = x_h.shape[0]

        x_h_rpt_1 = tt.repeat(x_h, T, axis=1)  # (Tx2T)

        x_h_rpt_1_3d = x_h_rpt_1.dimshuffle(0, 1, 'x')  # (Tx2Tx1)

        x_h_3D = tt.repeat(x_h_rpt_1_3d, 3, axis=2)  # (Tx2Tx3)

        x_t_rshp_1 = tt.zeros(shape=(2 * T, 3), dtype=np.float32)  # (2Tx3)

        x_t_rshp_1_x = tt.set_subtensor(x_t_rshp_1[:T, :], x_t[:, :, 0])

        x_t_rshp_1_xy = tt.set_subtensor(x_t_rshp_1_x[T:, :], x_t[:, :, 1])

        x_t_rshp_1_3d = x_t_rshp_1_xy.dimshuffle(0, 1, 'x')  # (2Tx3x1)

        x_t_rpt_2_3d = tt.repeat(x_t_rshp_1_3d, T, axis=2)  # (2Tx3xT)

        x_t_3D = x_t_rpt_2_3d.dimshuffle(2, 0, 1)  # (Tx2Tx3)

        # abs_diff_mat = tt.abs_(x_h_3D - x_t_3D) # (Tx2Tx3)
        abs_diff_mat = (x_h_3D - x_t_3D)**2  # (Tx2Tx3)

        dists_mat = abs_diff_mat[:, :T, :] + abs_diff_mat[:, T:, :]  # d_x + d_y: (TxTx3)

        # punish only when cutting a leader
        host_effective_dists = (tt.triu(dists_mat[:, :, 0]) * is_leader[0] +
                                tt.triu(dists_mat[:, :, 1]) * is_leader[1] +
                                tt.triu(dists_mat[:, :, 2]) * is_leader[2])

        costs_row = tt.mean(
            tt.nnet.sigmoid(self.eps_row - host_effective_dists))

        self.cost_steer = tt.mean(costs_steer)
        self.cost_accel = tt.mean(costs_accel)
        self.cost_progress = tt.mean(costs_progress)
        self.cost_accident = tt.mean(costs_accident)
        self.cost_row = tt.mean(costs_row)
        self.cost_rail = tt.mean(costs_rail)

        self.weighted_cost = (
            self.w_delta_steer * self.cost_steer +
            self.w_accel * self.cost_accel +
            self.w_progress * self.cost_progress +
            self.w_accident * self.cost_accident +
            # self.w_row * self.cost_row
            self.w_rail * self.cost_rail)

        self.cost = (
            self.cost_steer + self.cost_accel + self.cost_progress +
            self.cost_accident +
            # self.cost_row
            self.cost_rail)

        objective = self.weighted_cost

        objective = common.weight_decay(objective=objective,
                                        params=self.params,
                                        l1_weight=self.l1_weight)

        objective = t.gradient.grad_clip(objective, -self.grad_clip_val,
                                         self.grad_clip_val)

        gradients = tt.grad(objective, self.params)

        self.updates = optimizers.optimizer(lr=lr,
                                            param_struct=self,
                                            gradients=gradients,
                                            solver_params=solver_params)

        self.x_h = x_h
        self.v_h = v_h
        self.x_t = x_t
        self.v_t = v_t

        self.max_a = tt.max(abs(a_hs))

        self.max_grad_val = 0
        self.grad_mean = 0
        for g in gradients:
            self.grad_mean += tt.mean(tt.abs_(g))
            self.max_grad_val = (tt.max(g) > self.max_grad_val) * tt.max(g) + (
                tt.max(g) <= self.max_grad_val) * self.max_grad_val

        self.params_abs_norm = self._calc_params_norm()
    def training_cost_weighted(self, y, weights=None):
        """ Wrapper for standard name """
        LL = T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]
        weights = T.repeat(weights.dimshuffle('x', 0), y.shape[0], axis=0)
        factors = weights[T.arange(y.shape[0]), y]
        return -T.mean(LL * factors)
Example #55
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=4,
                    emb_size=40,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il5-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il5_system_output_forfun_w2.json'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # fixed random seed so the model produces the same results each run
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))
    word2id = {}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types(
        word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others(
        word2id, maxSentLen)
    test_sents, test_masks, test_lines, word2id = load_official_testData(
        word2id, maxSentLen, test_file_path)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_p1_sents = np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels = np.asarray(train_p1_labels, dtype='int32')
    train_p1_size = len(train_p1_labels)

    train_p2_sents = np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels = np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size = len(train_p2_labels)
    '''
    combine train_p1 and train_p2
    '''
    train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0)
    train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0)
    train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0)
    train_size = train_p1_size + train_p2_size

    test_sents = np.asarray(test_sents, dtype='int32')
    test_masks = np.asarray(test_masks, dtype=theano.config.floatX)
    # test_labels=np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_sents)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec',
        emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec'
    ], 40)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    other_labels = T.imatrix()  #batch*4

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(tyope_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)
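    # Repeating the sentences type_size times and tiling the descriptions batch_size
    # times aligns them row-by-row, so each (sentence, type) pair can be compared with
    # batched_dot in the fine-grained cosine block below.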

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]
    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out so features at UNK/padded positions are zeroed
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size): each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  # multiply the mask with conv_out so features at UNK/padded positions are zeroed
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size): each sentence then has an embedding of length hidden_size
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  # U1 and W1 each contain 3 matrices; b1 is the bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    '''
    cross-DNN-dataless
    '''
    #first map label emb into hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, emb_size, hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=bow_des,
                             n_in=emb_size,
                             n_out=hidden_size[0],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    des_rep_hidden = HL_layer_1.output  #(type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot(
        des_rep_hidden.T))  #(batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:]  # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)

    acnn_LR_input = T.concatenate([
        dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix,
        top_k_score_matrix, sent_embeddings, sent_embeddings2,
        gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb
    ],
                                  axis=1)
    acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size
    # classification layer: maps the concatenated feature vector (size acnn_LR_input_size) to scores for the 12 SF types
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))
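    # Each of the 12 types is treated as an independent binary label: take the sigmoid
    # score (or its complement for negative labels) and average the negative log-likelihood.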

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size,
                                                     16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng,
                                             input=acnn_LR_input,
                                             n_in=acnn_LR_input_size,
                                             n_out=16,
                                             W=acnn_other_U_a,
                                             b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(
        acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape(
        (batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[
        T.repeat(T.arange(batch_size), 4),
        T.tile(T.arange(4), (batch_size)),
        other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))
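    # acnn_other_prob picks, per sample and per "other" field, the predicted probability
    # of the gold label; the loss is its mean negative log-likelihood.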

    params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params  # put all model parameters together
    cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() +
                               (conv_att_W**2).sum() + (conv_att_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + acnn_other_LR_para
    cost_other = cost + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = acnn_score_matrix  #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores  #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3  #(batch, 4, 3)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    train_p2_model = theano.function([
        sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask,
        other_labels
    ],
                                     cost_other,
                                     updates=other_updates,
                                     allow_input_downcast=True,
                                     on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        [binarize_prob, ensemble_scores, sum_tensor3],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_train_p2_batches = train_p2_size / batch_size
    train_p2_batch_start = list(np.arange(n_train_p2_batches) *
                                batch_size) + [train_p2_size - batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev=0.0
    # max_meanf1_test=0.0
    # max_weightf1_test=0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i = 0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_p1_model(train_sents[train_id_batch],
                                     train_masks[train_id_batch],
                                     train_labels[train_id_batch], label_sent,
                                     label_mask)

            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id +
                                                     batch_size]
                other_cost_i += train_p2_model(
                    train_p2_sents[train_p2_id_batch],
                    train_p2_masks[train_p2_id_batch],
                    train_p2_labels[train_p2_id_batch], label_sent, label_mask,
                    train_p2_other_labels[train_p2_id_batch])
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size]
            #     other_cost_i+=train_p2_model(
            #                         train_p2_sents[train_p2_id_batch],
            #                         train_p2_masks[train_p2_id_batch],
            #                         train_p2_labels[train_p2_id_batch],
            #                         label_sent,
            #                         label_mask,
            #                         train_p2_other_labels[train_p2_id_batch]
            #                         )
            #after each 1000 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), str(
                        other_cost_i /
                        iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_types = []
                pred_confs = []
                pred_others = []
                for i, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    pred_types_i, pred_conf_i, pred_fields_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    if i < len(test_batch_start) - 1:
                        pred_types.append(pred_types_i)
                        pred_confs.append(pred_conf_i)
                        pred_others.append(pred_fields_i)
                    else:
                        pred_types.append(pred_types_i[-n_test_remain:])
                        pred_confs.append(pred_conf_i[-n_test_remain:])
                        pred_others.append(pred_fields_i[-n_test_remain:])
                pred_types = np.concatenate(pred_types, axis=0)
                pred_confs = np.concatenate(pred_confs, axis=0)
                pred_others = np.concatenate(pred_others, axis=0)
                mean_frame = generate_output_for_EDL(test_lines,
                                                     output_file_path,
                                                     pred_types, pred_confs,
                                                     pred_others,
                                                     min_mean_frame)
                # mean_frame = generate_2017_official_output(test_lines, output_file_path, pred_types, pred_confs, pred_others, min_mean_frame)
                if mean_frame < min_mean_frame:
                    min_mean_frame = mean_frame
                print '\t\t\t test  over, min_mean_frame:', min_mean_frame

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Exemple #56
0
    def repeat(self, t, n):
        return T.repeat(t, n)
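
A minimal, self-contained usage sketch for the thin wrapper above (variable names are illustrative, not from the original snippet): T.repeat tiles a tensor along a given axis and agrees with numpy.repeat.

import numpy
import theano
import theano.tensor as T

m = T.matrix('m')
f = theano.function([m], T.repeat(m, 3, axis=1))  # repeat every column 3 times

data = numpy.arange(6, dtype=theano.config.floatX).reshape((2, 3))
assert numpy.allclose(f(data), numpy.repeat(data, 3, axis=1))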
Exemple #57
0
def evaluate_lenet5(learning_rate=0.2,
                    n_epochs=2000,
                    nkerns=[6, 14],
                    batch_size=70,
                    useAllSamples=0,
                    kmax=30,
                    ktop=4,
                    filter_size=[7, 5],
                    L2_weight=0.00005,
                    dropout_p=0.8,
                    useEmb=0,
                    task=2,
                    corpus=1,
                    dataMode=3,
                    maxSentLength=60):
    #def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 12], batch_size=70, useAllSamples=0, kmax=30, ktop=5, filter_size=[10,7],
    #                    L2_weight=0.000005, dropout_p=0.5, useEmb=0, task=5, corpus=1):

    root = "/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
    embeddingPath = '/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
    embeddingPath2 = '/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
    rng = numpy.random.RandomState(23455)
    datasets, embedding_size, embeddings, embeddings_Q, unigram = read_data_WP(
        root + str(task) + 'classes/' + str(corpus) + 'train.txt',
        root + str(task) + 'classes/' + str(corpus) + 'dev.txt',
        root + str(task) + 'classes/' + str(corpus) + 'test.txt',
        embeddingPath, maxSentLength, useEmb, dataMode)
    #datasets, embedding_size, embeddings=read_data(root+'2classes/train.txt', root+'2classes/dev.txt', root+'2classes/test.txt', embeddingPath,60)

    #datasets = load_data(dataset)
    indices_train, trainY, trainLengths, trainLeftPad, trainRightPad = datasets[
        0]
    indices_dev, devY, devLengths, devLeftPad, devRightPad = datasets[1]
    indices_test, testY, testLengths, testLeftPad, testRightPad = datasets[2]
    n_train_batches = indices_train.shape[0] / batch_size
    n_valid_batches = indices_dev.shape[0] / batch_size
    n_test_batches = indices_test.shape[0] / batch_size
    remain_train = indices_train.shape[0] % batch_size

    train_batch_start = []
    dev_batch_start = []
    test_batch_start = []
    if useAllSamples:
        train_batch_start = list(
            numpy.arange(n_train_batches) *
            batch_size) + [indices_train.shape[0] - batch_size]
        dev_batch_start = list(numpy.arange(n_valid_batches) * batch_size) + [
            indices_dev.shape[0] - batch_size
        ]
        test_batch_start = list(numpy.arange(n_test_batches) * batch_size) + [
            indices_test.shape[0] - batch_size
        ]
        n_train_batches = n_train_batches + 1
        n_valid_batches = n_valid_batches + 1
        n_test_batches = n_test_batches + 1
    else:
        train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
        dev_batch_start = list(numpy.arange(n_valid_batches) * batch_size)
        test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_theano = theano.shared(numpy.asarray(
        indices_train, dtype=theano.config.floatX),
                                         borrow=True)
    indices_dev_theano = theano.shared(numpy.asarray(
        indices_dev, dtype=theano.config.floatX),
                                       borrow=True)
    indices_test_theano = theano.shared(numpy.asarray(
        indices_test, dtype=theano.config.floatX),
                                        borrow=True)
    indices_train_theano = T.cast(indices_train_theano, 'int32')
    indices_dev_theano = T.cast(indices_dev_theano, 'int32')
    indices_test_theano = T.cast(indices_test_theano, 'int32')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x_index = T.imatrix(
        'x_index')  # now, x is the index matrix, must be integer
    y = T.ivector('y')
    z = T.ivector('z')
    left = T.ivector('left')
    right = T.ivector('right')

    x = embeddings[x_index.flatten()].reshape(
        (batch_size, maxSentLength, embedding_size)).transpose(0, 2,
                                                               1).flatten()
    ishape = (embedding_size, maxSentLength)  # size of each sentence "image": embedding_size x maxSentLength
    filter_size1 = (embedding_size, filter_size[0])
    filter_size2 = (embedding_size / 2, filter_size[1])
    #poolsize1=(1, ishape[1]-filter_size1[1]+1) #?????????????????????????????
    poolsize1 = (1, ishape[1] + filter_size1[1] - 1)
    '''
    left_after_conv=T.maximum(0,left-filter_size1[1]+1)
    right_after_conv=T.maximum(0, right-filter_size1[1]+1)
    '''
    left_after_conv = left
    right_after_conv = right

    #kmax=30 # this can not be too small, like 20
    #ktop=6
    #poolsize2=(1, kmax-filter_size2[1]+1) #(1,6)
    poolsize2 = (1, kmax + filter_size2[1] - 1)  #(1,6)
    dynamic_lengths = T.maximum(ktop, z / 2 + 1)  # dynamic k-max pooling
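    # per-example k for the first pooling layer: at least ktop, growing with the
    # (integer-divided) sentence length z so longer sentences keep more activations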
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, ishape[0], ishape[1]))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    '''
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1, k=kmax)
    '''
    layer0 = Conv_Fold_DynamicK_PoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]),
        poolsize=poolsize1,
        k=dynamic_lengths,
        unifiedWidth=kmax,
        left=left_after_conv,
        right=right_after_conv,
        firstLayer=True)

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
    '''
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], ishape[0], kmax),
            filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop)
    '''
    '''
    left_after_conv=T.maximum(0, layer0.leftPad-filter_size2[1]+1)
    right_after_conv=T.maximum(0, layer0.rightPad-filter_size2[1]+1)
    '''
    left_after_conv = layer0.leftPad
    right_after_conv = layer0.rightPad
    dynamic_lengths = T.repeat([ktop], batch_size)  # dynamic k-max pooling
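    # the second pooling layer uses a fixed k: T.repeat tiles the scalar ktop into
    # a length-batch_size vector, one k value per example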
    '''
    layer1 = ConvFoldPoolLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], ishape[0]/2, kmax),
            filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop, left=left_after_conv, right=right_after_conv)
    '''
    layer1 = Conv_Fold_DynamicK_PoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], ishape[0] / 2, kmax),
        filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]),
        poolsize=poolsize2,
        k=dynamic_lengths,
        unifiedWidth=ktop,
        left=left_after_conv,
        right=right_after_conv,
        firstLayer=False)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # Flattening layer1's output gives a matrix of shape (batch_size, nkerns[1] * (embedding_size / 4) * ktop)
    layer2_input = layer1.output.flatten(2)
    dropout = dropout_from_layer(rng, layer2_input, dropout_p)
    # construct a fully-connected sigmoidal layer, the output of layers has nkerns[1]=50 images, each is 4*4 size
    #layer2 = FullyConnectedLayer(rng, input=dropout, n_in=nkerns[1] * (embedding_size/4) * ktop, n_out=task)

    layer3 = LogisticRegression(rng,
                                input=dropout,
                                n_in=nkerns[1] * (embedding_size / 4) * ktop,
                                n_out=task)
    #layer3=SoftMaxlayer(input=layer2.output)
    #layer3 = LogisticRegression(rng, input=layer2.output, n_in=50, n_out=2)
    # the cost we minimize during training is the NLL of the model
    #L1_reg= abs(layer3.W).sum() + abs(layer2.W).sum() +abs(layer1.W).sum()+abs(layer0.W).sum()+abs(embeddings).sum()
    L2_reg = (layer3.W**2).sum() + (layer1.W**2).sum() + (
        layer0.W**2).sum() + (embeddings**2).sum()
    #L2_reg = (layer3.W** 2).sum() + (layer2.W** 2).sum()+(layer0.W** 2).sum()+(embeddings**2).sum()
    #cost must have L2, otherwise, will produce nan, while with L2, each word embedding will be updated
    cost = layer3.negative_log_likelihood(y) + L2_weight * L2_reg

    #cost = layer3.negative_log_likelihood(y)
    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x_index: indices_test_theano[index:index + batch_size],
            y: testY[index:index + batch_size],
            z: testLengths[index:index + batch_size],
            left: testLeftPad[index:index + batch_size],
            right: testRightPad[index:index + batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x_index: indices_dev_theano[index:index + batch_size],
            y: devY[index:index + batch_size],
            z: devLengths[index:index + batch_size],
            left: devLeftPad[index:index + batch_size],
            right: devRightPad[index:index + batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer1.params + layer0.params + [embeddings]
    #params = layer3.params + layer2.params + layer0.params+[embeddings]

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    '''   
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))
    
    '''
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        if param_i == embeddings:
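            # AdaGrad step, then reset row 0 of the embedding matrix (presumably the
            # padding entry) to a zero vector via set_subtensor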
            updates.append(
                (param_i,
                 T.set_subtensor(
                     (param_i - learning_rate * grad_i / T.sqrt(acc))[0],
                     theano.shared(numpy.zeros(embedding_size)))))  #AdaGrad
        else:
            updates.append(
                (param_i,
                 param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index], [cost, layer3.errors(y)],
        updates=updates,
        givens={
            x_index: indices_train_theano[index:index + batch_size],
            y: trainY[index:index + batch_size],
            z: trainLengths[index:index + batch_size],
            left: trainLeftPad[index:index + batch_size],
            right: trainRightPad[index:index + batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches / 50, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            cost_ij, error_ij = train_model(batch_start)
            #if iter ==1:
            #    exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(iter) + ' cost: ' + str(
                    cost_ij) + ' error: ' + str(error_ij)
            if iter % validation_frequency == 0:

                # compute zero-one loss on validation set
                #validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                validation_losses = [
                    validate_model(i) for i in dev_batch_start
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index , n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in test_batch_start]
                    test_score = numpy.mean(test_losses)
                    print((
                        '\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                        'model %f %%') % (epoch, minibatch_index,
                                          n_train_batches, test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Exemple #58
0
    def evaluate_lenet5(self):
    #def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 12], batch_size=70, useAllSamples=0, kmax=30, ktop=5, filter_size=[10,7],
    #                    L2_weight=0.000005, dropout_p=0.5, useEmb=0, task=5, corpus=1):
        rng = numpy.random.RandomState(23455)
        
        #datasets, embedding_size, embeddings=read_data(root+'2classes/train.txt', root+'2classes/dev.txt', root+'2classes/test.txt', embeddingPath,60)

        #datasets = load_data(dataset)
        indices_train, trainLengths, trainLeftPad, trainRightPad= self.datasets[0]
        #indices_dev, devLengths, devLeftPad, devRightPad= self.datasets[1]
        '''
        print 'indices_train shapes:'
        print indices_train.shape[0], indices_train.shape[1]
        print indices_train
        '''
        #create embedding matrix to store the final embeddings
        sentences_embs=numpy.zeros((indices_train.shape[0],self.sentEm_length), dtype=theano.config.floatX)

        n_train_batches=indices_train.shape[0]/self.batch_size
        #n_valid_batches=indices_dev.shape[0]/self.batch_size
        remain_train=indices_train.shape[0]%self.batch_size
        
        train_batch_start=[]
        dev_batch_start=[]
        if self.useAllSamples:
            train_batch_start=list(numpy.arange(n_train_batches)*self.batch_size)+[indices_train.shape[0]-self.batch_size]
            #dev_batch_start=list(numpy.arange(n_valid_batches)*self.batch_size)+[indices_dev.shape[0]-self.batch_size]
            n_train_batches=n_train_batches+1
            #n_valid_batches=n_valid_batches+1
        else:
            train_batch_start=list(numpy.arange(n_train_batches)*self.batch_size)
            #dev_batch_start=list(numpy.arange(n_valid_batches)*self.batch_size)
        '''
        print 'train_batch_start:'
        print train_batch_start
        '''
        indices_train_theano=theano.shared(numpy.asarray(indices_train, dtype=theano.config.floatX), borrow=True)
        #indices_dev_theano=theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX), borrow=True)
        indices_train_theano=T.cast(indices_train_theano, 'int32')
        '''
        print 'target_matrix shape'
        print self.target_matrix.shape[0], self.target_matrix.shape[1]
        print self.target_matrix
        '''
        indices_target_theano=theano.shared(numpy.asarray(self.target_matrix, dtype=theano.config.floatX), borrow=True)
        #indices_dev_theano=theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX), borrow=True)
        indices_target_theano=T.cast(indices_target_theano, 'int32')
        #print 'context_matrix shape'
        #print self.context_matrix.shape[0], self.context_matrix.shape[1]
        #print self.context_matrix[:,0:300], self.context_matrix[:,300:600], self.context_matrix[:,600:900], self.context_matrix[:,900:]
        indices_context_theano=theano.shared(numpy.asarray(self.context_matrix, dtype=theano.config.floatX), borrow=True)
        #indices_dev_theano=theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX), borrow=True)
        indices_context_theano=T.cast(indices_context_theano, 'int32')        

        #indices_dev_theano=T.cast(indices_dev_theano, 'int32')
        
        # allocate symbolic variables for the data
        index = T.lscalar()  # index to a [mini]batch
        x_index = T.imatrix('x_index')   # now, x is the index matrix, must be integer
        #y = T.ivector('y')  
        z = T.ivector('z')   # sentence length
        left=T.ivector('left')
        right=T.ivector('right')
        iteration= T.lscalar()
        t_index=T.imatrix('t_index')
        c_index=T.imatrix('c_index')
    
        x_index=debug_print(x_index,'x_index')
        x_transpose=debug_print(self.embeddings_R[x_index.flatten()].reshape((self.batch_size,self.maxSentLength, self.context_embedding_size)).transpose(0, 2, 1),'x_transpose')
        x=debug_print(x_transpose.flatten(),'x')
        ishape = (self.context_embedding_size, self.maxSentLength)  # size of each sentence "image"
        filter_size1=(self.context_embedding_size,self.filter_size[0])
        filter_size2=(self.context_embedding_size/2,self.filter_size[1])
        #poolsize1=(1, ishape[1]-filter_size1[1]+1) #?????????????????????????????
        poolsize1=(1, ishape[1]+filter_size1[1]-1)
    
        '''
        left_after_conv=T.maximum(0,left-filter_size1[1]+1)
        right_after_conv=T.maximum(0, right-filter_size1[1]+1)
        '''
        left_after_conv=left
        right_after_conv=right
        
        #kmax=30 # this can not be too small, like 20
        #ktop=6
        #poolsize2=(1, kmax-filter_size2[1]+1) #(1,6)
        poolsize2=(1, self.kmax+filter_size2[1]-1) #(1,6)
        dynamic_lengths=T.maximum(self.ktop,z/2+1)  # dynamic k-max pooling
        ######################
        # BUILD ACTUAL MODEL #
        ######################
        print '... building the model'
    
        # Reshape matrix of rasterized images of shape (batch_size,28*28)
        # to a 4D tensor, compatible with our LeNetConvPoolLayer
        layer0_input=debug_print(x.reshape((self.batch_size, 1, ishape[0], ishape[1])),'layer0_input')

        # Construct the first convolutional pooling layer:
        # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
        # maxpooling reduces this further to (24/2,24/2) = (12,12)
        # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
        '''
        layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                image_shape=(batch_size, 1, ishape[0], ishape[1]),
                filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1, k=kmax)
        '''
        layer0 = Conv_Fold_DynamicK_PoolLayer(rng, input=layer0_input,
                image_shape=(self.batch_size, 1, ishape[0], ishape[1]),
                filter_shape=(self.nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1, k=dynamic_lengths, unifiedWidth=self.kmax, left=left_after_conv, right=right_after_conv, firstLayer=True)
        
        # Construct the second convolutional pooling layer
        # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
        # maxpooling reduces this further to (8/2,8/2) = (4,4)
        # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
        '''
        layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                image_shape=(batch_size, nkerns[0], ishape[0], kmax),
                filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop)
        '''
        '''
        left_after_conv=T.maximum(0, layer0.leftPad-filter_size2[1]+1)
        right_after_conv=T.maximum(0, layer0.rightPad-filter_size2[1]+1)
        '''
        left_after_conv=layer0.leftPad
        right_after_conv=layer0.rightPad
        dynamic_lengths=T.repeat([self.ktop],self.batch_size)  # dynamic k-max pooling
        layer1_input=debug_print(layer0.output, 'layer0_output')
        '''
        layer1 = ConvFoldPoolLayer(rng, input=layer0.output,
                image_shape=(batch_size, nkerns[0], ishape[0]/2, kmax),
                filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop, left=left_after_conv, right=right_after_conv)
        '''
        layer1 = Conv_Fold_DynamicK_PoolLayer(rng, input=layer1_input,
                image_shape=(self.batch_size, self.nkerns[0], ishape[0]/2, self.kmax),
                filter_shape=(self.nkerns[1], self.nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=dynamic_lengths, unifiedWidth=self.ktop, left=left_after_conv, right=right_after_conv, firstLayer=False)    
        
        # the HiddenLayer being fully-connected, it operates on 2D matrices of
        # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
        # This flattening gives a matrix of shape (batch_size, nkerns[1] * (context_embedding_size / 4) * ktop)
        
        
        layer1_output = debug_print(layer1.output.flatten(2), 'layer1_output')
        #layer2_input=theano.printing.Print('layer2_input')(layer2_input)
        #produce sentence embeddings
        #layer2 = HiddenLayer(rng, input=layer2_input, n_in=self.nkerns[1] * (self.context_embedding_size/4) * self.ktop, n_out=self.sentEm_length, activation=T.tanh)
        
        #context_matrix,  target_matrix=self.extract_contexts_targets(indices_matrix=x_index, sentLengths=z, leftPad=left)
        target_matrix=t_index
        context_matrix=c_index
        #note that context indices might be zero embeddings
        h_indices=debug_print(context_matrix[:, self.context_size*iteration:self.context_size*(iteration+1)],'h_indices')
        w_indices=debug_print(target_matrix[:, iteration:(iteration+1)],'w_indices')
        #r_h is the concatenation of context embeddings
        r_h=debug_print(self.embed_context(h_indices), 'embedded_context')  #(batch_size, context_size*embedding_size)
        q_w=debug_print(self.embed_target(w_indices), 'embedded_target')
        #q_hat: concatenate sentence embeddings and context embeddings
        #q_hat=self.concatenate_sent_context(layer2.output, r_h)
        q_hat=self.concatenate_sent_context(layer1_output, r_h)
        layer3 = HiddenLayer(rng, input=q_hat, n_in=self.nkerns[1] * (self.context_embedding_size/4) * self.ktop+self.context_size*self.context_embedding_size, n_out=self.target_embedding_size, activation=T.tanh)
        layer3_output=debug_print(layer3.output, 'layer3.output')
        noise_indices, p_n_noise=self.get_noise()
        noise_indices=debug_print(noise_indices, 'noise_indices')
        #noise_indices=theano.printing.Print('noise_indices')(noise_indices)
        s_theta_data=debug_print(T.sum(layer3_output * q_w, axis=1).reshape((self.batch_size,1)) + self.bias[w_indices] , 's_theta_data')
        #s_theta_data=theano.printing.Print('s_theta_data')(s_theta_data)
        p_n_data = debug_print(self.p_n[w_indices],'p_n_data') #p_n[0] indicates the probability of word indexed 1
        delta_s_theta_data = debug_print(s_theta_data - T.log(self.k * p_n_data),'delta_s_theta_data')
        log_sigm_data = debug_print(T.log(T.nnet.sigmoid(delta_s_theta_data)),'log_sigm_data')
        
        #create the noise, q_noise has shape(self.batch_size, self.k, self.embedding_size )
        q_noise = debug_print(self.embed_noise(noise_indices),'embed_noise')
        q_hat_res = layer3_output.reshape((self.batch_size, 1, self.target_embedding_size))
        s_theta_noise = debug_print(T.sum(q_hat_res * q_noise, axis=2) + self.bias[noise_indices],'s_theta_noise') #(batch_size, k)
        delta_s_theta_noise = debug_print(s_theta_noise - T.log(self.k * p_n_noise), 'delta_s_theta_noise')  # it should be matrix (batch_size, k)
        log_sigm_noise = debug_print(T.log(1 - T.nnet.sigmoid(delta_s_theta_noise)), 'log_sigm_noise')
        sum_noise_per_example =debug_print(T.sum(log_sigm_noise, axis=1), 'sum_noise_per_example')   #(batch_size, 1)
        # Calc objective function
        J = debug_print(-T.mean(log_sigm_data) - T.mean(sum_noise_per_example),'J')
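        # J is the NCE loss to minimize: negative mean log-sigmoid score of the true
        # (context, target) pairs minus the mean summed log(1 - sigmoid) over the k noise words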
        L2_reg = (layer3.W** 2).sum()+ (layer1.W** 2).sum()+(layer0.W** 2).sum()+(self.embeddings_R**2).sum()#+( self.embeddings_Q**2).sum()
        self.cost = J + self.L2_weight*L2_reg
        '''
        validate_model = theano.function([index,iteration], self.cost,
                givens={
                    x_index: indices_dev_theano[index: index + self.batch_size],
                    z: devLengths[index: index + self.batch_size],
                    left: devLeftPad[index: index + self.batch_size],
                    right: devRightPad[index: index + self.batch_size]})
        '''
        # create a list of all model parameters to be fit by gradient descent
        self.params = layer3.params+layer1.params + layer0.params+[self.embeddings_R]#, self.embeddings_Q]
        #params = layer3.params + layer2.params + layer0.params+[embeddings]
        
        accumulator=[]
        for para_i in self.params:
            eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
            accumulator.append(theano.shared(eps_p, borrow=True))
          
        # create a list of gradients for all model parameters
        grads = T.grad(self.cost, self.params)
        updates = []
        for param_i, grad_i, acc_i in zip(self.params, grads, accumulator):
            grad_i=debug_print(grad_i,'grad_i')
            acc = acc_i + T.sqr(grad_i)
            if param_i == self.embeddings_R:# or param_i == self.embeddings_Q:
                updates.append((param_i, T.set_subtensor((param_i - self.ini_learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(self.context_embedding_size)))))   #AdaGrad
            else:
                updates.append((param_i, param_i - self.ini_learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
            updates.append((acc_i, acc))    
           
        train_model = theano.function([index,iteration], [self.cost], updates=updates,
              givens={
                x_index: indices_train_theano[index: index + self.batch_size],
                z: trainLengths[index: index + self.batch_size],
                left: trainLeftPad[index: index + self.batch_size],
                right: trainRightPad[index: index + self.batch_size],
                t_index: indices_target_theano[index: index + self.batch_size],
                c_index: indices_context_theano[index: index + self.batch_size]})
    
        ###############
        # TRAIN MODEL #
        ###############
        print '... training'
        # early-stopping parameters
        patience = 50000  # look as this many examples regardless
        patience_increase = 2  # wait this much longer when a new best is
                               # found
        improvement_threshold = 0.995  # a relative improvement of this much is
                                       # considered significant
        validation_frequency = min(10, patience / 2)
                                      # go through this many
                                      # minibatches before checking the network
                                      # on the validation set; in this case we
                                      # check every epoch
    
        best_params = None
        best_validation_loss = numpy.inf
        best_iter = 0
        test_score = 0.
        start_time = time.clock()
    
        epoch = 0
        done_looping = False
        vali_loss_list=[]
        train_loss_list=[]
        while (epoch < self.n_epochs) and (not done_looping):
            epoch = epoch + 1
            #for minibatch_index in xrange(n_train_batches): # each batch
            minibatch_index=0
            for batch_start in train_batch_start: 
                # iter counts how many minibatches have been processed so far, across epochs
                iter = (epoch - 1) * n_train_batches + minibatch_index +1
    
                minibatch_index=minibatch_index+1
                #print 'batch_start: '+str(batch_start)
                total_iteration=min(max(self.target_lengths[batch_start: batch_start + self.batch_size]), 60) # total iteration is not allowed to surpass 60
                # we only care the last cost within those iterations
                cost_of_end_batch=0.0
                costs_in_batch=[]
                for iteration in range(total_iteration):
                    #print 'iteration: '+str(iteration)+'/'+str(total_iteration)+' in iter '+str(iter)
                    #if iteration==3:
                    #    exit(0)
                    cost_of_end_batch = train_model(batch_start, iteration)
                    '''
                    print 'updated self.embeddings_R:'
                    print self.embeddings_R.get_value()[:37,:]
                    print self.embeddings_R.get_value()[37:,:]

                    print 'updated layer0 W: '
                    print layer0.W.get_value()[0:1,0:1,0:1,:]
                    print 'updated layer1 W:'
                    print layer1.W.get_value()[0:1,0:1,0:1,:]

                    print 'updated layer2 W: '
                    print layer2.W.get_value()
                    print 'updated layer3 W:'
                    print layer3.W.get_value()
                    '''
                    costs_in_batch.append(cost_of_end_batch)
                    #print 'cost_of_each_iteration: '+str(cost_of_end_batch)
                average_cost_per_batch=numpy.mean(costs_in_batch)
                #print 'cost_of_batch: '+str(average_cost_per_batch)
                if iter % validation_frequency == 0:
                    print 'training @ iter = '+str(iter)+' cost: '+str(average_cost_per_batch)# +' error: '+str(error_ij)
                    #print batch_embs
                #store sentence embeddings
                #for row in range(batch_start, batch_start + self.batch_size):
                #    sentences_embs[row]=batch_embs[row-batch_start]
                    
                if average_cost_per_batch<minimal_of_list(train_loss_list):
                    del train_loss_list[:]
                    train_loss_list.append(average_cost_per_batch)
                    self.best_params=self.params
                elif len(train_loss_list)<self.vali_cost_list_length:
                    train_loss_list.append(average_cost_per_batch)
                    if len(train_loss_list)==self.vali_cost_list_length:
                        self.store_model_to_file()
                        #self.store_sentence_embeddings(sentences_embs)
                        self.store_embeddings()
                        print 'Training over, best model got at train_cost:'+str(train_loss_list[0])
                        exit(0)                        
                #print 'sentence embeddings:'
                #print sentences_embs[:6,:]
                #if iter ==1:
                #    exit(0)
                '''
                if iter % validation_frequency == 0:
                    print 'training @ iter = '+str(iter)+' cost: '+str(cost_of_end_batch)# +' error: '+str(error_ij)
                if iter % validation_frequency == 0:
                    #print '\t iter: '+str(iter)
                    # compute zero-one loss on validation set
                    #validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                    validation_losses=[]
                    for batch_start in dev_batch_start:
                        #print '\t\t batch_start: '+str(batch_start)
                        total_iteration=max(self.dev_lengths[batch_start: batch_start + self.batch_size])
                        #for validate, we need the cost among all the iterations in that batch

                        for iteration in range(total_iteration):
                            vali_loss_i=validate_model(batch_start, iteration)
                            #print vali_loss_i
                            validation_losses.append(vali_loss_i)
                    this_validation_loss = numpy.mean(validation_losses)
                    print('\t\tepoch %i, minibatch %i/%i, validation cost %f ' % \
                      (epoch, minibatch_index , n_train_batches, \
                       this_validation_loss))
                    
                    if this_validation_loss < minimal_of_list(vali_loss_list):
                        del vali_loss_list[:]
                        vali_loss_list.append(this_validation_loss)
                        #store params
                        self.best_params=self.params
                        #fake
                    elif len(vali_loss_list)<self.vali_cost_list_length:
                        vali_loss_list.append(this_validation_loss)
                        if len(vali_loss_list)==self.vali_cost_list_length:
                            self.store_model_to_file()
                            self.store_sentence_embeddings(sentences_embs)
                            print 'Training over, best model got at vali_cost:'+str(vali_loss_list[0])
                            exit(0)
                '''
    
                if patience <= iter:
                    done_looping = True
                    break
    
        end_time = time.clock()
        '''
        print('Optimization complete.')
        print('Best validation score of %f %% obtained at iteration %i,'\
              'with test performance %f %%' %
              (best_validation_loss * 100., best_iter + 1, test_score * 100.))
        '''
        print >> sys.stderr, ('The code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.2fm' % ((end_time - start_time) / 60.))
def onestep_attend_copy():

	i_t = T.dot(x_t, Wi) + T.dot(pre_h, Ui) + T.dot(pre_z, Zi)
	i_t_shape = T.shape(i_t)

	bi_reshape = T.repeat(bi, i_t_shape[0], 0)
	bi_reshape_2x = T.repeat(bi_reshape, i_t_shape[1], 1)

	bf_reshape = T.repeat(bf, i_t_shape[0], 0)
	bf_reshape_2x = T.repeat(bf_reshape, i_t_shape[1], 1)

	bc_reshape = T.repeat(bc, i_t_shape[0], 0)
	bc_reshape_2x = T.repeat(bc_reshape, i_t_shape[1], 1)

	bo_reshape = T.repeat(bo, i_t_shape[0], 0)
	bo_reshape_2x = T.repeat(bo_reshape, i_t_shape[1], 1)
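	# the four repeat pairs above tile each size-1 bias over the first two axes of i_t
	# so it can be added elementwise below; marking those axes broadcastable would avoid the copies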

	i_t_new= sigmoid(i_t + bi_reshape_2x)
	f_t= sigmoid(T.dot(x_t, Wf) + T.dot(pre_h, Uf) + T.dot(pre_z, Zf) + bf_reshape_2x)
	o_t= sigmoid(T.dot(x_t, Wo) + T.dot(pre_h, Uo) + T.dot(pre_z, Zo) + bo_reshape_2x)
	c_th = tanh(T.dot(x_t, Wc)  + T.dot(pre_h, Uc) + T.dot(pre_z, Zc) + bc_reshape_2x)

	c_t = f_t*pre_c + i_t_new*c_th

	h_t = o_t*T.tanh(c_t) #shape (1, N, h_dim)

	h_t_context = T.repeat(h_t, image_feature_region.shape[1], axis = 0) #new shape (No_region, N, h_dim)
	image_feature_reshape = T.transpose(image_feature_region, (1, 0, 2))
	#compute a non-linear correlation between h_t (the current text state) and each image_feature_region (64 regions for 128x128 input, 196 for 224x224)
	# pdb.set_trace()
	m_t = T.tanh(T.dot(h_t_context, Hcontext) + T.dot(image_feature_reshape, Zcontext)) #shape (No_region, N, context_dim)

	e = T.dot(m_t, Va) #No_region, N, 1
	e_reshape = e.reshape((e.shape[0], T.prod(e.shape[1:])))

	e_softmax = softmax_along_axis(e_reshape, axis = 0) #shape No_region, N

	e_t = T.transpose(e_softmax, (1,0)) #shape N, No_region
	e_t_r = e_t.reshape([-1, e_softmax.shape[0], e_softmax.shape[1]]) #3D tensor 1, N, No_region
	e_t_r_t = T.transpose(e_t_r, (1,0, 2)) # shape N, 1, No_region
	e_3D = T.repeat(e_t_r_t, e_t_r_t.shape[2], axis = 1) #shape N, No_region, No_region  image_feature_region.shape[1]
	e_3D_t = T.transpose(e_3D, (1,2,0)) #No_region, No_region, N

	identity_2D = T.identity_like(e_3D_t)# shape No_region, No_region
	identity_3D = identity_2D.reshape([-1, identity_2D.shape[0], identity_2D.shape[1]]) # shape 1, No_region, No_region
	identity_3D_t = T.repeat(identity_3D,  image_feature_region.shape[0], axis = 0)
	e_3D_diagonal = e_3D*identity_3D_t #diagonal tensor 3D  (N, No_region, No_region)

	out_weight_y, updates = theano.scan(fn=onestep_weight_feature_multiply,
	                                   outputs_info=[weight_y],
	                                   sequences=[e_3D_diagonal, image_feature_region],
	                                   non_sequences=[])

	z_t = T.sum(out_weight_y, axis = 1) #shape (N, feature_dim)

	z_t_r = z_t.reshape((-1,z_t.shape[0],z_t.shape[1]))


	return [h_t, c_t, z_t_r]
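
A small standalone sketch of the bias-tiling pattern used above (shapes and names are assumptions, not taken from the original step function): explicit T.repeat along the first two axes versus declaring those axes broadcastable.

import numpy
import theano
import theano.tensor as T

dim = 4
pre = T.tensor3('pre')  # assumed shape (T_steps, N, dim), like i_t above
b = theano.shared(numpy.zeros((1, 1, dim), dtype=theano.config.floatX), name='b')

# explicit tiling, as in onestep_attend_copy
b_rep = T.repeat(T.repeat(b, pre.shape[0], 0), pre.shape[1], 1)
out_repeat = pre + b_rep

# same result without materializing the tiled bias
out_broadcast = pre + T.addbroadcast(b, 0, 1)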
Exemple #60
0
    def logdet(self):
        return tt.repeat(tt.sum(tt.log(self.scale)), self.z0.shape[0])
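
A hedged usage sketch for the pattern above (the concrete values are assumptions, not from the original class): tt.repeat turns the scalar total log-determinant into one entry per sample in the batch.

import numpy as np
import theano
import theano.tensor as tt

scale = theano.shared(np.array([0.5, 2.0], dtype=theano.config.floatX), name='scale')
z0 = tt.matrix('z0')

logdet = tt.repeat(tt.sum(tt.log(scale)), z0.shape[0])  # shape: (batch,)
f = theano.function([z0], logdet)

batch = np.zeros((3, 2), dtype=theano.config.floatX)
print(f(batch))  # three identical entries: log(0.5) + log(2.0) = 0.0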