Example 1
 def predict(self, new_data, batch_size, pool_size):
     """
     predict for new data
     """
     img_shape = (batch_size, 1, self.image_shape[2], self.image_shape[3])
     conv_out = conv.conv2d(input=new_data, filters=self.W, filter_shape=self.filter_shape, image_shape=img_shape)
     pool_list = []
     if self.non_linear == "tanh":
         conv_out_tanh = T.tanh(conv_out + self.b.dimshuffle("x", 0, "x", "x"))
         # pad_len = int(self.max_window_len/2)
         # right_pad_len = int(self.filter_shape[2]/2)
         # index_shift = pad_len-right_pad_len
         index_shift = int(self.filter_shape[2] / 2)
         for i in xrange(batch_size):
             # partition sentence via pool size
             e1pos = pool_size[i, 0] + index_shift
             e2pos = pool_size[i, 1] + index_shift
             # if T.gt(e1pos, 0):
             #     p1 = conv_out_tanh[i, :, :e1pos, :]
             # else:
             #     p1 = conv_out_tanh[i, :, 0, :]
             p1 = conv_out_tanh[i, :, :e1pos, :]
             p2 = conv_out_tanh[i, :, e1pos:e2pos, :]
             p3 = conv_out_tanh[i, :, e2pos:, :]
             p1_pool_out = T.max(p1, axis=1)
             p2_pool_out = T.max(p2, axis=1)
             p3_pool_out = T.max(p3, axis=1)
             temp = T.concatenate([p1_pool_out, p2_pool_out, p3_pool_out], axis=1)
             pool_list.append(temp.dimshuffle("x", 0, 1))
     else:
         pass
     output = T.concatenate(pool_list, axis=0)
     return output
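A minimal NumPy sketch of the piecewise max pooling performed inside the loop above, assuming a single example and hypothetical entity positions e1pos=2 and e2pos=4:

import numpy as np

conv = np.random.randn(4, 6, 1)      # (n_filters, sentence_length, 1) for one example
e1pos, e2pos = 2, 4                  # hypothetical entity positions after index_shift
p1 = conv[:, :e1pos].max(axis=1)     # max over positions before the first entity
p2 = conv[:, e1pos:e2pos].max(axis=1)
p3 = conv[:, e2pos:].max(axis=1)
print(np.concatenate([p1, p2, p3], axis=1).shape)   # (4, 3)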
Example 2
    def pool_function(input, axis):

        input_shape = tuple(input.shape)
        num_feature_maps_out = input_shape[axis - 1]
        pool_size = input_shape[axis]

        pool_shape = (input_shape[:axis] + (num_in_sum,
                                            num_in_max) + input_shape[axis + 1:])
        # print("make_ghh_pool_conv2d: pool_shape is {}".format(pool_shape))
        input_reshaped = input.reshape(pool_shape)

        # raise NotImplementedError('TODO: use a soft max instead of T.max')
        # res_after_max = T.max(input_reshaped,axis=axis+1)

        # Soft max with strength of max_strength
        res_after_max = np.cast[floatX](1.0) / np.cast[floatX](max_strength) \
            * T.log(T.mean(T.exp(max_strength * (input_reshaped - T.max(input_reshaped, axis=axis + 1, keepdims=True))), axis=axis + 1)) \
            + T.max(input_reshaped, axis=axis + 1)

        # Get deltas
        delta = np.cast[floatX](1.0) - np.cast[floatX](2.0) * \
            (T.arange(num_in_sum, dtype=floatX) % np.cast[floatX](2))
        target_dimshuffle = ('x',) * axis + (0,) + ('x',) * \
            (len(input_shape) - 1 - axis)
        # print("make_ghh_pool_conv2d: target_dimshuffle is {}".format(target_dimshuffle))
        delta = delta.flatten().dimshuffle(*target_dimshuffle)

        res_after_sum = T.sum(res_after_max * delta, axis=axis)

        return res_after_sum
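The expression above is a log-mean-exp smooth maximum whose sharpness is set by max_strength (a free variable of the enclosing function, like num_in_sum and num_in_max). A small NumPy sketch of the same idea, with a plain array standing in for the Theano tensor:

import numpy as np

def soft_max(x, max_strength=10.0, axis=-1):
    # subtracting the hard max keeps exp() from overflowing, exactly as above
    m = np.max(x, axis=axis, keepdims=True)
    return (np.log(np.mean(np.exp(max_strength * (x - m)), axis=axis)) / max_strength
            + np.squeeze(m, axis=axis))

x = np.array([0.1, 0.5, 2.0, 1.9])
print(soft_max(x, 2.0), soft_max(x, 50.0), x.max())   # approaches 2.0 as strength grows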
Example 3
    def test_optimization_max(self):
        data = numpy.asarray(numpy.random.rand(2,3),dtype=config.floatX)
        n = tensor.matrix()

        f = function([n],tensor.max(n,0), mode=self.mode)
        topo = f.maker.env.toposort()
        assert len(topo)==1
        assert isinstance(topo[0].op,CAReduce)
        f(data)


        f = function([n],tensor.max(-n,0), mode=self.mode)
        topo = f.maker.env.toposort()
        assert len(topo)==2
        assert isinstance(topo[0].op, Elemwise)
        assert isinstance(topo[0].op.scalar_op, scalar.Neg)
        assert isinstance(topo[1].op,CAReduce)
        f(data)

        f = function([n],-tensor.max(n,0), mode=self.mode)
        topo = f.maker.env.toposort()
        assert len(topo)==2
        assert isinstance(topo[0].op,CAReduce)
        assert isinstance(topo[1].op, Elemwise)
        assert isinstance(topo[1].op.scalar_op, scalar.Neg)
        f(data)

        f = function([n],-tensor.max(-n,0), mode=self.mode)
        topo = f.maker.env.toposort()
        assert len(topo)==1
        assert isinstance(topo[0].op,CAReduce)#min
        f(data)
Example 4
def compile_gpu_func(nan_is_error, inf_is_error, big_is_error):
    """ compile utility function used by contains_nan and contains_inf
    """
    global f_gpumin, f_gpumax, f_gpuabsmax
    if not cuda.cuda_available:
        return
    guard_input = cuda.fvector("nan_guard")
    cuda_compile_failed = False
    if (nan_is_error or inf_is_error) and f_gpumin is None:
        try:
            f_gpumin = theano.function([guard_input], T.min(guard_input), mode="FAST_RUN")
        except RuntimeError:
            # This can happen if cuda is available, but the
            # device is in exclusive mode and used by another
            # process.
            cuda_compile_failed = True
    if inf_is_error and not cuda_compile_failed and f_gpumax is None:
        try:
            f_gpumax = theano.function([guard_input], T.max(guard_input), mode="FAST_RUN")
        except RuntimeError:
            # This can happen if cuda is available, but the
            # device is in exclusive mode and used by another
            # process.
            cuda_compile_failed = True
    if big_is_error and not cuda_compile_failed and f_gpuabsmax is None:
        try:
            f_gpuabsmax = theano.function([guard_input], T.max(T.abs_(guard_input)), mode="FAST_RUN")
        except RuntimeError:
            # This can happen if cuda is available, but the
            # device is in exclusive mode and used by another
            # process.
            cuda_compile_failed = True
Example 5
 def _test_layer_stats(self, layer_output):
     """
     DESCRIPTION:
         This method is called on every batch: examples from the test or validation set 
         are passed through, and the final result is the mean of the results over all 
         batches in an epoch of the test or validation set.
     PARAM:
         layer_output: the output from the layer
     RETURN:
         A list of tuples [('name_a', var_a), ('name_b', var_b)], where each var is a scalar.
     """
     
     w_len = T.sqrt((self.W ** 2).sum(axis=0))
     max_length = T.max(w_len)
     mean_length = T.mean(w_len)
     min_length = T.min(w_len)
     
     return [('max_col_length', max_length),
             ('mean_col_length', mean_length),
             ('min_col_length', min_length), 
             ('output_max', T.max(layer_output)),
             ('output_mean', T.mean(layer_output)), 
             ('output_min', T.min(layer_output)),
             ('max_W', T.max(self.W)),
             ('mean_W', T.mean(self.W)),
             ('min_W', T.min(self.W)),
             ('max_b', T.max(self.b)),
             ('mean_b', T.mean(self.b)),
             ('min_b', T.min(self.b))]
Example 6
    def filterbank_matrices(self, center_y, center_x, delta, sigma):
        """
        Create a Fy and a Fx

        Parameters
        ----------
        center_y : T.vector (shape: batch_size)
        center_x : T.vector (shape: batch_size)
            Y and X center coordinates for the attention window
        delta : T.vector (shape: batch_size)
        sigma : T.vector (shape: batch_size)

        Returns
        -------
            FY, FX
        """
        tol = 1e-4
        # construct x and y coordinates for the grid points
        obj_x = center_x.dimshuffle(0, 'x') + \
                (delta.dimshuffle(0, 'x') * self.obj_x)
        obj_y = center_y.dimshuffle(0, 'x') + \
                (delta.dimshuffle(0, 'x') * self.obj_y)

        # construct unnormalized attention weights for each grid point
        FX = T.exp( -(self.img_x - obj_x.dimshuffle(0,1,'x'))**2. / \
                   (2. * sigma.dimshuffle(0,'x','x')**2.) )
        FY = T.exp( -(self.img_y - obj_y.dimshuffle([0,1,'x']))**2. / \
                   (2. * sigma.dimshuffle(0,'x','x')**2.) )

        # normalize the attention weights
        #FX = FX / (FX.sum(axis=-1).dimshuffle(0, 1, 'x') + tol)
        #FY = FY / (FY.sum(axis=-1).dimshuffle(0, 1, 'x') + tol)
        FX = FX / (T.max(FX.sum(axis=-1)) + tol)
        FY = FY / (T.max(FY.sum(axis=-1)) + tol)
        return FY, FX
Example 7
    def define_network(self, layers_info=None):
        """
        Builds Theano graph of the network.
        """
        self.hidden_layers = [None]*self.n_hidden.size

        self.params = []
        for i, h in enumerate(self.n_hidden):
            if i == 0:
                self.hidden_layers[i] = LBNHiddenLayer(self.rng, self.trng, self.x, self.n_in,
                                        h, self.det_activation[i],
                                        self.stoch_n_hidden, self.stoch_activation,
                                        det_activation_name=self.det_activation_names[i],
                                        stoch_activation_names=self.stoch_activation_names,
                                        m=self.m,
                                        det_W=None if layers_info is None else
                                        np.array(
                                        layers_info['hidden_layers'][i]['LBNlayer']['detLayer']\
                                                                                            ['W']),
                                        det_b=None if layers_info is None else
                                        np.array(layers_info['hidden_layers'][i]\
                                                                    ['LBNlayer']['detLayer']['b']),
                                        stoch_mlp_info=None if layers_info is None else
                                        layers_info['hidden_layers'][i]['LBNlayer']['stochLayer'])
            else:
                self.hidden_layers[i] = LBNHiddenLayer(self.rng, self.trng,
                                        self.hidden_layers[i-1].output,
                                        self.n_hidden[i-1], h, self.det_activation[i],
                                        self.stoch_n_hidden, self.stoch_activation,
                                        det_activation_name=self.det_activation_names[i],
                                        stoch_activation_names=self.stoch_activation_names, 
                                        det_W=None if layers_info is None else
                                        np.array(layers_info['hidden_layers'][i]['LBNlayer']\
                                                                                ['detLayer']['W']),
                                        det_b=None if layers_info is None else
                                        np.array(layers_info['hidden_layers'][i]['LBNlayer']\
                                                                                ['detLayer']['b']),
                                        stoch_mlp_info=None if layers_info is None else
                                        layers_info['hidden_layers'][i]['LBNlayer']['stochLayer'])

            self.params.append(self.hidden_layers[i].params)

        self.output_layer = OutputLayer(self.rng, self.hidden_layers[-1].output, self.n_hidden[-1], 
                                                            self.n_out, self.det_activation[-1],
                                                            self.det_activation_names[-1],
                                                            V_values=None 
                                                            if layers_info is None else np.array(
                                                            layers_info['output_layer']['W']))

        self.params.append(self.output_layer.params)
        self.output = self.output_layer.output
        exp_value = -0.5*T.sum((self.output - self.y.dimshuffle('x',0,1))**2, axis=2)
        max_exp_value = theano.ifelse.ifelse(T.lt(T.max(exp_value), -1*T.min(exp_value)),
                                                                T.max(exp_value), T.min(exp_value))
 
        self.log_likelihood = T.sum(T.log(T.sum(T.exp(exp_value - max_exp_value), axis=0)) +
                                                                                    max_exp_value)-\
                                self.y.shape[0]*(T.log(self.m)+self.y.shape[1]/2.*T.log(2*np.pi))

        self.predict = theano.function(inputs=[self.x, self.m], outputs=self.output)
Example 8
            def maxout(z = None):
                #g = theano.shared(numpy.zeros((hidden_layers_sizes[i],)),name='g',borrow=True)
                g = T.max(z[0:5])
                g = T.stack(g,T.max(z[5:10]))
                for index in xrange(hidden_layers_sizes[i]-10):
                    g = T.concatenate([g,[T.max(z[5*(index+2):5*(index+3)])]])
                return g
Example 9
    def __theano__softmax(self, inp, dim=None, predict=False, issequence=False):

        if dim is None:
            assert issequence, "Data dimensionality could not be parsed."
            dim = 2

        # FFD for dimensions 1 and 2
        if dim == 1 or dim == 2:
            # Using the numerically stable implementation (along the channel axis):
            ex = T.exp(inp - T.max(inp, axis=1, keepdims=True))
            y = ex / T.sum(ex, axis=1, keepdims=True)

            # One hot encoding for prediction
            if predict:
                y = T.argmax(y, axis=1)

        elif dim == 3:
            # Stable implementation again, this time along axis = 2 (channel axis)
            ex = T.exp(inp - T.max(inp, axis=2, keepdims=True))
            y = ex / T.sum(ex, axis=2, keepdims=True)

            # One hot encoding for prediction
            if predict:
                y = T.argmax(y, axis=2)

        else:
            raise NotImplementedError("Softmax is implemented in 2D, 3D and 1D.")

        return y
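For the 2D case, the stable computation above should agree with Theano's built-in softmax; a quick standalone sketch of that check (outside the class):

import numpy as np
import theano
import theano.tensor as T

inp = T.matrix()
ex = T.exp(inp - T.max(inp, axis=1, keepdims=True))
y = ex / T.sum(ex, axis=1, keepdims=True)
f = theano.function([inp], [y, T.nnet.softmax(inp)])

a, b = f(np.random.randn(3, 5).astype(theano.config.floatX))
print(np.allclose(a, b))   # True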
Example 10
 def test_max(self):
     # If we call max directly, we get a CAReduce object,
     # which doesn't have R_op implemented!
     # self.check_mat_rop_lop(tensor.max(self.mx, axis=[0,1])[0],
     #                       ())
     self.check_mat_rop_lop(tensor.max(self.mx, axis=0), (self.mat_in_shape[1],))
     self.check_mat_rop_lop(tensor.max(self.mx, axis=1), (self.mat_in_shape[0],))
Example 11
    def forward_init(self):
        obs_ = self.obs_.reshape([self.obs_.shape[0]*self.obs_.shape[1], self.obs_.shape[-1]])

        h = eval(self.activ)(tensor.dot(obs_, self.params['W']) + self.params['b'][None,None,:])

        self.pi = []
        for oi in xrange(self.n_out):
            pi = tensor.dot(h, self.params['U%d'%oi]) + self.params['c%d'%oi][None,:]
            pi = tensor.exp(pi - tensor.max(pi,-1,keepdims=True))
            self.pi.append(pi / (pi.sum(-1, keepdims=True)))

        prev = tensor.matrix('prev', dtype='float32')
        #obs = tensor.matrix('obs', dtype='float32')
        obs_ = self.obs_.reshape([self.obs_.shape[0]*self.obs_.shape[1], 
                                  self.obs_.shape[-1]])
        obs_ = obs_[0]

        self.h_init = lambda x: numpy.float32(0.)

        h = eval(self.activ)(tensor.dot(obs_, self.params['W']) + self.params['b'][None,:])

        pi = []
        for oi in xrange(self.n_out):
            pi_ = tensor.dot(h, self.params['U%d'%oi]) + self.params['c%d'%oi][None,:]
            pi_ = tensor.exp(pi_ - tensor.max(pi_,-1,keepdims=True))
            pi.append(pi_ / (pi_.sum(-1, keepdims=True)))

        self.forward = theano.function([self.obs, prev], [h] + pi, name='forward', on_unused_input='ignore')
Example 12
def decoder(localt, stm1, cstm1, hmat,
            Wbeta, Ubeta, vbeta,
            Wzide, Wzfde, Wzcde, Wzode,
            Ede, Wxide, Wside, bide, Wxfde, Wsfde, bfde, 
            Wxcde, Wscde, bcde, Wxode, Wsode, bode,
            L0, Ls, Lz):
    xt = theano.dot(localt, Ede)
    # get z from hmat (sentlen * nen), stm1
    beta = \
    theano.dot( act( theano.dot(hmat,Ubeta) + theano.dot(stm1,Wbeta) ) , vbeta )
    alpha = T.exp(beta-T.max(beta)) / T.sum(T.exp(beta-T.max(beta)) )
    zt = theano.dot(alpha, hmat)
    #
    it = sigma(theano.dot(xt,Wxide) + theano.dot(stm1,Wside) + theano.dot(zt,Wzide) + bide )
    ft = sigma(theano.dot(xt,Wxfde) + theano.dot(stm1,Wsfde) + theano.dot(zt,Wzfde) + bfde )
    cst = ft * cstm1 + it*act(theano.dot(xt,Wxcde)+theano.dot(stm1,Wscde)+ theano.dot(zt,Wzcde) +bcde )
    ot = sigma(theano.dot(xt,Wxode) + theano.dot(stm1,Wsode) + theano.dot(zt,Wzode) +bode )
    st = ot * act(cst)
    #
    winst = getwins()
    stfory = st * winst
    #
    yt0 = T.dot( (xt + T.dot(stfory, Ls) + T.dot(zt, Lz) ) , L0)
    #yt0 = theano.dot(st,Wsyde)
    yt0max = T.max(yt0)
    #yt0maxvec = T.maximum(yt0, yt0max)
    yt = T.exp(yt0-yt0max) / T.sum(T.exp(yt0-yt0max))
    logyt = yt0-yt0max-T.log(T.sum(T.exp(yt0-yt0max)))
    #yt = T.exp(yt0-yt0maxvec) / T.sum(T.exp(yt0-yt0maxvec))
    #logyt = yt0-yt0maxvec-T.log(T.sum(T.exp(yt0-yt0maxvec)))
#    yt = T.concatenate([addzero,tempyt],axis=0)
    return st, cst, yt, logyt
Example 13
def update_log_p(skip_idxs,zeros,active,log_p_curr,log_p_prev):
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]
    active_next = T.cast(T.minimum(
        T.maximum(
            active + 1,
            T.max(T.concatenate([active_skip_idxs, [-1]])) + 2 + 1
        ),
        log_p_curr.shape[0]
    ), 'int32')

    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)
    _p_prev = zeros[:active_next]
    # copy over
    _p_prev = T.set_subtensor(_p_prev[:active], p_prev)
    # previous transitions
    _p_prev = T.inc_subtensor(_p_prev[1:], _p_prev[:-1])
    # skip transitions
    _p_prev = T.inc_subtensor(
        _p_prev[active_skip_idxs + 2], p_prev[active_skip_idxs])
    updated_log_p_prev = T.log(_p_prev) + common_factor

    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev
    )
    return active_next, log_p_next
Example 14
    def plotUpdate(self,updates):
        '''
        >>>get update info of each layer
        >>>type updates: dict
        >>>para updates: update dictionary
        '''
        maxdict=T.zeros(shape=(self.deep*2+1,))
        mindict=T.zeros(shape=(self.deep*2+1,))
        meandict=T.zeros(shape=(self.deep*2+1,))
        
        for i in xrange(self.deep):
            updw=updates[self.layers[i].w]-self.layers[i].w
            maxdict=T.set_subtensor(maxdict[2*i],T.max(updw))
            mindict=T.set_subtensor(mindict[2*i],T.min(updw))
            meandict=T.set_subtensor(meandict[2*i],T.mean(updw))
            updb=updates[self.layers[i].b]-self.layers[i].b
            maxdict=T.set_subtensor(maxdict[2*i+1],T.max(updb))
            mindict=T.set_subtensor(mindict[2*i+1],T.min(updb))
            meandict=T.set_subtensor(meandict[2*i+1],T.mean(updb))

        updw=updates[self.classifier.w]-self.classifier.w
        maxdict=T.set_subtensor(maxdict[self.deep*2],T.max(updw))
        mindict=T.set_subtensor(mindict[self.deep*2],T.min(updw))
        meandict=T.set_subtensor(meandict[self.deep*2],T.mean(updw))
        return [maxdict,mindict,meandict]
Example 15
    def _activation(self, Y, L, M, W):
        """Returns the activation for a given input.

        Derived from the generative model formulation of hierarchical
        Poisson mixtures, the formula for the activation in the network
        reads as follows:
        I_c =
         \sum_d \log(W_{cd})y_d + \log(M_{lc})        for labeled data
         \sum_d \log(W_{cd})y_d + \log(\sum_k M_{kc}) for unlabeled data
        s_c = softmax(I_c)
        """
        # first: complete inference to find label
        # Input integration:
        I = T.tensordot(Y,T.log(W),axes=[1,1])
        # recurrent term:
        vM = M[L]
        L_index = T.eq(L,-1).nonzero()
        vM = T.set_subtensor(vM[L_index], T.sum(M, axis=0))
        # numeric trick to prevent overflow in the exp-function
        max_exponent = 86. - T.ceil(T.log(I.shape[1].astype('float32')))
        scale = T.switch(
            T.gt(T.max(I, axis=1, keepdims=True), max_exponent),
            T.max(I, axis=1, keepdims=True) - max_exponent,
            0.)
        # numeric approximation to prevent underflow in the exp-function:
        # map too low values of I to a fixed minimum value
        min_exponent = -87. + T.ceil(T.log(I.shape[1].astype('float32')))
        I = T.switch(
            T.lt(I-scale, min_exponent),
            scale+min_exponent,
            I)
        # activation: recurrent softmax with overflow protection
        s = vM*T.exp(I-scale)/T.sum(vM*T.exp(I-scale), axis=1, keepdims=True)
        return s
Example 16
def norm(x,ord):
    x = as_tensor_variable(x)
    ndim = x.ndim
    if ndim == 0:
        raise ValueError("'axis' entry is out of bounds.")
    elif ndim == 1:
        if ord == None:
            return tensor.sum(x**2)**0.5
        elif ord == 'inf':
            return tensor.max(abs(x))
        elif ord == '-inf':
            return tensor.min(abs(x))
        elif ord == 0:
            return x[x.nonzero()].shape[0]
        else:
            try:
                z = tensor.sum(abs(x**ord))**(1./ord)
            except TypeError:
                raise ValueError("Invalid norm order for vectors.")
            return z
    elif ndim == 2:
        if ord == None or ord == 'fro':
            return tensor.sum(abs(x**2))**(0.5)
        elif ord == 'inf':
            return tensor.max(tensor.sum(abs(x), 1))
        elif ord == '-inf':
            return tensor.min(tensor.sum(abs(x), 1))
        elif ord == 1:
            return tensor.max(tensor.sum(abs(x), 0))
        elif ord == -1:
            return tensor.min(tensor.sum(abs(x),0))
        else:
            raise ValueError(0)
    elif ndim > 2:
        raise NotImplementedError("We don't support norm with ndim > 2")
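A short usage sketch, assuming the module-level imports of the original file (tensor, as_tensor_variable) are in scope; for vectors it mirrors numpy.linalg.norm:

import numpy as np
import theano
import theano.tensor as tensor

v = tensor.vector()
f = theano.function([v], [norm(v, None), norm(v, 'inf')])
x = np.array([3., -4.], dtype=theano.config.floatX)
print(f(x))   # [5.0, 4.0], matching np.linalg.norm(x) and np.linalg.norm(x, np.inf)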
Example 17
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX
        )
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in dist.dist_info_keys
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, action_var)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = - TT.sum(logli * advantage_var * valid_var) / TT.sum(valid_var)
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            max_kl = TT.max(kl * valid_var)
        else:
            surr_obj = - TT.mean(logli * advantage_var)
            mean_kl = TT.mean(kl)
            max_kl = TT.max(kl)

        input_list = [obs_var, action_var, advantage_var]
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(surr_obj, target=self.policy, inputs=input_list)

        f_kl = ext.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(
            f_kl=f_kl,
        )
Example 18
 def pos_mf_iteration(g1, h1, v, pos_counter):
     h2 = self.h_hat(g1, v)
     s2_1 = self.s1_hat(g1, v)
     s2_0 = self.s0_hat(g1, v)
     g2 = self.g_hat(h2, s2_1, s2_0)
     # stopping criterion
     stop = T.maximum(T.max(g2 - g1), T.max(h2 - h1))
     return [g2, h2, s2_1, s2_0, v, pos_counter + 1], theano.scan_module.until(stop < eps)
Example 19
def rmax(x):

	xmax  = T.ge(x, T.max(x, axis = 1).reshape((x.shape[0],1)))
	shift = (T.ones_like(x) - xmax) * x
	max2  = T.max(shift,axis = 1).reshape((x.shape[0],1))
	out = T.nnet.relu(x - max2)

	return out
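Restated in NumPy (a sketch): rmax subtracts, per row, the largest value that remains after the row maximum is masked out, then clips at zero, so only the top entries of each row stay positive:

import numpy as np

def rmax_np(x):
    xmax = (x >= x.max(axis=1, keepdims=True)).astype(x.dtype)
    max2 = ((1.0 - xmax) * x).max(axis=1, keepdims=True)
    return np.maximum(x - max2, 0.0)

print(rmax_np(np.array([[0.1, 0.7, 0.2],
                        [0.4, 0.4, 0.3]])))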
Example 20
 def __call__(self, x):
     if x.ndim == 2:
         x = T.max([x[:, n::self.n_pool] for n in range(self.n_pool)], axis=0)
     elif x.ndim == 4:
         x = T.max([x[:, n::self.n_pool, :, :] for n in range(self.n_pool)], axis=0)
     elif x.ndim == 3:
         x = T.max([x[:, :, n::self.n_pool] for n in range(self.n_pool)], axis=0)
     return x
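The same channel-wise maxout pooling, sketched in plain NumPy for the 2D case with a hypothetical n_pool of 2:

import numpy as np

def maxout_2d(x, n_pool=2):
    # split the feature axis into n_pool interleaved groups and keep the element-wise max
    return np.max([x[:, n::n_pool] for n in range(n_pool)], axis=0)

x = np.arange(8, dtype='float32').reshape(2, 4)
print(maxout_2d(x))   # [[1. 3.]
                      #  [5. 7.]]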
Example 21
 def __call__(self, x):
     if x.ndim == 2:
         x = T.max([x[:, n::self.n_pool] for n in range(self.n_pool)], axis=0)
     elif x.ndim == 4:
         x = T.max([x[:, n::self.n_pool, :, :] for n in range(self.n_pool)], axis=0)
     else:
         raise NotImplementedError
     return x
Example 22
def Max_pooling(inp):
	"""
	Finding max across rows; inp is a 2D matrix
	"""
	if inp.ndim==1:
		return T.max(inp)
	else:
		return T.max(inp,axis=0)
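A minimal usage sketch, assuming Max_pooling is imported from the module above:

import numpy as np
import theano
import theano.tensor as T

m = T.matrix()
f = theano.function([m], Max_pooling(m))
x = np.array([[1., 5.],
              [3., 2.]], dtype=theano.config.floatX)
print(f(x))   # [3. 5.], the column-wise maxima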
Example 23
 def __call__(self, x):
     if x.ndim == 2:
         x = T.max([x[:, n :: self.n_pool] for n in range(self.n_pool)], axis=0)
     elif x.ndim == 4:
         x = T.max([x[:, n :: self.n_pool, :, :] for n in range(self.n_pool)], axis=0)
     elif x.ndim == 3:
         print "assuming standard rnn 3tensor"
         x = T.max([x[:, :, n :: self.n_pool] for n in range(self.n_pool)], axis=0)
     return x
Example 24
 def pos_mf_iteration(g1, h1, v, pos_counter):
     h2 = self.h_hat(g1, v)
     s2_1 = self.s1_hat(g1, v)
     s2_0 = self.s0_hat(g1, v)
     g2 = self.g_hat(h2, s2_1, s2_0)
     # stopping criterion
     dl_dghat = T.max(abs(self.dlbound_dg(g2, h2, s2_1, s2_0, v)))
     dl_dhhat = T.max(abs(self.dlbound_dh(g2, h2, s2_1, s2_0, v)))
     stop = T.maximum(dl_dghat, dl_dhhat)
     return [g2, h2, s2_1, s2_0, v, pos_counter + 1], theano.scan_module.until(stop < eps)
Example 25
	def _step_test(self,
			  x_t, xi_t, xf_t, xo_t, xc_t, mask_tm1,
			  pred1_tm1, pred2_tm1, pred3_tm1, pred4_tm1, h_tm1, c_tm1, ctx_tm1, 
			  u_i, u_f, u_o, u_c, x_encoder, attention_encoder, x_img, B_W, B_U, B_Wimg, B_Wctx):

		outer1 = pred1_tm1[:, :, np.newaxis] * pred2_tm1[:, np.newaxis, :]
		outer1 =  outer1.reshape((outer1.shape[0],-1))
		outer2 = pred3_tm1[:, :, np.newaxis] * pred4_tm1[:, np.newaxis, :]
		outer2 =  outer2.reshape((outer2.shape[0],-1))
		pred = outer1[:, :, np.newaxis] * outer2[:, np.newaxis, :]
		pred =	pred.reshape((pred.shape[0],-1))
		x_t = self.W_embedding[T.argmax(pred, axis = 1)] * B_W[4]

		h_mask_tm1 = mask_tm1 * h_tm1
		c_mask_tm1 = mask_tm1 * c_tm1

		attention_x = T.dot(x_t, self.W_x2a)
		attention_total = attention_x[:,None,:] + attention_encoder
		if self.prev_context:
			attention_prev = T.dot(ctx_tm1,self.W_ctx2a)
			attention_total += attention_prev[:,None,:]

		attention_activation = T.dot( T.tanh(attention_total), self.V) # attention -> scores
		attention_alpha = T.nnet.softmax(attention_activation[:,:,0])  # scores -> weights
		ctx_t = (x_encoder * attention_alpha[:,:,None]).sum(axis = 1)  # weighted average of context vectors

		xi_t = T.dot(x_t * B_W[0], self.W_i) + self.b_i + T.dot(x_img * B_Wimg[0], self.Wimg_i) + T.dot(ctx_t * B_Wctx[0], self.Wctx_i)
		xf_t = T.dot(x_t * B_W[1], self.W_f) + self.b_f + T.dot(x_img * B_Wimg[1], self.Wimg_f) + T.dot(ctx_t * B_Wctx[1], self.Wctx_f)
		xc_t = T.dot(x_t * B_W[2], self.W_c) + self.b_c + T.dot(x_img * B_Wimg[2], self.Wimg_c) + T.dot(ctx_t * B_Wctx[2], self.Wctx_c)
		xo_t = T.dot(x_t * B_W[3], self.W_o) + self.b_o + T.dot(x_img * B_Wimg[3], self.Wimg_o) + T.dot(ctx_t * B_Wctx[3], self.Wctx_o)

		i_t = self.inner_activation(xi_t + T.dot(h_mask_tm1 * B_U[0], u_i))
		f_t = self.inner_activation(xf_t + T.dot(h_mask_tm1 * B_U[1], u_f))
		c_t = f_t * c_mask_tm1 + i_t * self.activation(xc_t + T.dot(h_mask_tm1 * B_U[2], u_c))
		o_t = self.inner_activation(xo_t + T.dot(h_mask_tm1 * B_U[3], u_o))
		h_t = o_t * self.activation(c_t)

		pred1_t = T.dot(h_t, self.U_p1) + self.b_p1
		pred1_t = T.nnet.softmax(pred1_t.reshape((-1, pred1_t.shape[-1]))).reshape(pred1_t.shape)

		pred2_t = T.dot(h_t, self.U_p2) + self.b_p2
		pred2_t = T.nnet.softmax(pred2_t.reshape((-1, pred2_t.shape[-1]))).reshape(pred2_t.shape)

		pred3_t = T.dot(h_t, self.U_p3) + self.b_p3
		pred3_t = T.nnet.softmax(pred3_t.reshape((-1, pred3_t.shape[-1]))).reshape(pred3_t.shape)

		pred4_t = T.dot(h_t, self.U_p4) + self.b_p4
		pred4_t = T.nnet.softmax(pred4_t.reshape((-1, pred4_t.shape[-1]))).reshape(pred4_t.shape)

		pred1_t = T.ge(pred1_t, T.max(pred1_t, axis = 1).reshape((pred1_t.shape[0],1)))*1.0
		pred2_t = T.ge(pred2_t, T.max(pred2_t, axis = 1).reshape((pred2_t.shape[0],1)))*1.0
		pred3_t = T.ge(pred3_t, T.max(pred3_t, axis = 1).reshape((pred3_t.shape[0],1)))*1.0
		pred4_t = T.ge(pred4_t, T.max(pred4_t, axis = 1).reshape((pred4_t.shape[0],1)))*1.0

		return pred1_t, pred2_t, pred3_t, pred4_t, h_t, c_t, ctx_t
Example 26
def logsoftmax(x, axis=None):
    '''
    Applies logsoftmax to x over the given axis (i.e. log(exp/sum(exp))).
    '''
    if isinstance(axis, int):
        m = T.max(x, axis=axis, keepdims=True)
    else:
        m = T.max(x)
    exp_x = T.exp(x - m)
    Z = T.sum(exp_x, axis=axis, keepdims=True)
    return x - m - T.log(Z)
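A quick numeric sanity check (a sketch, assuming logsoftmax and the T alias are available as above): exponentiating the output should give rows that sum to one.

import numpy as np
import theano
import theano.tensor as T

x = T.matrix()
f = theano.function([x], logsoftmax(x, axis=1))
out = f(np.random.randn(2, 4).astype(theano.config.floatX))
print(np.exp(out).sum(axis=1))   # approximately [1. 1.]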
Example 27
 def update_t(t, LLForward, alphas, scorematrix, queryseq, blank, T, L2):
     start = tensor.max([0, L2 - 2 * (T - t)])
     end = tensor.min([2 * t + 2, L2])
     s = tensor.arange(start, end)
     results, _ = theano.scan(fn=update_s, sequences=[s], non_sequences=[scorematrix, queryseq, blank, t],
                              outputs_info=[alphas], name='scan_along_s')
     alphas = results[-1]
     c = tensor.sum(alphas[start:end, t])
     c = tensor.max([1e-15, c])
     alphas = tensor.set_subtensor(alphas[start:end, t], alphas[start:end, t] / c)
     LLForward += tensor.log(c)
     return LLForward, alphas
Example 28
 def Convolution(self, x, mask):
     xe = self.approx_embedder(x)
     _mask = self.tmp[mask]
     
     _res1, _ = theano.scan(self.ConvLayer1, sequences=[xe])
     _res2, _ = theano.scan(self.ConvLayer2, sequences=[xe[:-1], xe[1:]])
     _res3, _ = theano.scan(self.ConvLayer3, sequences=[xe[:-2],xe[1:-1],xe[2:]])
     
     hidden1 = T.tanh(T.max(_res1*_mask, axis=0)).dimshuffle('x',0,1)
     hidden2 = T.tanh(T.max(_res2*_mask[:-1], axis=0)).dimshuffle('x',0,1)
     hidden3 = T.tanh(T.max(_res3*_mask[:-2], axis=0)).dimshuffle('x',0,1)
     
     return T.mean(T.concatenate([hidden1, hidden2, hidden3], axis=0), axis=0)
Example 29
    def get_monitoring_channels(self, V):

        vb, hb, weights = self.get_params()
        norms = theano_norms(weights)
        return {'W_min': tensor.min(weights),
                'W_max': tensor.max(weights),
                'W_norm_mean': tensor.mean(norms),
                'bias_hid_min' : tensor.min(hb),
                'bias_hid_mean' : tensor.mean(hb),
                'bias_hid_max' : tensor.max(hb),
                'bias_vis_min' : tensor.min(vb),
                'bias_vis_mean' : tensor.mean(vb),
                'bias_vis_max': tensor.max(vb),
        }
Example 30
    def __init__(self,
                 sample_fn,
                 free_energy_fn,
                 v_sample0,
                 n_runs,
                 log_int=500):
        """
        Initializes the AIS object.

        Parameters
        ----------
        sample_fn: compiled theano function, sample_fn(beta, v_sample)
            returns new model samples, at inverse temperature `beta`.
            Internally, we do this by performing block gibbs sampling using
            Eq.(15-17) (implemented in rbm_ais_gibbs_for_v) starting from
            configuration v_sample.

        free_energy_fn: theano function, free_energy_fn(beta,v_sample)
            Computes the free-energy of configuration v_sample at the
            interpolating distribution p_a^(1-beta) p_b^(beta).

        v_sample0: numpy.ndarray
            initial samples from model A.

        n_runs: int
            number of AIS runs (i.e. minibatch size)

        log_int: int
            log standard deviation of log ais weights every `log_int`
            temperatures.
        """

        self.sample_fn = sample_fn
        self.free_energy_fn = free_energy_fn
        self.v_sample0 = v_sample0
        self.n_runs = n_runs
        self.log_int = log_int

        # initialize log importance weights
        self.log_ais_w = numpy.zeros(n_runs, dtype=config.floatX)

        # utility function for safely computing log-mean of the ais weights
        ais_w = tensor.vector()
        dlogz = (
            tensor.log(tensor.mean(tensor.exp(ais_w - tensor.max(ais_w)))) \
                + tensor.max(ais_w)
        )
        self.log_mean = theano.function([ais_w],
                                        dlogz,
                                        allow_input_downcast=False)
Example 31
def compile_update_svdd(nnet, inputs, targets):
    """
    create a Deep SVDD loss for network given in argument
    """

    floatX = Cfg.floatX

    ndim = nnet.data._X_train.ndim

    C = Cfg.C
    C_rec = Cfg.C_rec
    nu = Cfg.nu

    # initialize R
    if nnet.R_init > 0:
        nnet.Rvar = shared(floatX(nnet.R_init), name="R")
    else:
        nnet.Rvar = shared(floatX(1), name="R")  # initialization with R=1

    # Final Layer of the network
    final_layer = nnet.all_layers[-1]

    # SVDD Loss
    feature_layer = nnet.feature_layer
    rep = lasagne.layers.get_output(feature_layer,
                                    inputs=inputs,
                                    deterministic=False)

    # initialize c (0.5 in every feature representation dimension)
    rep_dim = feature_layer.num_units
    # maximum likelihood
    volume = T.cast(floatX(-0.5) * T.sum(((rep)**2), axis=1, dtype='floatX'),
                    dtype='floatX')
    log_pro = T.cast(T.log(floatX(2 * np.pi) * T.exp(volume)), dtype='floatX')

    # log_likehood =T.mean(log_pro)
    # volume = T.cast(T.sum(floatX(-0.5) *(T.log(floatX(2 * np.pi)) + (rep ** 2)) , axis=1, dtype='floatX'), dtype='floatX')
    # log_pro = T.cast(T.exp(volume), dtype='floatX')
    # log_likehood = T.mean(log_pro)

    # # calculate entropy throught Kernel Density Estimation
    # rep_tranpose = T.transpose(rep, (1, 0))
    # rep_reshape = T.reshape(rep_tranpose, [rep_dim, Cfg.batch_size, 1])
    # transfer_vector = theano.shared(np.ones([1, Cfg.batch_size], dtype='float32'))
    # result = T.dot(rep_reshape, transfer_vector)
    # result1 = T.transpose(result, (0, 2, 1))
    # subtract = result - result1
    # KL_volume = T.cast(floatX(-0.5) * (T.log(floatX(2 * np.pi)) + (subtract ** 2)), dtype='floatX')
    # KL_volume = T.sum(KL_volume, axis=0)
    # KL_pro = T.cast(T.exp(KL_volume), dtype='floatX')
    # KL_pro_average = T.mean(KL_pro, axis=1)
    # log_KL_pro = T.log(KL_pro_average)
    # entropy = T.mean(log_KL_pro)

    # nnet.cvar = shared(floatX(np.ones(rep_dim) * (1. / (rep_dim ** 0.5))),
    #                    name="c")
    nnet.cvar = shared(floatX(np.ones(rep_dim) * 0.5), name="c")

    dist = T.sum(((rep - nnet.cvar.dimshuffle('x', 0))**2),
                 axis=1,
                 dtype='floatX')
    scores = dist - nnet.Rvar
    stack = T.stack([T.zeros_like(scores), scores], axis=1)
    loss = T.cast(T.sum(T.max(stack, axis=1)) / (inputs.shape[0] * nu),
                  dtype='floatX')

    y_pred = T.argmax(stack, axis=1)
    acc = T.cast((T.sum(T.eq(y_pred.flatten(), targets), dtype='int32') * 1. /
                  targets.shape[0]), 'floatX')

    # Network weight decay
    if Cfg.weight_decay:
        l2_penalty = (1 / C) * get_l2_penalty(nnet)
    else:
        l2_penalty = T.cast(0, dtype='floatX')

    # Reconstruction regularization
    if Cfg.reconstruction_penalty:
        reconstruction = lasagne.layers.get_output(final_layer,
                                                   inputs=inputs,
                                                   deterministic=False)

        # use l2 or binary crossentropy loss (features are scaled to [0,1])
        if Cfg.ae_loss == "l2":
            rec_loss = lasagne.objectives.squared_error(reconstruction, inputs)
        if Cfg.ae_loss == "ce":
            rec_loss = lasagne.objectives.binary_crossentropy(
                reconstruction, inputs)

        rec_loss = T.sum(rec_loss, axis=range(1, ndim), dtype='floatX')
        rec_penalty = (1 / C_rec) * T.mean(rec_loss)
    else:
        rec_penalty = T.cast(0, dtype='floatX')

    trainable_params = lasagne.layers.get_all_params(final_layer,
                                                     trainable=True)
    #Deep Gaussian Model
    updates_deep_kde = get_updates(nnet,
                                   deep_kde_loss,
                                   trainable_params,
                                   solver=nnet.solver)
    # nnet.backprop_deep_kde = theano.function([inputs, targets], [log_likehood, entropy], updates=updates_deep_kde,
    #                                      on_unused_input='warn')

    # Backpropagation (hard-margin: only minimizing everything to a ball centered at c)
    if not Cfg.center_fixed:
        trainable_params.append(
            nnet.cvar
        )  # add center c to trainable parameters if it should not be fixed.

    avg_dist = T.mean(dist, dtype="floatX")

    obj_ball = T.cast(floatX(0.5) * (l2_penalty + rec_penalty) + avg_dist,
                      dtype='floatX')
    updates_ball = get_updates(nnet,
                               obj_ball,
                               trainable_params,
                               solver=nnet.solver)
    nnet.backprop_ball = theano.function([inputs, targets], [obj_ball, acc],
                                         updates=updates_ball,
                                         on_unused_input='warn')

    # Backpropagation (without training R)
    obj = T.cast(floatX(0.5) * (l2_penalty + rec_penalty) + nnet.Rvar + loss,
                 dtype='floatX')
    updates = get_updates(nnet, obj, trainable_params, solver=nnet.solver)
    nnet.backprop_without_R = theano.function([inputs, targets], [obj, acc],
                                              updates=updates,
                                              on_unused_input='warn')

    # Backpropagation (with training R)
    trainable_params.append(nnet.Rvar)  # add radius R to trainable parameters
    updates = get_updates(nnet, obj, trainable_params, solver=nnet.solver)
    nnet.backprop = theano.function([inputs, targets], [obj, acc],
                                    updates=updates,
                                    on_unused_input='warn')

    # Forwardpropagation
    test_rep = lasagne.layers.get_output(feature_layer,
                                         inputs=inputs,
                                         deterministic=True)
    test_rep_norm = test_rep.norm(L=2, axis=1)

    test_dist = T.sum(((test_rep - nnet.cvar.dimshuffle('x', 0))**2),
                      axis=1,
                      dtype='floatX')

    test_scores = test_dist - nnet.Rvar
    test_stack = T.stack([T.zeros_like(test_scores), test_scores], axis=1)
    test_loss = T.cast(T.sum(T.max(test_stack, axis=1)) /
                       (inputs.shape[0] * nu),
                       dtype='floatX')

    test_y_pred = T.argmax(test_stack, axis=1)
    test_acc = T.cast(
        (T.sum(T.eq(test_y_pred.flatten(), targets), dtype='int32') * 1. /
         targets.shape[0]),
        dtype='floatX')

    # Reconstruction regularization (with determinisitc=True)
    if Cfg.reconstruction_penalty:
        test_reconstruction = lasagne.layers.get_output(final_layer,
                                                        inputs=inputs,
                                                        deterministic=True)

        # use l2 or binary crossentropy loss (features are scaled to [0,1])
        if Cfg.ae_loss == "l2":
            test_rec_loss = lasagne.objectives.squared_error(
                test_reconstruction, inputs)
        if Cfg.ae_loss == "ce":
            test_rec_loss = lasagne.objectives.binary_crossentropy(
                test_reconstruction, inputs)

        test_rec_loss = T.sum(test_rec_loss,
                              axis=range(1, ndim),
                              dtype='floatX')
        test_rec_penalty = (1 / C_rec) * T.mean(test_rec_loss)
    else:
        test_reconstruction = lasagne.layers.get_output(final_layer,
                                                        inputs=inputs,
                                                        deterministic=True)
        test_rec_penalty = T.cast(0, dtype='floatX')

    test_obj = T.cast(floatX(0.5) * (l2_penalty + test_rec_penalty) +
                      nnet.Rvar + test_loss,
                      dtype='floatX')
    nnet.forward = theano.function([inputs, targets], [
        test_obj, test_acc, test_scores,
        floatX(0.5) * l2_penalty,
        floatX(0.5) * test_rec_penalty, test_rep, test_rep_norm,
        test_reconstruction, test_loss, nnet.Rvar
    ],
                                   on_unused_input='warn')
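Side note on the stack/T.max pattern used for the SVDD loss above: stacking zeros with the scores and taking the max along the new axis is just a hinge, max(0, dist - R). A NumPy sketch:

import numpy as np

scores = np.array([-0.3, 0.2, 1.5])          # dist - R for three hypothetical points
stack = np.stack([np.zeros_like(scores), scores], axis=1)
print(stack.max(axis=1))                     # [0.  0.2 1.5] == np.maximum(0, scores)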
Example 32
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # len*batch
        x = self.x = T.imatrix()

        z = self.z = T.bmatrix()
        z = z.dimshuffle((0, 1, "x"))

        # batch*nclasses
        y = self.y = T.fmatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        depth = args.depth
        layer_type = args.layer.lower()
        for i in xrange(depth):
            if layer_type == "rcnn":
                l = ExtRCNN(n_in=n_e if i == 0 else n_d,
                            n_out=n_d,
                            activation=activation,
                            order=args.order)
            elif layer_type == "lstm":
                l = ExtLSTM(n_in=n_e if i == 0 else n_d,
                            n_out=n_d,
                            activation=activation)
            layers.append(l)

        # len * batch * 1
        masks = T.cast(
            T.neq(x, padding_id).dimshuffle((0, 1, "x")) * z,
            theano.config.floatX)
        # batch * 1
        cnt_non_padding = T.sum(masks, axis=0) + 1e-8

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        pooling = args.pooling
        lst_states = []
        h_prev = embs
        for l in layers:
            # len*batch*n_d
            h_next = l.forward_all(h_prev, z)
            if pooling:
                # batch * n_d
                masked_sum = T.sum(h_next * masks, axis=0)
                lst_states.append(masked_sum / cnt_non_padding)  # mean pooling
            else:
                lst_states.append(h_next[-1])  # last state
            h_prev = apply_dropout(h_next, dropout)

        if args.use_all:
            size = depth * n_d
            # batch * size (i.e. n_d*depth)
            h_final = T.concatenate(lst_states, axis=1)
        else:
            size = n_d
            h_final = lst_states[-1]
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=self.nclasses,
                                                 activation=sigmoid)

        # batch * nclasses
        preds = self.preds = output_layer.forward(h_final)

        # batch
        loss_mat = self.loss_mat = (preds - y)**2
        loss = self.loss = T.mean(loss_mat)

        pred_diff = self.pred_diff = T.mean(
            T.max(preds, axis=1) - T.min(preds, axis=1))

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        cost = self.cost = loss * 10 + l2_cost
Example 33
def set_network_trainer(input_data,
                        input_mask,
                        target_data,
                        target_mask,
                        num_outputs,
                        network,
                        updater,
                        learning_rate,
                        grad_max_norm=10.,
                        l2_lambda=1e-5,
                        load_updater_params=None):
    # get one hot target
    one_hot_target_data = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1),
                                                 nb_class=num_outputs,
                                                 dtype=floatX)

    # get network output data
    predict_data = get_output(network, deterministic=False)
    num_seqs = predict_data.shape[0]

    # get prediction cost
    predict_data = T.reshape(x=predict_data,
                             newshape=(-1, num_outputs),
                             ndim=2)
    predict_data = predict_data - T.max(predict_data, axis=-1, keepdims=True)
    predict_data = predict_data - T.log(
        T.sum(T.exp(predict_data), axis=-1, keepdims=True))
    train_predict_cost = -T.sum(T.mul(one_hot_target_data, predict_data),
                                axis=-1)
    train_predict_cost = train_predict_cost * T.flatten(target_mask, 1)
    train_model_cost = train_predict_cost.sum() / num_seqs
    train_frame_cost = train_predict_cost.sum() / target_mask.sum()

    # get regularizer cost
    train_regularizer_cost = regularize_network_params(network, penalty=l2)

    # get network parameters
    network_params = get_all_params(network, trainable=True)

    # get network gradients
    network_grads = theano.grad(cost=train_model_cost +
                                train_regularizer_cost * l2_lambda,
                                wrt=network_params)

    if grad_max_norm > 0.:
        network_grads, network_grads_norm = total_norm_constraint(
            tensor_vars=network_grads,
            max_norm=grad_max_norm,
            return_norm=True)
    else:
        network_grads_norm = T.sqrt(
            sum(T.sum(grad**2) for grad in network_grads))

    # set updater
    train_updates, trainer_params = updater(
        loss_or_grads=network_grads,
        params=network_params,
        learning_rate=learning_rate,
        load_params_dict=load_updater_params)

    # get training (update) function
    training_fn = theano.function(
        inputs=[input_data, input_mask, target_data, target_mask],
        outputs=[train_frame_cost, network_grads_norm],
        updates=train_updates)
    return training_fn, trainer_params
Example 34
    def initialize_network(self):
        """
        :description: this method initializes the network, updates, and theano functions for training and 
            retrieving q values. Here's an outline: 

            1. build the q network and target q network
            2. initialize theano symbolic variables used for compiling functions
            3. initialize the theano numeric variables used as input to functions
            4. formulate the symbolic loss 
            5. formulate the symbolic updates 
            6. compile theano functions for training and for getting q_values
        """
        build_network = self.get_build_network()
        batch_size, input_shape = self.batch_size, self.input_shape
        lasagne.random.set_rng(self.rng)

        # 1. build the q network and target q network
        self.l_out = build_network(input_shape, self.sequence_length, batch_size, self.num_actions)
        self.next_l_out = build_network(input_shape, self.sequence_length, batch_size, self.num_actions)
        self.reset_target_network()

        # 2. initialize theano symbolic variables used for compiling functions
        states = T.tensor3('states')
        actions = T.icol('actions')
        rewards = T.col('rewards')
        next_states = T.tensor3('next_states')
        # terminals are used to indicate a terminal state in the episode and hence a mask over the future
        # q values i.e., Q(s',a')
        terminals = T.icol('terminals')

        # 3. initialize the theano numeric variables used as input to functions or in functions
        self.states_shape = (batch_size,) + (self.sequence_length,) + (self.input_shape, )
        self.states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX))
        self.next_states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), 
            broadcastable=(False, True))
        self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))
        self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))        

        # 4. formulate the symbolic loss 
        q_vals = lasagne.layers.get_output(self.l_out, states)
        next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states)
        target = (rewards +
                 (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        # reshape((-1,)) == 'make a row vector', reshape((-1, 1)) == 'make a column vector'
        diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1))

        # a lot of recent work clips the td error at 1, so we do that here.
        # The problem is that the gradient backpropagating through this minimum node
        # will be zero if diff is larger than 1.0 (because changing params before
        # the minimum does not impact the output of the minimum). To account for
        # this we take the part of the td error (magnitude) greater than 1.0 and simply
        # add it to the loss, which lets the gradient backprop, but only linearly
        # in the td error rather than quadratically
        quadratic_part = T.minimum(abs(diff), 1.0)
        linear_part = abs(diff) - quadratic_part
        loss = 0.5 * quadratic_part ** 2 + linear_part
        loss = T.sum(loss)

        # 5. formulate the symbolic updates 
        params = lasagne.layers.helper.get_all_params(self.l_out)  
        updates = self.initialize_updates(self.update_rule, loss, params, self.learning_rate)

        # 6. compile theano functions for training and for getting q_values and hid init
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens)
        self._get_q_values = theano.function([], [q_vals], givens={states: self.states_shared})
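A worked NumPy illustration of the clipped TD-error loss described in the comments above: quadratic inside [-1, 1], linear outside (a sketch, not tied to the shared variables used in the class):

import numpy as np

diff = np.array([-2.5, -0.4, 0.7, 3.0])
quadratic_part = np.minimum(np.abs(diff), 1.0)
linear_part = np.abs(diff) - quadratic_part
loss = 0.5 * quadratic_part ** 2 + linear_part
print(loss)   # [2.    0.08  0.245 2.5 ]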
Example 35
    def get_output_for(self, deterministic=False):

        if deterministic:
            deterministic_flag = T.constant(1)
        else:
            deterministic_flag = T.constant(0)

        batch_size = self.pred.shape[0]
        time_steps = self.pred.shape[1]
        label_num = self.pred.shape[2]

        ## the start state to first label
        pred_t1 = self.pred[:, 0]  # shape: (batch size, label num)
        gs_t1 = self.gs[:, 0] - 1
        mask_t1 = self.masks[:, 0]

        score_t0 = T.zeros((batch_size, label_num))
        index_t0 = T.zeros((batch_size, label_num), dtype='int64')

        init_flag = T.constant(1)
        # return shape: (batch size, label num), (batch size, label num)
        score_t1, index_t1 = self.score_one_step(pred_t1, gs_t1, mask_t1,
                                                 score_t0, index_t0,
                                                 self.init_t, self.tran_t,
                                                 deterministic_flag, init_flag)

        print 'score_t1', score_t1.eval()
        print 'index_t1', index_t1.eval()

        pred = self.pred.dimshuffle(1, 0, 2)
        gs = self.gs.dimshuffle(1, 0)
        mask = self.masks.dimshuffle(1, 0)
        init_flag = T.constant(0)

        # init_flag = T.constant(0)
        # score_t2, index_t2 = self.score_one_step(pred[1], gs[1]-1,
        #     mask[1], score_t1, index_t1, self.init_t, self.tran_t, deterministic_flag, init_flag)

        # print 'score_t2', score_t2.eval()
        # print 'index_t2', index_t2.eval()

        # print pred[1:].eval().shape
        # print (gs[1:]-1).eval().shape
        # print mask[1:].eval().shape
        # return shape: (time steps - 1, batch size, label num) ..., (time steps - 1, batch size)
        step_scores, step_indexs = theano.scan(
            fn=self.score_one_step,
            outputs_info=[score_t1, index_t1],
            sequences=[pred[1:], gs[1:] - 1, mask[1:]],
            non_sequences=[
                self.init_t, self.tran_t, deterministic_flag, init_flag
            ])[0]

        # # print step_scores.eval().shape
        # # print step_indexs.eval().shape
        print 'score_t2', step_scores.dimshuffle(1, 0, 2)[:, 0].eval()
        print 'index_t2', step_indexs.dimshuffle(1, 0, 2)[:, 0].eval()
        print 'score_t3', step_scores.dimshuffle(1, 0, 2)[:, 1].eval()
        print 'index_t3', step_indexs.dimshuffle(1, 0, 2)[:, 1].eval()

        # shape: (batch size, )
        last_step_max_score = T.max(step_scores[-1], axis=-1)
        last_step_max_index = T.argmax(step_scores[-1], axis=-1)

        def track_one_step(index_t, max_index_t):
            # example_indexs shape: (batch size, label num)
            # step_max_index shape: (batch size, )
            def scan_example(index_t_e, max_index_t_e):
                max_index_tm1_e = index_t_e[max_index_t_e]
                return max_index_tm1_e

            # return shape: (batch size, )
            max_index_tm1 = theano.scan(fn=scan_example,
                                        sequences=[index_t, max_index_t])[0]
            return max_index_tm1

        # reverse time step, shape: (time steps - 1, batch size, label num)
        #step_indexs = step_indexs[::-1]

        # return shape: (time steps - 1, batch size)
        index_chain = theano.scan(fn=track_one_step,
                                  sequences=step_indexs,
                                  outputs_info=last_step_max_index,
                                  go_backwards=True)[0]
        # return shape: (batch size, time steps - 1)
        index_chain = index_chain.dimshuffle(1, 0)

        # shape: (batch size, time steps)
        index_chain_reverse = self.aggregateTensor(last_step_max_index,
                                                   index_chain)

        # add 1 for label index (which index from 1)
        # return shape: (batch size, time steps)
        index_chain = (index_chain_reverse +
                       T.ones_like(index_chain_reverse))[:, ::-1]

        print 'index chain', index_chain.eval()

        def one_step_cost(step_index, pred_t, gs_t, index_chain_t, mask_t,
                          cost_tm1, gs_tm1, index_chain_tm1, init_tran, tran):
            # step_index: (1,)
            # pred_t: (batch size, label num)
            # gs_t_e: (batch size, )
            # index_chain_t: (batch size, )
            # mask_t: (batch size, )
            # cost_tm1: (batch size, )
            # gs_tm1: (batch size, )
            # index_chain_tm1: (batch size, )

            def scan_example(pred_t_e, gs_t_e, index_chain_t_e, mask_t_e,
                             cost_tm1_e, gs_tm1_e, index_chain_tm1_e,
                             step_index, init_tran, tran):
                # pred_t_e: (label num, )
                # gs_t_e: (1, )
                # index_chain_t_e: (1, )
                # mask_t_e: (1, )
                # gs_tm1_e: (1, )
                # index_chain_tm1_e: (1, )
                # init_tran: (label num, )
                # tran: (label num, label num)

                cost_t_e = None
                cost_t_e = theano.ifelse.ifelse(
                    T.eq(step_index, 0),
                    theano.printing.Print('\ninit step pred_t_e\n')(pred_t_e[
                        theano.printing.Print('\ninit step index_chain_t_e\n')
                        (index_chain_t_e)]) +
                    theano.printing.Print('\n initstep init_tran\n')(
                        init_tran[index_chain_t_e]) -
                    theano.printing.Print('\ninit step pred_t_e\n')(
                        pred_t_e[theano.printing.Print('\ninit step gs_t_e\n')
                                 (gs_t_e)]) -
                    theano.printing.Print('\ninit step init_tran\n')(
                        init_tran[gs_t_e]),
                    theano.printing.Print('\nother pred_t_e\n')(
                        pred_t_e[theano.printing.Print(
                            '\nother index_chain_t_e\n')(index_chain_t_e)]) +
                    theano.printing.Print('\nother tran\n')
                    (tran[theano.printing.Print('\nother index_chain_tm1_e\n')
                          (index_chain_tm1_e)][index_chain_t_e]) -
                    theano.printing.Print('\nother pred_t_e\n')(pred_t_e[
                        theano.printing.Print('\nother gs_t_e\n')(gs_t_e)]) -
                    theano.printing.Print('\nother tran\n')(
                        tran[theano.printing.Print('\nother gs_tm1_e\n')
                             (gs_tm1_e)][gs_t_e]))
                # if T.eq(step_index, 0) == T.constant(1):
                #     cost_t_e = pred_t_e[index_chain_t_e] + init_tran[index_chain_t_e]\
                #      - pred_t_e[gs_t_e] - init_tran[gs_t_e]
                # else:
                #     cost_t_e = pred_t_e[index_chain_t_e] + tran[index_chain_t_e][index_chain_tm1_e]\
                #      - pred_t_e[gs_t_e] - tran[gs_tm1_e][gs_t_e]

                cost_t_e = cost_t_e * mask_t_e

                # return shape: (1, )
                return theano.printing.Print('\ncost_t_e\n')(
                    cost_t_e), gs_t_e, index_chain_t_e

            # return shape: (batch size, )...
            cost_t, _, _ = theano.scan(
                fn=scan_example,
                sequences=[
                    pred_t, gs_t, index_chain_t, mask_t, cost_tm1, gs_tm1,
                    index_chain_tm1
                ],
                non_sequences=[step_index, init_tran, tran])[0]

            # return shape: (batch size, )...
            return cost_t, gs_t, index_chain_t

        # return shape: (time steps, batch size)
        index_chain_sff = index_chain.dimshuffle(1, 0)
        gs_t0 = T.zeros((batch_size, ), dtype='int64')
        cost_t0 = T.zeros((batch_size, ), dtype='float64')
        index_chain_t0 = T.zeros((batch_size, ), dtype='int64')

        # return shape: (time steps, batch size)
        print(gs - 1).eval()
        print(index_chain_sff - 1).eval()
        steps_cost, _, _ = theano.scan(
            fn=one_step_cost,
            outputs_info=[cost_t0, gs_t0, index_chain_t0],
            sequences=[
                T.arange(time_steps), pred, gs - 1, index_chain_sff - 1, mask
            ],
            non_sequences=[self.init_t, self.tran_t])[0]

        # return shape: (batch size, )
        cost = T.sum(steps_cost.dimshuffle(1, 0), axis=-1)

        # # return shape: (batch size, time steps - 1)
        # step_gs_scores = step_gs_scores.dimshuffle(1, 0)

        # # return shape: (batch size, )
        # last_gs_score = step_gs_scores[:, -1]

        # print 'score_t2', step_scores.dimshuffle(1, 0, 2)[:, 0].eval()
        # print 'index_t2', step_indexs.dimshuffle(1, 0, 2)[:, 0].eval()
        # print 'gs_score_t2', step_gs_scores[:, 0].eval()

        # print 'score_t3', step_scores.dimshuffle(1, 0, 2)[:, 1].eval()
        # print 'index_t3', step_indexs.dimshuffle(1, 0, 2)[:, 1].eval()
        # print 'gs_score_t3', step_gs_scores[:, 1].eval()

        # print index_chain.eval()
        # print last_step_max_score.eval()
        # print last_gs_score.eval()

        # return shape: (example num, time steps), (batch size, ), (batch size, )
        #return [index_chain, last_step_max_score, last_gs_score]

        print 'cost', cost.eval()
        # return shape: (batch size, )
        return cost
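# A minimal standalone sketch (illustrative only; toy shapes and names are
# assumptions) of the backtracking idiom used above: take T.max / T.argmax of
# the last step's scores, then follow the stored back-pointers with a
# reversed scan.
import theano
import theano.tensor as T

toy_scores = T.dtensor3('toy_scores')   # (time steps, batch size, label num)
toy_indexs = T.ltensor3('toy_indexs')   # back-pointers, same shape

toy_last_max_score = T.max(toy_scores[-1], axis=-1)     # (batch size, )
toy_last_max_index = T.argmax(toy_scores[-1], axis=-1)  # (batch size, )

def toy_track_one_step(index_t, max_index_t):
    # for every example in the batch, follow its back-pointer one step back
    return index_t[T.arange(index_t.shape[0]), max_index_t]

toy_chain, _ = theano.scan(fn=toy_track_one_step,
                           sequences=toy_indexs,
                           outputs_info=toy_last_max_index,
                           go_backwards=True)
viterbi_fn = theano.function([toy_scores, toy_indexs],
                             [toy_last_max_score, toy_chain])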
Esempio n. 36
0
def run_cnn(exp_name,
        dataset, embedding,
        log_fn, perf_fn,
        emb_dm=100,
        batch_size=100,
        filter_hs=[1, 2, 3],
        hidden_units=[200, 100, 11],
        type_hidden_units=[200, 100, 6],
        dropout_rate=0.5,
        shuffle_batch=True,
        n_epochs=300,
        lr_decay=0.95,
        activation=ReLU,
        sqr_norm_lim=9,
        non_static=True,
        print_freq=5, 
        sen_reg=False,
        L2=False):
    """
    Train and Evaluate CNN event encoder model
    :dataset: list containing the data splits [(train_x, train_y),
            (test_x, test_y)]
    :embedding: word embedding with shape (|V| * emb_dm)
    :filter_hs: filter height for each parallel cnn layer
    :dropout_rate: dropout rate for fully connected layers
    :n_epochs: the max number of iterations
    
    """
    start_time = timeit.default_timer()
    rng = np.random.RandomState(1234)
   
    input_height = len(dataset[0][0][0][0])
    num_sens = len(dataset[0][0][0])
    print "--input height ", input_height 
    input_width = emb_dm
    num_maps = hidden_units[0]

    ###################
    # start snippet 1 #
    ###################
    print "start to construct the model ...."
    x = T.tensor3("x")
    type_y = T.ivector("y_type")
    pop_y = T.ivector("y_pop")

    words = shared(value=np.asarray(embedding,
        dtype=theano.config.floatX), 
        name="embedding", borrow=True)

    # define function to keep padding vector as zero
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(input_width, dtype=theano.config.floatX)
    set_zero = function([zero_vector_tensor],
            updates=[(words, T.set_subtensor(words[0,:], zero_vector_tensor))])

    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape((
        x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm
        ))

    #########################
    # Construct Sen Vec #####
    #########################
    conv_layers = []
    filter_shape = (num_maps, 1, filter_hs[0], emb_dm)
    pool_size = (input_height - filter_hs[0] + 1, 1)
    conv_layer = nn.ConvPoolLayer(rng, input=layer0_input,
            input_shape=None, filter_shape=filter_shape,
            pool_size=pool_size, activation=activation)
    sen_vecs = conv_layer.output.reshape((x.shape[0], x.shape[1], num_maps))
    conv_layers.append(conv_layer)
    
    ########################
    ## Task 1: population ###
    ######################## 
    pop_layer_sizes = zip(hidden_units, hidden_units[1:])
    pop_layer_input = sen_vecs
    pop_drop_input = sen_vecs
    pop_hidden_outs = []
    pop_drop_outs = []
    pop_hidden_layers = []
    pop_drop_layers = []
    droprate = 0.5
    for layer_size in pop_layer_sizes[:-1]:
        U_value = np.random.random(layer_size).astype(theano.config.floatX)
        b_value = np.zeros((layer_size[-1],), dtype=theano.config.floatX)

        U = theano.shared(U_value, borrow=True, name="U")
        b = theano.shared(b_value, borrow=True, name="b")
        
        pop_hidden_layer = nn.HiddenLayer(rng, pop_layer_input, 
                layer_size[0], layer_size[1], ReLU, 
                U * (1 - droprate), b)
        pop_drop_hidden_layer = nn.DropoutHiddenLayer(rng, pop_drop_input,
                layer_size[0], layer_size[1], ReLU,
                droprate, U, b)

        pop_hidden_layers.append(pop_hidden_layer)
        pop_drop_layers.append(pop_drop_hidden_layer)

        pop_hidden_out = pop_hidden_layer.output
        pop_drop_out = pop_drop_hidden_layer.output

        pop_layer_input = pop_hidden_out
        pop_drop_input = pop_drop_out

        pop_hidden_outs.append(pop_hidden_out)
        pop_drop_outs.append(pop_drop_out)

    # construct pop classifier 
    n_in, n_out = pop_layer_sizes[-1]
    W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX)
    b_value = np.zeros((n_out,), dtype=theano.config.floatX)

    pop_W = theano.shared(W_value, borrow=True, name="pop_W")
    pop_b = theano.shared(b_value, borrow=True, name="pop_b")

    pop_act = T.dot(pop_hidden_outs[-1], pop_W * (1 - droprate)) + pop_b
    pop_drop_act = T.dot(pop_drop_outs[-1], pop_W) + pop_b

    pop_max_act = T.max(pop_act, axis=1).flatten(2)
    pop_drop_max_act = T.max(pop_drop_act, axis=1).flatten(2)

    pop_sen_max = T.argmax(T.max(pop_act, axis=2).flatten(2), axis=1)
    pop_drop_sen_max = T.argmax(T.max(pop_drop_act, axis=2).flatten(2), axis=1)
    
    pop_probs = T.nnet.softmax(pop_max_act)
    pop_drop_probs = T.nnet.softmax(pop_drop_max_act)

    pop_y_pred = T.argmax(pop_probs, axis=1)
    pop_drop_y_pred = T.argmax(pop_drop_probs, axis=1)

    pop_neg_loglikelihood = -T.mean(T.log(pop_probs)[T.arange(pop_y.shape[0]), pop_y])
    pop_drop_neg_loglikelihood = -T.mean(T.log(pop_drop_probs)[T.arange(pop_y.shape[0]), pop_y])

    pop_errors = T.mean(T.neq(pop_y_pred, pop_y))
    pop_errors_detail = T.neq(pop_y_pred, pop_y)

    pop_cost = pop_neg_loglikelihood
    pop_drop_cost = pop_drop_neg_loglikelihood


    
    ########################
    ## Task 2: event type ###
    ######################## 
    type_layer_sizes = zip(type_hidden_units, type_hidden_units[1:])
    type_layer_input = sen_vecs
    type_drop_input = sen_vecs
    type_hidden_outs = []
    type_drop_outs = []
    type_hidden_layers = []
    type_drop_layers = []
    droprate = 0.5
    for layer_size in type_layer_sizes[:-1]:
        U_value = np.random.random(layer_size).astype(theano.config.floatX)
        b_value = np.zeros((layer_size[-1],), dtype=theano.config.floatX)

        U = theano.shared(U_value, borrow=True, name="U")
        b = theano.shared(b_value, borrow=True, name="b")
        
        type_hidden_layer = nn.HiddenLayer(rng, type_layer_input, 
                layer_size[0], layer_size[1], ReLU, 
                U * (1 - droprate), b)
        type_drop_hidden_layer = nn.DropoutHiddenLayer(rng, type_drop_input,
                layer_size[0], layer_size[1], ReLU,
                droprate, U, b)

        type_hidden_layers.append(type_hidden_layer)
        type_drop_layers.append(type_drop_hidden_layer)

        type_hidden_out = type_hidden_layer.output
        type_drop_out = type_drop_hidden_layer.output

        type_layer_input = type_hidden_out
        type_drop_input = type_drop_out

        type_hidden_outs.append(type_hidden_out)
        type_drop_outs.append(type_drop_out)

    # construct type classifier 
    n_in, n_out = type_layer_sizes[-1]
    W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX)
    b_value = np.zeros((n_out,), dtype=theano.config.floatX)

    type_W = theano.shared(W_value, borrow=True, name="type_W")
    type_b = theano.shared(b_value, borrow=True, name="type_b")

    type_act = T.dot(type_hidden_outs[-1], type_W * (1 - droprate)) + type_b
    type_drop_act = T.dot(type_drop_outs[-1], type_W) + type_b

    type_max_act = T.max(type_act, axis=1).flatten(2)
    type_drop_max_act = T.max(type_drop_act, axis=1).flatten(2)
    
    type_sen_max = T.argmax(T.max(type_act, axis=2).flatten(2), axis=1)
    type_drop_sen_max = T.argmax(T.max(type_drop_act, axis=2).flatten(2), axis=1)
    
    type_probs = T.nnet.softmax(type_max_act)
    type_drop_probs = T.nnet.softmax(type_drop_max_act)

    type_y_pred = T.argmax(type_probs, axis=1)
    type_drop_y_pred = T.argmax(type_drop_probs, axis=1)

    type_neg_loglikelihood = -T.mean(T.log(type_probs)[T.arange(type_y.shape[0]), type_y])
    type_drop_neg_loglikelihood = -T.mean(T.log(type_drop_probs)[T.arange(type_y.shape[0]), type_y])

    type_errors = T.mean(T.neq(type_y_pred, type_y))
    type_errors_detail = T.neq(type_y_pred, type_y)

    type_cost = type_neg_loglikelihood
    type_drop_cost = type_drop_neg_loglikelihood


    ###################################
    ## Choose the max sens in two task#
    ###################################
    pop_drop_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), pop_drop_sen_max]
    type_drop_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), type_drop_sen_max]
    simi_drop_cost = T.mean(T.sum((pop_drop_choosed_sens - type_drop_choosed_sens) ** 2, axis=1))
    
    pop_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), pop_sen_max]
    type_choosed_sens = sen_vecs[T.arange(sen_vecs.shape[0]), type_sen_max]
    simi_cost = T.mean(T.sum((pop_choosed_sens - type_choosed_sens) ** 2, axis=1))


    ##################################
    # Collect all the parameters #####
    ##################################
    params = []
    # convolution layer params
    for conv_layer in conv_layers:
        params += conv_layer.params

    # params for population task
    for layer in pop_drop_layers:
        params += layer.params

    params.append(pop_W)
    params.append(pop_b)

    # params for event type task
    for layer in type_drop_layers:
        params += layer.params

    params.append(type_W)
    params.append(type_b)

    if non_static:
        params.append(words)
    
    total_cost = pop_cost + type_cost
    total_drop_cost = pop_drop_cost + type_drop_cost 

    if sen_reg:
        simi_weight = 0.05
        total_cost += simi_weight * simi_cost
        total_drop_cost += simi_weight * simi_drop_cost

    if L2:
        l2_norm = 0.1 * T.sum(pop_W ** 2) + 0.1 * T.sum(type_W ** 2)
        for drop_layer in type_drop_layers:
            l2_norm += 0.1 * T.sum(drop_layer.W ** 2)
    
        for drop_layer in pop_drop_layers:
            l2_norm += 0.1 * T.sum(drop_layer.W ** 2)
        total_cost += l2_norm
        total_drop_cost += l2_norm

    total_grad_updates = sgd_updates_adadelta(params, 
            total_drop_cost,
            lr_decay,
            1e-6,
            sqr_norm_lim)

    total_preds = [pop_y_pred, type_y_pred]
    total_errors_details = [pop_errors_detail, type_errors_detail]
    total_choosed_sens = [pop_sen_max, type_sen_max]
    total_out = total_preds + total_errors_details + total_choosed_sens

    #####################
    # Construct Dataset #
    #####################
    print "Copy data to GPU and constrct train/valid/test func"
    np.random.seed(1234)
    
    train_x, train_pop_y, train_type_y = shared_dataset(dataset[0])
    test_x, test_pop_y, test_type_y = shared_dataset(dataset[1])

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

    #####################
    # Train model func #
    #####################
    index = T.iscalar()
    train_func = function([index], total_drop_cost, updates=total_grad_updates,
            givens={
                x: train_x[index*batch_size:(index+1)*batch_size],
                pop_y: train_pop_y[index*batch_size:(index+1)*batch_size],
                type_y:train_type_y[index*batch_size:(index+1)*batch_size]
                })
   
    train_pred_detail = function([index], total_out,
            givens={
                x:train_x[index*batch_size:(index+1)*batch_size],
                pop_y:train_pop_y[index*batch_size:(index+1)*batch_size],
                type_y:train_type_y[index*batch_size:(index+1)*batch_size]
                })

    test_pred_detail = function([index], total_out,
            givens={
                x:test_x[index*batch_size:(index+1)*batch_size],
                pop_y:test_pop_y[index*batch_size:(index+1)*batch_size],
                type_y:test_type_y[index*batch_size:(index+1)*batch_size]
                })


    # apply early stop strategy
    patience = 100
    patience_increase = 2
    improvement_threshold = 1.005
    
    n_test = len(dataset[1][0])

    epoch = 0
    best_params = None
    best_validation_score = 0.
    test_perf = 0

    done_loop = False
    
    log_file = open(log_fn, 'w')

    print "Start to train the model....."
    
    total_score = 0.0
    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        if epoch % print_freq == 0:
            # do test
            pop_preds = []
            type_preds = []
            pop_errors = []
            type_errors = []
            pop_sens = []
            type_sens = []

            for i in xrange(n_test_batches):
                test_pop_pred, test_type_pred, test_pop_error, test_type_error, test_pop_sen, test_type_sen = test_pred_detail(i)

                pop_preds.append(test_pop_pred)
                type_preds.append(test_type_pred)
                pop_errors.append(test_pop_error)
                type_errors.append(test_type_error)
                pop_sens.append(test_pop_sen)
                type_sens.append(test_type_sen)

            pop_preds = np.concatenate(pop_preds)
            type_preds = np.concatenate(type_preds)
            pop_errors = np.concatenate(pop_errors)
            type_errors = np.concatenate(type_errors)
            pop_sens = np.concatenate(pop_sens)
            type_sens = np.concatenate(type_sens)

            pop_perf = 1 - np.mean(pop_errors)
            type_perf = 1 - np.mean(type_errors)

            # dump the predictions and the chosen sentences
            with open(os.path.join(perf_fn, "%s_%d.pop_pred" % (exp_name, epoch)), 'w') as epf:
                for p in pop_preds:
                    epf.write("%d\n" % int(p))

            with open(os.path.join(perf_fn, "%s_%d.type_pred" % (exp_name, epoch)), 'w') as epf:
                for p in type_preds:
                    epf.write("%d\n" % int(p))
            
            with open(os.path.join(perf_fn, "%s_%d.test_pop_sens" % (exp_name, epoch)), 'w') as epf:
                for s in pop_sens:
                    epf.write("%d\n" % int(s))

            with open(os.path.join(perf_fn, "%s_%d.test_type_sens" % (exp_name, epoch)), 'w') as epf:
                for s in type_sens:
                    epf.write("%d\n" % int(s))
            
            train_pop_sens = []
            train_type_sens = []

            for i in xrange(n_train_batches):
                train_pop_pred, train_type_pred, train_pop_error, train_type_error, train_pop_sen, train_type_sen = train_pred_detail(i)

                train_pop_sens.append(train_pop_sen)
                train_type_sens.append(train_type_sen)

            pop_sens = np.concatenate(train_pop_sens)
            type_sens = np.concatenate(train_type_sens)

            
            with open(os.path.join(perf_fn, "%s_%d.train_pop_sens" % (exp_name, epoch)), 'w') as epf:
                for s in pop_sens:
                    epf.write("%d\n" % int(s))

            with open(os.path.join(perf_fn, "%s_%d.train_type_sens" % (exp_name, epoch)), 'w') as epf:
                for s in type_sens:
                    epf.write("%d\n" % int(s))
            
            message = "Epoch %d test pop perf %f, type perf %f, training_cost %f" % (epoch, pop_perf, type_perf, np.mean(costs))
            print message
            log_file.write(message + "\n")
            log_file.flush()

            if (pop_perf + type_perf) > total_score and False:
                total_score = pop_perf + type_perf
                # save the model
                model_name = os.path.join(perf_fn, "%s_%d.best_model" % (exp_name, epoch))
                with open(model_name, 'wb') as mn:
                    for param in params:
                        cPickle.dump(param.get_value(), mn)


        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % ((end_time - start_time)/60.)

    # output the final model params
    print "Output the final model"
    model_name = os.path.join(perf_fn, "%s_%d.final_model" % (exp_name, epoch))
    with open(model_name, 'wb') as mn:
        for param in params:
            cPickle.dump(param.get_value(), mn)


    log_file.flush()
    log_file.close()
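# Minimal sketch (illustrative only; toy shapes assumed) of the max-over-
# sentences pooling used above: T.max(act, axis=1) pools sentence-level
# scores into document-level scores, while T.argmax of T.max(act, axis=2)
# picks the index of the most confident sentence.
import numpy as np
import theano
import theano.tensor as T

act = T.dtensor3('act')                          # (batch, num_sens, num_labels)
doc_scores = T.max(act, axis=1)                  # (batch, num_labels)
best_sen = T.argmax(T.max(act, axis=2), axis=1)  # (batch, )
pool_fn = theano.function([act], [doc_scores, best_sen])

toy = np.array([[[0.1, 0.9],
                 [0.7, 0.2]]])                   # 1 doc, 2 sentences, 2 labels
# pool_fn(toy) -> doc_scores [[0.7, 0.9]], best_sen [0]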
Esempio n. 37
0
    def target_function(self, x_neg, word_emb, x_local, x, x_g):
        score = self.forward(word_emb, x_local, x, x_g)
        score_neg = self.forward(word_emb, x_local, x_neg, x_g)

        return T.max([0, 1 - score + score_neg])
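# Note: T.max([0, 1 - score + score_neg]) stacks the two scalar scores and
# takes their maximum, i.e. the margin (hinge) ranking loss
# max(0, 1 - score_pos + score_neg). A minimal sketch (illustrative only,
# names assumed) of the elementwise variant that also handles batched scores:
import theano
import theano.tensor as T

pos_scores = T.dvector('pos_scores')   # assumed scores of correct examples
neg_scores = T.dvector('neg_scores')   # assumed scores of corrupted examples
hinge = T.maximum(0., 1. - pos_scores + neg_scores)   # (batch, )
ranking_loss = hinge.mean()
loss_fn = theano.function([pos_scores, neg_scores], ranking_loss)
# loss_fn([2.0, 0.3], [0.5, 0.9]) -> mean of [0.0, 1.6] = 0.8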
Esempio n. 38
0
 def comp_one(self, param, info):
     return T.max(T.abs_(get_p(param)))
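# T.max(T.abs_(param)) computes the largest absolute entry of the parameter
# tensor (its infinity norm). Minimal sketch (illustrative only):
import numpy as np
import theano
import theano.tensor as T

param = T.dmatrix('param')
inf_norm = T.max(T.abs_(param))
norm_fn = theano.function([param], inf_norm)
# norm_fn(np.array([[1.0, -3.5], [2.0, 0.0]])) -> 3.5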
Esempio n. 39
0
 def comp_one(self, param, grad=None, diff=None):
     return T.max(T.abs_(get_p(param)))
Esempio n. 40
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3],
                    sent_len=40,
                    claim_len=40,
                    cand_size=10,
                    hidden_size=[300, 300],
                    max_pred_pick=5):

    model_options = locals().copy()
    print "model options", model_options

    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}
    root = '/save/wenpeng/datasets/FEVER/'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, ensures the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train(
        sent_len, claim_len, cand_size)
    train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)
    test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names, test_ground_names, test_labels, word2id = load_fever_dev(
        sent_len, claim_len, cand_size, word2id)
    test_3th_sents, test_3th_sent_masks, test_3th_sent_labels, test_3th_claims, test_3th_claim_mask, test_3th_labels, word2id = load_fever_dev_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)

    train_sents = np.asarray(train_sents, dtype='int32')
    train_3th_sents = np.asarray(train_3th_sents, dtype='int32')
    joint_train_sents = np.concatenate((train_sents, train_3th_sents))
    test_sents = np.asarray(test_sents, dtype='int32')
    test_3th_sents = np.asarray(test_3th_sents, dtype='int32')
    joint_test_sents = np.concatenate((test_sents, test_3th_sents))

    train_sent_masks = np.asarray(train_sent_masks, dtype=theano.config.floatX)
    train_3th_sent_masks = np.asarray(train_3th_sent_masks,
                                      dtype=theano.config.floatX)
    joint_train_sent_masks = np.concatenate(
        (train_sent_masks, train_3th_sent_masks))
    test_sent_masks = np.asarray(test_sent_masks, dtype=theano.config.floatX)
    test_3th_sent_masks = np.asarray(test_3th_sent_masks,
                                     dtype=theano.config.floatX)
    joint_test_sent_masks = np.concatenate(
        (test_sent_masks, test_3th_sent_masks))

    train_sent_labels = np.asarray(train_sent_labels, dtype='int32')
    train_3th_sent_labels = np.asarray(train_3th_sent_labels, dtype='int32')
    joint_train_sent_labels = np.concatenate(
        (train_sent_labels, train_3th_sent_labels))
    test_sent_labels = np.asarray(test_sent_labels, dtype='int32')
    test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32')
    joint_test_sent_labels = np.concatenate(
        (test_sent_labels, test_3th_sent_labels))

    train_claims = np.asarray(train_claims, dtype='int32')
    train_3th_claims = np.asarray(train_3th_claims, dtype='int32')
    joint_train_claims = np.concatenate((train_claims, train_3th_claims))
    test_claims = np.asarray(test_claims, dtype='int32')
    test_3th_claims = np.asarray(test_3th_claims, dtype='int32')
    joint_test_claims = np.concatenate((test_claims, test_3th_claims))

    train_claim_mask = np.asarray(train_claim_mask, dtype=theano.config.floatX)
    train_3th_claim_mask = np.asarray(train_3th_claim_mask,
                                      dtype=theano.config.floatX)
    joint_train_claim_mask = np.concatenate(
        (train_claim_mask, train_3th_claim_mask))
    test_claim_mask = np.asarray(test_claim_mask, dtype=theano.config.floatX)
    test_3th_claim_mask = np.asarray(test_3th_claim_mask,
                                     dtype=theano.config.floatX)
    joint_test_claim_mask = np.concatenate(
        (test_claim_mask, test_3th_claim_mask))

    train_labels = np.asarray(train_labels, dtype='int32')
    train_3th_labels = np.asarray(train_3th_labels, dtype='int32')
    joint_train_labels = np.concatenate((train_labels, train_3th_labels))
    test_labels = np.asarray(test_labels, dtype='int32')
    test_3th_labels = np.asarray(test_3th_labels, dtype='int32')
    joint_test_labels = np.concatenate((test_labels, test_3th_labels))

    joint_train_size = len(joint_train_claims)
    joint_test_size = len(joint_test_claims)
    train_size = len(train_claims)
    test_size = len(test_claims)
    test_3th_size = len(test_3th_claims)
    vocab_size = len(word2id) + 1
    print 'joint_train size: ', joint_train_size, ' joint_test size: ', joint_test_size
    print 'train size: ', train_size, ' test size: ', test_size
    print 'vocab size: ', vocab_size

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    sents_mask = T.ftensor3()
    sents_labels = T.imatrix()  #(batch, cand_size)
    claim_ids = T.imatrix()  #(batch, claim_len)
    claim_mask = T.fmatrix()

    joint_sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    joint_sents_mask = T.ftensor3()
    joint_sents_labels = T.imatrix()  #(batch, cand_size)
    joint_claim_ids = T.imatrix()  #(batch, claim_len)
    joint_claim_mask = T.fmatrix()
    joint_labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    embed_input_sents = init_embeddings[sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    task1_att_conv_W, task1_att_conv_b = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    task1_conv_W_context, task1_conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    att_conv_W, att_conv_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [
        conv_W, conv_b, task1_att_conv_W, task1_att_conv_b, att_conv_W,
        att_conv_b, task1_conv_W_context, conv_W_context
    ]

    conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_sents,
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out the features of UNK tokens
    sent_embeddings = conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then has an embedding of length hidden_size
    batch_sent_emb = sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out the features of UNK tokens
    claim_embeddings = conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1),
                               cand_size,
                               axis=1)
    '''
    attentive conv for task1
    '''
    task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        embed_input_sents,  #batch_size*cand_size, emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=task1_att_conv_W,
        b=task1_att_conv_b,
        W_context=task1_conv_W_context,
        b_context=task1_conv_b_context)
    task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r

    concate_claim_sent = T.concatenate([
        batch_claim_emb, batch_sent_emb,
        T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x')
    ],
                                       axis=2)
    concate_2_matrix = concate_claim_sent.reshape(
        (batch_size * cand_size, hidden_size[0] * 2 + 1))

    LR_input = T.concatenate([
        concate_2_matrix, task1_attentive_sent_embeddings_l,
        task1_attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2

    # LR_input = concate_2_matrix
    # LR_input_size = hidden_size[0]*2+1
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_a = create_ensemble_para(
        rng, 1, LR_input_size)  # the weight matrix hidden_size*2
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para = [U_a]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))  #(batch_size*cand_size, 1)
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1)
    # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size)))
    '''
    maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix)
    '''
    # prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix)
    # loss = -T.mean(T.log(prob_pos))
    #f1 as loss
    batch_overlap = T.sum(sents_labels * inter_matrix, axis=1)
    batch_recall = batch_overlap / T.sum(sents_labels, axis=1)
    batch_precision = batch_overlap / T.sum(inter_matrix, axis=1)
    batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall +
                                                       batch_precision)
    loss = -T.mean(T.log(batch_f1))
    # loss = T.nnet.nnet.binary_crossentropy(inter_matrix, sents_labels).mean()
    '''
    training task2, predict 3 labels
    '''
    joint_embed_input_sents = init_embeddings[joint_sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    joint_embed_input_claim = init_embeddings[
        joint_claim_ids.flatten()].reshape(
            (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)
    joint_conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_sents,
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out the features of UNK tokens
    joint_sent_embeddings = joint_conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then has an embedding of length hidden_size
    joint_batch_sent_emb = joint_sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))
    joint_premise_emb = T.sum(joint_batch_sent_emb *
                              joint_sents_labels.dimshuffle(0, 1, 'x'),
                              axis=1)  #(batch, hidden_size)

    joint_conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_claim,
        mask_matrix=joint_claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out the features of UNK tokens
    joint_claim_embeddings = joint_conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    joint_premise_hypo_emb = T.concatenate(
        [joint_premise_emb, joint_claim_embeddings],
        axis=1)  #(batch, 2*hidden_size)
    '''
    attentive conv in task2
    '''
    joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    joint_sents_dot = T.batched_dot(
        joint_sents_tensor3, joint_sents_tensor3.dimshuffle(
            0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    joint_sents_dot_2_matrix = T.nnet.softmax(
        joint_sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    joint_sents_context = T.batched_dot(
        joint_sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        joint_sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    joint_add_sents_context = joint_embed_input_sents + joint_sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        joint_add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(joint_embed_input_claim, cand_size, axis=0),
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        mask_matrix_r=T.repeat(joint_claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    masked_sents_attconv = attentive_sent_embeddings_l * joint_sents_labels.dimshuffle(
        0, 1, 'x')
    masked_claim_attconv = attentive_sent_embeddings_r * joint_sents_labels.dimshuffle(
        0, 1, 'x')
    fine_max = T.concatenate([
        T.max(masked_sents_attconv, axis=1),
        T.max(masked_claim_attconv, axis=1)
    ],
                             axis=1)  #(batch, 2*hidden)
    # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)
    "Logistic Regression layer"
    joint_LR_input = T.concatenate([joint_premise_hypo_emb, fine_max], axis=1)
    joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0]

    joint_U_a = create_ensemble_para(rng, 3,
                                     joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3, ),
                                              dtype=theano.config.floatX),
                               name='LR_b',
                               borrow=True)  #bias for each target class
    joint_LR_para = [joint_U_a, joint_LR_b]

    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    joint_loss = joint_layer_LR.negative_log_likelihood(
        joint_labels
    )  #for classification tasks, we usually use negative log likelihood as the loss; the lower the better.
    '''
    testing
    '''
    # binarize_prob = T.where( inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size

    masked_inter_matrix = inter_matrix * sents_labels  #(batch, cand_size)
    test_premise_emb = T.sum(batch_sent_emb *
                             masked_inter_matrix.dimshuffle(0, 1, 'x'),
                             axis=1)
    test_premise_hypo_emb = T.concatenate([test_premise_emb, claim_embeddings],
                                          axis=1)

    #fine-maxsum
    sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle(
        0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    sents_dot_2_matrix = T.nnet.softmax(
        sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    sents_context = T.batched_dot(
        sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    add_sents_context = embed_input_sents + sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([embed_input_sents, sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    test_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    # attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    # attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    test_attentive_sent_embeddings_l = test_attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    test_attentive_sent_embeddings_r = test_attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    test_masked_sents_attconv = test_attentive_sent_embeddings_l * masked_inter_matrix.dimshuffle(
        0, 1, 'x')
    test_masked_claim_attconv = test_attentive_sent_embeddings_r * masked_inter_matrix.dimshuffle(
        0, 1, 'x')
    test_fine_max = T.concatenate([
        T.max(test_masked_sents_attconv, axis=1),
        T.max(test_masked_claim_attconv, axis=1)
    ],
                                  axis=1)  #(batch, 2*hidden)
    # test_fine_sum = T.concatenate([T.sum(test_masked_sents_attconv, axis=1),T.sum(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)

    test_LR_input = T.concatenate([test_premise_hypo_emb, test_fine_max],
                                  axis=1)
    test_LR_input_size = joint_LR_input_size

    test_layer_LR = LogisticRegression(
        rng,
        input=test_LR_input,
        n_in=test_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector

    params = [init_embeddings] + NN_para + LR_para + joint_LR_para
    cost = loss + joint_loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_sents_ids, joint_sents_mask, joint_sents_labels, joint_claim_ids,
        joint_claim_mask, joint_labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_labels
    ], [
        inter_matrix,
        test_layer_LR.errors(joint_labels), test_layer_LR.y_pred
    ],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    joint_n_train_batches = joint_train_size / batch_size
    joint_train_batch_start = list(
        np.arange(joint_n_train_batches) *
        batch_size) + [joint_train_size - batch_size]
    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    n_test_3th_batches = test_3th_size / batch_size
    test_3th_batch_start = list(np.arange(n_test_3th_batches) *
                                batch_size) + [test_3th_size - batch_size]

    max_strict_acc = 0.0
    max_test_f1 = 0.0
    max_all_acc = 0.0

    cost_i = 0.0
    joint_train_indices = range(joint_train_size)
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            joint_train_indices
        )  #shuffle training set for each new epoch; supposed to improve performance, but not guaranteed
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for joint_batch_id in joint_train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * joint_n_train_batches + iter_accu + 1
            iter_accu += 1
            joint_train_id_batch = joint_train_indices[
                joint_batch_id:joint_batch_id + batch_size]
            for i in range(3):
                batch_id = random.choice(train_batch_start)
                train_id_batch = train_indices[batch_id:batch_id + batch_size]
                cost_i += train_model(
                    train_sents[train_id_batch],
                    train_sent_masks[train_id_batch],
                    train_sent_labels[train_id_batch],
                    train_claims[train_id_batch],
                    train_claim_mask[train_id_batch],
                    #joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels
                    joint_train_sents[joint_train_id_batch],
                    joint_train_sent_masks[joint_train_id_batch],
                    joint_train_sent_labels[joint_train_id_batch],
                    joint_train_claims[joint_train_id_batch],
                    joint_train_claim_mask[joint_train_id_batch],
                    joint_train_labels[joint_train_id_batch])

            #after every 100 batches, we test the performance of the model on all test data
            # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0):
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                f1_sum = 0.0
                error_sum = 0.0
                full_evi = 0
                predictions = []
                for test_batch_id in test_batch_start:  # for each test batch
                    batch_prob, error_i, pred_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_sent_masks[test_batch_id:test_batch_id +
                                        batch_size],
                        test_sent_labels[test_batch_id:test_batch_id +
                                         batch_size],
                        test_claims[test_batch_id:test_batch_id + batch_size],
                        test_claim_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_labels[test_batch_id:test_batch_id + batch_size])
                    error_sum += error_i
                    batch_sent_labels = test_sent_labels[
                        test_batch_id:test_batch_id + batch_size]
                    batch_sent_names = test_sent_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_names = test_ground_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_labels = test_labels[
                        test_batch_id:test_batch_id + batch_size]
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(
                            batch_ground_labels[i])
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        pred_sent_names = []
                        gold_sent_names = batch_ground_names[i]
                        zipped = [(batch_prob[i, k], batch_sent_labels[i][k],
                                   batch_sent_names[i][k])
                                  for k in range(cand_size)]
                        sorted_zip = sorted(zipped,
                                            key=lambda x: x[0],
                                            reverse=True)
                        for j in range(cand_size):
                            triple = sorted_zip[j]
                            if triple[1] == 1.0:
                                '''
                                we should consider a ranking instead of a binary cutoff;
                                the threshold triple[0] > 0.5 controls the recall and thus influences strict_acc
                                '''
                                if triple[0] > 0.5:
                                    # pred_sent_names.append(batch_sent_names[i][j])
                                    pred_sent_names.append(triple[2])
                                # if len(pred_sent_names) == max_pred_pick:
                                #     break
                        instance_i['predicted_evidence'] = pred_sent_names
                        # print 'pred_sent_names:',pred_sent_names
                        # print 'gold_sent_names:',gold_sent_names
                        new_gold_names = []
                        for gold_name in gold_sent_names:
                            new_gold_names.append([None, None] + gold_name)
                        instance_i['evidence'] = [new_gold_names]
                        predictions.append(instance_i)
                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1

                if strict_score > max_strict_acc and f1 > max_test_f1:
                    max_strict_acc = strict_score
                    max_test_f1 = f1
                    writefile_2class = codecs.open(
                        root + 'class_2_erroranalysis.txt', 'w', 'utf-8')
                    for dic in predictions:
                        json.dump(dic, writefile_2class)
                        writefile_2class.write('\n')
                    writefile_2class.close()
                    print 'writefile_2class write over'

                for test_batch_id in test_3th_batch_start:  # for each test batch
                    _, error_i, pred_i = test_model(
                        test_3th_sents[test_batch_id:test_batch_id +
                                       batch_size],
                        test_3th_sent_masks[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_sent_labels[test_batch_id:test_batch_id +
                                             batch_size],
                        test_3th_claims[test_batch_id:test_batch_id +
                                        batch_size],
                        test_3th_claim_mask[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_labels[test_batch_id:test_batch_id +
                                        batch_size])
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(2)
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        instance_i['predicted_evidence'] = []
                        instance_i['evidence'] = []
                        predictions.append(instance_i)

                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                if label_accuracy > max_all_acc:
                    max_all_acc = label_accuracy
                    writefile_3class = codecs.open(
                        root + 'class_3_erroranalysis.txt', 'w', 'utf-8')
                    for dic in predictions:
                        json.dump(dic, writefile_3class)
                        writefile_3class.write('\n')
                    writefile_3class.close()
                    print 'writefile_3class write over'
        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_all_acc
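# Minimal sketch (illustrative only; toy values assumed) of the soft F1 loss
# built above: precision and recall are computed directly from the sigmoid
# scores, so -log(F1) stays differentiable with respect to the scores.
import numpy as np
import theano
import theano.tensor as T

gold = T.dmatrix('gold')       # (batch, cand_size), 0/1 gold evidence labels
scores = T.dmatrix('scores')   # (batch, cand_size), sigmoid outputs

overlap = T.sum(gold * scores, axis=1)
recall = overlap / T.sum(gold, axis=1)
precision = overlap / T.sum(scores, axis=1)
soft_f1 = 2.0 * recall * precision / (recall + precision)
f1_loss = -T.mean(T.log(soft_f1))
f1_fn = theano.function([gold, scores], f1_loss)
# f1_fn(np.array([[1., 0.]]), np.array([[0.8, 0.2]]))
#   -> overlap 0.8, recall 0.8, precision 0.8, F1 0.8, loss = -log(0.8) ~ 0.223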
Esempio n. 41
0
    def __init__(self,
                 input_width,
                 input_height,
                 num_actions,
                 num_frames,
                 discount,
                 learning_rate,
                 rho,
                 rms_epsilon,
                 momentum,
                 clip_delta,
                 freeze_interval,
                 batch_size,
                 network_type,
                 update_rule,
                 batch_accumulator,
                 rng,
                 action_selection,
                 input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng

        if action_selection == 'epsilon-greedy':
            self.choose_action = self.choose_action_epsilon_greedy
        elif action_selection == 'softmax':
            self.choose_action = self.choose_action_softmax
        else:
            raise ValueError(
                "Unrecognized action selection: {}".format(action_selection))

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width,
                                        input_height, num_actions, num_frames,
                                        batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))

        self.actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))

        self.terminals_shared = theano.shared(np.zeros((batch_size, 1),
                                                       dtype='int32'),
                                              broadcastable=(False, True))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)

        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards + (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1, ))].reshape((-1, 1))

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part**2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff**2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals],
                                      updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([],
                                       q_vals,
                                       givens={states: self.states_shared})
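A quick note on the clipped loss used above: as the comment explains, extending the squared error linearly past `clip_delta` keeps the gradient constant in that regime. A minimal numpy sketch (illustrative only, not part of the original agent) confirming that the gradient of this piecewise loss is exactly the clipped TD error:

import numpy as np

def clipped_loss(diff, clip_delta=1.0):
    # quadratic inside [-clip_delta, clip_delta], linear outside
    quadratic_part = np.minimum(np.abs(diff), clip_delta)
    linear_part = np.abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + clip_delta * linear_part

diff = np.linspace(-3, 3, 601)
grad = np.gradient(clipped_loss(diff), diff)
# the numerical gradient matches clip(diff, -clip_delta, clip_delta)
assert np.allclose(grad, np.clip(diff, -1.0, 1.0), atol=1e-2)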
Esempio n. 42
0
def logsumexp(x, axis=None):
    # Adapted from https://github.com/Theano/Theano/issues/1563
    x_max = tt.max(x, axis=axis, keepdims=True)
    return tt.log(tt.sum(tt.exp(x - x_max), axis=axis, keepdims=True)) + x_max
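The max-shift in `logsumexp` is the standard trick for avoiding overflow: since log(sum(exp(x))) = x_max + log(sum(exp(x - x_max))), the result is unchanged while every exponent stays <= 0. A small numpy check (illustrative, not from the source):

import numpy as np

x = np.array([1000.0, 1000.5, 999.0])
naive = np.log(np.sum(np.exp(x)))                   # exp(1000) overflows, result is inf
x_max = np.max(x)
stable = np.log(np.sum(np.exp(x - x_max))) + x_max  # ~1001.1, finite
assert np.isinf(naive) and np.isfinite(stable)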
Esempio n. 43
0
 def get_output(self, train=False):
     X = self.get_input(train)
     # -- don't need activation since it's just linear.
     output = T.max(T.dot(X, self.W) + self.b, axis=1)
     return output
Esempio n. 44
0
def log_sum_exp(x, axis=None):
    x_max = T.max(x, axis=axis, keepdims=True)
    return T.log(T.sum(T.exp(x - x_max), axis=axis, keepdims=True)) + x_max
Esempio n. 45
0
def max(x, axis=None, keepdims=False):
    return T.max(x, axis=axis, keepdims=keepdims)
Esempio n. 46
0
def Adam(cost,
         params,
         lr=0.0002,
         b1=0.1,
         b2=0.001,
         e=1e-8,
         clip=None,
         grad_fn=None):
    """
    Adam optimizer. Returns a set of gradient descent updates.
    This is ported from the GitHub Gist by Alec Radford
    https://gist.github.com/Newmu/acb738767acb4788bac3 (MIT License).

    TODO: Track which parameter(s) triggers the rescaling. This would help
    debugging / setting fitting parameters: if it's always the same parameter
    triggering clipping, its learning rate should probably be reduced.

    .. Caution:: The values of `b1` and `b2` are equivalent to 1-β1, 1-β2,
       where β1 and β2 are their corresponding values in Kingma et al. (2014).

    Parameters
    ----------
    cost: theano variable
        We want to minimize this cost.

    params: List[Shared] | List[Tuple[Shared, mask]] | Dict[Shared, mask]
        List of Theano shared variables. Any element may be specified instead
        as a tuple pair, whose first element is the shared variable, and the
        second is a boolean mask array. If given, the mask array should be of
        the same shape as the shared variable – False entries indicate that
        we are not fitting for this parameter component, and so its gradient
        is to be set to zero.

    lr: float, > 0
        Learning rate.

    b1, b2: float, between 0 (exclusive) and 1 (inclusive)
        Decay rates for the mean (`b1`) and variance (`b2`) of the gradient.
        Specifically, if we think of the optimization step i as continuous,
        then the gradient mean `m` decays roughly as

        dm/di = -b m,   so that   m(i) = m(0) exp(-b i)

        A plain SGD optimizer with no momentum can be obtained by setting both
        `b1` and `b2` to zero.

    e: float, >0. Default: 1e-8
        Epsilon. This value is used in the following calculation to ensure
        numerical stability::

           g_t = m_t / (tt.sqrt(v_t) + e)

        where `g_t` is the ultimately returned gradient, `m_t` its inertial mean
        and `v_t` its inertial variance.

    clip: positive float
        Clip gradients such that no components are greater than this value.
        ADAM provides some automatic rescaling of the gradient, but for costs
        that exhibit cliffs (as is common with RNNs) this might not be
        sufficient, as very large gradients can overpower ADAM's adaptation.
        In such cases clipping the final gradient can help stabilize the
        optimization algorithm. Clipping is done on the gradient's L∞ norm,
        so the direction is conserved. Specifically, the gradient for each
        parameter `p` is independently divided by `clip`; the largest
        of these ratios, if it exceeds 1, is used to rescale the whole gradient.
        This allows us to have different learning rates for different parameters,
        and for the clipping to scale reasonably with the number of parameters.
        Clip value can be chosen by what we think is the maximum reasonable
        parameter change in one iteration, since this change is roughly
        bounded by `lr` x `clip`.
        Note that we clip the raw gradient, so the internal `m` and `v`
        variables are updated with the clipped gradient; this is why
        we say "roughly bounded" above. We do this because `m` and `v` are
        momentum variables, and so should reflect the actual movement of the
        'particle'. We haven't however made extensive tests to check whether
        this is the most reasonable choice in practice.
        Setting `clip` to `None` disables clipping completely. This is the
        default.

    grad_fn: function
        If specified, use this instead of `T.grad` to compute the cost's gradient.
        Should have the same signature (i.e. `grad_fn(cost, params)`) and return
        a result of the same shape as `T.grad`.

    Returns
    -------
    Theano update dictionary for the parameters in `params`
    """
    # The MIT License (MIT)
    # Copyright (c) 2015 Alec Radford
    # Copyright (c) 2018-2020 Alexandre René
    # Permission is hereby granted, free of charge, to any person obtaining a copy
    # of this software and associated documentation files (the "Software"), to deal
    # in the Software without restriction, including without limitation the rights
    # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    # copies of the Software, and to permit persons to whom the Software is
    # furnished to do so, subject to the following conditions:
    # The above copyright notice and this permission notice shall be included in all
    # copies or substantial portions of the Software.
    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    # SOFTWARE.

    tmpparams = []
    param_masks = []
    # Standardize the form of params
    if isinstance(params, shim.config.GraphTypes):
        params = [params]
    if isinstance(params, dict):
        # Convert dictionary to a list of (param, mask_descriptor) tuples
        params = list(params.items())
    else:
        _params = []
        for p in params:
            if isinstance(p, tuple):
                _params.append(p)
            else:
                # Param has no mask: set it to None
                _params.append((p, None))
        params = _params
    # `params` is a list of size 2 tuples
    assert all(isinstance(p, tuple) and len(p) == 2 for p in params)

    # Standardize the learning rate form
    errmsg = ("Learning rate must be specified either as a scalar, "
              "or as a dictionary with a key matching each parameter. "
              "Provided learning rate: {}".format(lr))
    if shim.isscalar(lr):
        lr = {p[0]: lr for p in params}
    elif not isinstance(lr, dict):
        raise ValueError(errmsg)
    _lr = lr.copy()
    for key, plr in _lr.items():
        if isinstance(key, str):
            # We expect lr to be indexed by variable, not variable name
            for p, mask in params:
                if p.name == key:
                    lr[p] = plr
                    del lr[key]
                    break
    if not isinstance(lr, dict) or not all(p[0] in lr for p in params):
        raise ValueError(errmsg)

    # Extract the gradient mask for each parameter
    for p in params:
        tmpparams.append(p[0])
        if p[1] is not None:
            if isinstance(p[1], bool):
                param_masks.append(
                    np.ones(p[0].get_value().shape, dtype=int) * p[1])
            else:
                if p[1].shape != p[0].get_value().shape:
                    raise ValueError(
                        "Provided mask (shape {}) for parameter {} "
                        "(shape {}) has a different shape.".format(
                            p[1].shape, p[0].name, p[0].get_value().shape))
                param_masks.append(p[1])
        else:
            param_masks.append(None)
    params = tmpparams

    updates = OrderedDict()
    gs = {}
    lrs = {}

    if grad_fn is None:
        try:
            grads = tt.grad(cost, params)
        except theano.gradient.DisconnectedInputError as e:
            disconnected_inputs = set(params).difference(
                shim.graph.shared_inputs(cost))
            raise theano.gradient.DisconnectedInputError(
                "The following parameters do not appear in the expression for "
                "the cost: {}.".format(disconnected_inputs))
    else:
        grads = grad_fn(cost, params)

    # Clip gradients
    if clip is not None:
        # Rescale is set by the component which most exceeds `clip`
        rescale = tt.max([1] + [tt.max(abs(g / clip)) for g in grads])
        rescale.name = "rescale"
        # rescale = shim.print(rescale)
        for i in range(len(grads)):
            grads[i] /= rescale

    # DEBUG This is useful for finding which gradients are returning NaN,
    # but is this the best place / way ?
    newp = {p: p for p in params}  # Need to keep handle to original shared var
    # which may be overwritten by print
    if 'print grads' in debug_flags:
        for i, p in enumerate(params):
            if (debug_flags['print grads'] is True
                    or p.name in debug_flags['print grads']):
                newp[p] = shim.print(p)
                grads[i] = shim.ifelse(
                    shim.eq(rescale, 1),
                    shim.print(grads[i], 'gradient ' + p.name),
                    shim.print(grads[i], 'gradient ' + p.name + ' RESCALED'))
    # for p in params:
    #     gs[p] = shim.ifelse(shim.eq(rescale, 1),
    #                         shim.print(gs[p], 'g_t (' + p.name + ')'),
    #                         shim.print(gs[p], 'g_t (' + p.name + ') RESCALED')
    #                         )

    # Mask out the gradient for parameters we aren't fitting
    for i, mask in enumerate(param_masks):
        if mask is not None:
            grads[i] = grads[i] * mask
            # `mask` is an array of ones and zeros

    i = theano.shared(shim.cast_floatX(0.), name='adam_i')
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    for p, g in zip(params, grads):
        g = shim.cast_floatX(g)
        # FIXME: prior logp's still have dtype='float64',
        # no matter the value of floatX.
        # This is probably due to some internal constants
        # which are double precision.
        # Until this is fixed we need the explicit cast
        # Validate that both decay rates lie in (0, 1]
        if not (shim.all((0 < shim.eval(b1) <= 1))
                and shim.all((0 < shim.eval(b2) <= 1))):
            raise ValueError("Arguments `b1` and `b2` to the Adam optimizer "
                             "must be within (0, 1]. Received:\n"
                             f"b1: {b1}\nb2: {b2}")
        lr_t = lr[p] * (tt.sqrt(fix2) / fix1)
        initval = shim.cast_floatX(p.get_value() * 0.)
        if p.name is not None:
            namem = 'adam_' + p.name + '_m'
            namev = 'adam_' + p.name + '_v'
        else:
            p.name = ""
            namem = namev = None
        if hasattr(p, 'broadcastable'):
            m = shim.shared(initval, broadcastable=p.broadcastable, name=namem)
            v = shim.shared(initval, broadcastable=p.broadcastable, name=namev)
        else:
            m = shim.shared(initval, name=namem)
            v = shim.shared(initval, name=namev)
        m_t = (b1 * g) + ((1. - b1) * m)
        # m_t = shim.print(m_t, 'm_t (' + p.name + ')')
        v_t = (b2 * tt.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tt.sqrt(v_t) + e)
        # ms[p] = [m, m_t]
        # vs[p] = [v, v_t]
        updates[m] = m_t
        updates[v] = v_t
        # lrs[p] = lr_t
        # gs[p] = g_t

        # lr_t = shim.print(lr_t, 'lr_t (' + p.name + ')')
        p_t = newp[p] - (lr_t * g_t)
        # Using newp allows printing, if it was requested
        if newp[p] != p:
            # We printed p, so also print the updated value
            p_t = shim.print(p_t, p.name + ' (updated)')
        updates[p] = shim.cast(p_t, p.dtype)
    updates[i] = i_t
    return updates
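A hypothetical usage sketch (the variables `W`, `b`, `x` and the cost are illustrative and not from the source, and it assumes the module's `shim` backend has been configured for Theano), showing per-parameter learning rates, a boolean mask that freezes part of a parameter, and L∞ gradient clipping:

import numpy as np
import theano
import theano.tensor as tt

W = theano.shared(np.zeros((3, 2), dtype=theano.config.floatX), name='W')
b = theano.shared(np.zeros(2, dtype=theano.config.floatX), name='b')
x = tt.matrix('x')
cost = tt.sum((tt.dot(x, W) + b) ** 2)

mask = np.ones(W.get_value().shape, dtype=int)
mask[0, :] = 0                                  # do not fit the first row of W

updates = Adam(cost,
               [(W, mask), b],                  # masked and unmasked parameters can be mixed
               lr={'W': 1e-3, 'b': 1e-2},       # string keys are resolved to the named variables
               clip=1.0)                        # rescale if any |g / clip| component exceeds 1
step = theano.function([x], cost, updates=updates)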
Esempio n. 47
0
    def __init__(self,
                 num_actions,
                 phi_length,
                 width,
                 height,
                 discount=.9,
                 learning_rate=.01,
                 batch_size=32,
                 approximator='none'):
        self._batch_size = batch_size
        self._num_input_features = phi_length
        self._phi_length = phi_length
        self._img_width = width
        self._img_height = height
        self._discount = discount
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.scale_input_by = 255.0

        # CONSTRUCT THE LAYERS
        self.q_layers = []
        self.q_layers.append(
            layers.Input2DLayer(self._batch_size, self._num_input_features,
                                self._img_height, self._img_width,
                                self.scale_input_by))

        if approximator == 'cuda_conv':
            self.q_layers.append(
                cc_layers.ShuffleBC01ToC01BLayer(self.q_layers[-1]))
            self.q_layers.append(
                cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1],
                                                 n_filters=16,
                                                 filter_size=8,
                                                 stride=4,
                                                 weights_std=.01,
                                                 init_bias_value=0.1))
            self.q_layers.append(
                cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1],
                                                 n_filters=32,
                                                 filter_size=4,
                                                 stride=2,
                                                 weights_std=.01,
                                                 init_bias_value=0.1))
            self.q_layers.append(
                cc_layers.ShuffleC01BToBC01Layer(self.q_layers[-1]))

        elif approximator == 'conv':
            self.q_layers.append(
                layers.StridedConv2DLayer(self.q_layers[-1],
                                          n_filters=16,
                                          filter_width=8,
                                          filter_height=8,
                                          stride_x=4,
                                          stride_y=4,
                                          weights_std=.01,
                                          init_bias_value=0.01))

            self.q_layers.append(
                layers.StridedConv2DLayer(self.q_layers[-1],
                                          n_filters=32,
                                          filter_width=4,
                                          filter_height=4,
                                          stride_x=2,
                                          stride_y=2,
                                          weights_std=.01,
                                          init_bias_value=0.01))
        if approximator == 'cuda_conv' or approximator == 'conv':

            self.q_layers.append(
                layers.DenseLayer(self.q_layers[-1],
                                  n_outputs=256,
                                  weights_std=0.01,
                                  init_bias_value=0.1,
                                  dropout=0,
                                  nonlinearity=layers.rectify))

            self.q_layers.append(
                layers.DenseLayer(self.q_layers[-1],
                                  n_outputs=num_actions,
                                  weights_std=0.01,
                                  init_bias_value=0.1,
                                  dropout=0,
                                  nonlinearity=layers.identity))

        if approximator == 'none':
            self.q_layers.append(\
                layers.DenseLayerNoBias(self.q_layers[-1],
                                        n_outputs=num_actions,
                                        weights_std=0.00,
                                        dropout=0,
                                        nonlinearity=layers.identity))

        self.q_layers.append(layers.OutputLayer(self.q_layers[-1]))

        for i in range(len(self.q_layers) - 1):
            print self.q_layers[i].get_output_shape()

        # Now create a network (using the same weights)
        # for next state q values
        self.next_layers = copy_layers(self.q_layers)
        self.next_layers[0] = layers.Input2DLayer(self._batch_size,
                                                  self._num_input_features,
                                                  self._img_width,
                                                  self._img_height,
                                                  self.scale_input_by)
        self.next_layers[1].input_layer = self.next_layers[0]

        self.rewards = T.col()
        self.actions = T.icol()

        # Build the loss function ...
        q_vals = self.q_layers[-1].predictions()
        next_q_vals = self.next_layers[-1].predictions()
        next_maxes = T.max(next_q_vals, axis=1, keepdims=True)
        target = self.rewards + discount * next_maxes
        target = theano.gradient.consider_constant(target)
        diff = target - q_vals
        # Zero out all entries for actions that were not chosen...
        mask = build_mask(T.zeros_like(diff), self.actions, 1.0)
        diff_masked = diff * mask
        error = T.mean(diff_masked**2)
        self._loss = error * diff_masked.shape[1]  #

        self._parameters = layers.all_parameters(self.q_layers[-1])

        self._idx = T.lscalar('idx')

        # CREATE VARIABLES FOR INPUT AND OUTPUT
        self.states_shared = theano.shared(
            np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
        self.states_shared_next = theano.shared(
            np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros(
            (1, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))
        self.actions_shared = theano.shared(np.zeros((1, 1), dtype='int32'),
                                            broadcastable=(False, True))

        self._givens = \
            {self.q_layers[0].input_var:
             self.states_shared[self._idx*self._batch_size:
                                (self._idx+1)*self._batch_size, :, :, :],
             self.next_layers[0].input_var:
             self.states_shared_next[self._idx*self._batch_size:
                                     (self._idx+1)*self._batch_size, :, :, :],

             self.rewards:
             self.rewards_shared[self._idx*self._batch_size:
                                 (self._idx+1)*self._batch_size, :],
             self.actions:
             self.actions_shared[self._idx*self._batch_size:
                                 (self._idx+1)*self._batch_size, :]
             }

        self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(\
            self._loss, self._parameters, learning_rate=self.learning_rate,
            rho=0.9, momentum=0.9, epsilon=1e-6)

        self._train = theano.function([self._idx],
                                      self._loss,
                                      givens=self._givens,
                                      updates=self._updates)
        self._compute_loss = theano.function([self._idx],
                                             self._loss,
                                             givens=self._givens)
        self._compute_q_vals = \
            theano.function([self.q_layers[0].input_var],
                            self.q_layers[-1].predictions(),
                            on_unused_input='ignore')
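`build_mask` is defined elsewhere in that project; a minimal sketch of the idea it implements (an assumption, not the original helper) is a one-hot mask over the chosen actions, so that only the selected action's TD error contributes to the loss:

num_actions = 4                                 # illustrative value
actions = T.icol('actions')                     # (batch, 1) indices of the chosen actions
diff = T.matrix('diff')                         # (batch, num_actions) TD errors
one_hot = T.eq(T.arange(num_actions).dimshuffle('x', 0),
               actions.reshape((-1,)).dimshuffle(0, 'x'))
diff_masked = diff * T.cast(one_hot, theano.config.floatX)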
Esempio n. 48
0
def SoftMax(x):
    x = T.exp(x - T.max(x, axis=x.ndim - 1, keepdims=True))
    return x / T.sum(x, axis=x.ndim - 1, keepdims=True)
Esempio n. 49
0
def _max_along_time(input, **kwargs):
    return T.max(input, axis=1)
Esempio n. 50
0
def SSL2(coding_dist, true_dist):
    def set_inf_in2dim(j, coding_dist, true_label_id):
        """
        Search for true_label_id == j, and set coding_dist[i][j] = "-inf"
        """
        return T.switch(T.eq(j, true_label_id), T.constant(float("-inf")),
                        coding_dist[j])

    def set_inf_in1dim(i, coding_dist, true_label_id):
        # coding_dist[:,label_id] doesn't become "-0.0"
        loss_margin, updates = theano.scan(set_inf_in2dim, \
                                           outputs_info=None, \
                                           sequences=T.arange(coding_dist.shape[1]), \
                                           non_sequences=[coding_dist[i], true_label_id[i]])
        return loss_margin

    if true_dist.ndim == coding_dist.ndim:
        '''
        #Calculation: prediction to true_label
        y_pre2true=T.sum(true_dist * coding_dist, axis=1)

        #Calculation: prediction to false_label
        y_pre2false=T.max((1-true_dist) * coding_dist, axis=1)

        loss=1+y_pre2true-y_pre2false
        '''
        # Calculation: prediction to true_label
        #        y_pre2true=T.sum(true_dist * T.log(1+T.exp(2*(3-coding_dist))),axis=1)
        y_pre2true_softmax = T.sum(true_dist * T.nnet.softmax(coding_dist),
                                   axis=1)

        true_pre = T.sum(true_dist * coding_dist, axis=1)
        y_pre2true = T.sum(true_dist * T.exp((3 - coding_dist)), axis=1)

        #        #Negative loss in y_pre2true
        #        y_pre2true=T.nnet.sigmoid(y_pre2true)*y_pre2true

        # search the true label id
        true_label_id = T.argmax(true_dist, axis=1)
        # persist the false label in coding_dist
        coding_dist = (1 - true_dist) * coding_dist
        # set true label to "-inf"
        coding_dist_true2inf, updates = theano.scan(set_inf_in1dim, \
                                                    outputs_info=None, \
                                                    sequences=T.arange(coding_dist.shape[0]), \
                                                    non_sequences=[coding_dist, true_label_id])
        # search the max in false label
        coding_dist_true2inf = T.max(coding_dist_true2inf, axis=1)
        # Calculation: prediction to false_label
        #        y_pre2false=T.log(1+T.exp(2*(0.5+coding_dist_true2inf)))
        y_pre2false = T.exp((0.5 + coding_dist_true2inf))

        # Negative loss in y_pre2false
        #        y_pre2false=T.nnet.sigmoid(k*y_pre2false)*y_pre2false
        stimulative = T.exp(2 + coding_dist_true2inf - true_pre)
        loss = 4 * T.nnet.sigmoid(y_pre2true) * T.nnet.sigmoid(
            y_pre2false) * stimulative * T.log(1 + y_pre2true + y_pre2false)

        #        loss=2*T.nnet.sigmoid(y_pre2true)*T.nnet.sigmoid(y_pre2false)*T.log(1+y_pre2true+y_pre2false)

        return loss, stimulative, y_pre2false

    else:
        print "true_dist.ndim != coding_dist.ndim"
Esempio n. 51
0
def MyLogSumExp(x, axis=None):
    x_max = tt.max(x, axis=axis, keepdims=True)
    return tt.log(tt.sum(tt.exp(x - x_max), axis=axis, keepdims=True)) + x_max
Esempio n. 52
0
def SSL_mutual2(coding_dist, true_dist):
    def set_inf_in2dim(j, coding_dist, true_label_id):
        """
        Search for true_label_id == j, and set coding_dist[i][j] = "-inf"
        """
        return T.switch(T.eq(j, true_label_id), T.constant(float("-inf")),
                        coding_dist[j])

    def set_inf_in1dim(i, coding_dist, true_label_id):
        # coding_dist[:,label_id] doesn't become "-0.0"
        loss_margin, updates = theano.scan(set_inf_in2dim, \
                                           outputs_info=None, \
                                           sequences=T.arange(coding_dist.shape[1]), \
                                           non_sequences=[coding_dist[i], true_label_id[i]])
        return loss_margin

    if true_dist.ndim == coding_dist.ndim:
        """"""
        coding_dist1 = T.tanh(coding_dist)
        y_pre2true = T.sum(true_dist * T.exp((-coding_dist1)), axis=1)

        # search the true label id
        true_label_id = T.argmax(true_dist, axis=1)
        # persist the false label in coding_dist
        coding_dist_false = (1 - true_dist) * coding_dist1
        # set true label to "-inf"
        coding_dist_true2inf, updates = theano.scan(set_inf_in1dim, \
                                                    outputs_info=None, \
                                                    sequences=T.arange(coding_dist_false.shape[0]), \
                                                    non_sequences=[coding_dist_false, true_label_id])
        # search the max in false label
        coding_dist_true2inf = T.max(coding_dist_true2inf, axis=1)
        y_pre2false = T.exp((coding_dist_true2inf))
        """stimulative"""
        coding_dist = T.nnet.softmax(coding_dist)

        # Calculation: prediction to true_label
        true_pre = T.sum(true_dist * coding_dist, axis=1)
        #        y_pre2true=T.sum(true_dist * T.exp((3-coding_dist)),axis=1)

        # search the true label id
        true_label_id = T.argmax(true_dist, axis=1)
        # persist the false label in coding_dist
        coding_dist_false = (1 - true_dist) * coding_dist
        # set true label to "-inf"
        coding_dist_true2inf, updates = theano.scan(set_inf_in1dim, \
                                                    outputs_info=None, \
                                                    sequences=T.arange(coding_dist_false.shape[0]), \
                                                    non_sequences=[coding_dist_false, true_label_id])
        # search the max in false label
        coding_dist_true2inf = T.max(coding_dist_true2inf, axis=1)
        #        y_pre2false=T.exp((0.25+coding_dist_true2inf))

        # SSL
        stimulative = 1 + coding_dist_true2inf - true_pre

        #        loss=stimulative*(-T.log(1e-8+true_pre))
        loss = stimulative * T.log(1 + y_pre2true + y_pre2false)

        return loss, y_pre2false, y_pre2true, stimulative

    else:
        print "true_dist.ndim != coding_dist.ndim"
Esempio n. 53
0
 def get_output_for(self, input, **kwargs):
     R = (T.max(input, axis=1) - T.min(input, axis=1)).dimshuffle(0, 'x')
     input = self.k * input / T.maximum(R, 0.1)
     return T.nnet.softmax(input)
Esempio n. 54
0
 def get_output_for(self, input, **kwargs):
     return T.clip(T.max(input, axis=1), 0.01, 0.99)
Esempio n. 55
0
def max_cross_corrs(filters,
                    things_to_scan,
                    min_overlap,
                    batch_size=50,
                    func_params_size=1000000,
                    progress_update=1000):
    """
        func_params_size: controls how many filters are compiled into a single
            Theano function; filters are batched so that the total number of
            filter parameters per compiled function is roughly this value.
    """
    #reverse the patterns as the func is a conv not a cross corr
    assert len(filters.shape) == 3, "Did you pass in filters of unequal len?"
    assert filters.shape[-1] == things_to_scan.shape[-1]
    filters = filters.astype("float32")[:, ::-1, ::-1]
    to_return = np.zeros((filters.shape[0], len(things_to_scan)))
    #compile the number of filters that result in a function with
    #params equal to func_params_size
    params_per_filter = np.prod(filters[0].shape)
    filter_batch_size = int(func_params_size / params_per_filter)
    filter_length = filters.shape[1]
    filter_idx = 0
    while filter_idx < filters.shape[0]:
        if (progress_update is not None):
            print("On filters", filter_idx, "to",
                  min((filter_idx + filter_batch_size), len(filters)))
            sys.stdout.flush()

        filter_batch = filters[filter_idx:min((
            filter_idx + filter_batch_size), len(filters))]

        padding_amount = int((filter_length) * (1 - min_overlap))
        padded_input = [
            np.pad(array=x,
                   pad_width=((padding_amount, padding_amount), (0, 0)),
                   mode="constant") for x in things_to_scan
        ]

        input_var = theano.tensor.TensorType(dtype=theano.config.floatX,
                                             broadcastable=[False] *
                                             3)("input")
        theano_filters = theano.tensor.as_tensor_variable(x=filter_batch,
                                                          name="filters")
        conv_out = theano.tensor.nnet.conv2d(
            input=input_var[:, None, :, :],
            filters=theano_filters[:, None, ::-1, ::-1],
            border_mode='valid')[:, :, :, 0]

        max_out = T.max(conv_out, axis=-1)

        max_cross_corr_func = theano.function([input_var],
                                              max_out,
                                              allow_input_downcast=True)

        max_cross_corrs = np.array(
            run_function_in_batches(func=max_cross_corr_func,
                                    input_data_list=[padded_input],
                                    batch_size=batch_size,
                                    progress_update=progress_update))
        assert len(max_cross_corrs.shape) == 2, max_cross_corrs.shape
        to_return[filter_idx:
                  min((filter_idx+filter_batch_size),len(filters)),:] =\
                  np.transpose(max_cross_corrs)
        filter_idx += filter_batch_size

    return to_return
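The filters are flipped up front because `conv2d` computes a true convolution rather than a cross-correlation. A small numpy sketch (illustrative, not from the source) of the 1-D identity being relied on, conv(x, reverse(w)) = crosscorr(x, w):

import numpy as np

x = np.random.randn(20)
w = np.random.randn(5)
conv_with_reversed = np.convolve(x, w[::-1], mode='valid')
cross_corr = np.array([np.dot(x[i:i + len(w)], w)
                       for i in range(len(x) - len(w) + 1)])
assert np.allclose(conv_with_reversed, cross_corr)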
Esempio n. 56
0
def log_sum_exp(x, axis=1):
    m = T.max(x, axis=axis)
    return m + T.log(T.sum(T.exp(x - m.dimshuffle(0, 'x')), axis=axis))
Esempio n. 57
0
def log_sum_exp(x, axis=1):
    m = T.max(x, axis=axis, keepdims=True)
    return m+T.log(T.sum(T.exp(x-m), axis=axis) + 1e-9)
Esempio n. 58
0
def fmask(df: pandas.DataFrame) -> pandas.DataFrame:
    # ############################### DataFrame column aliases ##########################################
    df_swir1 = df['band6_reflectance_corrected']
    df_cirrus = df['band9_reflectance_corrected']
    df_bt1 = df['band10_bt']

    # ################ Formulae from [Zhu 2012] #######################################################
    """
    This test cuts out pixels that are clearly just snow or vegetation, or that are too warm to be clouds.
    """
    print('Formula1')
    """
    Formula 5 from [Zhu 2012] is split into three parts, as each may be useful in its own right.
    This one is true if the pixel suggests thin clouds over water.
    """
    print('Formula5a')
    """
    This one suggests clear skies over water if true.
    """
    print('Formula5b')
    """
    This one evaluates to True if it is definitely water, either with clear skies or thin cloud. False if it is land, 
    thick clouds over land, or thick clouds over water.
    """
    print('Formula5c')
    """
    This test produces true values for pixels that have a high probability of being cloud.
    It labels it as a Potential Cloud Pixel (PCP).
    """
    print('Formula6')
    df['pcp'] = df['basic_test'] & df['whiteness_test'] & df['hot_test'] & df[
        'b4b5_test']
    """
    This further refines the Water Test of formula 5 to take advantage of the newer, second short-wave infrared band.
    """
    # TODO: Shouldn't this just be folded into the original test then?
    print('Formula7')
    """
    For pixels which are water under clear skies, estimate the temperature
    """
    print('Formula8')
    # TODO: What if all the water is under clouds? What if there's no water at all?
    # noinspection PyTypeChecker
    """
    """
    print('Formula10')
    e10_brightness_prob = tt.clip(v0_swir1 / C10_MAX_WATER_REFLECTANCE,
                                  -999999, 1.0)
    df['brightness_prob'] = theano.function([v0_swir1],
                                            e10_brightness_prob)(df_swir1)
    """
    From [Zhu, 2015]
    This uses the cirrus cloud band 9 to account for high-altitude clouds.
    See: https://landsat.usgs.gov/how-is-landsat-8s-cirrus-band-9-used
    """
    print('2015, Formula1')
    e20151_cirrus_cloud_probability = v0_cirrus / C20151_CIRRUS_REFLECTANCE_THRESHOLD
    df['cirrus_cloud_probability'] = theano.function(
        [v0_cirrus], e20151_cirrus_cloud_probability)(df_cirrus)
    """
    """
    print('Formula11 replaced by 2015 Formula 2')
    e11_w_cloud_prob = v10_brightness_prob + v20151_cirrus_cloud_probability
    df['w_cloud_prob'] = theano.function(
        [v10_brightness_prob, v20151_cirrus_cloud_probability],
        e11_w_cloud_prob)(df['brightness_prob'],
                          df['cirrus_cloud_probability'])
    """
    """
    print('Formula12')
    df['clearsky_land'] = ~df['pcp'] & ~df['water_test']
    """
    """
    print('Formula13')
    df13_clearskyland = df[df['clearsky_land']]
    df13_clearskyland_bt = df13_clearskyland['band6_reflectance_corrected']
    # noinspection PyTypeChecker
    c13_t_lo = numpy.percentile(df13_clearskyland_bt,
                                C13_LOWER_PERCENTILE_FOR_CLEARSKY_LAND)
    # noinspection PyTypeChecker
    c13_t_hi = numpy.percentile(df13_clearskyland_bt,
                                C13_UPPER_PERCENTILE_FOR_CLEARSKY_LAND)
    """
    """
    print('Formula14')
    c14_temperature_magnitude = c13_t_hi - c13_t_lo
    e14_l_temperature_prob = (c13_t_hi + 4 -
                              v0_bt1) / c14_temperature_magnitude
    df['l_temperature_prob'] = theano.function([v0_bt1],
                                               e14_l_temperature_prob)(df_bt1)
    """
    """
    print("Formula15")
    # TODO: The whitepaper explanation is weird about this one. It's talking about saturation of one band, and another
    # band being larger than the other... but I think it's basically just saying that negative values for ndvi and
    # ndsi are cropped to zero. At which point the absolute values don't do anything. And we don't even need to modify
    # the ndsi/ndvi values, we can just make zero a minimum for our max function. Is that right???
    e15_variability_prob = (tt.max([0, v1_ndvi, v1_ndsi, v2_whiteness]))
    df['variability_prob'] = theano.function(
        [v1_ndvi, v1_ndsi, v2_whiteness],
        e15_variability_prob)(df['ndvi'], df['ndsi'], df['whiteness'])
    """
    """
    print("Formula16")
    e16_l_cloud_prob = v14_l_temperature_prob * v15_variability_prob
    df['l_cloud_prob'] = theano.function(
        [v14_l_temperature_prob, v15_variability_prob],
        e16_l_cloud_prob)(df['l_temperature_prob'], df['variability_prob'])

    return df
Esempio n. 59
0
def normalize(input,newmin=-1,newmax=1):
    mini = T.min(input)
    maxi = T.max(input)
    return (input-mini)*(newmax-newmin)/(maxi-mini)+newmin
Esempio n. 60
0
        t_b.append(_shared(ones((n_out,), dtype=floatX)*0.1))
        t_conv = t_conv + t_b[-1].dimshuffle('x',0)
        t_conv = activation(t_conv)

    conv_length = prod(traj_shape[1:-1])*trajconv.res_shape
    t_conv = t_conv.reshape((batch.micro, conv_length))
    if trajconv.append: 
        traj_ = T.concatenate([t.flatten(2), t_conv.flatten(2)], axis=1)
    else: 
        traj_ = t_conv.flatten(2)
        n_in_MLP -= traj_size
    n_in_MLP += conv_length

elif use.traj: traj_ = t.flatten(2)

insp = T.stack(T.min(vid_), T.mean(vid_), T.max(vid_), T.std(vid_))#, T.min(traj_), T.mean(traj_), T.max(traj_), T.std(traj_))
# dropout
if use.drop: 
    if use.traj: traj_ = DropoutLayer(traj_, rng=rng, p=drop.p_traj).output
    vid_ = DropoutLayer(vid_, rng=rng, p=drop.p_vid).output

# MLP
# ------------------------------------------------------------------------------

# fusion
if net.fusion == "early":
    if use.traj:
        out = T.concatenate([vid_, traj_], axis=1)
    else: out = vid_
    # hidden layer
    layers.append(HiddenLayer(out, n_in=n_in_MLP, n_out=net.hidden, rng=rng,