Example 1
    def logp_t(cls, value, transport, inputs):
        #print(value.tag.test_value)
        #print(mu.tag.test_value)
        #print(mapping.inv(value).tag.test_value)

        value = debug(value, 'value', force=False)
        delta = transport.inv(inputs, value, noise=True)
        det_m = transport.logdet_dinv(inputs, value)
        delta = debug(delta, 'delta', force=False)

        npi = np.float32(-0.5) * value.shape[0].astype(
            th.config.floatX) * tt.log(np.float32(2.0 * np.pi))
        dot2 = np.float32(-0.5) * delta.dot(delta.T)

        npi = debug(npi, 'npi', force=False)
        dot2 = debug(dot2, 'dot2', force=False)
        det_m = debug(det_m, 'det_m', force=False)

        r = npi + dot2 + det_m

        cond1 = tt.or_(tt.any(tt.isinf_(delta)), tt.any(tt.isnan_(delta)))
        cond2 = tt.or_(tt.any(tt.isinf_(det_m)), tt.any(tt.isnan_(det_m)))

        return ifelse(cond1, np.float32(-1e30),
                      ifelse(cond2, np.float32(-1e30), r))
Example 2
def get_nesterov_sgd_updates(param_list, gradients, velocities, lr, mu):
    """Do SGD updates with Nesterov momentum."""
    updates = []
    for p, g, v in zip(param_list, gradients, velocities):
        new_v = mu * v - lr * g
        new_p = p - mu * v + (1 + mu) * new_v
        has_non_finite = (T.any(T.isnan(new_p) + T.isinf(new_p)) +
                          T.any(T.isnan(new_v) + T.isinf(new_v)))
        updates.append((p, ifelse(has_non_finite, p, new_p)))
        updates.append((v, ifelse(has_non_finite, v, new_v)))
    return updates
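A minimal usage sketch for the update helper above, assuming Theano is available; the shared variables, the toy quadratic loss, and the hyperparameter values are illustrative only, not taken from the original project:

import numpy as np
import theano
import theano.tensor as T

# Hypothetical setup: one parameter vector, its velocity buffer, and a toy loss.
p = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='p')
v = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='v')
target = T.vector('target')
loss = T.sum((p - target) ** 2)
grads = T.grad(loss, [p])

# The ifelse guard inside get_nesterov_sgd_updates keeps p and v unchanged
# whenever the proposed update contains NaN or Inf.
updates = get_nesterov_sgd_updates([p], grads, [v], lr=0.1, mu=0.9)
train = theano.function([target], loss, updates=updates)
train(np.ones(3, dtype=theano.config.floatX))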
Example 3
    def __init__(self, n_comp=10, verbose=False):

        # Theano initialization
        self.T_weights = shared(np.eye(n_comp, dtype=np.float32))
        self.T_bias = shared(np.ones((n_comp, 1), dtype=np.float32))

        T_p_x_white = T.fmatrix()
        T_lrate = T.fscalar()
        T_block = T.fscalar()
        T_unmixed = T.dot(self.T_weights, T_p_x_white) + T.addbroadcast(self.T_bias, 1)
        T_logit = 1 - 2 / (1 + T.exp(-T_unmixed))

        T_out = self.T_weights + T_lrate * T.dot(
            T_block * T.identity_like(self.T_weights) +
            T.dot(T_logit, T.transpose(T_unmixed)), self.T_weights)
        T_bias_out = self.T_bias + T_lrate * T.reshape(T_logit.sum(axis=1), (-1, 1))
        T_max_w = T.max(self.T_weights)
        T_isnan = T.any(T.isnan(self.T_weights))

        self.w_up_fun = theano.function([T_p_x_white, T_lrate, T_block],
                                        [T_max_w, T_isnan],
                                        updates=[(self.T_weights, T_out),
                                                 (self.T_bias, T_bias_out)],
                                        allow_input_downcast=True)

        T_matrix = T.fmatrix()
        T_cov = T.dot(T_matrix,T.transpose(T_matrix))/T_block
        self.cov_fun = theano.function([T_matrix, T_block], T_cov, allow_input_downcast=True)
        
        self.loading = None
        self.sources = None
        self.weights = None
        self.n_comp = n_comp
        self.verbose = verbose
Example 4
    def accurate_pixels_class(self, y):
        """
        Returns number of correctly classified pixels per class
        and total number of pixels per class.
        (pair of numpy 1d arrays)

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        # check if y has same dimension of y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError(
                'y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type)
            )
        # check if y is of the correct datatype
        if not y.dtype.startswith('int'):
            raise NotImplementedError()

        correct = T.zeros((self.n_classes), dtype='int32')
        total = T.zeros((self.n_classes), dtype='int32')
        for i in range(self.n_classes):
            correct = T.set_subtensor(
                correct[i],
                T.switch(
                    T.any(T.eq(y, i)),
                    T.sum(T.eq(y[T.eq(y, i).nonzero()],
                               self.y_pred[T.eq(y, i).nonzero()])),
                    0)
                )
            total = T.set_subtensor(total[i], T.sum(T.eq(y, i)))
        return correct, total
Example 5
def compile_eval_function(nnet):

    X = T.tensor4()
    y = T.ivector()

    # get prediction by fully convolutional network
    prediction = lasagne.layers.get_output(nnet.dense3_conv_layer,
                                           deterministic=True,
                                           inputs=X)

    # move the class scores to the first dim,
    # flatten to 2D, then put the scores on the second dim
    prediction = prediction.transpose((1, 0, 2, 3))\
        .flatten(2).transpose((1, 0))
    prediction = T.nnet.softmax(prediction)

    # spatial averaging
    prediction = T.mean(prediction, axis=0)

    # compute top1 and top5 accuracies
    sorted_pred = T.argsort(prediction)
    top1_acc = T.mean(T.eq(sorted_pred[-1], y), dtype='floatX')
    top5_acc = T.mean(T.any(T.eq(sorted_pred[-5:], T.shape_padright(y)),
                            axis=1),
                      dtype='floatX')

    return theano.function([X, y], [top1_acc, top5_acc])
Example 6
    def in_transit(self, t, r=None, texp=None, light_delay=False):
        """Get a list of timestamps that are in transit

        Args:
            t (vector): A vector of timestamps to be evaluated.
            r (Optional): The radii of the planets.
            texp (Optional[float]): The exposure time.

        Returns:
            The indices of the timestamps that are in transit.

        """
        if light_delay:
            raise NotImplementedError(
                "Light travel time delay is not implemented for simple orbits"
            )
        dt = tt.mod(tt.shape_padright(t) - self._ref_time, self.period)
        dt -= self._half_period
        if r is None:
            tol = 0.5 * self.duration
        else:
            x = (r + self.r_star) ** 2 - self._b_norm ** 2
            tol = tt.sqrt(x) / self.speed
        if texp is not None:
            tol += 0.5 * texp
        mask = tt.any(tt.abs_(dt) < tol, axis=-1)
        return tt.arange(t.size)[mask]
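The indexing above reduces to a simple phase-folding test; a hedged NumPy sketch of that logic with made-up orbital numbers (it assumes, as in the class above, that ref_time sits half a period before mid-transit, so dt is zero at mid-transit):

import numpy as np

t = np.linspace(0.0, 20.0, 2001)                   # timestamps
ref_time, period, duration = 1.0, 5.0, 0.4         # illustrative values only
dt = np.mod(t - ref_time, period) - 0.5 * period   # time since mid-transit
in_transit_idx = np.where(np.abs(dt) < 0.5 * duration)[0]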
Example 7
    def posdef(self, x, diag):
        """ Check to determine postive definiteness of the Kronecker-structured 
            covariance matrix. This operation is slow, and is thus not recommended 
            to be called repeatedly as a check during optimization. Rather, the user 
            should use this function as a guide to ensuring positive definiteness 
            of the model for varying values of the kernel parameters. 
            
            Args:
                tensor x: The input coordinates.
                tensor diag: The white noise variances. This should be an NxM 
                    array where N is the length of x and M is the size of 
                    alpha.
                    
            Returns: 
                isposdef: A boolean that is True if the covariance matrix 
                    is positive definite and False otherwise. The user will 
                    need to call ``isposdef.eval()`` to compute the returned value 
                    from the theano tensor variable. 
        """

        diag = tt.as_tensor_variable(diag)
        diag = tt.reshape(diag.T, (1, diag.size))[0]
        x = tt.as_tensor_variable(x)
        T = self.term.value(x[:, None] - x[None, :])
        if 'alpha' in vars(self):
            R = self.alpha[:, None] * self.alpha[None, :]
            K = tt.slinalg.kron(T, R)
        elif 'R' in vars(self):
            K = tt.slinalg.kron(T, self.R)  # tt.slinalg is a module; kron is intended here
        chol = tt.slinalg.Cholesky(on_error='nan')
        L = chol(K + tt.diag(diag))
        return tt.switch(tt.any(tt.isnan(L)), np.array(False), np.array(True))
Example 8
 def cost(self):
   """
   :rtype: (theano.Variable | None, dict[theano.Variable,theano.Variable] | None)
   :returns: cost, known_grads
   """
   known_grads = None
   if self.loss == 'ce' or self.loss == 'priori':
     if self.attrs.get("target", "").endswith("[sparse:coo]"):
       assert isinstance(self.y, tuple)
       assert len(self.y) == 3
       from NativeOp import crossentropy_softmax_and_gradient_z_sparse
       y_mask = self.network.j[self.attrs.get("target", "").replace("[sparse:coo]", "[sparse:coo:2:0]")]
       ce, grad_z = crossentropy_softmax_and_gradient_z_sparse(
         self.z, self.index, self.y[0], self.y[1], self.y[2], y_mask)
       return self.norm * T.sum(ce), {self.z: grad_z}
     if self.y_data_flat.type == T.ivector().type:
       # Use crossentropy_softmax_1hot to have a more stable and more optimized gradient calculation.
       # Theano fails to use it automatically; I guess our self.i indexing is too confusing.
       #idx = self.index.flatten().dimshuffle(0,'x').repeat(self.y_m.shape[1],axis=1) # faster than line below
       #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m * idx, y_idx=self.y_data_flat * self.index.flatten())
       nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
       #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
       #nll = -T.log(T.nnet.softmax(self.y_m)[self.i,self.y_data_flat[self.i]])
       #z_c = T.exp(self.z[:,self.y])
       #nll = -T.log(z_c / T.sum(z_c,axis=2,keepdims=True))
       #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
       #nll = T.set_subtensor(nll[self.j], T.constant(0.0))
     else:
       nll = -T.dot(T.log(T.clip(self.p_y_given_x[self.i], 1.e-38, 1.e20)), self.y_data_flat[self.i].T)
     return self.norm * T.sum(nll), known_grads
   elif self.loss == 'entropy':
     h_e = T.exp(self.y_m) #(TB)
     pcx = T.clip((h_e / T.sum(h_e, axis=1, keepdims=True)).reshape((self.index.shape[0],self.index.shape[1],self.attrs['n_out'])), 1.e-6, 1.e6) # TBD
     ee = -T.sum(pcx[self.i] * T.log(pcx[self.i])) # TB
     #nll, pcxs = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y[self.i])
     nll, _ = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat) # TB
     ce = nll.reshape(self.index.shape) * self.index # TB
     y = self.y_data_flat.reshape(self.index.shape) * self.index # TB
     f = T.any(T.gt(y,0), axis=0) # B
     return T.sum(f * T.sum(ce, axis=0) + (1-f) * T.sum(ee, axis=0)), known_grads
     #return T.sum(T.switch(T.gt(T.sum(y,axis=0),0), T.sum(ce, axis=0), -T.sum(ee, axis=0))), known_grads
     #return T.switch(T.gt(T.sum(self.y_m[self.i]),0), T.sum(nll), -T.sum(pcx * T.log(pcx))), known_grads
   elif self.loss == 'priori':
     pcx = self.p_y_given_x[self.i, self.y_data_flat[self.i]]
     pcx = T.clip(pcx, 1.e-38, 1.e20)  # For pcx near zero, the gradient will likely explode.
     return -T.sum(T.log(pcx)), known_grads
   elif self.loss == 'sse':
     if self.y_data_flat.dtype.startswith('int'):
       y_f = T.cast(T.reshape(self.y_data_flat, (self.y_data_flat.shape[0] * self.y_data_flat.shape[1]), ndim=1), 'int32')
       y_oh = T.eq(T.shape_padleft(T.arange(self.attrs['n_out']), y_f.ndim), T.shape_padright(y_f, 1))
       return T.mean(T.sqr(self.p_y_given_x[self.i] - y_oh[self.i])), known_grads
     else:
       #return T.sum(T.sum(T.sqr(self.y_m - self.y.reshape(self.y_m.shape)), axis=1)[self.i]), known_grads
       return T.sum(T.sqr(self.y_m[self.i] - self.y_data_flat.reshape(self.y_m.shape)[self.i])), known_grads
       #return T.sum(T.sum(T.sqr(self.z - (self.y.reshape((self.index.shape[0], self.index.shape[1], self.attrs['n_out']))[:self.z.shape[0]])), axis=2).flatten()[self.i]), known_grads
       #y_z = T.set_subtensor(T.zeros((self.index.shape[0],self.index.shape[1],self.attrs['n_out']), dtype='float32')[:self.z.shape[0]], self.z).flatten()
       #return T.sum(T.sqr(y_z[self.i] - self.y[self.i])), known_grads
       #return T.sum(T.sqr(self.y_m - self.y[:self.z.shape[0]*self.index.shape[1]]).flatten()[self.i]), known_grads
   else:
     assert False, "unknown loss: %s" % self.loss
Example 9
    def in_transit(self, t, r=0.0, texp=None):
        """Get a list of timestamps that are in transit

        Args:
            t (vector): A vector of timestamps to be evaluated.
            r (Optional): The radii of the planets.
            texp (Optional[float]): The exposure time.

        Returns:
            The indices of the timestamps that are in transit.

        """
        z = tt.zeros_like(self.a)
        r = tt.as_tensor_variable(r) + z
        R = self.r_star + z

        # Wrap the times into time since transit
        hp = 0.5 * self.period
        dt = tt.mod(self._warp_times(t) + hp, self.period) - hp

        if self.ecc is None:
            # Equation 14 from Winn (2010)
            k = r / R
            arg = tt.square(1 + k) - tt.square(self.b)
            factor = R / (self.a * self.sin_incl)
            hdur = hp * tt.arcsin(factor * tt.sqrt(arg)) / np.pi
            t_start = -hdur
            t_end = hdur
            flag = z

        else:
            M_contact = self.contact_points_op(
                self.a,
                self.ecc,
                self.cos_omega,
                self.sin_omega,
                self.cos_incl + z,
                self.sin_incl + z,
                R + r,
            )
            flag = M_contact[2]

            t_start = (M_contact[0] - self.M0) / self.n
            t_start = tt.mod(t_start + hp, self.period) - hp
            t_end = (M_contact[1] - self.M0) / self.n
            t_end = tt.mod(t_end + hp, self.period) - hp

            t_start = tt.switch(tt.gt(t_start, 0.0), t_start - self.period,
                                t_start)
            t_end = tt.switch(tt.lt(t_end, 0.0), t_end + self.period, t_end)

        if texp is not None:
            t_start -= 0.5 * texp
            t_end += 0.5 * texp

        mask = tt.any(tt.and_(dt >= t_start, dt <= t_end), axis=-1)
        result = ifelse(tt.all(tt.eq(flag, 0)),
                        tt.arange(t.size)[mask], tt.arange(t.size))

        return result
Example 10
    def compute_step(self, param, previous_step):
        not_finite = tensor.any(
            tensor.or_(tensor.isnan(previous_step),
                       tensor.isinf(previous_step)))
        step = tensor.switch(not_finite, self.scaler * param, previous_step)

        return step, []
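A hedged, self-contained sketch of the same guard in isolation (the 0.1 scaler and the vector shapes are illustrative, not taken from the class above):

import numpy as np
import theano
from theano import tensor

param = tensor.vector('param')
previous_step = tensor.vector('previous_step')
not_finite = tensor.any(tensor.or_(tensor.isnan(previous_step),
                                   tensor.isinf(previous_step)))
# Fall back to a small multiple of the parameter when the step is unusable.
step = tensor.switch(not_finite, 0.1 * param, previous_step)
f = theano.function([param, previous_step], step)

p = np.array([1., 2.], dtype=theano.config.floatX)
f(p, np.array([0.5, np.nan], dtype=theano.config.floatX))  # -> [0.1, 0.2]
f(p, np.array([0.5, 0.5], dtype=theano.config.floatX))     # -> [0.5, 0.5]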
Example 11
    def accurate_pixels_class(self, y):
        """
        Returns number of correctly classified pixels per class
        and total number of pixels per class.
        (pair of numpy 1d arrays)

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        # check if y has same dimension of y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if not y.dtype.startswith('int'):
            raise NotImplementedError()

        correct = T.zeros((self.n_classes), dtype='int32')
        total = T.zeros((self.n_classes), dtype='int32')
        for i in range(self.n_classes):
            correct = T.set_subtensor(
                correct[i],
                T.switch(
                    T.any(T.eq(y, i)),
                    T.sum(
                        T.eq(y[T.eq(y, i).nonzero()],
                             self.y_pred[T.eq(y, i).nonzero()])), 0))
            total = T.set_subtensor(total[i], T.sum(T.eq(y, i)))
        return correct, total
Example 12
def get_vanilla_sgd_updates(param_list, gradients, lr):
    """Do SGD updates with vanilla step rule."""
    updates = []
    for p, g in zip(param_list, gradients):
        new_p = p - lr * g
        has_non_finite = T.any(T.isnan(new_p) + T.isinf(new_p))
        updates.append((p, ifelse(has_non_finite, p, new_p)))
    return updates
Example 13
def any(x, axis=None, keepdims=False):
    """Bitwise reduction (logical OR).
    """
    y = T.any(x, axis=axis, keepdims=keepdims)
    if isinstance(get_shape(x), (tuple, list)):
        output_shape = auto_infer_shape(T.any, x, axis=axis, keepdims=keepdims)
        add_shape(y, output_shape)
    return y
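A hedged usage sketch of the reduction itself (the get_shape/add_shape bookkeeping above belongs to that backend and is omitted here):

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
row_has_positive = T.any(x > 0, axis=1)   # logical OR along each row
f = theano.function([x], row_has_positive)
f(np.array([[0., -1.], [0., 2.]], dtype=theano.config.floatX))  # -> [0, 1]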
Example 14
    def in_transit(self, t, r=0.0, texp=None):
        """Get a list of timestamps that are in transit

        Args:
            t (vector): A vector of timestamps to be evaluated.
            r (Optional): The radii of the planets.
            texp (Optional[float]): The exposure time.

        Returns:
            The indices of the timestamps that are in transit.

        """

        z = tt.zeros_like(self.a)
        r = tt.as_tensor_variable(r) + z
        R = self.r_star + z

        # Wrap the times into time since transit
        hp = 0.5 * self.period
        dt = tt.mod(self._warp_times(t) - self.t0 + hp, self.period) - hp

        if self.ecc is None:
            # Equation 14 from Winn (2010)
            k = r / R
            arg = tt.square(1 + k) - tt.square(self.b)
            factor = R / (self.a * self.sin_incl)
            hdur = hp * tt.arcsin(factor * tt.sqrt(arg)) / np.pi
            t_start = -hdur
            t_end = hdur
            flag = z

        else:
            M_contact = self.contact_points_op(
                self.a, self.ecc, self.cos_omega, self.sin_omega,
                self.cos_incl + z, self.sin_incl + z, R + r)
            flag = M_contact[2]

            t_start = (M_contact[0] - self.M0) / self.n
            t_start = tt.mod(t_start + hp, self.period) - hp
            t_end = (M_contact[1] - self.M0) / self.n
            t_end = tt.mod(t_end + hp, self.period) - hp

            t_start = tt.switch(tt.gt(t_start, 0.0),
                                t_start - self.period, t_start)
            t_end = tt.switch(tt.lt(t_end, 0.0),
                              t_end + self.period, t_end)

        if texp is not None:
            t_start -= 0.5*texp
            t_end += 0.5*texp

        mask = tt.any(tt.and_(dt >= t_start, dt <= t_end), axis=-1)
        result = ifelse(tt.all(tt.eq(flag, 0)),
                        tt.arange(t.size)[mask],
                        tt.arange(t.size))

        return result
Example 15
def model_4(gamble, exclude, A_v, B_v, C_v, amb_A, amb_B, amb_C, rho, lambda_param, alpha_noloss_context, 
           alpha_loss_context, gamma, amb_gain_value, amb_loss_value):
    
    # When using a non-centered parameterisation, parameters can go below zero - need to stop this
    rho = T.switch(T.lt(rho, 0.01), 0.01, rho)
    lambda_param = T.switch(T.lt(lambda_param, 0.01), 0.01, lambda_param)
    gamma = T.switch(T.lt(gamma, 0.01), 0.01, gamma)
    alpha_noloss_context = T.switch(T.lt(alpha_noloss_context, 0.01), 0.01, alpha_noloss_context)
    alpha_loss_context = T.switch(T.lt(alpha_loss_context, 0.01), 0.01, alpha_loss_context)
    
    alpha = T.switch(T.any(T.stack([A_v, B_v, C_v]).squeeze() < 0, axis=0), alpha_loss_context, alpha_noloss_context)
    
    # Calculate values for the 3 options (one of these may not be an option, in which case its value ends up being zero)
    u_A = T.switch(T.gt(A_v, 0), 
                   ((1 - amb_A) * T.power(A_v, rho)) + 
                            amb_A * (gamble[0] * (alpha * T.power(amb_gain_value, rho)) + 
                            ((1 - gamble[0]) * (alpha * T.power(amb_gain_value, rho)))), 
                   -((1 - amb_A) * lambda_param * T.power(T.abs_(A_v), rho) + 
                             amb_A * (gamble[0] * (alpha * T.power(amb_loss_value, rho)) + 
                              ((1 - gamble[0]) * (alpha * T.power(amb_loss_value, rho))))))
                   
    u_B = T.switch(T.gt(B_v, 0), 
                   ((1 - amb_B) * T.power(B_v, rho)) + 
                            amb_B * (gamble[1] * (alpha * T.power(amb_gain_value, rho)) + 
                            ((1 - gamble[1]) * (alpha * T.power(amb_gain_value, rho)))), 
                   -((1 - amb_B) * lambda_param * T.power(T.abs_(B_v), rho) + 
                            amb_B * (gamble[1] * (alpha * T.power(amb_loss_value, rho)) + 
                              ((1 - gamble[1]) * (alpha * T.power(amb_loss_value, rho))))))
                   
    u_C = T.switch(T.gt(C_v, 0), 
                   ((1 - amb_C) * T.power(C_v, rho)) + 
                           amb_C * (gamble[2] * (alpha * T.power(amb_gain_value, rho)) + 
                            ((1 - gamble[2]) * (alpha * T.power(amb_gain_value, rho)))),
                   -((1 - amb_C) * lambda_param * T.power(T.abs_(C_v), rho) + 
                             amb_C * (gamble[2] * (alpha * T.power(amb_loss_value, rho)) + 
                              ((1 - gamble[2]) * (alpha * T.power(amb_loss_value, rho))))))

    # If we have only two choices (i.e. no gamble), the ambiguous option should be labelled as a gamble
    gamble = T.switch(T.eq(exclude.sum(axis=0), 1), T.stack([amb_A, amb_B, amb_C]).squeeze(), gamble)
    
    # Get value of gamble option
    gamble_weighting = gamble / gamble.sum(axis=0)
    u_gamble = gamble_weighting[0] * (u_A * (1 - exclude[0])) + \
               gamble_weighting[1] * (u_B * (1 - exclude[1])) + \
               gamble_weighting[2] * (u_C * (1 - exclude[2]))
    
    # Get value of sure option
    sure_weighting = (1 - gamble) / ((1 - gamble).sum(axis=0) - exclude.sum(axis=0))
    u_sure = sure_weighting[0] * (u_A * (1 - exclude[0])) + \
             sure_weighting[1] * (u_B * (1 - exclude[1])) + \
             sure_weighting[2] * (u_C * (1 - exclude[2]))
    
    # Calculate choice probability
    p = inv_logit(gamma * (u_gamble - u_sure))
    
    return p
Example 16
    def logp_cho(cls, value, mu, cho, mapping):
        """
        Calculates the log p of the parameters given the data
        :param value: the data
        :param mu: the location (obtained from the hyperparameters)
        :param cho: the Cholesky decomposition of the dispersion matrix
        :param mapping: the warping mapping
        :return: the log p of the parameters given the data (values)
        """
        #print(value.tag.test_value)
        #print(mu.tag.test_value)
        #print(mapping.inv(value).tag.test_value)
        #mu = debug(mu, 'mu', force=True)

        #value = debug(value, 'value', force=False)
        delta = mapping.inv(value) - mu

        #delta = debug(delta, 'delta', force=True)
        #cho = debug(cho, 'cho', force=True)
        lcho = tsl.solve_lower_triangular(cho, delta)
        #lcho = debug(lcho, 'lcho', force=False)

        lcho2 = lcho.T.dot(lcho)
        #lcho2 = debug(lcho2, 'lcho2', force=True)

        npi = np.float32(-0.5) * cho.shape[0].astype(
            th.config.floatX) * tt.log(np.float32(2.0 * np.pi))
        dot2 = np.float32(-0.5) * lcho2

        #diag = debug(tnl.diag(cho), 'diag', force=True)
        #_log= debug(tt.log(diag), 'log', force=True)

        det_k = -tt.sum(tt.log(tnl.diag(cho)))
        det_m = mapping.logdet_dinv(value)

        #npi = debug(npi, 'npi', force=False)
        #dot2 = debug(dot2, 'dot2', force=False)
        #det_k = debug(det_k, 'det_k', force=False)
        #det_m = debug(det_m, 'det_m', force=False)

        r = npi + dot2 + det_k + det_m

        cond1 = tt.or_(tt.any(tt.isinf_(delta)), tt.any(tt.isnan_(delta)))
        cond2 = tt.or_(tt.any(tt.isinf_(det_m)), tt.any(tt.isnan_(det_m)))
        cond3 = tt.or_(tt.any(tt.isinf_(cho)), tt.any(tt.isnan_(cho)))
        cond4 = tt.or_(tt.any(tt.isinf_(lcho)), tt.any(tt.isnan_(lcho)))
        return ifelse(
            cond1, np.float32(-1e30),
            ifelse(
                cond2, np.float32(-1e30),
                ifelse(cond3, np.float32(-1e30),
                       ifelse(cond4, np.float32(-1e30), r))))
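A hedged NumPy/SciPy check of the Gaussian log-density pieces assembled above, assuming an identity warping so the logdet_dinv term vanishes; the matrix and vectors are random illustrative data:

import numpy as np
from scipy.linalg import cholesky, solve_triangular
from scipy.stats import multivariate_normal

rng = np.random.RandomState(0)
A = rng.randn(4, 4)
cov = A @ A.T + 4 * np.eye(4)            # a well-conditioned SPD matrix
mu, value = rng.randn(4), rng.randn(4)

cho = cholesky(cov, lower=True)
delta = value - mu
lcho = solve_triangular(cho, delta, lower=True)
logp = (-0.5 * 4 * np.log(2 * np.pi)     # npi
        - 0.5 * lcho @ lcho              # dot2
        - np.sum(np.log(np.diag(cho))))  # det_k
assert np.isclose(logp, multivariate_normal(mu, cov).logpdf(value))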
Example 17
 def _step(input, *states):
     output, new_states = step_function(input, states)
     if masking:
         # if all-zero input timestep, return
         # all-zero output and unchanged states
         switch = T.any(input, axis=-1, keepdims=True)
         output = T.switch(switch, output, 0. * output)
         return_states = []
         for state, new_state in zip(states, new_states):
             return_states.append(T.switch(switch, new_state, state))
         return [output] + return_states
     else:
         return [output] + new_states
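A hedged, self-contained sketch of the masking trick driven by theano.scan, using a toy cumulative-sum recurrence in place of step_function; the names and shapes are illustrative only:

import numpy as np
import theano
import theano.tensor as T

X = T.tensor3('X')                 # (time, batch, features)
s0 = T.zeros((X.shape[1], X.shape[2]))

def step(x_t, s_prev):
    s_new = s_prev + x_t                        # the "recurrence"
    keep = T.any(x_t, axis=-1, keepdims=True)   # 0 for all-zero (padded) steps
    return T.switch(keep, s_new, s_prev)        # padded steps leave the state untouched

states, _ = theano.scan(step, sequences=X, outputs_info=s0)
f = theano.function([X], states[-1])

x = np.zeros((3, 1, 2), dtype=theano.config.floatX)
x[0, 0] = [1., 2.]                 # only the first timestep carries data
f(x)                               # -> [[1., 2.]]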
Example 18
 def _step(input, *states):
     output, new_states = step_function(input, states)
     if masking:
         # if all-zero input timestep, return
         # all-zero output and unchanged states
         switch = T.any(input, axis=-1, keepdims=True)
         output = T.switch(switch, output, 0. * output)
         return_states = []
         for state, new_state in zip(states, new_states):
             return_states.append(T.switch(switch, new_state, state))
         return [output] + return_states
     else:
         return [output] + new_states
Example 19
    def do_compute(self, quiet):
        if quiet:
            self._d, self._W, _ = ops.factor_quiet(self._a, self._U, self._V,
                                                   self._P)
            self._log_det = tt.switch(tt.any(self._d < 0.0), -np.inf,
                                      tt.sum(tt.log(self._d)))

        else:
            self._d, self._W, _ = ops.factor(self._a, self._U, self._V,
                                             self._P)
            self._log_det = tt.sum(tt.log(self._d))

        self._norm = -0.5 * (self._log_det + self._size * np.log(2 * np.pi))
Example 20
    def grad(self, inputs, gradients):
        """
        Cholesky decomposition reverse-mode gradient update.

        Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

        References
        ----------
        .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
           http://arxiv.org/abs/1602.07527

        """

        x = inputs[0]
        dz = gradients[0]
        chol_x = self(x)

        # Replace the cholesky decomposition with 1 if there are nans
        # or solve_upper_triangular will throw a ValueError.
        if self.on_error == 'nan':
            ok = ~tensor.any(tensor.isnan(chol_x))
            chol_x = tensor.switch(ok, chol_x, 1)
            dz = tensor.switch(ok, dz, 1)

        # deal with upper triangular by converting to lower triangular
        if not self.lower:
            chol_x = chol_x.T
            dz = dz.T

        def tril_and_halve_diagonal(mtx):
            """Extracts lower triangle of square matrix and halves diagonal."""
            return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

        def conjugate_solve_triangular(outer, inner):
            """Computes L^{-T} P L^{-1} for lower-triangular L."""
            return solve_upper_triangular(
                outer.T,
                solve_upper_triangular(outer.T, inner.T).T)

        s = conjugate_solve_triangular(
            chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

        if self.lower:
            grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
        else:
            grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))

        if self.on_error == 'nan':
            return [tensor.switch(ok, grad, np.nan)]
        else:
            return [grad]
Example 21
    def build_aligner(self):
        tgt_action_seq = ndim_itensor(3, 'tgt_action_seq')
        tgt_action_seq_type = ndim_itensor(3, 'tgt_action_seq_type')
        tgt_node_seq = ndim_itensor(2, 'tgt_node_seq')
        tgt_par_rule_seq = ndim_itensor(2, 'tgt_par_rule_seq')
        tgt_par_t_seq = ndim_itensor(2, 'tgt_par_t_seq')

        tgt_node_embed = self.node_embedding[tgt_node_seq]
        query_tokens = ndim_itensor(2, 'query_tokens')
        query_token_embed, query_token_embed_mask = self.query_embedding(
            query_tokens, mask_zero=True)
        batch_size = tgt_action_seq.shape[0]
        max_example_action_num = tgt_action_seq.shape[1]

        tgt_action_seq_embed = T.switch(
            T.shape_padright(tgt_action_seq[:, :, 0] > 0),
            self.rule_embedding_W[tgt_action_seq[:, :, 0]],
            self.vocab_embedding_W[tgt_action_seq[:, :, 1]])
        tgt_action_seq_embed_tm1 = tensor_right_shift(tgt_action_seq_embed)
        tgt_par_rule_embed = T.switch(tgt_par_rule_seq[:, :, None] < 0,
                                      T.alloc(0., 1, config.rule_embed_dim),
                                      self.rule_embedding_W[tgt_par_rule_seq])

        if not config.frontier_node_type_feed:
            tgt_node_embed *= 0.
        if not config.parent_action_feed:
            tgt_par_rule_embed *= 0.

        decoder_input = T.concatenate(
            [tgt_action_seq_embed_tm1, tgt_node_embed, tgt_par_rule_embed],
            axis=-1)
        query_embed = self.query_encoder_lstm(query_token_embed,
                                              mask=query_token_embed_mask,
                                              dropout=0,
                                              srng=self.srng)

        tgt_action_seq_mask = T.any(tgt_action_seq_type, axis=-1)

        alignments = self.decoder_lstm.align(
            decoder_input,
            context=query_embed,
            context_mask=query_token_embed_mask,
            mask=tgt_action_seq_mask,
            parent_t_seq=tgt_par_t_seq,
            srng=self.srng)

        alignment_inputs = [
            query_tokens, tgt_action_seq, tgt_action_seq_type, tgt_node_seq,
            tgt_par_rule_seq, tgt_par_t_seq
        ]
        self.align = theano.function(alignment_inputs, [alignments])
Example 22
    def grad(self, inputs, gradients):
        """
        Cholesky decomposition reverse-mode gradient update.

        Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

        References
        ----------
        .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
           http://arxiv.org/abs/1602.07527

        """

        x = inputs[0]
        dz = gradients[0]
        chol_x = self(x)

        # Replace the cholesky decomposition with 1 if there are nans
        # or solve_upper_triangular will throw a ValueError.
        if self.on_error == 'nan':
            ok = ~tensor.any(tensor.isnan(chol_x))
            chol_x = tensor.switch(ok, chol_x, 1)
            dz = tensor.switch(ok, dz, 1)

        # deal with upper triangular by converting to lower triangular
        if not self.lower:
            chol_x = chol_x.T
            dz = dz.T

        def tril_and_halve_diagonal(mtx):
            """Extracts lower triangle of square matrix and halves diagonal."""
            return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

        def conjugate_solve_triangular(outer, inner):
            """Computes L^{-T} P L^{-1} for lower-triangular L."""
            return solve_upper_triangular(
                outer.T, solve_upper_triangular(outer.T, inner.T).T)

        s = conjugate_solve_triangular(
            chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

        if self.lower:
            grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
        else:
            grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))

        if self.on_error == 'nan':
            return [tensor.switch(ok, grad, np.nan)]
        else:
            return [grad]
Example 23
        def get_idx(q_nbrs, q_mem):
            """Gets the index of sample in memory for computing loss.

            We first look to see if the query label can be found in the
            retrieved neighbours, and if not, look to memory for a key with
            the same value.

            We keep track of a boolean mask, which indicates whether or not we
            were able to find a sample with a label that matches the query.
            """

            # Whether a matching sample can be found in neighbours or memory
            any_match_nbrs = T.any(q_nbrs, axis=1)
            any_match_mem = T.any(q_mem, axis=1)
            any_match = T.or_(any_match_nbrs, any_match_mem)

            # Look in neighbours then memory for corresponding sample.
            # If from neighbours, we need to retrieve the full mem idx.
            rows = T.arange(nbrs.shape[0])
            idx = T.switch(any_match_nbrs,
                           nbrs[rows, tensor_choose_k(q_nbrs, self.rng, k=1)],
                           tensor_choose_k(q_mem, self.rng, k=1, random=True))

            return (idx, any_match)
Example 24
    def _get_accuracy(self, top_range, data_type):
        return_list = isinstance(top_range, list)
        if not return_list:
            top_range = [top_range]
        max_top_range = max(top_range)

        expanded = self._correct_answers.dimshuffle(0, 'x')
        expanded = expanded.repeat(max_top_range, axis=1)
        eq = T.eq(expanded, self.answers[:, :max_top_range])

        # Compile new function only if top range or data type has changed
        if self._accuracy_config != [top_range, data_type]:
            self._accuracy = theano.function(
                inputs=[self._batch_index],
                outputs=[
                    T.any(eq[:, :top], axis=1).mean() for top in top_range
                ],
                givens={
                    self._input:
                    self.data_loader.input(self._batch_index, data_type),
                    self._correct_answers:
                    self.data_loader.output(self._batch_index, data_type)
                },
            )
            self._accuracy_config = [top_range, data_type]

        n_batches = self.data_loader.n_batches(data_type)
        accuracy = np.zeros(shape=(n_batches, len(top_range)))
        interval = n_batches / 10
        if interval == 0:
            interval = 1
        for batch_index in xrange(n_batches):
            self.data_loader.load_data(batch_index, data_type)
            accuracy[batch_index, :] = np.asarray(self._accuracy(batch_index))
            if self.verbosity >= 3 or \
                    (self.verbosity >= 2 and batch_index % interval == 0):
                partial_accuracy = accuracy[:batch_index + 1, :].mean(axis=0)
                text = ''
                for a in partial_accuracy:
                    text += ' {:.2f}%'.format(100 * a)
                overwrite('{}/{} minibatches accuracy:{}'.format(
                    batch_index + 1, n_batches, text))
        overwrite()

        accuracy = accuracy.mean(axis=0).tolist()
        if not return_list:
            return accuracy[0]
        return accuracy
Example 25
File: eca.py Project: arasmus/eca
    def compile_prop_f(self, signals, has_input, min_tau=0.0):
        tau_in = T.scalar('min_tau', dtype=FLOATX)
        inputs = [tau_in]
        x = self.signal(signals)

        # Get estimate of the state from layer above
        estimate = self.estimate(signals)

        # Feedforward originates from previous layer's state or given input
        if not has_input:
            feedforward = self.feedforward(signals)
            has_nans = T.as_tensor_variable(0)
            nans = 0.0
        else:
            input_t = T.matrix('input', dtype=FLOATX)
            inputs += [input_t]
            nans = T.isnan(input_t)
            has_nans = T.any(nans)
            feedforward = T.where(nans, 0.0, input_t)

        self.info('Compiling propagation: [%6s] -> %4s <- [%6s]' %
                  (",".join([p.name for p in self.prev] if self.prev else 'u/y'),
                   self.name,
                   ",".join([p.name for p in self.next] if self.next else '')))

        # Apply nonlinearity to feedforward path only
        if self.nonlin:
            feedforward = self.nonlin(feedforward)

        if self.merge_op:
            assert not self.persistent, 'cannot combine with merge_op'
            new_value = self.merge_op(feedforward, estimate)
        elif self.persistent:
            new_value = feedforward
        else:
            new_value = feedforward - estimate

        # If predicting missing values, force them to zero in residual so
        # that they don't influence learning
        new_value = ifelse(has_nans, T.where(nans, 0.0, new_value), new_value)

        (new_X, t, d) = lerp(x.var, new_value, tau_in)
        d = T.max(d)
        updates = [(x.var, ifelse(self.enabled, new_X, x.var))]

        return theano.function(inputs=inputs,
                               outputs=d,
                               updates=updates)
Example 26
 def _step(*args):
     global single_result
     input = args[0]
     states = args[1:]
     output, new_states = step_function(input, states)
     if masking:
         # if all-zero input timestep, return
         # all-zero output and unchanged states
         switch = T.any(input)
         output = T.switch(switch, output, 0. * output)
         return_states = []
         for state, new_state in zip(states, new_states):
             return_states.append(T.switch(switch, new_state, state))
         return [output] + return_states
     else:
         return [output] + new_states
Example 27
 def _step(*args):
     global single_result
     input = args[0]
     states = args[1:]
     output, new_states = step_function(input, states)
     if masking:
         # if all-zero input timestep, return
         # all-zero output and unchanged states
         switch = T.any(input)
         output = T.switch(switch, output, 0. * output)
         return_states = []
         for state, new_state in zip(states, new_states):
             return_states.append(T.switch(switch, new_state, state))
         return [output] + return_states
     else:
         return [output] + new_states
Example 28
File: eca.py Project: afcentry/eca
    def compile_prop_f(self, signals, has_input, min_tau=0.0):
        tau_in = T.scalar('min_tau', dtype=FLOATX)
        inputs = [tau_in]
        x = self.signal(signals)

        # Get estimate of the state from layer above
        estimate = self.estimate(signals)

        # Feedforward originates from previous layer's state or given input
        if not has_input:
            feedforward = self.feedforward(signals)
            has_nans = T.as_tensor_variable(0)
            nans = 0.0
        else:
            input_t = T.matrix('input', dtype=FLOATX)
            inputs += [input_t]
            nans = T.isnan(input_t)
            has_nans = T.any(nans)
            feedforward = T.where(nans, 0.0, input_t)

        self.info(
            'Compiling propagation: [%6s] -> %4s <- [%6s]' %
            (",".join([p.name for p in self.prev] if self.prev else 'u/y'),
             self.name, ",".join([p.name
                                  for p in self.next] if self.next else '')))

        # Apply nonlinearity to feedforward path only
        if self.nonlin:
            feedforward = self.nonlin(feedforward)

        if self.merge_op:
            assert not self.persistent, 'cannot combine with merge_op'
            new_value = self.merge_op(feedforward, estimate)
        elif self.persistent:
            new_value = feedforward
        else:
            new_value = feedforward - estimate

        # If predicting missing values, force them to zero in residual so
        # that they don't influence learning
        new_value = ifelse(has_nans, T.where(nans, 0.0, new_value), new_value)

        (new_X, t, d) = lerp(x.var, new_value, tau_in)
        d = T.max(d)
        updates = [(x.var, ifelse(self.enabled, new_X, x.var))]

        return theano.function(inputs=inputs, outputs=d, updates=updates)
Example 29
 def _step(input, *args): 
     # separate states and contexts
     states = args[0:nb_states]
     output, other_outputs, new_states = step_function(input, args)
     if masking:
         # if all-zero input timestep, return
         # all-zero output and unchanged states
         switch = T.any(input, axis=-1, keepdims=True)
         output = T.switch(switch, output, 0. * output)
          other_outputs = [T.switch(switch, other_output, 0. * other_output)
                           for other_output in other_outputs]
         return_states = []
         for state, new_state in zip(states, new_states):
             return_states.append(T.switch(switch, new_state, state))
         return [output] + other_outputs + return_states
     else:
         return [output] + other_outputs + new_states
Example 30
def categorical_acc(predictions, targets, top_k=1):

    if targets.ndim == predictions.ndim:
        targets = T.argmax(targets, axis=-1)
    elif targets.ndim != predictions.ndim - 1:
        raise TypeError('rank mismatch between targets and predictions')

    if top_k == 1:
        # standard categorical accuracy
        top = T.argmax(predictions, axis=-1)
        return T.eq(top, targets)
    else:
        # top-k accuracy
        top = T.argsort(predictions, axis=-1)
        # (Theano cannot index with [..., -top_k:], we need to simulate that)
        top = top[[slice(None)
                   for _ in range(top.ndim - 1)] + [slice(-top_k, None)]]
        targets = T.shape_padaxis(targets, axis=-1)
        return T.any(T.eq(top, targets), axis=-1)
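A hedged usage sketch with made-up scores and labels (four classes, top_k=3):

import numpy as np
import theano
import theano.tensor as T

preds = T.matrix('preds')
targets = T.ivector('targets')
top3_hit = categorical_acc(preds, targets, top_k=3)
f = theano.function([preds, targets], top3_hit)

scores = np.array([[0.10, 0.50, 0.25, 0.15],
                   [0.70, 0.05, 0.15, 0.10]], dtype=theano.config.floatX)
# -> [1, 0]: class 2 is in row 0's top 3, class 1 is not in row 1's
f(scores, np.array([2, 1], dtype='int32'))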
Example 31
def get_net_fun(phonemeViseme, networkType, k=5, print_network= False):
    outputLayer, inputs = load_model(phonemeViseme, networkType, print_network)

    targets = T.ivector('targets')

    all_predictions = lasagne.layers.get_output(outputLayer, deterministic=True)
    get_all_prob = theano.function([inputs], all_predictions)

    maxprob = T.argmax(all_predictions, axis=1)
    get_first_prediction = theano.function([inputs], maxprob)

    accuracy = T.eq(maxprob, targets)
    avg_accuracy= T.mean(accuracy, dtype=theano.config.floatX)
    get_accuracy = theano.function([inputs, targets], avg_accuracy)

    # Top k accuracy
    # topk_accuracy = T.mean(T.any(T.eq(T.argsort(all_predictions, axis=1)[:, -k:], targets.dimshuffle(0, 'x')), axis=1), axis=1)
    topk_accuracy = T.any(T.eq(T.argsort(all_predictions, axis=1)[:, -k:], targets.dimshuffle(0, 'x')), axis=1)

    avg_topk_accuracy = T.mean(topk_accuracy, dtype=theano.config.floatX)
    get_topk_accuracy = theano.function([inputs, targets], avg_topk_accuracy)

    val_fn = theano.function([inputs, targets], [all_predictions, maxprob, avg_accuracy, avg_topk_accuracy])

    def print_topk(im_path, k):
        im = prep_image(im_path)
        prob = get_all_prob(im)[0]
        #print(prob)
        phonemeNumberMap = classToPhoneme39
        pred = []
        for i in range(0, len(prob)):
            p = prob[i]
            prob_phoneme = phonemeNumberMap[i]
            pred.append([prob_phoneme, p])
            # print(p, " ", prob_phoneme)
        pred = sorted(pred, key=lambda t: t[1], reverse=True)
        pred = pred[:k]
        for p in pred:
            print(p)

    return get_all_prob, get_first_prediction, print_topk, get_accuracy, get_topk_accuracy, val_fn
Example 32
    def posdef(self, x, diag):
        diag = tt.as_tensor_variable(diag)
        diag = tt.reshape(diag.T, (1, diag.size))[0]
        x = tt.as_tensor_variable(x)

        T = self.terms[0].value(x[:, None] - x[None, :])
        if self.terms[0].alpha.ndim == 1:
            R = self.terms[0].alpha[:, None] * self.terms[0].alpha[None, :]
            K = tt.slinalg.kron(T, R)
        else:
            K = tt.slinalg.kron(T, self.terms[0].alpha)  # tt.slinalg is a module; kron is intended here

        for term in self.terms[1:]:  # terms[0] already initialized K above
            T = term.value(x[:, None] - x[None, :])
            if term.alpha.ndim == 1:
                R = term.alpha[:, None] * term.alpha[None, :]
                K += tt.slinalg.kron(T, R)
            else:
                K += tt.slinalg.kron(T, term.alpha)
        chol = tt.slinalg.Cholesky(on_error='nan')
        L = chol(K + tt.diag(diag))
        return tt.switch(tt.any(tt.isnan(L)), np.array(False), np.array(True))
Example 33
    def logp_cho(cls, value, mu, cho, freedom, mapping):
        delta = mapping.inv(value) - mu

        lcho = tsl.solve_lower_triangular(cho, delta)
        beta = lcho.T.dot(lcho)
        n = cho.shape[0].astype(th.config.floatX)

        np5 = np.float32(0.5)
        np2 = np.float32(2.0)
        npi = np.float32(np.pi)

        r1 = -np5 * (freedom + n) * tt.log1p(beta / (freedom - np2))
        r2 = ifelse(
            tt.le(np.float32(1e6), freedom), -n * np5 * np.log(np2 * npi),
            tt.gammaln((freedom + n) * np5) - tt.gammaln(freedom * np5) -
            np5 * n * tt.log((freedom - np2) * npi))
        r3 = -tt.sum(tt.log(tnl.diag(cho)))
        det_m = mapping.logdet_dinv(value)

        r1 = debug(r1, name='r1', force=True)
        r2 = debug(r2, name='r2', force=True)
        r3 = debug(r3, name='r3', force=True)
        det_m = debug(det_m, name='det_m', force=True)

        r = r1 + r2 + r3 + det_m

        cond1 = tt.or_(tt.any(tt.isinf_(delta)), tt.any(tt.isnan_(delta)))
        cond2 = tt.or_(tt.any(tt.isinf_(det_m)), tt.any(tt.isnan_(det_m)))
        cond3 = tt.or_(tt.any(tt.isinf_(cho)), tt.any(tt.isnan_(cho)))
        cond4 = tt.or_(tt.any(tt.isinf_(lcho)), tt.any(tt.isnan_(lcho)))
        return ifelse(
            cond1, np.float32(-1e30),
            ifelse(
                cond2, np.float32(-1e30),
                ifelse(cond3, np.float32(-1e30),
                       ifelse(cond4, np.float32(-1e30), r))))
Example 34
    def __init__(self, n_comp=10, verbose=False):

        # Theano initialization
        self.T_weights = shared(np.eye(n_comp, dtype=np.float32))
        self.T_bias = shared(np.ones((n_comp, 1), dtype=np.float32))

        T_p_x_white = T.fmatrix()
        T_lrate = T.fscalar()
        T_block = T.fscalar()
        T_unmixed = T.dot(self.T_weights, T_p_x_white) + T.addbroadcast(
            self.T_bias, 1)
        T_logit = 1 - 2 / (1 + T.exp(-T_unmixed))

        T_out = self.T_weights + T_lrate * T.dot(
            T_block * T.identity_like(self.T_weights) +
            T.dot(T_logit, T.transpose(T_unmixed)), self.T_weights)
        T_bias_out = self.T_bias + T_lrate * T.reshape(T_logit.sum(axis=1),
                                                       (-1, 1))
        T_max_w = T.max(self.T_weights)
        T_isnan = T.any(T.isnan(self.T_weights))

        self.w_up_fun = theano.function([T_p_x_white, T_lrate, T_block],
                                        [T_max_w, T_isnan],
                                        updates=[(self.T_weights, T_out),
                                                 (self.T_bias, T_bias_out)],
                                        allow_input_downcast=True)

        T_matrix = T.fmatrix()
        T_cov = T.dot(T_matrix, T.transpose(T_matrix)) / T_block
        self.cov_fun = theano.function([T_matrix, T_block],
                                       T_cov,
                                       allow_input_downcast=True)

        self.loading = None
        self.sources = None
        self.weights = None
        self.n_comp = n_comp
        self.verbose = verbose
Example 35
    def in_transit(self, t, r=None, texp=None):
        """Get a list of timestamps that are in transit

        Args:
            t (vector): A vector of timestamps to be evaluated.
            r (Optional): The radii of the planets.
            texp (Optional[float]): The exposure time.

        Returns:
            The indices of the timestamps that are in transit.

        """
        dt = tt.mod(tt.shape_padright(t) - self._ref_time, self.period)
        dt -= self._half_period
        if r is None:
            tol = 0.5 * self.duration
        else:
            x = (r + self.r_star)**2 - self._b_norm**2
            tol = tt.sqrt(x) / self.speed
        if texp is not None:
            tol += 0.5 * texp
        mask = tt.any(tt.abs_(dt) < tol, axis=-1)
        return tt.arange(t.size)[mask]
Example 36
    def in_transit(self, t, r=0.0, texp=None):
        """Get a list of timestamps that are in transit

        Args:
            t (vector): A vector of timestamps to be evaluated.
            r (Optional): The radii of the planets.
            texp (Optional[float]): The exposure time.

        Returns:
            The indices of the timestamps that are in transit.

        """
        z = tt.zeros_like(self.a)
        r = tt.as_tensor_variable(r) + z
        R = self.r_star + z

        if self.ecc is None:
            M_contact = self.contact_points_op(self.a, self.incl + z, r, R)
        else:
            M_contact = self.contact_points_op(self.a, self.ecc, self.omega,
                                               self.incl + z, r, R)

        # Wrap the times into time since transit
        hp = 0.5 * self.period
        t_start = (M_contact[0] - self.M0) / self.n
        t_start = tt.mod(t_start + hp, self.period) - hp
        t_end = (M_contact[3] - self.M0) / self.n
        t_end = tt.mod(t_end + hp, self.period) - hp
        dt = tt.mod(self._warp_times(t) - self.t0 + hp, self.period) - hp
        if texp is not None:
            t_start -= 0.5 * texp
            t_end += 0.5 * texp

        mask = tt.any(tt.and_(dt >= t_start, dt <= t_end), axis=-1)

        return tt.arange(t.size)[mask]
Example 37
def any(x, axis=None, keepdims=False):
    '''Bitwise reduction (logical OR).
    '''
    return T.any(x, axis=axis, keepdims=keepdims)
Example 38
 def get_output_mask(self, train=False):
     X = self.get_input(train)
     return T.any(T.ones_like(X) * (1. - T.eq(X, self.mask_value)), axis=-1)
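A hedged sketch of the same mask computation on a concrete padded batch, written with T.neq for brevity (mask_value = 0. is an assumption for illustration):

import numpy as np
import theano
import theano.tensor as T

X = T.tensor3('X')                    # (batch, time, features)
mask = T.any(T.neq(X, 0.), axis=-1)   # 1 where the timestep has any non-mask value
f = theano.function([X], mask)

x = np.array([[[1., 2.], [0., 0.]]], dtype=theano.config.floatX)
f(x)                                  # -> [[1, 0]]: the second timestep is all padding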
Example 39
    def _compile_functions(self):
        self._gradnorm = T.zeros([])
        for _param, _grad in zip(self._params, self._grads):
            # apply rmsprop to before clipping gradients
            if self.rmsprop:
                avg_grad_sqr = self._avg_grad_sqrs[_param]
                new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr + \
                    (1 - self.averaging_coeff) * T.sqr(_grad)
                self._avg_grad_sqrs_updates[avg_grad_sqr] = new_avg_grad_sqr
                rms_grad_t = T.sqrt(new_avg_grad_sqr)
                rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
                _grad = _grad / rms_grad_t
            self._gradnorm += T.sum(_grad**2)
        self._gradnorm = T.sqrt(self._gradnorm)

        self._givens = {}
        self._givens[self._inputvar] = self._inputs_theano[
            self._batch_idx * self.batchsize:
            (self._batch_idx + 1) * self.batchsize]
        if self.is_supervised:
            self._givens[self._targetvar] = self._outputs_theano[
                self._batch_idx * self.batchsize:
                (self._batch_idx + 1) * self.batchsize]
        if self.has_masks:
            self._givens[self._maskvar] = self._masks_theano[
                self._batch_idx * self.batchsize:
                (self._batch_idx + 1) * self.batchsize]

        self.gradnorm = theano.function(
            inputs=[],
            outputs=self._gradnorm,
            givens=self._givens)

        avg_gradnorm_update = {
            self._avg_gradnorm: self._avg_gradnorm * .8 + self._gradnorm * .2}

        self._update_weight_norm_ratios = []
        for _param, _grad in zip(self._params, self._grads):
            if hasattr(self._model, 'skip_params'):
                if _param.name in self._model.skip_params:
                    continue

            _clip_grad = T.switch(
                T.gt(self._gradnorm, self._gradient_clip_threshold),
                _grad * self._gradient_clip_threshold / self._gradnorm, _grad)

            try:  # ... to apply learningrate_modifiers
                # Cliphid version:
                self._inc_updates[self._incs[_param]] = \
                    self._momentum * self._incs[_param] - \
                    self._learningrate * \
                    self._model.learningrate_modifiers[
                        _param.name] * _clip_grad

                self._updates[_param] = _param + self._incs[_param]
                self._updates_nomomentum[_param] = _param - \
                    self._learningrate * \
                    self._model.learningrate_modifiers[_param.name] * \
                    _clip_grad
                print 'Learning rate modifier for {0}: {1}'.format(
                    _param.name, self._model.learningrate_modifiers[_param.name])

            except (AttributeError, KeyError):
                self._inc_updates[self._incs[_param]] = self._momentum * \
                    self._incs[_param] - self._learningrate * _clip_grad
                self._updates[_param] = _param + self._incs[_param]
                self._updates_nomomentum[_param] = _param - \
                    self._learningrate * _clip_grad

            if self.monitor_update_weight_norm_ratio:
                print 'building update weight norm ratio graph for ', _param.name
                self._update_weight_norm_ratios.append(
                    self._incs[_param].norm(2) / _param.norm(2))

        self.any_isnan = T.any(T.isnan(
            T.concatenate([x.flatten() for x in self._grads], axis=0)))
        # compute function to get update_weight_norm_ratios (returned in same
        # order as params list)
        print 'compiling update weight norm ratio function...'
        self.get_update_weight_norm_ratios = theano.function(
            [], self._update_weight_norm_ratios)
        print 'done'

        # first update gradient norm running avg
        ordered_updates = collections.OrderedDict()
        try:
            ordered_updates.update(self._model.updates)
        except AttributeError:
            pass
        ordered_updates.update(avg_gradnorm_update)
        # so that it is considered in the parameter update computations
        ordered_updates.update(self._inc_updates)

        print 'compiling updateincs...'
        self._updateincs = theano.function(
            [], [self._cost, self._avg_gradnorm, self.any_isnan], updates=ordered_updates,
            givens=self._givens)
        print 'done'

        print 'compiling trainmodel...'
        self._trainmodel = theano.function(
            [self._n], self._noop, updates=self._updates)
        print 'done'

        print 'compiling trainmodel_nomomentum...'
        self._trainmodel_nomomentum = theano.function(
            [self._n], self._noop, updates=self._updates_nomomentum,
            givens=self._givens)
        print 'done'

        self._momentum_batchcounter = 0
Example 40
  def setup_backprop(self):
    eta = T.scalar('eta_for_backprop')
    x = T.lvector('x_for_backprop')
    y = T.lvector('y_for_backprop')
    y_in_x_inds = T.lmatrix('y_in_x_inds_for_backprop')
    dec_init_state, annotations = self._symb_encoder(x)

    def decoder_recurrence(y_t, cur_y_in_x_inds, h_prev, annotations, *params):
      h_for_write = self.spec.decoder.get_h_for_write(h_prev)
      scores = self.spec.get_attention_scores(h_for_write, annotations)
      alpha = self.spec.get_alpha(scores)
      c_t = self.spec.get_context(alpha, annotations)
      write_dist = self.spec.f_write(h_for_write, c_t, scores)
      base_p_y_t = write_dist[y_t] 
      if self.spec.attention_copying:
        copying_p_y_t = T.dot(
            write_dist[self.out_vocabulary.size():],
            cur_y_in_x_inds)
        p_y_t = base_p_y_t + copying_p_y_t
      else:
        p_y_t = base_p_y_t
      h_t = self.spec.f_dec(y_t, c_t, h_prev)
      return (h_t, p_y_t)

    dec_results, _ = theano.scan(
        fn=decoder_recurrence, sequences=[y, y_in_x_inds],
        outputs_info=[dec_init_state, None],
        non_sequences=[annotations] + self.spec.get_all_shared())
    p_y_seq = dec_results[1]
    log_p_y = T.sum(T.log(p_y_seq))
    gradients = T.grad(log_p_y, self.params)

    # Do the updates here
    updates = []
    if self.spec.step_rule in ('adagrad', 'rmsprop'):
      # Adagrad updates
      for p, g, c in zip(self.params, gradients, self.grad_cache):
        grad_norm = g.norm(2)
        clipped_grad = ifelse(grad_norm >= CLIP_THRESH, 
                              g * CLIP_THRESH / grad_norm, g)
        if self.spec.step_rule == 'adagrad':
          new_c = c + clipped_grad ** 2
        else:  # rmsprop
          decay_rate = 0.9  # Use fixed decay rate of 0.9
          new_c = decay_rate * c + (1.0 - decay_rate) * clipped_grad ** 2
        new_p = p + eta * clipped_grad / T.sqrt(new_c + 1e-4)
        has_non_finite = T.any(T.isnan(new_p) + T.isinf(new_p))
        updates.append((p, ifelse(has_non_finite, p, new_p)))
        updates.append((c, ifelse(has_non_finite, c, new_c)))
    elif self.spec.step_rule == 'nesterov':
      # Nesterov momentum
      for p, g, v in zip(self.params, gradients, self.grad_cache):
        grad_norm = g.norm(2)
        clipped_grad = ifelse(grad_norm >= CLIP_THRESH, 
                              g * CLIP_THRESH / grad_norm, g)
        new_v = NESTEROV_MU * v + eta * clipped_grad
        new_p = p - NESTEROV_MU * v + (1 + NESTEROV_MU) * new_v
        has_non_finite = (T.any(T.isnan(new_p) + T.isinf(new_p)) +
                          T.any(T.isnan(new_v) + T.isinf(new_v)))
        updates.append((p, ifelse(has_non_finite, p, new_p)))
        updates.append((v, ifelse(has_non_finite, v, new_v)))
    else:
      # Simple SGD updates
      for p, g in zip(self.params, gradients):
        grad_norm = g.norm(2)
        clipped_grad = ifelse(grad_norm >= CLIP_THRESH, 
                              g * CLIP_THRESH / grad_norm, g)
        new_p = p + eta * clipped_grad
        has_non_finite = T.any(T.isnan(new_p) + T.isinf(new_p))
        updates.append((p, ifelse(has_non_finite, p, new_p)))
        #updates.append((p, new_p))

    self._backprop = theano.function(
        inputs=[x, y, eta, y_in_x_inds],
        outputs=[p_y_seq, log_p_y],
        updates=updates)
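
The clip-then-guard update pattern used in all three branches above can be exercised on its own. A minimal sketch with an illustrative CLIP_THRESH and a single stand-in parameter (not taken from the snippet's class):

import numpy as np
import theano
import theano.tensor as T
from theano.ifelse import ifelse

CLIP_THRESH = 5.0  # illustrative threshold

p = theano.shared(np.zeros(4, dtype=theano.config.floatX), name='p')
g = T.vector('g')
eta = T.scalar('eta')

# Rescale the gradient if its L2 norm exceeds the threshold.
grad_norm = g.norm(2)
clipped_grad = ifelse(grad_norm >= CLIP_THRESH, g * CLIP_THRESH / grad_norm, g)

# Skip the update entirely if it would introduce NaN or Inf.
new_p = p + eta * clipped_grad
has_non_finite = T.any(T.isnan(new_p) + T.isinf(new_p))
sgd_step = theano.function([g, eta], new_p,
                           updates=[(p, ifelse(has_non_finite, p, new_p))],
                           allow_input_downcast=True)

grad_val = np.full(4, 100.0, dtype=theano.config.floatX)
print(sgd_step(grad_val, 0.1))  # [0.25 0.25 0.25 0.25]: the norm-200 gradient is scaled to norm 5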
Esempio n. 41
0
def any(x, axis=None, keepdims=False):
    '''Logical OR reduction over the given axis.
    '''
    return T.any(x, axis=axis, keepdims=keepdims)
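
A quick check of the reduction semantics (a standalone sketch, not part of the backend module above):

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
row_any = theano.function([x], T.any(x, axis=-1))   # per-row logical OR
global_any = theano.function([x], T.any(x))         # OR over all entries

data = np.array([[0., 0., 1.],
                 [0., 0., 0.]], dtype=theano.config.floatX)
print(row_any(data))     # [1 0] (nonzero entries count as True)
print(global_any(data))  # 1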
Esempio n. 42
0
    def build(self):
        # (batch_size, max_example_action_num, action_type)
        tgt_action_seq = ndim_itensor(3, 'tgt_action_seq')

        # (batch_size, max_example_action_num, action_type)
        tgt_action_seq_type = ndim_itensor(3, 'tgt_action_seq_type')

        # (batch_size, max_example_action_num)
        tgt_node_seq = ndim_itensor(2, 'tgt_node_seq')

        # (batch_size, max_example_action_num)
        tgt_par_rule_seq = ndim_itensor(2, 'tgt_par_rule_seq')

        # (batch_size, max_example_action_num)
        tgt_par_t_seq = ndim_itensor(2, 'tgt_par_t_seq')

        # (batch_size, max_example_action_num, symbol_embed_dim)
        # tgt_node_embed = self.node_embedding(tgt_node_seq, mask_zero=False)
        tgt_node_embed = self.node_embedding[tgt_node_seq]

        # (batch_size, max_query_length)
        query_tokens = ndim_itensor(2, 'query_tokens')

        mask = T.TensorType(dtype='int32',
                            name='mask',
                            broadcastable=(True, False))()

        # (batch_size, max_query_length, query_token_embed_dim)
        # (batch_size, max_query_length)
        query_token_embed, query_token_embed_mask = self.query_embedding(
            query_tokens, mask_zero=True)

        # if WORD_DROPOUT > 0:
        #     logging.info('used word dropout for source, p = %f', WORD_DROPOUT)
        #     query_token_embed, query_token_embed_intact = WordDropout(WORD_DROPOUT, self.srng)(query_token_embed, False)

        batch_size = tgt_action_seq.shape[0]
        max_example_action_num = tgt_action_seq.shape[1]

        # previous action embeddings
        # (batch_size, max_example_action_num, action_embed_dim)
        tgt_action_seq_embed = T.switch(
            T.shape_padright(tgt_action_seq[:, :, 0] > 0),
            self.rule_embedding_W[tgt_action_seq[:, :, 0]],
            self.vocab_embedding_W[tgt_action_seq[:, :, 1]])

        tgt_action_seq_embed_tm1 = tensor_right_shift(tgt_action_seq_embed)

        # parent rule application embeddings
        tgt_par_rule_embed = T.switch(tgt_par_rule_seq[:, :, None] < 0,
                                      T.alloc(0., 1, config.rule_embed_dim),
                                      self.rule_embedding_W[tgt_par_rule_seq])

        if not config.frontier_node_type_feed:
            tgt_node_embed *= 0.

        if not config.parent_action_feed:
            tgt_par_rule_embed *= 0.

        # (batch_size, max_example_action_num, action_embed_dim + symbol_embed_dim + action_embed_dim)
        decoder_input = T.concatenate(
            [tgt_action_seq_embed_tm1, tgt_node_embed, tgt_par_rule_embed],
            axis=-1)

        # (batch_size, max_query_length, query_embed_dim)
        query_embed = self.query_encoder_lstm(query_token_embed,
                                              mask=query_token_embed_mask,
                                              dropout=config.dropout,
                                              srng=self.srng)

        # (batch_size, max_example_action_num)
        tgt_action_seq_mask = T.any(tgt_action_seq_type, axis=-1)

        # decoder_hidden_states: (batch_size, max_example_action_num, lstm_hidden_state)
        # ctx_vectors: (batch_size, max_example_action_num, encoder_hidden_dim)
        decoder_hidden_states, _, ctx_vectors = self.decoder_lstm(
            decoder_input,
            context=query_embed,
            context_mask=query_token_embed_mask,
            mask=tgt_action_seq_mask,
            parent_t_seq=tgt_par_t_seq,
            dropout=config.dropout,
            srng=self.srng)

        # if DECODER_DROPOUT > 0:
        #     logging.info('used dropout for decoder output, p = %f', DECODER_DROPOUT)
        #     decoder_hidden_states = Dropout(DECODER_DROPOUT, self.srng)(decoder_hidden_states)

        # ====================================================
        # apply additional non-linearity transformation before
        # predicting actions
        # ====================================================

        decoder_hidden_state_trans_rule = self.decoder_hidden_state_W_rule(
            decoder_hidden_states)
        decoder_hidden_state_trans_token = self.decoder_hidden_state_W_token(
            T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1))

        # (batch_size, max_example_action_num, rule_num)
        rule_predict = softmax(
            T.dot(decoder_hidden_state_trans_rule,
                  T.transpose(self.rule_embedding_W)) + self.rule_embedding_b)

        # (batch_size, max_example_action_num, 2)
        terminal_gen_action_prob = self.terminal_gen_softmax(
            decoder_hidden_states)

        # (batch_size, max_example_action_num, target_vocab_size)
        logits = T.dot(decoder_hidden_state_trans_token,
                       T.transpose(
                           self.vocab_embedding_W)) + self.vocab_embedding_b
        # vocab_predict = softmax(T.dot(decoder_hidden_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)
        vocab_predict = softmax(
            logits.transpose(1, 0, 2) * mask +
            (T.min(logits.transpose(1, 0, 2), axis=1, keepdims=True) - 1) *
            (1 - mask)).transpose(1, 0, 2)
        # (batch_size, max_example_action_num, lstm_hidden_state + encoder_hidden_dim)
        ptr_net_decoder_state = T.concatenate(
            [decoder_hidden_states, ctx_vectors], axis=-1)

        # (batch_size, max_example_action_num, max_query_length)
        copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask,
                                     ptr_net_decoder_state)

        # (batch_size, max_example_action_num)
        rule_tgt_prob = rule_predict[
            T.shape_padright(T.arange(batch_size)),
            T.shape_padleft(T.arange(max_example_action_num)),
            tgt_action_seq[:, :, 0]]

        # (batch_size, max_example_action_num)
        vocab_tgt_prob = vocab_predict[
            T.shape_padright(T.arange(batch_size)),
            T.shape_padleft(T.arange(max_example_action_num)),
            tgt_action_seq[:, :, 1]]

        # (batch_size, max_example_action_num)
        copy_tgt_prob = copy_prob[
            T.shape_padright(T.arange(batch_size)),
            T.shape_padleft(T.arange(max_example_action_num)),
            tgt_action_seq[:, :, 2]]

        # (batch_size, max_example_action_num)
        tgt_prob = tgt_action_seq_type[:, :, 0] * rule_tgt_prob + \
                   tgt_action_seq_type[:, :, 1] * terminal_gen_action_prob[:, :, 0] * vocab_tgt_prob + \
                   tgt_action_seq_type[:, :, 2] * terminal_gen_action_prob[:, :, 1] * copy_tgt_prob

        likelihood = T.log(tgt_prob + 1.e-7 * (1 - tgt_action_seq_mask))
        loss = -(likelihood * tgt_action_seq_mask).sum(
            axis=-1)  # / tgt_action_seq_mask.sum(axis=-1)
        loss = T.mean(loss)

        # let's build the function!
        train_inputs = [
            query_tokens, tgt_action_seq, tgt_action_seq_type, tgt_node_seq,
            tgt_par_rule_seq, tgt_par_t_seq, mask
        ]
        optimizer = optimizers.get(config.optimizer)
        optimizer.clip_grad = config.clip_grad
        updates, grads = optimizer.get_updates(self.params, loss)
        self.train_func = theano.function(
            train_inputs,
            [loss],
            # [loss, tgt_action_seq_type, tgt_action_seq,
            #  rule_tgt_prob, vocab_tgt_prob, copy_tgt_prob,
            #  copy_prob, terminal_gen_action_prob],
            updates=updates)

        # if WORD_DROPOUT > 0:
        #     self.build_decoder(query_tokens, query_token_embed_intact, query_token_embed_mask)
        # else:
        #     self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask)

        self.build_decoder(query_tokens, query_token_embed,
                           query_token_embed_mask, mask)
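
The masked softmax used to compute vocab_predict above can be illustrated in two dimensions. This is a hedged sketch: T.nnet.softmax stands in for the project's own softmax helper, and the values are made up:

import numpy as np
import theano
import theano.tensor as T

logits = T.matrix('logits')   # (batch, vocab)
mask = T.matrix('mask')       # 1 for allowed entries, 0 for disallowed

# Push masked entries below the row minimum so they receive (near-)zero probability.
masked_logits = logits * mask + (T.min(logits, axis=1, keepdims=True) - 1) * (1 - mask)
probs = T.nnet.softmax(masked_logits)
f = theano.function([logits, mask], probs)

l = np.asarray([[2.0, 1.0, 0.0]], dtype=theano.config.floatX)
m = np.asarray([[1.0, 1.0, 0.0]], dtype=theano.config.floatX)
print(f(l, m))  # roughly [[0.70, 0.26, 0.03]]: the masked entry gets a small, not exactly zero, weight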
Esempio n. 43
0
def logpow(x, m):
    """
    Calculates log(x**m), since m * log(x) will fail when both m and x are 0.
    """
    # return m * log(x)
    return T.switch(T.any(T.eq(x, 0)), -np.inf, m * T.log(x))
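
A short usage check of logpow as defined above (assuming numpy as np and theano.tensor as T are imported at module level; the input values are illustrative):

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
m = T.vector('m')
f = theano.function([x, m], logpow(x, m))

# With strictly positive x this matches m * log(x); with any zero entry the whole
# expression collapses to -inf, which is what the T.switch above encodes.
print(f(np.asarray([2., 3.], dtype=theano.config.floatX),
        np.asarray([1., 2.], dtype=theano.config.floatX)))   # ~[0.693, 2.197]
print(f(np.asarray([0., 3.], dtype=theano.config.floatX),
        np.asarray([0., 2.], dtype=theano.config.floatX)))   # [-inf -inf]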
Esempio n. 44
0
 def pgrad(g_out):
     g_out = T.clip(g_out, self.clip_lower_bound, self.clip_upper_bound)
     g_out = ifelse(T.any(T.isnan(g_out)), T.ones_like(g_out)*0.00001, g_out)
     return g_out
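
The same clip-then-replace-NaN idea as a standalone sketch (the bounds stand in for self.clip_lower_bound / self.clip_upper_bound, which the original object presumably defines elsewhere):

import numpy as np
import theano
import theano.tensor as T
from theano.ifelse import ifelse

clip_lower_bound, clip_upper_bound = -1.0, 1.0  # illustrative bounds

g = T.vector('g')
g_clipped = T.clip(g, clip_lower_bound, clip_upper_bound)
# If any entry is NaN, fall back to a small constant gradient everywhere.
g_safe = ifelse(T.any(T.isnan(g_clipped)), T.ones_like(g_clipped) * 1e-5, g_clipped)

safe_grad = theano.function([g], g_safe)
print(safe_grad(np.asarray([2.0, -3.0, 0.5], dtype=theano.config.floatX)))     # [ 1.  -1.   0.5]
print(safe_grad(np.asarray([np.nan, 0.5, 0.0], dtype=theano.config.floatX)))   # [1e-05 1e-05 1e-05]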
Esempio n. 45
0
File: core.py Progetto: lxastro/dlx
 def mask(self, train=False):
     X = self.get_input('input')(train)
     return T.any(T.ones_like(X) * (1. - T.eq(X, self.mask_value)), axis=-1) 
Esempio n. 46
0
File: core.py Progetto: nehz/keras
 def get_output_mask(self, train=False):
     X = self.get_input(train)
     return T.any(T.ones_like(X) * (1.0 - T.eq(X, self.mask_value)), axis=-1)
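
What the masking expression in these two near-identical snippets computes, shown standalone with an illustrative mask_value of 0:

import numpy as np
import theano
import theano.tensor as T

mask_value = 0.0  # illustrative; Keras' Masking layer lets the user choose this

X = T.tensor3('X')  # (batch, timesteps, features)
# A timestep is kept (mask == 1) if any of its features differs from mask_value.
mask = T.any(T.ones_like(X) * (1.0 - T.eq(X, mask_value)), axis=-1)
f = theano.function([X], mask)

batch = np.asarray([[[0., 0.], [1., 0.], [0., 0.]]], dtype=theano.config.floatX)
print(f(batch))  # [[0 1 0]]: only the middle timestep carries non-masked data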
Esempio n. 47
0
    def __init__(self, n_in, hidden_layer_size, n_out, L1_reg, L2_reg, hidden_layer_type, output_type='LINEAR', dropout_rate=0.0, optimizer='sgd', loss_function='MMSE', rnn_batch_training=False):
        """ This function initialises a neural network

        :param n_in: Dimensionality of input features
        :type n_in: Integer
        :param hidden_layer_size: The layer size for each hidden layer
        :type hidden_layer_size: A list of integers
        :param n_out: Dimensionality of output features
        :type n_out: Integer
        :param hidden_layer_type: the activation type of each hidden layer, e.g., TANH, LSTM, GRU, BLSTM
        :param L1_reg: the L1 regularisation weight
        :param L2_reg: the L2 regularisation weight
        :param output_type: the activation type of the output layer; 'LINEAR' (linear regression) by default.
        :param dropout_rate: probability of dropout, a float between 0 and 1.
        """

        logger = logging.getLogger("DNN initialization")

        self.n_in = int(n_in)
        self.n_out = int(n_out)

        self.n_layers = len(hidden_layer_size)

        self.dropout_rate = dropout_rate
        self.optimizer = optimizer
        self.loss_function = loss_function
        self.is_train = T.iscalar('is_train')
        self.rnn_batch_training = rnn_batch_training

        assert len(hidden_layer_size) == len(hidden_layer_type)

        self.list_of_activations = ['TANH', 'SIGMOID', 'SOFTMAX', 'RELU', 'RESU']

        if self.rnn_batch_training:
            self.x = T.tensor3('x')
            self.y = T.tensor3('y')
        else:
            self.x = T.matrix('x')
            self.y = T.matrix('y')

        self.L1_reg = L1_reg
        self.L2_reg = L2_reg

        self.rnn_layers = []
        self.params = []
        self.delta_params = []

        rng = np.random.RandomState(123)

        for i in range(self.n_layers):
            if i == 0:
                input_size = n_in
            else:
                input_size = hidden_layer_size[i-1]

            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.rnn_layers[i-1].output
                if hidden_layer_type[i-1]  == 'BSLSTM' or hidden_layer_type[i-1]  == 'BLSTM':
                    input_size = hidden_layer_size[i-1]*2

            
            if hidden_layer_type[i] in self.list_of_activations:
                hidden_activation = hidden_layer_type[i].lower()
                hidden_layer = GeneralLayer(rng, layer_input, input_size, hidden_layer_size[i], activation=hidden_activation, p=self.dropout_rate, training=self.is_train)
            elif hidden_layer_type[i] == 'TANH_LHUC':
                hidden_layer = SigmoidLayer_LHUC(rng, layer_input, input_size, hidden_layer_size[i], activation=T.tanh, p=self.dropout_rate, training=self.is_train)
            elif hidden_layer_type[i] == 'SLSTM':
                hidden_layer = SimplifiedLstm(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'SGRU':
                hidden_layer = SimplifiedGRU(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'GRU':
                hidden_layer = GatedRecurrentUnit(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM_NFG':
                hidden_layer = LstmNFG(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM_NOG':
                hidden_layer = LstmNOG(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM_NIG':
                hidden_layer = LstmNIG(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM_NPH':
                hidden_layer = LstmNoPeepholes(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM':
                hidden_layer = VanillaLstm(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'BSLSTM':
                hidden_layer = BidirectionSLstm(rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'BLSTM':
                hidden_layer = BidirectionLstm(rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'RNN':
                hidden_layer = VanillaRNN(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM_LHUC':
                hidden_layer = VanillaLstm_LHUC(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            else:
                logger.critical("This hidden layer type: %s is not supported right now! \n Please use one of the following: SLSTM, BSLSTM, TANH, SIGMOID\n" %(hidden_layer_type[i]))
                sys.exit(1)

            self.rnn_layers.append(hidden_layer)
            self.params.extend(hidden_layer.params)

        input_size = hidden_layer_size[-1]
        if hidden_layer_type[-1]  == 'BSLSTM' or hidden_layer_type[-1]  == 'BLSTM':
            input_size = hidden_layer_size[-1]*2

        output_activation = output_type.lower()
        if output_activation == 'linear':
            self.final_layer = LinearLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out)
        elif output_activation == 'recurrent':
            self.final_layer = RecurrentOutputLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out, rnn_batch_training=self.rnn_batch_training)
        elif output_type.upper() in self.list_of_activations:
            self.final_layer = GeneralLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out, activation=output_activation)
        else:
            logger.critical("This output layer type: %s is not supported right now! \n Please use one of the following: LINEAR, BSLSTM\n" %(output_type))
            sys.exit(1)

        self.params.extend(self.final_layer.params)

        self.updates = {}
        for param in self.params:
            self.updates[param] = theano.shared(value = np.zeros(param.get_value(borrow = True).shape,
                                                dtype = theano.config.floatX), name = 'updates')

        if self.loss_function == 'CCE':
            self.finetune_cost = self.categorical_crossentropy_loss(self.final_layer.output, self.y) 
            self.errors        = self.categorical_crossentropy_loss(self.final_layer.output, self.y) 
        elif self.loss_function == 'Hinge':    
            self.finetune_cost = self.multiclass_hinge_loss(self.final_layer.output, self.y)
            self.errors        = self.multiclass_hinge_loss(self.final_layer.output, self.y)
        elif self.loss_function == 'MMSE':
            if self.rnn_batch_training:
                self.y_mod = T.reshape(self.y, (-1, n_out))
                self.final_layer_output = T.reshape(self.final_layer.output, (-1, n_out))

                nonzero_rows = T.any(self.y_mod, 1).nonzero()
            
                self.y_mod = self.y_mod[nonzero_rows]
                self.final_layer_output = self.final_layer_output[nonzero_rows]
            
                self.finetune_cost = T.mean(T.sum((self.final_layer_output - self.y_mod) ** 2, axis=1))
                self.errors = T.mean(T.sum((self.final_layer_output - self.y_mod) ** 2, axis=1))
            else:
                self.finetune_cost = T.mean(T.sum((self.final_layer.output - self.y) ** 2, axis=1))
                self.errors = T.mean(T.sum((self.final_layer.output - self.y) ** 2, axis=1))
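
The MMSE branch above drops all-zero (padded) target rows via T.any(...).nonzero() before computing the loss. A minimal standalone sketch of that trick (names and values are illustrative):

import numpy as np
import theano
import theano.tensor as T

y = T.matrix('y')         # (frames, n_out); all-zero rows are padding
y_hat = T.matrix('y_hat')

# Keep only rows where at least one target dimension is non-zero.
nonzero_rows = T.any(y, 1).nonzero()
mse = T.mean(T.sum((y_hat[nonzero_rows] - y[nonzero_rows]) ** 2, axis=1))
f = theano.function([y_hat, y], mse)

targets = np.asarray([[1., 2.], [0., 0.], [3., 4.]], dtype=theano.config.floatX)
preds = np.asarray([[1., 2.], [9., 9.], [3., 5.]], dtype=theano.config.floatX)
print(f(preds, targets))  # 0.5: the all-zero padding row is ignored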
Esempio n. 48
0
    def get_reward(self,session_states,session_actions,batch_i):
        """
        WARNING! this runs on a single session, not on a batch
        reward given for taking the action in current environment state
        arguments:
            session_states float[batch_id, memory_id]: environment state before taking action
            session_actions int[batch_id]: agent action at this tick
        returns:
            reward float[batch_id]: reward for taking action from the given state
        """
        # unpack states and actions
        session_states = check_list(session_states)[0]
        session_actions = check_list(session_actions)[0]
        
        
        time_range = T.arange(session_actions.shape[0])
        

        has_tried_already = session_states[time_range,session_actions]
        session_is_active = T.eq(session_states[:,self.end_action_id],0)
        
        has_finished_now = T.eq(session_actions,self.end_action_id)
        has_finished_now = T.set_subtensor(has_finished_now[-1],1)
        end_tick = has_finished_now.nonzero()[0][0]
        
        action_is_categorical = in1d(session_actions, self.category_action_ids)
                
        response = self.joint_data[batch_i,session_actions].ravel()
        
        at_least_one_category_guessed = T.any(action_is_categorical[:end_tick] & (response[:end_tick]>0))

        
        #categorical and attributes
        reward_for_intermediate_action = T.switch(
            action_is_categorical,
            response*(self.rw["category_positive"]-self.rw["category_negative"]) + self.rw["category_negative"],
            response*(self.rw["attribute_positive"]-self.rw["attribute_negative"]) + self.rw["attribute_negative"]
        )
        reward_for_intermediate_action_first_time = T.switch(
                has_tried_already,
                self.rw["repeated_poll"],
                reward_for_intermediate_action,
            )

        #ending session
        reward_for_end_action = T.switch(at_least_one_category_guessed, #if chosen at least 1 category
                                          self.rw["end_action"],   #do not penalize
                                          self.rw["end_action_if_no_category_predicted"])  #else punish
        
        #include end action
        reward_for_action = T.switch(
            has_finished_now,
            reward_for_end_action,
            reward_for_intermediate_action_first_time,
        )
        
        
        final_reward = T.switch(
            session_is_active,
            reward_for_action,
            0,
        )
        
        
        return final_reward.astype(theano.config.floatX)
Esempio n. 49
0
def unroll_scan(fn, sequences=(), outputs_info=(), non_sequences=(), n_steps=None,
                go_backwards=False):
  """
  Helper function to unroll for loops. Can be used to unroll theano.scan.
  The parameter names are identical to theano.scan, please refer to here
  for more information.

  Note that this function does not support the truncate_gradient
  setting from theano.scan.

  Code adapted from https://github.com/Lasagne/Lasagne.
  Thank you!

  Parameters
  ----------

  fn : function
      Function that defines calculations at each step.

  sequences : TensorVariable or list of TensorVariables
      List of TensorVariable with sequence data. The function iterates
      over the first dimension of each TensorVariable.

  outputs_info : list of TensorVariables
      List of tensors specifying the initial values for each recurrent
      value.

  non_sequences: list of TensorVariables
      List of theano.shared variables that are used in the step function.

  n_steps: int
      Number of steps to unroll.

  go_backwards: bool
      If true the recursion starts at sequences[-1] and iterates
      backwards.

  Returns
  -------
  Tuple of the form (outputs, updates).
  outputs is a list of TensorVariables. Each element in the list gives the recurrent
  values at each time step.
  updates is an empty dict for now.
  """
  if not isinstance(sequences, (list, tuple)):
    sequences = [sequences]
  sequences = list(sequences)
  outputs_info = list(outputs_info)
  non_sequences = list(non_sequences)

  # When backwards reverse the recursion direction
  counter = range(n_steps)
  if go_backwards:
    counter = counter[::-1]

  output = []
  prev_vals = outputs_info
  until = []
  for i in counter:
    assert len(prev_vals) == len(outputs_info)
    prev_vals = [prev for prev, out_info in zip(prev_vals, outputs_info) if out_info is not None]
    step_input = [s[i] for s in sequences] + prev_vals + non_sequences
    out_ = fn(*step_input)
    # The returned values from step can be either a TensorVariable,
    # a list, or a tuple.  Below, we force it to always be a list.
    if isinstance(out_, T.TensorVariable):
      out_ = [out_]
    if isinstance(out_, tuple):
      if len(out_) >= 1 and isinstance(out_[0], (list, tuple)):
        if len(out_) >= 2:
          assert not out_[1], "shared var updates not supported"
        if len(out_) >= 3:
          assert isinstance(out_[2], theano.scan_module.until)
          until.append(T.neq(out_[2].condition, 0))
        out_ = list(out_[0])
      else:
        out_ = list(out_)
    output.append(out_)

    prev_vals = output[-1]

  # iterate over each scan output and convert it to same format as scan:
  # [[output11, output12,...output1n],
  # [output21, output22,...output2n],...]
  output_scan = []
  for i in range(len(output[0])):
    l = map(lambda x: x[i], output)
    output_scan.append(T.stack(*l))

  if until:
    assert len(until) == n_steps
    until_conds = T.stack(*until)
    new_len = T.switch(T.any(until_conds),
                       T.minimum(T.argmax(until_conds) + 1, n_steps),
                       n_steps)
    output_scan = [out[:new_len] for out in output_scan]

  if len(output_scan) == 1:
    output_scan = output_scan[0]
  updates = {}
  return output_scan, updates
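
A minimal use of this helper, assuming unroll_scan as defined above is in scope (the step function and values are illustrative):

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
step = lambda x_t, acc: acc + x_t   # same step function you would hand to theano.scan

init = T.constant(np.asarray(0.0, dtype=theano.config.floatX))
outputs, _ = unroll_scan(step, sequences=[x], outputs_info=[init], n_steps=4)

f = theano.function([x], outputs)
print(f(np.asarray([1., 2., 3., 4.], dtype=theano.config.floatX)))  # [ 1.  3.  6. 10.]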
Esempio n. 50
0
    def __init__(self, n_in, hidden_layer_size, n_out, L1_reg, L2_reg, hidden_layer_type, output_type='LINEAR', network_type='S2S', ed_type='HED', dropout_rate=0.0, optimizer='sgd', MLU_div_lengths = [], loss_function='MMSE', rnn_batch_training=False):
        """ This function initialises a neural network

        :param n_in: Dimensionality of input features
        :type n_in: Integer
        :param hidden_layer_size: The layer size for each hidden layer
        :type hidden_layer_size: A list of integers
        :param n_out: Dimensionality of output features
        :type n_out: Integer
        :param hidden_layer_type: the activation type of each hidden layer, e.g., TANH, LSTM, GRU, BLSTM
        :param L1_reg: the L1 regularisation weight
        :param L2_reg: the L2 regularisation weight
        :param output_type: the activation type of the output layer; 'LINEAR' (linear regression) by default.
        :param dropout_rate: probability of dropout, a float between 0 and 1.
        """

        logger = logging.getLogger("DNN initialization")

        self.n_in = int(n_in)
        self.n_out = int(n_out)

        self.n_layers = len(hidden_layer_size)

        self.dropout_rate = dropout_rate
        self.optimizer = optimizer
        self.loss_function = loss_function
        self.is_train = T.iscalar('is_train')
        self.rnn_batch_training = rnn_batch_training

        assert len(hidden_layer_size) == len(hidden_layer_type)

        self.list_of_activations = ['TANH', 'SIGMOID', 'SOFTMAX', 'RELU', 'RESU']
        
        BLSTM_variants   = ['BLSTM', 'BSLSTM', 'BLSTME', 'BSLSTME']
        Encoder_variants = ['RNNE', 'LSTME', 'BLSTME', 'SLSTME', 'TANHE']
        Decoder_variants = ['RNND', 'LSTMD', 'SLSTMD']

        if self.rnn_batch_training:
            self.x = T.tensor3('x')
            self.y = T.tensor3('y')
        else:
            self.x = T.matrix('x')
            self.y = T.matrix('y')
        
        if network_type == "S2S":
            self.d = T.ivector('d')
            self.f = T.matrix('f')

        self.L1_reg = L1_reg
        self.L2_reg = L2_reg

        self.rnn_layers = []
        self.params = []
        self.delta_params = []

        rng = np.random.RandomState(123)

        prev_seg_end = 0
        encoder_count = 0
        MLU_div = MLU_div_lengths
        for i in range(self.n_layers):
            if i == 0:
                input_size = n_in
            else:
                input_size = hidden_layer_size[i-1]
                if hidden_layer_type[i-1] in BLSTM_variants:
                    input_size = hidden_layer_size[i-1]*2

            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.rnn_layers[i-1].output
            
            ### sequence-to-sequence mapping ###
            if hidden_layer_type[i-1] in Encoder_variants:
                dur_input        = self.d
                frame_feat_input = self.f

                # vanilla encoder-decoder (phone-level features)
                if ed_type == "VED":
                    seq2seq_model = DistributedSequenceEncoder(rng, layer_input, dur_input)
                    layer_input   = T.concatenate((seq2seq_model.encoded_output, frame_feat_input), axis=1)
                    input_size    = input_size+4
                # hierarchical encoder-decoder
                elif ed_type == "HED":
                    seg_len       = layer_input.size//input_size
                    seg_dur_input = dur_input[prev_seg_end: prev_seg_end+seg_len]
                    num_of_segs   = T.sum(seg_dur_input)
                    seq2seq_model = DistributedSequenceEncoder(rng, layer_input, seg_dur_input)
                    addfeat_input = frame_feat_input[0:num_of_segs, MLU_div[encoder_count]:MLU_div[encoder_count+1]]  
                    layer_input   = T.concatenate((seq2seq_model.encoded_output, addfeat_input), axis=1)
                    input_size    = input_size + (MLU_div[encoder_count+1]-MLU_div[encoder_count])
                    prev_seg_end  = prev_seg_end + seg_len
                    encoder_count = encoder_count + 1

            # hidden layer activation
            if hidden_layer_type[i] in self.list_of_activations:
                hidden_activation = hidden_layer_type[i].lower()
                hidden_layer = GeneralLayer(rng, layer_input, input_size, hidden_layer_size[i], activation=hidden_activation, p=self.dropout_rate, training=self.is_train)
            elif hidden_layer_type[i] == 'TANHE' or hidden_layer_type[i] == 'SIGMOIDE':
                hidden_activation = hidden_layer_type[i][0:-1].lower()
                hidden_layer = GeneralLayer(rng, layer_input, input_size, hidden_layer_size[i], activation=hidden_activation, p=self.dropout_rate, training=self.is_train)
            elif hidden_layer_type[i] == 'TANH_LHUC':
                hidden_layer = SigmoidLayer_LHUC(rng, layer_input, input_size, hidden_layer_size[i], activation=T.tanh, p=self.dropout_rate, training=self.is_train)
            elif hidden_layer_type[i] == 'SLSTM' or hidden_layer_type[i] == 'SLSTME':
                hidden_layer = SimplifiedLstm(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'SLSTMD':
                hidden_layer = SimplifiedLstmDecoder(rng, layer_input, input_size, hidden_layer_size[i], self.n_out, p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'SGRU':
                hidden_layer = SimplifiedGRU(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'GRU':
                hidden_layer = GatedRecurrentUnit(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM' or hidden_layer_type[i] == 'LSTME':
                hidden_layer = VanillaLstm(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTMD':
                hidden_layer = VanillaLstmDecoder(rng, layer_input, input_size, hidden_layer_size[i], self.n_out, p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'BSLSTM' or hidden_layer_type[i] == 'BSLSTME':
                hidden_layer = BidirectionSLstm(rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'BLSTM' or hidden_layer_type[i] == 'BLSTME':
                hidden_layer = BidirectionLstm(rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'RNN' or hidden_layer_type[i] == 'RNNE':
                hidden_layer = VanillaRNN(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'RNND':
                hidden_layer = VanillaRNNDecoder(rng, layer_input, input_size, hidden_layer_size[i], self.n_out, p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM_LHUC':
                hidden_layer = VanillaLstm_LHUC(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training)
            else:
                logger.critical("This hidden layer type: %s is not supported right now! \n Please use one of the following: SLSTM, BSLSTM, TANH, SIGMOID\n" %(hidden_layer_type[i]))
                sys.exit(1)

            self.rnn_layers.append(hidden_layer)
            self.params.extend(hidden_layer.params)

        input_size = hidden_layer_size[-1]
        if hidden_layer_type[-1] in BLSTM_variants:
            input_size = hidden_layer_size[-1]*2

        if hidden_layer_type[-1] in Decoder_variants:
            self.final_layer = self.rnn_layers[-1]
        else:
            output_activation = output_type.lower()
            if output_activation == 'linear':
                self.final_layer = LinearLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out)
            elif output_activation == 'recurrent':
                self.final_layer = RecurrentOutputLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out, rnn_batch_training=self.rnn_batch_training)
            elif output_type.upper() in self.list_of_activations:
                self.final_layer = GeneralLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out, activation=output_activation)
            else:
                logger.critical("This output layer type: %s is not supported right now! \n Please use one of the following: LINEAR, BSLSTM\n" %(output_type))
                sys.exit(1)

            self.params.extend(self.final_layer.params)

        self.updates = {}
        for param in self.params:
            self.updates[param] = theano.shared(value = np.zeros(param.get_value(borrow = True).shape,
                                                dtype = theano.config.floatX), name = 'updates')

        if self.loss_function == 'CCE':
            self.finetune_cost = self.categorical_crossentropy_loss(self.final_layer.output, self.y) 
            self.errors        = self.categorical_crossentropy_loss(self.final_layer.output, self.y) 
        elif self.loss_function == 'Hinge':    
            self.finetune_cost = self.multiclass_hinge_loss(self.final_layer.output, self.y)
            self.errors        = self.multiclass_hinge_loss(self.final_layer.output, self.y)
        elif self.loss_function == 'MMSE':
            if self.rnn_batch_training:
                self.y_mod = T.reshape(self.y, (-1, n_out))
                self.final_layer_output = T.reshape(self.final_layer.output, (-1, n_out))

                nonzero_rows = T.any(self.y_mod, 1).nonzero()
            
                self.y_mod = self.y_mod[nonzero_rows]
                self.final_layer_output = self.final_layer_output[nonzero_rows]
            
                self.finetune_cost = T.mean(T.sum((self.final_layer_output - self.y_mod) ** 2, axis=1))
                self.errors = T.mean(T.sum((self.final_layer_output - self.y_mod) ** 2, axis=1))
            else:
                self.finetune_cost = T.mean(T.sum((self.final_layer.output - self.y) ** 2, axis=1))
                self.errors = T.mean(T.sum((self.final_layer.output - self.y) ** 2, axis=1))
Esempio n. 51
0
    def compute_step(self, param, previous_step):
        not_finite = tensor.any(tensor.or_(
            tensor.isnan(previous_step), tensor.isinf(previous_step)))
        step = tensor.switch(not_finite, self.scaler * param, previous_step)

        return step, []
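
In isolation, this step rule behaves as below (a sketch; self.scaler is replaced by an illustrative constant):

import numpy as np
import theano
import theano.tensor as T

scaler = 0.1  # illustrative stand-in for self.scaler

param = T.vector('param')
previous_step = T.vector('previous_step')

not_finite = T.any(T.or_(T.isnan(previous_step), T.isinf(previous_step)))
# Fall back to a small multiple of the parameter when the proposed step is not finite.
step = T.switch(not_finite, scaler * param, previous_step)
f = theano.function([param, previous_step], step)

p = np.asarray([1.0, -2.0], dtype=theano.config.floatX)
print(f(p, np.asarray([0.5, 0.5], dtype=theano.config.floatX)))     # [0.5 0.5]
print(f(p, np.asarray([np.nan, 0.5], dtype=theano.config.floatX)))  # [ 0.1 -0.2]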
Esempio n. 52
0
def logpow(x, m):
    """
    Calculates log(x**m), since m * log(x) will fail when both m and x are 0.
    """
    # return m * log(x)
    return switch(any(eq(x, 0)), -inf, m * log(x))
Esempio n. 53
0
File: core.py Progetto: nehz/keras
 def get_output(self, train=False):
     X = self.get_input(train)
     return X * T.shape_padright(T.any((1.0 - T.eq(X, self.mask_value)), axis=-1))
Esempio n. 54
0
    def build(self):
        # (batch_size, max_example_action_num, action_type)
        tgt_action_seq = ndim_itensor(3, 'tgt_action_seq')

        # (batch_size, max_example_action_num, action_type)
        tgt_action_seq_type = ndim_itensor(3, 'tgt_action_seq_type')

        # (batch_size, max_example_action_num)
        tgt_node_seq = ndim_itensor(2, 'tgt_node_seq')

        # (batch_size, max_example_action_num)
        tgt_par_rule_seq = ndim_itensor(2, 'tgt_par_rule_seq')

        # (batch_size, max_example_action_num)
        tgt_par_t_seq = ndim_itensor(2, 'tgt_par_t_seq')

        # (batch_size, max_example_action_num, symbol_embed_dim)
        # tgt_node_embed = self.node_embedding(tgt_node_seq, mask_zero=False)
        tgt_node_embed = self.node_embedding[tgt_node_seq]

        # (batch_size, max_query_length)
        query_tokens = ndim_itensor(2, 'query_tokens')

        # (batch_size, max_query_length, query_token_embed_dim)
        # (batch_size, max_query_length)
        query_token_embed, query_token_embed_mask = self.query_embedding(query_tokens, mask_zero=True)

        # if WORD_DROPOUT > 0:
        #     logging.info('used word dropout for source, p = %f', WORD_DROPOUT)
        #     query_token_embed, query_token_embed_intact = WordDropout(WORD_DROPOUT, self.srng)(query_token_embed, False)

        batch_size = tgt_action_seq.shape[0]
        max_example_action_num = tgt_action_seq.shape[1]

        # previous action embeddings
        # (batch_size, max_example_action_num, action_embed_dim)
        tgt_action_seq_embed = T.switch(T.shape_padright(tgt_action_seq[:, :, 0] > 0),
                                        self.rule_embedding_W[tgt_action_seq[:, :, 0]],
                                        self.vocab_embedding_W[tgt_action_seq[:, :, 1]])

        tgt_action_seq_embed_tm1 = tensor_right_shift(tgt_action_seq_embed)

        # parent rule application embeddings
        tgt_par_rule_embed = T.switch(tgt_par_rule_seq[:, :, None] < 0,
                                      T.alloc(0., 1, config.rule_embed_dim),
                                      self.rule_embedding_W[tgt_par_rule_seq])

        if not config.frontier_node_type_feed:
            tgt_node_embed *= 0.

        if not config.parent_action_feed:
            tgt_par_rule_embed *= 0.

        # (batch_size, max_example_action_num, action_embed_dim + symbol_embed_dim + action_embed_dim)
        decoder_input = T.concatenate([tgt_action_seq_embed_tm1, tgt_node_embed, tgt_par_rule_embed], axis=-1)

        # (batch_size, max_query_length, query_embed_dim)
        query_embed = self.query_encoder_lstm(query_token_embed, mask=query_token_embed_mask,
                                              dropout=config.dropout, srng=self.srng)

        # (batch_size, max_example_action_num)
        tgt_action_seq_mask = T.any(tgt_action_seq_type, axis=-1)
        
        # decoder_hidden_states: (batch_size, max_example_action_num, lstm_hidden_state)
        # ctx_vectors: (batch_size, max_example_action_num, encoder_hidden_dim)
        decoder_hidden_states, _, ctx_vectors = self.decoder_lstm(decoder_input,
                                                                  context=query_embed,
                                                                  context_mask=query_token_embed_mask,
                                                                  mask=tgt_action_seq_mask,
                                                                  parent_t_seq=tgt_par_t_seq,
                                                                  dropout=config.dropout,
                                                                  srng=self.srng)

        # if DECODER_DROPOUT > 0:
        #     logging.info('used dropout for decoder output, p = %f', DECODER_DROPOUT)
        #     decoder_hidden_states = Dropout(DECODER_DROPOUT, self.srng)(decoder_hidden_states)

        # ====================================================
        # apply additional non-linearity transformation before
        # predicting actions
        # ====================================================

        decoder_hidden_state_trans_rule = self.decoder_hidden_state_W_rule(decoder_hidden_states)
        decoder_hidden_state_trans_token = self.decoder_hidden_state_W_token(T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1))

        # (batch_size, max_example_action_num, rule_num)
        rule_predict = softmax(T.dot(decoder_hidden_state_trans_rule, T.transpose(self.rule_embedding_W)) + self.rule_embedding_b)

        # (batch_size, max_example_action_num, 2)
        terminal_gen_action_prob = self.terminal_gen_softmax(decoder_hidden_states)

        # (batch_size, max_example_action_num, target_vocab_size)
        vocab_predict = softmax(T.dot(decoder_hidden_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b)

        # (batch_size, max_example_action_num, lstm_hidden_state + encoder_hidden_dim)
        ptr_net_decoder_state = T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1)

        # (batch_size, max_example_action_num, max_query_length)
        copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask, ptr_net_decoder_state)

        # (batch_size, max_example_action_num)
        rule_tgt_prob = rule_predict[T.shape_padright(T.arange(batch_size)),
                                     T.shape_padleft(T.arange(max_example_action_num)),
                                     tgt_action_seq[:, :, 0]]

        # (batch_size, max_example_action_num)
        vocab_tgt_prob = vocab_predict[T.shape_padright(T.arange(batch_size)),
                                       T.shape_padleft(T.arange(max_example_action_num)),
                                       tgt_action_seq[:, :, 1]]

        # (batch_size, max_example_action_num)
        copy_tgt_prob = copy_prob[T.shape_padright(T.arange(batch_size)),
                                  T.shape_padleft(T.arange(max_example_action_num)),
                                  tgt_action_seq[:, :, 2]]


        # (batch_size, max_example_action_num)
        tgt_prob = tgt_action_seq_type[:, :, 0] * rule_tgt_prob + \
                   tgt_action_seq_type[:, :, 1] * terminal_gen_action_prob[:, :, 0] * vocab_tgt_prob + \
                   tgt_action_seq_type[:, :, 2] * terminal_gen_action_prob[:, :, 1] * copy_tgt_prob

        likelihood = T.log(tgt_prob + 1.e-7 * (1 - tgt_action_seq_mask))
        loss = - (likelihood * tgt_action_seq_mask).sum(axis=-1) # / tgt_action_seq_mask.sum(axis=-1)
        loss = T.mean(loss)

        # let's build the function!
        train_inputs = [query_tokens, tgt_action_seq, tgt_action_seq_type,
                        tgt_node_seq, tgt_par_rule_seq, tgt_par_t_seq]
        optimizer = optimizers.get(config.optimizer)
        optimizer.clip_grad = config.clip_grad
        updates, grads = optimizer.get_updates(self.params, loss)
        self.train_func = theano.function(train_inputs, [loss],
                                          # [loss, tgt_action_seq_type, tgt_action_seq,
                                          #  rule_tgt_prob, vocab_tgt_prob, copy_tgt_prob,
                                          #  copy_prob, terminal_gen_action_prob],
                                          updates=updates)

        # if WORD_DROPOUT > 0:
        #     self.build_decoder(query_tokens, query_token_embed_intact, query_token_embed_mask)
        # else:
        #     self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask)

        self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask)