Example 1
 def call(self, x, mask=None):
     if K.backend() == 'theano':
         return K.softplus(
             K.pattern_broadcast(self.beta, self.param_broadcast) *
             x) * K.pattern_broadcast(self.alpha, self.param_broadcast)
     else:
         return K.softplus(self.beta * x) * self.alpha
Example 2
    def call(self, inputs, **kwargs):
        kernel_sigma = K.softplus(self.kernel_rho)
        kernel = self.kernel_mu + kernel_sigma * K.random_normal(
            self.kernel_mu.shape)

        bias_sigma = K.softplus(self.bias_rho)
        bias = self.bias_mu + bias_sigma * K.random_normal(self.bias_mu.shape)

        self.add_loss(
            self.kl_loss(kernel, self.kernel_mu, kernel_sigma) +
            self.kl_loss(bias, self.bias_mu, bias_sigma))

        return self.activation(K.dot(inputs, kernel) + bias)
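The sampling above is the usual reparameterisation trick for Bayesian dense layers: softplus keeps the standard deviation positive while rho stays unconstrained. A minimal NumPy sketch of that step (illustrative, not part of the example):

import numpy as np

rng = np.random.default_rng(0)
mu, rho = 0.3, -2.0                       # unconstrained parameters
sigma = np.log1p(np.exp(rho))             # softplus(rho) > 0
w = mu + sigma * rng.standard_normal()    # one reparameterised weight sample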
Example 3
    def call(self, x):
        # Construct the pairwise distance matrix
        D = pairwise_dists(x, x, epsilon=self.epsilon)
        # get the max intra-class distance for each sample
        max_pos = [
            K.max(K.tf.slice(D,
                             begin=[i * self.k, i * self.k],
                             size=[self.k, self.k]),
                  axis=1) for i in range(self.p)
        ]
        max_pos = K.concatenate(max_pos, axis=0)
        # get the min inter-class distance for each sample
        min_neg = []
        for i in range(self.p):
            left = K.tf.slice(D,
                              begin=[i * self.k, 0],
                              size=[self.k, i * self.k])
            right = K.tf.slice(D,
                               begin=[i * self.k, (i + 1) * self.k],
                               size=[self.k, (self.p - i - 1) * self.k])
            min_neg.append(K.min(K.concatenate([left, right], axis=1), axis=1))
        min_neg = K.concatenate(min_neg, axis=0)

        if self.use_softplus:
            return K.mean(K.softplus(self.margin + max_pos - min_neg))
        else:
            return K.mean(K.relu(self.margin + max_pos - min_neg))
Example 4
def custom_loss(y_true, y_pred):
    coefs = [
        7.718, 2.1184316, 1.7462137, 2.7549687, 4.7066404, 7.6163553, 11.723778
    ]

    pt_true = y_true[2]
    loss_total = (y_pred - y_true + K.softplus(-2. * (y_pred - y_true)) -
                  K.log(2.))
    loss = K.switch(
        tf.math.logical_and(tf.greater(pt_true, 3), tf.less(pt_true, 4)),
        tf.math.multiply(loss_total, coefs[0]),
        K.switch(
            tf.less(pt_true, 5), tf.math.multiply(loss_total, coefs[1]),
            K.switch(
                tf.less(pt_true, 6), tf.math.multiply(loss_total, coefs[2]),
                K.switch(
                    tf.less(pt_true, 7),
                    tf.math.multiply(loss_total, coefs[3]),
                    K.switch(
                        tf.less(pt_true, 8),
                        tf.math.multiply(loss_total, coefs[4]),
                        K.switch(
                            tf.less(pt_true, 9),
                            tf.math.multiply(loss_total, coefs[5]),
                            K.switch(tf.less(pt_true, 10),
                                     tf.math.multiply(loss_total, coefs[6]),
                                     loss_total)))))))

    loss = K.mean(loss)
    return loss
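The expression for loss_total above is the numerically stable form of the log-cosh error, since log(cosh(d)) = d + softplus(-2d) - log(2). A quick NumPy check of that identity (illustrative, not from the source):

import numpy as np

d = np.linspace(-5, 5, 11)
stable = d + np.log1p(np.exp(-2.0 * d)) - np.log(2.0)   # d + softplus(-2d) - log 2
assert np.allclose(stable, np.log(np.cosh(d)))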
Example 5
def word2vec_loss(y_true, y_pred):
    # y_true is label (0 or 1)
    # y_pred is the dot prod

    # 0 / 1 -> 1. -> -1.
    a = (K.cast(y_true, dtype='float32') * 2 - 1.0) * (-1.0)
    return K.softplus(a * y_pred)
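The softplus here reproduces binary cross-entropy on the logit: with a = -(2*y - 1), softplus(a*z) equals -log(sigmoid(z)) for label 1 and -log(1 - sigmoid(z)) for label 0. A small NumPy check (illustrative):

import numpy as np

def softplus(t):
    return np.maximum(t, 0.0) + np.log1p(np.exp(-np.abs(t)))  # stable log(1 + e^t)

z, y = 1.7, 1.0
a = (y * 2 - 1.0) * (-1.0)
assert np.isclose(softplus(a * z), -np.log(1.0 / (1.0 + np.exp(-z))))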
Example 6
def mish(x, fast=False):
    if fast:  # faster but requires extra storage
        y = K.exp(-x)
        z = 1 + 2 * y
        return x * z / (z + 2 * y * y)
    #return x * tf.math.tanh(tf.math.softplus(x))
    return x * K.tanh(K.softplus(x))
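The fast branch is an algebraic rewrite of x * tanh(softplus(x)): with y = exp(-x), tanh(log(1 + exp(x))) = (1 + 2y) / (1 + 2y + 2y^2). A NumPy check of the equivalence (illustrative):

import numpy as np

x = np.linspace(-4, 4, 9)
y = np.exp(-x)
z = 1 + 2 * y
assert np.allclose(x * z / (z + 2 * y * y), x * np.tanh(np.log1p(np.exp(x))))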
Example 7
def discriminate_real(y_output, batch_size=batch_size):
    # logD(x) = logZ(x) - log(Z(x) + 1)  where Z(x) = sum_{k=1}^K exp(l_k(x))
    log_zx = K.logsumexp(y_output, axis=1)
    log_dx = log_zx - K.softplus(log_zx)
    dx = K.sum(K.exp(log_dx)) / batch_size
    loss = -K.sum(log_dx) / batch_size
    return loss, dx
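The line log_dx = log_zx - K.softplus(log_zx) relies on log(Z + 1) = log(1 + exp(log Z)) = softplus(log Z), which gives the log D(x) = log Z(x) - log(Z(x) + 1) stated in the comment. A NumPy check (illustrative):

import numpy as np

log_z = 2.5
assert np.isclose(np.log(np.exp(log_z) + 1.0), np.log1p(np.exp(log_z)))  # softplus(log Z)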
Example 8
    def _get_weight_vector(self, M, w, head):
        cur = 0
        # split everything out
        k = head[:, cur:self.M]
        cur += self.M

        b = head[:, cur]
        cur += 1

        g = head[:, cur]
        cur += 1

        s = head[:, cur:cur + self.num_shift]
        cur += self.num_shift

        t = head[:, cur]

        # do the activations of the head
        # ref: https://blog.wtf.sg/2014/11/09/neural-turing-machines-implementation-hell/
        b = K.exp(b)
        g = K.sigmoid(g)
        s = K.softmax(s)
        t = K.softplus(t) + 1

        # DEBUG-ing purpose:
        # for _ in ['M','w','k','b','g','s','t']:
        #     print(_,eval(_))

        weight = _get_weight(M, w, k, b, g, s, t)
        return weight
Example 9
def neg_log_likelihood(y_true, y_pred):
    y_true = y_true[:, 0]
    mean = y_pred[:, 0]
    variance = K.softplus(y_pred[:, 1]) + 1e-6
    log_variance = K.log(variance)
    return 0.5 * K.mean(log_variance, axis=-1) + 0.5 * K.mean(
        K.square(y_true - mean) / variance, axis=-1) + 0.5 * K.log(2 * np.pi)
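Up to the tensor slicing, this is the closed-form Gaussian negative log-likelihood 0.5*log(var) + 0.5*(y - mean)^2/var + 0.5*log(2*pi), with softplus keeping the variance positive. A scalar NumPy check (illustrative):

import numpy as np

y, mu, var = 1.2, 0.9, 0.5
nll = 0.5 * np.log(var) + 0.5 * (y - mu) ** 2 / var + 0.5 * np.log(2 * np.pi)
pdf = np.exp(-(y - mu) ** 2 / (2 * var)) / np.sqrt(2 * np.pi * var)
assert np.isclose(nll, -np.log(pdf))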
Example 10
def dense(x, w, b, act):
    x = K.dot(x, w)
    if b:
        x = K.bias_add(x, b)
    act = act.lower().strip()
    if act == 'softmax':
        x = K.softmax(x)
    elif act == 'elu':
        x = K.elu(x)
    elif act == 'gelu':
        x = 0.5 * x * (1 + K.tanh(
            np.sqrt(2 / np.pi) * (x + 0.044715 * K.pow(x, 3))))
    elif act == 'selu':
        alpha = 1.6732632423543772848170429916717
        scale = 1.0507009873554804934193349852946
        x = scale * K.elu(x, alpha)
    elif act == 'softplus':
        x = K.softplus(x)
    elif act == 'softsign':
        x = K.softsign(x)
    elif act == 'relu':
        x = K.relu(x)
    elif act == 'leaky_relu':
        x = K.relu(x, alpha=0.01)
    elif act == 'tanh':
        x = K.tanh(x)
    elif act == 'sigmoid':
        x = K.sigmoid(x)
    elif act == 'hard_sigmoid':
        x = K.hard_sigmoid(x)
    return x
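The 'gelu' branch is the common tanh approximation of GELU(x) = x * Phi(x), where Phi is the standard normal CDF. A NumPy comparison against the exact form (illustrative; the two agree to well under 1e-2 on this grid):

import numpy as np
from math import erf

x = np.linspace(-3, 3, 13)
exact = x * 0.5 * (1 + np.array([erf(v / np.sqrt(2)) for v in x]))
approx = 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x ** 3)))
assert np.max(np.abs(exact - approx)) < 1e-2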
Example 11
def pinball_loss(tau, y, q, alpha=0.01, smooth_loss=1, kappa=0, margin=0):
    error = (y - q)
    diff = q[:, 1:] - q[:, :-1]

    if smooth_loss == 0:  # pinball function
        quantile_loss = K.mean(K.maximum(tau * error, (tau - 1) * error))
    elif smooth_loss == 1:  # smooth pinball function
        quantile_loss = K.mean(tau * error +
                               alpha * K.softplus(-error / alpha))
    elif smooth_loss == 2:  # huber norm approximation
        epsilon = 2**-8

        # if K.abs(error) > epsilon:
        #     u = K.abs(error) - epsilon / 2
        # else:
        #     u = (error**2) / (2 * epsilon)

        logic = K.cast((K.abs(error) > epsilon), dtype='float64')

        u = (K.abs(error) - epsilon / 2) * logic + (
            (error**2) / (2 * epsilon)) * (1 - logic)

        quantile_loss = K.mean(K.maximum(tau * u, (tau - 1) * u))

    # penalty = -kappa * K.mean(alpha2*K.softplus(-diff / alpha2))
    # penalty = K.mean(K.maximum(tf.Variable(tf.zeros([1], dtype=tf.float64)), margin - diff)) * kappa
    penalty = kappa * K.mean(
        tf.square(
            K.maximum(tf.Variable(tf.zeros([1], dtype=tf.float64)),
                      margin - diff)))

    return quantile_loss + penalty
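In the smooth_loss == 1 branch, tau*error + alpha*softplus(-error/alpha) converges to the ordinary pinball loss max(tau*e, (tau - 1)*e) as alpha -> 0. A NumPy check with a small alpha (illustrative):

import numpy as np

tau, e, alpha = 0.9, np.array([-1.0, 0.3, 2.0]), 1e-4
smooth = tau * e + alpha * np.logaddexp(0.0, -e / alpha)   # logaddexp(0, t) == softplus(t)
assert np.allclose(smooth, np.maximum(tau * e, (tau - 1) * e), atol=1e-3)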
Example 12
def activate(ab):
    a = k.exp(ab[:, 0])
    b = k.softplus(ab[:, 1])

    a = k.reshape(a, (k.shape(a)[0], 1))
    b = k.reshape(b, (k.shape(b)[0], 1))

    return k.concatenate((a, b), axis=1)
Example 13
def softplus(x):
    """
    Softplus activation function.

    >>> round(softplus(0), 1)
    0.7
    """
    return K.eval(K.softplus(K.variable(x))).tolist()
Example 14
 def call(self, x):
     if not self.afixed:
         aloc = K.softplus(self.a - 4) * x
     else:
         aloc = self.a * x
     if self.useb:
         aloc += self.b
     return aloc
Example 15
def softplus(x):
    """
    Softplus activation function.

    >>> round(softplus(0), 1)
    0.7
    """
    return K.eval(K.softplus(K.variable(x))).tolist()
Example 16
def neg_log_likelihood(truth_n, pred_nx2):
    truth_n = truth_n[:, 0]
    mean_n = pred_nx2[:, 0]
    var_n = K.softplus(pred_nx2[:, 1]) + 1e-6
    logvar_n = K.log(var_n)
    nll_n = 0.5 * K.mean(logvar_n, axis=-1) + 0.5 * K.mean(K.square(truth_n - mean_n) / var_n, axis=-1) + \
            0.5 * K.log(2 * np.pi)
    return nll_n
Example 17
    def getOptmizer(self):

        adamOptmizer = Adam(lr=self.learning_rate)

        state = K.placeholder(shape=(None, 28))
        nextState = K.placeholder(shape=(None, 28))
        actionProb = K.placeholder(shape=(None, 200))

        state_d = K.placeholder(shape=(None, 28))
        nextState_d = K.placeholder(shape=(None, 28))
        actionProb_d = K.placeholder(shape=(None, 200))

        gamma = K.variable(self.gamma)

        stateValues = K.function([self.actor.input], self.actor.output)
        rewardValue = K.function([self.rewardNetwork.input],
                                 self.rewardNetwork.output)

        reward = rewardValue(state)
        stateValue = stateValues(state)
        nextStateValue = stateValues(nextState)

        reward_d = rewardValue(state_d)
        stateValue_d = stateValues(state_d)
        nextStateValue_d = stateValues(nextState_d)

        logits = reward + gamma * nextStateValue - stateValue - actionProb
        logits_d = reward_d + gamma * nextStateValue_d - stateValue_d - actionProb_d

        loss = K.mean(K.softplus(-(logits))) + K.mean(K.softplus((logits_d)))

        updatesOnline = adamOptmizer.get_updates(self.actor.trainable_weights,
                                                 [], loss)
        updatesReward = adamOptmizer.get_updates(
            self.rewardNetwork.trainable_weights, [], loss)

        self.updateOnline = K.function(
            [state, nextState, actionProb, state_d, nextState_d, actionProb_d],
            loss,
            updates=updatesOnline)
        self.updateReward = K.function(
            [state, nextState, actionProb, state_d, nextState_d, actionProb_d],
            loss,
            updates=updatesReward)
Example 18
def kokon_loss(y_true, y):
    dim0 = K.shape(y)[0]
    loss = tf.fill(tf.stack([dim0]), 0.0)
    for i in range(18):
        for j in range(18):
            #for j in range(i+1,18):
            softplus = K.softplus(y[:, j] - y[:, i])
            tanh = K.tanh(y_true[:, j] - y_true[:, i])
            loss = loss + K.clip(18 + softplus * tanh, 0.0, 18.0)
    loss = K.reshape(loss, [-1, 1]) / (18 * 18)
    return loss
Example 19
 def call(self, x):
     z, log_alpha, log_beta = x
     log_alpha = K.clip(log_alpha, -64, 64)
     log_beta = K.clip(log_beta, -64, 64)
     alpha = K.exp(log_alpha)
     beta = K.exp(log_beta)
     a = K.softplus(self.a - 4)
     loss = -alpha * log_beta + (
         alpha + z / a) * tf.log(1 + beta) - tf.lgamma(
             alpha + z / a) + tf.lgamma(alpha) + tf.lgamma(z / a + 1)
     posterior_mean = (a * alpha + z) / (beta + 1)
     return K.concatenate([loss, posterior_mean], axis=-1)
Example 20
def D_logistic_simplegp(x_real_score,
                        x_fake_score,
                        x_real,
                        x_fake,
                        r1_gamma=10.0,
                        r2_gamma=0.0):
    d_loss = K.mean(K.softplus(x_fake_score) - K.softplus(x_real_score))

    if r1_gamma != 0.0:
        with K.name_scope('R1Penalty'):
            r1_grads = K.gradients(x_real_score, [x_real])[0]
            r1_grads_norms = K.sqrt(K.sum(r1_grads**2, axis=[1, 2, 3]) + 1e-8)
        d_loss += r1_grads_norms * (r1_gamma * 0.5)

    if r2_gamma != 0.0:
        with K.name_scope('R2Penalty'):
            r2_grads = K.gradients(x_fake_score, [x_fake])[0]
            r2_grads_norms = K.sqrt(K.sum(r2_grads**2, axis=[1, 2, 3]) + 1e-8)
        d_loss += r2_grads_norms * (r2_gamma * 0.5)

    return d_loss
Example 21
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.learning_rate
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p),
              dtype=K.dtype(p),
              name='m_' + str(i))
              for (i, p) in enumerate(params)]
        vs = [K.zeros(K.int_shape(p),
              dtype=K.dtype(p),
              name='v_' + str(i))
              for (i, p) in enumerate(params)]

        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p),
                     dtype=K.dtype(p),
                     name='vhat_' + str(i))
                     for (i, p) in enumerate(params)]
        else:
            vhats = [K.zeros(1, name='vhat_' + str(i))
                     for i in range(len(params))]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / K.softplus(K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example 22
def triplet_loss(_, y_pred):
    '''
        Assume: y_pred shape is (batch_size, 2)
    '''
    margin = K.constant(triplet_margin)

    subtraction = K.constant([1, -1], shape=(2, 1))
    diff = K.dot(K.square(y_pred), subtraction)

    #loss = K.maximum(K.constant(0), margin + diff)
    loss = K.softplus(diff)

    return loss
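K.softplus(diff) acts here as a smooth, margin-free hinge (the "soft" variant of the commented-out max(0, margin + diff)): softplus(d) >= max(0, d) everywhere and the gap vanishes for large |d|. A NumPy check (illustrative):

import numpy as np

d = np.linspace(-6, 6, 13)
sp = np.logaddexp(0.0, d)                        # softplus(d)
assert np.all(sp >= np.maximum(0.0, d))
assert np.allclose(sp[[0, -1]], np.maximum(0.0, d)[[0, -1]], atol=5e-3)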
Example 23
def transform_z0(args):
    z0, w, u, b = args
    b2 = K.squeeze(b, 1)
    beta = K.sum(tf.multiply(w, z0), 1)

    # change u2 so that the transformation z0->z1 is invertible
    alpha = K.sum(tf.multiply(w, u), 1)
    diag1 = tf.diag(K.softplus(alpha) - 1 - alpha)
    u2 = u + K.dot(diag1, w) / K.sum(K.square(w)+1e-7)
    diag2 = tf.diag(K.tanh(beta + b2))

    # generate z1
    z1 = z0 + K.dot(diag2, u2)
    return z1
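The u -> u2 adjustment matches the planar normalizing-flow construction: it enforces w.u2 = softplus(w.u) - 1 > -1, the invertibility condition for z -> z + u*tanh(w.z + b). A per-sample NumPy sketch (illustrative):

import numpy as np

rng = np.random.default_rng(1)
w, u = rng.standard_normal(4), rng.standard_normal(4)
alpha = w @ u
u2 = u + (np.log1p(np.exp(alpha)) - 1 - alpha) * w / (w @ w)
assert np.isclose(w @ u2, np.log1p(np.exp(alpha)) - 1)   # always > -1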
Example 24
def activate(ab_pred):
    '''
     Keras doesn't support applying different activation functions to the individual neurons. 
     Thankfully, a custom activation function takes care of this...
    '''
    from keras import backend as K
    
    a = K.exp(ab_pred[:, 0])
    b = K.softplus(ab_pred[:, 1])

    a = K.reshape(a, (K.shape(a)[0], 1))
    b = K.reshape(b, (K.shape(b)[0], 1))

    return K.concatenate((a, b), axis=1)
Example 25
def logdet_loss(args):
    z0, w, u, b = args
    b2 = K.squeeze(b, 1)
    beta = K.sum(tf.multiply(w, z0), 1)  # <w|z0>
    linear_trans = beta + b2  # <w|z0> + b

    # change u2 so that the transformation z0->z1 is invertible
    alpha = K.sum(tf.multiply(w, u), 1)  # 
    diag1 = tf.diag(K.softplus(alpha) - 1 - alpha)
    u2 = u + K.dot(diag1, w) / K.sum(K.square(w)+1e-7)
    gamma = K.sum(tf.multiply(w,u2), 1)

    logdet = K.log(K.abs(1 + (1 - K.square(K.tanh(linear_trans)))*gamma) + 1e-6)

    return logdet
Example 26
def get_triplet_batch_hard_loss(batch_size, margin):
    if margin == 'soft':
        print("Using soft-margin in batch-hard loss")
        final_loss_tensor = lambda hard_pos, hard_neg: K.softplus(hard_pos -
                                                                  hard_neg)
    else:
        try:
            margin = float(margin)
            print("Using hard-margin of {} in batch-hard loss".format(margin))
            final_loss_tensor = lambda hard_pos, hard_neg: K.maximum(
                hard_pos - hard_neg + margin, 0)
        except ValueError:
            raise util.ScrnaException(
                'Batch hard margin must be a real number or "soft"!')

    def triplet_batch_hard_loss(y_true, y_pred):
        # y_pred is the embedding, y_true is the IDs (labels) of the samples (not 1-hot encoded)
        # They are mini-batched. If batch_size is B, and embedding dimension is D, shapes are:
        #   y_true: (B,)
        #   y_pred: (B,D)

        # Get all-pairs distances
        y_true = K.sum(y_true, axis=1)
        diffs = K.expand_dims(y_pred, axis=1) - K.expand_dims(y_pred, axis=0)
        dist_mat = K.sqrt(K.sum(K.square(diffs), axis=-1) + K.epsilon())
        same_identity_mask = K.equal(K.expand_dims(y_true, axis=1),
                                     K.expand_dims(y_true, axis=0))
        # TODO: make this backend-agnostic somehow
        negative_mask = T.bitwise_not(same_identity_mask)
        # XOR with the identity matrix removes self-pairs from the positive mask
        positive_mask = T.bitwise_xor(same_identity_mask,
                                      K.eye(batch_size, dtype='bool'))
        #print(K.int_shape(y_true))
        #print(K.int_shape(y_pred))

        #positive_mask = T.bitwise_xor(same_identity_mask, T.eye(K.int_shape(y_true)[0]))

        furthest_positive = K.max(dist_mat * positive_mask, axis=1)
        #closest_negative = K.min(dist_mat*negative_mask + np.inf*same_identity_mask, axis=1)
        closest_negative = K.min(dist_mat * negative_mask +
                                 1e6 * same_identity_mask,
                                 axis=1)

        loss = final_loss_tensor(furthest_positive, closest_negative)
        return loss

    return triplet_batch_hard_loss
Example 27
def pinball_loss(tau, y, q, alpha):
    """
    Smooth Pinball loss function.

    Arguments:
        tau (ndarray) - quantile levels
        y (ndarray) - time series observations
        q (ndarray) - quantile predictions
        alpha (float) - smoothing rate
    Returns:
        quantile_loss (tensor) - loss
    """

    error = (y - q)
    quantile_loss = K.mean(tau * error + alpha * K.softplus(-error / alpha))

    return quantile_loss
Example 28
def univariate_gaussian(true, pred):
    """
    Generic, rank-agnostic univariate Gaussian function
    Returns results of eq # 24 of http://arxiv.org/abs/1308.0850
    :param true: truth values with at least [mu]
    :param pred: values predicted with at least [mu, sigma]
    :return: probability density function
    """
    x = true[..., 0]
    mu = pred[..., 0]
    sigma = pred[..., 1]

    norm = K.log(1 + K.abs(x - mu))  # needs log of norm to counter large mu diffs
    variance = K.softplus(K.square(sigma))
    z = K.exp(-K.square(K.abs(norm)) / (2 * variance) + epsilon())  # z -> 0 if sigma
    # pdf -> 0 if sigma is very large or z -> 0; NaN if variance -> 0
    pdf = z / K.sqrt((2 * np.pi * variance) + epsilon())
    return pdf
Example 29
    def call(self, x, mask=None):
        a_scaler, b_scaler = tf.unstack(x, 2, -1)

        a_scaler = K.tanh(2 * a_scaler)
        b_scaler = K.tanh(2 * b_scaler)

        a = self.a_bias + (self.a_scale * a_scaler)
        b = self.b_bias + (self.b_scale * b_scaler)

        a = K.exp(a)
        b = K.softplus(b)

        a = K.clip(a, .8, 100.)
        b = K.clip(b, .8, 10.)
        b = K.pow(b, -1.) * (a + 2.) * 2.

        x = K.stack([a, b], axis=-1)
        return x
Example 30
    def __call__(self, y_true: plaidml.tile.Value,
                 y_pred: plaidml.tile.Value) -> plaidml.tile.Value:
        """ Call the LogCosh loss function.
        Parameters
        ----------
        y_true: :class:`plaidml.tile.Value`
            The ground truth value
        y_pred: :class:`plaidml.tile.Value`
            The predicted value

        Returns
        -------
        :class:`plaidml.tile.Value`
            The loss value
        """
        diff = y_pred - y_true
        loss = diff + K.softplus(-2. * diff) - K.log(
            K.constant(2., dtype="float32"))
        return K.mean(loss, axis=-1)
Example 31
def mean_var_blindspot_network(input_shape):
    # create input layer
    inputs = Input(input_shape)

    # run blindspot network
    x = blindspot_network(inputs)

    mean = Conv2D(1, 1, name='mean')(x)
    var = Conv2D(1, 1, name='var')(x)
    scale = Lambda(lambda x: K.softplus(x) + 1e-3)(var)

    # create model
    model = Model(inputs=inputs, outputs=mean)

    # create loss function
    loss = mean_var_loss(inputs, mean, var)
    model.add_loss(loss)

    return model
Example 32
from keras import backend as K
from keras.regularizers import ActivityRegularizer
import numpy as np

dummy_loss_val = K.variable(0.0)

softminus = lambda x: x - K.softplus(x)

# Dummy loss function which simply returns 0
# This is because we will be training the network using regularizers.
def dummy_loss(y_true, y_pred):
    return dummy_loss_val


def psnr(y_true, y_pred):
    assert y_true.shape == y_pred.shape, "Cannot calculate PSNR. Input shapes not same." \
                                         " y_true shape = %s, y_pred shape = %s" % (str(y_true.shape),
                                                                                   str(y_pred.shape))

    return -10. * np.log10(np.mean(np.square(y_pred - y_true)))

def PSNRLoss(y_true, y_pred):
    """
    PSNR is Peak Signal to Noise Ratio, which is similar to mean squared error.

    It can be calculated as
    PSNR = 20 * log10(MAXp) - 10 * log10(MSE)

    When providing an unscaled input, MAXp = 255. Therefore 20 * log10(255)== 48.1308036087.
    However, since we are scaling our input, MAXp = 1. Therefore 20 * log10(1) = 0.
    Thus we remove that component completely and only compute the remaining MSE component.
    """
    # Closing line reconstructed from the docstring above: with MAXp = 1, PSNR = -10 * log10(MSE).
    return -10. * K.log(K.mean(K.square(y_pred - y_true))) / K.log(10.)
Example 33
 def call(self, x, mask=None):
     from keras import backend as K
     j = K.softplus((x - 1) / self.sigma) * self.sigma
     v = self.amplitude / (self.tau_ref + self.tau_rc*K.log(1 + 1/j))
     return K.switch(j > 0, v, K.zeros_like(v))
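This looks like a soft-LIF rate: sigma * softplus((x - 1)/sigma) smooths the rectified current max(x - 1, 0), and the rate approaches the hard LIF rate as sigma -> 0. A NumPy sketch reusing the snippet's parameter names (values are illustrative):

import numpy as np

tau_ref, tau_rc, amplitude, sigma = 0.002, 0.02, 1.0, 1e-3
x = 1.5
j_soft = sigma * np.logaddexp(0.0, (x - 1.0) / sigma)   # smoothed current
j_hard = max(x - 1.0, 0.0)
rate_soft = amplitude / (tau_ref + tau_rc * np.log1p(1.0 / j_soft))
rate_hard = amplitude / (tau_ref + tau_rc * np.log1p(1.0 / j_hard))
assert np.isclose(rate_soft, rate_hard, rtol=1e-3)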
Example 34
 def call(self, x, mask=None):
     if K.backend() == 'theano':
         return K.softplus(K.pattern_broadcast(self.beta, self.param_broadcast) * x) * K.pattern_broadcast(self.alpha, self.param_broadcast)
     else:
         return K.softplus(self.beta * x) * self.alpha