Code example #1
File: kuaiLSTM.py  Project: sadeghst/geolearn
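The snippets on this page are class-level excerpts, so they assume roughly the following imports at the top of the file. createMask and dropMask are project-specific helpers defined elsewhere in kuaiLSTM.py and are not shown here (a possible createMask is sketched after code example #2).

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from torch.autograd import Variable
# createMask / dropMask: helpers defined elsewhere in kuaiLSTM.py (not shown on this page)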
class slowLSTMcell_untied(torch.nn.Module):

    """
    """

    def __init__(self, *, inputSize, hiddenSize, train=True,
                 dr=0.5, drMethod='gal+sem', gpu=0):
        # note: forward() keys dropout off 'drX', 'drH', 'drW' and 'drC' entries in
        # drMethod, so the default 'gal+sem' leaves every dropout branch disabled
        super(slowLSTMcell_untied, self).__init__()
        self.inputSize = inputSize
        self.hiddenSize = hiddenSize
        self.dr = dr

        self.w_xi = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xf = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xo = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xc = Parameter(torch.Tensor(hiddenSize, inputSize))

        self.w_hi = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_hf = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_ho = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_hc = Parameter(torch.Tensor(hiddenSize, hiddenSize))

        self.b_i = Parameter(torch.Tensor(hiddenSize))
        self.b_f = Parameter(torch.Tensor(hiddenSize))
        self.b_o = Parameter(torch.Tensor(hiddenSize))
        self.b_c = Parameter(torch.Tensor(hiddenSize))

        self.drMethod = drMethod.split('+')
        self.gpu = gpu
        # set the training flag via nn.Module.train() instead of overwriting the method with a bool
        self.train(mode=train)
        if gpu >= 0:
            self = self.cuda(gpu)
            self.is_cuda = True
        else:
            self.is_cuda = False
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hiddenSize)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def init_mask(self, x, h, c):
        self.maskX_i = createMask(x, self.dr)
        self.maskX_f = createMask(x, self.dr)
        self.maskX_c = createMask(x, self.dr)
        self.maskX_o = createMask(x, self.dr)

        self.maskH_i = createMask(h, self.dr)
        self.maskH_f = createMask(h, self.dr)
        self.maskH_c = createMask(h, self.dr)
        self.maskH_o = createMask(h, self.dr)

        self.maskC = createMask(c, self.dr)

        self.maskW_xi = createMask(self.w_xi, self.dr)
        self.maskW_xf = createMask(self.w_xf, self.dr)
        self.maskW_xc = createMask(self.w_xc, self.dr)
        self.maskW_xo = createMask(self.w_xo, self.dr)
        self.maskW_hi = createMask(self.w_hi, self.dr)
        self.maskW_hf = createMask(self.w_hf, self.dr)
        self.maskW_hc = createMask(self.w_hc, self.dr)
        self.maskW_ho = createMask(self.w_ho, self.dr)

    def forward(self, x, hidden):
        h0, c0 = hidden
        doDrop = self.training and self.dr > 0.0

        if doDrop:
            self.init_mask(x, h0, c0)

        if doDrop and 'drH' in self.drMethod:
            h0_i = h0.mul(self.maskH_i)
            h0_f = h0.mul(self.maskH_f)
            h0_c = h0.mul(self.maskH_c)
            h0_o = h0.mul(self.maskH_o)
        else:
            h0_i = h0
            h0_f = h0
            h0_c = h0
            h0_o = h0

        if doDrop and 'drX' in self.drMethod:
            x_i = x.mul(self.maskX_i)
            x_f = x.mul(self.maskX_f)
            x_c = x.mul(self.maskX_c)
            x_o = x.mul(self.maskX_o)
        else:
            x_i = x
            x_f = x
            x_c = x
            x_o = x

        if doDrop and 'drW' in self.drMethod:
            w_xi = self.w_xi.mul(self.maskW_xi)
            w_xf = self.w_xf.mul(self.maskW_xf)
            w_xc = self.w_xc.mul(self.maskW_xc)
            w_xo = self.w_xo.mul(self.maskW_xo)
            w_hi = self.w_hi.mul(self.maskW_hi)
            w_hf = self.w_hf.mul(self.maskW_hf)
            w_hc = self.w_hc.mul(self.maskW_hc)
            w_ho = self.w_ho.mul(self.maskW_ho)
        else:
            w_xi = self.w_xi
            w_xf = self.w_xf
            w_xc = self.w_xc
            w_xo = self.w_xo
            w_hi = self.w_hi
            w_hf = self.w_hf
            w_hc = self.w_hc
            w_ho = self.w_ho

        gate_i = F.linear(x_i, w_xi)+F.linear(h0_i, w_hi) + self.b_i
        gate_f = F.linear(x_f, w_xf)+F.linear(h0_f, w_hf) + self.b_f
        gate_c = F.linear(x_c, w_xc)+F.linear(h0_c, w_hc) + self.b_c
        gate_o = F.linear(x_o, w_xo)+F.linear(h0_o, w_ho) + self.b_o

        gate_i = F.sigmoid(gate_i)
        gate_f = F.sigmoid(gate_f)
        gate_c = F.tanh(gate_c)
        gate_o = F.sigmoid(gate_o)

        if doDrop and 'drC' in self.drMethod:
            gate_c = gate_c.mul(self.maskC)

        c1 = (gate_f * c0) + (gate_i * gate_c)
        h1 = gate_o * F.tanh(c1)

        return h1, c1
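As a usage sketch (not part of the original file), the untied cell can be stepped over a sequence as below; the sizes are invented, and dr is set to 0 so the missing createMask helper is not needed:

rho, nb, nx, nh = 30, 8, 5, 16            # hypothetical sequence length, batch, input/hidden sizes
cell = slowLSTMcell_untied(inputSize=nx, hiddenSize=nh, dr=0.0, gpu=-1)
xSeq = torch.randn(rho, nb, nx)
h = torch.zeros(nb, nh)
c = torch.zeros(nb, nh)
out = []
for t in range(rho):
    h, c = cell(xSeq[t], (h, c))          # forward(x, hidden) returns (h1, c1)
    out.append(h)
out = torch.stack(out)                    # (rho, nb, nh)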
Code example #2
File: kuaiLSTM.py  Project: sadeghst/geolearn
class slowLSTMcell_tied(torch.nn.Module):
    """
    """

    def __init__(self, *, inputSize, hiddenSize, mode='train',
                 dr=0.5, drMethod='drX+drW+drC', gpu=1):
        super(slowLSTMcell_tied, self).__init__()

        self.inputSize = inputSize
        self.hiddenSize = hiddenSize
        self.dr = dr

        self.w_ih = Parameter(torch.Tensor(hiddenSize*4, inputSize))
        self.w_hh = Parameter(torch.Tensor(hiddenSize*4, hiddenSize))
        self.b_ih = Parameter(torch.Tensor(hiddenSize*4))
        self.b_hh = Parameter(torch.Tensor(hiddenSize*4))

        self.drMethod = drMethod.split('+')
        self.gpu = gpu
        self.mode = mode
        if mode == 'train':
            self.train(mode=True)
        elif mode == 'test':
            self.train(mode=False)
        elif mode == 'drMC':
            self.train(mode=False)

        if gpu >= 0:
            self = self.cuda()
            self.is_cuda = True
        else:
            self.is_cuda = False
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.hiddenSize)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def reset_mask(self, x, h, c):
        self.maskX = createMask(x, self.dr)
        self.maskH = createMask(h, self.dr)
        self.maskC = createMask(c, self.dr)
        self.maskW_ih = createMask(self.w_ih, self.dr)
        self.maskW_hh = createMask(self.w_hh, self.dr)

    def forward(self, x, hidden):
        h0, c0 = hidden

        if self.dr > 0 and self.training is True:
            self.reset_mask(x, h0, c0)

        if self.training is True and 'drH' in self.drMethod:
            h0 = h0.mul(self.maskH)
        if self.training is True and 'drX' in self.drMethod:
            x = x.mul(self.maskX)

        if self.training is True and 'drW' in self.drMethod:
            # masked copies of the weights; the Parameters themselves are left untouched
            w_ih = self.w_ih.mul(self.maskW_ih)
            w_hh = self.w_hh.mul(self.maskW_hh)
        else:
            w_ih = self.w_ih
            w_hh = self.w_hh

        gates = F.linear(x, w_ih, self.b_ih) + \
            F.linear(h0, w_hh, self.b_hh)
        gate_i, gate_f, gate_c, gate_o = gates.chunk(4, 1)

        gate_i = F.sigmoid(gate_i)
        gate_f = F.sigmoid(gate_f)
        gate_c = F.tanh(gate_c)
        gate_o = F.sigmoid(gate_o)

        if self.training is True and 'drC' in self.drMethod:
            gate_c = gate_c.mul(self.maskC)

        c1 = (gate_f * c0) + (gate_i * gate_c)
        h1 = gate_o * F.tanh(c1)

        return h1, c1
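Both cells above depend on a createMask helper defined elsewhere in kuaiLSTM.py and not reproduced here. A minimal sketch, assuming the usual inverted-dropout convention (keep probability 1 - dr, surviving entries rescaled by 1 / (1 - dr)), might be:

def createMask(x, dr):
    # Bernoulli keep-mask with the same shape, dtype and device as x,
    # scaled so that the expected value of x.mul(mask) equals x
    return x.new_empty(x.size()).bernoulli_(1.0 - dr).div_(1.0 - dr)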
Code example #3
class BaseRNNCell(nn.Module):
    def __init__(self,
                 input_size,
                 hidden_size,
                 bias=False,
                 nonlinearity="tanh",
                 hidden_min_abs=0,
                 hidden_max_abs=None,
                 hidden_init=None,
                 recurrent_init=None,
                 gradient_clip=5):
        super(BaseRNNCell, self).__init__()
        self.hidden_max_abs = hidden_max_abs
        self.hidden_min_abs = hidden_min_abs
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.nonlinearity = nonlinearity
        self.hidden_init = hidden_init
        self.recurrent_init = recurrent_init
        if self.nonlinearity == "tanh":
            self.activation = F.tanh
        elif self.nonlinearity == "relu":
            self.activation = F.relu
        elif self.nonlinearity == "sigmoid":
            self.activation = F.sigmoid
        elif self.nonlinearity == "log":
            self.activation = torch.log
        elif self.nonlinearity == "sin":
            self.activation = torch.sin
        else:
            raise RuntimeError("Unknown nonlinearity: {}".format(
                self.nonlinearity))

        self.weight_ih = Parameter(torch.eye(hidden_size, input_size))
        # note: weight_hh has a hard-coded second dimension of 20 and is not used in forward();
        # the recurrence below uses weight_ih @ weight_hh1 instead
        self.weight_hh = Parameter(torch.Tensor(hidden_size, 20).uniform_())
        self.weight_hh1 = Parameter(torch.eye(input_size, hidden_size))
        if bias:
            self.bias_ih = Parameter(torch.randn(hidden_size))
        else:
            self.register_parameter('bias_ih', None)
        # self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    # def reset_parameters(self):
    #     for name, weight in self.named_parameters():
    #         if "bias" in name:
    #             weight.data.zero_()
    #         elif "weight_hh" in name:
    #             if self.recurrent_init is None:
    #                 nn.init.constant_(weight, 1)
    #             else:
    #                 self.recurrent_init(weight)
    #         elif "weight_ih" in name:
    #             if self.hidden_init is None:
    #                 nn.init.normal_(weight, 0, 0.01)
    #             else:
    #                 self.hidden_init(weight)
    #         else:
    #             weight.data.normal_(0, 0.01)
    #             # weight.data.uniform_(-stdv, stdv)
    #     self.check_bounds()

    def check_bounds(self):
        if self.hidden_min_abs:
            abs_kernel = torch.abs(
                self.weight_hh.data).clamp_(min=self.hidden_min_abs)
            # keep the sign but enforce the minimum absolute value
            self.weight_hh.data = torch.mul(
                torch.sign(self.weight_hh.data), abs_kernel)
        if self.hidden_max_abs:
            self.weight_hh.data = self.weight_hh.data.clamp(
                max=self.hidden_max_abs, min=-self.hidden_max_abs)

    def forward(self, input, hx):
        # x = F.linear(input, self.weight_ih, self.bias_ih) + torch.matmul(hx, self.weight_hh.matmul(self.weight_hh1))
        # return self.talor(x)
        return self.activation(
            F.linear(input, self.weight_ih, self.bias_ih) +
            torch.matmul(hx, self.weight_ih.matmul(self.weight_hh1)))

    def talor(self, x):
        # third-order Taylor expansion of log(x) around x = 1
        return (x - 1) - (x - 1)**2 / 2 + (x - 1)**3 / 3
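A one-step smoke test of BaseRNNCell might look like the sketch below (the sizes are arbitrary, and the cell needs the imports listed at the top of the page):

cell = BaseRNNCell(input_size=10, hidden_size=20, bias=True, nonlinearity="tanh")
x = torch.randn(4, 10)     # (batch, input_size)
h = torch.zeros(4, 20)     # (batch, hidden_size)
h_next = cell(x, h)        # (batch, hidden_size)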
Code example #4
File: ladder.py  Project: timatim/LadderNetwork
class LadderNetwork(nn.Module):
    layer_sizes = [784, 1000, 500, 250, 250, 250, 10]

    def __init__(self, layer_sizes=None):
        super(LadderNetwork, self).__init__()
        if layer_sizes:
            self.layer_sizes = layer_sizes
        # number of weight layers (the input layer is not counted)
        L = len(self.layer_sizes) - 1
        self.L = L

        self.encoder_layers = nn.ModuleList([None] + [
            nn.Linear(self.layer_sizes[i - 1], self.layer_sizes[i])
            for i in range(1, self.L + 1)
        ])
        self.decoder_layers = nn.ModuleList([None] + [
            nn.Linear(self.layer_sizes[i], self.layer_sizes[i - 1])
            for i in range(1, self.L + 1)
        ])

        def get_alpha(i):
            # ten denoising parameters per layer: a1 and a6 are initialized around 1,
            # the rest around 0, each perturbed by N(0, 0.1) noise
            init_vals = [0., 1., 0., 0., 0., 0., 1., 0., 0., 0.]
            return nn.ParameterList([
                Parameter(
                    torch.FloatTensor(self.layer_sizes[i]).fill_(v).add_(
                        torch.FloatTensor(self.layer_sizes[i]).normal_(0, 0.1)))
                for v in init_vals
            ])

        self.alpha_layers = nn.ModuleList(
            [get_alpha(i) for i in range(0, self.L + 1)])
        self.gamma = Parameter(
            torch.FloatTensor(self.layer_sizes[self.L]).fill_(1.).add_(
                torch.FloatTensor(self.layer_sizes[self.L]).normal_(0, 0.1)))
        self.beta = nn.ParameterList([
            Parameter(
                torch.FloatTensor(self.layer_sizes[l]).fill_(0.).add_(
                    torch.FloatTensor(self.layer_sizes[l]).normal_(0, 0.1)))
            for l in range(0, self.L + 1)
        ])

        self.means = [None] * (L + 1)
        self.stds = [None] * (L + 1)
        self.z = [None] * (L + 1)
        self.h = [None] * (L + 1)
        self.z_noise = [None] * (L + 1)
        self.h_noise = [None] * (L + 1)
        self.u = [None] * (L + 1)
        self.z_hat = [None] * (L + 1)
        self.z_hat_bn = [None] * (L + 1)
        self.noise_mean = 0.
        self.noise_std = 0.2

        self.denoising_cost = [1000., 10., 0.1, 0.1, 0.1, 0.1, 0.1]

    def encoder(self, x):
        L = self.L
        m = x.size()[0]

        self.z[0] = x.view(-1, self.layer_sizes[0])
        self.h[0] = self.z[0]
        self.z_noise[0] = add_noise(self.z[0])
        self.h_noise[0] = self.z_noise[0]
        # corrupted encoder
        for i in range(1, L + 1):
            self.z_noise[i] = self.encoder_layers[i](self.h_noise[i - 1])
            self.z_noise[i] = nn.BatchNorm1d(self.layer_sizes[i])(
                self.z_noise[i])
            self.z_noise[i] = add_noise(self.z_noise[i], self.noise_mean,
                                        self.noise_std)
            self.h_noise[i] = Variable(
                torch.FloatTensor(self.z_noise[i].size()))
            if i == L:
                for j in range(m):
                    self.h_noise[L][j] = self.gamma.mul(self.z_noise[L][j].add(
                        self.beta[L]))
            else:
                for j in range(m):
                    self.h_noise[i][j] = nn.ReLU()(self.z_noise[i][j] +
                                                   self.beta[i])

        self.y_noise = self.h_noise[L]

        self.means[0] = self.z[0].mean(0)
        self.stds[0] = Variable(self.z[0].data.std(0))
        self.stds[0].data.add_(
            torch.FloatTensor(self.stds[0].data.size()).fill_(1e-4))
        # clean encoder
        for i in range(1, L + 1):
            # linear transformation
            self.z[i] = self.encoder_layers[i](self.h[i - 1])
            # normalization
            self.means[i] = self.z[i].mean(0)
            self.stds[i] = Variable(self.z[i].data.std(0))

            self.z[i] = nn.BatchNorm1d(self.layer_sizes[i])(self.z[i])
            self.h[i] = Variable(torch.FloatTensor(self.z[i].size()))
            # non-linearity
            if i == L:
                for j in range(m):
                    self.h[L][j] = self.gamma.mul(self.z[L][j].add(
                        self.beta[L]))
            else:
                for j in range(m):
                    self.h[i][j] = nn.ReLU()(self.z[i][j] + self.beta[i])

        self.y_noise = nn.LogSoftmax(dim=1)(self.h_noise[L])
        self.y = nn.LogSoftmax(dim=1)(self.h[L])
        return self.y, self.y_noise

    def decoder(self, x):
        L = self.L
        # get batch size
        m = x.size()[0]

        for l in range(L, -1, -1):
            self.z_hat[l] = Variable(torch.FloatTensor(m, self.layer_sizes[l]))
            self.z_hat_bn[l] = Variable(
                torch.FloatTensor(m, self.layer_sizes[l]))
            if l == L:
                self.u[L] = nn.BatchNorm1d(self.layer_sizes[L])(
                    self.h_noise[L])
            else:
                self.u[l] = nn.BatchNorm1d(self.layer_sizes[l])(
                    self.decoder_layers[l + 1](self.z_hat[l + 1]))

            def g(z_noise, u, l):
                alpha = self.alpha_layers[l]
                m = z_noise.size()[0]
                mu = Variable(torch.FloatTensor(u.size()))
                v = Variable(torch.FloatTensor(u.size()))
                for i in range(m):
                    mu[i] = (alpha[0] * nn.Sigmoid()(alpha[1] * u[i] + alpha[2])
                             + alpha[3] * u[i] + alpha[4])
                    v[i] = (alpha[5] * nn.Sigmoid()(alpha[6] * u[i] + alpha[7])
                            + alpha[8] * u[i] + alpha[9])
                self.z_hat[l] = (z_noise - mu) * v + mu

                #self.z_hat[l][i] = params[6].add(params[0].mul(z_noise[i])).add(params[2].mul(u[i])).add(params[4].mul(z_noise[i]).mul(u[i])) \
                #	.add(params[8].mul(nn.Sigmoid()(params[7].add(params[1].mul(z_noise[i])).add(params[3].mul(u[i])) \
                #	.add(params[5].mul(z_noise[i]).mul(u[i])))))

            g(self.z_noise[l], self.u[l], l)
            for i in range(m):
                if l == 0:
                    n = self.layer_sizes[l]
                    self.z_hat_bn[l][i] = self.z_hat[l][i]
                else:
                    self.z_hat_bn[l][i] = (self.z_hat[l][i] -
                                           self.means[l]) / self.stds[l]
        return self.z_hat[0]

    def forward(self, x):
        self.batch_size = x.size()[0]
        y, y_noise = self.encoder(x)
        z_hat = self.decoder(y)
        return y, z_hat

    def unsup_cost(self):
        # unsupervised denoising reconstruction cost
        unsupervised_func = nn.MSELoss()
        CD = 0.
        for l in range(0, self.L + 1):
            clean_target = torch.Tensor(self.z[l].size())
            clean_target.copy_(self.z[l].data)
            clean_target = Variable(clean_target)
            #print(unsupervised_func(self.z_hat_bn[l], clean_target))
            CD += self.denoising_cost[l] * unsupervised_func(
                self.z_hat_bn[l], clean_target)
        return CD

    def sup_cost(self, target):
        # supervised cost
        supervised_func = nn.NLLLoss()
        CC = supervised_func(self.y_noise, target)
        return CC
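A rough sketch of how these costs might be combined in a semi-supervised training step; the DataLoader, optimizer, and learning rate are illustrative assumptions, not part of ladder.py:

model = LadderNetwork()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for images, labels in train_loader:          # hypothetical MNIST-style batches
    optimizer.zero_grad()
    y, z_hat = model(Variable(images))       # runs the corrupted/clean encoders and the decoder
    loss = model.sup_cost(Variable(labels)) + model.unsup_cost()
    loss.backward()
    optimizer.step()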
Code example #5
class IndRNNCell(nn.Module):
    r"""An IndRNN cell with tanh or ReLU non-linearity.

    .. math::

        h' = \tanh(w_{ih} * x + b_{ih} + w_{hh} \odot h)

    where :math:`\odot` is element-wise (Hadamard) multiplication.
    If nonlinearity='relu', then ReLU is used in place of tanh.

    Args:
        input_size: The number of expected features in the input x
        hidden_size: The number of features in the hidden state h
        bias: If ``False``, then the layer does not use bias weights b_ih and b_hh.
            Default: ``True``
        nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'relu'
        hidden_min_abs: Minimal absolute initial value for hidden weights. Default: 0
        hidden_max_abs: Maximal absolute initial value for hidden weights. Default: None

    Inputs: input, hidden
        - **input** (batch, input_size): tensor containing input features
        - **hidden** (batch, hidden_size): tensor containing the initial hidden
          state for each element in the batch.

    Outputs: h'
        - **h'** (batch, hidden_size): tensor containing the next hidden state
          for each element in the batch

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(hidden_size x input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape
            `(hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`

    Examples::

        >>> rnn = IndRNNCell(10, 20)
        >>> input = Variable(torch.randn(6, 3, 10))
        >>> hx = Variable(torch.randn(3, 20))
        >>> output = []
        >>> for i in range(6):
        ...     hx = rnn(input[i], hx)
        ...     output.append(hx)
    """
    def __init__(self,
                 input_size,
                 hidden_size,
                 bias=True,
                 nonlinearity="relu",
                 hidden_min_abs=0,
                 hidden_max_abs=None,
                 hidden_init=None,
                 recurrent_init=None,
                 gradient_clip=None):
        super(IndRNNCell, self).__init__()
        self.hidden_max_abs = hidden_max_abs
        self.hidden_min_abs = hidden_min_abs
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.nonlinearity = nonlinearity
        self.hidden_init = hidden_init
        self.recurrent_init = recurrent_init
        if self.nonlinearity == "tanh":
            self.activation = F.tanh
        elif self.nonlinearity == "relu":
            self.activation = F.relu
        else:
            raise RuntimeError("Unknown nonlinearity: {}".format(
                self.nonlinearity))
        self.weight_ih = Parameter(torch.Tensor(hidden_size, input_size))
        self.weight_hh = Parameter(torch.Tensor(hidden_size))
        if bias:
            self.bias_ih = Parameter(torch.Tensor(hidden_size))
        else:
            self.register_parameter('bias_ih', None)

        if gradient_clip:
            if isinstance(gradient_clip, tuple):
                min_g, max_g = gradient_clip
            else:
                max_g = gradient_clip
                min_g = -max_g
            self.weight_ih.register_hook(
                lambda x: x.clamp(min=min_g, max=max_g))
            self.weight_hh.register_hook(
                lambda x: x.clamp(min=min_g, max=max_g))
            if bias:
                self.bias_ih.register_hook(
                    lambda x: x.clamp(min=min_g, max=max_g))

        self.reset_parameters()

    def reset_parameters(self):
        for name, weight in self.named_parameters():
            if "bias" in name:
                weight.data.zero_()
            elif "weight_hh" in name:
                if self.recurrent_init is None:
                    nn.init.constant_(weight, 1)
                else:
                    self.recurrent_init(weight)
            elif "weight_ih" in name:
                if self.hidden_init is None:
                    nn.init.normal_(weight, 0, 0.01)
                else:
                    self.hidden_init(weight)
            else:
                weight.data.normal_(0, 0.01)
                # weight.data.uniform_(-stdv, stdv)
        self.check_bounds()

    def check_bounds(self):
        if self.hidden_min_abs:
            abs_kernel = torch.abs(
                self.weight_hh.data).clamp_(min=self.hidden_min_abs)
            # keep the sign but enforce the minimum absolute value
            self.weight_hh.data = torch.mul(
                torch.sign(self.weight_hh.data), abs_kernel)
        if self.hidden_max_abs:
            self.weight_hh.data = self.weight_hh.data.clamp(
                max=self.hidden_max_abs, min=-self.hidden_max_abs)

    def forward(self, input, hx):
        # IndRNN recurrence: element-wise recurrent term w_hh * h
        return self.activation(
            F.linear(input, self.weight_ih, self.bias_ih) +
            torch.mul(self.weight_hh, hx))
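The gradient_clip hooks registered in __init__ clamp parameter gradients in place during backward. A quick illustration, with arbitrary shapes and a made-up bound of 2.0:

cell = IndRNNCell(10, 20, gradient_clip=2.0)   # grads of weight_ih, weight_hh, bias_ih clamped to [-2, 2]
x = torch.randn(3, 10)
h = torch.zeros(3, 20)
cell(x, h).sum().backward()
print(cell.weight_ih.grad.abs().max())          # never exceeds 2.0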