class slowLSTMcell_untied(torch.nn.Module):
    """LSTM cell with untied (per-gate) input and hidden weight matrices.

    Dropout can be applied independently to the inputs, hidden states,
    weights, and candidate cell state, controlled by drMethod.
    """

    def __init__(self, *, inputSize, hiddenSize, train=True,
                 dr=0.5, drMethod='gal+sem', gpu=0):
        super(slowLSTMcell_untied, self).__init__()

        self.inputSize = inputSize
        self.hiddenSize = hiddenSize
        self.dr = dr

        # input-to-hidden weights, one matrix per gate
        self.w_xi = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xf = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xo = Parameter(torch.Tensor(hiddenSize, inputSize))
        self.w_xc = Parameter(torch.Tensor(hiddenSize, inputSize))

        # hidden-to-hidden weights, one matrix per gate
        self.w_hi = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_hf = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_ho = Parameter(torch.Tensor(hiddenSize, hiddenSize))
        self.w_hc = Parameter(torch.Tensor(hiddenSize, hiddenSize))

        # per-gate biases
        self.b_i = Parameter(torch.Tensor(hiddenSize))
        self.b_f = Parameter(torch.Tensor(hiddenSize))
        self.b_o = Parameter(torch.Tensor(hiddenSize))
        self.b_c = Parameter(torch.Tensor(hiddenSize))

        self.drMethod = drMethod.split('+')
        self.gpu = gpu
        # note: this attribute shadows nn.Module.train()
        self.train = train
        if gpu >= 0:
            self = self.cuda(gpu)
            self.is_cuda = True
        else:
            self.is_cuda = False
        self.reset_parameters()

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hiddenSize)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def init_mask(self, x, h, c):
        # one dropout mask per gate for the inputs and hidden states
        self.maskX_i = createMask(x, self.dr)
        self.maskX_f = createMask(x, self.dr)
        self.maskX_c = createMask(x, self.dr)
        self.maskX_o = createMask(x, self.dr)
        self.maskH_i = createMask(h, self.dr)
        self.maskH_f = createMask(h, self.dr)
        self.maskH_c = createMask(h, self.dr)
        self.maskH_o = createMask(h, self.dr)
        self.maskC = createMask(c, self.dr)
        # masks applied directly to the weight matrices
        self.maskW_xi = createMask(self.w_xi, self.dr)
        self.maskW_xf = createMask(self.w_xf, self.dr)
        self.maskW_xc = createMask(self.w_xc, self.dr)
        self.maskW_xo = createMask(self.w_xo, self.dr)
        self.maskW_hi = createMask(self.w_hi, self.dr)
        self.maskW_hf = createMask(self.w_hf, self.dr)
        self.maskW_hc = createMask(self.w_hc, self.dr)
        self.maskW_ho = createMask(self.w_ho, self.dr)

    def forward(self, x, hidden):
        h0, c0 = hidden
        doDrop = self.training and self.dr > 0.0

        if doDrop:
            self.init_mask(x, h0, c0)

        if doDrop and 'drH' in self.drMethod:
            h0_i = h0.mul(self.maskH_i)
            h0_f = h0.mul(self.maskH_f)
            h0_c = h0.mul(self.maskH_c)
            h0_o = h0.mul(self.maskH_o)
        else:
            h0_i = h0
            h0_f = h0
            h0_c = h0
            h0_o = h0

        if doDrop and 'drX' in self.drMethod:
            x_i = x.mul(self.maskX_i)
            x_f = x.mul(self.maskX_f)
            x_c = x.mul(self.maskX_c)
            x_o = x.mul(self.maskX_o)
        else:
            x_i = x
            x_f = x
            x_c = x
            x_o = x

        if doDrop and 'drW' in self.drMethod:
            w_xi = self.w_xi.mul(self.maskW_xi)
            w_xf = self.w_xf.mul(self.maskW_xf)
            w_xc = self.w_xc.mul(self.maskW_xc)
            w_xo = self.w_xo.mul(self.maskW_xo)
            w_hi = self.w_hi.mul(self.maskW_hi)
            w_hf = self.w_hf.mul(self.maskW_hf)
            w_hc = self.w_hc.mul(self.maskW_hc)
            w_ho = self.w_ho.mul(self.maskW_ho)
        else:
            w_xi = self.w_xi
            w_xf = self.w_xf
            w_xc = self.w_xc
            w_xo = self.w_xo
            w_hi = self.w_hi
            w_hf = self.w_hf
            w_hc = self.w_hc
            w_ho = self.w_ho

        gate_i = F.linear(x_i, w_xi) + F.linear(h0_i, w_hi) + self.b_i
        gate_f = F.linear(x_f, w_xf) + F.linear(h0_f, w_hf) + self.b_f
        gate_c = F.linear(x_c, w_xc) + F.linear(h0_c, w_hc) + self.b_c
        gate_o = F.linear(x_o, w_xo) + F.linear(h0_o, w_ho) + self.b_o

        gate_i = F.sigmoid(gate_i)
        gate_f = F.sigmoid(gate_f)
        gate_c = F.tanh(gate_c)
        gate_o = F.sigmoid(gate_o)

        if doDrop and 'drC' in self.drMethod:
            gate_c = gate_c.mul(self.maskC)

        c1 = (gate_f * c0) + (gate_i * gate_c)
        h1 = gate_o * F.tanh(c1)

        return h1, c1
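
# A minimal CPU usage sketch for slowLSTMcell_untied. The sizes, gpu=-1, and
# the explicit drMethod string are illustrative only, and it assumes the
# createMask helper defined elsewhere in this module is available to build
# the dropout masks.
def _demo_slow_lstm_untied():
    cell = slowLSTMcell_untied(inputSize=8, hiddenSize=16, dr=0.5,
                               drMethod='drX+drH+drW+drC', gpu=-1)
    x = torch.randn(4, 8)       # (batch, inputSize)
    h0 = torch.zeros(4, 16)     # (batch, hiddenSize)
    c0 = torch.zeros(4, 16)
    h1, c1 = cell(x, (h0, c0))  # one recurrent step
    return h1, c1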
class slowLSTMcell_tied(torch.nn.Module):
    """LSTM cell with tied weights: the four gates share the stacked weight
    matrices w_ih and w_hh, as in torch.nn.LSTMCell.

    Dropout on inputs, hidden states, weights, and the candidate cell state
    is controlled by drMethod.
    """

    def __init__(self, *, inputSize, hiddenSize, mode='train',
                 dr=0.5, drMethod='drX+drW+drC', gpu=1):
        super(slowLSTMcell_tied, self).__init__()

        self.inputSize = inputSize
        self.hiddenSize = hiddenSize
        self.dr = dr

        # gates are stacked along dim 0 in the order [i, f, c, o]
        self.w_ih = Parameter(torch.Tensor(hiddenSize * 4, inputSize))
        self.w_hh = Parameter(torch.Tensor(hiddenSize * 4, hiddenSize))
        self.b_ih = Parameter(torch.Tensor(hiddenSize * 4))
        self.b_hh = Parameter(torch.Tensor(hiddenSize * 4))

        self.drMethod = drMethod.split('+')
        self.gpu = gpu
        self.mode = mode
        if mode == 'train':
            self.train(mode=True)
        elif mode == 'test':
            self.train(mode=False)
        elif mode == 'drMC':
            # Monte-Carlo dropout at test time
            self.train(mode=False)

        if gpu >= 0:
            self = self.cuda()
            self.is_cuda = True
        else:
            self.is_cuda = False
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.hiddenSize)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def reset_mask(self, x, h, c):
        self.maskX = createMask(x, self.dr)
        self.maskH = createMask(h, self.dr)
        self.maskC = createMask(c, self.dr)
        self.maskW_ih = createMask(self.w_ih, self.dr)
        self.maskW_hh = createMask(self.w_hh, self.dr)

    def forward(self, x, hidden):
        h0, c0 = hidden

        if self.dr > 0 and self.training is True:
            self.reset_mask(x, h0, c0)

        if self.training is True and 'drH' in self.drMethod:
            h0 = h0.mul(self.maskH)

        if self.training is True and 'drX' in self.drMethod:
            x = x.mul(self.maskX)

        if self.training is True and 'drW' in self.drMethod:
            w_ih = self.w_ih.mul(self.maskW_ih)
            w_hh = self.w_hh.mul(self.maskW_hh)
        else:
            w_ih = self.w_ih
            w_hh = self.w_hh

        gates = F.linear(x, w_ih, self.b_ih) + \
            F.linear(h0, w_hh, self.b_hh)
        gate_i, gate_f, gate_c, gate_o = gates.chunk(4, 1)

        gate_i = F.sigmoid(gate_i)
        gate_f = F.sigmoid(gate_f)
        gate_c = F.tanh(gate_c)
        gate_o = F.sigmoid(gate_o)

        if self.training is True and 'drC' in self.drMethod:
            gate_c = gate_c.mul(self.maskC)

        c1 = (gate_f * c0) + (gate_i * gate_c)
        h1 = gate_o * F.tanh(c1)

        return h1, c1
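
# A usage sketch for slowLSTMcell_tied (sizes are illustrative; createMask is
# assumed from this module). With tied weights a single pair of matmuls
# produces all four gate pre-activations, which are then split with
# chunk(4, 1).
def _demo_slow_lstm_tied():
    cell = slowLSTMcell_tied(inputSize=8, hiddenSize=16, mode='train',
                             dr=0.5, drMethod='drX+drW+drC', gpu=-1)
    x = torch.randn(4, 8)
    h0 = torch.zeros(4, 16)
    c0 = torch.zeros(4, 16)
    h1, c1 = cell(x, (h0, c0))
    return h1, c1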
class BaseRNNCell(nn.Module):

    def __init__(self, input_size, hidden_size, bias=False,
                 nonlinearity="tanh", hidden_min_abs=0, hidden_max_abs=None,
                 hidden_init=None, recurrent_init=None, gradient_clip=5):
        super(BaseRNNCell, self).__init__()
        self.hidden_max_abs = hidden_max_abs
        self.hidden_min_abs = hidden_min_abs
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.nonlinearity = nonlinearity
        self.hidden_init = hidden_init
        self.recurrent_init = recurrent_init
        if self.nonlinearity == "tanh":
            self.activation = F.tanh
        elif self.nonlinearity == "relu":
            self.activation = F.relu
        elif self.nonlinearity == "sigmoid":
            self.activation = F.sigmoid
        elif self.nonlinearity == "log":
            self.activation = torch.log
        elif self.nonlinearity == "sin":
            self.activation = torch.sin
        else:
            raise RuntimeError("Unknown nonlinearity: {}".format(
                self.nonlinearity))

        # input-to-hidden weights, initialized to a (rectangular) identity
        self.weight_ih = Parameter(torch.eye(hidden_size, input_size))
        self.weight_hh = Parameter(torch.Tensor(hidden_size, 20).uniform_())
        self.weight_hh1 = Parameter(torch.eye(input_size, hidden_size))
        if bias:
            self.bias_ih = Parameter(torch.randn(hidden_size))
        else:
            self.register_parameter('bias_ih', None)
        # self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    # def reset_parameters(self):
    #     for name, weight in self.named_parameters():
    #         if "bias" in name:
    #             weight.data.zero_()
    #         elif "weight_hh" in name:
    #             if self.recurrent_init is None:
    #                 nn.init.constant_(weight, 1)
    #             else:
    #                 self.recurrent_init(weight)
    #         elif "weight_ih" in name:
    #             if self.hidden_init is None:
    #                 nn.init.normal_(weight, 0, 0.01)
    #             else:
    #                 self.hidden_init(weight)
    #         else:
    #             weight.data.normal_(0, 0.01)
    #             # weight.data.uniform_(-stdv, stdv)
    #     self.check_bounds()

    def check_bounds(self):
        # clamp the magnitude of the recurrent weights into
        # [hidden_min_abs, hidden_max_abs] while keeping their sign
        if self.hidden_min_abs:
            abs_kernel = torch.abs(
                self.weight_hh.data).clamp_(min=self.hidden_min_abs)
            self.weight_hh.data = torch.mul(
                torch.sign(self.weight_hh.data), abs_kernel)
        if self.hidden_max_abs:
            self.weight_hh.data = self.weight_hh.data.clamp(
                max=self.hidden_max_abs, min=-self.hidden_max_abs)

    def forward(self, input, hx):
        # the recurrent update uses weight_ih @ weight_hh1, a
        # (hidden_size x hidden_size) matrix, rather than weight_hh
        # x = F.linear(input, self.weight_ih, self.bias_ih) + torch.matmul(hx, self.weight_hh.matmul(self.weight_hh1))
        # return self.taylor(x)
        return self.activation(
            F.linear(input, self.weight_ih, self.bias_ih) +
            torch.matmul(hx, self.weight_ih.matmul(self.weight_hh1)))

    def taylor(self, x):
        # third-order Taylor expansion of log(x) around x = 1
        return (x - 1) - (x - 1) * (x - 1) / 2 + (x - 1) * (x - 1) * (x - 1) / 3
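
# A forward-pass sketch for BaseRNNCell (sizes are illustrative). Note that
# reset_parameters() is not called in __init__, so weight_ih and weight_hh1
# keep their identity initialization unless it is invoked explicitly.
def _demo_base_rnn_cell():
    cell = BaseRNNCell(input_size=10, hidden_size=20)
    x = torch.randn(3, 10)   # (batch, input_size)
    hx = torch.zeros(3, 20)  # (batch, hidden_size)
    return cell(x, hx)       # (batch, hidden_size)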
class LadderNetwork(nn.Module):
    """Ladder network with a corrupted and a clean encoder path and a
    denoising decoder path."""

    layer_sizes = [784, 1000, 500, 250, 250, 250, 10]

    def __init__(self, layer_sizes=None):
        super(LadderNetwork, self).__init__()
        if layer_sizes:
            self.layer_sizes = layer_sizes
        # L = index of the top layer (number of weight layers)
        L = len(self.layer_sizes) - 1
        self.L = L

        self.encoder_layers = nn.ModuleList([None] + [
            nn.Linear(self.layer_sizes[i - 1], self.layer_sizes[i])
            for i in range(1, self.L + 1)
        ])
        self.decoder_layers = nn.ModuleList([None] + [
            nn.Linear(self.layer_sizes[i], self.layer_sizes[i - 1])
            for i in range(1, self.L + 1)
        ])

        def get_alpha(i):
            # ten learnable combinator parameters per layer; entries 1 and 6
            # start near 1, the rest near 0, each with small Gaussian noise
            init_values = [0., 1., 0., 0., 0., 0., 1., 0., 0., 0.]
            return nn.ParameterList([
                Parameter(
                    torch.FloatTensor(self.layer_sizes[i]).fill_(v).add_(
                        torch.FloatTensor(self.layer_sizes[i]).normal_(0, 0.1)))
                for v in init_values
            ])

        self.alpha_layers = nn.ModuleList(
            [get_alpha(i) for i in range(0, self.L + 1)])

        self.gamma = Parameter(
            torch.FloatTensor(self.layer_sizes[self.L]).fill_(1.).add_(
                torch.FloatTensor(self.layer_sizes[self.L]).normal_(0, 0.1)))
        self.beta = nn.ParameterList([
            Parameter(
                torch.FloatTensor(self.layer_sizes[l]).fill_(0.).add_(
                    torch.FloatTensor(self.layer_sizes[l]).normal_(0, 0.1)))
            for l in range(0, self.L + 1)
        ])

        # per-layer buffers filled during the forward pass
        self.means = [None] * (L + 1)
        self.stds = [None] * (L + 1)
        self.z = [None] * (L + 1)
        self.h = [None] * (L + 1)
        self.z_noise = [None] * (L + 1)
        self.h_noise = [None] * (L + 1)
        self.u = [None] * (L + 1)
        self.z_hat = [None] * (L + 1)
        self.z_hat_bn = [None] * (L + 1)

        self.noise_mean = 0.
        self.noise_std = 0.2
        # per-layer weights of the denoising reconstruction cost
        self.denoising_cost = [1000., 10., 0.1, 0.1, 0.1, 0.1, 0.1]

    def encoder(self, x):
        L = self.L
        m = x.size()[0]

        self.z[0] = x.view(-1, self.layer_sizes[0])
        self.h[0] = self.z[0]
        self.z_noise[0] = add_noise(self.z[0])
        self.h_noise[0] = self.z_noise[0]

        # corrupted encoder
        for i in range(1, L + 1):
            self.z_noise[i] = self.encoder_layers[i](self.h_noise[i - 1])
            self.z_noise[i] = nn.BatchNorm1d(self.layer_sizes[i])(
                self.z_noise[i])
            self.z_noise[i] = add_noise(self.z_noise[i], self.noise_mean,
                                        self.noise_std)
            self.h_noise[i] = Variable(
                torch.FloatTensor(self.z_noise[i].size()))
            if i == L:
                for j in range(m):
                    self.h_noise[L][j] = self.gamma.mul(
                        self.z_noise[L][j].add(self.beta[L]))
            else:
                for j in range(m):
                    self.h_noise[i][j] = nn.ReLU()(self.z_noise[i][j] +
                                                   self.beta[i])
        self.y_noise = self.h_noise[L]

        self.means[0] = self.z[0].mean(0)
        self.stds[0] = Variable(self.z[0].data.std(0))
        self.stds[0].data.add_(
            torch.FloatTensor(self.stds[0].data.size()).fill_(1e-4))

        # clean encoder
        for i in range(1, L + 1):
            # linear transformation
            self.z[i] = self.encoder_layers[i](self.h[i - 1])
            # normalization
            self.means[i] = self.z[i].mean(0)
            self.stds[i] = Variable(self.z[i].data.std(0))
            self.z[i] = nn.BatchNorm1d(self.layer_sizes[i])(self.z[i])
            self.h[i] = Variable(torch.FloatTensor(self.z[i].size()))
            # non-linearity
            if i == L:
                for j in range(m):
                    self.h[L][j] = self.gamma.mul(self.z[L][j].add(
                        self.beta[L]))
            else:
                for j in range(m):
                    self.h[i][j] = nn.ReLU()(self.z[i][j] + self.beta[i])

        self.y_noise = nn.LogSoftmax()(self.h_noise[L])
        self.y = nn.LogSoftmax()(self.h[L])
        return self.y, self.y_noise

    def decoder(self, x):
        L = self.L
        # get batch size
        m = x.size()[0]

        for l in range(L, -1, -1):
            self.z_hat[l] = Variable(
                torch.FloatTensor(m, self.layer_sizes[l]))
            self.z_hat_bn[l] = Variable(
                torch.FloatTensor(m, self.layer_sizes[l]))
            if l == L:
                self.u[L] = nn.BatchNorm1d(self.layer_sizes[L])(
                    self.h_noise[L])
            else:
                self.u[l] = nn.BatchNorm1d(self.layer_sizes[l])(
                    self.decoder_layers[l + 1](self.z_hat[l + 1]))

            def g(z_noise, u, l):
                # denoising combinator: z_hat = (z_noise - mu(u)) * v(u) + mu(u)
                alpha = self.alpha_layers[l]
                m = z_noise.size()[0]
                mu = Variable(torch.FloatTensor(u.size()))
                v = Variable(torch.FloatTensor(u.size()))
                for i in range(m):
                    mu[i] = alpha[0] * nn.Sigmoid()(
                        alpha[1] * u[i] + alpha[2]) + alpha[3] * u[i] + alpha[4]
                    v[i] = alpha[5] * nn.Sigmoid()(
                        alpha[6] * u[i] + alpha[7]) + alpha[8] * u[i] + alpha[9]
                self.z_hat[l] = (z_noise - mu) * v + mu
                # self.z_hat[l][i] = params[6].add(params[0].mul(z_noise[i])).add(params[2].mul(u[i])).add(params[4].mul(z_noise[i]).mul(u[i])) \
                #     .add(params[8].mul(nn.Sigmoid()(params[7].add(params[1].mul(z_noise[i])).add(params[3].mul(u[i])) \
                #     .add(params[5].mul(z_noise[i]).mul(u[i]))))) 

            g(self.z_noise[l], self.u[l], l)

            for i in range(m):
                if l == 0:
                    self.z_hat_bn[l][i] = self.z_hat[l][i]
                else:
                    self.z_hat_bn[l][i] = (self.z_hat[l][i] -
                                           self.means[l]) / self.stds[l]

        return self.z_hat[0]

    def forward(self, x):
        self.batch_size = x.size()[0]
        y, y_noise = self.encoder(x)
        z_hat = self.decoder(y)
        return y, z_hat

    def unsup_cost(self):
        # unsupervised denoising reconstruction cost
        unsupervised_func = nn.MSELoss()
        CD = 0.
        for l in range(0, self.L + 1):
            clean_target = torch.Tensor(self.z[l].size())
            clean_target.copy_(self.z[l].data)
            clean_target = Variable(clean_target)
            # print(unsupervised_func(self.z_hat_bn[l], clean_target))
            CD += self.denoising_cost[l] * unsupervised_func(
                self.z_hat_bn[l], clean_target)
        return CD

    def sup_cost(self, target):
        # supervised cost
        supervised_func = nn.NLLLoss()
        CC = supervised_func(self.y_noise, target)
        return CC
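
# A rough semi-supervised training-step sketch for LadderNetwork. The batch
# size, learning rate, and use of Variable follow the conventions above and
# are illustrative only; add_noise is assumed from this module. The total
# loss is the supervised NLL on the corrupted output plus the weighted
# denoising reconstruction cost.
def _demo_ladder_step():
    import torch.optim as optim
    net = LadderNetwork()
    optimizer = optim.Adam(net.parameters(), lr=0.002)

    x = Variable(torch.randn(16, 784))                   # e.g. flattened 28x28 images
    target = Variable(torch.LongTensor(16).random_(0, 10))

    y, z_hat = net(x)                                    # clean output + reconstruction
    loss = net.sup_cost(target) + net.unsup_cost()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss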
class IndRNNCell(nn.Module):
    r"""An IndRNN cell with tanh or ReLU non-linearity.

    .. math::

        h' = \tanh(w_{ih} * x + b_{ih} + w_{hh} (*) h)

    With (*) being element-wise vector multiplication.
    If nonlinearity='relu', then ReLU is used in place of tanh.

    Args:
        input_size: The number of expected features in the input x
        hidden_size: The number of features in the hidden state h
        bias: If ``False``, then the layer does not use the bias weights b_ih.
            Default: ``True``
        nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'relu'
        hidden_min_abs: Minimal absolute initial value for hidden weights. Default: 0
        hidden_max_abs: Maximal absolute initial value for hidden weights. Default: None

    Inputs: input, hidden
        - **input** (batch, input_size): tensor containing input features
        - **hidden** (batch, hidden_size): tensor containing the initial hidden
          state for each element in the batch.

    Outputs: h'
        - **h'** (batch, hidden_size): tensor containing the next hidden state
          for each element in the batch

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(hidden_size x input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape `(hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`

    Examples::

        >>> rnn = IndRNNCell(10, 20)
        >>> input = Variable(torch.randn(6, 3, 10))
        >>> hx = Variable(torch.randn(3, 20))
        >>> output = []
        >>> for i in range(6):
        ...     hx = rnn(input[i], hx)
        ...     output.append(hx)
    """

    def __init__(self, input_size, hidden_size, bias=True, nonlinearity="relu",
                 hidden_min_abs=0, hidden_max_abs=None,
                 hidden_init=None, recurrent_init=None,
                 gradient_clip=None):
        super(IndRNNCell, self).__init__()
        self.hidden_max_abs = hidden_max_abs
        self.hidden_min_abs = hidden_min_abs
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.nonlinearity = nonlinearity
        self.hidden_init = hidden_init
        self.recurrent_init = recurrent_init
        if self.nonlinearity == "tanh":
            self.activation = F.tanh
        elif self.nonlinearity == "relu":
            self.activation = F.relu
        else:
            raise RuntimeError("Unknown nonlinearity: {}".format(
                self.nonlinearity))
        self.weight_ih = Parameter(torch.Tensor(hidden_size, input_size))
        # the recurrent weight is a vector: each hidden unit only sees its
        # own previous activation
        self.weight_hh = Parameter(torch.Tensor(hidden_size))
        if bias:
            self.bias_ih = Parameter(torch.Tensor(hidden_size))
        else:
            self.register_parameter('bias_ih', None)

        if gradient_clip:
            if isinstance(gradient_clip, tuple):
                min_g, max_g = gradient_clip
            else:
                max_g = gradient_clip
                min_g = -max_g
            self.weight_ih.register_hook(
                lambda x: x.clamp(min=min_g, max=max_g))
            self.weight_hh.register_hook(
                lambda x: x.clamp(min=min_g, max=max_g))
            if bias:
                self.bias_ih.register_hook(
                    lambda x: x.clamp(min=min_g, max=max_g))

        self.reset_parameters()

    def reset_parameters(self):
        for name, weight in self.named_parameters():
            if "bias" in name:
                weight.data.zero_()
            elif "weight_hh" in name:
                if self.recurrent_init is None:
                    nn.init.constant_(weight, 1)
                else:
                    self.recurrent_init(weight)
            elif "weight_ih" in name:
                if self.hidden_init is None:
                    nn.init.normal_(weight, 0, 0.01)
                else:
                    self.hidden_init(weight)
            else:
                weight.data.normal_(0, 0.01)
                # weight.data.uniform_(-stdv, stdv)
        self.check_bounds()

    def check_bounds(self):
        # clamp the magnitude of the recurrent weights while keeping their sign
        if self.hidden_min_abs:
            abs_kernel = torch.abs(
                self.weight_hh.data).clamp_(min=self.hidden_min_abs)
            self.weight_hh.data = torch.mul(
                torch.sign(self.weight_hh.data), abs_kernel)
        if self.hidden_max_abs:
            self.weight_hh.data = self.weight_hh.data.clamp(
                max=self.hidden_max_abs, min=-self.hidden_max_abs)

    def forward(self, input, hx):
        return self.activation(
            F.linear(input, self.weight_ih, self.bias_ih) +
            torch.mul(self.weight_hh, hx))
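
# A training sketch for IndRNNCell: unroll over time, and after each optimizer
# step re-apply the recurrent-weight constraint via check_bounds(). The sizes,
# sequence length, hidden_max_abs, and the dummy objective are illustrative.
def _demo_indrnn_step():
    import torch.optim as optim
    cell = IndRNNCell(10, 20, hidden_max_abs=2.0, gradient_clip=5)
    optimizer = optim.SGD(cell.parameters(), lr=0.01)

    inputs = Variable(torch.randn(6, 3, 10))   # (time, batch, input_size)
    hx = Variable(torch.zeros(3, 20))          # (batch, hidden_size)
    for t in range(inputs.size(0)):
        hx = cell(inputs[t], hx)

    loss = hx.pow(2).mean()                    # dummy objective
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    cell.check_bounds()                        # keep |weight_hh| within bounds
    return loss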