import math

import torch
import torch.nn as nn
from torch.nn import Parameter


class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, bias=True, attention=False, is_cuda=False):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, bias=bias)
        self.attention = attention
        if attention:
            self.attention_linear = nn.Linear(2 * hidden_size, hidden_size, bias=bias)
            self.ws = Parameter(torch.randn(hidden_size, hidden_size))
            self.vs = Parameter(torch.randn(hidden_size, 1))
            self.previous_hidden = None
        self.device = "cuda:0" if is_cuda else "cpu"

    def detach_attention_params(self):
        # Detach ws/vs from the current graph in place while keeping them trainable.
        self.ws.detach_().requires_grad = True
        self.vs.detach_().requires_grad = True

    def __str__(self):
        return 'LSTM ({}, {} attention:{})'.format(self.input_size, self.hidden_size, self.attention)

    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, input, state):
        batch_size = input.size(0)
        # nn.LSTM expects (seq_len, batch, features); run a single time step.
        output, new_state = self.lstm(input.unsqueeze(0), state)
        if not self.attention:
            return output.squeeze(0), new_state
        if self.previous_hidden is None:
            self.previous_hidden = torch.zeros(
                (batch_size, 1, self.hidden_size), device=self.device)
        # decoder_function is assumed to be defined elsewhere; it combines the current
        # hidden state with the stored previous hidden states into a
        # (batch, 2 * hidden_size) tensor consumed by attention_linear.
        combined_hy = decoder_function(output.squeeze(0), self.previous_hidden, self.ws, self.vs)
        new_hy = self.attention_linear(combined_hy)
        self.previous_hidden = torch.cat(
            [self.previous_hidden, output.permute([1, 0, 2])], dim=1)
        return new_hy, new_state
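
# Minimal usage sketch for the module above (illustrative sizes, not from the original);
# it drives the non-attention path, stepping the cell over a short sequence.
cell = LSTM(input_size=16, hidden_size=32, attention=False)
batch = 4
state = (torch.zeros(1, batch, 32), torch.zeros(1, batch, 32))  # (h0, c0) for nn.LSTM
for _ in range(10):                # length-10 sequence, one time step per call
    x = torch.randn(batch, 16)     # features for a single step
    out, state = cell(x, state)
print(out.shape)                   # torch.Size([4, 32])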

import numpy as np


def _init_weight(out: nn.Parameter):
    """Identical to the XLM create_sinusoidal_embeddings except features are not interleaved.
    The cos features are in the 2nd half of the vector. [dim // 2:]
    """
    n_pos, dim = out.shape
    position_enc = np.array(
        [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
    )
    out[:, 0 : dim // 2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))  # This line breaks for odd n_pos
    out[:, dim // 2 :] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
    out.detach_()
    out.requires_grad = False
    return out

def _init_weight(out: nn.Parameter):
    n_pos, dim = out.shape
    position_enc = np.array(
        [
            [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
            for pos in range(n_pos)
        ]
    )
    out.requires_grad = False  # set early to avoid an error in pytorch-1.8+
    sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
    out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
    out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
    out.detach_()
    return out

def _init_weight(out: nn.Parameter):
    """
    Identical to the XLM create_sinusoidal_embeddings except features are not interleaved.
    The cos features are in the 2nd half of the vector. [dim // 2:]
    """
    n_pos, dim = out.shape
    position_enc = np.array(
        [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
    )
    out.requires_grad = False  # set early to avoid an error in pytorch-1.8+
    sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
    out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
    out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
    out.detach_()
    return out
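
# Quick hedged check of the fixed initializer above; the table sizes are illustrative
# (the odd embedding dimension exercises the sentinel branch).
pos_emb = nn.Embedding(50, 127)
_init_weight(pos_emb.weight)
print(pos_emb.weight.requires_grad)  # False -> the positional table is frozen
print(pos_emb.weight[:, :64].shape)  # sin half: torch.Size([50, 64]); cos half fills the rest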

from collections import OrderedDict


class GatedResidual(nn.Module):
    def __init__(self, layer, gate_init=0.0):
        super().__init__()
        self.layer = layer
        self.alpha = Parameter(torch.tensor([gate_init]))

    def forward(self, x):
        # Learned interpolation between the input and the wrapped sublayer's output.
        gate = torch.sigmoid(self.alpha)
        y = self.layer(x)
        return gate * x + (1 - gate) * y

    def json(self, params=False):
        res = OrderedDict([('type', "GatedResidual"),
                           ('sublayers', self.layer.json(params))])
        if params:
            # detach() (not detach_()) so serialization does not cut alpha out of the graph.
            res['params'] = OrderedDict([('alpha', float(self.alpha.detach().numpy()[0]))])
        return res
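
# Illustrative use of GatedResidual (the wrapped layer and shapes are assumptions,
# not from the original). With gate_init=0.0 the sigmoid gate starts at 0.5, so the
# output begins as an even mix of the input and the sublayer output; json() further
# assumes the wrapped layer exposes its own json() method.
block = GatedResidual(nn.Linear(64, 64), gate_init=0.0)
x = torch.randn(8, 64)
out = block(x)
print(out.shape)  # torch.Size([8, 64])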