def __init__(self, kdim, qdim, adim, atype, n_heads, init_r,
             conv1d=False, conv_kernel_size=5, bias=True, param_init=''):
    """Energy function for the monotonic attention.

    Args:
        kdim (int): dimension of key
        qdim (int): dimension of query
        adim (int): dimension of attention space
        atype (str): type of attention mechanism
        n_heads (int): number of heads
        init_r (int): initial value for offset r
        conv1d (bool): use 1D causal convolution for energy calculation
        conv_kernel_size (int): kernel size for 1D convolution
        bias (bool): use bias term in linear layers
        param_init (str): parameter initialization method

    """
    super().__init__()

    assert conv_kernel_size % 2 == 1, "Kernel size should be odd for 'same' conv."

    self.key = None
    self.mask = None

    self.atype = atype
    assert adim % n_heads == 0
    self.d_k = adim // n_heads
    self.n_heads = n_heads
    self.scale = math.sqrt(adim)

    if atype == 'add':
        self.w_key = nn.Linear(kdim, adim)
        self.v = nn.Linear(adim, n_heads, bias=False)
        self.w_query = nn.Linear(qdim, adim, bias=False)
    elif atype == 'scaled_dot':
        self.w_key = nn.Linear(kdim, adim, bias=bias)
        self.w_query = nn.Linear(qdim, adim, bias=bias)
    else:
        raise NotImplementedError(atype)

    self.r = nn.Parameter(torch.Tensor([init_r]))
    logger.info('init_r is initialized with %d' % init_r)

    if atype == 'add':
        self.v = nn.utils.weight_norm(self.v, name='weight', dim=0)
        # initialization
        self.v.weight_g.data = torch.Tensor([1 / adim]).sqrt()
    elif atype == 'scaled_dot':
        # self.w_query = nn.utils.weight_norm(self.w_query, name='weight', dim=0)
        # initialization
        # self.w_query.weight_g.data = torch.Tensor([1 / adim]).sqrt()
        if param_init == 'xavier_uniform':
            self.reset_parameters(bias)
        # TODO: debug weight normalization

    self.conv1d = None
    if conv1d:
        self.conv1d = CausalConv1d(in_channels=kdim,
                                   out_channels=kdim,
                                   kernel_size=conv_kernel_size,
                                   stride=1)
def __init__(self, kdim, qdim, adim, atype, n_heads, init_r,
             bias=True, param_init='', conv1d=False, conv_kernel_size=5):
    super().__init__()

    assert conv_kernel_size % 2 == 1, "Kernel size should be odd for 'same' conv."

    self.key = None
    self.mask = None

    self.atype = atype
    assert adim % n_heads == 0
    self.d_k = adim // n_heads
    self.n_heads = n_heads
    self.scale = math.sqrt(adim)

    if atype == 'add':
        self.w_key = nn.Linear(kdim, adim)
        self.v = nn.Linear(adim, n_heads, bias=False)
        self.w_query = nn.Linear(qdim, adim, bias=False)
    elif atype == 'scaled_dot':
        self.w_key = nn.Linear(kdim, adim, bias=bias)
        self.w_query = nn.Linear(qdim, adim, bias=bias)
    else:
        raise NotImplementedError(atype)

    self.r = nn.Parameter(torch.Tensor([init_r]))
    logger.info('init_r is initialized with %d' % init_r)

    self.conv1d = None
    if conv1d:
        self.conv1d = CausalConv1d(in_channels=kdim,
                                   out_channels=kdim,
                                   kernel_size=conv_kernel_size,
                                   param_init=param_init)
        # padding=(conv_kernel_size - 1) // 2

    if atype == 'add':
        self.v = nn.utils.weight_norm(self.v, name='weight', dim=0)
        # initialization
        self.v.weight_g.data = torch.Tensor([1 / adim]).sqrt()
    elif atype == 'scaled_dot':
        if param_init == 'xavier_uniform':
            self.reset_parameters(bias)
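
# The two constructors above optionally wrap the keys with a CausalConv1d module
# whose definition is not included here. As a rough reference, a causal 1D
# convolution is commonly implemented by left-padding the sequence with
# (kernel_size - 1) frames so each output step depends only on current and past
# inputs. The sketch below illustrates that idea under this assumption; it is
# not the project's actual CausalConv1d (e.g. it ignores param_init).
import torch
import torch.nn as nn
import torch.nn.functional as F


class CausalConv1dSketch(nn.Module):
    """Minimal causal 1D convolution over [B, T, C] inputs (illustrative only)."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1):
        super().__init__()
        self.left_pad = kernel_size - 1
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride)

    def forward(self, xs):
        # xs: [B, T, in_channels] -> [B, T, out_channels]
        xs = xs.transpose(1, 2)             # [B, in_channels, T]
        xs = F.pad(xs, (self.left_pad, 0))  # pad on the left (past) side only
        xs = self.conv(xs)
        return xs.transpose(1, 2)


# Usage: CausalConv1dSketch(80, 80, kernel_size=5)(torch.randn(2, 100, 80))  # -> [2, 100, 80]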
def __init__(self, d_model, dropout, pe_type, param_init, max_len=5000,
             conv_kernel_size=3, layer_norm_eps=1e-12):
    """Positional encoding for Transformer.

    Args:
        d_model (int): dimension of the model (input/output)
        dropout (float): dropout probability
        pe_type (str): type of positional encoding
            ('none', a '1dconv*' variant, otherwise sinusoidal encodings)
        param_init (str): parameter initialization method
        max_len (int): maximum length of the pre-computed sinusoidal table
        conv_kernel_size (int): kernel size for 1D causal convolution
        layer_norm_eps (float): epsilon for layer normalization

    """
    super().__init__()

    self.d_model = d_model
    self.pe_type = pe_type
    self.scale = math.sqrt(self.d_model)

    if '1dconv' in pe_type:
        causal_conv1d = CausalConv1d(in_channels=d_model,
                                     out_channels=d_model,
                                     kernel_size=conv_kernel_size,
                                     param_init=param_init)
        layers = []
        nlayers = int(pe_type.replace('1dconv', '')[0])
        for _ in range(nlayers):
            layers.append(copy.deepcopy(causal_conv1d))
            layers.append(nn.LayerNorm(d_model, eps=layer_norm_eps))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(p=dropout))
        self.pe = nn.Sequential(*layers)
    elif pe_type != 'none':
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model, dtype=torch.float32)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # for batch dimension
        self.register_buffer('pe', pe)

    self.dropout = nn.Dropout(p=dropout)
    logger.info('Positional encoding: %s' % pe_type)
def __init__(self, d_model, dropout, pe_type, max_len=5000,
             conv_kernel_size=3, layer_norm_eps=1e-12):
    super(PositionalEncoding, self).__init__()

    self.d_model = d_model
    self.pe_type = pe_type

    if pe_type == '1dconv':
        causal_conv1d = CausalConv1d(in_channels=d_model,
                                     out_channels=d_model,
                                     kernel_size=conv_kernel_size,
                                     stride=1)
        # padding=(conv_kernel_size - 1) // 2
        self.pe = nn.Sequential(copy.deepcopy(causal_conv1d),
                                nn.LayerNorm(d_model, eps=layer_norm_eps),
                                nn.ReLU(),
                                nn.Dropout(p=dropout),
                                copy.deepcopy(causal_conv1d),
                                nn.LayerNorm(d_model, eps=layer_norm_eps),
                                nn.ReLU(),
                                nn.Dropout(p=dropout),
                                copy.deepcopy(causal_conv1d),
                                nn.LayerNorm(d_model, eps=layer_norm_eps),
                                nn.ReLU())
        self.dropout = nn.Dropout(p=dropout)  # for the last layer
    elif pe_type != 'none':
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model, dtype=torch.float32)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # for batch dimension
        self.register_buffer('pe', pe)
        self.dropout = nn.Dropout(p=dropout)

    logger.info('Positional encoding: %s' % pe_type)
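
# For reference, a minimal, self-contained sinusoidal positional-encoding module
# that pairs the 'pe' table computed in the constructors above with a typical
# forward() that scales the input and adds the encoding. The class name, the
# sqrt(d_model) scaling, and the forward() signature are assumptions for this
# sketch, not taken verbatim from the original classes; d_model is assumed even.
import math
import torch
import torch.nn as nn


class SinusoidalPESketch(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.scale = math.sqrt(d_model)
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model, dtype=torch.float32)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))  # [1, max_len, d_model]
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, xs):
        # xs: [B, T, d_model] -> [B, T, d_model]
        xs = xs * self.scale + self.pe[:, :xs.size(1)]
        return self.dropout(xs)


# Usage: SinusoidalPESketch(d_model=256)(torch.randn(4, 100, 256))  # -> [4, 100, 256]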
def __init__(self, kdim, qdim, adim, init_r=None,
             conv1d=False, conv_kernel_size=5):
    """Energy function.

    Args:
        kdim (int): dimension of key
        qdim (int): dimension of query
        adim (int): dimension of attention space
        init_r (int): initial value for offset r
        conv1d (bool): use 1D causal convolution for energy calculation
        conv_kernel_size (int): kernel size for 1D convolution

    """
    super().__init__()

    assert conv_kernel_size % 2 == 1, "Kernel size should be odd for 'same' conv."

    self.key = None
    self.mask = None

    self.w_key = nn.Linear(kdim, adim)
    self.w_query = nn.Linear(qdim, adim, bias=False)
    self.v = nn.Linear(adim, 1, bias=False)

    if init_r is not None:
        # for alpha
        self.r = nn.Parameter(torch.Tensor([init_r]))
        self.v = nn.utils.weight_norm(self.v, name='weight', dim=0)
        # initialization
        self.v.weight_g.data = torch.Tensor([1 / adim]).sqrt()
    else:
        # for beta
        self.r = None

    self.conv1d = None
    if conv1d:
        self.conv1d = CausalConv1d(in_channels=kdim,
                                   out_channels=kdim,
                                   kernel_size=conv_kernel_size,
                                   stride=1)
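
# A minimal, self-contained version of the additive energy that the constructor
# above sets up: e = v^T tanh(W_k key + W_q query) + r, with a weight-normalized
# v whose gain is initialized to sqrt(1 / adim). Class and variable names are
# hypothetical and the forward() shapes are assumptions; only the layer and
# initialization choices mirror the __init__ above.
import torch
import torch.nn as nn


class AdditiveEnergySketch(nn.Module):
    def __init__(self, kdim, qdim, adim, init_r):
        super().__init__()
        self.w_key = nn.Linear(kdim, adim)
        self.w_query = nn.Linear(qdim, adim, bias=False)
        self.v = nn.Linear(adim, 1, bias=False)
        self.r = nn.Parameter(torch.Tensor([init_r]))  # learnable offset added to all energies
        self.v = nn.utils.weight_norm(self.v, name='weight', dim=0)
        self.v.weight_g.data = torch.Tensor([1 / adim]).sqrt()

    def forward(self, key, query):
        # key: [B, klen, kdim], query: [B, 1, qdim] -> energies: [B, klen]
        e = self.v(torch.tanh(self.w_key(key) + self.w_query(query)))
        return e.squeeze(-1) + self.r


# Usage: AdditiveEnergySketch(256, 256, 256, init_r=-4)(torch.randn(2, 50, 256), torch.randn(2, 1, 256))  # -> [2, 50]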