Example #1
    def __init__(self, kdim, qdim, adim, atype, n_heads, init_r,
                 conv1d=False, conv_kernel_size=5, bias=True, param_init=''):
        """Energy function for the monotonic attenion.

        Args:
            kdim (int): dimension of key
            qdim (int): dimension of query
            adim (int): dimension of attention space
            atype (str): type of attention mechanism
            n_heads (int): number of heads
            init_r (int): initial value for offset r
            conv1d (bool): use 1D causal convolution for energy calculation
            conv_kernel_size (int): kernel size for 1D convolution
            bias (bool): use bias term in linear layers
            param_init (str): parameter initialization method

        """
        super().__init__()

        assert conv_kernel_size % 2 == 1, "Kernel size should be odd for 'same' conv."
        self.key = None
        self.mask = None

        self.atype = atype
        assert adim % n_heads == 0
        self.d_k = adim // n_heads
        self.n_heads = n_heads
        self.scale = math.sqrt(adim)

        if atype == 'add':
            self.w_key = nn.Linear(kdim, adim)
            self.v = nn.Linear(adim, n_heads, bias=False)
            self.w_query = nn.Linear(qdim, adim, bias=False)
        elif atype == 'scaled_dot':
            self.w_key = nn.Linear(kdim, adim, bias=bias)
            self.w_query = nn.Linear(qdim, adim, bias=bias)
        else:
            raise NotImplementedError(atype)

        self.r = nn.Parameter(torch.Tensor([init_r]))
        logger.info('init_r is initialized with %d' % init_r)

        if atype == 'add':
            self.v = nn.utils.weight_norm(self.v, name='weight', dim=0)
            # initialization
            self.v.weight_g.data = torch.Tensor([1 / adim]).sqrt()
        elif atype == 'scaled_dot':
            # self.w_query = nn.utils.weight_norm(self.w_query, name='weight', dim=0)
            # initialization
            # self.w_query.weight_g.data = torch.Tensor([1 / adim]).sqrt()
            if param_init == 'xavier_uniform':
                self.reset_parameters(bias)
            # TODO: debug weight normalization

        self.conv1d = None
        if conv1d:
            self.conv1d = CausalConv1d(in_channels=kdim,
                                       out_channels=kdim,
                                       kernel_size=conv_kernel_size,
                                       stride=1)
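
The forward pass is not part of this excerpt. Below is a minimal sketch of how the layers defined above are typically combined, assuming the standard additive and scaled-dot formulations; the optional CausalConv1d pre-processing of the keys, the masking, and the key caching (self.key, self.mask) are omitted, so the real implementation may differ.

import torch

def monotonic_energy(m, key, query):
    # m: a module built by the constructor above
    # key: (B, klen, kdim), query: (B, qlen, qdim)
    k = m.w_key(key)      # (B, klen, adim)
    q = m.w_query(query)  # (B, qlen, adim)
    if m.atype == 'add':
        # additive energy: v^T tanh(W_k key + W_q query), one score per head
        e = m.v(torch.tanh(k.unsqueeze(1) + q.unsqueeze(2)))  # (B, qlen, klen, H)
        e = e.permute(0, 3, 1, 2)  # (B, H, qlen, klen)
    else:  # 'scaled_dot'
        bs, klen, qlen = k.size(0), k.size(1), q.size(1)
        k = k.view(bs, klen, m.n_heads, m.d_k).transpose(1, 2)
        q = q.view(bs, qlen, m.n_heads, m.d_k).transpose(1, 2)
        e = torch.matmul(q, k.transpose(2, 3)) / m.scale  # (B, H, qlen, klen)
    return e + m.r  # learnable offset r shifts all energies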
Example #2
    def __init__(self,
                 kdim,
                 qdim,
                 adim,
                 atype,
                 n_heads,
                 init_r,
                 bias=True,
                 param_init='',
                 conv1d=False,
                 conv_kernel_size=5):

        super().__init__()

        assert conv_kernel_size % 2 == 1, "Kernel size should be odd for 'same' conv."
        self.key = None
        self.mask = None

        self.atype = atype
        assert adim % n_heads == 0
        self.d_k = adim // n_heads
        self.n_heads = n_heads
        self.scale = math.sqrt(adim)

        if atype == 'add':
            self.w_key = nn.Linear(kdim, adim)
            self.v = nn.Linear(adim, n_heads, bias=False)
            self.w_query = nn.Linear(qdim, adim, bias=False)
        elif atype == 'scaled_dot':
            self.w_key = nn.Linear(kdim, adim, bias=bias)
            self.w_query = nn.Linear(qdim, adim, bias=bias)
        else:
            raise NotImplementedError(atype)

        self.r = nn.Parameter(torch.Tensor([init_r]))
        logger.info('init_r is initialized with %d' % init_r)

        self.conv1d = None
        if conv1d:
            self.conv1d = CausalConv1d(in_channels=kdim,
                                       out_channels=kdim,
                                       kernel_size=conv_kernel_size,
                                       param_init=param_init)
            # padding=(conv_kernel_size - 1) // 2

        if atype == 'add':
            self.v = nn.utils.weight_norm(self.v, name='weight', dim=0)
            # initialization
            self.v.weight_g.data = torch.Tensor([1 / adim]).sqrt()
        elif atype == 'scaled_dot':
            if param_init == 'xavier_uniform':
                self.reset_parameters(bias)
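
CausalConv1d itself is not reproduced on this page. A hypothetical stand-in is sketched below, assuming the usual recipe of padding only on the left by (kernel_size - 1) frames so that the output at time t never depends on future frames; the param_init argument accepted in this example is omitted.

import torch.nn as nn
import torch.nn.functional as F

class CausalConv1dSketch(nn.Module):
    # Hypothetical stand-in, not the project's actual CausalConv1d.
    def __init__(self, in_channels, out_channels, kernel_size, stride=1):
        super().__init__()
        self.left_pad = kernel_size - 1
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride)

    def forward(self, xs):
        # xs: (B, T, C); nn.Conv1d expects (B, C, T)
        xs = xs.transpose(1, 2)
        xs = F.pad(xs, (self.left_pad, 0))  # pad the past only
        xs = self.conv(xs)
        return xs.transpose(1, 2)  # back to (B, T, C)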
Example #3
    def __init__(self,
                 d_model,
                 dropout,
                 pe_type,
                 param_init,
                 max_len=5000,
                 conv_kernel_size=3,
                 layer_norm_eps=1e-12):
        """Positional encoding.

        Args:
            d_model (int): dimension of the model (input feature size)
            dropout (float): dropout probability
            pe_type (str): type of positional encoding ('none', a '1dconv' variant,
                or a fixed sinusoidal encoding)
            param_init (str): parameter initialization method
            max_len (int): maximum sequence length for the precomputed table
            conv_kernel_size (int): kernel size for the 1D convolution variant
            layer_norm_eps (float): epsilon for layer normalization

        """

        super().__init__()

        self.d_model = d_model
        self.pe_type = pe_type
        self.scale = math.sqrt(self.d_model)

        if '1dconv' in pe_type:
            causal_conv1d = CausalConv1d(in_channels=d_model,
                                         out_channels=d_model,
                                         kernel_size=conv_kernel_size,
                                         param_init=param_init)
            layers = []
            nlayers = int(pe_type.replace('1dconv', '')[0])
            for _ in range(nlayers):
                layers.append(copy.deepcopy(causal_conv1d))
                layers.append(nn.LayerNorm(d_model, eps=layer_norm_eps))
                layers.append(nn.ReLU())
                layers.append(nn.Dropout(p=dropout))
            self.pe = nn.Sequential(*layers)

        elif pe_type != 'none':
            # Compute the positional encodings once in log space.
            pe = torch.zeros(max_len, d_model, dtype=torch.float32)
            position = torch.arange(0, max_len,
                                    dtype=torch.float32).unsqueeze(1)
            div_term = torch.exp(
                torch.arange(0, d_model, 2).float() *
                -(math.log(10000.0) / d_model))
            pe[:, 0::2] = torch.sin(position * div_term)
            pe[:, 1::2] = torch.cos(position * div_term)
            pe = pe.unsqueeze(0)  # for batch dimension
            self.register_buffer('pe', pe)

        self.dropout = nn.Dropout(p=dropout)

        logger.info('Positional encoding: %s' % pe_type)
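
For the sinusoidal branch, the forward pass is not shown here. A minimal sketch of how the registered pe buffer is usually applied, assuming the inputs are scaled by sqrt(d_model) (the self.scale defined above) before the encodings are added:

def add_positional_encoding(m, xs):
    # m: a module built by the constructor above (pe_type neither '1dconv*' nor 'none')
    # xs: (B, T, d_model)
    T = xs.size(1)
    xs = xs * m.scale       # scale inputs by sqrt(d_model)
    xs = xs + m.pe[:, :T]   # add the first T precomputed positional vectors
    return m.dropout(xs)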
Example #4
    def __init__(self,
                 d_model,
                 dropout,
                 pe_type,
                 max_len=5000,
                 conv_kernel_size=3,
                 layer_norm_eps=1e-12):
        super(PositionalEncoding, self).__init__()

        self.d_model = d_model
        self.pe_type = pe_type

        if pe_type == '1dconv':
            causal_conv1d = CausalConv1d(in_channels=d_model,
                                         out_channels=d_model,
                                         kernel_size=conv_kernel_size,
                                         stride=1)
            # padding=(conv_kernel_size - 1) // 2
            self.pe = nn.Sequential(copy.deepcopy(causal_conv1d),
                                    nn.LayerNorm(d_model, eps=layer_norm_eps),
                                    nn.ReLU(), nn.Dropout(p=dropout),
                                    copy.deepcopy(causal_conv1d),
                                    nn.LayerNorm(d_model, eps=layer_norm_eps),
                                    nn.ReLU(), nn.Dropout(p=dropout),
                                    copy.deepcopy(causal_conv1d),
                                    nn.LayerNorm(d_model, eps=layer_norm_eps),
                                    nn.ReLU())
            self.dropout = nn.Dropout(p=dropout)  # for the last layer
        elif pe_type != 'none':
            # Compute the positional encodings once in log space.
            pe = torch.zeros(max_len, d_model, dtype=torch.float32)
            position = torch.arange(0, max_len,
                                    dtype=torch.float32).unsqueeze(1)
            div_term = torch.exp(
                torch.arange(0, d_model, 2).float() *
                -(math.log(10000.0) / d_model))
            pe[:, 0::2] = torch.sin(position * div_term)
            pe[:, 1::2] = torch.cos(position * div_term)
            pe = pe.unsqueeze(0)  # for batch dimension
            self.register_buffer('pe', pe)
            self.dropout = nn.Dropout(p=dropout)

        logger.info('Positional encoding: %s' % pe_type)
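
The '1dconv' branch stacks copies built with copy.deepcopy so that each convolution gets its own weights rather than all layers sharing one module. A small check illustrating that behaviour, using a plain nn.Conv1d as a stand-in:

import copy
import torch.nn as nn

conv = nn.Conv1d(8, 8, kernel_size=3)
clone = copy.deepcopy(conv)

# The clone starts with identical values but owns separate parameters,
# so the two layers are updated independently during training.
assert clone.weight is not conv.weight
assert bool((clone.weight == conv.weight).all())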
Example #5
    def __init__(self,
                 kdim,
                 qdim,
                 adim,
                 init_r=None,
                 conv1d=False,
                 conv_kernel_size=5):
        """Energy function.

        Args:
            kdim (int): dimension of key
            qdim (int): dimension of query
            adim (int): dimension of attention space
            init_r (int): initial value for offset r
            conv1d (bool): use 1D causal convolution for energy calculation
            conv_kernel_size (int): kernel size for 1D convolution

        """
        super().__init__()

        assert conv_kernel_size % 2 == 1, "Kernel size should be odd for 'same' conv."
        self.key = None
        self.mask = None

        self.w_key = nn.Linear(kdim, adim)
        self.w_query = nn.Linear(qdim, adim, bias=False)
        self.v = nn.Linear(adim, 1, bias=False)
        if init_r is not None:
            # for alpha
            self.r = nn.Parameter(torch.Tensor([init_r]))
            self.v = nn.utils.weight_norm(self.v, name='weight', dim=0)
            # initialization
            self.v.weight_g.data = torch.Tensor([1 / adim]).sqrt()
        else:
            # for beta
            self.r = None

        self.conv1d = None
        if conv1d:
            self.conv1d = CausalConv1d(in_channels=kdim,
                                       out_channels=kdim,
                                       kernel_size=conv_kernel_size,
                                       stride=1)
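
The learnable offset r and the weight-normalized v initialized to sqrt(1/adim) follow the monotonic attention recipe (Raffel et al., 2017), in which the 'alpha' energy is turned into a per-frame selection probability with a sigmoid, while the 'beta' (chunkwise) energy is normalized with a softmax. A minimal sketch of the 'alpha' step, assuming Gaussian noise is added during training to push the sigmoid towards 0/1:

import torch

def selection_prob(energy, noise_std=1.0, training=True):
    # energy: output of the energy function above, already shifted by r
    if training:
        energy = energy + noise_std * torch.randn_like(energy)
    return torch.sigmoid(energy)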