Example #1
    @classmethod
    def from_options(cls, dim, attn_type="dot", attn_func="softmax"):
        str2score = {"dot": DotScorer(), "general": GeneralScorer(dim)}
        str2func = {
            "softmax": nn.Softmax(dim=-1),
            "sparsemax": Sparsemax(dim=-1),
            "fusedmax": Fusedmax(),
            "oscarmax": Oscarmax()
        }
        score = str2score[attn_type]
        transform = str2func[attn_func]

        return cls(score, transform)
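For reference, a minimal usage sketch of this factory, assuming it is a classmethod of the AttentionHead class that Example #2 calls; the option strings simply select a scoring module and a normalizing transform, and the argument values here are illustrative:

# Hypothetical call; AttentionHead, the scorers, and the sparse normalizers
# are defined elsewhere in this project and must be in scope.
head = AttentionHead.from_options(dim=512, attn_type="general", attn_func="sparsemax")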
Example #2
    @classmethod
    def from_options(cls, dim, attn_type="dot",
                     attn_func="softmax", gate_func="softmax"):
        lemma_attn = AttentionHead.from_options(
            dim, attn_type=attn_type, attn_func=attn_func)
        inflection_attn = AttentionHead.from_options(
            dim, attn_type=attn_type, attn_func=attn_func)
        attn_output_layer = nn.Sequential(
            nn.Linear(dim * 2, dim, bias=False), nn.Tanh()
        )
        str2func = {
            "softmax": nn.Softmax(dim=-1), "sparsemax": Sparsemax(dim=-1)
        }
        gate_transform = str2func[gate_func]

        # try it with bias?
        gate = nn.Sequential(nn.Linear(dim * 3, 2, bias=True), gate_transform)
        return cls(lemma_attn, inflection_attn, attn_output_layer, gate)
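The gate above sees the concatenation of the two head outputs and the decoder state, hence the dim * 3 input width, and emits two mixing weights. A minimal shape check, assuming the softmax gate variant and batch size 1; the tensor names are illustrative stand-ins:

import torch
import torch.nn as nn

dim = 4
# Rebuild the gate the same way as above (softmax variant) just to inspect shapes.
gate = nn.Sequential(nn.Linear(dim * 3, 2, bias=True), nn.Softmax(dim=-1))

lemma_context = torch.randn(1, dim)    # output of the lemma attention head
infl_context = torch.randn(1, dim)     # output of the inflection attention head
decoder_state = torch.randn(1, dim)    # query driving both heads

weights = gate(torch.cat([lemma_context, infl_context, decoder_state], dim=-1))
print(weights.shape)  # torch.Size([1, 2]); each row sums to 1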
Example #3
    @classmethod
    def from_options(cls,
                     dim,
                     attn_type="dot",
                     attn_func="softmax",
                     gate_func="softmax",
                     combine_gate_input=False,
                     n_global_heads=1,
                     infl_attn_func=None):
        lemma_attn = AttentionHead.from_options(dim,
                                                attn_type=attn_type,
                                                attn_func=attn_func)
        if infl_attn_func is None:
            infl_attn_func = attn_func
        inflection_attn = AttentionHead.from_options(dim,
                                                     attn_type=attn_type,
                                                     attn_func=infl_attn_func)
        lemma_out = nn.Sequential(nn.Linear(dim * 2, dim, bias=False),
                                  nn.Tanh())
        infl_out = nn.Sequential(nn.Linear(dim * 2, dim, bias=False),
                                 nn.Tanh())
        str2func = {
            "softmax": nn.Softmax(dim=-1),
            "sparsemax": Sparsemax(dim=-1)
        }
        gate_transform = str2func[gate_func]

        # try it with bias?
        if combine_gate_input:
            # input is global head (1 or more), two local heads and query (decoder state)
            gate = nn.Sequential(
                nn.Linear(dim * (n_global_heads + 3), 2, bias=True),
                gate_transform)
        else:
            # input is global head (1 or more) and query (decoder state)
            gate = nn.Sequential(
                nn.Linear(dim * (n_global_heads + 1), 2, bias=True),
                gate_transform)

        return cls(lemma_attn, inflection_attn, lemma_out, infl_out, gate,
                   combine_gate_input)
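The only structural difference from Example #2 is the gate's input width: with combine_gate_input=True the gate sees the global head output(s), both local heads, and the query, otherwise only the global head output(s) and the query. A quick check of the two widths, assuming dim=512 and a single global head:

dim, n_global_heads = 512, 1
combined_width = dim * (n_global_heads + 3)  # global head + two local heads + query -> 2048
plain_width = dim * (n_global_heads + 1)     # global head + query -> 1024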
Example #4
    def __init__(
        self,
        head_count,
        model_dim,
        dropout=0.1,
        attn_func="softmax",
        attn_alpha=None,
        attn_bisect_iter=0,
    ):
        assert model_dim % head_count == 0
        self.dim_per_head = model_dim // head_count
        self.model_dim = model_dim

        super(MultiHeadedAttention, self).__init__()
        self.head_count = head_count

        self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head)
        self.linear_values = nn.Linear(model_dim, head_count * self.dim_per_head)
        self.linear_query = nn.Linear(model_dim, head_count * self.dim_per_head)

        if attn_func == "softmax":
            self.normalization = nn.Softmax(dim=-1)
        elif attn_func == "esoftmax":
            self.normalization = ESoftmax(dim=-1)
        elif attn_func == "sparsemax":
            self.normalization = Sparsemax(dim=-1)
        elif attn_func == "tsallis15":
            self.normalization = Tsallis15(dim=-1)
        elif attn_func == "tsallis":
            self.normalization = TsallisBisect(
                alpha=attn_alpha, n_iter=attn_bisect_iter
            )
        else:
            raise ValueError(f"Unsupported attention function: {attn_func}")

        self.dropout = nn.Dropout(dropout)
        self.final_linear = nn.Linear(model_dim, model_dim)
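A minimal construction sketch, assuming the full MultiHeadedAttention class (only its __init__ is shown above) and the sparse activation modules are importable from the project; it only exercises the attn_func dispatch and the per-head size computation:

# Hypothetical instantiation with illustrative sizes.
attn = MultiHeadedAttention(head_count=8, model_dim=512,
                            dropout=0.1, attn_func="sparsemax")
print(attn.dim_per_head)  # 64: model_dim is split evenly across the 8 heads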
Example #5
import pytest
import torch

from onmt.modules.sparse_activations import (
    Sparsemax,
    Tsallis15,
    SparsemaxTopK,
    Tsallis15TopK,
)

from onmt.modules.root_finding import (
    sparsemax_bisect,
    tsallis_bisect,
)

funcs = [
    Sparsemax(dim=1),
    Tsallis15(dim=1),
    SparsemaxTopK(dim=1),
    Tsallis15TopK(dim=1),
    sparsemax_bisect,
    tsallis_bisect,
]
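
# All entries in `funcs` map real-valued scores to a probability distribution
# along the chosen dimension; the sparse variants can assign exact zeros to
# low scores. Illustrative sketch (not part of the original test file), using
# the Sparsemax import above:
_scores = torch.tensor([[2.0, 1.0, -1.0]])
_probs = Sparsemax(dim=1)(_scores)
# _probs sums to 1 along dim=1 and here zeroes out the two lower scores.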


@pytest.mark.parametrize('func', funcs)
@pytest.mark.parametrize('dtype', (torch.float32, torch.float64))
def test_mask(func, dtype):
    torch.manual_seed(42)
    x = torch.randn(2, 6, dtype=dtype)
    x[:, 3:] = -float('inf')
    x0 = x[:, :3]