def __init__(self, head_count, model_dim, p=0.1):
    """
    Args:
        head_count(int): number of parallel heads.
        model_dim(int): the dimension of keys/values/queries in this
            MultiHeadedAttention, must be divisible by head_count.
    """
    assert model_dim % head_count == 0
    self.dim_per_head = model_dim // head_count
    self.model_dim = model_dim

    super(MultiHeadedAttention, self).__init__()
    self.head_count = head_count

    self.linear_keys = BottleLinear(model_dim,
                                    head_count * self.dim_per_head,
                                    bias=False)
    self.linear_values = BottleLinear(model_dim,
                                      head_count * self.dim_per_head,
                                      bias=False)
    self.linear_query = BottleLinear(model_dim,
                                     head_count * self.dim_per_head,
                                     bias=False)
    self.sm = BottleSoftmax()
    self.activation = nn.ReLU()
    self.layer_norm = BottleLayerNorm(model_dim)
    self.dropout = nn.Dropout(p)
    self.res_dropout = nn.Dropout(p)
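# --- Usage sketch (added for illustration; not part of the original source) ---
# Assumes this __init__ belongs to an OpenNMT-py style
# `class MultiHeadedAttention(nn.Module)` and that the Bottle* helpers
# (BottleLinear, BottleSoftmax, BottleLayerNorm) are importable from the
# same code base; the parameter values below are made up.
attn = MultiHeadedAttention(head_count=8, model_dim=512, p=0.1)
# model_dim must be divisible by head_count: here each of the 8 heads
# works on 512 // 8 = 64-dimensional key/query/value slices.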
def __init__(self, dim, coverage=False, attn_type="dotprod"):
    super(GlobalAttention, self).__init__()

    self.dim = dim
    self.attn_type = attn_type
    assert (self.attn_type in ["dotprod", "mlp"]), (
        "Please select a valid attention type.")

    if self.attn_type == "dotprod":
        # Bilinear (dot-product style) score: the query is projected once
        # and compared with the context by a dot product.
        self.linear_in = nn.Linear(dim, dim, bias=False)
        self.linear_out = nn.Linear(dim * 2, dim, bias=False)
    elif self.attn_type == "mlp":
        # Additive (MLP style) score: context and query are projected,
        # combined through a tanh and reduced to a scalar by self.v.
        self.linear_context = BottleLinear(dim, dim, bias=False)
        self.linear_query = nn.Linear(dim, dim, bias=True)
        self.mlp_tanh = nn.Tanh()
        self.v = BottleLinear(dim, 1, bias=False)
        self.linear_out = nn.Linear(dim * 2, dim, bias=True)

    self.sm = nn.Softmax()
    self.tanh = nn.Tanh()
    self.mask = None

    if coverage:
        # Extra projection used when coverage attention is enabled.
        self.linear_cover = nn.Linear(1, dim, bias=False)
def __init__(self, n_head, d_model, p=0.1):
    # Dimension handled by each head; d_model is assumed to be divisible
    # by n_head.
    self.d_k = d_model // n_head
    super(MultiHeadedAttention, self).__init__()
    heads = self.heads = n_head

    # Per-head projections for keys, values and queries.
    self.linear_keys = BottleLinear(d_model, heads * self.d_k, bias=False)
    self.linear_values = BottleLinear(d_model, heads * self.d_k, bias=False)
    self.linear_query = BottleLinear(d_model, heads * self.d_k, bias=False)
    self.sm = BottleSoftmax()
    self.activation = nn.ReLU()
    self.layer_norm = BottleLayerNorm(d_model)
    self.dropout = nn.Dropout(p)
    self.res_dropout = nn.Dropout(p)
def __init__(self, dim, coverage=False, attn_type="dotprod",
             attn_transform="softmax", c_attn=0.0):
    super(GlobalAttention, self).__init__()

    self.dim = dim
    self.attn_type = attn_type
    assert (self.attn_type in ["dotprod", "mlp"]), (
        "Please select a valid attention type.")

    if self.attn_type == "dotprod":
        self.linear_in = nn.Linear(dim, dim, bias=False)
        self.linear_out = nn.Linear(dim * 2, dim, bias=False)
    elif self.attn_type == "mlp":
        self.linear_context = BottleLinear(dim, dim, bias=False)
        self.linear_query = nn.Linear(dim, dim, bias=False)
        self.v = BottleLinear(dim, 1, bias=False)
        # Modify initialization of self.v to have high variance
        # self.v.weight.data.normal_(0, 1000)

    # Transform that maps attention scores to an attention distribution.
    if attn_transform == 'softmax':
        self.sm = nn.Softmax()
    elif attn_transform == 'sparsemax':
        self.sm = Sparsemax()
    elif attn_transform == 'constrained_softmax':
        self.sm = ConstrainedSoftmax()
    elif attn_transform == 'constrained_sparsemax':
        self.sm = ConstrainedSparsemax()
    else:
        raise NotImplementedError
    self.attn_transform = attn_transform

    self.tanh = nn.Tanh()
    self.mask = None
    self.c_attn = c_attn

    if coverage:
        self.linear_cover = nn.Linear(1, dim, bias=False)
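# --- Usage sketch (added for illustration; not part of the original source) ---
# Assumes this __init__ belongs to a `class GlobalAttention(nn.Module)` and
# that Sparsemax, ConstrainedSoftmax and ConstrainedSparsemax are available
# in the same code base; the parameter values below are made up.
attn = GlobalAttention(dim=500, coverage=True, attn_type="mlp",
                       attn_transform="constrained_sparsemax", c_attn=0.2)
# Any attn_transform outside the four listed branches raises NotImplementedError.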