Example #1
    def __init__(
        self,
        input_sz,
        output_sz,
        d_model,
        nhead,
        num_encoder_layers,
        num_decoder_layers,
        dim_feedforward,
        dropout,
    ):
        super(TransformerModel, self).__init__()
        self.transformer = Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=False,
        )
        self.softmax = nn.Softmax(dim=2)
        self.linear = nn.Linear(d_model, output_sz)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.pos_decoder = PositionalEncoding(d_model, dropout)
        self.src_embedding = Embeddings(input_sz, d_model)
        self.tgt_embedding = Embeddings(output_sz, d_model)
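Note: only the constructor is shown. The sketch below is a minimal, assumed forward() that combines these sub-modules in the usual way for a batch_first=False layout; it is not taken from the original repository, and the src/tgt argument names are assumptions.
    def forward(self, src, tgt):
        # Hedged sketch, not the original forward(): embed, add positional encoding,
        # run the transformer, project to the output size, then apply softmax.
        src = self.pos_encoder(self.src_embedding(src))  # (S, N, d_model), batch_first=False
        tgt = self.pos_decoder(self.tgt_embedding(tgt))  # (T, N, d_model)
        out = self.transformer(src, tgt)                 # (T, N, d_model)
        out = self.linear(out)                           # (T, N, output_sz)
        return self.softmax(out)                         # probabilities along dim=2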
Example #2
    def __init__(self, emb_sz, emb_dim, hidden_size, nfc, n_classes, num_layers=1):
        super(LSTMText, self).__init__()
        self.emb_sz = emb_sz
        self.emb_dim = emb_dim
        self.n_classes = n_classes
        self.hidden_size = hidden_size
        self.nfc = nfc
        self.num_layers = num_layers
        self.bilstm = BiLSTM(emb_dim, hidden_size, num_layers)
        self.embedding = nn.Embedding(self.emb_sz, self.emb_dim)
        self.linear = nn.Linear(hidden_size * 2 * nfc, n_classes)
        self.softmax = nn.Softmax(dim=1)
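Note: again only the constructor appears here. The sketch below is an assumed forward(), not the repository's code; in particular it assumes the BiLSTM output flattens to exactly the hidden_size * 2 * nfc features that self.linear expects.
    def forward(self, x):
        # Hedged sketch, not the repository's forward(): embed, run the BiLSTM,
        # flatten, then produce class probabilities with softmax over dim=1.
        emb = self.embedding(x)                   # (batch, seq_len, emb_dim)
        feats = self.bilstm(emb)                  # bidirectional features, 2 * hidden_size wide
        feats = feats.reshape(feats.size(0), -1)  # assumed to yield hidden_size * 2 * nfc
        logits = self.linear(feats)               # (batch, n_classes)
        return self.softmax(logits)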
Example #3
    def _attn(self, query, key, value):
        attn_weights = flow.matmul(query, key.transpose(-2, -1))

        if self.scale_attn_weights:
            attn_weights = attn_weights / (float(value.size(-1))**0.5)

        query_length, key_length = query.size(-2), key.size(-2)
        causal_mask = self.bias[:, :, key_length -
                                query_length:key_length, :key_length]
        attn_weights = flow.where(causal_mask, attn_weights,
                                  self.masked_bias.to(attn_weights.dtype))

        attn_weights = nn.Softmax(dim=-1)(attn_weights)
        attn_weights = self.attn_dropout(attn_weights)

        attn_output = flow.matmul(attn_weights, value)
        return attn_output, attn_weights
Example #4
    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # define a parameter table of relative position bias
        # Author zzk: we add trunc normal here!
        self.relative_position_bias_table = nn.Parameter(
            flow.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
                       num_heads))  # 2*Wh-1 * 2*Ww-1, nH
        self.relative_position_bias_table.trunc_normal_(std=0.02)

        # get pair-wise relative position index for each token inside the window
        coords_h = flow.arange(self.window_size[0])
        coords_w = flow.arange(self.window_size[1])
        coords = flow.stack(flow.meshgrid(*[coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = flow.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = (coords_flatten[:, :, None] -
                           coords_flatten[:, None, :])  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0)  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :,
                        0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index",
                             relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.softmax = nn.Softmax(dim=-1)
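Note: the forward pass that consumes these buffers is not shown. The sketch below reconstructs the common Swin-style window-attention forward (relative position bias added to the attention scores before softmax); it is an assumption, omits the shifted-window mask, and may differ from the repository's code.
    def forward(self, x):
        # Hedged sketch of the usual window-attention forward pass.
        # x: (num_windows * B, Wh*Ww, C); the shifted-window attention mask is omitted.
        B_, N, C = x.shape
        qkv = (self.qkv(x).reshape(B_, N, 3, self.num_heads,
                                   C // self.num_heads).permute(2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = flow.matmul(q * self.scale, k.transpose(-2, -1))

        # look up the bias for every query/key pair and add it before softmax
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.reshape(-1)].reshape(N, N, -1)
        relative_position_bias = relative_position_bias.permute(2, 0, 1)  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        attn = self.attn_drop(self.softmax(attn))
        out = flow.matmul(attn, v).transpose(1, 2).reshape(B_, N, C)
        return self.proj_drop(self.proj(out))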
Example #5
    def forward(self, x):
        b, n, _ = x.shape
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        q = self.transpose_for_scores(q)
        k = self.transpose_for_scores(k)
        v = self.transpose_for_scores(v)

        attn_weights = flow.matmul(q, k.transpose(-2, -1)) / self.scale
        attn_weights = nn.Softmax(dim=-1)(attn_weights)
        out = flow.matmul(attn_weights, v)
        out = out.permute(0, 2, 1, 3)
        new_out_shape = tuple(out.size()[:-2]) + (self.heads * self.head_dim, )
        out = out.view(*new_out_shape)
        out = self.out(out)

        return out
Example #6
    def forward(
        self,
        query: flow.Tensor,
        key: flow.Tensor,
        value: flow.Tensor,
        attn_mask: Optional[flow.Tensor] = None,
    ) -> Tuple[flow.Tensor, flow.Tensor]:
        r"""
        Args:
            query: [batch, num_attention_heads, len_query, dim_query]
            key: [batch, num_attention_heads, len_key, dim_key]
            value: [batch, num_attention_heads, len_value, dim_value]
            attn_mask: [batch, num_attention_heads, len_query, len_key]
        """
        attention = flow.matmul(query, key.transpose(-1, -2))
        attention = attention / math.sqrt(query.size(-1))
        if attn_mask is not None:
            attention = attention + attn_mask
        attention = nn.Softmax(dim=-1)(attention)
        attention = self.dropout(attention)
        context = flow.matmul(attention, value)
        return context, attention
Example #7
    def __init__(
        self, model, input_size, output_size, num_experts, noisy_gating=True, k=4
    ):
        super(MoE, self).__init__()
        self.noisy_gating = noisy_gating
        self.num_experts = num_experts
        self.output_size = output_size
        self.input_size = input_size
        self.k = k

        # instantiate experts: note that every list entry references the same
        # `model` instance, so the "experts" here share their parameters
        self.experts = nn.ModuleList([model for i in range(self.num_experts)])

        self.w_gate = nn.Parameter(
            flow.zeros(input_size, num_experts), requires_grad=True
        )
        self.w_noise = nn.Parameter(
            flow.zeros(input_size, num_experts), requires_grad=True
        )

        self.softplus = nn.Softplus()
        self.softmax = nn.Softmax(1)

        assert self.k <= self.num_experts
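Note: the gating logic itself is not shown. The hypothetical helper below (not part of the original class) only illustrates what nn.Softmax(1) normalizes over; the real implementation uses noisy top-k gating on top of this.
    def _plain_gates(self, x):
        # Hypothetical helper, not from the original class: softmax along dim=1
        # turns per-expert logits into a distribution across the experts.
        clean_logits = flow.matmul(x, self.w_gate)  # (batch_size, num_experts)
        return self.softmax(clean_logits)           # each row sums to 1 across experts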
Example #8
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # reuse cached k, v from the cross-attention states
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(
                self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(
                self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = flow.cat([past_key_value[0], key_layer], dim=2)
            value_layer = flow.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        if self.is_decoder:
            # If this is cross-attention, save a Tuple(flow.Tensor, flow.Tensor) of all
            # cross-attention key/value states; further calls to the cross-attention layer
            # can then reuse them (first "if" case above).
            # If this is uni-directional self-attention (decoder), save a Tuple(flow.Tensor,
            # flow.Tensor) of all previous decoder key/value states; further calls can concat
            # the previous key/value states to the current projected ones (third "elif" case).
            # For encoder bi-directional self-attention, `past_key_value` is always `None`.
            past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = flow.matmul(query_layer,
                                       key_layer.transpose(-1, -2))

        if (self.position_embedding_type == "relative_key"
                or self.position_embedding_type == "relative_key_query"):
            seq_length = hidden_states.size()[1]
            position_ids_l = flow.arange(seq_length,
                                         dtype=flow.int64,
                                         device=hidden_states.device).view(
                                             -1, 1)
            position_ids_r = flow.arange(seq_length,
                                         dtype=flow.int64,
                                         device=hidden_states.device).view(
                                             1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(
                distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(
                dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = position_scores(
                    query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = position_scores(
                    query_layer, positional_embedding)
                relative_position_scores_key = position_scores(
                    key_layer, positional_embedding)
                attention_scores = (attention_scores +
                                    relative_position_scores_query +
                                    relative_position_scores_key)
        attention_scores = attention_scores / math.sqrt(
            self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in BertModel's forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = flow.matmul(attention_probs, value_layer)

        # oneflow does not support contiguous()
        context_layer = context_layer.permute(0, 2, 1, 3)  # .contiguous()
        new_context_layer_shape = tuple(
            context_layer.size()[:-2]) + (self.all_head_size, )
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = ((context_layer, attention_probs) if output_attentions else
                   (context_layer, ))

        if self.is_decoder:
            outputs = outputs + (past_key_value, )
        return outputs
Example #9
    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)
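Note: only the constructor is shown. The sketch below is the conventional forward() for scaled dot-product attention, assuming 3-D (batch, seq_len, d_k) inputs so that nn.Softmax(dim=2) normalizes over key positions; the mask handling is an assumed convention, not necessarily the repository's code.
    def forward(self, q, k, v, mask=None):
        # Hedged sketch: scale the scores by the temperature, optionally mask,
        # normalize over the key positions, apply dropout, then weight the values.
        attn = flow.matmul(q, k.transpose(1, 2)) / self.temperature
        if mask is not None:
            attn = attn.masked_fill(mask, float("-inf"))  # assumed masking convention
        attn = self.dropout(self.softmax(attn))
        output = flow.matmul(attn, v)
        return output, attn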