Example #1
 def __init__(self,
              vocab_size,
              embedding_dim,
              cls_idx=0,
              sep_idx=1,
              unk_idx=2,
              pad_idx=3,
              mask_idx=4,
              dropout_rate=0.1,
              add_noise=False,
              noise_intensity=0.05):
     """
     :param vocab_size: total vocab size
     :param embed_size: embedding size of token embedding
     :param dropout: dropout rate
     """
     super().__init__()
     self.token = Embedding(num_embeddings=vocab_size,
                            embedding_dim=embedding_dim,
                            padding_idx=pad_idx,
                            add_noise=add_noise,
                            noise_intensity=noise_intensity)
     self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
     self.segment = Embedding(num_embeddings=3,
                              embedding_dim=self.token.embedding_dim,
                              padding_idx=0)
     self.cls_idx = cls_idx
     self.sep_idx = sep_idx
     self.pad_idx = pad_idx
     self.unk_idx = unk_idx
     self.mask_idx = mask_idx
     self.dropout_rate = dropout_rate
     self.dropout = Dropout(dropout_rate)
     self.norm = LayerNorm()
     self.embedding_dim = embedding_dim
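
A minimal forward sketch for this module, assuming the standard BERT embedding recipe (sum of token, positional, and segment embeddings, followed by LayerNorm and dropout); the forward signature and the segment_label argument are assumptions, not part of the original snippet:

 def forward(self, sequence, segment_label):
     # Sum the three embeddings, then normalize and apply dropout,
     # as in the standard BERT embedding layer.
     x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
     return self.dropout(self.norm(x))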
Example #2
    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE: the scale factor was wrong in my original version; it can be set manually to stay compatible with previous weights.
        self.scale = qk_scale or head_dim**-0.5

        self.qkv = Dense(num_filters=dim * 3, use_bias=qkv_bias)
        self.attn_drop = Dropout(attn_drop)
        self.proj = Dense(num_filters=dim)
        self.proj_drop = Dropout(proj_drop)
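
This constructor follows the timm-style vision transformer attention block. Below is a sketch of the usual forward pass, assuming PyTorch-style tensors of shape (batch, tokens, dim); the reshape/permute/softmax calls are assumptions about the surrounding tensor API:

    def forward(self, x):
        B, N, C = x.shape
        # Project to queries, keys, and values in one pass, then split heads:
        # (B, N, 3*C) -> (3, B, num_heads, N, head_dim).
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        # Scaled dot-product attention.
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = self.attn_drop(attn.softmax(dim=-1))
        # Merge heads and apply the output projection.
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(x))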
Example #3
    def __init__(self, h, d_model, dropout_rate=0.1):
        super().__init__()
        assert d_model % h == 0

        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h

        self.linear_layers = ModuleList([Dense(d_model) for _ in range(3)])
        self.output_linear = Dense(d_model)
        self.attention = Attention()

        self.dropout = Dropout(dropout_rate)
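
A forward sketch following the standard multi-head attention recipe, assuming the Attention helper implements scaled dot-product attention and returns an (output, weights) pair:

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        # 1) Apply the three projections and split into h heads of width d_k.
        query, key, value = [
            linear(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
            for linear, x in zip(self.linear_layers, (query, key, value))
        ]
        # 2) Run attention over all heads in parallel.
        x, _ = self.attention(query, key, value, mask=mask, dropout=self.dropout)
        # 3) Concatenate the heads and apply the output projection.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.output_linear(x)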
Example #4
 def __init__(self, vocab_size, embed_size, dropout_rate=0.1):
     """
     :param vocab_size: total vocab size
     :param embed_size: embedding size of token embedding
     :param dropout: dropout rate
     """
     super().__init__()
     self.token = Embedding(num_embeddings=vocab_size,
                            embedding_dim=embed_size)
     self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
     self.segment = Embedding(num_embeddings=3,
                              embedding_dim=self.token.embedding_dim)
     self.dropout = Dropout(dropout_rate)
     self.embed_size = embed_size
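
A hypothetical instantiation; the class name BERTEmbedding and the sizes are made up for illustration (768 is the BERT-base hidden size):

 embedding = BERTEmbedding(vocab_size=30000, embed_size=768, dropout_rate=0.1)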
Example #5
    def __init__(self,
                 hidden,
                 attn_heads,
                 feed_forward_hidden,
                 dropout_rate=0.1):
        """
        :param hidden: hidden size of transformer
        :param attn_heads: head sizes of multi-head attention
        :param feed_forward_hidden: feed_forward_hidden, usually 4*hidden_size
        :param dropout: dropout rate
        """

        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden,
                                                    d_ff=feed_forward_hidden,
                                                    dropout_rate=dropout_rate)
        self.input_sublayer = SublayerConnection(size=hidden,
                                                 dropout_rate=dropout_rate)
        self.output_sublayer = SublayerConnection(size=hidden,
                                                  dropout_rate=dropout_rate)
        self.dropout = Dropout(dropout_rate=dropout_rate)
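
A forward sketch for this transformer block, assuming SublayerConnection takes (x, sublayer_callable) and applies a pre-norm residual connection as in Example #8 below:

    def forward(self, x, mask):
        # Self-attention sublayer with residual connection and norm.
        x = self.input_sublayer(x, lambda _x: self.attention(_x, _x, _x, mask=mask))
        # Position-wise feed-forward sublayer.
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)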
Example #6
 def __init__(self, mode='trainable', drop_rate=0, use_cls_token=True):
     super().__init__()
     self.mode = mode
     self.use_cls_token = use_cls_token
     self.pos_drop = Dropout(dropout_rate=drop_rate)
Example #7
 def __init__(self, d_model, d_ff, dropout_rate=0.1):
     super().__init__()
     self.w_1 = nn.Linear(d_model, d_ff)
     self.w_2 = nn.Linear(d_ff, d_model)
     self.dropout = Dropout(dropout_rate)
     self.activation = Gelu()
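
The matching forward pass is a single composition, FFN(x) = w_2(dropout(GELU(w_1(x)))); a minimal sketch:

 def forward(self, x):
     # Expand to d_ff, apply GELU and dropout, then project back to d_model.
     return self.w_2(self.dropout(self.activation(self.w_1(x))))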
Example #8
 def __init__(self, size, dropout_rate=0.0):
     super().__init__()
     self.norm = LayerNorm(size)
     self.dropout = Dropout(dropout_rate)
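
A forward sketch for this pre-norm residual wrapper, assuming sublayer is any callable module (e.g. attention or feed-forward):

 def forward(self, x, sublayer):
     # Pre-norm residual: x + dropout(sublayer(LayerNorm(x))).
     return x + self.dropout(sublayer(self.norm(x)))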