def __init__(self, vocab_size, embedding_dim, cls_idx=0, sep_idx=1, unk_idx=2, pad_idx=3, mask_idx=4,
             dropout_rate=0.1, add_noise=False, noise_intensity=0.05):
    """
    :param vocab_size: total vocab size
    :param embedding_dim: embedding size of token embedding
    :param dropout_rate: dropout rate
    """
    super().__init__()
    # use the pad_idx argument instead of a hardcoded 3, so a non-default pad index is respected
    self.token = Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=pad_idx,
                           add_noise=add_noise, noise_intensity=noise_intensity)
    self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
    self.segment = Embedding(num_embeddings=3, embedding_dim=self.token.embedding_dim, padding_idx=0)
    self.cls_idx = cls_idx
    self.sep_idx = sep_idx
    self.pad_idx = pad_idx
    self.unk_idx = unk_idx
    self.mask_idx = mask_idx
    self.dropout_rate = dropout_rate
    self.dropout = Dropout(dropout_rate)
    self.norm = LayerNorm()
    self.embedding_dim = embedding_dim
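# A minimal sketch (not from the source) of the matching forward pass, assuming the
# usual BERT-style sum of token, segment, and position embeddings followed by
# LayerNorm and dropout; `segments_label` and its all-zeros default are assumptions.
def forward(self, x, segments_label=None):
    if segments_label is None:
        segments_label = x * 0  # treat every token as segment 0
    out = self.token(x) + self.segment(segments_label) + self.position(x)
    return self.dropout(self.norm(out))
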
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
    super().__init__()
    self.num_heads = num_heads
    head_dim = dim // num_heads
    # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
    self.scale = qk_scale or head_dim ** -0.5
    self.qkv = Dense(num_filters=dim * 3, use_bias=qkv_bias)
    self.attn_drop = Dropout(attn_drop)
    self.proj = Dense(num_filters=dim)
    self.proj_drop = Dropout(proj_drop)
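# A minimal sketch (not from the source) of scaled dot-product attention as this
# constructor sets it up, assuming PyTorch-style tensors (reshape/permute/@/softmax).
def forward(self, x):
    B, N, C = x.shape
    # project to q, k, v and split into heads: (3, B, num_heads, N, head_dim)
    qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
    q, k, v = qkv[0], qkv[1], qkv[2]
    attn = (q @ k.transpose(-2, -1)) * self.scale
    attn = self.attn_drop(attn.softmax(dim=-1))
    x = (attn @ v).transpose(1, 2).reshape(B, N, C)  # merge heads back into dim
    return self.proj_drop(self.proj(x))
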
def __init__(self, h, d_model, dropout_rate=0.1):
    super().__init__()
    assert d_model % h == 0
    # we assume d_v always equals d_k
    self.d_k = d_model // h
    self.h = h
    self.linear_layers = ModuleList([Dense(d_model) for _ in range(3)])
    self.output_linear = Dense(d_model)
    self.attention = Attention()
    self.dropout = Dropout(dropout_rate)
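# A minimal sketch (not from the source) of multi-head attention over h heads of
# width d_k, assuming PyTorch-style tensors and that Attention() returns a
# (context, attention_weights) pair.
def forward(self, query, key, value, mask=None):
    batch_size = query.size(0)
    # 1) linear projections, then split d_model into h heads of d_k each
    query, key, value = [layer(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
                         for layer, x in zip(self.linear_layers, (query, key, value))]
    # 2) scaled dot-product attention per head
    x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)
    # 3) concatenate heads and apply the output projection
    x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
    return self.output_linear(x)
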
def __init__(self, vocab_size, embed_size, dropout_rate=0.1):
    """
    :param vocab_size: total vocab size
    :param embed_size: embedding size of token embedding
    :param dropout_rate: dropout rate
    """
    super().__init__()
    self.token = Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
    self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
    self.segment = Embedding(num_embeddings=3, embedding_dim=self.token.embedding_dim)
    self.dropout = Dropout(dropout_rate)
    self.embed_size = embed_size
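# Hedged usage sketch: the class name `BERTEmbedding` and the sizes below are
# assumptions, not from the source.
# embedding = BERTEmbedding(vocab_size=30000, embed_size=768)
# Token ids of shape (batch, seq_len) map to embeddings of shape (batch, seq_len, 768).
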
def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout_rate=0.1):
    """
    :param hidden: hidden size of transformer
    :param attn_heads: number of heads in multi-head attention
    :param feed_forward_hidden: feed-forward hidden size, usually 4 * hidden
    :param dropout_rate: dropout rate
    """
    super().__init__()
    self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
    self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout_rate=dropout_rate)
    self.input_sublayer = SublayerConnection(size=hidden, dropout_rate=dropout_rate)
    self.output_sublayer = SublayerConnection(size=hidden, dropout_rate=dropout_rate)
    self.dropout = Dropout(dropout_rate=dropout_rate)
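# A minimal sketch (not from the source) of the block's forward pass: residual
# sublayers around self-attention and the feed-forward network, assuming
# SublayerConnection is called as sublayer(x, fn).
def forward(self, x, mask=None):
    x = self.input_sublayer(x, lambda _x: self.attention(_x, _x, _x, mask=mask))
    x = self.output_sublayer(x, self.feed_forward)
    return self.dropout(x)
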
def __init__(self, mode='trainable', drop_rate=0, use_cls_token=True):
    super().__init__()
    self.mode = mode
    self.use_cls_token = use_cls_token
    self.pos_drop = Dropout(dropout_rate=drop_rate)
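# A minimal sketch (not from the source) of how the 'trainable' mode is commonly
# realized: a learned (1, num_tokens, embed_dim) table added to the input.
# `Parameter` and `zeros` are assumptions about the framework, and `build` is a
# hypothetical hook that runs once the input shape is known.
def build(self, seq_len, embed_dim):
    num_tokens = seq_len + 1 if self.use_cls_token else seq_len
    self.pos_embed = Parameter(zeros(1, num_tokens, embed_dim))

def forward(self, x):
    if self.mode == 'trainable':
        x = x + self.pos_embed
    return self.pos_drop(x)
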
def __init__(self, d_model, d_ff, dropout_rate=0.1):
    super().__init__()
    # Dense (as used by the other layers here) instead of the stray nn.Linear,
    # so the whole module stays on one layer API
    self.w_1 = Dense(d_ff)
    self.w_2 = Dense(d_model)
    self.dropout = Dropout(dropout_rate)
    self.activation = Gelu()
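# A minimal sketch (not from the source) of the standard position-wise FFN,
# FFN(x) = W2(dropout(GELU(W1(x)))), applied independently at every position.
def forward(self, x):
    return self.w_2(self.dropout(self.activation(self.w_1(x))))
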
def __init__(self, size, dropout_rate=0.0):
    super().__init__()
    self.norm = LayerNorm(size)
    self.dropout = Dropout(dropout_rate)
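# A minimal sketch (not from the source) of the pre-norm residual wrapper,
# assuming `sublayer` is a callable such as attention or the feed-forward net.
def forward(self, x, sublayer):
    return x + self.dropout(sublayer(self.norm(x)))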