def __init__(self, d_model, n_heads, dim_feedforward=2048, dropout=0.1,
             activation="relu", layernorm=True, attn='standard', dfa='none'):
    super().__init__()
    # Select the self-attention variant
    if attn == 'standard':
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
    elif attn == 'fixed':
        self.self_attn = attention.FixedAttention(d_model, language_model=True)
    elif attn in ['dense', 'random']:
        self.self_attn = attention.SynthesizerAttention(d_model, n_heads,
                                                        synth=attn, dropout=dropout)
    else:
        raise ValueError(
            "attn must be in ['standard', 'fixed', 'dense', 'random']")

    # Implementation of Feedforward model
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.dropout = nn.Dropout(dropout)
    self.linear2 = nn.Linear(dim_feedforward, d_model)

    if layernorm:
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
    else:
        self.norm1 = nn.Identity()
        self.norm2 = nn.Identity()
        print("WARNING: layer normalization is deactivated")

    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.activation = _get_activation_fn(activation)

    # Optional DFA layers: 'none' keeps the list empty, 'simple' attaches one
    # DFALayer, 'full' attaches three.
    self.dfas = []
    if dfa == 'simple':
        self.dfas = [DFALayer()]
    elif dfa == 'full':
        self.dfas = [DFALayer(), DFALayer(), DFALayer()]
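The constructor relies on a `_get_activation_fn` helper that is not shown in this excerpt. If the surrounding module does not define its own, a minimal sketch compatible with the call above, mirroring the helper of the same name in `torch.nn.modules.transformer`, could look like this (an assumption, not the author's code):

import torch.nn.functional as F

def _get_activation_fn(activation):
    # Minimal sketch of the string-to-function helper used above; mirrors the
    # behaviour of torch.nn.modules.transformer._get_activation_fn.
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    raise RuntimeError(f"activation should be relu/gelu, not {activation}")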
def __init__(self, config):
    super().__init__()
    self.ln1 = nn.LayerNorm(config.n_embd)
    self.ln2 = nn.LayerNorm(config.n_embd)
    if config.additive:
        self.attn = attention.AdditiveSelfAttention(config)
    elif config.synthesizer:
        self.attn = attention.SynthesizerAttention(config)
    else:
        self.attn = attention.CausalSelfAttention(config)
    self.mlp = nn.Sequential(
        nn.Linear(config.n_embd, 4 * config.n_embd),
        nn.GELU(),
        nn.Linear(4 * config.n_embd, config.n_embd),
        nn.Dropout(config.resid_pdrop),
    )
def __init__(self, config, type):
    super().__init__()
    self.ln1 = nn.LayerNorm(config.n_embd)
    self.ln2 = nn.LayerNorm(config.n_embd)
    if config.additive:
        self.attn = attention.AdditiveSelfAttention(config)
    else:
        if type == "vanilla":
            self.attn = attention.CausalSelfAttention(config)
        else:
            self.attn = attention.SynthesizerAttention(config)
    print("Attention Block is initialized as {} type".format(type))
    self.mlp = nn.Sequential(
        nn.Linear(config.n_embd, 4 * config.n_embd),
        nn.GELU(),
        nn.Linear(4 * config.n_embd, config.n_embd),
        nn.Dropout(config.resid_pdrop),
    )
def __init__(self, config: GPTConfig):
    super().__init__()
    self.ln1 = nn.LayerNorm(config.n_embd)
    self.ln2 = nn.LayerNorm(config.n_embd)
    if config.attention_mode == attention.AttentionMode.additive:
        self.attn = attention.AdditiveSelfAttention(config)
    elif config.attention_mode == attention.AttentionMode.vanilla:
        self.attn = attention.CausalSelfAttention(config)
    elif config.attention_mode == attention.AttentionMode.synthesizer:
        self.attn = attention.SynthesizerAttention(config)
    elif config.attention_mode == attention.AttentionMode.dense_and_causual:
        self.attn = attention.DenseAndCausalAttention(config)
    else:
        raise RuntimeError(
            f"Unsupported attention mode {config.attention_mode}")
    self.mlp = nn.Sequential(
        nn.Linear(config.n_embd, 4 * config.n_embd),
        nn.GELU(),
        nn.Linear(4 * config.n_embd, config.n_embd),
        nn.Dropout(config.resid_pdrop),
    )
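For orientation, a hedged usage sketch of this variant: the attention implementation is chosen purely through `config.attention_mode`. The `GPTConfig` constructor arguments, the enclosing class name `Block`, and the pre-norm residual wiring below are assumptions in the style of minGPT-like blocks, not taken from the excerpt.

import torch

# Hypothetical sketch -- GPTConfig fields beyond n_embd / resid_pdrop /
# attention_mode, the class name `Block`, and the residual wiring are assumptions.
config = GPTConfig(n_embd=256, n_head=8, block_size=128, resid_pdrop=0.1,
                   attention_mode=attention.AttentionMode.synthesizer)
block = Block(config)

x = torch.zeros(4, 128, config.n_embd)   # (batch, seq_len, n_embd)
y = x + block.attn(block.ln1(x))         # pre-norm self-attention with residual
y = y + block.mlp(block.ln2(y))          # pre-norm MLP with residual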