def __init__(self, config):
    super().__init__()
    # Pre-attention and pre-MLP layer norms.
    self.ln1 = nn.LayerNorm(config.n_embd)
    self.ln2 = nn.LayerNorm(config.n_embd)
    # Select the attention variant from the config flag.
    if config.additive:
        self.attn = attention.AdditiveSelfAttention(config)
    else:
        self.attn = attention.CausalSelfAttention(config)
    # Position-wise feed-forward network with a 4x hidden expansion.
    self.mlp = nn.Sequential(
        nn.Linear(config.n_embd, 4 * config.n_embd),
        nn.GELU(),
        nn.Linear(4 * config.n_embd, config.n_embd),
        nn.Dropout(config.resid_pdrop),
    )
def __init__(self, config, type):
    super().__init__()
    self.ln1 = nn.LayerNorm(config.n_embd)
    self.ln2 = nn.LayerNorm(config.n_embd)
    if config.additive:
        self.attn = attention.AdditiveSelfAttention(config)
    else:
        # The `type` argument chooses between vanilla causal attention
        # and the synthesizer variant.
        if type == "vanilla":
            self.attn = attention.CausalSelfAttention(config)
        else:
            self.attn = attention.SynthesizerAttention(config)
    print("Attention Block is initialized as {} type".format(type))
    self.mlp = nn.Sequential(
        nn.Linear(config.n_embd, 4 * config.n_embd),
        nn.GELU(),
        nn.Linear(4 * config.n_embd, config.n_embd),
        nn.Dropout(config.resid_pdrop),
    )
def __init__(self, config: GPTConfig):
    super().__init__()
    self.ln1 = nn.LayerNorm(config.n_embd)
    self.ln2 = nn.LayerNorm(config.n_embd)
    # Dispatch on the AttentionMode enum rather than ad-hoc flags.
    if config.attention_mode == attention.AttentionMode.additive:
        self.attn = attention.AdditiveSelfAttention(config)
    elif config.attention_mode == attention.AttentionMode.vanilla:
        self.attn = attention.CausalSelfAttention(config)
    elif config.attention_mode == attention.AttentionMode.synthesizer:
        self.attn = attention.SynthesizerAttention(config)
    elif config.attention_mode == attention.AttentionMode.dense_and_causual:
        self.attn = attention.DenseAndCausalAttention(config)
    else:
        raise RuntimeError(
            f"Unsupported attention mode {config.attention_mode}")
    self.mlp = nn.Sequential(
        nn.Linear(config.n_embd, 4 * config.n_embd),
        nn.GELU(),
        nn.Linear(4 * config.n_embd, config.n_embd),
        nn.Dropout(config.resid_pdrop),
    )
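The excerpts above show only the constructor; the forward pass is not part of this snippet. The sketch below illustrates how a pre-LayerNorm block with this layout is typically wired (normalize, transform, add back as a residual), assuming the standard GPT-2/minGPT residual layout. The `Block` class name, the `n_head` parameter, and the use of `nn.MultiheadAttention` as a stand-in for the project's own `attention.CausalSelfAttention` are assumptions made for illustration only.

import torch
import torch.nn as nn


class Block(nn.Module):
    """Minimal sketch of a pre-LayerNorm transformer block.

    The forward wiring here is assumed, not taken from the excerpt above,
    and nn.MultiheadAttention stands in for the project's attention modules.
    """

    def __init__(self, n_embd: int, n_head: int, resid_pdrop: float = 0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        # batch_first so inputs are laid out as (batch, sequence, embedding).
        self.attn = nn.MultiheadAttention(n_embd, n_head, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(resid_pdrop),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        T = x.size(1)
        # Boolean causal mask: True marks positions that may NOT be attended
        # to, i.e. everything strictly above the diagonal (future tokens).
        causal_mask = torch.triu(
            torch.ones(T, T, dtype=torch.bool, device=x.device), diagonal=1)
        h = self.ln1(x)
        attn_out, _ = self.attn(h, h, h, attn_mask=causal_mask,
                                need_weights=False)
        x = x + attn_out               # residual connection around attention
        x = x + self.mlp(self.ln2(x))  # residual connection around the MLP
        return x


if __name__ == "__main__":
    block = Block(n_embd=64, n_head=4)
    x = torch.randn(2, 10, 64)   # (batch, sequence, embedding)
    print(block(x).shape)        # torch.Size([2, 10, 64])

The design point this sketch makes is that the constructor variants above differ only in which attention module they assign to self.attn; the residual structure around self.attn and self.mlp stays the same regardless of the attention mode chosen.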