def __init__(self, n_ctx, config, scale=False):
    super(Block, self).__init__()
    nx = config.n_embd
    # Pre-attention layer norm, masked self-attention, then pre-MLP layer norm and MLP
    self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
    self.attn = Attention(nx, n_ctx, config, scale)
    self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
    self.mlp = MLP(4 * nx, config)
def __init__(self, n_ctx, config, scale=False, output_attentions=False):
    super(Block1, self).__init__()
    nx = config.n_embd
    self.output_attentions = output_attentions
    # Variant block: the original Attention module is replaced by MultiheadAttention
    # self.attn = Attention(nx, n_ctx, config, scale, output_attentions)
    self.multi_attn = MultiheadAttention(nx, n_ctx, config, scale, output_attentions)
    self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
    self.mlp = MLP(4 * nx, config)
    self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
def __init__(self, config):
    super(GPT2Model, self).__init__(config)
    # Token and learned position embeddings
    self.wte = nn.Embedding(config.vocab_size, config.n_embd)
    self.wpe = nn.Embedding(config.n_positions, config.n_embd)
    # Stack of n_layer identical transformer blocks, followed by a final layer norm
    block = Block(config.n_ctx, config, scale=True)
    self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
    self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
    self.apply(self.init_weights)
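# Minimal usage sketch (an assumption, not part of this module): it presumes a
# GPT2Config-like object exposing the fields read above (vocab_size, n_positions,
# n_ctx, n_embd, n_layer, layer_norm_epsilon), as in pytorch-pretrained-bert's
# GPT2Config, and that the model's forward takes a LongTensor of token ids.
# The exact return values of forward() may differ in this modified implementation.
#
#     config = GPT2Config(vocab_size_or_config_json_file=50257, n_positions=1024,
#                         n_ctx=1024, n_embd=768, n_layer=12, n_head=12,
#                         layer_norm_epsilon=1e-5)
#     model = GPT2Model(config)
#     input_ids = torch.randint(0, config.vocab_size, (1, 16))  # [batch, seq_len]
#     outputs = model(input_ids)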