def __init__(self, n_ctx, config, scale=False):
    super(Block, self).__init__()
    nx = config.n_embd
    # Pre-norm transformer block: layer norm before each sub-layer.
    self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
    self.attn = Attention(nx, n_ctx, config, scale)
    self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
    # Feed-forward sub-layer with a 4x hidden-size expansion.
    self.mlp = MLP(4 * nx, config)
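The snippet above only defines the sub-modules; below is a minimal sketch of how such a pre-norm block is typically applied in a forward pass, assuming Attention and MLP each return a single tensor with the same shape as their input (that return convention is an assumption, not taken from the example).

def forward(self, x):
    # Self-attention sub-layer with a residual connection (layer norm applied first).
    a = self.attn(self.ln_1(x))
    x = x + a
    # Position-wise feed-forward sub-layer, also with a residual connection.
    m = self.mlp(self.ln_2(x))
    x = x + m
    return x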
Example #2
def __init__(self, n_ctx, config, scale=False, output_attentions=False):
    super(Block1, self).__init__()
    nx = config.n_embd
    self.output_attentions = output_attentions
    # The single-head Attention module is swapped for a multi-head variant.
    # self.attn = Attention(nx, n_ctx, config, scale, output_attentions)
    self.multi_attn = MultiheadAttention(nx, n_ctx, config, scale,
                                         output_attentions)
    self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
    self.mlp = MLP(4 * nx, config)
    self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
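Because this variant threads an output_attentions flag into the attention module, its forward pass would typically hand the attention weights back to the caller. A minimal sketch, assuming MultiheadAttention returns an (output, attention_weights) pair; that return convention is not shown in the snippet itself.

def forward(self, x):
    # Assumed convention: the multi-head attention returns (output, attention_weights).
    a, attn_weights = self.multi_attn(self.ln_1(x))
    x = x + a
    m = self.mlp(self.ln_2(x))
    x = x + m
    if self.output_attentions:
        return x, attn_weights
    return x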
Example #3
    def __init__(self, config):
        super(GPT2Model, self).__init__(config)
        # Token and position embedding tables.
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        # Stack of n_layer identical blocks; deep copies so weights are not shared.
        block = Block(config.n_ctx, config, scale=True)
        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
        # Final layer norm applied to the output of the last block.
        self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        self.apply(self.init_weights)
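For orientation, a hedged usage sketch: the GPT2Config constructor arguments and the model's forward signature below are assumptions in the style of older pytorch-pretrained-bert GPT-2 code, not taken from the snippet; only the attribute names (vocab_size, n_positions, n_ctx, n_embd, n_layer) come from the __init__ above.

import torch

# Hypothetical configuration values; field names mirror those read in __init__.
config = GPT2Config(
    vocab_size=50257,
    n_positions=1024,
    n_ctx=1024,
    n_embd=768,
    n_layer=12,
)
model = GPT2Model(config)

input_ids = torch.randint(0, config.vocab_size, (1, 16))  # batch of 1, 16 tokens
hidden_states = model(input_ids)  # per-token hidden states of width n_embd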