Example #1
    def __init__(self,
                 d_model,
                 n_heads,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 layernorm=True,
                 attn='standard',
                 dfa='none'):
        super().__init__()

        # Select the self-attention implementation requested by `attn`.
        if attn == 'standard':
            self.self_attn = nn.MultiheadAttention(d_model,
                                                   n_heads,
                                                   dropout=dropout)
        elif attn == 'fixed':
            self.self_attn = attention.FixedAttention(d_model,
                                                      language_model=True)
        elif attn in ['dense', 'random']:
            self.self_attn = attention.SynthesizerAttention(d_model,
                                                            n_heads,
                                                            synth=attn,
                                                            dropout=dropout)
        else:
            raise ValueError(
                "attn must be in ['standard', 'fixed', 'dense', 'random']")

        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        if layernorm:
            self.norm1 = nn.LayerNorm(d_model)
            self.norm2 = nn.LayerNorm(d_model)
        else:
            self.norm1 = nn.Identity()
            self.norm2 = nn.Identity()
            print("WARNING: layer normalization is deactivated")

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

        # Optional DFA (direct feedback alignment) feedback points:
        # none, one ('simple'), or three ('full') per layer.
        self.dfas = []
        if dfa == 'simple':
            self.dfas = [DFALayer()]
        elif dfa == 'full':
            self.dfas = [DFALayer(), DFALayer(), DFALayer()]
Example #2
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)
        if config.additive:
            self.attn = attention.AdditiveSelfAttention(config)
        elif config.synthesizer:
            self.attn = attention.SynthesizerAttention(config)
        else:
            self.attn = attention.CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.resid_pdrop),
        )
Example #3
    def __init__(self, config, type):  # note: `type` shadows the Python builtin; it selects the attention variant
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)
        if config.additive:
            self.attn = attention.AdditiveSelfAttention(config)
        else:
            if type == "vanilla":
                self.attn = attention.CausalSelfAttention(config)
            else:
                self.attn = attention.SynthesizerAttention(config)

            print("Attention Block is initialized as {} type".format(type))
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.resid_pdrop),
        )
Example #4
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)

        if config.attention_mode == attention.AttentionMode.additive:
            self.attn = attention.AdditiveSelfAttention(config)
        elif config.attention_mode == attention.AttentionMode.vanilla:
            self.attn = attention.CausalSelfAttention(config)
        elif config.attention_mode == attention.AttentionMode.synthesizer:
            self.attn = attention.SynthesizerAttention(config)
        elif config.attention_mode == attention.AttentionMode.dense_and_causual:
            self.attn = attention.DenseAndCausalAttention(config)
        else:
            raise RuntimeError(
                f"Unsupported attention mode {config.attention_mode}")
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.resid_pdrop),
        )
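
A minimal construction sketch for this block, assuming the surrounding class is named Block and that GPTConfig can be built from just the fields used above (both are assumptions; only GPTConfig, AttentionMode, and the field names appear in the excerpt):

# Hypothetical construction; the real GPTConfig likely requires more fields (n_head, block size, ...).
config = GPTConfig(n_embd=768,
                   resid_pdrop=0.1,
                   attention_mode=attention.AttentionMode.synthesizer)
block = Block(config)  # an unsupported attention_mode raises RuntimeError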