def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
    super(MLP, self).__init__()
    nx = config.n_embd
    self.c_fc = Conv1D(n_state, nx)    # expand: n_embd -> 4 * n_embd
    self.c_proj = Conv1D(nx, n_state)  # project back: 4 * n_embd -> n_embd
    self.act = gelu
    self.dropout = nn.Dropout(config.resid_pdrop)
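
This snippet assumes Conv1D, gelu and nn are already defined in the module. For context, a minimal sketch of the forward pass that usually pairs with this constructor inside the same MLP class; the method body below is an assumption, not part of the snippet above:

def forward(self, x):
    # expand, apply the activation, project back, then apply residual dropout
    h = self.act(self.c_fc(x))   # (batch, seq, 4 * n_embd)
    h2 = self.c_proj(h)          # (batch, seq, n_embd)
    return self.dropout(h2)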
def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
    super().__init__()
    nx = config.n_embd
    self.c_fc = Conv1D(n_state, nx)
    self.c_proj = Conv1D(nx, n_state)
    self.act = ACT2FN[config.activation_function]
    self.dropout = nn.Dropout(config.resid_pdrop)
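
This second variant looks the activation up by name instead of hard-coding gelu. A minimal stand-in for that mapping, assuming a plain dict of callables (the real contents of ACT2FN are not shown in the snippet):

import torch
import torch.nn.functional as F

# assumed stand-in for the name -> callable activation registry
ACT2FN = {
    "relu": F.relu,
    "gelu": F.gelu,
    # tanh-approximated GELU, often registered under a name like "gelu_new"
    "gelu_new": lambda x: 0.5 * x * (1.0 + torch.tanh(
        0.7978845608 * (x + 0.044715 * torch.pow(x, 3.0)))),
}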
Example #3
def __init__(self, n_state, cfg):  # in MLP: n_state=3072 (4 * n_embd)
    super(MLP, self).__init__()
    nx = cfg["n_embd"]
    self.c_fc = Conv1D(n_state, nx)
    self.c_proj = Conv1D(nx, n_state)
    self.act = ACT_FNS[cfg["afn"]]
    self.dropout = nn.Dropout(cfg["resid_pdrop"])
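
All three variants build on the same Conv1D module. A hedged sketch of that module, inferred from the prune_conv1d_layer docstring further below (it acts like nn.Linear but stores the weight transposed, shape (nx, nf)); the initialization details are assumptions:

import torch
import torch.nn as nn

class Conv1D(nn.Module):
    # behaves like nn.Linear, but the weight is stored as (nx, nf), i.e. transposed
    def __init__(self, nf, nx):
        super().__init__()
        self.nf = nf
        self.weight = nn.Parameter(torch.empty(nx, nf))
        self.bias = nn.Parameter(torch.zeros(nf))
        nn.init.normal_(self.weight, std=0.02)  # assumed init scheme

    def forward(self, x):
        size_out = x.size()[:-1] + (self.nf,)
        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
        return x.view(size_out)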
def __init__(self, nx, n_ctx, config, scale=False):
    super().__init__()
    self.output_attentions = config.output_attentions

    n_state = nx  # in Attention: n_state=768 (nx=n_embd)
    # [switch nx => n_state from Block to Attention to keep identical to TF implem]
    assert n_state % config.n_head == 0
    # lower-triangular causal mask, broadcastable over (batch, head, query, key)
    self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
    self.n_head = config.n_head
    self.split_size = n_state
    self.scale = scale

    self.c_attn = Conv1D(n_state * 3, nx)  # projects to query, key and value in one pass
    self.c_proj = Conv1D(n_state, nx)
    self.attn_dropout = nn.Dropout(config.attn_pdrop)
    self.resid_dropout = nn.Dropout(config.resid_pdrop)
    self.pruned_heads = set()
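
A small, self-contained illustration of what the registered "bias" buffer does during attention; the shapes and the -1e4 fill value here are assumptions for the sketch:

import torch

n_ctx = 5
bias = torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)

scores = torch.randn(1, 1, n_ctx, n_ctx)      # fake attention scores
masked = scores * bias + -1e4 * (1 - bias)    # large negative where key index > query index
probs = torch.softmax(masked, dim=-1)         # future positions get ~zero weight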
Example #5
def prune_conv1d_layer(layer, index, dim=1):
    """Prune a Conv1D layer (a model parameter) to keep only the entries in index.
    A Conv1D works like a Linear layer (see e.g. BERT) but with transposed weights.
    Returns the pruned layer as a new layer with requires_grad=True.
    Used to remove attention heads.
    """
    index = index.to(layer.weight.device)
    W = layer.weight.index_select(dim, index).clone().detach()
    if dim == 0:
        b = layer.bias.clone().detach()
    else:
        b = layer.bias[index].clone().detach()
    new_size = list(layer.weight.size())
    new_size[dim] = len(index)
    new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
    # Temporarily disable grad tracking so copy_ is not recorded by autograd.
    new_layer.weight.requires_grad = False
    new_layer.weight.copy_(W.contiguous())
    new_layer.weight.requires_grad = True
    new_layer.bias.requires_grad = False
    new_layer.bias.copy_(b.contiguous())
    new_layer.bias.requires_grad = True
    return new_layer
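
A hedged usage sketch; the expected shapes assume the Conv1D weight layout described in the docstring, i.e. weight stored as (nx, nf):

import torch

layer = Conv1D(4, 8)                           # nf=4 output features, nx=8 inputs
keep = torch.tensor([0, 1], dtype=torch.long)  # output columns to keep
pruned = prune_conv1d_layer(layer, keep, dim=1)
assert pruned.weight.shape == (8, 2)           # weight stays (nx, nf), nf shrunk to 2
assert pruned.bias.shape == (2,)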