def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
    super(MLP, self).__init__()
    nx = config.n_embd
    self.c_fc = Conv1D(n_state, nx)
    self.c_proj = Conv1D(nx, n_state)
    self.act = gelu
    self.dropout = nn.Dropout(config.resid_pdrop)
def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
    super().__init__()
    nx = config.n_embd
    self.c_fc = Conv1D(n_state, nx)
    self.c_proj = Conv1D(nx, n_state)
    self.act = ACT2FN[config.activation_function]
    self.dropout = nn.Dropout(config.resid_pdrop)
def __init__(self, n_state, cfg):  # in MLP: n_state=3072 (4 * n_embd)
    super(MLP, self).__init__()
    nx = cfg["n_embd"]
    self.c_fc = Conv1D(n_state, nx)
    self.c_proj = Conv1D(nx, n_state)
    self.act = ACT_FNS[cfg["afn"]]
    self.dropout = nn.Dropout(cfg["resid_pdrop"])
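# All three constructors build the same pair of Conv1D projections
# (n_embd -> 4*n_embd -> n_embd); they differ only in how the activation is
# selected and how the config is accessed. A minimal sketch of the matching
# forward pass (assumed here, not taken from the snippets above): expand,
# apply the activation, project back down, then apply residual dropout.
def forward(self, x):
    h = self.act(self.c_fc(x))   # (batch, seq, n_embd) -> (batch, seq, n_state)
    h2 = self.c_proj(h)          # (batch, seq, n_state) -> (batch, seq, n_embd)
    return self.dropout(h2)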
def __init__(self, nx, n_ctx, config, scale=False):
    super().__init__()
    self.output_attentions = config.output_attentions

    n_state = nx  # in Attention: n_state=768 (nx=n_embd)
    # [switch nx => n_state from Block to Attention to keep identical to TF implem]
    assert n_state % config.n_head == 0
    self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
    self.n_head = config.n_head
    self.split_size = n_state
    self.scale = scale

    self.c_attn = Conv1D(n_state * 3, nx)
    self.c_proj = Conv1D(n_state, nx)
    self.attn_dropout = nn.Dropout(config.attn_pdrop)
    self.resid_dropout = nn.Dropout(config.resid_pdrop)
    self.pruned_heads = set()
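# A hedged sketch of how the lower-triangular "bias" buffer registered above is
# typically applied to the attention scores (the method name, softmax placement
# and the -1e4 masking constant are illustrative assumptions, not taken from the
# snippet above).
def _attn(self, q, k, v):
    w = torch.matmul(q, k)                    # raw scores; k assumed pre-transposed to (..., head_dim, seq)
    if self.scale:
        w = w / (float(v.size(-1)) ** 0.5)    # scaled dot-product attention
    nd, ns = w.size(-2), w.size(-1)
    mask = self.bias[:, :, ns - nd:ns, :ns]   # causal mask cropped to the current window
    w = w * mask - 1e4 * (1 - mask)           # forbid attending to future positions
    w = nn.Softmax(dim=-1)(w)
    w = self.attn_dropout(w)
    return torch.matmul(w, v)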
def prune_conv1d_layer(layer, index, dim=1):
    """
    Prune a Conv1D layer (a model parameter) to keep only the entries in index.
    A Conv1D layer works like a Linear layer (see e.g. BERT) but with transposed weights.

    Used to remove heads.

    Returns the pruned layer as a new layer with requires_grad=True.
    """
    index = index.to(layer.weight.device)
    W = layer.weight.index_select(dim, index).clone().detach()
    if dim == 0:
        b = layer.bias.clone().detach()
    else:
        b = layer.bias[index].clone().detach()
    new_size = list(layer.weight.size())
    new_size[dim] = len(index)
    new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
    new_layer.weight.requires_grad = False
    new_layer.weight.copy_(W.contiguous())
    new_layer.weight.requires_grad = True
    new_layer.bias.requires_grad = False
    new_layer.bias.copy_(b.contiguous())
    new_layer.bias.requires_grad = True
    return new_layer
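# A hedged sketch of how prune_conv1d_layer is used to remove attention heads
# from the Attention module above (a hypothetical prune_heads method; the index
# construction is written out inline here for illustration).
def prune_heads(self, heads):
    if len(heads) == 0:
        return
    head_size = self.split_size // self.n_head
    keep = [h for h in range(self.n_head) if h not in heads]
    index = torch.cat([torch.arange(h * head_size, (h + 1) * head_size) for h in keep])
    # c_attn stacks the query, key and value projections along its output
    # dimension, so the kept columns are repeated once per block with an offset;
    # c_proj is pruned along its input dimension instead.
    index_attn = torch.cat([index, index + self.split_size, index + 2 * self.split_size])
    self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
    self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
    # Keep the bookkeeping attributes consistent with the smaller layers.
    self.split_size = head_size * len(keep)
    self.n_head = len(keep)
    self.pruned_heads = self.pruned_heads.union(heads)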