def __init__(self, isize, hsize=None, dropout=0.0, num_pos=cache_len_default, custom_act=use_adv_act_default):

	super(AverageAttn, self).__init__()

	_hsize = isize if hsize is None else hsize

	self.num_pos = num_pos
	self.register_buffer('w', torch.Tensor(num_pos, 1))

	self.ffn = nn.Sequential(Linear(isize, _hsize), Custom_Act() if custom_act else nn.ReLU(inplace=True), Dropout(dropout, inplace=inplace_after_Custom_Act), Linear(_hsize, isize), Dropout(dropout, inplace=True)) if dropout > 0.0 else nn.Sequential(Linear(isize, _hsize), Custom_Act() if custom_act else nn.ReLU(inplace=True), Linear(_hsize, isize))

	self.gw = Linear(isize * 2, isize * 2)

	self.reset_parameters()
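# --- standalone sketch, not part of the module above ---
# A minimal illustration of the cumulative-average weighting that average attention
# (Zhang et al., 2018) is built around, which the cached `w` buffer is assumed to hold
# after reset_parameters(): position t attends to positions 1..t with equal weight 1/t.
# Plain torch only; shapes and names here are illustrative assumptions.
import torch

seql, bsize, isize = 5, 2, 4
x = torch.randn(seql, bsize, isize)
# cumulative average over the time dimension: y_t = (x_1 + ... + x_t) / t
w = torch.arange(1, seql + 1, dtype=x.dtype).view(seql, 1, 1)
y = x.cumsum(dim=0) / w
print(y.size())  # torch.Size([5, 2, 4])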
def __init__(self, isize, ncomb=2, hsize=None, dropout=0.0, custom_act=use_adv_act_default, enable_bias=enable_prev_ln_bias_default):

	super(ResidueCombiner, self).__init__()

	_hsize = isize * 2 * ncomb if hsize is None else hsize

	# should dropout be in front of sigmoid or not?
	self.net = nn.Sequential(Linear(isize * ncomb, _hsize), Custom_Act() if custom_act else nn.Sigmoid(), Dropout(dropout, inplace=inplace_after_Custom_Act), Linear(_hsize, isize, bias=enable_bias), Dropout(dropout, inplace=True)) if dropout > 0.0 else nn.Sequential(Linear(isize * ncomb, _hsize), Custom_Act() if custom_act else nn.Sigmoid(), Linear(_hsize, isize, bias=enable_bias))

	self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)
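# --- standalone sketch, not part of the module above ---
# Illustration of how a residue combiner of this shape is typically used (not the
# module's actual forward): ncomb layer outputs are concatenated on the feature
# dimension, projected back to isize, then layer-normalized together with their sum
# as a residual. Plain torch.nn stand-ins; the residual form is an assumption.
import torch
from torch import nn

isize, ncomb = 8, 2
x1, x2 = torch.randn(3, isize), torch.randn(3, isize)
net = nn.Sequential(nn.Linear(isize * ncomb, isize * 2 * ncomb), nn.Sigmoid(), nn.Linear(isize * 2 * ncomb, isize))
out_normer = nn.LayerNorm(isize)
out = out_normer(net(torch.cat((x1, x2), dim=-1)) + x1 + x2)
print(out.size())  # torch.Size([3, 8])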
def __init__(self, isize, hsize=None, dropout=0.0, norm_residual=norm_residual_default, custom_act=use_adv_act_default, enable_bias=enable_prev_ln_bias_default):

	super(PositionwiseFF, self).__init__()

	_hsize = isize * 4 if hsize is None else hsize

	self.net = nn.Sequential(Linear(isize, _hsize), Custom_Act() if custom_act else nn.ReLU(inplace=True), Dropout(dropout, inplace=inplace_after_Custom_Act), Linear(_hsize, isize, bias=enable_bias), Dropout(dropout, inplace=True)) if dropout > 0.0 else nn.Sequential(Linear(isize, _hsize), Custom_Act() if custom_act else nn.ReLU(inplace=True), Linear(_hsize, isize, bias=enable_bias))

	self.normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

	self.norm_residual = norm_residual
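# --- standalone sketch, not part of the module above ---
# Illustration of the pre-norm residual pattern that the norm_residual flag controls
# (plain torch.nn, not the module's actual forward): the FFN always sees the
# normalized input, and norm_residual chooses whether the residual branch uses the
# normalized or the raw input. The False default below is an assumption.
import torch
from torch import nn

isize = 8
x = torch.randn(3, isize)
normer = nn.LayerNorm(isize)
net = nn.Sequential(nn.Linear(isize, isize * 4), nn.ReLU(), nn.Linear(isize * 4, isize))
norm_residual = False  # assumed setting for pre-norm models

_x = normer(x)
out = net(_x) + (_x if norm_residual else x)
print(out.size())  # torch.Size([3, 8])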
def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, k_isize=None, enable_bias=enable_prev_ln_bias_default, enable_proj_bias=enable_proj_bias_default, sparsenorm=False):

	super(CrossAttn, self).__init__()

	self.attn_dim = hsize // num_head
	self.hsize = self.attn_dim * num_head
	self.num_head = num_head

	self.query_adaptor = Linear(isize, self.hsize, bias=enable_proj_bias)
	self.kv_adaptor = Linear(isize if k_isize is None else k_isize, self.hsize * 2, bias=enable_proj_bias)

	self.outer = Linear(self.hsize, osize, bias=enable_bias)

	#self.normer = MHSparseNormer(num_head, dim=-1) if sparsenorm else nn.Softmax(dim=-1)
	self.normer = SparseNormer(dim=-1) if sparsenorm else nn.Softmax(dim=-1)

	self.drop = Dropout(dropout, inplace=sparsenorm) if dropout > 0.0 else None
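# --- standalone sketch, not part of the module above ---
# Illustration of how a fused key/value projection like kv_adaptor is commonly split
# into per-head keys and values (plain torch, shapes and names are assumptions).
import torch
from torch import nn

bsize, seql, isize, hsize, nheads = 2, 5, 8, 8, 2
adim = hsize // nheads
kv_adaptor = nn.Linear(isize, hsize * 2, bias=False)
kv = kv_adaptor(torch.randn(bsize, seql, isize))
# split the doubled feature dimension into keys and values, then into heads
k, v = kv.view(bsize, seql, 2, nheads, adim).unbind(dim=2)  # each: (bsize, seql, nheads, adim)
print(k.size(), v.size())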
def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, k_isize=None, v_isize=None, enable_bias=enable_prev_ln_bias_default, enable_proj_bias=enable_proj_bias_default, k_rel_pos=0, uni_direction_reduction=False, is_left_to_right_reduction=True, zero_reduction=relpos_reduction_with_zeros, sparsenorm=False, bind_qk=False, xseql=cache_len_default):

	super(MultiHeadAttn, self).__init__()

	self.attn_dim = hsize // num_head
	self.hsize = self.attn_dim * num_head
	self.num_head = num_head

	self.query_adaptor = Linear(isize, self.hsize, bias=enable_proj_bias)
	_k_isize = isize if k_isize is None else k_isize
	self.key_adaptor = self.query_adaptor if bind_qk and isize == _k_isize else Linear(_k_isize, self.hsize, bias=enable_proj_bias)
	self.value_adaptor = Linear(_k_isize if v_isize is None else v_isize, self.hsize, bias=enable_proj_bias)

	self.outer = Linear(self.hsize, osize, bias=enable_bias)

	#self.normer = MHSparseNormer(num_head, dim=-1) if sparsenorm else nn.Softmax(dim=-1)
	self.normer = SparseNormer(dim=-1) if sparsenorm else nn.Softmax(dim=-1)

	self.drop = Dropout(dropout, inplace=sparsenorm) if dropout > 0.0 else None

	if k_rel_pos > 0:
		self.rel_shift = k_rel_pos
		padding_idx = None
		if uni_direction_reduction:
			_n_pemb = k_rel_pos + 1
			if is_left_to_right_reduction:
				self.clamp_min, self.clamp_max = -k_rel_pos, 0
			else:
				self.clamp_min, self.clamp_max, self.rel_shift = 0, k_rel_pos, 0
			if zero_reduction:
				_n_pemb += 1
				if is_left_to_right_reduction:
					self.clamp_max += 1
					padding_idx = self.clamp_max
				else:
					self.clamp_min -= 1
					self.rel_shift += 1
					padding_idx = 0
		else:
			_n_pemb = k_rel_pos + k_rel_pos + 1
			self.clamp_min, self.clamp_max = -k_rel_pos, k_rel_pos
		self.rel_pemb = nn.Embedding(_n_pemb, self.attn_dim, padding_idx=padding_idx)
		_rpm = torch.arange(-xseql + 1, 1, dtype=torch.long).unsqueeze(0)
		self.register_buffer("rel_pos", (_rpm - _rpm.t()).clamp(min=self.clamp_min, max=self.clamp_max) + self.rel_shift)
		self.xseql = xseql
		# the buffer can be shared across layers inside the encoder or the decoder to save memory, by pointing self.ref_rel_posm of the self-attention in deeper layers to the self-attention in layer 0 and sharing the corresponding self.rel_pos
		self.ref_rel_posm = None
		self.register_buffer("rel_pos_cache", None)
	else:
		self.rel_pemb = None

	self.register_buffer('real_iK', None)
	self.register_buffer('real_iV', None)
	self.register_buffer('iK', None)
	self.register_buffer('iV', None)

	if self.c_available():
		self.c_init()
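# --- standalone worked example, not part of the module above ---
# How the rel_pos index buffer built above looks for the plain bi-directional case:
# pairwise offsets j - i are clamped to [clamp_min, clamp_max] and shifted by
# rel_shift so they index into the embedding table (the uni-directional options only
# shrink the table and change clamp/shift). Values below chosen for illustration.
import torch

xseql, k_rel_pos = 4, 2
clamp_min, clamp_max, rel_shift = -k_rel_pos, k_rel_pos, k_rel_pos
_rpm = torch.arange(-xseql + 1, 1, dtype=torch.long).unsqueeze(0)
rel_pos = (_rpm - _rpm.t()).clamp(min=clamp_min, max=clamp_max) + rel_shift
print(rel_pos)
# tensor([[2, 3, 4, 4],
#         [1, 2, 3, 4],
#         [0, 1, 2, 3],
#         [0, 0, 1, 2]])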
def __init__(self, isize, hsize, num_head=8, dropout=0.0, norm_residual=norm_residual_default, **kwargs):

	super(ResCrossAttn, self).__init__()

	self.net = CrossAttn(isize, hsize, isize, num_head=num_head, dropout=dropout, **kwargs)
	self.normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)
	self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None
	self.norm_residual = norm_residual

	if self.c_available():
		self.c_init()
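# --- standalone sketch, not part of the module above ---
# Illustration of the residual cross-attention wrapper pattern, using
# torch.nn.MultiheadAttention as a stand-in for CrossAttn: pre-norm the query,
# attend over the encoder memory, then add the raw (or normalized) query back.
# Not the module's actual forward; the norm_residual handling is an assumption.
import torch
from torch import nn

isize, nheads = 8, 2
normer = nn.LayerNorm(isize)
attn = nn.MultiheadAttention(isize, nheads, batch_first=True)
norm_residual = False  # assumed setting for pre-norm models

iQ = torch.randn(2, 5, isize)   # decoder states
mem = torch.randn(2, 7, isize)  # encoder memory
_iQ = normer(iQ)
out, _ = attn(_iQ, mem, mem)
out = out + (_iQ if norm_residual else iQ)
print(out.size())  # torch.Size([2, 5, 8])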
def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, k_isize=None, v_isize=None, enable_bias=enable_prev_ln_bias_default, enable_proj_bias=enable_proj_bias_default, k_rel_pos=0, sparsenorm=False, bind_qk=False, xseql=cache_len_default):

	super(MultiHeadAttn, self).__init__()

	self.attn_dim = hsize // num_head
	self.hsize = self.attn_dim * num_head
	self.num_head = num_head

	self.query_adaptor = Linear(isize, self.hsize, bias=enable_proj_bias)
	_k_isize = isize if k_isize is None else k_isize
	self.key_adaptor = self.query_adaptor if bind_qk and isize == _k_isize else Linear(_k_isize, self.hsize, bias=enable_proj_bias)
	self.value_adaptor = Linear(_k_isize if v_isize is None else v_isize, self.hsize, bias=enable_proj_bias)

	self.outer = Linear(self.hsize, osize, bias=enable_bias)

	#self.normer = MHSparseNormer(num_head, dim=-1) if sparsenorm else nn.Softmax(dim=-1)
	self.normer = SparseNormer(dim=-1) if sparsenorm else nn.Softmax(dim=-1)

	self.drop = Dropout(dropout, inplace=sparsenorm) if dropout > 0.0 else None

	if k_rel_pos > 0:
		self.k_rel_pos = k_rel_pos
		self.rel_pemb = nn.Embedding(k_rel_pos * 2 + 1, self.attn_dim)
		_rpm = torch.arange(-xseql + 1, 1, dtype=torch.long).unsqueeze(0)
		self.register_buffer("rel_pos", (_rpm - _rpm.t()).clamp(min=-k_rel_pos, max=k_rel_pos) + k_rel_pos)
		self.xseql = xseql
		# the buffer can be shared across layers inside the encoder or the decoder to save memory, by pointing self.ref_rel_posm of the self-attention in deeper layers to the self-attention in layer 0 and sharing the corresponding self.rel_pos
		self.ref_rel_posm = None
	else:
		self.rel_pemb = None
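# --- standalone sketch, not part of the module above ---
# Illustration of how relative-position embeddings like rel_pemb are commonly added
# to attention scores, following Shaw et al. (2018): look up an embedding per clamped
# offset and take its dot product with each query. Plain torch; the content-score
# tensor is a stand-in, and the exact combination used by the module may differ.
import torch
from torch import nn

seql, nheads, adim, k_rel_pos = 4, 2, 4, 2
q = torch.randn(nheads, seql, adim)  # per-head queries
rel_pemb = nn.Embedding(k_rel_pos * 2 + 1, adim)
_rpm = torch.arange(-seql + 1, 1, dtype=torch.long).unsqueeze(0)
rel_pos = (_rpm - _rpm.t()).clamp(min=-k_rel_pos, max=k_rel_pos) + k_rel_pos
# (seql, seql, adim) offset embeddings -> extra score per (query position, key position)
rel_scores = torch.einsum("hqd,qkd->hqk", q, rel_pemb(rel_pos))
content_scores = q.matmul(torch.randn(nheads, adim, seql))  # stand-in for q @ k^T
scores = (content_scores + rel_scores) / (adim ** 0.5)
print(scores.size())  # torch.Size([2, 4, 4])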