Example #1
    def __init__(self,
                 isize,
                 hsize=None,
                 dropout=0.0,
                 num_pos=cache_len_default,
                 custom_act=use_adv_act_default):

        super(AverageAttn, self).__init__()

        _hsize = isize if hsize is None else hsize

        self.num_pos = num_pos
        self.register_buffer('w', torch.Tensor(num_pos, 1))

        self.ffn = nn.Sequential(
            Linear(isize, _hsize),
            Custom_Act() if custom_act else nn.ReLU(inplace=True),
            Dropout(dropout, inplace=inplace_after_Custom_Act),
            Linear(_hsize, isize),
            Dropout(dropout, inplace=True),
        ) if dropout > 0.0 else nn.Sequential(
            Linear(isize, _hsize),
            Custom_Act() if custom_act else nn.ReLU(inplace=True),
            Linear(_hsize, isize),
        )

        self.gw = Linear(isize * 2, isize * 2)

        self.reset_parameters()
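
For context, here is a minimal, self-contained sketch of what a layer built like Example #1 typically computes: a cumulative average over the time dimension, an FFN over the averaged context, and an input/forget gate mixing it back with the query, as in the average attention network of Zhang et al. (2018). The forward pass and the plain torch.nn stand-ins for the repo's Linear/Dropout/Custom_Act helpers are assumptions, not the original implementation.

    import torch
    from torch import nn

    class AverageAttnSketch(nn.Module):
        # Cumulative-average "attention" followed by an FFN and an
        # input/forget gate over the original query; the positional weight
        # buffer the original keeps for incremental decoding is omitted.
        def __init__(self, isize, hsize=None, dropout=0.0):
            super().__init__()
            _hsize = isize if hsize is None else hsize
            layers = [nn.Linear(isize, _hsize), nn.ReLU(inplace=True)]
            if dropout > 0.0:
                layers.append(nn.Dropout(dropout))
            layers.append(nn.Linear(_hsize, isize))
            if dropout > 0.0:
                layers.append(nn.Dropout(dropout))
            self.ffn = nn.Sequential(*layers)
            self.gw = nn.Linear(isize * 2, isize * 2)

        def forward(self, iQ):
            # iQ: (bsize, seql, isize); running mean over the time dimension
            steps = torch.arange(1, iQ.size(1) + 1, dtype=iQ.dtype, device=iQ.device)
            avg = self.ffn(iQ.cumsum(dim=1) / steps.view(1, -1, 1))
            # gate the averaged context against the query
            igate, fgate = self.gw(torch.cat((iQ, avg), dim=-1)).sigmoid().chunk(2, dim=-1)
            return igate * iQ + fgate * avg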
Example #2
    def __init__(self,
                 isize,
                 ncomb=2,
                 hsize=None,
                 dropout=0.0,
                 custom_act=use_adv_act_default,
                 enable_bias=enable_prev_ln_bias_default):

        super(ResidueCombiner, self).__init__()

        _hsize = isize * 2 * ncomb if hsize is None else hsize

        # should dropout be in front of sigmoid or not?
        self.net = nn.Sequential(
            Linear(isize * ncomb, _hsize),
            Custom_Act() if custom_act else nn.Sigmoid(),
            Dropout(dropout, inplace=inplace_after_Custom_Act),
            Linear(_hsize, isize, bias=enable_bias),
            Dropout(dropout, inplace=True),
        ) if dropout > 0.0 else nn.Sequential(
            Linear(isize * ncomb, _hsize),
            Custom_Act() if custom_act else nn.Sigmoid(),
            Linear(_hsize, isize, bias=enable_bias),
        )

        self.out_normer = nn.LayerNorm(isize,
                                       eps=ieps_ln_default,
                                       elementwise_affine=enable_ln_parameters)
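
A hedged, standalone approximation of Example #2 using plain torch.nn layers; the forward pass (concatenate the ncomb inputs, transform, add each input back as a residual, then LayerNorm) is an assumption about how such a combiner is normally used, not the repo's code.

    import torch
    from torch import nn

    class ResidueCombinerSketch(nn.Module):
        # Fuse ncomb tensors of size isize: transform their concatenation,
        # add each input back as a residual, then LayerNorm the sum.
        def __init__(self, isize, ncomb=2, hsize=None, dropout=0.0):
            super().__init__()
            _hsize = isize * 2 * ncomb if hsize is None else hsize
            layers = [nn.Linear(isize * ncomb, _hsize), nn.Sigmoid()]
            if dropout > 0.0:
                layers.append(nn.Dropout(dropout))
            layers.append(nn.Linear(_hsize, isize))
            if dropout > 0.0:
                layers.append(nn.Dropout(dropout))
            self.net = nn.Sequential(*layers)
            self.out_normer = nn.LayerNorm(isize)

        def forward(self, *xl):
            # xl: ncomb tensors, each (bsize, seql, isize)
            out = self.net(torch.cat(xl, dim=-1))
            for x in xl:
                out = out + x
            return self.out_normer(out)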
Example #3
    def __init__(self,
                 isize,
                 hsize=None,
                 dropout=0.0,
                 norm_residual=norm_residual_default,
                 custom_act=use_adv_act_default,
                 enable_bias=enable_prev_ln_bias_default):

        super(PositionwiseFF, self).__init__()

        _hsize = isize * 4 if hsize is None else hsize

        self.net = nn.Sequential(
            Linear(isize, _hsize),
            Custom_Act() if custom_act else nn.ReLU(inplace=True),
            Dropout(dropout, inplace=inplace_after_Custom_Act),
            Linear(_hsize, isize, bias=enable_bias),
            Dropout(dropout, inplace=True),
        ) if dropout > 0.0 else nn.Sequential(
            Linear(isize, _hsize),
            Custom_Act() if custom_act else nn.ReLU(inplace=True),
            Linear(_hsize, isize, bias=enable_bias),
        )

        self.normer = nn.LayerNorm(isize,
                                   eps=ieps_ln_default,
                                   elementwise_affine=enable_ln_parameters)

        self.norm_residual = norm_residual
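
A standalone sketch of the pre-norm position-wise feed-forward block configured in Example #3, with plain torch.nn equivalents in place of the repo's helpers; the forward pass and the meaning given to norm_residual (add back the normalized input rather than the raw one) are assumptions.

    from torch import nn

    class PositionwiseFFSketch(nn.Module):
        # Pre-norm position-wise FFN: LayerNorm -> Linear -> ReLU -> Linear,
        # with a residual connection selected by norm_residual.
        def __init__(self, isize, hsize=None, dropout=0.0, norm_residual=True):
            super().__init__()
            _hsize = isize * 4 if hsize is None else hsize
            layers = [nn.Linear(isize, _hsize), nn.ReLU(inplace=True)]
            if dropout > 0.0:
                layers.append(nn.Dropout(dropout))
            layers.append(nn.Linear(_hsize, isize))
            if dropout > 0.0:
                layers.append(nn.Dropout(dropout))
            self.net = nn.Sequential(*layers)
            self.normer = nn.LayerNorm(isize)
            self.norm_residual = norm_residual

        def forward(self, x):
            _x = self.normer(x)
            # add back the normalized input (pre-norm) or the raw input
            return self.net(_x) + (_x if self.norm_residual else x)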
Example #4
    def __init__(self,
                 isize,
                 hsize,
                 osize,
                 num_head=8,
                 dropout=0.0,
                 k_isize=None,
                 enable_bias=enable_prev_ln_bias_default,
                 enable_proj_bias=enable_proj_bias_default,
                 sparsenorm=False):

        super(CrossAttn, self).__init__()

        self.attn_dim = hsize // num_head
        self.hsize = self.attn_dim * num_head
        self.num_head = num_head

        self.query_adaptor = Linear(isize, self.hsize, bias=enable_proj_bias)

        self.kv_adaptor = Linear(isize if k_isize is None else k_isize,
                                 self.hsize * 2,
                                 bias=enable_proj_bias)

        self.outer = Linear(self.hsize, osize, bias=enable_bias)

        #self.normer = MHSparseNormer(num_head, dim=-1) if sparsenorm else nn.Softmax(dim=-1)
        self.normer = SparseNormer(dim=-1) if sparsenorm else nn.Softmax(
            dim=-1)

        self.drop = Dropout(dropout,
                            inplace=sparsenorm) if dropout > 0.0 else None
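
A self-contained sketch of the cross-attention module set up in Example #4, using plain torch.nn layers and omitting the sparse-normalizer option; the scaled dot-product forward pass and the way the fused key/value projection is split are assumptions.

    import torch
    from torch import nn

    class CrossAttnSketch(nn.Module):
        # Multi-head cross attention: one projection for the query, one fused
        # projection producing keys and values from the memory, scaled
        # dot-product attention per head, and an output projection.
        def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, k_isize=None):
            super().__init__()
            self.attn_dim = hsize // num_head
            self.hsize = self.attn_dim * num_head
            self.num_head = num_head
            self.query_adaptor = nn.Linear(isize, self.hsize)
            self.kv_adaptor = nn.Linear(isize if k_isize is None else k_isize, self.hsize * 2)
            self.outer = nn.Linear(self.hsize, osize)
            self.drop = nn.Dropout(dropout) if dropout > 0.0 else None

        def forward(self, iQ, iK, mask=None):
            # iQ: (bsize, nquery, isize), iK: (bsize, seql, k_isize)
            bsize, nquery, _ = iQ.size()
            seql = iK.size(1)
            nheads, adim = self.num_head, self.attn_dim
            rQ = self.query_adaptor(iQ).view(bsize, nquery, nheads, adim).transpose(1, 2)
            rK, rV = self.kv_adaptor(iK).view(bsize, seql, 2, nheads, adim).unbind(2)
            rK, rV = rK.transpose(1, 2), rV.transpose(1, 2)
            scores = rQ.matmul(rK.transpose(-1, -2)) / (adim ** 0.5)
            if mask is not None:
                # mask: bool tensor broadcastable to (bsize, nheads, nquery, seql)
                scores = scores.masked_fill(mask, -1e9)
            attn = scores.softmax(dim=-1)
            if self.drop is not None:
                attn = self.drop(attn)
            out = attn.matmul(rV).transpose(1, 2).contiguous().view(bsize, nquery, self.hsize)
            return self.outer(out)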
Example #5
	def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, k_isize=None, v_isize=None, enable_bias=enable_prev_ln_bias_default, enable_proj_bias=enable_proj_bias_default, k_rel_pos=0, uni_direction_reduction=False, is_left_to_right_reduction=True, zero_reduction=relpos_reduction_with_zeros, sparsenorm=False, bind_qk=False, xseql=cache_len_default):

		super(MultiHeadAttn, self).__init__()

		self.attn_dim = hsize // num_head
		self.hsize = self.attn_dim * num_head
		self.num_head = num_head

		self.query_adaptor = Linear(isize, self.hsize, bias=enable_proj_bias)
		_k_isize = isize if k_isize is None else k_isize
		self.key_adaptor = self.query_adaptor if bind_qk and isize == _k_isize else Linear(_k_isize, self.hsize, bias=enable_proj_bias)
		self.value_adaptor = Linear(_k_isize if v_isize is None else v_isize, self.hsize, bias=enable_proj_bias)

		self.outer = Linear(self.hsize, osize, bias=enable_bias)

		#self.normer = MHSparseNormer(num_head, dim=-1) if sparsenorm else nn.Softmax(dim=-1)
		self.normer = SparseNormer(dim=-1) if sparsenorm else nn.Softmax(dim=-1)

		self.drop = Dropout(dropout, inplace=sparsenorm) if dropout > 0.0 else None

		if k_rel_pos > 0:
			self.rel_shift = k_rel_pos
			padding_idx = None
			if uni_direction_reduction:
				_n_pemb = k_rel_pos + 1
				if is_left_to_right_reduction:
					self.clamp_min, self.clamp_max = -k_rel_pos, 0
				else:
					self.clamp_min, self.clamp_max, self.rel_shift = 0, k_rel_pos, 0
				if zero_reduction:
					_n_pemb += 1
					if is_left_to_right_reduction:
						self.clamp_max += 1
						padding_idx = self.clamp_max
					else:
						self.clamp_min -= 1
						self.rel_shift += 1
						padding_idx = 0
			else:
				_n_pemb = k_rel_pos + k_rel_pos + 1
				self.clamp_min, self.clamp_max = -k_rel_pos, k_rel_pos
			self.rel_pemb = nn.Embedding(_n_pemb, self.attn_dim, padding_idx=padding_idx)
			_rpm = torch.arange(-xseql + 1, 1, dtype=torch.long).unsqueeze(0)
			self.register_buffer("rel_pos", (_rpm - _rpm.t()).clamp(min=self.clamp_min, max=self.clamp_max) + self.rel_shift)
			self.xseql = xseql
			# the buffer can be shared across layers inside the encoder or the decoder to save memory, by pointing self.ref_rel_posm of the self-attention in deeper layers to the layer-0 SelfAttn and sharing the corresponding self.rel_pos
			self.ref_rel_posm = None
			self.register_buffer("rel_pos_cache", None)
		else:
			self.rel_pemb = None

		self.register_buffer('real_iK', None)
		self.register_buffer('real_iV', None)
		self.register_buffer('iK', None)
		self.register_buffer('iV', None)

		if self.c_available():
			self.c_init()
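
The distinctive part of Example #5 is the relative-position branch. The sketch below shows the bidirectional case (k_rel_pos > 0 without uni-directional reduction) as standalone self-attention: distances are clamped to [-k_rel_pos, k_rel_pos], shifted to non-negative embedding indices, and the looked-up vectors contribute an extra term to the attention scores, in the spirit of Shaw et al. (2018). The fused QKV projection and the forward pass are assumptions, not the repo's code.

    import torch
    from torch import nn

    class RelPosSelfAttnSketch(nn.Module):
        # Self-attention with relative position embeddings, bidirectional case:
        # clamped, shifted distances index an embedding table whose vectors
        # bias the content-based attention scores.
        def __init__(self, isize, hsize, num_head=8, k_rel_pos=16, xseql=256):
            super().__init__()
            self.attn_dim = hsize // num_head
            self.num_head = num_head
            self.hsize = self.attn_dim * num_head
            self.adaptor = nn.Linear(isize, self.hsize * 3)
            self.outer = nn.Linear(self.hsize, isize)
            self.rel_pemb = nn.Embedding(k_rel_pos * 2 + 1, self.attn_dim)
            _rpm = torch.arange(-xseql + 1, 1, dtype=torch.long).unsqueeze(0)
            # (xseql, xseql) matrix of clamped, shifted relative distances
            self.register_buffer(
                "rel_pos",
                (_rpm - _rpm.t()).clamp(min=-k_rel_pos, max=k_rel_pos) + k_rel_pos)

        def forward(self, x):
            # x: (bsize, seql, isize); assumes seql <= xseql
            bsize, seql, _ = x.size()
            nheads, adim = self.num_head, self.attn_dim
            rQ, rK, rV = self.adaptor(x).view(bsize, seql, 3, nheads, adim).unbind(2)
            rQ, rK, rV = rQ.transpose(1, 2), rK.transpose(1, 2), rV.transpose(1, 2)
            scores = rQ.matmul(rK.transpose(-1, -2))
            # relative-position term added to the content-based scores
            rel = self.rel_pemb(self.rel_pos[:seql, :seql])  # (seql, seql, adim)
            scores = (scores + torch.einsum("bhqd,qkd->bhqk", rQ, rel)) / (adim ** 0.5)
            attn = scores.softmax(dim=-1)
            out = attn.matmul(rV).transpose(1, 2).contiguous().view(bsize, seql, self.hsize)
            return self.outer(out)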
Example #6
	def __init__(self, isize, hsize, num_head=8, dropout=0.0, norm_residual=norm_residual_default, **kwargs):

		super(ResCrossAttn, self).__init__()

		self.net = CrossAttn(isize, hsize, isize, num_head=num_head, dropout=dropout, **kwargs)
		self.normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)
		self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None
		self.norm_residual = norm_residual

		if self.c_available():
			self.c_init()
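
A sketch of the residual wrapper in Example #6, reusing the CrossAttnSketch defined after Example #4: pre-normalize the query, attend over the memory, apply dropout, and add back either the normalized or the raw query depending on norm_residual. The forward pass follows the usual pre-norm pattern but is an assumption, not the repo's implementation.

    from torch import nn

    class ResCrossAttnSketch(nn.Module):
        # Pre-norm residual cross-attention block built around CrossAttnSketch.
        def __init__(self, isize, hsize, num_head=8, dropout=0.0, norm_residual=True, **kwargs):
            super().__init__()
            self.net = CrossAttnSketch(isize, hsize, isize, num_head=num_head, dropout=dropout, **kwargs)
            self.normer = nn.LayerNorm(isize)
            self.drop = nn.Dropout(dropout) if dropout > 0.0 else None
            self.norm_residual = norm_residual

        def forward(self, iQ, iK, mask=None):
            _iQ = self.normer(iQ)
            out = self.net(_iQ, iK, mask=mask)
            if self.drop is not None:
                out = self.drop(out)
            # residual over the normalized query (pre-norm) or the raw query
            return out + (_iQ if self.norm_residual else iQ)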
Example #7
    def __init__(self,
                 isize,
                 hsize,
                 osize,
                 num_head=8,
                 dropout=0.0,
                 k_isize=None,
                 v_isize=None,
                 enable_bias=enable_prev_ln_bias_default,
                 enable_proj_bias=enable_proj_bias_default,
                 k_rel_pos=0,
                 sparsenorm=False,
                 bind_qk=False,
                 xseql=cache_len_default):

        super(MultiHeadAttn, self).__init__()

        self.attn_dim = hsize // num_head
        self.hsize = self.attn_dim * num_head
        self.num_head = num_head

        self.query_adaptor = Linear(isize, self.hsize, bias=enable_proj_bias)
        _k_isize = isize if k_isize is None else k_isize
        self.key_adaptor = self.query_adaptor if bind_qk and isize == _k_isize else Linear(
            _k_isize, self.hsize, bias=enable_proj_bias)
        self.value_adaptor = Linear(_k_isize if v_isize is None else v_isize,
                                    self.hsize,
                                    bias=enable_proj_bias)

        self.outer = Linear(self.hsize, osize, bias=enable_bias)

        #self.normer = MHSparseNormer(num_head, dim=-1) if sparsenorm else nn.Softmax(dim=-1)
        self.normer = SparseNormer(dim=-1) if sparsenorm else nn.Softmax(
            dim=-1)

        self.drop = Dropout(dropout,
                            inplace=sparsenorm) if dropout > 0.0 else None

        if k_rel_pos > 0:
            self.k_rel_pos = k_rel_pos
            self.rel_pemb = nn.Embedding(k_rel_pos * 2 + 1, self.attn_dim)
            _rpm = torch.arange(-xseql + 1, 1, dtype=torch.long).unsqueeze(0)
            self.register_buffer(
                "rel_pos",
                (_rpm - _rpm.t()).clamp(min=-k_rel_pos, max=k_rel_pos) +
                k_rel_pos)
            self.xseql = xseql
            # the buffer can be shared across layers inside the encoder or the decoder to save memory, by pointing self.ref_rel_posm of the self-attention in deeper layers to the layer-0 SelfAttn and sharing the corresponding self.rel_pos
            self.ref_rel_posm = None
        else:
            self.rel_pemb = None
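
The comment in Example #7 mentions sharing the rel_pos buffer across layers via ref_rel_posm. The helper below is a hypothetical illustration of how such a lookup could work at run time: defer to the referenced layer-0 module when one is set, slice the precomputed matrix when the sequence fits within xseql, and rebuild it otherwise. The function name and its exact behavior are assumptions, not part of the repo.

    import torch

    def get_rel_pos_sketch(attn, seql):
        # Hypothetical helper: return the (seql, seql) relative-position index
        # matrix for a module built like Example #7.
        ref = attn if attn.ref_rel_posm is None else attn.ref_rel_posm
        if seql <= ref.xseql:
            # slice the precomputed buffer
            return ref.rel_pos.narrow(0, 0, seql).narrow(1, 0, seql)
        # sequence longer than the cached range: rebuild on the fly
        _rpm = torch.arange(-seql + 1, 1, dtype=torch.long,
                            device=ref.rel_pos.device).unsqueeze(0)
        return (_rpm - _rpm.t()).clamp(min=-attn.k_rel_pos,
                                       max=attn.k_rel_pos) + attn.k_rel_pos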