def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=512, ahsize=None, norm_output=True, num_layer_dec=6):

    _ahsize = isize if ahsize is None else ahsize
    _fhsize = _ahsize * 4 if fhsize is None else fhsize

    super(Encoder, self).__init__(isize, nwd, num_layer, _fhsize, dropout, attn_drop, num_head, xseql, _ahsize, norm_output)

    self.nets = nn.ModuleList([EncoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize) for i in range(num_layer)])

    self.tattn_w = nn.Parameter(torch.Tensor(num_layer + 1, num_layer_dec).uniform_(-sqrt(2.0 / (num_layer + num_layer_dec + 1)), sqrt(2.0 / (num_layer + num_layer_dec + 1))))
    self.tattn_drop = Dropout(dropout) if dropout > 0.0 else None

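The `tattn_w` parameter above holds one column of mixing weights per decoder layer over the embedding output plus every encoder layer output, initialized with a Xavier-style uniform bound. A minimal sketch of how such layer-mixing weights could be applied, assuming a softmax over the layer axis; the einsum combination below is illustrative and not taken from this module's forward pass:

import torch
from math import sqrt

num_layer, num_layer_dec, bsize, seql, isize = 6, 6, 2, 5, 8
# Stacked embedding + per-layer encoder outputs: (num_layer + 1, bsize, seql, isize)
layer_outs = torch.randn(num_layer + 1, bsize, seql, isize)
# Same Xavier-style uniform bound as in the constructor above
bound = sqrt(2.0 / (num_layer + num_layer_dec + 1))
tattn_w = torch.empty(num_layer + 1, num_layer_dec).uniform_(-bound, bound)
mix = tattn_w.softmax(dim=0)                       # normalize over the layer axis
# One weighted combination of encoder layers per decoder layer
per_dec_inputs = torch.einsum("lbsi,ld->dbsi", layer_outs, mix)
print(per_dec_inputs.shape)                        # torch.Size([6, 2, 5, 8])
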
def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, enable_bias=False, sparsenorm=False):

    super(CrossAttn, self).__init__()

    self.attn_dim = hsize // num_head
    self.hsize = self.attn_dim * num_head
    self.num_head = num_head

    self.query_adaptor = Linear(isize, self.hsize, bias=enable_bias)
    self.kv_adaptor = Linear(isize, self.hsize * 2, bias=enable_bias)

    self.outer = Linear(self.hsize, osize, bias=enable_bias)

    #self.normer = MHSparseNormer(num_head, dim=-1) if sparsenorm else nn.Softmax(dim=-1)
    self.normer = SparseNormer(dim=-1) if sparsenorm else nn.Softmax(dim=-1)

    self.drop = Dropout(dropout, inplace=sparsenorm) if dropout > 0.0 else None

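Since `attn_dim` comes from integer division, the effective width `self.hsize` can be smaller than the requested `hsize` when it is not a multiple of `num_head`. A standalone sketch of the resulting projection shapes, with `torch.nn.Linear` standing in for the repository's `Linear`; how the fused key/value projection is split into heads here is an assumption, not this module's forward code:

import torch
import torch.nn as nn

isize, hsize, num_head = 512, 300, 8
attn_dim = hsize // num_head             # 37: per-head width after flooring
hsize_eff = attn_dim * num_head          # 296: effective projection width
query_adaptor = nn.Linear(isize, hsize_eff, bias=False)
kv_adaptor = nn.Linear(isize, hsize_eff * 2, bias=False)   # keys and values fused

iQ = torch.randn(2, 7, isize)            # (bsize, nquery, isize)
iK = torch.randn(2, 11, isize)           # (bsize, seql, isize)
q = query_adaptor(iQ).view(2, 7, num_head, attn_dim).transpose(1, 2)
k, v = kv_adaptor(iK).view(2, 11, 2, num_head, attn_dim).unbind(2)
print(q.size(), k.size(), v.size())      # (2, 8, 7, 37), (2, 11, 8, 37), (2, 11, 8, 37)
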
def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None):

    super(EncoderLayer, self).__init__()

    _ahsize = isize if ahsize is None else ahsize
    _fhsize = _ahsize * 4 if fhsize is None else fhsize

    self.attn = SelfAttn(isize, _ahsize, isize, num_head, dropout=attn_drop)
    self.ff = PositionwiseFF(isize, _fhsize, dropout)

    self.layer_normer = nn.LayerNorm(isize, eps=1e-06)

    self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None

def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, norm_residual=norm_residual_default):

    super(DecoderLayer, self).__init__()

    _ahsize = isize if ahsize is None else ahsize
    _fhsize = _ahsize * 4 if fhsize is None else fhsize

    self.net = HPLSTM(isize, num_head=num_head, osize=isize, fhsize=_fhsize, dropout=dropout)
    self.cross_attn = ResCrossAttn(isize, _ahsize, num_head=num_head, dropout=attn_drop, norm_residual=norm_residual)
    self.ff = PositionwiseFF(isize, hsize=_fhsize, dropout=dropout, norm_residual=norm_residual)

    self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None

def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, norm_residual=norm_residual_default):

    super(DecoderLayer, self).__init__()

    _ahsize = isize if ahsize is None else ahsize
    _fhsize = _ahsize * 4 if fhsize is None else fhsize

    self.net = HPLSTM(isize, num_head=num_head, osize=isize, fhsize=_fhsize, dropout=dropout)
    self.cross_attn = ResCrossAttn(isize, _ahsize, num_head=num_head, dropout=attn_drop, norm_residual=norm_residual)

    self.layer_normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

    self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None

def __init__(self, isize, fhsize=None, dropout=0.0, num_head=8):

    super(EncoderLayer, self).__init__()

    _fhsize = isize * 4 if fhsize is None else fhsize

    self.net = BiHPLSTM(isize, num_head=num_head, osize=isize, fhsize=_fhsize, dropout=dropout)

    self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None

def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, share_layer=False, num_layer_dec=6, max_chunk_tokens=8, min_chunks=4, **kwargs):

    _ahsize = isize if ahsize is None else ahsize
    _fhsize = _ahsize * 4 if fhsize is None else fhsize

    super(Encoder, self).__init__(isize, nwd, num_layer, fhsize=_fhsize, dropout=dropout, attn_drop=attn_drop, num_head=num_head, xseql=xseql, ahsize=_ahsize, norm_output=norm_output, share_layer=share_layer, num_layer_dec=num_layer_dec, **kwargs)

    if share_layer:
        _shared_layer = EncoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize)
        self.nets = nn.ModuleList([_shared_layer for i in range(num_layer)])
    else:
        self.nets = nn.ModuleList([EncoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize) for i in range(num_layer)])

    self.sc_tattn_w = nn.Parameter(torch.Tensor(num_layer + 1, num_layer_dec).uniform_(-sqrt(1.0 / (num_layer + 1)), sqrt(1.0 / (num_layer + 1))))
    self.sc_tattn_drop = Dropout(dropout) if dropout > 0.0 else None

    self.mxct = max_chunk_tokens
    self.mnck = float(min_chunks)

def __init__(self, isize, hsize=None, dropout=0.0, enable_ffn=False, num_pos=cache_len_default, custom_act=use_adv_act_default, enable_bias=enable_prev_ln_bias_default, enable_proj_bias=enable_proj_bias_default):

    super(AverageAttn, self).__init__()

    _hsize = isize if hsize is None else hsize

    self.num_pos = num_pos
    self.register_buffer('w', torch.Tensor(num_pos, 1))

    if enable_ffn:
        self.ffn = nn.Sequential(
            Linear(isize, _hsize, bias=enable_bias),
            nn.LayerNorm(_hsize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters),
            Custom_Act() if custom_act else nn.ReLU(inplace=True),
            Dropout(dropout, inplace=inplace_after_Custom_Act),
            Linear(_hsize, isize, bias=enable_proj_bias),
            Dropout(dropout, inplace=True),
        ) if dropout > 0.0 else nn.Sequential(
            Linear(isize, _hsize, bias=enable_bias),
            nn.LayerNorm(_hsize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters),
            Custom_Act() if custom_act else nn.ReLU(inplace=True),
            Linear(_hsize, isize, bias=enable_proj_bias),
        )
    else:
        self.ffn = None

    self.gw = Linear(isize * 2, isize * 2)

    self.reset_parameters()

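The `w` buffer above is registered uninitialized and presumably filled by `reset_parameters`. A hedged sketch of the cumulative-average weighting used by the average attention network, where filling the buffer with 1 / position is an assumption about what `reset_parameters` does in this repository:

import torch

num_pos = 6
# Assumed filling: w[t] = 1 / (t + 1), shaped (num_pos, 1) like the registered buffer
w = (1.0 / torch.arange(1, num_pos + 1, dtype=torch.float32)).unsqueeze(-1)
x = torch.randn(2, num_pos, 4)           # (bsize, seql, isize)
avg = x.cumsum(dim=1) * w                # running mean over positions
print(torch.allclose(avg[0, 2], x[0, :3].mean(dim=0)))   # True
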
def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, norm_residue=False):

    super(DecoderLayer, self).__init__()

    _ahsize = isize if ahsize is None else ahsize
    _fhsize = _ahsize * 4 if fhsize is None else fhsize

    self.self_attn = SelfAttn(isize, _ahsize, isize, num_head, dropout=attn_drop)
    self.cross_attn = CrossAttn(isize, _ahsize, isize, num_head, dropout=attn_drop)
    self.ff = PositionwiseFF(isize, _fhsize, dropout, norm_residue)

    self.layer_normer1 = nn.LayerNorm(isize, eps=1e-06)
    self.layer_normer2 = nn.LayerNorm(isize, eps=1e-06)

    if dropout > 0:
        self.d1 = Dropout(dropout, inplace=True)
        self.d2 = Dropout(dropout, inplace=True)
    else:
        self.d1 = None
        self.d2 = None

    self.norm_residue = norm_residue

def __init__(self, isize, hsize=None, dropout=0.0, custom_act=use_adv_act_default):

    super(DATTNCombiner, self).__init__()

    _hsize = isize * 4 if hsize is None else hsize

    self.net = nn.Sequential(
        Linear(isize * 2, _hsize),
        Dropout(dropout, inplace=True),
        Custom_Act() if custom_act else nn.Sigmoid(),
        Scorer(_hsize, bias=False),
    ) if dropout > 0.0 else nn.Sequential(
        Linear(isize * 2, _hsize),
        Custom_Act() if custom_act else nn.Sigmoid(),
        Scorer(_hsize, bias=False),
    )

def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, bindemb=False, forbidden_index=None, **kwargs):

    _ahsize = isize if ahsize is None else ahsize
    _fhsize = _ahsize * 4 if fhsize is None else fhsize

    super(Decoder, self).__init__(isize, nwd, num_layer, fhsize=_fhsize, dropout=dropout, attn_drop=attn_drop, emb_w=emb_w, num_head=num_head, xseql=xseql, ahsize=_ahsize, norm_output=norm_output, bindemb=bindemb, forbidden_index=forbidden_index, **kwargs)

    self.nets = nn.ModuleList([DecoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize) for i in range(num_layer)])

    self.tattn_w = nn.Parameter(torch.Tensor(num_layer * num_head).uniform_(-sqrt(1.0 / (num_layer * num_head)), sqrt(1.0 / (num_layer * num_head))))
    self.tattn_drop = Dropout(dropout) if dropout > 0.0 else None

    self.trans = Linear(isize, isize, bias=False)

def __init__(self, isize, hsize=None, dropout=0.0, use_GeLU=use_adv_act_default):

    super(ATTNCombiner, self).__init__()

    _hsize = isize * 4 if hsize is None else hsize

    self.net = nn.Sequential(
        Linear(isize * 2, _hsize),
        Dropout(dropout, inplace=True),
        GeLU() if use_GeLU else nn.Sigmoid(),
        Scorer(_hsize),
        nn.Sigmoid(),
    ) if dropout > 0.0 else nn.Sequential(
        Linear(isize * 2, _hsize),
        GeLU() if use_GeLU else nn.Sigmoid(),
        Scorer(_hsize),
        nn.Sigmoid(),
    )

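`ATTNCombiner` maps the concatenation of two feature vectors through a hidden layer, a `Scorer`, and a final `nn.Sigmoid`, i.e. it produces a gate in (0, 1). A self-contained sketch of that pattern with plain PyTorch modules standing in for `Linear`, `GeLU`, and `Scorer` (assumed here to be a projection to a single score); the final blend is one plausible use of such a gate, not taken from this repository:

import torch
import torch.nn as nn

isize, hsize = 8, 32
combiner = nn.Sequential(
    nn.Linear(isize * 2, hsize),
    nn.GELU(),
    nn.Linear(hsize, 1),   # stand-in for Scorer(_hsize)
    nn.Sigmoid(),
)
a, b = torch.randn(2, 5, isize), torch.randn(2, 5, isize)
gate = combiner(torch.cat((a, b), dim=-1))    # (2, 5, 1), values in (0, 1)
blended = gate * a + (1.0 - gate) * b         # one plausible use of the gate
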
def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, emb_w=None, share_layer=False, disable_pemb=disable_std_pemb_encoder):

    super(MSEncoder, self).__init__()

    _ahsize = isize if ahsize is None else ahsize
    _fhsize = _ahsize * 4 if fhsize is None else fhsize

    self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None

    self.wemb = nn.Embedding(nwd, isize, padding_idx=pad_id)
    if emb_w is not None:
        self.wemb.weight = emb_w

    self.pemb = None if disable_pemb else PositionalEmb(isize, xseql, 0, 0)

    if share_layer:
        _shared_layer = MSEncoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize)
        self.nets = nn.ModuleList([_shared_layer for i in range(num_layer)])
    else:
        self.nets = nn.ModuleList([MSEncoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize) for i in range(num_layer)])

    self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default, elementwise_affine=enable_ln_parameters) if norm_output else None

def __init__(self, isize, num_head=8, osize=None, dropout=0.0, custom_act=use_adv_act_default, enable_bias=enable_prev_ln_bias_default):

    super(MHPLSTMCore, self).__init__()

    _osize = isize if osize is None else osize

    i_head_dim = float2odd(float(isize) / num_head)
    i_hsize = i_head_dim * num_head
    o_head_dim = float2odd(float(_osize) / num_head)
    o_hsize = o_head_dim * num_head

    self.trans_hid = GroupLinear(i_hsize + i_hsize, o_hsize * 3, num_head, bias=enable_bias, shuffle=False, trans_input=False, flatten_output=False)
    self.trans_og = nn.Sequential(
        GroupLinear(i_hsize + o_hsize, o_hsize, num_head, bias=enable_bias, shuffle=False, trans_input=False, flatten_output=False),
        nn.LayerNorm((num_head, o_head_dim), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters),
    )

    self.normer_csum = nn.LayerNorm((num_head, i_head_dim), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)
    self.normer_hid = nn.LayerNorm((num_head, 3, o_head_dim), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

    self.act = Custom_Act() if custom_act else nn.ReLU()  # Tanh()
    self.drop = Dropout(dropout, inplace=inplace_after_Custom_Act) if dropout > 0.0 else None

    self.init_cx = nn.Parameter(torch.zeros(1, num_head, o_head_dim))

def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=512, ahsize=None, norm_output=True, bindemb=False, forbidden_index=None):

    _ahsize = isize if ahsize is None else ahsize
    _fhsize = _ahsize * 4 if fhsize is None else fhsize

    super(Decoder, self).__init__(isize, nwd, num_layer, _fhsize, dropout, attn_drop, emb_w, num_head, xseql, _ahsize, norm_output, bindemb, forbidden_index)

    self.nets = nn.ModuleList([DecoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize) for i in range(num_layer)])

    self.tattn_w = nn.Parameter(torch.Tensor(num_layer * num_head).uniform_(-sqrt(1.0 / (num_layer * num_head)), sqrt(1.0 / (num_layer * num_head))))
    self.tattn_drop = Dropout(dropout) if dropout > 0.0 else None

    self.classifier = nn.Sequential(Linear(isize * 2, isize, bias=False), Linear(isize, nwd))
    # Note: this ties the final classifier weight to self.wemb.weight; it relies on
    # module traversal order, which may break if torch.nn changes its internals.
    if bindemb:
        list(self.classifier.modules())[-1].weight = self.wemb.weight

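The `bindemb` branch above ties the weight of the final classifier projection to the embedding matrix so both share one parameter. A minimal sketch of the same tying with standard PyTorch modules; indexing the `nn.Sequential` directly is a slightly more robust way to reach the last `Linear` than traversing `modules()`:

import torch
import torch.nn as nn

nwd, isize = 1000, 512
wemb = nn.Embedding(nwd, isize)
classifier = nn.Sequential(nn.Linear(isize * 2, isize, bias=False), nn.Linear(isize, nwd))
# Tie the output projection to the embedding matrix, mirroring the bindemb branch above
classifier[-1].weight = wemb.weight
assert classifier[-1].weight.data_ptr() == wemb.weight.data_ptr()
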