Example 1
    def __init__(self,
                 isize,
                 nwd,
                 num_layer,
                 fhsize=None,
                 dropout=0.0,
                 attn_drop=0.0,
                 num_head=8,
                 xseql=512,
                 ahsize=None,
                 norm_output=True,
                 num_layer_dec=6):

        _ahsize = isize if ahsize is None else ahsize

        _fhsize = _ahsize * 4 if fhsize is None else fhsize

        super(Encoder,
              self).__init__(isize, nwd, num_layer, _fhsize, dropout,
                             attn_drop, num_head, xseql, _ahsize, norm_output)

        self.nets = nn.ModuleList([
            EncoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize)
            for i in range(num_layer)
        ])

        # per-layer combination weights (one column per decoder layer),
        # initialised with Xavier-style uniform bounds
        self.tattn_w = nn.Parameter(
            torch.Tensor(num_layer + 1, num_layer_dec).uniform_(
                -sqrt(2.0 / (num_layer + num_layer_dec + 1)),
                sqrt(2.0 / (num_layer + num_layer_dec + 1))))
        self.tattn_drop = Dropout(dropout) if dropout > 0.0 else None
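
The (num_layer + 1, num_layer_dec) shape of tattn_w suggests a transparent-attention style combination, where each decoder layer reads a learned mixture of all encoder layer outputs (plus the embedding). Below is a minimal, self-contained sketch of how such weights could be applied; the function name combine_layers and the softmax normalisation are assumptions, not taken from the repository:

import torch

def combine_layers(layer_outputs, tattn_w):
    # layer_outputs: num_layer + 1 tensors of shape (bsize, seql, isize)
    # tattn_w: (num_layer + 1, num_layer_dec) combination weights
    stacked = torch.stack(layer_outputs, dim=-1)             # (bsize, seql, isize, num_layer + 1)
    weights = torch.softmax(tattn_w, dim=0)                  # normalise over the layer dimension
    # one combined encoder representation per decoder layer
    return torch.einsum('bsil,ld->bsid', stacked, weights)   # (bsize, seql, isize, num_layer_dec)

# usage: 6 encoder layers (+ embedding) feeding 6 decoder layers
outs = [torch.randn(2, 5, 16) for _ in range(7)]
print(combine_layers(outs, torch.randn(7, 6)).shape)  # torch.Size([2, 5, 16, 6])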
Example 2
    def __init__(self,
                 isize,
                 hsize,
                 osize,
                 num_head=8,
                 dropout=0.0,
                 enable_bias=False,
                 sparsenorm=False):

        super(CrossAttn, self).__init__()

        # per-head dimension; the effective hidden size is rounded down to a
        # multiple of num_head
        self.attn_dim = hsize // num_head
        self.hsize = self.attn_dim * num_head
        self.num_head = num_head

        self.query_adaptor = Linear(isize, self.hsize, bias=enable_bias)
        self.kv_adaptor = Linear(isize, self.hsize * 2, bias=enable_bias)

        self.outer = Linear(self.hsize, osize, bias=enable_bias)

        #self.normer = MHSparseNormer(num_head, dim=-1) if sparsenorm else nn.Softmax(dim=-1)
        self.normer = SparseNormer(dim=-1) if sparsenorm else nn.Softmax(
            dim=-1)

        self.drop = Dropout(dropout,
                            inplace=sparsenorm) if dropout > 0.0 else None
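
For reference, here is a plain-PyTorch sketch of the forward pass a module with these projections typically performs: one query projection, one packed key/value projection, scaled dot-product attention per head, and an output projection. The class ToyCrossAttn is illustrative only, not the repository's actual forward.

import torch
import torch.nn as nn

class ToyCrossAttn(nn.Module):

    def __init__(self, isize, hsize, osize, num_head=8):
        super().__init__()
        self.attn_dim = hsize // num_head
        self.hsize = self.attn_dim * num_head
        self.num_head = num_head
        self.query_adaptor = nn.Linear(isize, self.hsize, bias=False)
        self.kv_adaptor = nn.Linear(isize, self.hsize * 2, bias=False)
        self.outer = nn.Linear(self.hsize, osize, bias=False)

    def forward(self, iQ, iK):
        bsize, nquery, _ = iQ.size()
        seql = iK.size(1)
        nh, adim = self.num_head, self.attn_dim
        # split heads: (bsize, num_head, len, attn_dim)
        q = self.query_adaptor(iQ).view(bsize, nquery, nh, adim).transpose(1, 2)
        k, v = self.kv_adaptor(iK).view(bsize, seql, 2, nh, adim).unbind(2)
        k, v = k.transpose(1, 2), v.transpose(1, 2)
        # scaled dot-product attention over the key/value sequence
        scores = torch.matmul(q, k.transpose(-1, -2)) / (adim ** 0.5)
        out = torch.matmul(scores.softmax(dim=-1), v)
        return self.outer(out.transpose(1, 2).contiguous().view(bsize, nquery, self.hsize))

# usage
m = ToyCrossAttn(16, 16, 16, num_head=4)
print(m(torch.randn(2, 3, 16), torch.randn(2, 7, 16)).shape)  # torch.Size([2, 3, 16])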
Example 3
    def __init__(self,
                 isize,
                 fhsize=None,
                 dropout=0.0,
                 attn_drop=0.0,
                 num_head=8,
                 ahsize=None):

        super(EncoderLayer, self).__init__()

        _ahsize = isize if ahsize is None else ahsize

        _fhsize = _ahsize * 4 if fhsize is None else fhsize

        self.attn = SelfAttn(isize,
                             _ahsize,
                             isize,
                             num_head,
                             dropout=attn_drop)

        self.ff = PositionwiseFF(isize, _fhsize, dropout)

        self.layer_normer = nn.LayerNorm(isize, eps=1e-06)

        self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None
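
The layer above pairs its sub-layers with a LayerNorm and an optional in-place Dropout; whether normalisation is applied before or after the residual differs across the examples in this file. A self-contained sketch of the pre-norm residual pattern these pieces commonly implement; the wrapper class PreNormResidual is made up for illustration:

import torch
import torch.nn as nn

class PreNormResidual(nn.Module):
    # LayerNorm -> sub-layer -> dropout -> add residual
    def __init__(self, isize, sublayer, dropout=0.0):
        super().__init__()
        self.layer_normer = nn.LayerNorm(isize, eps=1e-06)
        self.sublayer = sublayer
        self.drop = nn.Dropout(dropout) if dropout > 0.0 else None

    def forward(self, x):
        out = self.sublayer(self.layer_normer(x))
        if self.drop is not None:
            out = self.drop(out)
        return out + x

# usage: wrap a small feed-forward block in the residual pattern
ff = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 16))
block = PreNormResidual(16, ff, dropout=0.1)
print(block(torch.randn(2, 5, 16)).shape)  # torch.Size([2, 5, 16])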
Example 4
    def __init__(self,
                 isize,
                 fhsize=None,
                 dropout=0.0,
                 attn_drop=0.0,
                 num_head=8,
                 ahsize=None,
                 norm_residual=norm_residual_default):

        super(DecoderLayer, self).__init__()

        _ahsize = isize if ahsize is None else ahsize
        _fhsize = _ahsize * 4 if fhsize is None else fhsize

        self.net = HPLSTM(isize,
                          num_head=num_head,
                          osize=isize,
                          fhsize=_fhsize,
                          dropout=dropout)
        self.cross_attn = ResCrossAttn(isize,
                                       _ahsize,
                                       num_head=num_head,
                                       dropout=attn_drop,
                                       norm_residual=norm_residual)
        self.ff = PositionwiseFF(isize,
                                 hsize=_fhsize,
                                 dropout=dropout,
                                 norm_residual=norm_residual)

        self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None
Example 5
    def __init__(self,
                 isize,
                 fhsize=None,
                 dropout=0.0,
                 attn_drop=0.0,
                 num_head=8,
                 ahsize=None,
                 norm_residual=norm_residual_default):

        super(DecoderLayer, self).__init__()

        _ahsize = isize if ahsize is None else ahsize
        _fhsize = _ahsize * 4 if fhsize is None else fhsize

        self.net = HPLSTM(isize,
                          num_head=num_head,
                          osize=isize,
                          fhsize=_fhsize,
                          dropout=dropout)
        self.cross_attn = ResCrossAttn(isize,
                                       _ahsize,
                                       num_head=num_head,
                                       dropout=attn_drop,
                                       norm_residual=norm_residual)

        self.layer_normer = nn.LayerNorm(
            isize,
            eps=ieps_ln_default,
            elementwise_affine=enable_ln_parameters)

        self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None
Example 6
	def __init__(self, isize, fhsize=None, dropout=0.0, num_head=8):

		super(EncoderLayer, self).__init__()

		_fhsize = isize * 4 if fhsize is None else fhsize

		self.net = BiHPLSTM(isize, num_head=num_head, osize=isize, fhsize=_fhsize, dropout=dropout)

		self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None
Example 7
    def __init__(self,
                 isize,
                 nwd,
                 num_layer,
                 fhsize=None,
                 dropout=0.0,
                 attn_drop=0.0,
                 num_head=8,
                 xseql=cache_len_default,
                 ahsize=None,
                 norm_output=True,
                 share_layer=False,
                 num_layer_dec=6,
                 max_chunk_tokens=8,
                 min_chunks=4,
                 **kwargs):

        _ahsize = isize if ahsize is None else ahsize

        _fhsize = _ahsize * 4 if fhsize is None else fhsize

        super(Encoder, self).__init__(isize,
                                      nwd,
                                      num_layer,
                                      fhsize=_fhsize,
                                      dropout=dropout,
                                      attn_drop=attn_drop,
                                      num_head=num_head,
                                      xseql=xseql,
                                      ahsize=_ahsize,
                                      norm_output=norm_output,
                                      share_layer=share_layer,
                                      num_layer_dec=num_layer_dec,
                                      **kwargs)

        if share_layer:
            _shared_layer = EncoderLayer(isize, _fhsize, dropout, attn_drop,
                                         num_head, _ahsize)
            self.nets = nn.ModuleList(
                [_shared_layer for i in range(num_layer)])
        else:
            self.nets = nn.ModuleList([
                EncoderLayer(isize, _fhsize, dropout, attn_drop, num_head,
                             _ahsize) for i in range(num_layer)
            ])

        self.sc_tattn_w = nn.Parameter(
            torch.Tensor(num_layer + 1,
                         num_layer_dec).uniform_(-sqrt(1.0 / (num_layer + 1)),
                                                 sqrt(1.0 / (num_layer + 1))))
        self.sc_tattn_drop = Dropout(dropout) if dropout > 0.0 else None

        self.mxct = max_chunk_tokens
        self.mnck = float(min_chunks)
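
max_chunk_tokens and min_chunks control how the encoder splits a sequence into chunks for its summary attention. A plausible reading of the arithmetic, stated as an assumption rather than taken from the repository's forward pass:

from math import ceil

def num_chunks(seql, max_chunk_tokens=8, min_chunks=4.0):
    # at most max_chunk_tokens tokens per chunk, but never fewer than
    # min_chunks chunks (hypothetical helper for illustration)
    return max(ceil(seql / max_chunk_tokens), int(min_chunks))

print(num_chunks(100))  # 13
print(num_chunks(10))   # 4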
Example 8
    def __init__(self,
                 isize,
                 hsize=None,
                 dropout=0.0,
                 enable_ffn=False,
                 num_pos=cache_len_default,
                 custom_act=use_adv_act_default,
                 enable_bias=enable_prev_ln_bias_default,
                 enable_proj_bias=enable_proj_bias_default):

        super(AverageAttn, self).__init__()

        _hsize = isize if hsize is None else hsize

        self.num_pos = num_pos
        # per-position weight table of shape (num_pos, 1); left uninitialised
        # here, reset_parameters() is called at the end of __init__
        self.register_buffer('w', torch.Tensor(num_pos, 1))

        if enable_ffn:
            # the two nn.Sequential branches below are identical except that
            # the dropout > 0.0 variant inserts Dropout layers
            self.ffn = nn.Sequential(
                Linear(isize, _hsize, bias=enable_bias),
                nn.LayerNorm(_hsize,
                             eps=ieps_ln_default,
                             elementwise_affine=enable_ln_parameters),
                Custom_Act() if custom_act else nn.ReLU(inplace=True),
                Dropout(dropout, inplace=inplace_after_Custom_Act),
                Linear(_hsize, isize, bias=enable_proj_bias),
                Dropout(
                    dropout,
                    inplace=True)) if dropout > 0.0 else nn.Sequential(
                        Linear(isize, _hsize, bias=enable_bias),
                        nn.LayerNorm(_hsize,
                                     eps=ieps_ln_default,
                                     elementwise_affine=enable_ln_parameters),
                        Custom_Act() if custom_act else nn.ReLU(inplace=True),
                        Linear(_hsize, isize, bias=enable_proj_bias))
        else:
            self.ffn = None

        self.gw = Linear(isize * 2, isize * 2)

        self.reset_parameters()
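
An average-attention module of this kind typically replaces dot-product attention with a cumulative average over the preceding positions; the (num_pos, 1) buffer registered above holds per-position weights and is presumably filled by reset_parameters(). A self-contained illustration of the cumulative-average idea itself, not of the repository's implementation:

import torch

def cumulative_average(x):
    # x: (bsize, seql, isize); position t receives the mean of x[:, :t + 1]
    seql = x.size(1)
    steps = torch.arange(1, seql + 1, dtype=x.dtype, device=x.device).view(1, seql, 1)
    return x.cumsum(dim=1) / steps

x = torch.randn(2, 4, 8)
print(torch.allclose(cumulative_average(x)[:, -1], x.mean(dim=1)))  # True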
Example 9
    def __init__(self,
                 isize,
                 fhsize=None,
                 dropout=0.0,
                 attn_drop=0.0,
                 num_head=8,
                 ahsize=None,
                 norm_residue=False):

        super(DecoderLayer, self).__init__()

        _ahsize = isize if ahsize is None else ahsize

        _fhsize = _ahsize * 4 if fhsize is None else fhsize

        self.self_attn = SelfAttn(isize,
                                  _ahsize,
                                  isize,
                                  num_head,
                                  dropout=attn_drop)
        self.cross_attn = CrossAttn(isize,
                                    _ahsize,
                                    isize,
                                    num_head,
                                    dropout=attn_drop)

        self.ff = PositionwiseFF(isize, _fhsize, dropout, norm_residue)

        self.layer_normer1 = nn.LayerNorm(isize, eps=1e-06)
        self.layer_normer2 = nn.LayerNorm(isize, eps=1e-06)

        if dropout > 0:
            self.d1 = Dropout(dropout, inplace=True)
            self.d2 = Dropout(dropout, inplace=True)
        else:
            self.d1 = None
            self.d2 = None

        self.norm_residue = norm_residue
Example 10
    def __init__(self,
                 isize,
                 hsize=None,
                 dropout=0.0,
                 custom_act=use_adv_act_default):

        super(DATTNCombiner, self).__init__()

        _hsize = isize * 4 if hsize is None else hsize

        self.net = nn.Sequential(
            Linear(isize * 2, _hsize), Dropout(dropout, inplace=True),
            Custom_Act() if custom_act else nn.Sigmoid(),
            Scorer(_hsize, bias=False)) if dropout > 0.0 else nn.Sequential(
                Linear(isize * 2, _hsize),
                Custom_Act() if custom_act else nn.Sigmoid(),
                Scorer(_hsize, bias=False))
Example 11
    def __init__(self,
                 isize,
                 nwd,
                 num_layer,
                 fhsize=None,
                 dropout=0.0,
                 attn_drop=0.0,
                 emb_w=None,
                 num_head=8,
                 xseql=cache_len_default,
                 ahsize=None,
                 norm_output=True,
                 bindemb=False,
                 forbidden_index=None,
                 **kwargs):

        _ahsize = isize if ahsize is None else ahsize

        _fhsize = _ahsize * 4 if fhsize is None else fhsize

        super(Decoder, self).__init__(isize,
                                      nwd,
                                      num_layer,
                                      fhsize=_fhsize,
                                      dropout=dropout,
                                      attn_drop=attn_drop,
                                      emb_w=emb_w,
                                      num_head=num_head,
                                      xseql=xseql,
                                      ahsize=_ahsize,
                                      norm_output=norm_output,
                                      bindemb=bindemb,
                                      forbidden_index=forbidden_index,
                                      **kwargs)

        self.nets = nn.ModuleList([
            DecoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize)
            for i in range(num_layer)
        ])

        self.tattn_w = nn.Parameter(
            torch.Tensor(num_layer * num_head).uniform_(
                -sqrt(1.0 / (num_layer * num_head)),
                sqrt(1.0 / (num_layer * num_head))))
        self.tattn_drop = Dropout(dropout) if dropout > 0.0 else None
        self.trans = Linear(isize, isize, bias=False)
Example 12
    def __init__(self,
                 isize,
                 hsize=None,
                 dropout=0.0,
                 use_GeLU=use_adv_act_default):

        super(ATTNCombiner, self).__init__()

        _hsize = isize * 4 if hsize is None else hsize

        self.net = nn.Sequential(
            Linear(isize * 2, _hsize), Dropout(dropout, inplace=True),
            GeLU() if use_GeLU else nn.Sigmoid(), Scorer(_hsize),
            nn.Sigmoid()) if dropout > 0.0 else nn.Sequential(
                Linear(isize * 2, _hsize),
                GeLU() if use_GeLU else nn.Sigmoid(), Scorer(_hsize),
                nn.Sigmoid())
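
Both combiners (Examples 10 and 12) map the concatenation of two representations to a single score. A hedged sketch of how such a score is commonly used as a mixing gate; the forward logic and the class name ToyCombiner are assumptions, only the module layout mirrors the snippets:

import torch
import torch.nn as nn

class ToyCombiner(nn.Module):
    # score a pair of representations and mix them with the resulting gate
    def __init__(self, isize, hsize=None):
        super().__init__()
        _hsize = isize * 4 if hsize is None else hsize
        self.net = nn.Sequential(
            nn.Linear(isize * 2, _hsize), nn.Sigmoid(),
            nn.Linear(_hsize, 1), nn.Sigmoid())  # the last Linear plays the role of Scorer

    def forward(self, a, b):
        gate = self.net(torch.cat((a, b), dim=-1))  # (..., 1), values in (0, 1)
        return gate * a + (1.0 - gate) * b

m = ToyCombiner(16)
print(m(torch.randn(2, 5, 16), torch.randn(2, 5, 16)).shape)  # torch.Size([2, 5, 16])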
Example 13
    def __init__(self,
                 isize,
                 nwd,
                 num_layer,
                 fhsize=None,
                 dropout=0.0,
                 attn_drop=0.0,
                 num_head=8,
                 xseql=cache_len_default,
                 ahsize=None,
                 norm_output=True,
                 emb_w=None,
                 share_layer=False,
                 disable_pemb=disable_std_pemb_encoder):

        super(MSEncoder, self).__init__()

        _ahsize = isize if ahsize is None else ahsize
        _fhsize = _ahsize * 4 if fhsize is None else fhsize

        self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None

        self.wemb = nn.Embedding(nwd, isize, padding_idx=pad_id)
        if emb_w is not None:
            self.wemb.weight = emb_w

        self.pemb = None if disable_pemb else PositionalEmb(isize, xseql, 0, 0)
        if share_layer:
            _shared_layer = MSEncoderLayer(isize, _fhsize, dropout, attn_drop,
                                           num_head, _ahsize)
            self.nets = nn.ModuleList(
                [_shared_layer for i in range(num_layer)])
        else:
            self.nets = nn.ModuleList([
                MSEncoderLayer(isize, _fhsize, dropout, attn_drop, num_head,
                               _ahsize) for i in range(num_layer)
            ])

        self.out_normer = nn.LayerNorm(
            isize,
            eps=ieps_ln_default,
            elementwise_affine=enable_ln_parameters) if norm_output else None
Example 14
	def __init__(self, isize, num_head=8, osize=None, dropout=0.0, custom_act=use_adv_act_default, enable_bias=enable_prev_ln_bias_default):

		super(MHPLSTMCore, self).__init__()

		_osize = isize if osize is None else osize

		i_head_dim = float2odd(float(isize) / num_head)
		i_hsize = i_head_dim * num_head
		o_head_dim = float2odd(float(_osize) / num_head)
		o_hsize = o_head_dim * num_head

		self.trans_hid = GroupLinear(i_hsize + i_hsize, o_hsize * 3, num_head, bias=enable_bias, shuffle=False, trans_input=False, flatten_output=False)
		self.trans_og = nn.Sequential(GroupLinear(i_hsize + o_hsize, o_hsize, num_head, bias=enable_bias, shuffle=False, trans_input=False, flatten_output=False), nn.LayerNorm((num_head, o_head_dim), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters))

		self.normer_csum = nn.LayerNorm((num_head, i_head_dim), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)
		self.normer_hid = nn.LayerNorm((num_head, 3, o_head_dim), eps=ieps_ln_default, elementwise_affine=enable_ln_parameters)

		self.act = Custom_Act() if custom_act else nn.ReLU()#Tanh()
		self.drop = Dropout(dropout, inplace=inplace_after_Custom_Act) if dropout > 0.0 else None
		self.init_cx = nn.Parameter(torch.zeros(1, num_head, o_head_dim))
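
GroupLinear above presumably applies an independent linear map per head. Below is a minimal self-contained equivalent using einsum, without the shuffle/trans_input/flatten_output options of the repository's version; ToyGroupLinear is an illustration, not the actual class:

import torch
import torch.nn as nn

class ToyGroupLinear(nn.Module):
    # one independent weight matrix per group (here: per attention head)
    def __init__(self, isize, osize, ngroup, bias=False):
        super().__init__()
        self.ngroup = ngroup
        self.weight = nn.Parameter(torch.randn(ngroup, isize // ngroup, osize // ngroup) * 0.02)
        self.bias = nn.Parameter(torch.zeros(ngroup, osize // ngroup)) if bias else None

    def forward(self, x):
        # x: (bsize, seql, ngroup, isize // ngroup)
        out = torch.einsum('bsgi,gio->bsgo', x, self.weight)
        return out if self.bias is None else out + self.bias

m = ToyGroupLinear(32, 48, 4)
print(m(torch.randn(2, 5, 4, 8)).shape)  # torch.Size([2, 5, 4, 12])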
Example 15
    def __init__(self,
                 isize,
                 nwd,
                 num_layer,
                 fhsize=None,
                 dropout=0.0,
                 attn_drop=0.0,
                 emb_w=None,
                 num_head=8,
                 xseql=512,
                 ahsize=None,
                 norm_output=True,
                 bindemb=False,
                 forbidden_index=None):

        _ahsize = isize if ahsize is None else ahsize

        _fhsize = _ahsize * 4 if fhsize is None else fhsize

        super(Decoder,
              self).__init__(isize, nwd, num_layer, _fhsize, dropout,
                             attn_drop, emb_w, num_head, xseql, _ahsize,
                             norm_output, bindemb, forbidden_index)

        self.nets = nn.ModuleList([
            DecoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize)
            for i in range(num_layer)
        ])

        self.tattn_w = nn.Parameter(
            torch.Tensor(num_layer * num_head).uniform_(
                -sqrt(1.0 / (num_layer * num_head)),
                sqrt(1.0 / (num_layer * num_head))))
        self.tattn_drop = Dropout(dropout) if dropout > 0.0 else None

        self.classifier = nn.Sequential(Linear(isize * 2, isize, bias=False),
                                        Linear(isize, nwd))
        # Note: the line below shares the classifier's weight with self.wemb;
        # such direct Parameter assignment may break if torch.nn changes
        if bindemb:
            list(self.classifier.modules())[-1].weight = self.wemb.weight
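
The final lines tie the classifier's output weight to the embedding table so that the same matrix serves both input embedding and output projection. A self-contained illustration of this weight-tying idiom in plain PyTorch; the class TiedOutputLayer is hypothetical:

import torch
import torch.nn as nn

class TiedOutputLayer(nn.Module):
    # output projection whose weight is shared with the embedding table
    def __init__(self, nwd, isize, bindemb=True):
        super().__init__()
        self.wemb = nn.Embedding(nwd, isize)
        self.classifier = nn.Linear(isize, nwd, bias=False)
        if bindemb:
            # both modules now hold the very same Parameter object
            self.classifier.weight = self.wemb.weight

    def forward(self, hidden):
        return self.classifier(hidden)

m = TiedOutputLayer(100, 16)
print(m.classifier.weight is m.wemb.weight)  # True
print(m(torch.randn(2, 5, 16)).shape)        # torch.Size([2, 5, 100])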