def __init__(self, conf: AttentionPlainConf, **kwargs):
    """Build the sub-nodes of the plain attention node.

    Args:
        conf: attention configuration. Fields read here: ``dim_q``, ``dim_k``,
            ``dim_v``, ``nh_qk``, ``d_qk``, ``nh_v``, ``d_v``, ``init_scale_hin``,
            ``rel``, ``use_rposi``, ``useaff_qk2v``, ``att_drop``, ``out_act``.
        **kwargs: passed through untouched to the parent constructor.
    """
    super().__init__(conf, **kwargs)
    # From here on, work with the conf object held on self.
    cfg: AttentionPlainConf = self.conf
    dim_q, dim_k, dim_v = cfg.dim_q, cfg.dim_k, cfg.dim_v
    nh_qk, d_qk = cfg.nh_qk, cfg.d_qk
    nh_v, d_v = cfg.nh_v, cfg.d_v
    # Total output widths of the query/key and value projections (heads * per-head size).
    qk_width = nh_qk * d_qk
    v_width = nh_v * d_v
    # --
    # Score scale: square root of the per-head query/key size.
    self._att_scale = math.sqrt(d_qk)
    # NOTE(review): sub-nodes are created in the same order as before; the order
    # presumably matters for parameter registration / initialisation -- confirm.
    # Pre-attention affines (no dropout on these!).
    # Extra init gain = init value for a single-head shape divided by the one for
    # the all-heads shape, further multiplied by the configured init_scale_hin.
    one_head_q = BK.get_inita_xavier_uniform((d_qk, dim_q))
    all_heads_q = BK.get_inita_xavier_uniform((qk_width, dim_q))
    gain_q = one_head_q / all_heads_q
    self.affine_q = AffineNode(
        None, isize=dim_q, osize=qk_width, no_drop=True,
        init_scale=gain_q * cfg.init_scale_hin,
    )
    one_head_k = BK.get_inita_xavier_uniform((d_qk, dim_k))
    all_heads_k = BK.get_inita_xavier_uniform((qk_width, dim_k))
    gain_k = one_head_k / all_heads_k
    self.affine_k = AffineNode(
        None, isize=dim_k, osize=qk_width, no_drop=True,
        init_scale=gain_k * cfg.init_scale_hin,
    )
    # The value projection uses the default init scale.
    self.affine_v = AffineNode(None, isize=dim_v, osize=v_width, no_drop=True)
    # Relative-distance keys: only built when enabled, otherwise None.
    if cfg.use_rposi:
        self.rposi = RelDistNode(cfg.rel, _dim=d_qk)
    else:
        self.rposi = None
    # Attention & output.
    if cfg.useaff_qk2v:
        # Affine mapping from the nh_qk score heads to the nh_v value heads.
        self.aff_qk2v = AffineNode(None, isize=nh_qk, osize=nh_v)
    elif nh_qk != nh_v:
        # Not a hard assert on nh_qk == nh_v: only warn about the mismatch.
        zwarn(f"Possible problems with AttNode since hin({nh_qk}) != hout({nh_v})")
    self.adrop = DropoutNode(None, drop_rate=cfg.att_drop, fix_drop=False)
    # todo(note): with drops(y) & act(?) & bias(y)?
    self.final_linear = AffineNode(None, isize=v_width, osize=dim_v, out_act=cfg.out_act)
def __init__(self, conf: PairwiseBlockConf, **kwargs):
    """Build the sub-nodes of the pairwise block.

    Args:
        conf: block configuration. Fields read here: ``ndim``, ``nlab``,
            ``nhead_in``, ``dim_qk``, ``nhead_out``, ``dim_v``, ``hin_init_scale``,
            ``cf_init_scale``, ``att_drop``, ``out_act``.
        **kwargs: passed through untouched to the parent constructor.
    """
    super().__init__(conf, **kwargs)
    # From here on, work with the conf object held on self.
    cfg: PairwiseBlockConf = self.conf
    ndim, nlab = cfg.ndim, cfg.nlab
    nhead_in, dim_qk = cfg.nhead_in, cfg.dim_qk
    nhead_out, dim_v = cfg.nhead_out, cfg.dim_v
    # Total output widths of the query/key and value projections (heads * per-head size).
    qk_width = nhead_in * dim_qk
    v_width = nhead_out * dim_v
    # --
    # Score scale: square root of the per-head query/key size.
    # NOTE(review): the earlier comment here read "no scale here since already
    # small", yet sqrt(dim_qk) is what gets stored -- confirm how it is used.
    self._att_scale = math.sqrt(dim_qk)
    # NOTE(review): sub-nodes are created in the same order as before; the order
    # presumably matters for parameter registration / initialisation -- confirm.
    # Pre-attention affines (no dropout on these).
    # Extra init gain = init value for a single-head shape divided by the one for
    # the all-heads shape; query and key share it since both read ndim inputs.
    one_head = BK.get_inita_xavier_uniform((dim_qk, ndim))
    all_heads = BK.get_inita_xavier_uniform((qk_width, ndim))
    qk_init_scale = (one_head / all_heads) * cfg.hin_init_scale
    self.affine_q = AffineNode(
        None, isize=ndim, osize=qk_width, no_drop=True, init_scale=qk_init_scale,
    )
    self.affine_k = AffineNode(
        None, isize=ndim, osize=qk_width, no_drop=True, init_scale=qk_init_scale,
    )
    # The value projection uses the default init scale.
    self.affine_v = AffineNode(None, isize=ndim, osize=v_width, no_drop=True)
    # Prediction: map the nhead_in inputs to nlab outputs.
    self.pred_in = AffineNode(None, isize=nhead_in, osize=nlab, no_drop=True)
    # Single-output affine for the pairwise confidence score.
    self.aff_cf = AffineNode(
        None, isize=nhead_in, osize=1, no_drop=True, init_scale=cfg.cf_init_scale,
    )
    # Final layers.
    self.adrop = DropoutNode(None, drop_rate=cfg.att_drop, fix_drop=False)
    self.fl_score = AffineNode(None, isize=nlab, osize=nhead_out, no_drop=True)
    self.fl_expr = AffineNode(None, isize=v_width, osize=ndim, out_act=cfg.out_act)
    # Layer norm for the add&norm step.
    self.norm = LayerNormNode(None, osize=ndim)