def multi_head_w_global(  # Added and Modified by xxx xxx
        x, scope, n_state, n_head, train=None, scale=False,
        resid_dropout=0.9, attn_dropout=0.9,
        use_global=False, use_direction=False, b=None, global_afn='exp',
):
    assert n_state % n_head == 0
    with tf.variable_scope(scope):
        sl = shape_list(x)[-2]
        if not use_direction:
            b = tf.matrix_band_part(tf.ones([sl, sl]), -1, 0)  # lower triangular part
            b = tf.reshape(b, [1, 1, sl, sl])
        c = conv1d(x, 'c_attn_openai_trans', n_state * 3, 1, train=train)
        q, k, v = tf.split(c, 3, 2)
        q = split_heads(q, n_head)  # bs,hd,sl,d
        k = split_heads(k, n_head, k=True)  # bs,hd,d,sl
        v = split_heads(v, n_head)  # bs,hd,sl,d

        # 1. t2t: token2token (pairwise) scores
        w = tf.matmul(q, k)  # bs,hd,sl,sl
        if scale:
            n_state_hd = shape_list(v)[-1]
            w = w * tf.rsqrt(tf.cast(n_state_hd, tf.float32))

        if use_global:
            e_w = activation_name_to_func(global_afn)(w) * b
            # 2. s2t: source2token (feature-wise) scores
            w_g = split_heads(conv1d(x, "c_w_g", n_state, 1, train=train), n_head)  # bs,hd,sl,d
            e_w_g = tf.exp(w_g)  # bs,hd,sl,d
            # 3. mtsa: fuse the t2t and s2t scores
            accum_z_deno = tf.matmul(e_w, e_w_g)  # bs,hd,sl,d (softmax normalization term)
            accum_z_deno = tf.where(  # in case of NaN and Inf
                tf.greater(accum_z_deno, tf.zeros_like(accum_z_deno)),
                accum_z_deno,
                tf.ones_like(accum_z_deno)
            )
            e_w = dropout(e_w, math.sqrt(attn_dropout), train)
            e_w_g = dropout(e_w_g, math.sqrt(attn_dropout), train)
            rep_mul_score = v * e_w_g
            accum_rep_mul_score = tf.matmul(e_w, rep_mul_score)
            a = accum_rep_mul_score / accum_z_deno
        else:
            w = w * b + -1e9 * (1 - b)
            w = tf.nn.softmax(w)
            w = w * b  # fixed the bug
            w = dropout(w, attn_dropout, train)  # attention dropout
            a = tf.matmul(w, v)

        a = merge_heads(a)
        a = conv1d(a, 'c_proj_openai_trans', n_state, 1, train=train)
        a = dropout(a, resid_dropout, train)
        return a
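# A minimal NumPy sketch of the fusion performed in the `use_global` branch of
# `multi_head_w_global` above, kept for reference only. The helper name
# `_mtsa_fusion_reference` and the single-head [sl, *] shapes are assumptions
# made for illustration, not part of the model code.
def _mtsa_fusion_reference(e_w, e_w_g, v):
    """e_w: [sl, sl] masked exp'ed token2token scores; e_w_g: [sl, d] exp'ed
    source2token scores; v: [sl, d] values. Returns the fused output [sl, d]."""
    import numpy as np
    deno = e_w @ e_w_g                    # normalization term, [sl, d]
    deno = np.where(deno > 0., deno, 1.)  # guard against division by zero
    return (e_w @ (v * e_w_g)) / deno     # weighted values / normalizer, [sl, d]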
def mlp(x, scope, n_state, train=None, afn='gelu', resid_dropout=0.9):
    # position-wise feed-forward network: two conv1d layers (fc -> proj)
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        act = act_name2fn(afn)
        h = act(conv1d(x, 'c_fc_openai_trans', n_state, 1, train=train))
        h2 = conv1d(h, 'c_proj_openai_trans', nx, 1, train=train)
        h2 = dropout(h2, resid_dropout, train)
        return h2
def qqp_logits_sentence_encoding(s1_rep, s2_rep, afn, n_state, is_train, clf_dropout, highway=False):
    # TODO: change this to my style (bn_dense_layer)
    out_rep = tf.concat([tf.abs(s1_rep - s2_rep), s1_rep * s2_rep], -1)
    act = act_name2fn(afn)
    h = act(conv1d(out_rep, 'c_fc', n_state, 1, train=is_train))
    if highway:
        trans = conv1d(h, 'c_trans', n_state, 1, train=is_train)
        gate = tf.nn.sigmoid(conv1d(h, 'c_gate', n_state, 1, train=is_train))
        h = gate * trans + (1 - gate) * h
    h_dp = dropout(h, clf_dropout, is_train)
    return conv1d(h_dp, 'c_logits', 2, 1, train=is_train)
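# Hedged usage sketch for `qqp_logits_sentence_encoding` above: `s1_rep` and
# `s2_rep` are pooled sentence vectors (e.g. from `s2t_self_attn` below); the
# shapes and placeholder names are assumptions for illustration only.
#     s1_rep, s2_rep = ...  # each [bs, hn]
#     logits = qqp_logits_sentence_encoding(
#         s1_rep, s2_rep, afn='gelu', n_state=768,
#         is_train=is_train_ph, clf_dropout=0.9, highway=True)  # [bs, 2]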
def multi_head(x, scope, n_state, n_head, train=None, scale=False, resid_dropout=0.9, attn_dropout=0.9):
    assert n_state % n_head == 0
    with tf.variable_scope(scope):
        c = conv1d(x, 'c_attn_openai_trans', n_state * 3, 1, train=train)  # position-wise fully-connected layer
        q, k, v = tf.split(c, 3, 2)
        q = split_heads(q, n_head)
        k = split_heads(k, n_head, k=True)
        v = split_heads(v, n_head)
        a = _attn(q, k, v, train=train, scale=scale, attn_dropout=attn_dropout)
        a = merge_heads(a)
        a = conv1d(a, 'c_proj_openai_trans', n_state, 1, train=train)
        a = dropout(a, resid_dropout, train)
        return a
def _attn(q, k, v, train=None, scale=False, attn_dropout=0.9):
    w = tf.matmul(q, k)
    if scale:
        n_state = shape_list(v)[-1]
        w = w * tf.rsqrt(tf.cast(n_state, tf.float32))
    w = mask_attn_weights(w)  # highlight: this makes the self-attention uni-directional
    w = tf.nn.softmax(w)
    # w = tf.Print(w, [tf.shape(w)])
    w = dropout(w, attn_dropout, train)  # attention dropout
    a = tf.matmul(w, v)
    return a
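# `mask_attn_weights` is defined elsewhere in this repo. A minimal sketch of
# the usual lower-triangular masking, mirroring the in-line variant used in
# `multi_head_w_global` above and assuming w is [bs, hd, sl, sl]:
#     sl = shape_list(w)[-1]
#     b = tf.matrix_band_part(tf.ones([sl, sl]), -1, 0)  # lower triangular part
#     b = tf.reshape(b, [1, 1, sl, sl])
#     w = w * b + -1e9 * (1 - b)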
def s2t_self_attn(
        tensor_input, tensor_mask, deep_act=None, method='multi_dim',
        wd=0., keep_prob=1., is_training=None, scope=None, **kwargs
):
    use_deep = isinstance(deep_act, str)  # use two layers or a single layer for the alignment score
    with tf.variable_scope(scope or 's2t_self_attn_{}'.format(method)):
        tensor_shape = get_shape_list(tensor_input)
        hn = tensor_shape[-1]  # hidden state number

        if method == 'additive':
            align_scores = bn_dense_layer_v2(  # bs,sl,hn/1
                tensor_input, hn if use_deep else 1, True, 0., 'align_score_1', 'linear', False,
                wd, keep_prob, is_training
            )
            if use_deep:
                align_scores = bn_dense_layer_v2(  # bs,sl,1
                    act_name2fn(deep_act)(align_scores), 1, True, 0., 'align_score_2', 'linear', False,
                    wd, keep_prob, is_training
                )
        elif method == 'multi_dim':
            align_scores = bn_dense_layer_v2(  # bs,sl,hn
                tensor_input, hn, False, 0., 'align_score_1', 'linear', False,
                wd, keep_prob, is_training
            )
            if use_deep:
                align_scores = bn_dense_layer_v2(  # bs,sl,hn
                    act_name2fn(deep_act)(align_scores), hn, True, 0., 'align_score_2', 'linear', False,
                    wd, keep_prob, is_training
                )
        elif method == 'multi_dim_head':
            get_shape_list(tensor_input, expected_rank=3)  # the input should be rank-3
            assert 'head_num' in kwargs and isinstance(kwargs['head_num'], int)
            head_num = kwargs['head_num']
            assert hn % head_num == 0
            head_dim = hn // head_num

            tensor_input_heads = split_head(tensor_input, head_num)  # [bs,hd,sl,hd_dim]

            align_scores_heads = bn_dense_layer_multi_head(  # [bs,hd,sl,hd_dim]
                tensor_input_heads, head_dim, True, 0., 'align_scores_heads_1', 'linear', False,
                wd, keep_prob, is_training
            )
            if use_deep:
                align_scores_heads = bn_dense_layer_multi_head(  # [bs,hd,sl,hd_dim]
                    act_name2fn(deep_act)(align_scores_heads), head_dim, True, 0.,
                    'align_scores_heads_2', 'linear', False,
                    wd, keep_prob, is_training
                )
            align_scores = combine_head(align_scores_heads)  # [bs,sl,dim]
        else:
            raise AttributeError

        # attention procedure: align_scores is [bs,sl,1/dim]
        align_scores_masked = exp_mask_v3(align_scores, tensor_mask, multi_head=False, high_dim=True)  # bs,sl,hn
        attn_prob = tf.nn.softmax(align_scores_masked, axis=-2)  # bs,sl,hn

        if 'attn_keep_prob' in kwargs and isinstance(kwargs['attn_keep_prob'], float):
            attn_prob = dropout(attn_prob, kwargs['attn_keep_prob'], is_training)  # bs,sl,hn

        attn_res = tf.reduce_sum(  # [bs,sl,hn] -> [bs,dim]
            mask_v3(attn_prob * tensor_input, tensor_mask, high_dim=True), axis=-2
        )
        return attn_res  # [bs,hn]
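# Hedged usage sketch for `s2t_self_attn` above (the tensor names and the
# 768/12 sizes are assumptions for illustration only):
#     seq_rep = ...   # [bs, sl, 768] token representations
#     seq_mask = ...  # [bs, sl] boolean mask
#     pooled = s2t_self_attn(
#         seq_rep, seq_mask, deep_act='relu', method='multi_dim_head',
#         is_training=is_train_ph, head_num=12, attn_keep_prob=0.9)  # [bs, 768]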
def cond_attn(
        pairwise_scores, featurewise_scores, value_features, from_mask, to_mask,
        attn_keep_prob=1., is_training=None, extra_pairwise_mask=None, name=None
):
    """
    :param pairwise_scores: [bs,[head],slf,slt]
    :param featurewise_scores: [bs,[head],slt,hn]
    :param value_features: [bs,[head],slt,hn]
    :param from_mask:
    :param to_mask:
    :param extra_pairwise_mask:
    :return:
    """
    with tf.name_scope(name or 'cond_attn'):
        # sanity check
        pairwise_shape = get_shape_list(pairwise_scores)
        featurewise_shape = get_shape_list(featurewise_scores)
        value_shape = get_shape_list(value_features)

        pairwise_ndim = len(pairwise_shape)
        featurewise_ndim = len(featurewise_shape)
        value_ndim = len(value_shape)

        assert featurewise_shape[-1] == value_shape[-1]
        assert pairwise_ndim in [3, 4] and pairwise_ndim == featurewise_ndim and featurewise_ndim == value_ndim
        multi_head = (pairwise_ndim == 4)  # whether the multi-head dim is included

        cross_attn_mask = cross_attn_mask_generation(  # [bs,slf,slt]
            from_mask, to_mask, mutual=True
        )
        if multi_head:  # add the multi-head dim
            cross_attn_mask = tf.expand_dims(cross_attn_mask, 1)  # [bs,[1],slf,slt]

        if not isinstance(extra_pairwise_mask, type(None)):
            # the extra_pairwise_mask may already include the multi-head dim
            extra_pairwise_mask_shape = get_shape_list(extra_pairwise_mask)
            assert len(extra_pairwise_mask_shape) in [3, 4]
            assert multi_head or len(extra_pairwise_mask_shape) == 3  # if multi_head=False, shape must be 3-D

            if multi_head and len(extra_pairwise_mask_shape) == 3:
                extra_pairwise_mask = tf.expand_dims(extra_pairwise_mask, 1)  # [bs,[1],slf,slt]
            cross_attn_mask = tf.logical_and(cross_attn_mask, extra_pairwise_mask)  # [bs,[1],slf,slt]

        e_dot_logits = mask_v3(  # bs,[head],slf,slt
            tf.exp(pairwise_scores), cross_attn_mask, multi_head=False, high_dim=False
        )  # the multi-head dim has already been added to the mask
        e_multi_logits = mask_v3(  # bs,[head],slt,hn
            tf.exp(featurewise_scores), to_mask, multi_head=multi_head, high_dim=True
        )

        with tf.name_scope("hybrid_attn"):
            # Z: softmax normalization term in the attention probability calculation
            accum_z_deno = tf.matmul(e_dot_logits, e_multi_logits)  # bs,[head],slf,hn
            accum_z_deno = tf.where(  # in case of NaN and Inf
                tf.greater(accum_z_deno, tf.zeros_like(accum_z_deno)),
                accum_z_deno,
                tf.ones_like(accum_z_deno)
            )
            # attention dropout
            e_dot_logits = dropout(e_dot_logits, math.sqrt(attn_keep_prob), is_training)
            e_multi_logits = dropout(e_multi_logits, math.sqrt(attn_keep_prob), is_training)
            # sum of exp(logits) multiplied by the attention target sequence rep
            rep_mul_score = value_features * e_multi_logits
            accum_rep_mul_score = tf.matmul(e_dot_logits, rep_mul_score)
            # calculate the final attention results
            attn_res = accum_rep_mul_score / accum_z_deno
        if multi_head:
            attn_res = combine_head(attn_res)  # [bs,slf,hd_num*hd_dim]
        return attn_res  # [bs,slf,hn/hd_num*hd_dim]
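# Hedged usage sketch for `cond_attn` above: the pairwise and feature-wise
# scores would typically come from the compatibility / per-head dense helpers
# used elsewhere in this file; the exact wiring below is an assumption for
# illustration only.
#     pair_scores = compatibility_fn(           # [bs, hd, slf, slt]
#         rep_from, rep_to, method="multi_head", head_num=head_num, hn=hn)
#     feat_scores = bn_dense_layer_multi_head(  # [bs, hd, slt, hd_dim]
#         split_head(rep_to, head_num), hn // head_num, True, 0., 'feat_scores',
#         'linear', False, wd, keep_prob, is_training)
#     values = bn_dense_layer_multi_head(       # [bs, hd, slt, hd_dim]
#         split_head(rep_to, head_num), hn // head_num, True, 0., 'values',
#         'linear', False, wd, keep_prob, is_training)
#     ctx = cond_attn(pair_scores, feat_scores, values, mask_from, mask_to,
#                     attn_keep_prob=0.9, is_training=is_train_ph)  # [bs, slf, hn]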
def multihead_attention_decoder(
        tensor_from, tensor_to, mask_to, mask_direction=None,  # [bs,slf,slt]
        act_name="relu", hn=768, head_num=12,
        wd=0., is_training=None, keep_prob_dense=1., keep_prob_attn=1.,
        tensor_to_prev=None, mask_prev_to=None, scope=None,
):
    head_dim = hn // head_num
    with tf.variable_scope(scope or "multihead_attention_decoder"):
        # if not isinstance(tensor_to_prev, type(None)):  # to print the shape
        #     tensor_from = tf.Print(tensor_from, [
        #         tf.shape(tensor_from), tf.shape(tensor_to), tf.shape(mask_to), tf.shape(tensor_to_prev)])
        if isinstance(tensor_to_prev, type(None)):
            tensor_to_all = tensor_to  # bs,sl,hn
            mask_to_all = mask_to  # bs,sl
        else:
            tensor_to_all = tf.concat([tensor_to_prev, tensor_to], -2)  # bs,psl+1,hn
            if mask_prev_to is None:
                mask_prev_to = tf.cast(
                    tf.ones(get_shape_list(tensor_to_prev, 3)[:2], tf.int32), tf.bool)  # bs,psl
            mask_to_all = tf.concat([mask_prev_to, mask_to], -1)  # bs,psl+1

        attn_scores = compatibility_fn(
            tensor_from, tensor_to_all, method="multi_head", head_num=head_num,
            hn=hn, wd=wd, is_training=is_training, keep_prob=keep_prob_dense,
        )  # [bs,hd_num,slf,slt]
        v_heads = bn_dense_layer_v2(  # bs,slt,hd_dim*hd_num
            tensor_to_all, head_dim, True, 0., 'v_heads', 'linear',
            False, wd, keep_prob_dense, is_training, dup_num=head_num)
        v_heads = split_head(v_heads, head_num)  # bs,hd_num,slt,hd_dim

        # mask the self-attention scores
        attn_scores_mask = tf.expand_dims(mask_to_all, 1)  # bs,1,tsl
        if (not isinstance(mask_direction, type(None))) and isinstance(tensor_to_prev, type(None)):
            attn_scores_mask = tf.logical_and(attn_scores_mask, mask_direction)  # bs,tsl,tsl
        attn_scores_masked = exp_mask_v3(attn_scores, attn_scores_mask, multi_head=True)  # [bs,hd_num,slf,slt]
        attn_prob = tf.nn.softmax(attn_scores_masked)
        attn_prob = dropout(attn_prob, keep_prob_attn, is_training)  # [bs,hd_num,slf,slt]

        v_heads_etd = tf.expand_dims(v_heads, 2)  # bs,hd_num,1,slt,hd_dim
        attn_prob_etd = tf.expand_dims(attn_prob, -1)  # bs,hd_num,slf,slt,1
        attn_res = tf.reduce_sum(v_heads_etd * attn_prob_etd, 3)  # bs,hd_num,slf,hd_dim
        out_prev = combine_head(attn_res)  # bs,fsl,hn

        # if mask_direction is not None and tensor_to_prev is None:
        #     attn_scores = exp_mask_v3(attn_scores, mask_direction, multi_head=True)  # [bs,hd_num,slf,slt]
        # attn_scores = dropout(attn_scores, keep_prob_attn, is_training)
        #
        # attn_res = softsel(  # [bs,hd_num,slf,dhn]
        #     v_heads, attn_scores, mask_to_all,
        #     mask_add_head_dim_for_scores=True,
        #     input_add_multi_head_dim=False,
        #     score_add_hn_dim=True,
        #     axis=3)
        # out_prev = combine_head(attn_res)

        # dense output layer
        out = bn_dense_layer_v2(
            out_prev, hn, True, 0., "output_transformer", act_name, False,
            wd, keep_prob_dense, is_training)
        return out
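# Note on `tensor_to_prev` / `mask_prev_to` above: when decoding step by step,
# previously generated target states can be passed in via `tensor_to_prev` so
# that the current step attends over the concatenation
# [tensor_to_prev; tensor_to]; in that case `mask_direction` is not applied
# (see the condition on `tensor_to_prev` above). A hedged single-step sketch,
# with illustrative placeholder names:
#     step_out = multihead_attention_decoder(
#         step_state, step_state, step_mask, mask_direction=None,
#         hn=768, head_num=12, is_training=False,
#         tensor_to_prev=cached_states, mask_prev_to=cached_mask)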