def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs = self.bs

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
            scope='gene_token_emb_mat')
        emb = tf.nn.embedding_lookup(token_emb_mat, self.token_seq)  # bs,sl,tel

    with tf.variable_scope('sent_encoding'):
        rep = sentence_encoding_models(
            emb, self.token_mask, cfg.context_fusion_method, 'relu',
            'ct_based_sent2vec', cfg.wd, self.is_train, cfg.dropout,
            block_len=cfg.block_len)

    with tf.variable_scope('output'):
        pre_logits = tf.nn.relu(
            linear([rep], hn, True, scope='pre_logits_linear',
                   wd=cfg.wd, input_keep_prob=cfg.dropout,
                   is_train=self.is_train))  # bs, hn
        logits = linear(
            [pre_logits], self.output_class, False, scope='get_output',
            wd=cfg.wd, input_keep_prob=cfg.dropout,
            is_train=self.is_train)  # bs, 5

    _logger.done()
    return logits
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs, sl, ol, mc = self.bs, self.sl, self.ol, self.mc

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
            scope='gene_token_emb_mat')
        emb = tf.nn.embedding_lookup(token_emb_mat, self.token_seq)  # bs,sl,tel
        self.tensor_dict['emb'] = emb

    rep = disan(
        emb, self.token_mask, 'DiSAN', cfg.dropout, self.is_train, cfg.wd,
        'relu', tensor_dict=self.tensor_dict, name='')

    with tf.variable_scope('output'):
        pre_logits = tf.nn.relu(
            linear([rep], hn, True, scope='pre_logits_linear',
                   wd=cfg.wd, input_keep_prob=cfg.dropout,
                   is_train=self.is_train))  # bs, hn
        logits = linear(
            [pre_logits], self.output_class, False, scope='get_output',
            wd=cfg.wd, input_keep_prob=cfg.dropout,
            is_train=self.is_train)  # bs, 5

    _logger.done()
    return logits
def fusion_gate(rep1, rep2, wd, keep_prob, is_train):
    ivec = rep1.get_shape().as_list()[1]
    with tf.variable_scope('output'):
        o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                 tf.constant_initializer(0.))
        # input gate
        fusion_g = tf.nn.sigmoid(
            linear(rep1, ivec, True, 0., 'linear_fusion_i',
                   False, wd, keep_prob, is_train) +
            linear(rep2, ivec, True, 0., 'linear_fusion_a',
                   False, wd, keep_prob, is_train) + o_bias)
        output = fusion_g * rep1 + (1 - fusion_g) * rep2
    return output
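# Hedged usage sketch (not from the original repo): gating two fixed-size sentence
# representations with `fusion_gate` defined above. The 300-d placeholder shapes
# and the demo function name are assumptions for illustration only.
def _fusion_gate_demo(is_train):
    rep_a = tf.placeholder(tf.float32, [None, 300], name='rep_a')
    rep_b = tf.placeholder(tf.float32, [None, 300], name='rep_b')
    # returns a [bs, 300] sigmoid-gated mix: g * rep_a + (1 - g) * rep_b
    return fusion_gate(rep_a, rep_b, wd=0., keep_prob=1., is_train=is_train)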
def do_shift(self, data_for_shift):
    hn, dropout, is_train, wd = self.hn, self.dropout, self.is_train, self.wd
    with tf.variable_scope('sr_%s' % self.method_type):
        I = tf.nn.sigmoid(
            linear([data_for_shift], hn, True, 0., 'W_i_0', False, 0.,
                   dropout, is_train))
        O = tf.nn.sigmoid(
            linear([data_for_shift], hn, True, 0., 'W_o_0', False, 0.,
                   dropout, is_train))
        U = tf.nn.tanh(
            linear([data_for_shift], hn, True, 0., 'W_u_0', False, 0.,
                   dropout, is_train))
        C = I * U  # bs, hn
        H = O * tf.nn.tanh(C)  # bs, hn
        return tf.concat([H, C], -1)  # bs, hn*2
def normal_attention(rep_tensor, rep_mask, scope=None, keep_prob=1., is_train=None,
                     wd=0., activation='elu', tensor_dict=None, name=None):
    batch_size, code_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'normal_attention'):
        rep_tensor_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                        activation, False, wd, keep_prob, is_train)
        rep_tensor_logits = get_logits(
            [rep_tensor_map], None, False, scope='self_attn_logits',
            mask=rep_mask, input_keep_prob=keep_prob, is_train=is_train)  # bs,sl
        attn_result = softsel(rep_tensor, rep_tensor_logits, rep_mask)  # bs,vec

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name] = tf.nn.softmax(rep_tensor_logits)

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_tensor_map, ivec, True, 0., 'linear_fusion_i',
                       False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a',
                       False, wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_tensor_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)  # bs,sl,vec
        return output
def do_reduce(self, data_for_reduce, mask_for_reduce):
    hn, dropout, is_train, wd = self.hn, self.dropout, self.is_train, self.wd
    with tf.variable_scope('sr_%s' % self.method_type):
        left_child_hid = data_for_reduce[:, 0, :hn]
        left_child_cell = data_for_reduce[:, 0, hn:]
        right_child_hid = data_for_reduce[:, 1, :hn]
        right_child_cell = data_for_reduce[:, 1, hn:]
        # LSTM update
        I = tf.nn.sigmoid(
            linear([left_child_hid], hn, False, 0., 'W_i_l', False, 0.,
                   dropout, is_train) +
            linear([right_child_hid], hn, True, 0., 'W_i_r', False, 0.,
                   dropout, is_train))
        F_l = tf.nn.sigmoid(
            linear([left_child_hid], hn, False, 0., 'W_f_l_l', False, 0.,
                   dropout, is_train) +
            linear([right_child_hid], hn, True, 0., 'W_f_l_r', False, 0.,
                   dropout, is_train))
        F_r = tf.nn.sigmoid(
            linear([left_child_hid], hn, False, 0., 'W_f_r_l', False, 0.,
                   dropout, is_train) +
            linear([right_child_hid], hn, True, 0., 'W_f_r_r', False, 0.,
                   dropout, is_train))
        O = tf.nn.sigmoid(
            linear([left_child_hid], hn, False, 0., 'W_o_l', False, 0.,
                   dropout, is_train) +
            linear([right_child_hid], hn, True, 0., 'W_o_r', False, 0.,
                   dropout, is_train))
        U = tf.nn.tanh(
            linear([left_child_hid], hn, False, 0., 'W_u_l', False, 0.,
                   dropout, is_train) +
            linear([right_child_hid], hn, True, 0., 'W_u_r', False, 0.,
                   dropout, is_train))
        C = I * U + F_l * left_child_cell + F_r * right_child_cell
        H = O * tf.nn.tanh(C)
        return tf.concat([H, C], -1)
def do_shift(self, data_for_shift):
    hn, dropout, is_train, wd = self.hn, self.dropout, self.is_train, self.wd
    with tf.variable_scope('sr_%s' % self.method_type):
        print('var num in (2.1) :',
              len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))
        I = tf.nn.sigmoid(
            linear([data_for_shift], hn, False, 0., 'W_i_0', False, 0.,
                   dropout, is_train) + self.bias_I)
        O = tf.nn.sigmoid(
            linear([data_for_shift], hn, False, 0., 'W_o_0', False, 0.,
                   dropout, is_train) + self.bias_O)
        U = tf.nn.tanh(
            linear([data_for_shift], hn, False, 0., 'W_u_0', False, 0.,
                   dropout, is_train) + self.bias_U)
        print('var num in (2.2) :',
              len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))
        C = I * U  # bs, hn
        H = O * tf.nn.tanh(C)  # bs, hn
        return tf.concat([H, C], -1)  # bs, 2*hn
def do_shift(self, data_for_shift):
    with tf.variable_scope('sr_%s' % self.method_type):
        shifted_value = tf.nn.relu(
            linear([data_for_shift], self.hn, True, 0., 'shift_linear', False,
                   input_keep_prob=self.dropout, is_train=self.is_train))
        return shifted_value
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs, sl1, sl2 = self.bs, self.sl1, self.sl2

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
            scope='gene_token_emb_mat')
        s1_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent1_token)  # bs,sl1,tel
        s2_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent2_token)  # bs,sl2,tel
        self.tensor_dict['s1_emb'] = s1_emb
        self.tensor_dict['s2_emb'] = s2_emb

    with tf.variable_scope('sent_enc_attn'):
        s1_rep = traditional_attention(
            s1_emb, self.sent1_token_mask, 'traditional_attention',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s1_attn')
        tf.get_variable_scope().reuse_variables()
        s2_rep = traditional_attention(
            s2_emb, self.sent2_token_mask, 'traditional_attention',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s2_attn')
        self.tensor_dict['s1_rep'] = s1_rep
        self.tensor_dict['s2_rep'] = s2_rep

    with tf.variable_scope('output'):
        out_rep = tf.concat([s1_rep, s2_rep, s1_rep - s2_rep, s1_rep * s2_rep], -1)
        pre_output = tf.nn.elu(
            linear([out_rep], hn, True, 0., scope='pre_output', squeeze=False,
                   wd=cfg.wd, input_keep_prob=cfg.dropout, is_train=self.is_train))
        logits = linear(
            [pre_output], self.output_class, True, 0., scope='logits',
            squeeze=False, wd=cfg.wd, input_keep_prob=cfg.dropout,
            is_train=self.is_train)
        self.tensor_dict['logits'] = logits
    return logits  # logits
def self_attention_for_selected_head(
        head_selection, head_org_idx, sl_head, rep_head_mask,
        dep_selection, dep_org_idx, sl_dep, rep_dep_mask,
        rep_map, rep_dep_tensor, keep_prob, is_train, direction, ivec):
    # data for self-attention
    rep_map_dp = dropout(rep_map, keep_prob, is_train)
    rep_dep_tensor_dp, _, _ = reduce_data_rep_max_len(rep_map_dp, dep_selection)
    rep_head_tensor_dp, _, _ = reduce_data_rep_max_len(rep_map_dp, head_selection)

    # mask generation
    dep_idxs = tf.tile(tf.expand_dims(dep_org_idx, 1), [1, sl_head, 1])
    head_idxs = tf.tile(tf.expand_dims(head_org_idx, 2), [1, 1, sl_dep])

    if direction is None:
        direct_mask = tf.not_equal(head_idxs, dep_idxs)  # [bs, slh, sld]
    else:
        if direction == 'forward':
            direct_mask = tf.greater(head_idxs, dep_idxs)  # [bs, slh, sld]
        else:
            direct_mask = tf.less(head_idxs, dep_idxs)  # [bs, slh, sld]
    # [bs, slh, sld]
    rep_mask_tile = tf.logical_and(tf.expand_dims(rep_dep_mask, 1),
                                   tf.expand_dims(rep_head_mask, 2))
    attn_mask = tf.logical_and(direct_mask, rep_mask_tile)  # [bs, slh, sld]

    # tensor tile
    rep_map_tile = tf.tile(tf.expand_dims(rep_dep_tensor, 1),
                           [1, sl_head, 1, 1])  # bs,slh,sld,vec
    with tf.variable_scope('attention'):  # bs,sl,sl,vec
        f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                 tf.constant_initializer(0.))
        dependent = linear(rep_dep_tensor_dp, ivec, False,
                           scope='linear_dependent')  # bs,sld,vec
        dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sld,vec
        head = linear(rep_head_tensor_dp, ivec, False,
                      scope='linear_head')  # bs,slh,vec
        head_etd = tf.expand_dims(head, 2)  # bs,slh,1,vec

        logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,slh,sld,vec

        logits_masked = exp_mask_for_high_rank(logits, attn_mask)  # bs,slh,sld,vec
        attn_score = tf.nn.softmax(logits_masked, 2)  # bs,slh,sld,vec
        attn_score = mask_for_high_rank(attn_score, attn_mask)
        attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,slh,vec -> head_org_idx
    return attn_result
def do_reduce(self, data_for_reduce, mask_for_reduce):
    with tf.variable_scope('sr_%s' % self.method_type):
        data_for_reduce_re = tf.reshape(data_for_reduce, [-1, 2 * self.hn])
        reduced_value = tf.nn.relu(
            linear([data_for_reduce_re], self.hn, True, 0., 'reduce_linear', False,
                   input_keep_prob=self.dropout, is_train=self.is_train))
        return reduced_value
def gene_similarity_mat_and_mask(tensor_row, tensor_col, mask_for_tensor_row,
                                 mask_for_tensor_col, similarity_method='inner',
                                 hn=100, scope=None):
    with tf.variable_scope(scope or 'gene_similarity_mat_and_mask'):
        # --------parameters--------
        t_main = tensor_row  # [bs,sl,vec]
        t_sec = tensor_col  # [bs,ql,vec]
        mask_main = mask_for_tensor_row  # [bs,sl]
        mask_sec = mask_for_tensor_col  # [bs,ql]

        bs, sl, vec = tf.shape(t_main)[0], tf.shape(t_main)[1], tf.shape(t_main)[2]
        ql = tf.shape(t_sec)[1]
        # -------------------------------

        # --------similarity_mat--------
        mask_main_etd = tf.expand_dims(mask_main, 2)  # bs,sl,1
        mask_sec_etd = tf.expand_dims(mask_sec, 1)  # bs,1,ql
        mask_similarity_mat = tf.logical_and(mask_main_etd, mask_sec_etd)  # bs,sl,ql
        if similarity_method == 'inner':
            t_main_etd = tf.expand_dims(t_main, 2)  # bs,sl,1,vec
            t_sec_etd = tf.expand_dims(t_sec, 1)  # bs,1,ql,vec
            similarity_mat = tf.reduce_sum(t_main_etd * t_sec_etd, -1)  # bs,sl,ql
        elif similarity_method == 'tri_linear':
            t_main_tiled = tf.tile(tf.expand_dims(t_main, 2), [1, 1, ql, 1])  # bs,sl,ql,vec
            t_sec_tiled = tf.tile(tf.expand_dims(t_sec, 1), [1, sl, 1, 1])  # bs,sl,ql,vec
            similarity_mat = get_logits([t_main_tiled, t_sec_tiled], None, False,
                                        scope='tri_linear_tri_linear',
                                        func='tri_linear')
        elif similarity_method == 'map_linear':
            t_main_map = tf.nn.relu(linear([t_main], hn, True, scope='linear_map_main'))
            t_sec_map = tf.nn.relu(linear([t_sec], hn, True, scope='linear_map_sec'))
            t_main_map_etd = tf.expand_dims(t_main_map, 2)  # bs,sl,1,hn
            t_sec_map_etd = tf.expand_dims(t_sec_map, 1)  # bs,1,ql,hn
            similarity_mat = tf.reduce_sum(t_main_map_etd * t_sec_map_etd, -1)  # bs,sl,ql
        else:
            raise AttributeError('No similarity matrix calculation method \'%s\'' %
                                 similarity_method)
        return similarity_mat, mask_similarity_mat
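# Hedged usage sketch (assumption, not from the original repo): an inner-product
# similarity matrix between a context and a query built with
# `gene_similarity_mat_and_mask`. Shapes and the demo name are placeholders.
def _similarity_mat_demo(context, query, context_mask, query_mask):
    # context: [bs, sl, d]; query: [bs, ql, d]; masks: bool [bs, sl] / [bs, ql]
    sim_mat, sim_mask = gene_similarity_mat_and_mask(
        context, query, context_mask, query_mask,
        similarity_method='inner', scope='sim_demo')
    return sim_mat, sim_mask  # [bs, sl, ql] scores and validity mask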
def self_align_attention(rep_tensor, mask, scope=None, simplify=True, hn=None):  # correct
    """
    attention strategy 4: self * self => attention self
    :param rep_tensor: rank is three [bs,sl,hn]
    :param mask: [bs,sl] tf.bool
    :param scope:
    :param simplify:
    :param hn:
    :return: attended tensor [bs,sl,hn]
    """
    with tf.name_scope(scope or 'self_attention'):
        bs = tf.shape(rep_tensor)[0]
        sl = tf.shape(rep_tensor)[1]
        # vec = tf.shape(rep_tensor)[2]
        ivec = rep_tensor.get_shape().as_list()[-1]
        to_be_attended = tf.tile(tf.expand_dims(rep_tensor, 1), [1, sl, 1, 1])

        if not simplify:
            assert hn is not None
            rep_tensor = tf.nn.relu(linear([rep_tensor], hn, True, 0., 'linear_transform'))

        # 1. self alignment
        mask_tiled_sec = tf.tile(tf.expand_dims(mask, 1), [1, sl, 1])  # bs,sl,sl
        mask_tiled_main = tf.tile(tf.expand_dims(mask, 2), [1, 1, sl])  # bs,sl,sl
        mask_tiled = tf.logical_and(mask_tiled_sec, mask_tiled_main)
        input_sec = tf.tile(tf.expand_dims(rep_tensor, 1), [1, sl, 1, 1])  # bs,1-sl,sl,hn
        input_main = tf.tile(tf.expand_dims(rep_tensor, 2), [1, 1, sl, 1])  # bs,sl,1-sl,hn
        # self_alignment = tf.reduce_sum(input_sec * input_main, -1)  # bs,sl,sl
        self_alignment = (1.0 / ivec) * tf.reduce_sum(input_sec * input_main, -1)  # bs,sl,sl

        # 2. generate diag~/ mat
        # diag = tf.expand_dims(
        #     tf.cast(tf.logical_not(
        #         tf.cast(
        #             tf.diag(
        #                 tf.ones([sl], tf.int32)), tf.bool)
        #     ), tf.float32), 0)  # 1,sl,sl
        diag = tf.expand_dims(tf.logical_not(
            tf.cast(tf.diag(tf.ones([sl], tf.int32)), tf.bool)), 0)  # 1,sl,sl
        diag = tf.tile(diag, [bs, 1, 1])  # bs, sl, sl
        # self_alignment = self_alignment * diag  # bs,sl,sl

        # 3. attend data
        context = softsel(to_be_attended, self_alignment,
                          tf.logical_and(mask_tiled, diag))  # [bs,sl,sl], bs,sl,hn
        return context
def self_choose_attention(rep_tensor, rep_mask, hn,  # correct
                          keep_prob=1., is_train=None, scope=None, simplify=False):
    """
    self soft choose attention
    :param rep_tensor: rank must be 3 [bs,sl,hn]
    :param rep_mask: [bs,sl]
    :param hn:
    :param keep_prob:
    :param is_train:
    :param scope:
    :param simplify:
    :return:
    """
    with tf.variable_scope(scope or 'self_choose_attention'):
        if not simplify:
            rep_tensor_map = tf.nn.relu(
                linear([rep_tensor], hn, True, scope='linear_map',
                       input_keep_prob=keep_prob, is_train=is_train))
        else:
            rep_tensor_map = tf.identity(rep_tensor)
        rep_tensor_logits = get_logits(
            [rep_tensor_map], None, False, scope='self_attn_logits',
            mask=rep_mask, input_keep_prob=keep_prob, is_train=is_train)  # bs,sl
        attn_res = softsel(rep_tensor, rep_tensor_logits, rep_mask)  # bs,vec
        return attn_res
def context_fusion_layers(rep_tensor, rep_mask, method, activation_function,
                          scope=None, wd=0., is_train=None, keep_prob=1., **kwargs):
    method_name_list = [
        'lstm', 'gru', 'sru', 'sru_normal',  # rnn
        'cnn',
        'multi_head', 'multi_head_git', 'disa',
        'mpsa',
        'block'
    ]
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]

    context_fusion_output = None
    with tf.variable_scope(scope or 'context_fusion_layers'):
        if method in ['lstm', 'gru', 'sru_normal']:
            context_fusion_output = contextual_bi_rnn(
                rep_tensor, rep_mask, ivec, method, False, wd, keep_prob,
                is_train, 'ct_bi_%s' % method)
        elif method == 'sru':
            context_fusion_output = bi_sru_recurrent_network(
                rep_tensor, rep_mask, is_train, keep_prob, wd, 'ct_bi_sru')
        elif method == 'cnn':
            context_fusion_output = cnn_for_context_fusion(
                rep_tensor, rep_mask, (3, 4, 5), 200, 'ct_cnn', is_train, keep_prob, wd)
        elif method == 'multi_head':
            context_fusion_output = multi_head_attention(
                rep_tensor, rep_mask, 8, 75, 'ct_multi_head', is_train, keep_prob, wd)
        elif method == 'multi_head_git':
            context_fusion_output = multi_head_attention_git(
                rep_tensor, rep_mask, 8, 600, 'ct_multi_head', is_train, keep_prob, wd)
        elif method == 'disa':
            with tf.variable_scope('ct_disa'):
                disa_fw = directional_attention_with_dense(
                    rep_tensor, rep_mask, 'forward', 'fw_disa',
                    keep_prob, is_train, wd, activation_function)
                disa_bw = directional_attention_with_dense(
                    rep_tensor, rep_mask, 'backward', 'bw_disa',
                    keep_prob, is_train, wd, activation_function)
                context_fusion_output = tf.concat([disa_fw, disa_bw], -1)
        elif method == 'block':
            if 'block_len' in kwargs.keys():
                block_len = kwargs['block_len']
            else:
                block_len = None
            if block_len is None:
                block_len = tf.cast(
                    tf.ceil(tf.pow(tf.cast(2 * sl, tf.float32), 1.0 / 3)), tf.int32)
            context_fusion_output = bi_directional_simple_block_attention(
                rep_tensor, rep_mask, block_len, 'ct_block_attn',
                keep_prob, is_train, wd, activation_function)
        elif method == 'mpsa':
            with tf.variable_scope('ct_mpsa'):
                mpsa_fw = masked_positional_self_attention(
                    0, rep_tensor, rep_mask, 'forward', 'fw_mpsa',
                    keep_prob, is_train, wd, activation_function)
                mpsa_bw = masked_positional_self_attention(
                    0, rep_tensor, rep_mask, 'backward', 'bw_mpsa',
                    keep_prob, is_train, wd, activation_function)
                mpsa_2g = masked_positional_self_attention(
                    2, rep_tensor, rep_mask, None, '2g_mpsa',
                    keep_prob, is_train, wd, activation_function)
                mpsa_3g = masked_positional_self_attention(
                    3, rep_tensor, rep_mask, None, '3g_mpsa',
                    keep_prob, is_train, wd, activation_function)
                sen_tensor = mask_for_high_rank(rep_tensor, rep_mask)
                sen_tensor_t = tf.expand_dims(sen_tensor, 2)
                fw_res = tf.expand_dims(mpsa_fw, 2)
                bw_res = tf.expand_dims(mpsa_bw, 2)
                g2_res = tf.expand_dims(mpsa_2g, 2)
                g3_res = tf.expand_dims(mpsa_3g, 2)
                tmp_res = tf.concat(
                    [sen_tensor_t, fw_res, bw_res, g2_res, g3_res], 2)  # bs,sl,5,ivec

                bs, sl = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1]
                ivec = rep_tensor.get_shape()[2]
                num = tmp_res.get_shape()[2]
                bias = tf.get_variable('bias', [num * ivec], tf.float32,
                                       tf.constant_initializer(0.))
                softmax_gate = linear(sen_tensor, num * ivec, True, 0., 'linear_softmax',
                                      False, wd, keep_prob, is_train) + bias  # bs,sl,5*ivec
                fusion_gate = tf.nn.softmax(
                    tf.reshape(softmax_gate, [bs, sl, num, ivec]), 2)
                context_fusion_output = tf.reduce_sum(fusion_gate * tmp_res, 2)  # bs,sl,ivec
        else:
            raise RuntimeError

    return context_fusion_output
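# Hedged usage sketch (assumption, not from the original repo): selecting a context
# fusion method by name for an embedded, masked token sequence. The 'disa' choice,
# the keep_prob value and the demo function name are illustrative only.
def _context_fusion_demo(token_emb, token_mask, is_train):
    # token_emb: [bs, sl, d] float32; token_mask: [bs, sl] bool
    return context_fusion_layers(
        token_emb, token_mask, 'disa', 'elu',
        scope='ct_fusion_demo', wd=0., is_train=is_train,
        keep_prob=0.9)  # 'disa' concatenates fw/bw attention -> [bs, sl, 2*d]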
def simple_block_attention(rep_tensor, rep_mask, block_len=5, scope=None,
                           direction=None, keep_prob=1., is_train=None,
                           wd=0., activation='elu', hn=None):
    assert direction is not None

    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec
    with tf.variable_scope(scope or 'block_simple'):
        # @1. split sequence
        with tf.variable_scope('split_seq'):
            block_num = tf.cast(
                tf.ceil(tf.divide(tf.cast(sl, tf.float32),
                                  tf.cast(block_len, tf.float32))), tf.int32)
            comp_len = block_num * block_len - sl

            rep_tensor_comp = tf.concat(
                [rep_tensor, tf.zeros([bs, comp_len, org_ivec], tf.float32)], 1)
            rep_mask_comp = tf.concat(
                [rep_mask, tf.cast(tf.zeros([bs, comp_len], tf.int32), tf.bool)], 1)

            rep_tensor_split = tf.reshape(
                rep_tensor_comp, [bs, block_num, block_len, org_ivec])  # bs,bn,bl,d
            rep_mask_split = tf.reshape(
                rep_mask_comp, [bs, block_num, block_len])  # bs,bn,bl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor_split, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)  # bs,bn,bl,vec
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 2),
                               [1, 1, block_len, 1, 1])  # bs,bn,bl,bl,vec
        # rep_map_dp = dropout(rep_map, keep_prob, is_train)
        bn = block_num
        bl = block_len

        with tf.variable_scope('self_attention'):
            # @2. self-attention in block
            # mask generation
            sl_indices = tf.range(block_len, dtype=tf.int32)
            sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)  # bl,bl
            else:
                direct_mask = tf.greater(sl_col, sl_row)  # bl,bl
            direct_mask_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(direct_mask, 0), 0),
                [bs, bn, 1, 1])  # bs,bn,bl,bl
            rep_mask_tile_1 = tf.tile(tf.expand_dims(rep_mask_split, 2),
                                      [1, 1, bl, 1])  # bs,bn,bl,bl
            rep_mask_tile_2 = tf.tile(tf.expand_dims(rep_mask_split, 3),
                                      [1, 1, 1, bl])  # bs,bn,bl,bl
            rep_mask_tile = tf.logical_and(rep_mask_tile_1, rep_mask_tile_2)
            attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile,
                                       name='attn_mask')  # bs,bn,bl,bl

            # attention
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent_head = linear(
                rep_map, 2 * ivec, False, 0., 'linear_dependent_head',
                False, wd, keep_prob, is_train)  # bs,bn,bl,2vec
            dependent, head = tf.split(dependent_head, 2, 3)
            dependent_etd = tf.expand_dims(dependent, 2)  # bs,bn,1,bl,vec
            head_etd = tf.expand_dims(head, 3)  # bs,bn,bl,1,vec
            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,bn,bl,bl,vec
            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 3)  # bs,bn,bl,bl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)  # bs,bn,bl,bl,vec
            self_attn_result = tf.reduce_sum(attn_score * rep_map_tile, 3)  # bs,bn,bl,vec

        with tf.variable_scope('source2token_self_attn'):
            inter_block_logits = bn_dense_layer(
                self_attn_result, ivec, True, 0., 'bn_dense_map', 'linear',
                False, wd, keep_prob, is_train)  # bs,bn,bl,vec
            inter_block_logits_masked = exp_mask_for_high_rank(
                inter_block_logits, rep_mask_split)  # bs,bn,bl,vec
            inter_block_soft = tf.nn.softmax(inter_block_logits_masked, 2)  # bs,bn,bl,vec
            inter_block_attn_output = tf.reduce_sum(
                self_attn_result * inter_block_soft, 2)  # bs,bn,vec

        with tf.variable_scope('self_attn_inter_block'):
            inter_block_attn_output_mask = tf.cast(tf.ones([bs, bn], tf.int32), tf.bool)
            block_ct_res = directional_attention_with_dense(
                inter_block_attn_output, inter_block_attn_output_mask, direction,
                'disa', keep_prob, is_train, wd, activation)  # [bs,bn,vec]
            block_ct_res_tile = tf.tile(tf.expand_dims(block_ct_res, 2),
                                        [1, 1, bl, 1])  # [bs,bn,vec] -> [bs,bn,bl,vec]

        with tf.variable_scope('combination'):
            # input: 1. rep_map [bs,bn,bl,vec]; 2. self_attn_result [bs,bn,bl,vec];
            #        3. rnn_res_tile [bs,bn,bl,vec]
            rep_tensor_with_ct = tf.concat(
                [rep_map, self_attn_result, block_ct_res_tile], -1)  # [bs,bn,bl,3vec]
            new_context_and_gate = linear(
                rep_tensor_with_ct, 2 * ivec, True, 0., 'linear_new_context_and_gate',
                False, wd, keep_prob, is_train)  # [bs,bn,bl,2vec]
            new_context, gate = tf.split(new_context_and_gate, 2, 3)  # bs,bn,bl,vec
            if activation == "relu":
                new_context_act = tf.nn.relu(new_context)
            elif activation == "elu":
                new_context_act = tf.nn.elu(new_context)
            elif activation == "linear":
                new_context_act = tf.identity(new_context)
            else:
                raise RuntimeError
            gate_sig = tf.nn.sigmoid(gate)
            combination_res = gate_sig * new_context_act + (1 - gate_sig) * rep_map  # bs,bn,bl,vec

        with tf.variable_scope('restore_original_length'):
            combination_res_reshape = tf.reshape(
                combination_res, [bs, bn * bl, ivec])  # bs,bn*bl,vec
            output = combination_res_reshape[:, :sl, :]
            return output
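# Hedged usage sketch (assumption, not from the original repo): a bi-directional
# wrapper around `simple_block_attention` above, in the spirit of the
# bi_directional_simple_block_attention call used elsewhere in this code.
# The block length and scope names are illustrative only.
def _bi_block_attention_demo(rep_tensor, rep_mask, is_train, keep_prob=0.9, wd=0.):
    # rep_tensor: [bs, sl, d]; rep_mask: [bs, sl] bool
    fw = simple_block_attention(rep_tensor, rep_mask, block_len=5, scope='block_fw',
                                direction='forward', keep_prob=keep_prob,
                                is_train=is_train, wd=wd, activation='elu')
    bw = simple_block_attention(rep_tensor, rep_mask, block_len=5, scope='block_bw',
                                direction='backward', keep_prob=keep_prob,
                                is_train=is_train, wd=wd, activation='elu')
    return tf.concat([fw, bw], -1)  # [bs, sl, 2*d]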
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs, sl1, sl2 = self.bs, self.sl1, self.sl2

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
            scope='gene_token_emb_mat')
        s1_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent1_token)  # bs,sl1,tel
        s2_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent2_token)  # bs,sl2,tel
        self.tensor_dict['s1_emb'] = s1_emb
        self.tensor_dict['s2_emb'] = s2_emb

    with tf.variable_scope('context_fusion'):
        s1_seq_rep = multi_mask_tensorized_self_attn(
            s1_emb, self.sent1_token_mask, hn=2 * hn, head_num=2,
            is_train=self.is_train, attn_keep_prob=1., dense_keep_prob=cfg.dropout,
            wd=cfg.wd, use_direction=True, attn_self=False, use_fusion_gate=True,
            final_mask_ft=None, dot_activation_name='sigmoid',
            use_input_for_attn=False, add_layer_for_multi=True,
            activation_func_name='elu', apply_act_for_v=True,
            input_hn=None, output_hn=None, accelerate=False, merge_var=False,
            scope='multi_mask_tensorized_self_attn')
        tf.get_variable_scope().reuse_variables()
        s2_seq_rep = multi_mask_tensorized_self_attn(
            s2_emb, self.sent2_token_mask, hn=2 * hn, head_num=2,
            is_train=self.is_train, attn_keep_prob=1., dense_keep_prob=cfg.dropout,
            wd=cfg.wd, use_direction=True, attn_self=False, use_fusion_gate=True,
            final_mask_ft=None, dot_activation_name='sigmoid',
            use_input_for_attn=False, add_layer_for_multi=True,
            activation_func_name='elu', apply_act_for_v=True,
            input_hn=None, output_hn=None, accelerate=False, merge_var=False,
            scope='multi_mask_tensorized_self_attn')

    with tf.variable_scope('compression'):
        s1_rep = multi_dimensional_attention(
            s1_seq_rep, self.sent1_token_mask, 's2t_attn',
            cfg.dropout, self.is_train, cfg.wd, 'elu')
        tf.get_variable_scope().reuse_variables()
        s2_rep = multi_dimensional_attention(
            s2_seq_rep, self.sent2_token_mask, 's2t_attn',
            cfg.dropout, self.is_train, cfg.wd, 'elu')

    with tf.variable_scope('output'):
        out_rep = tf.concat(
            [s1_rep, s2_rep, s1_rep - s2_rep, s1_rep * s2_rep], -1)
        pre_output = tf.nn.elu(
            linear([out_rep], hn, True, 0., scope='pre_output', squeeze=False,
                   wd=cfg.wd, input_keep_prob=cfg.dropout, is_train=self.is_train))
        pre_output1 = highway_net(
            pre_output, hn, True, 0., 'pre_output1', 'elu', False,
            cfg.wd, cfg.dropout, self.is_train)
        logits = linear(
            [pre_output1], self.output_class, True, 0., scope='logits',
            squeeze=False, wd=cfg.wd, input_keep_prob=cfg.dropout,
            is_train=self.is_train)
        self.tensor_dict['logits'] = logits
    return logits  # logits
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs, sl, ol, mc = self.bs, self.sl, self.ol, self.mc

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
            scope='gene_token_emb_mat')
        emb = tf.nn.embedding_lookup(token_emb_mat, self.token_seq)  # bs,sl,tel
        self.tensor_dict['emb'] = emb

    with tf.variable_scope('sent_encoding'):
        act_name = 'relu'
        seq_rep = multi_mask_tensorized_self_attn(
            emb, self.token_mask, hn=2 * hn, head_num=2, is_train=self.is_train,
            attn_keep_prob=1., dense_keep_prob=cfg.dropout, wd=cfg.wd,
            use_direction=True, attn_self=False, use_fusion_gate=True,
            final_mask_ft=None, dot_activation_name='sigmoid',
            use_input_for_attn=False, add_layer_for_multi=True,
            activation_func_name=act_name, apply_act_for_v=True,
            input_hn=None, output_hn=None, accelerate=False, merge_var=False,
            scope='proposed_model')
        rep = multi_dim_souce2token_self_attn(
            seq_rep, self.token_mask, 's2t_self_attn',
            cfg.dropout, self.is_train, cfg.wd, act_name)

    with tf.variable_scope('output'):
        pre_logits = tf.nn.relu(
            linear([rep], hn, True, scope='pre_logits_linear',
                   wd=cfg.wd, input_keep_prob=cfg.dropout,
                   is_train=self.is_train))  # bs, hn
        logits = linear(
            [pre_logits], self.output_class, False, scope='get_output',
            wd=cfg.wd, input_keep_prob=cfg.dropout,
            is_train=self.is_train)  # bs, 5

    _logger.done()
    return logits
def masked_positional_self_attention(sigma, rep_tensor, rep_mask, direction=None,
                                     scope=None, keep_prob=1., is_train=None,
                                     wd=0., activation='elu',
                                     tensor_dict=None, name=None):
    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' % direction or 'diag'):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            direct_mask0 = tf.greater(sl_row + sigma, sl_col)
            direct_mask1 = tf.greater(sl_col + sigma, sl_row)
            direct_mask2 = tf.cast(1 - tf.diag(tf.ones([sl], tf.int32)), tf.bool)
            direct_mask = tf.logical_and(
                tf.logical_and(direct_mask0, direct_mask1), direct_mask2)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0), [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1), [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,1
            f_bias = tf.get_variable('f_bias', [1], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, 1, False, scope='linear_dependent')  # bs,sl,1
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,1
            head = linear(rep_map_dp, 1, False, scope='linear_head')  # bs,sl,1
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,1

            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,sl,sl,1

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            if direction is not None:
                dis_mask = -tf.log(
                    tf.cast(tf.abs(sl_col - sl_row) + tf.diag(tf.ones([sl], tf.int32)),
                            tf.float32))
                logits_masked = dis_mask_for_high_rank(logits_masked, dis_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)
            attn_score = tf.tile(
                tf.expand_dims(tf.reshape(attn_score, [bs, sl, sl]), 3),
                [1, 1, 1, ivec])

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            output = attn_result

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score

        return output
def directional_attention_with_selections(
        rep_tensor, rep_mask, dep_selection, head_selection, direction=None,
        hn=None, keep_unselected=True, scope=None, keep_prob=1., is_train=None,
        wd=0., activation='elu'):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec
    with tf.variable_scope(scope or 'directional_attention_%s' % direction or 'diag'):
        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)

        # ensure the selection is right
        dep_selection = tf.logical_and(rep_mask, dep_selection)
        head_selection = tf.logical_and(rep_mask, head_selection)

        rep_dep_tensor, rep_dep_mask, dep_org_idx = reduce_data_rep_max_len(rep_map, dep_selection)
        rep_head_tensor, rep_head_mask, head_org_idx = reduce_data_rep_max_len(rep_map, head_selection)
        sl_dep, sl_head = tf.shape(rep_dep_tensor)[1], tf.shape(rep_head_tensor)[1]

        if keep_unselected:
            unhead_selection = tf.logical_and(rep_mask, tf.logical_not(head_selection))
            rep_unhead_tensor, rep_unhead_mask, unhead_org_idx = reduce_data_rep_max_len(
                rep_map, unhead_selection)
            sl_unhead = tf.shape(rep_unhead_tensor)[1]

        attn_result = tf.cond(
            tf.equal(sl_head, 0),
            lambda: tf.zeros([bs, 0, hn], tf.float32),
            lambda: self_attention_for_selected_head(
                head_selection, head_org_idx, sl_head, rep_head_mask,
                dep_selection, dep_org_idx, sl_dep, rep_dep_mask,
                rep_map, rep_dep_tensor, keep_prob, is_train, direction, ivec
            )
        )

        if keep_unselected:
            input_idx = tf.tile(tf.expand_dims(tf.range(sl), 0), [bs, 1])
            pooling_result = tf.cond(
                tf.equal(sl_unhead, 0),
                lambda: tf.zeros([bs, 0, hn], tf.float32),
                lambda: mean_pooling_for_unselected_head(
                    unhead_org_idx, sl_unhead, rep_unhead_mask,
                    input_idx, sl, rep_mask, rep_map, None)  # todo: point !
            )

        with tf.variable_scope('output'):
            if keep_unselected:
                range_head = tf.tile(tf.expand_dims(tf.range(bs), -1), [1, sl_head])
                scatter_attn = tf.cond(
                    tf.equal(sl_head, 0),
                    lambda: tf.zeros([bs, sl + 1, hn], tf.float32),
                    lambda: tf.scatter_nd(
                        tf.stack([range_head, head_org_idx], -1), attn_result,
                        [bs, sl + 1, hn])
                )

                range_unhead = tf.tile(tf.expand_dims(tf.range(bs), -1), [1, sl_unhead])
                scatter_pooling = tf.cond(
                    tf.equal(sl_unhead, 0),
                    lambda: tf.zeros([bs, sl + 1, hn], tf.float32),
                    lambda: tf.scatter_nd(
                        tf.stack([range_unhead, unhead_org_idx], -1), pooling_result,
                        [bs, sl + 1, hn])
                )

                self_attn_input = rep_map
                context_features = tf.add(scatter_attn[:, :-1], scatter_pooling[:, :-1],
                                          'context_features')
                output_mask = rep_mask
            else:
                self_attn_input = rep_head_tensor
                context_features = attn_result
                output_mask = rep_head_mask

            # context fusion gate
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            fusion_gate = tf.nn.sigmoid(
                linear(self_attn_input, ivec, True, 0., 'linear_fusion_i',
                       False, wd, keep_prob, is_train) +
                linear(context_features, ivec, True, 0., 'linear_fusion_a',
                       False, wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * self_attn_input + (1 - fusion_gate) * context_features

        return output, output_mask
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs, sl1, sl2 = self.bs, self.sl1, self.sl2

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
            scope='gene_token_emb_mat')
        s1_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent1_token)  # bs,sl1,tel
        s2_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent2_token)  # bs,sl2,tel
        self.tensor_dict['s1_emb'] = s1_emb
        self.tensor_dict['s2_emb'] = s2_emb

    with tf.variable_scope('sent_encoding'):
        act_func_str = 'elu' if cfg.context_fusion_method in ['block', 'disa'] else 'relu'
        s1_rep = sentence_encoding_models(
            s1_emb, self.sent1_token_mask, cfg.context_fusion_method, act_func_str,
            'ct_based_sent2vec', cfg.wd, self.is_train, cfg.dropout,
            block_len=cfg.block_len)
        tf.get_variable_scope().reuse_variables()
        s2_rep = sentence_encoding_models(
            s2_emb, self.sent2_token_mask, cfg.context_fusion_method, act_func_str,
            'ct_based_sent2vec', cfg.wd, self.is_train, cfg.dropout,
            block_len=cfg.block_len)
        self.tensor_dict['s1_rep'] = s1_rep
        self.tensor_dict['s2_rep'] = s2_rep

    with tf.variable_scope('output'):
        act_func = tf.nn.elu if cfg.context_fusion_method in ['block', 'disa'] else tf.nn.relu
        out_rep = tf.concat(
            [s1_rep, s2_rep, s1_rep - s2_rep, s1_rep * s2_rep], -1)
        pre_output = act_func(
            linear([out_rep], hn, True, 0., scope='pre_output', squeeze=False,
                   wd=cfg.wd, input_keep_prob=cfg.dropout, is_train=self.is_train))
        logits = linear(
            [pre_output], self.output_class, True, 0., scope='logits',
            squeeze=False, wd=cfg.wd, input_keep_prob=cfg.dropout,
            is_train=self.is_train)
        self.tensor_dict['logits'] = logits
    return logits  # logits
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs, sl, ol, mc = self.bs, self.sl, self.ol, self.mc

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
            scope='gene_token_emb_mat')
        emb = tf.nn.embedding_lookup(token_emb_mat, self.token_seq)  # bs,sl,tel
        self.tensor_dict['emb'] = emb

    with tf.variable_scope('ct_attn'):
        rep_fw = directional_attention_with_dense(
            emb, self.token_mask, 'forward', 'dir_attn_fw',
            cfg.dropout, self.is_train, cfg.wd, 'relu',
            tensor_dict=self.tensor_dict, name='fw_attn')
        rep_bw = directional_attention_with_dense(
            emb, self.token_mask, 'backward', 'dir_attn_bw',
            cfg.dropout, self.is_train, cfg.wd, 'relu',
            tensor_dict=self.tensor_dict, name='bw_attn')
        seq_rep = tf.concat([rep_fw, rep_bw], -1)

    with tf.variable_scope('sent_enc_attn'):
        rep = multi_dimensional_attention(
            seq_rep, self.token_mask, 'multi_dimensional_attention',
            cfg.dropout, self.is_train, cfg.wd, 'relu',
            tensor_dict=self.tensor_dict, name='attn')

    with tf.variable_scope('output'):
        pre_logits = tf.nn.relu(
            linear([rep], hn, True, scope='pre_logits_linear',
                   wd=cfg.wd, input_keep_prob=cfg.dropout,
                   is_train=self.is_train))  # bs, hn
        logits = linear(
            [pre_logits], self.output_class, False, scope='get_output',
            wd=cfg.wd, input_keep_prob=cfg.dropout,
            is_train=self.is_train)  # bs, 5

    _logger.done()
    return logits
def do_reduce(self, data_for_reduce, mask_for_reduce):
    hn, dropout, is_train, wd = self.hn, self.dropout, self.is_train, self.wd
    mc = tf.shape(data_for_reduce)[1]
    with tf.variable_scope('sr_%s' % self.method_type):
        self_choose_attention(data_for_reduce, mask_for_reduce, hn,
                              dropout, is_train, 'change_me')
        children_hid = data_for_reduce[:, :, :hn]
        children_cell = data_for_reduce[:, :, hn:]

        I = tf.nn.sigmoid(
            linear([self_choose_attention(children_hid, mask_for_reduce, hn,
                                          dropout, is_train, 'self_ch_i')],
                   hn, True, 0., 'linear_i', False, 0., dropout, is_train))

        # bs,mc,hn/ -> bs,1,mc,hn/2 -> bs,mc,mc,hn/2
        children_hid_tile_1 = tf.tile(tf.expand_dims(children_hid, 1), [1, mc, 1, 1])
        children_hid_tile_2 = tf.tile(tf.expand_dims(children_hid, 2), [1, 1, mc, 1])
        children_hid_tile = tf.concat(
            [children_hid_tile_1, children_hid_tile_2], -1)  # bs,mc,mc,2*hn
        children_hid_tile_re = tf.reshape(
            children_hid_tile, [-1, mc, 2 * hn])  # bs*mc,mc,2*hn
        # mask
        mask_tile_1 = tf.tile(tf.expand_dims(mask_for_reduce, 1), [1, mc, 1])
        mask_tile_2 = tf.tile(tf.expand_dims(mask_for_reduce, 2), [1, 1, mc])
        mask_tile = tf.logical_and(mask_tile_1, mask_tile_2)
        mask_tile_re = tf.reshape(mask_tile, [-1, mc])

        # bs*mc, 2*hn -linear-> bs*mc,hn -re-> bs,mc,hn
        F = tf.nn.sigmoid(
            tf.reshape(
                linear([self_choose_attention(children_hid_tile_re, mask_tile_re,
                                              2 * hn, dropout, is_train, 'self_ch_f')],
                       hn, True, 0., 'linear_f', False, 0., dropout, is_train),
                [-1, mc, hn]))
        O = tf.nn.sigmoid(
            linear([self_choose_attention(children_hid, mask_for_reduce, hn,
                                          dropout, is_train, 'self_ch_o')],
                   hn, True, 0., 'linear_o', False, 0., dropout, is_train))
        U = tf.nn.tanh(
            linear([self_choose_attention(children_hid, mask_for_reduce, hn,
                                          dropout, is_train, 'self_ch_u')],
                   hn, True, 0., 'linear_u', False, 0., dropout, is_train))

        # children_cell * F -- [bs,mc,hn]; mask_for_reduce [bs,mc] -> [bs,mc,1]
        C = I * U + tf.reduce_sum(
            normal_mask(children_cell * F, tf.expand_dims(mask_for_reduce, -1)), 1)
        H = O * tf.nn.tanh(C)
        return tf.concat([H, C], -1)
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs, sl1, sl2 = self.bs, self.sl1, self.sl2

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
            scope='gene_token_emb_mat')
        s1_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent1_token)  # bs,sl1,tel
        s2_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent2_token)  # bs,sl2,tel
        self.tensor_dict['s1_emb'] = s1_emb
        self.tensor_dict['s2_emb'] = s2_emb

    with tf.variable_scope('hard_network'):
        # for sentence 1
        s1_emb_new = sequence_conditional_feature(s1_emb, self.sent1_token_mask)
        s1_logpa_dep, s1_act_dep, s1_percentage_dep = generate_mask_with_rl(
            s1_emb_new, self.sent1_token_mask, False, 'generate_mask_with_rl_dep',
            cfg.dropout, self.is_train, cfg.wd, 'relu', self.disable_rl,
            self.global_step, cfg.mode, cfg.start_only_rl, hn)  # [bs, sl] & [bs, sl]
        s1_logpa_head, s1_act_head, s1_percentage_head = generate_mask_with_rl(
            s1_emb_new, self.sent1_token_mask, False, 'generate_mask_with_rl_head',
            cfg.dropout, self.is_train, cfg.wd, 'relu', self.disable_rl,
            self.global_step, cfg.mode, cfg.start_only_rl, hn)  # [bs, sl] & [bs, sl]
        s1_logpa = tf.concat([s1_logpa_dep, s1_logpa_head], -1)
        s1_act = tf.logical_and(tf.expand_dims(s1_act_dep, 1),
                                tf.expand_dims(s1_act_head, 2))
        s1_percentage = s1_percentage_dep * s1_percentage_head

        tf.get_variable_scope().reuse_variables()

        # for sentence 2
        s2_emb_new = sequence_conditional_feature(s2_emb, self.sent2_token_mask)
        s2_logpa_dep, s2_act_dep, s2_percentage_dep = generate_mask_with_rl(
            s2_emb_new, self.sent2_token_mask, False, 'generate_mask_with_rl_dep',
            cfg.dropout, self.is_train, cfg.wd, 'relu', self.disable_rl,
            self.global_step, cfg.mode, cfg.start_only_rl, hn)  # [bs, sl] & [bs, sl]
        s2_logpa_head, s2_act_head, s2_percentage_head = generate_mask_with_rl(
            s2_emb_new, self.sent2_token_mask, False, 'generate_mask_with_rl_head',
            cfg.dropout, self.is_train, cfg.wd, 'relu', self.disable_rl,
            self.global_step, cfg.mode, cfg.start_only_rl, hn)  # [bs, sl] & [bs, sl]
        s2_logpa = tf.concat([s2_logpa_dep, s2_logpa_head], -1)
        s2_act = tf.logical_and(tf.expand_dims(s2_act_dep, 1),
                                tf.expand_dims(s2_act_head, 2))
        s2_percentage = s2_percentage_dep * s2_percentage_head

    keep_unselected = True
    with tf.variable_scope('ct_attn'):
        s1_fw, s1_token_mask_new = directional_attention_with_selections(
            s1_emb, self.sent1_token_mask, s1_act_dep, s1_act_head, 'forward',
            hn, keep_unselected, 'dir_attn_fw', cfg.dropout, self.is_train,
            cfg.wd, 'relu')
        s1_bw, _ = directional_attention_with_selections(
            s1_emb, self.sent1_token_mask, s1_act_dep, s1_act_head, 'backward',
            hn, keep_unselected, 'dir_attn_bw', cfg.dropout, self.is_train,
            cfg.wd, 'relu')
        s1_seq_rep = tf.concat([s1_fw, s1_bw], -1)

        tf.get_variable_scope().reuse_variables()

        s2_fw, s2_token_mask_new = directional_attention_with_selections(
            s2_emb, self.sent2_token_mask, s2_act_dep, s2_act_head, 'forward',
            hn, keep_unselected, 'dir_attn_fw', cfg.dropout, self.is_train,
            cfg.wd, 'relu')
        s2_bw, _ = directional_attention_with_selections(
            s2_emb, self.sent2_token_mask, s2_act_dep, s2_act_head, 'backward',
            hn, keep_unselected, 'dir_attn_bw', cfg.dropout, self.is_train,
            cfg.wd, 'relu')
        s2_seq_rep = tf.concat([s2_fw, s2_bw], -1)

    with tf.variable_scope('sentence_enc'):
        s1_rep = multi_dimensional_attention(
            s1_seq_rep, s1_token_mask_new, 'multi_dimensional_attention',
            cfg.dropout, self.is_train, cfg.wd, 'relu',
            tensor_dict=self.tensor_dict, name='s1_attn')
        tf.get_variable_scope().reuse_variables()
        s2_rep = multi_dimensional_attention(
            s2_seq_rep, s2_token_mask_new, 'multi_dimensional_attention',
            cfg.dropout, self.is_train, cfg.wd, 'relu',
            tensor_dict=self.tensor_dict, name='s2_attn')

    with tf.variable_scope('output'):
        out_rep = tf.concat([s1_rep * s2_rep, tf.abs(s1_rep - s2_rep)], -1)
        out_rep_map = bn_dense_layer(
            out_rep, hn, True, 0., 'out_rep_map', 'relu', False,
            cfg.wd, cfg.dropout, self.is_train)
        if cfg.use_mse and cfg.mse_logits:
            logits = tf.nn.sigmoid(
                linear(out_rep_map, 1, True, 0., scope='logits', squeeze=True,
                       wd=cfg.wd, input_keep_prob=cfg.dropout,
                       is_train=self.is_train)) * 2. + 3.
        else:
            logits = linear(
                [out_rep_map], self.output_class, True, 0., scope='logits',
                squeeze=False, wd=cfg.wd, input_keep_prob=cfg.dropout,
                is_train=self.is_train)

    return logits, (s1_act, s1_logpa), (s2_act, s2_logpa), (s1_percentage, s2_percentage)  # logits
def build_network(self):
    _logger.add()
    _logger.add('building %s neural network structure...' % cfg.network_type)
    tds, cds = self.tds, self.cds
    tl = self.tl
    tel, cel, cos, ocd, fh = self.tel, self.cel, self.cos, self.ocd, self.fh
    hn = self.hn
    bs, sl1, sl2 = self.bs, self.sl1, self.sl2

    with tf.variable_scope('emb'):
        token_emb_mat = generate_embedding_mat(
            tds, tel, init_mat=self.token_emb_mat,
            extra_mat=self.glove_emb_mat, extra_trainable=self.finetune_emb,
            scope='gene_token_emb_mat')
        s1_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent1_token)  # bs,sl1,tel
        s2_emb = tf.nn.embedding_lookup(token_emb_mat, self.sent2_token)  # bs,sl2,tel
        self.tensor_dict['s1_emb'] = s1_emb
        self.tensor_dict['s2_emb'] = s2_emb

    with tf.variable_scope('hard_network'):
        # s1_act, s1_logpa, s2_act, s2_logpa, choose_percentage
        s1_act = self.sent1_token_mask
        s1_logpa = tf.cast(s1_act, tf.float32)
        s2_act = self.sent2_token_mask
        s2_logpa = tf.cast(s2_act, tf.float32)
        s1_percentage = tf.ones([bs], tf.float32)
        s2_percentage = tf.ones([bs], tf.float32)

    with tf.variable_scope('ct_attn'):
        s1_fw = directional_attention_with_dense(
            s1_emb, self.sent1_token_mask, 'forward', 'dir_attn_fw',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s1_fw_attn')
        s1_bw = directional_attention_with_dense(
            s1_emb, self.sent1_token_mask, 'backward', 'dir_attn_bw',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s1_bw_attn')
        s1_seq_rep = tf.concat([s1_fw, s1_bw], -1)

        tf.get_variable_scope().reuse_variables()

        s2_fw = directional_attention_with_dense(
            s2_emb, self.sent2_token_mask, 'forward', 'dir_attn_fw',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s2_fw_attn')
        s2_bw = directional_attention_with_dense(
            s2_emb, self.sent2_token_mask, 'backward', 'dir_attn_bw',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s2_bw_attn')
        s2_seq_rep = tf.concat([s2_fw, s2_bw], -1)

    with tf.variable_scope('sentence_enc'):
        s1_rep = multi_dimensional_attention(
            s1_seq_rep, self.sent1_token_mask, 'multi_dimensional_attention',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s1_attn')
        tf.get_variable_scope().reuse_variables()
        s2_rep = multi_dimensional_attention(
            s2_seq_rep, self.sent2_token_mask, 'multi_dimensional_attention',
            cfg.dropout, self.is_train, cfg.wd,
            tensor_dict=self.tensor_dict, name='s2_attn')

    with tf.variable_scope('output'):
        out_rep = tf.concat([s1_rep, s2_rep, s1_rep - s2_rep, s1_rep * s2_rep], -1)
        out_rep_map = bn_dense_layer(
            out_rep, hn, True, 0., 'out_rep_map', 'elu', False,
            cfg.wd, cfg.dropout, self.is_train)
        pre_output1 = highway_network(
            out_rep_map, hn, True, 0., 'pre_output1', 'elu', False,
            cfg.wd, cfg.dropout, self.is_train)
        logits = linear(
            [pre_output1], self.output_class, True, 0., scope='logits',
            squeeze=False, wd=cfg.wd, input_keep_prob=cfg.dropout,
            is_train=self.is_train)

    return logits, (s1_act, s1_logpa), (s2_act, s2_logpa), (s1_percentage, s2_percentage)  # logits
def normal_attention(tensor_base, tensor_to_attend,
                     mask_for_tensor_base, mask_for_tensor_to_attend,
                     similarity_method='inner', hn=100,
                     use_pooling=False, pooling_method='max',
                     reverse=False, scope=None):
    """
    normal_attention for attention strategy 2
    :param tensor_base: rank 3 [bs,sl,vec]
    :param tensor_to_attend: rank 3 [bs,ql,vec]
    :param mask_for_tensor_base: [bs,sl]
    :param mask_for_tensor_to_attend: [bs,ql]
    :param similarity_method: 'inner' 'tri_linear' 'map_linear'
    :param hn: needed by some similarity methods
    :param use_pooling: True or False
    :param pooling_method: 'max' or 'mean'
    :param reverse: if use strategy 3
    :param scope:
    :return: use_pooling==True: [bs,hn] else [bs,sl,hn]
    """
    with tf.variable_scope(scope or 'normal_attention'):
        # --------parameters--------
        t_main = tensor_base  # [bs,sl,vec]
        t_sec = tensor_to_attend  # [bs,ql,vec]
        mask_main = mask_for_tensor_base  # [bs,sl]
        mask_sec = mask_for_tensor_to_attend  # [bs,ql]

        bs, sl, vec = tf.shape(t_main)[0], tf.shape(t_main)[1], tf.shape(t_main)[2]
        ql = tf.shape(t_sec)[1]
        # -------------------------------

        # --------similarity_mat--------
        mask_main_etd = tf.expand_dims(mask_main, 2)  # bs,sl,1
        mask_sec_etd = tf.expand_dims(mask_sec, 1)  # bs,1,ql
        mask_similarity_mat = tf.logical_and(mask_main_etd, mask_sec_etd)  # bs,sl,ql
        if similarity_method == 'inner':
            t_main_etd = tf.expand_dims(t_main, 2)  # bs,sl,1,vec
            t_sec_etd = tf.expand_dims(t_sec, 1)  # bs,1,ql,vec
            similarity_mat = tf.reduce_sum(t_main_etd * t_sec_etd, -1)  # bs,sl,ql
        elif similarity_method == 'tri_linear':
            t_main_tiled = tf.tile(tf.expand_dims(t_main, 2), [1, 1, ql, 1])  # bs,sl,ql,vec
            t_sec_tiled = tf.tile(tf.expand_dims(t_sec, 1), [1, sl, 1, 1])  # bs,sl,ql,vec
            similarity_mat = get_logits([t_main_tiled, t_sec_tiled], None, False,
                                        scope='tri_linear_tri_linear',
                                        func='tri_linear')
        elif similarity_method == 'map_linear':
            t_main_map = tf.nn.relu(linear([t_main], hn, True, scope='linear_map_main'))
            t_sec_map = tf.nn.relu(linear([t_sec], hn, True, scope='linear_map_sec'))
            t_main_map_etd = tf.expand_dims(t_main_map, 2)  # bs,sl,1,hn
            t_sec_map_etd = tf.expand_dims(t_sec_map, 1)  # bs,1,ql,hn
            similarity_mat = tf.reduce_sum(t_main_map_etd * t_sec_map_etd, -1)  # bs,sl,ql
        else:
            raise AttributeError('No similarity matrix calculation method \'%s\'' %
                                 similarity_method)
        # -------------------------------

        if use_pooling:
            # pool mat along -2
            if pooling_method == 'max':
                pooling_out = tf.reduce_max(
                    exp_mask(similarity_mat, mask_similarity_mat), -2)  # bs,sl,ql -> bs,ql
            elif pooling_method == 'mean':
                sum_out = tf.reduce_sum(
                    normal_mask(similarity_mat, mask_similarity_mat), -2)  # bs,sl,ql -> bs,ql
                num = tf.reduce_sum(tf.cast(mask_similarity_mat, tf.int32), -2)  # bs,ql
                num = tf.where(tf.equal(num, tf.zeros_like(num, tf.int32)),
                               tf.ones_like(num, tf.int32), num)
                pooling_out = sum_out / tf.cast(num, tf.float32)  # bs,ql
            else:
                raise AttributeError('No pooling method \'%s\'' % pooling_method)
            return softsel(t_sec, pooling_out, mask_sec)  # bs,ql,vec -> bs,vec
        else:
            t_sec_tiled = tf.tile(tf.expand_dims(t_sec, 1), [1, sl, 1, 1])  # bs,sl,ql,vec
            # target: q_tiled: [bs,sl,ql,hn]; logits: [bs,sl,ql]
            if not reverse:
                out = normal_softsel(t_sec_tiled, similarity_mat, mask_similarity_mat)
            else:
                out = reverse_softsel(t_sec_tiled, similarity_mat, mask_similarity_mat)
            return out  # bs,sl,vec
def do_reduce(self, data_for_reduce, mask_for_reduce):
    hn, dropout, is_train, wd = self.hn, self.dropout, self.is_train, self.wd
    mc = tf.shape(data_for_reduce)[1]
    with tf.variable_scope('sr_%s' % self.method_type):
        print('var num in (2.3) :',
              len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))
        # bs, mc, hn
        children_hid_un = data_for_reduce[:, :, :hn]
        children_cell = data_for_reduce[:, :, hn:]
        # bs, mc, hn
        children_hid = tf.concat([
            children_hid_un,
            self_align_attention(children_hid_un, mask_for_reduce),
        ], -1)

        I = tf.nn.sigmoid(
            linear([self_choose_attention(children_hid, mask_for_reduce, hn,
                                          dropout, is_train, 'self_ch_i', True)],
                   hn, False, 0., 'linear_i', False, 0., dropout, is_train) +
            self.bias_I)
        print('var num in (2.4) :',
              len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))
        # bs*mc, 2*hn -linear-> bs*mc,hn -re-> bs,mc,hn
        F = tf.nn.sigmoid(
            linear([children_hid], hn, True, 0., 'linear_f', False, 0.,
                   dropout, is_train))
        print('var num in (2.5) :',
              len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))
        O = tf.nn.sigmoid(
            linear([self_choose_attention(children_hid, mask_for_reduce, hn,
                                          dropout, is_train, 'self_ch_o', True)],
                   hn, False, 0., 'linear_o', False, 0., dropout, is_train) +
            self.bias_O)
        print('var num in (2.6) :',
              len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))
        U = tf.nn.tanh(
            linear([self_choose_attention(children_hid, mask_for_reduce, hn,
                                          dropout, is_train, 'self_ch_u', True)],
                   hn, False, 0., 'linear_u', False, 0., dropout, is_train) +
            self.bias_U)
        print('var num in (2.7) :',
              len(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)))

        # children_cell * F -- [bs,mc,hn]; mask_for_reduce [bs,mc] -> [bs,mc,1]
        C = I * U + tf.reduce_sum(
            normal_mask(children_cell * F, tf.expand_dims(mask_for_reduce, -1)), 1)
        H = O * tf.nn.tanh(C)
        return tf.concat([H, C], -1)
def directional_attention_with_dense(rep_tensor, rep_mask, direction=None, scope=None,
                                     keep_prob=1., is_train=None, wd=0., activation='elu',
                                     tensor_dict=None, name=None, hn=None):
    def scaled_tanh(x, scale=5.):
        return scale * tf.nn.tanh(1. / scale * x)

    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = hn or rep_tensor.get_shape()[2]
    with tf.variable_scope(scope or 'directional_attention_%s' % direction or 'diag'):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            direct_mask = tf.cast(tf.diag(-tf.ones([sl], tf.int32)) + 1, tf.bool)
        else:
            if direction == 'forward':
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0), [bs, 1, 1])  # bs,sl,sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1), [1, sl, 1])  # bs,sl,sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs,sl,sl

        # non-linear
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sl, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, ivec, False, scope='linear_dependent')  # bs,sl,vec
            dependent_etd = tf.expand_dims(dependent, 1)  # bs,1,sl,vec
            head = linear(rep_map_dp, ivec, False, scope='linear_head')  # bs,sl,vec
            head_etd = tf.expand_dims(head, 2)  # bs,sl,1,vec

            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs,sl,sl,vec

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i',
                       False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a',
                       False, wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)

        # save attn
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
            tensor_dict[name + '_gate'] = fusion_gate
        return output
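# Hedged usage sketch (assumption): composing forward and backward masked
# self-attention from `directional_attention_with_dense` above, mirroring how the
# build_network methods in this file call it. Scope names, keep_prob and the demo
# function name are illustrative only.
def _disa_context_fusion_demo(emb, token_mask, is_train, keep_prob=0.9, wd=0.):
    # emb: [bs, sl, d] float32; token_mask: [bs, sl] bool
    fw = directional_attention_with_dense(
        emb, token_mask, 'forward', 'demo_dir_attn_fw',
        keep_prob, is_train, wd, 'relu')
    bw = directional_attention_with_dense(
        emb, token_mask, 'backward', 'demo_dir_attn_bw',
        keep_prob, is_train, wd, 'relu')
    return tf.concat([fw, bw], -1)  # [bs, sl, 2*d]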
def visit_sa_with_dense(rep_tensor, keep_prob=1., is_train=None, wd=0.,
                        activation='relu', hn=None, is_scale=True, is_plus_sa=True):
    batch_size, sw_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope('temporal_attention'):
        # mask generation
        attn_mask = tf.cast(tf.diag(-tf.ones([sw_len], tf.int32)) + 1,
                            tf.bool)  # batch_size, code_len, code_len

        # non-linear for context
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1), [1, sw_len, 1, 1])  # bs,sl,sl,vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs,sl,sl,vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, ivec, False,
                               scope='linear_dependent')  # batch_size, code_len, vec_size
            dependent_etd = tf.expand_dims(dependent, 1)  # batch_size, code_len, code_len, vec_size
            head = linear(rep_map_dp, ivec, False,
                          scope='linear_head')  # batch_size, code_len, vec_size
            head_etd = tf.expand_dims(head, 2)  # batch_size, code_len, code_len, vec_size

            if is_plus_sa:
                attention_fact = dependent_etd + head_etd + f_bias
            else:
                return rep_map

            if is_scale:
                logits = scaled_tanh(attention_fact, 5.0)  # bs,sl,sl,vec
            else:
                logits = linear(tf.nn.tanh(attention_fact), ivec, True,
                                scope='linear_attn_fact')

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs,sl,sl,vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)

            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs,sl,vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # input gate
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i',
                       False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a',
                       False, wd, keep_prob, is_train) + o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
        return output