def final_merge(qw_repr, pw_repr, path_size, dyn_path_max_size, dim_hidden, scoring_mode):
    """
    Copied from compq_acl18.BaseRelationMatchingKernel
    :param qw_repr:             (ds * path_max_len, dim_hidden)
    :param pw_repr:             (ds * path_max_len, dim_hidden)
    :param path_size:           (ds, )
    :param dyn_path_max_size:   a tensor representing the max path_len in this batch
    :param dim_hidden:          dim_hidden
    :param scoring_mode:        compact / separated
    """
    LogInfo.logs('scoring_mode = [%s]', scoring_mode)
    if scoring_mode == 'compact':
        # aggregate by max-pooling, then overall cosine
        qw_repr = tf.reshape(qw_repr, shape=[-1, dyn_path_max_size, dim_hidden],
                             name='qw_repr')    # (ds, path_max_size, dim_hidden)
        pw_repr = tf.reshape(pw_repr, shape=[-1, dyn_path_max_size, dim_hidden],
                             name='pw_repr')
        q_final_repr = seq_hidden_max_pooling(seq_hidden_input=qw_repr, len_input=path_size)
        p_final_repr = seq_hidden_max_pooling(seq_hidden_input=pw_repr, len_input=path_size)
        # (ds, dim_hidden)
        score = cosine_sim(lf_input=q_final_repr, rt_input=p_final_repr)    # (ds, )
        return {'rm_score': score}
    else:
        # separately calculate cosine, then sum together (with threshold control)
        raw_score = cosine_sim(lf_input=qw_repr, rt_input=pw_repr)  # (ds * path_max_len, )
        raw_score = tf.reshape(raw_score, shape=[-1, dyn_path_max_size],
                               name='raw_score')    # (ds, path_max_len)
        sim_ths = tf.get_variable(name='sim_ths', dtype=tf.float32, shape=[])
        path_score = tf.subtract(raw_score, sim_ths,
                                 name='path_score')  # add penalty to each potential seq.
        sc_mask = tf.sequence_mask(lengths=path_size, maxlen=dyn_path_max_size,
                                   dtype=tf.float32, name='sc_mask')  # (ds, sc_max_len) as mask
        score = tf.reduce_sum(path_score * sc_mask, axis=-1, name='score')  # (ds, )
        return {'rm_score': score, 'rm_path_score': path_score}
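# --------------------------------------------------------------------------
# Illustrative sketch (not from the original repo): a minimal NumPy version of
# the "separated" branch above, assuming cosine_sim() is a row-wise cosine and
# sim_ths is a learned scalar threshold subtracted from every per-path score.
# The names below (demo_separated_score, ...) are hypothetical.
import numpy as np

def demo_separated_score(qw_repr, pw_repr, path_size, path_max_size, sim_ths=0.2):
    """qw_repr / pw_repr: (ds*path_max_size, dim_hidden); path_size: (ds, )."""
    # row-wise cosine similarity, one score per (schema, path) pair
    num = np.sum(qw_repr * pw_repr, axis=-1)
    den = np.linalg.norm(qw_repr, axis=-1) * np.linalg.norm(pw_repr, axis=-1) + 1e-8
    raw_score = (num / den).reshape(-1, path_max_size)              # (ds, path_max_size)
    path_score = raw_score - sim_ths                                # per-path penalty
    mask = (np.arange(path_max_size)[None, :] < path_size[:, None]).astype(np.float32)
    return np.sum(path_score * mask, axis=-1)                       # (ds, )

# e.g. ds=2 schemas, at most 3 paths each, dim_hidden=4
_rng = np.random.RandomState(0)
print(demo_separated_score(_rng.randn(6, 4), _rng.randn(6, 4),
                           path_size=np.array([2, 3]), path_max_size=3))
# --------------------------------------------------------------------------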
def final_merge(q_rep, path_rep, sc_len, sc_max_len, dim_hidden, scoring_mode):
    """
    :param q_rep:       (ds * sc_max_len, dim_hidden)
    :param path_rep:    (ds * sc_max_len, dim_hidden), pay attention to the first dimension!
    :param sc_len:      (ds, )
    :param sc_max_len:  sc_max_len
    :param dim_hidden:  dim_hidden
    :param scoring_mode: compact / separated
    """
    if scoring_mode == 'compact':
        # aggregate by max-pooling, then overall cosine
        q_att_rep = tf.reshape(q_rep, shape=[-1, sc_max_len, dim_hidden],
                               name='q_att_rep')        # (ds, sc_max_len, dim_hidden)
        path_att_rep = tf.reshape(path_rep, shape=[-1, sc_max_len, dim_hidden],
                                  name='path_att_rep')  # (ds, sc_max_len, dim_hidden)
        q_final_rep = seq_hidden_max_pooling(seq_hidden_input=q_att_rep, len_input=sc_len)
        path_final_rep = seq_hidden_max_pooling(seq_hidden_input=path_att_rep, len_input=sc_len)
        # (ds, dim_hidden)
        score = cosine_sim(lf_input=q_final_rep, rt_input=path_final_rep)   # (ds, )
        return {'rm_score': score}
    else:
        # separately calculate cosine, then sum together (with threshold control)
        raw_score = cosine_sim(lf_input=q_rep, rt_input=path_rep)   # (ds * sc_max_len, )
        raw_score = tf.reshape(raw_score, shape=[-1, sc_max_len],
                               name='raw_score')    # (ds, sc_max_len)
        sim_ths = tf.get_variable(name='sim_ths', dtype=tf.float32, shape=[])
        path_score = tf.subtract(raw_score, sim_ths,
                                 name='path_score')  # add penalty to each potential seq.
        sc_mask = tf.sequence_mask(lengths=sc_len, maxlen=sc_max_len,
                                   dtype=tf.float32, name='sc_mask')  # (ds, sc_max_len) as mask
        score = tf.reduce_sum(path_score * sc_mask, axis=-1, name='score')  # (ds, )
        return {'rm_score': score, 'rm_path_score': path_score}
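# --------------------------------------------------------------------------
# Illustrative sketch (not from the original repo): the "compact" branch above
# in NumPy, under the assumption that seq_hidden_max_pooling() is a
# length-masked max over the sequence axis and cosine_sim() is a row-wise
# cosine. demo_masked_max_pool / demo_compact_score are hypothetical names.
import numpy as np

def demo_masked_max_pool(seq_hidden, seq_len):
    """seq_hidden: (ds, max_len, dim); seq_len: (ds, ). Max over valid steps only."""
    ds, max_len, _ = seq_hidden.shape
    mask = np.arange(max_len)[None, :] < seq_len[:, None]           # (ds, max_len)
    masked = np.where(mask[:, :, None], seq_hidden, -np.inf)        # padding -> -inf
    return masked.max(axis=1)                                       # (ds, dim)

def demo_compact_score(q_rep, path_rep, sc_len, sc_max_len, dim_hidden):
    q_rep = q_rep.reshape(-1, sc_max_len, dim_hidden)
    path_rep = path_rep.reshape(-1, sc_max_len, dim_hidden)
    q_final = demo_masked_max_pool(q_rep, sc_len)                   # (ds, dim_hidden)
    p_final = demo_masked_max_pool(path_rep, sc_len)                # (ds, dim_hidden)
    num = np.sum(q_final * p_final, axis=-1)
    den = np.linalg.norm(q_final, axis=-1) * np.linalg.norm(p_final, axis=-1) + 1e-8
    return num / den                                                # (ds, )
# --------------------------------------------------------------------------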
def get_score(self, mode, qwords_embedding, qwords_len, sc_len,
              preds_embedding, preds_len, pwords_embedding, pwords_len):
    """
    Produce the final similarity score.
    This function is the most important part of the optm/eval model.
    Just use cosine similarity.
    :param mode: tf.contrib.learn.ModeKeys.TRAIN/INFER, which affects the dropout setting
    :param qwords_embedding:    (ds, q_max_len, dim_emb)
    :param qwords_len:          (ds, )
    :param sc_len:              (ds, )
    :param preds_embedding:     (ds, sc_max_len, path_max_len, dim_emb)
    :param preds_len:           (ds, sc_max_len)
    :param pwords_embedding:    (ds, sc_max_len, pword_max_len, dim_emb)
    :param pwords_len:          (ds, sc_max_len)
    :return: score and attention matrices
        pred_att_mat:   (ds, sc_max_len, q_max_len, path_max_len)
        pword_att_mat:  (ds, sc_max_len, q_max_len, pword_max_len)
        score:          (ds, )
    """
    assert mode in (tf.contrib.learn.ModeKeys.TRAIN, tf.contrib.learn.ModeKeys.INFER)

    encoder_args = {'config': self.rnn_config, 'mode': mode}
    # set dropout according to the current mode (TRAIN/INFER)
    q_encoder = BidirectionalRNNEncoder(**encoder_args)
    pred_encoder = BidirectionalRNNEncoder(**encoder_args)
    pword_encoder = BidirectionalRNNEncoder(**encoder_args)
    cross_att = IndirectCrossAttention(**self.cross_att_config)

    with tf.name_scope('separated_relation_matching_kernel'):

        """ Preprocess: reshaping, merge ds and sc_max_len into one dimension """
        qwords_embedding = tf.reshape(
            tf.stack([qwords_embedding] * self.sc_max_len, axis=1),
            shape=(-1, self.q_max_len, self.dim_emb),
            name='qwords_hidden'
        )   # (ds * sc_max_len, q_max_len, dim_emb)
        qwords_len = tf.reshape(
            tf.stack([qwords_len] * self.sc_max_len, axis=1),
            shape=(-1,),
            name='qwords_len'
        )   # (ds * sc_max_len, )
        comb_tensor_list = []
        for tensor_input in (preds_embedding, preds_len, pwords_embedding, pwords_len):
            ori_shape = tensor_input.get_shape().as_list()
            comb_shape = [-1] + ori_shape[2:]   # keep the dimensions after (ds, sc_max_len)
            # show_tensor(tensor_input)
            # LogInfo.logs('ori_shape: %s, comb_shape: %s', ori_shape, comb_shape)
            comb_tensor_list.append(tf.reshape(tensor_input, shape=comb_shape))
        [preds_embedding, preds_len, pwords_embedding, pwords_len] = comb_tensor_list
        # (ds * sc_max_len, ...): ds and sc_max_len are merged into the first dimension
        # for tensor in comb_tensor_list:
        #     show_tensor(tensor)

        """ Step 1: Intra-attention (Optional) """
        # TODO: Question and pred_words

        """ Step 2: Cross-attention, make sure pwords and preds are treated properly """
        qwords_att_embedding, preds_att_info, pwords_att_info = cross_att.forward(
            q_input=qwords_embedding,
            p_input=preds_embedding,
            pw_input=pwords_embedding,
            q_len=qwords_len,
            p_len=preds_len,
            pw_len=pwords_len
        )
        preds_att_embedding, preds_att_mat = preds_att_info
        pwords_att_embedding, pwords_att_mat = pwords_att_info
        # x_embedding: (ds * sc_max_len, x_max_len, dim_emb)
        # x_att_mat:   (ds * sc_max_len, q_max_len, x_max_len)

        """ Step 3: Perform RNN over embeddings """
        """ Want to share RNN parameters? Put 'em into one var_scope """
        with tf.variable_scope('qwords', reuse=self.reuse):
            qwords_hidden = seq_encoding(
                emb_input=qwords_att_embedding, len_input=qwords_len,
                encoder=q_encoder, reuse=self.reuse
            )   # (ds * sc_max_len, q_max_len, dim_hidden)
            qword_final_hidden = seq_hidden_max_pooling(
                seq_hidden_input=qwords_hidden, len_input=qwords_len)
        with tf.variable_scope('preds', reuse=self.reuse):
            preds_hidden = seq_encoding(
                emb_input=preds_att_embedding, len_input=preds_len,
                encoder=pred_encoder, reuse=self.reuse
            )   # (ds * sc_max_len, path_max_len, dim_hidden)
            pred_final_hidden = seq_hidden_max_pooling(
                seq_hidden_input=preds_hidden, len_input=preds_len)
        with tf.variable_scope('pwords', reuse=self.reuse):
            pwords_hidden = seq_encoding(
                emb_input=pwords_att_embedding, len_input=pwords_len,
                encoder=pword_encoder, reuse=self.reuse
            )   # (ds * sc_max_len, pword_max_len, dim_hidden)
            pword_final_hidden = seq_hidden_max_pooling(
                seq_hidden_input=pwords_hidden, len_input=pwords_len)
        # x_final_hidden: (ds * sc_max_len, dim_hidden)

        """ Step 4: Path merging, calculate final score """
        # TODO: use pword/pred or not
        if self.path_merge_mode == 'sum':
            path_final_hidden = tf.add(pword_final_hidden, pred_final_hidden,
                                       name='path_final_hidden')  # (ds * sc_max_len, dim_hidden)
        else:   # max
            path_final_hidden = tf.reduce_max(
                tf.stack([pword_final_hidden, pred_final_hidden],
                         axis=0),   # (2, ds * sc_max_len, dim_hidden)
                axis=0,
                name='path_final_hidden')   # (ds * sc_max_len, dim_hidden)
        if self.final_score_mode == 'cos':
            path_score = cosine_sim(lf_input=qword_final_hidden,
                                    rt_input=path_final_hidden)     # (ds * sc_max_len, )
        else:   # dot
            path_score = tf.reduce_sum(qword_final_hidden * path_final_hidden,
                                       axis=-1)                     # (ds * sc_max_len, )
        path_score = tf.reshape(path_score, shape=[-1, self.sc_max_len],
                                name='path_score')  # (ds, sc_max_len)
        sc_mask = tf.sequence_mask(lengths=sc_len, maxlen=self.sc_max_len,
                                   dtype=tf.float32, name='sc_mask')  # (ds, sc_max_len) as mask
        score = tf.reduce_sum(path_score * sc_mask, axis=-1, name='score')  # (ds, )

        pred_att_mat = tf.reshape(
            preds_att_mat, [-1, self.sc_max_len, self.q_max_len, self.path_max_len],
            name='pred_att_mat')    # (ds, sc_max_len, q_max_len, path_max_len)
        pword_att_mat = tf.reshape(
            pwords_att_mat, [-1, self.sc_max_len, self.q_max_len, self.pword_max_len],
            name='pword_att_mat')   # (ds, sc_max_len, q_max_len, pword_max_len)
        return pred_att_mat, pword_att_mat, score
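# --------------------------------------------------------------------------
# Illustrative sketch (not from the original repo): the preprocessing trick used
# above, where the question tensor is replicated once per candidate path and the
# (ds, sc_max_len, ...) tensors are flattened to (ds*sc_max_len, ...), so every
# path can be scored independently in one big batch. Plain NumPy; the name
# demo_split_schemas_into_paths is hypothetical.
import numpy as np

def demo_split_schemas_into_paths(qwords_emb, preds_emb):
    """qwords_emb: (ds, q_max_len, dim); preds_emb: (ds, sc_max_len, path_max_len, dim)."""
    ds, sc_max_len = preds_emb.shape[:2]
    # replicate each question once per path slot: (ds, sc_max_len, q_max_len, dim)
    q_tiled = np.repeat(qwords_emb[:, None], sc_max_len, axis=1)
    q_flat = q_tiled.reshape((-1,) + qwords_emb.shape[1:])      # (ds*sc_max_len, q_max_len, dim)
    p_flat = preds_emb.reshape((-1,) + preds_emb.shape[2:])     # (ds*sc_max_len, path_max_len, dim)
    return q_flat, p_flat

q_flat, p_flat = demo_split_schemas_into_paths(np.zeros((2, 5, 3)), np.zeros((2, 4, 6, 3)))
print(q_flat.shape, p_flat.shape)   # (8, 5, 3) (8, 6, 3)
# Per-path scores can later be reshaped back to (ds, sc_max_len) and aggregated
# with a sequence mask, exactly as in Step 4 above.
# --------------------------------------------------------------------------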
def forward(self, el_size, qw_emb, qw_len, pw_sup_emb, pw_sup_len, sup_size,
            type_trans, el_type_signa, el_indv_feats, el_comb_feats, mode):
    """
    Note: the number of paths in a schema == the number of entities in the schema.
    :param el_size:         (ds, )
    :param qw_emb:          (ds, path_max_size, qw_max_len, dim_emb)
    :param qw_len:          (ds, path_max_size)
    :param pw_sup_emb:      (ds, path_max_size, sup_max_size, pw_max_len, dim_emb)
    :param pw_sup_len:      (ds, path_max_size, sup_max_size)
    :param sup_size:        (ds, path_max_size)
    :param type_trans:      (ds, path_max_size, sup_max_size, dim_type)
    :param el_type_signa:   (ds, el_max_size, dim_type)
    :param el_indv_feats:   (ds, el_max_size, el_feat_size)
    :param el_comb_feats:   (ds, 1)
    :param mode: TRAIN / INFER
    """
    LogInfo.begin_track('Build kernel: [el_kernel]')
    assert mode in (tf.contrib.learn.ModeKeys.INFER, tf.contrib.learn.ModeKeys.TRAIN)

    rnn_encoder = None
    if self.rnn_config is not None:
        encoder_args = {'config': self.rnn_config, 'mode': mode}
        rnn_encoder = BidirectionalRNNEncoder(**encoder_args)

    raw_shape = tf.shape(pw_sup_len)
    dyn_el_max_size = raw_shape[1]
    dyn_sup_max_size = raw_shape[2]

    """ Possible reshapes """
    qw_emb = tf.reshape(qw_emb, [-1, self.qw_max_len, self.dim_emb])
    # (ds * el_max_size, qw_max_len, dim_emb)
    qw_len = tf.reshape(qw_len, [-1])   # (ds * el_max_size, )
    pw_sup_emb = tf.reshape(pw_sup_emb, [-1, self.pw_max_len, self.dim_emb])
    # (ds * el_max_size * sup_max_size, pw_max_len, dim_emb)
    pw_sup_len = tf.reshape(pw_sup_len, [-1])

    """ Calculate attention / non-attention question representation """
    pw_sup_repr = seq_encoding_with_aggregation(
        emb_input=pw_sup_emb, len_input=pw_sup_len,
        rnn_encoder=rnn_encoder, seq_merge_mode=self.seq_merge_mode)
    # (ds*el_max_size*sup_max_size, dim_hidden)

    if self.att_config is not None:
        dim_att_len = self.att_config['dim_att_hidden']
        att_func = self.att_config['att_func']
        qw_hidden = seq_encoding(emb_input=qw_emb, len_input=qw_len, encoder=rnn_encoder)
        # (ds*el_max_size, qw_max_len, dim_hidden)
        qw_mask = tf.sequence_mask(lengths=qw_len, maxlen=self.qw_max_len,
                                   dtype=tf.float32, name='qw_mask')
        # (ds*el_max_size, qw_max_len)
        tile_qw_hidden = tf.tile(
            tf.expand_dims(qw_hidden, axis=1),  # (ds*el_max_size, 1, qw_max_len, dim_hidden)
            multiples=[1, dyn_sup_max_size, 1, 1],
            name='tile_qw_hidden'
        )   # (ds*el_max_size, sup_max_size, qw_max_len, dim_hidden)
        tile_qw_mask = tf.tile(
            tf.expand_dims(qw_mask, axis=1),
            multiples=[1, dyn_sup_max_size, 1],
            name='tile_qw_mask'
        )   # (ds*el_max_size, sup_max_size, qw_max_len)
        expand_qw_mask = tf.reshape(tile_qw_mask, [-1, self.qw_max_len])
        expand_qw_hidden = tf.reshape(tile_qw_hidden, [-1, self.qw_max_len, self.dim_hidden])
        # (ds*el_max_size*sup_max_size, qw_max_len, dim_hidden)

        simple_att = SimpleAttention(lf_max_len=self.qw_max_len,
                                     dim_att_hidden=dim_att_len,
                                     att_func=att_func)
        qw_att_repr, _, _ = simple_att.forward(lf_input=expand_qw_hidden,
                                               lf_mask=expand_qw_mask,
                                               fix_rt_input=pw_sup_repr)
        # (ds*el_max_size*sup_max_size, dim_hidden)
        final_qw_repr = qw_att_repr
    else:
        qw_repr = seq_encoding_with_aggregation(
            emb_input=qw_emb, len_input=qw_len,
            rnn_encoder=rnn_encoder, seq_merge_mode=self.seq_merge_mode)
        # (ds*el_max_size, dim_hidden)
        tile_qw_repr = tf.tile(
            tf.expand_dims(qw_repr, axis=1),
            multiples=[1, dyn_sup_max_size, 1],
            name='tile_qw_repr'
        )   # (ds*el_max_size, sup_max_size, dim_hidden)
        expand_qw_repr = tf.reshape(tile_qw_repr, [-1, self.dim_hidden])
        final_qw_repr = expand_qw_repr

    with tf.variable_scope('el_kernel', reuse=tf.AUTO_REUSE):
        """ Calculate cosine similarity, and turn it into a type distribution """
        sim_score = cosine_sim(lf_input=final_qw_repr,
                               rt_input=pw_sup_repr)    # (ds*el_max_size*sup_max_size, )
        sim_score = tf.reshape(sim_score, shape=raw_shape,
                               name='sim_score')        # (ds, el_max_size, sup_max_size)
        sup_mask = tf.sequence_mask(lengths=sup_size,
                                    maxlen=dyn_sup_max_size,
                                    dtype=tf.float32,
                                    name='sup_mask')    # (ds, el_max_size, sup_max_size)
        mask_score = sup_mask * sim_score + (1. - sup_mask) * tf.float32.min
        pred_prob = tf.nn.softmax(logits=mask_score,
                                  name='pred_prob')     # (ds, el_max_size, sup_max_size)
        type_prob = tf.matmul(
            a=tf.expand_dims(pred_prob, axis=2),    # (ds, el_max_size, 1, sup_max_size)
            b=type_trans                            # (ds, el_max_size, sup_max_size, dim_type)
        )   # (ds, el_max_size, 1, dim_type)
        type_prob = tf.squeeze(input=type_prob, axis=2,
                               name='type_prob')    # (ds, el_max_size, dim_type)
        type_match_score = tf.reduce_sum(el_type_signa * type_prob,
                                         axis=-1, keep_dims=True,
                                         name='type_match_score')   # (ds, el_max_size, 1)

        """ Feature concat and produce scores """
        el_indv_concat = tf.concat([type_match_score, el_indv_feats], axis=-1,
                                   name='el_indv_concat')   # (ds, el_max_size, 1+el_feat_size)
        el_mask = tf.sequence_mask(lengths=el_size,
                                   maxlen=dyn_el_max_size,
                                   dtype=tf.float32, name='el_mask')    # (ds, el_max_size)
        sum_indv_feats = tf.reduce_sum(
            el_indv_concat * tf.expand_dims(el_mask, axis=-1),
            axis=1, name='sum_indv_feats')  # (ds, 1+el_feat_size)
        final_feats = tf.concat([sum_indv_feats, el_comb_feats], axis=-1,
                                name='final_feats')
        # (ds, 1+el_feat_size+1) --> type_match + indv_feats + comb_feat
        el_score = tf.contrib.layers.fully_connected(
            inputs=final_feats,
            num_outputs=1,
            activation_fn=None,
            scope='out_fc',
            reuse=tf.AUTO_REUSE
        )   # (ds, 1), the overall entity linking score

    LogInfo.end_track()
    return el_score, final_feats
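# --------------------------------------------------------------------------
# Illustrative sketch (not from the original repo): how the cosine scores over
# support paths are turned into a type distribution above. Padded supports are
# pushed to a very large negative value before the softmax, and the resulting
# distribution is multiplied into the per-support type table. NumPy only;
# demo_type_distribution is a hypothetical name.
import numpy as np

def demo_type_distribution(sim_score, sup_size, type_trans, el_type_signa):
    """
    sim_score:      (el, sup_max)            cosine between question and support paths
    sup_size:       (el, )                   number of valid supports per entity
    type_trans:     (el, sup_max, dim_type)  type signature of each support path
    el_type_signa:  (el, dim_type)           type signature of the linked entity
    returns:        (el, )                   type match score per entity
    """
    el, sup_max = sim_score.shape
    sup_mask = (np.arange(sup_max)[None, :] < sup_size[:, None]).astype(np.float64)
    mask_score = sup_mask * sim_score + (1. - sup_mask) * np.finfo(np.float32).min
    e = np.exp(mask_score - mask_score.max(axis=-1, keepdims=True))
    pred_prob = e / e.sum(axis=-1, keepdims=True)                   # (el, sup_max)
    type_prob = np.einsum('es,est->et', pred_prob, type_trans)      # (el, dim_type)
    return np.sum(el_type_signa * type_prob, axis=-1)               # (el, )
# --------------------------------------------------------------------------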
def forward(self, el_size, qw_emb, qw_len, pw_sup_emb, pw_sup_len, type_trans,
            el_sup_mask, el_type_signa, el_indv_feats, el_comb_feats, mode):
    """
    Note: the number of paths in a schema == the number of entities in the schema.
    local_mem_size: the local number of relevant paths in the current batch.
    :param el_size:         (ds, )
    :param qw_emb:          (ds, el_max_size, qw_max_len, dim_emb)
    :param qw_len:          (ds, el_max_size)
    :param pw_sup_emb:      (local_mem_size, pw_max_len, dim_emb)
    :param pw_sup_len:      (local_mem_size, )
    :param type_trans:      (local_mem_size, dim_type)
    :param el_sup_mask:     (ds, el_max_size, local_mem_size)
    :param el_type_signa:   (ds, el_max_size, dim_type)
    :param el_indv_feats:   (ds, el_max_size, el_feat_size)
    :param el_comb_feats:   (ds, 1)
    :param mode: TRAIN / INFER
    """
    """
    180416: Assume ds = 16*2 = 32, el_max_size = 3, qw_max_len = 20,
            dim_emb = 300, local_mem_size = 6K.
            Then ds*el_max_size*qw_max_len ~= 2K.
    """
    LogInfo.begin_track('Build kernel: [el_kernel]')
    assert mode in (tf.contrib.learn.ModeKeys.INFER, tf.contrib.learn.ModeKeys.TRAIN)

    rnn_encoder = None
    if self.rnn_config is not None:
        encoder_args = {'config': self.rnn_config, 'mode': mode}
        rnn_encoder = BidirectionalRNNEncoder(**encoder_args)

    raw_shape = tf.shape(el_sup_mask)
    el_max_size = raw_shape[1]
    local_mem_size = raw_shape[2]
    dim_type = tf.shape(type_trans)[1]

    """ Possible reshapes """
    qw_emb = tf.reshape(qw_emb, [-1, self.qw_max_len, self.dim_emb])
    # (ds * el_max_size, qw_max_len, dim_emb)
    qw_len = tf.reshape(qw_len, [-1])   # (ds * el_max_size, )

    """ Calculate attention / non-attention question representation """
    pw_sup_repr = seq_encoding_with_aggregation(emb_input=pw_sup_emb,
                                                len_input=pw_sup_len,
                                                rnn_encoder=rnn_encoder,
                                                seq_merge_mode=self.seq_merge_mode)
    # (local_mem_size, dim_hidden)

    if self.att_config is not None:
        att_func = self.att_config['att_func']
        assert att_func == 'dot'    # TODO: currently only dot product is supported
        qw_hidden = seq_encoding(emb_input=qw_emb, len_input=qw_len, encoder=rnn_encoder)
        # (ds*el_max_size, qw_max_len, dim_hidden)
        qw_mask = tf.sequence_mask(lengths=qw_len, maxlen=self.qw_max_len,
                                   dtype=tf.float32, name='qw_mask')
        # (ds*el_max_size, qw_max_len)
        flat_qw_hidden = tf.reshape(qw_hidden, shape=[-1, self.dim_hidden],
                                    name='flat_qw_hidden')
        # (ds*el_max_size*qw_max_len, dim_hidden)

        """ Step 1: Very simple & fast way to calculate dot attention """
        raw_mutual_att_mat = tf.matmul(
            flat_qw_hidden, tf.transpose(pw_sup_repr),
            name='raw_mutual_att_mat'
        )   # (ds*el_max_size*qw_max_len, local_mem_size)
        mutual_att_mat = tf.reshape(
            raw_mutual_att_mat, shape=[-1, self.qw_max_len, local_mem_size],
            name='mutual_att_mat')  # (ds*el_max_size, qw_max_len, local_mem_size)

        """ Step 2: Prepare the masked att_mat and the normalized distribution """
        qw_mask_3dim = tf.expand_dims(qw_mask, axis=-1,
                                      name='qw_mask_3dim')  # (ds*el_max_size, qw_max_len, 1)
        masked_att_mat = (
            qw_mask_3dim * mutual_att_mat + (1. - qw_mask_3dim) * tf.float32.min
        )   # (ds*el_max_size, qw_max_len, local_mem_size)
        unnorm_weight = tf.transpose(masked_att_mat, [0, 2, 1],
                                     name='unnorm_weight')
        # (ds*el_max_size, local_mem_size, qw_max_len)
        norm_weight = tf.nn.softmax(unnorm_weight, name='norm_weight')

        """ Step 3: Build the final qw_repr w.r.t. different support paths """
        qw_repr = tf.matmul(norm_weight, qw_hidden, name='qw_repr')
        # batch matmul: (ds*el_max_size, local_mem_size, dim_hidden)
    else:   # noAtt, very simple
        raw_qw_repr = seq_encoding_with_aggregation(emb_input=qw_emb,
                                                    len_input=qw_len,
                                                    rnn_encoder=rnn_encoder,
                                                    seq_merge_mode=self.seq_merge_mode)
        # (ds*el_max_size, dim_hidden)
        qw_repr = tf.expand_dims(raw_qw_repr, axis=1,
                                 name='qw_repr')    # (ds*el_max_size, 1, dim_hidden)

    with tf.variable_scope('el_kernel', reuse=tf.AUTO_REUSE):
        """ Calculate cosine similarity """
        flat_pw_sup_repr = tf.expand_dims(pw_sup_repr, axis=0,
                                          name='flat_pw_sup_repr')
        # (1, local_mem_size, dim_hidden)
        sim_score = cosine_sim(
            lf_input=qw_repr,           # (ds*el_max_size, [1 or local_mem_size], dim_hidden)
            rt_input=flat_pw_sup_repr   # (1, local_mem_size, dim_hidden)
        )   # (ds*el_max_size, local_mem_size)

        """ Turn the similarities into a type distribution """
        flat_el_sup_mask = tf.reshape(el_sup_mask, shape=[-1, local_mem_size],
                                      name='flat_el_sup_mask')
        # (ds*el_max_size, local_mem_size)
        mask_score = flat_el_sup_mask * sim_score + (1. - flat_el_sup_mask) * tf.float32.min
        pred_prob = tf.nn.softmax(logits=mask_score,
                                  name='pred_prob')     # (ds*el_max_size, local_mem_size)
        raw_type_prob = tf.matmul(pred_prob, type_trans,
                                  name='raw_type_prob')  # (ds*el_max_size, dim_type)
        type_prob = tf.reshape(raw_type_prob, shape=[-1, el_max_size, dim_type],
                               name='type_prob')        # (ds, el_max_size, dim_type)
        type_match_score = tf.reduce_sum(el_type_signa * type_prob,
                                         axis=-1, keep_dims=True,
                                         name='type_match_score')   # (ds, el_max_size, 1)

        """ Feature concat and produce scores """
        el_indv_concat = tf.concat([type_match_score, el_indv_feats], axis=-1,
                                   name='el_indv_concat')   # (ds, el_max_size, 1+el_feat_size)
        el_mask = tf.sequence_mask(lengths=el_size, maxlen=el_max_size,
                                   dtype=tf.float32, name='el_mask')    # (ds, el_max_size)
        sum_indv_feats = tf.reduce_sum(
            el_indv_concat * tf.expand_dims(el_mask, axis=-1),
            axis=1, name='sum_indv_feats'
        )   # (ds, 1+el_feat_size)
        final_feats = tf.concat([sum_indv_feats, el_comb_feats], axis=-1,
                                name='final_feats')
        # (ds, 1+el_feat_size+1) --> type_match + indv_feats + comb_feat
        el_score = tf.contrib.layers.fully_connected(
            inputs=final_feats,
            num_outputs=1,
            activation_fn=None,
            scope='out_fc',
            reuse=tf.AUTO_REUSE
        )   # (ds, 1), the overall entity linking score

    LogInfo.end_track()
    return el_score, final_feats
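# --------------------------------------------------------------------------
# Illustrative sketch (not from the original repo): the "very simple & fast"
# dot attention of Steps 1-3 above. All question positions of all entities are
# flattened into one matrix and multiplied against the shared path memory with a
# single matmul, then masked and normalised per memory entry. NumPy only;
# demo_dot_attention_over_memory is a hypothetical name.
import numpy as np

def demo_dot_attention_over_memory(qw_hidden, qw_len, pw_sup_repr):
    """
    qw_hidden:   (B, qw_max_len, dim)   question token states (B = ds*el_max_size)
    qw_len:      (B, )                  valid question lengths
    pw_sup_repr: (local_mem_size, dim)  one vector per path in the shared memory
    returns:     (B, local_mem_size, dim)  question repr w.r.t. each memory path
    """
    B, qw_max_len, dim = qw_hidden.shape
    att = qw_hidden.reshape(-1, dim) @ pw_sup_repr.T                # (B*qw_max_len, M)
    att = att.reshape(B, qw_max_len, -1)                            # (B, qw_max_len, M)
    qw_mask = (np.arange(qw_max_len)[None, :] < qw_len[:, None])[:, :, None]
    att = np.where(qw_mask, att, np.finfo(np.float32).min)          # kill padded tokens
    weight = np.transpose(att, (0, 2, 1))                           # (B, M, qw_max_len)
    e = np.exp(weight - weight.max(axis=-1, keepdims=True))
    weight = e / e.sum(axis=-1, keepdims=True)                      # softmax over tokens
    return weight @ qw_hidden                                       # (B, M, dim)
# --------------------------------------------------------------------------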
def get_score(self, mode, qwords_embedding, qwords_len, sc_len,
              preds_embedding, preds_len, pwords_embedding, pwords_len):
    """
    Produce the final similarity score.
    This function is the most important part of the optm/eval model.
    Just use cosine similarity.
    :param mode: tf.contrib.learn.ModeKeys.TRAIN/INFER, which affects the dropout setting
    :param qwords_embedding:    (ds, q_max_len, dim_emb)
    :param qwords_len:          (ds, )
    :param sc_len:              (ds, )
    :param preds_embedding:     (ds, sc_max_len, path_max_len, dim_emb)
    :param preds_len:           (ds, sc_max_len)
    :param pwords_embedding:    (ds, sc_max_len, pword_max_len, dim_emb)
    :param pwords_len:          (ds, sc_max_len)
    :return: (ds, ) as the final similarity score
    """
    assert mode in (tf.contrib.learn.ModeKeys.TRAIN, tf.contrib.learn.ModeKeys.INFER)

    if self.rnn_config['cell_class'] == 'None':
        # won't use any recurrent layer; just use the pure embeddings instead
        self.dim_hidden = self.dim_emb      # force dim_hidden to be dim_emb
        q_encoder = pred_encoder = pword_encoder = None
    else:
        encoder_args = {'config': self.rnn_config, 'mode': mode}
        q_encoder = BidirectionalRNNEncoder(**encoder_args)
        pred_encoder = BidirectionalRNNEncoder(**encoder_args)
        pword_encoder = BidirectionalRNNEncoder(**encoder_args)
        """ BidirectionalRNNEncoder sets the dropout according to the current mode (TRAIN/INFER) """

    with tf.name_scope('RelationMatchingKernel'):
        with tf.variable_scope('Question', reuse=self.reuse):
            if q_encoder is None:
                qwords_hidden = qwords_embedding
                # (ds, q_max_len, dim_hidden=dim_emb)
            else:
                qwords_hidden = seq_encoding(
                    emb_input=qwords_embedding, len_input=qwords_len,
                    encoder=q_encoder, reuse=self.reuse)    # (ds, q_max_len, dim_hidden)
            q_hidden = seq_hidden_max_pooling(seq_hidden_input=qwords_hidden,
                                              len_input=qwords_len)
            # (ds, dim_hidden), will be used in the final cosine similarity calculation

        # Step 1: split schemas into paths
        #         (merge ds and sc_max_len into one dimension)
        qwords_hidden = tf.reshape(
            tf.stack([qwords_hidden] * self.sc_max_len, axis=1),
            shape=(-1, self.q_max_len, self.dim_hidden),
            name='qwords_hidden')   # (ds * sc_max_len, q_max_len, dim_hidden)
        qwords_len = tf.reshape(
            tf.stack([qwords_len] * self.sc_max_len, axis=1),
            shape=(-1,), name='qwords_len')     # (ds * sc_max_len, )
        # Now combine ds and sc_max_len into one dimension
        comb_tensor_list = []
        for tensor_input in (preds_embedding, preds_len, pwords_embedding, pwords_len):
            ori_shape = tensor_input.get_shape().as_list()
            comb_shape = [-1] + ori_shape[2:]   # keep the dimensions after (ds, sc_max_len)
            # show_tensor(tensor_input)
            # LogInfo.logs('ori_shape: %s, comb_shape: %s', ori_shape, comb_shape)
            comb_tensor_list.append(tf.reshape(tensor_input, shape=comb_shape))
        [preds_embedding, preds_len, pwords_embedding, pwords_len] = comb_tensor_list
        # (ds * sc_max_len, ...): ds and sc_max_len are merged into the first dimension
        # for tensor in comb_tensor_list:
        #     show_tensor(tensor)

        # Step 2: Compute basic hidden repr.
        #         xxx_final_hidden: (ds * sc_max_len, dim_hidden)
        #         (Optional) xxx_att_mat: (ds * sc_max_len, q_max_len, xxx_max_len)
        with tf.name_scope('Schema'):
            with tf.variable_scope('preds', reuse=self.reuse):
                if pred_encoder is None:
                    preds_hidden = preds_embedding
                    # (ds * sc_max_len, path_max_len, dim_hidden=dim_emb)
                else:
                    preds_hidden = seq_encoding(
                        emb_input=preds_embedding, len_input=preds_len,
                        encoder=pred_encoder, reuse=self.reuse
                    )   # (ds * sc_max_len, path_max_len, dim_hidden)
                pred_final_hidden, pred_att_mat = self.aggregate_within_path(
                    qwords_hidden=qwords_hidden, qwords_len=qwords_len,
                    pitems_hidden=preds_hidden, pitems_len=preds_len,
                    item_max_len=self.path_max_len, item_agg_mode=self.preds_agg_mode)
            with tf.variable_scope('pwords', reuse=self.reuse):
                if pword_encoder is None:
                    pwords_hidden = pwords_embedding
                    # (ds * sc_max_len, pword_max_len, dim_hidden=dim_emb)
                else:
                    pwords_hidden = seq_encoding(
                        emb_input=pwords_embedding, len_input=pwords_len,
                        encoder=pword_encoder, reuse=self.reuse
                    )   # (ds * sc_max_len, pword_max_len, dim_hidden)
                pword_final_hidden, pword_att_mat = self.aggregate_within_path(
                    qwords_hidden=qwords_hidden, qwords_len=qwords_len,
                    pitems_hidden=pwords_hidden, pitems_len=pwords_len,
                    item_max_len=self.pword_max_len, item_agg_mode=self.pwords_agg_mode)

        # Step 3: 1. merge preds and pwords
        #         2. combine paths into schemas
        #         3. produce the final score
        # path_merge_mode: Max: max pooling
        #                  Sum: simple summation
        with tf.name_scope('PathMerge'):
            assert not (pword_final_hidden is None and pred_final_hidden is None)
            if pword_final_hidden is None:      # information comes from preds only
                path_final_hidden = pred_final_hidden
            elif pred_final_hidden is None:     # information comes from pwords only
                path_final_hidden = pword_final_hidden
            else:   # combine the information from both pwords and preds
                assert self.path_merge_mode in ('Sum', 'Max')
                if self.path_merge_mode == 'Sum':
                    path_final_hidden = tf.add(
                        pword_final_hidden, pred_final_hidden,
                        name='path_final_hidden')   # (ds * sc_max_len, dim_hidden)
                else:
                    path_final_hidden = tf.reduce_max(
                        tf.stack([pword_final_hidden, pred_final_hidden],
                                 axis=0),   # (2, ds * sc_max_len, dim_hidden)
                        axis=0,
                        name='path_final_hidden')   # (ds * sc_max_len, dim_hidden)
            sc_path_hidden = tf.reshape(
                path_final_hidden,
                shape=[-1, self.sc_max_len, self.dim_hidden],
                name='sc_path_hidden')  # (ds, sc_max_len, dim_hidden)
            # max pooling along all paths
            sc_hidden = seq_hidden_max_pooling(seq_hidden_input=sc_path_hidden,
                                               len_input=sc_len)    # (ds, dim_hidden)
        score = cosine_sim(lf_input=q_hidden, rt_input=sc_hidden)   # (ds, )

        if pred_att_mat is not None:
            pred_att_mat = tf.reshape(
                pred_att_mat,
                [-1, self.sc_max_len, self.q_max_len, self.path_max_len],
                name='pred_att_mat')    # (ds, sc_max_len, q_max_len, path_max_len)
        if pword_att_mat is not None:
            pword_att_mat = tf.reshape(
                pword_att_mat,
                [-1, self.sc_max_len, self.q_max_len, self.pword_max_len],
                name='pword_att_mat')   # (ds, sc_max_len, q_max_len, pword_max_len)
        return pred_att_mat, pword_att_mat, score
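# --------------------------------------------------------------------------
# Illustrative sketch (not from the original repo): the 'Sum' / 'Max' path merge
# used above. Stacking the two (N, dim_hidden) tensors along a new leading axis
# and reducing over that axis is just an element-wise maximum.
# demo_merge_paths is a hypothetical name.
import numpy as np

def demo_merge_paths(pword_hidden, pred_hidden, mode='Max'):
    """pword_hidden / pred_hidden: (N, dim_hidden), N = ds * sc_max_len."""
    if mode == 'Sum':
        return pword_hidden + pred_hidden
    # 'Max': equivalent to np.maximum(pword_hidden, pred_hidden)
    return np.stack([pword_hidden, pred_hidden], axis=0).max(axis=0)
# --------------------------------------------------------------------------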
def compute_attention(self, left_tensor, left_len, right_tensor, right_len):
    """
    :param left_tensor:  [B, T1, dim1]
    :param right_tensor: [B, T2, dim2]
    :param left_len:  [B, ] real length of the left tensor
    :param right_len: [B, ] real length of the right tensor
    :return: [B, ] similarity score, [B, T1, T2] attention matrix
    """
    # Fully connected layers to transform both left and right tensors
    # into tensors with `hidden_dim` units
    # [B, T1, dim]
    att_left = tf.contrib.layers.fully_connected(
        inputs=left_tensor,
        num_outputs=self.hidden_dim,
        activation_fn=None,
        scope="att_keys")
    # [B, T2, dim]
    att_right = tf.contrib.layers.fully_connected(
        inputs=right_tensor,
        num_outputs=self.hidden_dim,
        activation_fn=None,
        scope="att_query")
    # [B, T1, 1, dim]
    att_left = tf.expand_dims(att_left, axis=2)
    # [B, T1, T2, dim]
    att_left = tf.tile(att_left, multiples=[1, 1, self.right_max_len, 1])
    # [B, T2, 1, dim]
    att_right = tf.expand_dims(att_right, axis=2)
    # [B, T2, T1, dim]
    att_right = tf.tile(att_right, multiples=[1, 1, self.left_max_len, 1])
    # [B, T1, T2, dim]
    att_right = tf.transpose(att_right, perm=[0, 2, 1, 3])

    v_att = tf.get_variable(name="v_att", shape=[self.hidden_dim], dtype=tf.float32)
    # [B, T1, T2]
    att_matrix = tf.reduce_sum(v_att * tf.tanh(att_left + att_right), axis=3)
    # [B, T1]
    att_val_left = tf.reduce_sum(att_matrix, axis=2)
    # [B, T2]
    att_val_right = tf.reduce_sum(att_matrix, axis=1)

    """
    Kangqi on 180211: A slight mistake here.
    att_matrix hasn't removed the padded elements (att_matrix[i][j]) yet,
    so those elements still contribute to att_val_left / att_val_right.
    The masking process below cannot remove such information.
    """
    # Replace all scores for padded inputs with tf.float32.min
    left_mask = tf.sequence_mask(
        lengths=tf.to_int32(left_len),
        maxlen=tf.to_int32(self.left_max_len),
        dtype=tf.float32)   # [B, T1]
    left_val = att_val_left * left_mask + (1.0 - left_mask) * tf.float32.min
    right_mask = tf.sequence_mask(
        lengths=tf.to_int32(right_len),
        maxlen=tf.to_int32(self.right_max_len),
        dtype=tf.float32)   # [B, T2]
    right_val = att_val_right * right_mask + (1.0 - right_mask) * tf.float32.min

    # Normalize the scores
    left_normalized = tf.nn.softmax(left_val, name="left_normalized")
    right_normalized = tf.nn.softmax(right_val, name="right_normalized")

    # Calculate the weighted average of the attention inputs
    # according to the attention values
    # [B, T1, 1] * [B, T1, dim] --> [B, T1, dim] --> [B, dim]
    left_weighted = tf.expand_dims(left_normalized, axis=2) * left_tensor
    left_weighted = tf.reduce_sum(left_weighted, axis=1)
    # [B, dim]
    right_weighted = tf.expand_dims(right_normalized, axis=2) * right_tensor
    right_weighted = tf.reduce_sum(right_weighted, axis=1)

    # Kangqi edit: cosine similarity works much better than a fully connected output layer.
    # score = tf.contrib.layers.fully_connected(
    #     inputs=tf.concat([left_weighted, right_weighted], axis=1),
    #     num_outputs=1,
    #     activation_fn=None,
    #     scope="output")
    score = cosine_sim(lf_input=left_weighted, rt_input=right_weighted)

    # Kangqi edit: return more items.
    # return score, att_matrix
    # Kangqi edit: we need the masked att_matrix, with 0 padded on the unused rows / columns.
    left_cube_mask = tf.stack([left_mask] * self.right_max_len, axis=-1)    # [B, T1, T2]
    right_cube_mask = tf.stack([right_mask] * self.left_max_len, axis=1)    # [B, T1, T2]
    masked_att_matrix = att_matrix * left_cube_mask * right_cube_mask       # [B, T1, T2]

    return left_weighted, right_weighted, masked_att_matrix, score
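# --------------------------------------------------------------------------
# Illustrative sketch (not from the original repo): the additive (Bahdanau-style)
# attention matrix computed above, att[b, i, j] = v . tanh(W_l x_i + W_r y_j),
# written with NumPy broadcasting instead of expand_dims/tile. The names
# demo_additive_att_matrix, w_left, w_right and v are hypothetical; in the
# original code the two projections are fully connected layers.
import numpy as np

def demo_additive_att_matrix(left, right, w_left, w_right, v):
    """
    left:  (B, T1, d1), right: (B, T2, d2)
    w_left: (d1, h), w_right: (d2, h), v: (h, )
    returns att_matrix: (B, T1, T2)
    """
    att_left = left @ w_left                                        # (B, T1, h)
    att_right = right @ w_right                                     # (B, T2, h)
    combined = np.tanh(att_left[:, :, None, :] + att_right[:, None, :, :])  # (B, T1, T2, h)
    return np.sum(v * combined, axis=-1)                            # (B, T1, T2)
# --------------------------------------------------------------------------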
def _rm_final_merge(self, path_repr, sent_repr, path_cates, path_size, scope_name):
    """
    Kernel part of rm_final_merge.
    :param path_repr:   (ds, path_max_len, dim_path_hidden)
    :param sent_repr:   (ds, path_max_len, dim_hidden)
    :param path_cates:  (ds, path_max_len, 5)
    :param path_size:   (ds, )
    :param scope_name:
    """
    with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
        dim_path_hidden = path_repr.get_shape().as_list()[-1]
        if self.scoring_mode == 'compact':
            sent_repr = seq_hidden_max_pooling(seq_hidden_input=sent_repr, len_input=path_size)
            path_repr = seq_hidden_max_pooling(seq_hidden_input=path_repr, len_input=path_size)
            # (ds, dim_xx_hidden)
        else:
            assert self.scoring_mode in ('separated', 'bao')
            sent_repr = tf.reshape(sent_repr, [-1, self.dim_hidden])
            path_repr = tf.reshape(path_repr, [-1, dim_path_hidden])
            # (ds*path_max_size, dim_xx_hidden)

        """ Now apply the final scoring function """
        if self.final_func == 'dot':
            assert dim_path_hidden == self.dim_hidden
            merge_score = tf.reduce_sum(sent_repr * path_repr, axis=-1, name='merge_score')
        elif self.final_func == 'cos':
            assert dim_path_hidden == self.dim_hidden
            merge_score = cosine_sim(lf_input=sent_repr, rt_input=path_repr)
        elif self.final_func == 'bilinear':
            bilinear_mat = tf.get_variable(name='bilinear_mat',
                                           shape=[dim_path_hidden, self.dim_hidden],
                                           dtype=tf.float32,
                                           initializer=tf.contrib.layers.xavier_initializer())
            proj_repr = tf.matmul(path_repr, bilinear_mat, name='proj_repr')
            merge_score = tf.reduce_sum(sent_repr * proj_repr, axis=-1, name='merge_score')
        else:
            assert self.final_func.startswith('fc')
            hidden_size = int(self.final_func[2:])
            concat_repr = tf.concat([sent_repr, path_repr], axis=-1, name='concat_repr')
            concat_hidden = tf.contrib.layers.fully_connected(
                inputs=concat_repr,
                num_outputs=hidden_size,
                activation_fn=tf.nn.relu,
                scope='fc1',
                reuse=tf.AUTO_REUSE
            )   # (ds / ds*path_max_len, hidden_size)
            merge_score = tf.contrib.layers.fully_connected(
                inputs=concat_hidden,
                num_outputs=1,
                activation_fn=None,
                scope='fc2',
                reuse=tf.AUTO_REUSE
            )   # (ds / ds*path_max_len, 1)
            merge_score = tf.squeeze(merge_score, axis=-1, name='merge_score')

        """ Add the scores together when working in separated / bao mode """
        if self.scoring_mode == 'compact':
            rm_score = merge_score
            rm_final_feats = tf.expand_dims(rm_score, -1, 'rm_final_feats')     # (ds, 1)
        else:
            assert self.scoring_mode in ('separated', 'bao')
            merge_score = tf.reshape(merge_score, [-1, self.path_max_size])     # (ds, path_max_size)
            path_mask = tf.sequence_mask(
                lengths=path_size,
                maxlen=self.path_max_size,
                dtype=tf.float32,
                name='path_mask')   # (ds, path_max_size) as mask
            if self.scoring_mode == 'separated':
                rm_score = tf.reduce_sum(merge_score * path_mask, axis=-1, name='rm_score')  # (ds, )
                rm_final_feats = tf.expand_dims(rm_score, -1, 'rm_final_feats')  # (ds, 1)
            else:
                # Imitate Bao's implementation, caring about the detailed path category
                mask_score_3d = tf.expand_dims(
                    merge_score * path_mask, axis=1,
                    name='mask_score_3d')   # (ds, 1, path_max_size)
                rm_final_feats = tf.squeeze(
                    tf.matmul(mask_score_3d, path_cates),   # (ds, 1, 5)
                    axis=1, name='rm_final_feats')          # (ds, 5)
                rm_score_2d = tf.contrib.layers.fully_connected(
                    inputs=rm_final_feats,
                    num_outputs=1,
                    activation_fn=None,
                    scope='out_fc',
                    reuse=tf.AUTO_REUSE
                )   # (ds, 1)
                rm_score = tf.squeeze(rm_score_2d, axis=-1, name='rm_score')
    return rm_final_feats, rm_score
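# --------------------------------------------------------------------------
# Illustrative sketch (not from the original repo): the 'bilinear' final_func
# above, score = sent . (path W), where W maps the path representation into the
# sentence space before a row-wise dot product. demo_bilinear_score is a
# hypothetical name.
import numpy as np

def demo_bilinear_score(sent_repr, path_repr, bilinear_mat):
    """
    sent_repr: (N, dim_hidden), path_repr: (N, dim_path_hidden)
    bilinear_mat: (dim_path_hidden, dim_hidden)
    returns: (N, ) one merge score per (sentence, path) row
    """
    proj_repr = path_repr @ bilinear_mat                            # (N, dim_hidden)
    return np.sum(sent_repr * proj_repr, axis=-1)                   # (N, )
# --------------------------------------------------------------------------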