import math

import tensorflow as tf

# Helper ops used below (mask_for_high_rank, exp_mask_for_high_rank, dropout,
# bn_dense_layer, linear, get_logits, softsel, scaled_tanh, add_reg_without_bias)
# are defined elsewhere in this repository.


def cnn_for_sentence_encoding(  # Kim (2014)-style CNN sentence encoder
        rep_tensor, rep_mask, filter_sizes=(3, 4, 5), num_filters=200, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    """
    :param rep_tensor: rank-3 input tensor [bs, sl, vec]
    :param rep_mask: boolean mask [bs, sl]
    :param filter_sizes: convolution window widths
    :param num_filters: number of feature maps per filter size
    :param scope: variable scope name
    :param is_train: boolean tensor switching dropout on/off
    :param keep_prob: dropout keep probability
    :param wd: weight-decay coefficient for L2 regularization
    :return: sentence encoding [bs, len(filter_sizes) * num_filters]
    """
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'cnn_for_sentence_encoding'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)
        rep_tensor_expand = tf.expand_dims(rep_tensor, 3)  # bs, sl, vec, 1
        rep_tensor_expand_dp = dropout(rep_tensor_expand, keep_prob, is_train)

        # Create a convolution + max-pooling layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution layer
                filter_shape = [filter_size, ivec, 1, num_filters]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [num_filters], tf.float32)
                conv = tf.nn.conv2d(
                    rep_tensor_expand_dp, W,
                    strides=[1, 1, 1, 1], padding="VALID", name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs, sl-fs+1, 1, fn
                # Max-pooling over time (equivalent to tf.nn.max_pool with
                # ksize [1, sl - filter_size + 1, 1, 1])
                pooled = tf.reduce_max(h, 1, True)  # bs, 1, 1, fn
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

        if wd > 0.:
            add_reg_without_bias()

        return h_pool_flat
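# Usage sketch (illustrative, not part of the module): how the Kim-style encoder
# above is typically wired into a TF1 graph. The placeholder names and
# hyper-parameter values are assumptions, and the dropout helper is assumed to
# accept a boolean `is_train` tensor.
def _example_sentence_cnn():
    token_emb = tf.placeholder(tf.float32, [None, None, 300], 'token_emb')  # bs, sl, vec
    token_mask = tf.placeholder(tf.bool, [None, None], 'token_mask')        # bs, sl
    is_train = tf.placeholder(tf.bool, [], 'is_train')

    sent_code = cnn_for_sentence_encoding(
        token_emb, token_mask, filter_sizes=(3, 4, 5), num_filters=200,
        scope='sent_enc', is_train=is_train, keep_prob=0.8, wd=1e-5)
    return sent_code  # bs, 3 * 200: fixed-size sentence encoding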
def hierarchical_cnn_res_gate(
        rep_tensor, rep_mask, n_gram=5, layer_num=5, hn=None, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    # padding so that the output sequence length equals the input length
    if n_gram % 2 == 1:
        padding_front = padding_back = int((n_gram - 1) / 2)
    else:
        padding_front = (n_gram - 1) // 2
        padding_back = padding_front + 1
    padding = [[0, 0], [padding_front, padding_back], [0, 0], [0, 0]]

    # lengths
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec

    with tf.variable_scope(scope or 'hierarchical_cnn_res_gate'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)  # bs, sl, hn

        iter_rep = rep_tensor
        layer_res_list = []

        for layer_idx in range(layer_num):
            with tf.variable_scope("conv_maxpool_%s" % layer_idx):
                iter_rep_etd = tf.expand_dims(iter_rep, 3)  # bs, sl, hn, 1
                iter_rep_etd_dp = dropout(iter_rep_etd, keep_prob, is_train)

                # Convolution layer: the first layer consumes the original feature
                # size, deeper layers consume the gated hidden size
                feature_size = org_ivec if layer_idx == 0 else ivec
                filter_shape = [n_gram, feature_size, 1, 2 * ivec]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [2 * ivec], tf.float32)

                iter_rep_etd_pad = tf.pad(iter_rep_etd_dp, padding)
                conv = tf.nn.conv2d(
                    iter_rep_etd_pad, W,
                    strides=[1, 1, 1, 1], padding="VALID", name="conv")
                map_res = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs, sl, 1, 2*hn
                map_res = tf.squeeze(map_res, [2])  # bs, sl, 2*hn

                # gate: half the channels gate the other half (GLU-style)
                map_res_a, map_res_b = tf.split(map_res, num_or_size_splits=2, axis=2)
                iter_rep = map_res_a * tf.nn.sigmoid(map_res_b)

                # residual connection to the previous layer's output
                if len(layer_res_list) > 0:
                    iter_rep = iter_rep + layer_res_list[-1]

                layer_res_list.append(iter_rep)

        if wd > 0.:
            add_reg_without_bias()

        return iter_rep
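# The per-layer gate above is the gated-linear-unit (GLU) pattern: the convolution
# produces 2*hn channels, half of which gate the other half. A minimal sketch of
# just the gate + residual step, on a stand-in `conv_out` tensor (hypothetical name):
def _glu_with_residual(conv_out, prev_rep=None):
    value, gate = tf.split(conv_out, num_or_size_splits=2, axis=2)  # each bs, sl, hn
    gated = value * tf.nn.sigmoid(gate)  # gated linear unit
    if prev_rep is not None:
        gated = gated + prev_rep  # residual from the previous layer's output
    return gated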
def time_aware_attention(train_inputs, embed, mask, embedding_size, k):
    with tf.variable_scope('time_aware_attention'):
        attn_weights = tf.Variable(
            tf.truncated_normal([embedding_size, k], stddev=1.0 / math.sqrt(k)))
        attn_biases = tf.Variable(tf.zeros([k]))
        # add the bias to the weights
        attn_embed = tf.nn.bias_add(attn_weights, attn_biases)
        # project the embeddings: bs, sl, k
        attn_scalars = tf.tensordot(embed, attn_embed, axes=[[2], [0]])
        # absolute time gap between events
        train_delta = tf.abs(train_inputs[:, :, 1])
        # distance function is log(dist + 1)
        dist_fun = tf.log(tf.to_float(train_delta) + 1.0)
        # reshape dist_fun to bs, sl, 1
        dist_fun = tf.reshape(
            dist_fun, [tf.shape(dist_fun)[0], tf.shape(dist_fun)[1], 1])

        # attention logits
        attn_logits = tf.multiply(attn_scalars, dist_fun)
        # sum of the attention logits: bs, sl, 1
        attn_logits_sum = tf.reduce_sum(attn_logits, -1, keepdims=True)
        attn_logits_sum = exp_mask_for_high_rank(attn_logits_sum, mask)

        # attention weights via softmax over the sequence dimension
        attn_softmax = tf.nn.softmax(attn_logits_sum, 1)

        # weighted sum of the embeddings
        attn_embed_weighted = tf.multiply(attn_softmax, embed)
        attn_embed_weighted = mask_for_high_rank(attn_embed_weighted, mask)
        reduced_embed = tf.reduce_sum(attn_embed_weighted, 1)

        # two scaling factors: log of the sequence length and the sum of squared
        # attention weights
        scalar1 = tf.log(tf.to_float(tf.shape(embed)[1]) + 1.0)
        scalar2 = tf.reduce_sum(tf.pow(attn_softmax, 2), 1)
        # scale the reduced embedding
        reduced_embed = tf.multiply(reduced_embed, scalar1)
        reduced_embed = tf.multiply(reduced_embed, scalar2)

        return reduced_embed, attn_embed_weighted
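# Usage sketch (illustrative): feeding the time-aware attention. It assumes, as the
# slice train_inputs[:, :, 1] above implies, that channel 1 of `train_inputs` holds
# the time gap; using channel 0 as an item id for the embedding lookup is an
# additional assumption, as are the sizes below.
def _example_time_aware_attention():
    embedding_size, k, vocab_size = 128, 64, 10000
    train_inputs = tf.placeholder(tf.int32, [None, None, 2], 'train_inputs')  # bs, sl, 2
    mask = tf.placeholder(tf.bool, [None, None], 'mask')                      # bs, sl

    embedding_table = tf.get_variable(
        'embedding_table', [vocab_size, embedding_size], tf.float32)
    embed = tf.nn.embedding_lookup(embedding_table, train_inputs[:, :, 0])    # bs, sl, emb

    reduced_embed, weighted = time_aware_attention(
        train_inputs, embed, mask, embedding_size, k)
    return reduced_embed, weighted  # bs, emb and bs, sl, emb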
def cnn_for_context_fusion(
        rep_tensor, rep_mask, filter_sizes=(3, 4, 5), num_filters=200, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'cnn_for_context_fusion'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)
        rep_tensor_expand = tf.expand_dims(rep_tensor, 3)  # bs, sl, vec, 1
        rep_tensor_expand_dp = dropout(rep_tensor_expand, keep_prob, is_train)

        # Create a convolution layer for each filter size (no pooling, so the
        # sequence dimension is preserved)
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution layer
                filter_shape = [filter_size, ivec, 1, num_filters]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [num_filters], tf.float32)

                # padding along the sequence dimension so the output length equals sl
                if filter_size % 2 == 1:
                    padding_front = padding_back = int((filter_size - 1) / 2)
                else:
                    padding_front = (filter_size - 1) // 2
                    padding_back = padding_front + 1
                padding = [[0, 0], [padding_front, padding_back], [0, 0], [0, 0]]
                rep_tensor_expand_dp_pad = tf.pad(rep_tensor_expand_dp, padding)

                conv = tf.nn.conv2d(
                    rep_tensor_expand_dp_pad, W,
                    strides=[1, 1, 1, 1], padding="VALID", name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs, sl, 1, fn
                h_squeeze = tf.squeeze(h, [2])  # bs, sl, fn
                pooled_outputs.append(h_squeeze)

        # Concatenate the feature maps of all filter sizes
        result = tf.concat(pooled_outputs, 2)  # bs, sl, len(filter_sizes) * fn

        if wd > 0.:
            add_reg_without_bias()

        return result
def normal_attention(rep_tensor, rep_mask, scope=None, keep_prob=1., is_train=None,
                     wd=0., activation='elu', tensor_dict=None, name=None):
    batch_size, code_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(
        rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'normal_attention'):
        rep_tensor_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                        activation, False, wd, keep_prob, is_train)

        rep_tensor_logits = get_logits([rep_tensor_map], None, False,
                                       scope='self_attn_logits', mask=rep_mask,
                                       input_keep_prob=keep_prob,
                                       is_train=is_train)  # bs, sl
        attn_result = softsel(rep_tensor, rep_tensor_logits, rep_mask)  # bs, vec

        # save attention weights for inspection
        if tensor_dict is not None and name is not None:
            tensor_dict[name] = tf.nn.softmax(rep_tensor_logits)

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # fusion gate between the mapped input and the attention result
            fusion_gate = tf.nn.sigmoid(
                linear(rep_tensor_map, ivec, True, 0., 'linear_fusion_i',
                       False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a',
                       False, wd, keep_prob, is_train) +
                o_bias)
            output = fusion_gate * rep_tensor_map + (1 - fusion_gate) * attn_result
        output = mask_for_high_rank(output, rep_mask)  # bs, sl, vec
        return output
def pooling_with_mask(rep_tensor, rep_mask, method='max', scope=None):
    # rep_tensor has one more rank than rep_mask
    with tf.name_scope(scope or '%s_pooling' % method):
        if method == 'max':
            rep_tensor_masked = exp_mask_for_high_rank(rep_tensor, rep_mask)
            output = tf.reduce_max(rep_tensor_masked, -2)
        elif method == 'mean':
            rep_tensor_masked = mask_for_high_rank(rep_tensor, rep_mask)  # [..., sl, hn]
            rep_sum = tf.reduce_sum(rep_tensor_masked, -2)  # [..., hn]
            denominator = tf.reduce_sum(tf.cast(rep_mask, tf.int32), -1, True)  # [..., 1]
            # avoid division by zero for all-masked sequences
            denominator = tf.where(
                tf.equal(denominator, tf.zeros_like(denominator, tf.int32)),
                tf.ones_like(denominator, tf.int32),
                denominator)
            output = rep_sum / tf.cast(denominator, tf.float32)
        else:
            raise AttributeError('No pooling method named %s' % method)
        return output
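# Small sanity-check sketch (illustrative): masked max vs. masked mean pooling on a
# toy tensor where only the first two steps are valid, run in a TF1 session.
def _example_pooling_with_mask():
    rep = tf.constant([[[0., 1., 2.],
                        [3., 4., 5.],
                        [6., 7., 8.],
                        [9., 10., 11.]]])            # bs=1, sl=4, hn=3
    msk = tf.constant([[True, True, False, False]])  # padded steps are masked out

    max_pooled = pooling_with_mask(rep, msk, method='max')    # max over valid steps only
    mean_pooled = pooling_with_mask(rep, msk, method='mean')  # divides by the true length (2)

    with tf.Session() as sess:
        return sess.run([max_pooled, mean_pooled])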
def visit_sa_with_dense(rep_tensor, keep_prob=1., is_train=None, wd=0.,
                        activation='relu', hn=None, is_scale=True, is_plus_sa=True):
    batch_size, sw_len, vec_size = tf.shape(rep_tensor)[0], tf.shape(
        rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope('temporal_attention'):
        # mask generation: disallow attention to the same position
        attn_mask = tf.cast(
            tf.diag(-tf.ones([sw_len], tf.int32)) + 1, tf.bool)  # code_len, code_len

        # non-linear map for the context
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1),
                               [1, sw_len, 1, 1])  # bs, sl, sl, vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs, sl, sl, vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(
                rep_map_dp, ivec, False,
                scope='linear_dependent')  # batch_size, code_len, vec_size
            dependent_etd = tf.expand_dims(
                dependent, 1)  # batch_size, 1, code_len, vec_size
            head = linear(
                rep_map_dp, ivec, False,
                scope='linear_head')  # batch_size, code_len, vec_size
            head_etd = tf.expand_dims(
                head, 2)  # batch_size, code_len, 1, vec_size

            if is_plus_sa:
                attention_fact = dependent_etd + head_etd + f_bias
            else:
                # without the additive self-attention term, return the mapped input
                return rep_map

            if is_scale:
                logits = scaled_tanh(attention_fact, 5.0)  # bs, sl, sl, vec
            else:
                logits = linear(tf.nn.tanh(attention_fact), ivec, True,
                                scope='linear_attn_fact')

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs, sl, sl, vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)
            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs, sl, vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # fusion gate between the mapped input and the attention result
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i',
                       False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a',
                       False, wd, keep_prob, is_train) +
                o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result

        return output
def directional_attention_with_dense(rep_tensor, rep_mask, direction=None,
                                     scope=None, keep_prob=1., is_train=None,
                                     wd=0., activation='elu', tensor_dict=None,
                                     name=None, hn=None):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(
        rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or ivec
    with tf.variable_scope(scope or 'directional_attention_%s' % (direction or 'diag')):
        # mask generation
        sl_indices = tf.range(sl, dtype=tf.int32)
        sl_col, sl_row = tf.meshgrid(sl_indices, sl_indices)
        if direction is None:
            # attend to all positions except the token itself
            direct_mask = tf.cast(
                tf.diag(-tf.ones([sl], tf.int32)) + 1, tf.bool)
        else:
            if direction == 'forward':
                # attend only to earlier positions
                direct_mask = tf.greater(sl_row, sl_col)
            else:
                # attend only to later positions
                direct_mask = tf.greater(sl_col, sl_row)
        direct_mask_tile = tf.tile(tf.expand_dims(direct_mask, 0),
                                   [bs, 1, 1])  # bs, sl, sl
        rep_mask_tile = tf.tile(tf.expand_dims(rep_mask, 1), [1, sl, 1])  # bs, sl, sl
        attn_mask = tf.logical_and(direct_mask_tile, rep_mask_tile)  # bs, sl, sl

        # non-linear map
        rep_map = bn_dense_layer(rep_tensor, ivec, True, 0., 'bn_dense_map',
                                 activation, False, wd, keep_prob, is_train)
        rep_map_tile = tf.tile(tf.expand_dims(rep_map, 1),
                               [1, sl, 1, 1])  # bs, sl, sl, vec
        rep_map_dp = dropout(rep_map, keep_prob, is_train)

        # attention
        with tf.variable_scope('attention'):  # bs, sl, sl, vec
            f_bias = tf.get_variable('f_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            dependent = linear(rep_map_dp, ivec, False,
                               scope='linear_dependent')  # bs, sl, vec
            dependent_etd = tf.expand_dims(dependent, 1)  # bs, 1, sl, vec
            head = linear(rep_map_dp, ivec, False, scope='linear_head')  # bs, sl, vec
            head_etd = tf.expand_dims(head, 2)  # bs, sl, 1, vec

            logits = scaled_tanh(dependent_etd + head_etd + f_bias, 5.0)  # bs, sl, sl, vec

            logits_masked = exp_mask_for_high_rank(logits, attn_mask)
            attn_score = tf.nn.softmax(logits_masked, 2)  # bs, sl, sl, vec
            attn_score = mask_for_high_rank(attn_score, attn_mask)
            attn_result = tf.reduce_sum(attn_score * rep_map_tile, 2)  # bs, sl, vec

        with tf.variable_scope('output'):
            o_bias = tf.get_variable('o_bias', [ivec], tf.float32,
                                     tf.constant_initializer(0.))
            # fusion gate between the mapped input and the attention result
            fusion_gate = tf.nn.sigmoid(
                linear(rep_map, ivec, True, 0., 'linear_fusion_i',
                       False, wd, keep_prob, is_train) +
                linear(attn_result, ivec, True, 0., 'linear_fusion_a',
                       False, wd, keep_prob, is_train) +
                o_bias)
            output = fusion_gate * rep_map + (1 - fusion_gate) * attn_result
            output = mask_for_high_rank(output, rep_mask)

        # save attention tensors for inspection
        if tensor_dict is not None and name is not None:
            tensor_dict[name + '_dependent'] = dependent
            tensor_dict[name + '_head'] = head
            tensor_dict[name] = attn_score
            tensor_dict[name + '_gate'] = fusion_gate

        return output
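# Usage sketch (illustrative): running the directional attention in both directions
# and concatenating the results, in the spirit of a bi-directional self-attention
# block. The placeholder names, sizes and hyper-parameters are assumptions.
def _example_bi_directional_attention():
    rep = tf.placeholder(tf.float32, [None, None, 300], 'rep')    # bs, sl, vec
    rep_mask = tf.placeholder(tf.bool, [None, None], 'rep_mask')  # bs, sl
    is_train = tf.placeholder(tf.bool, [], 'is_train')

    fw = directional_attention_with_dense(
        rep, rep_mask, direction='forward', scope='dir_attn_fw',
        keep_prob=0.8, is_train=is_train, wd=1e-5)
    bw = directional_attention_with_dense(
        rep, rep_mask, direction='backward', scope='dir_attn_bw',
        keep_prob=0.8, is_train=is_train, wd=1e-5)
    return tf.concat([fw, bw], -1)  # bs, sl, 2*vec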