def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False, wd=0.0,
           input_keep_prob=1.0, is_train=None):
    if args is None or (isinstance(args, (tuple, list)) and not args):
        raise ValueError("`args` must be specified")
    if not isinstance(args, (tuple, list)):
        args = [args]

    # flatten each arg to rank 2, i.e. [-1, d], for the dense layer
    flat_args = [flatten(arg, 1) for arg in args]
    if input_keep_prob < 1.0:
        assert is_train is not None
        # apply dropout to the flattened inputs only at training time
        flat_args = [tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob), lambda: arg)
                     for arg in flat_args]
    flat_out = _linear(flat_args, output_size, bias, bias_start=bias_start, scope=scope)  # dense
    # restore the leading dims of the first arg
    out = reconstruct(flat_out, args[0], 1)
    if squeeze:
        out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
    if wd:
        add_reg_without_bias()
    return out
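
# A minimal usage sketch (illustration only, not from the original code base);
# it assumes `tf` is imported at module level and that `flatten`/`reconstruct`
# keep all leading dims while mapping the last one. The placeholder names are
# hypothetical.
def _demo_linear():  # hypothetical helper
    x = tf.placeholder(tf.float32, [None, None, 300])  # bs, sl, vec
    is_train = tf.placeholder(tf.bool, [])
    # maps the last dim 300 -> 128 while keeping the leading dims
    return linear([x], 128, bias=True, scope='demo_linear',
                  input_keep_prob=0.8, is_train=is_train)  # bs, sl, 128
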
def cnn_for_sentence_encoding(  # Kim-style sentence CNN
        rep_tensor, rep_mask, filter_sizes=(3, 4, 5), num_filters=200, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    """
    Sentence encoding CNN (Kim, 2014): parallel convolutions + max-pooling over time.
    :param rep_tensor: [bs, sl, vec] input token representations
    :param rep_mask: [bs, sl] boolean token mask
    :param filter_sizes: n-gram widths of the parallel convolutions
    :param num_filters: number of feature maps per filter size
    :param scope: variable scope name
    :param is_train: boolean tensor switching dropout on/off
    :param keep_prob: dropout keep probability
    :param wd: weight-decay coefficient; adds L2 regularization if > 0
    :return: [bs, num_filters * len(filter_sizes)] sentence encoding
    """
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'cnn_for_sentence_encoding'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)
        rep_tensor_expand = tf.expand_dims(rep_tensor, 3)  # bs, sl, ivec, 1
        rep_tensor_expand_dp = dropout(rep_tensor_expand, keep_prob, is_train)

        # Create a convolution + max-pooling layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution layer
                filter_shape = [filter_size, ivec, 1, num_filters]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [num_filters], tf.float32)
                conv = tf.nn.conv2d(
                    rep_tensor_expand_dp, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs, sl-fs+1, 1, fn
                # Max-pooling over time (equivalent to tf.nn.max_pool with
                # ksize=[1, sl - filter_size + 1, 1, 1] and VALID padding)
                pooled = tf.reduce_max(h, 1, True)  # bs, 1, 1, fn
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

        if wd > 0.:
            add_reg_without_bias()
        return h_pool_flat
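
# Usage sketch (hypothetical, not from the original code base): encode
# variable-length sentences into a fixed 600-d vector (3 filter sizes x 200
# filters). `emb` and `mask` are assumed inputs.
def _demo_cnn_sentence_encoding():  # hypothetical helper
    emb = tf.placeholder(tf.float32, [None, None, 300])  # bs, sl, vec
    mask = tf.placeholder(tf.bool, [None, None])         # bs, sl
    is_train = tf.placeholder(tf.bool, [])
    return cnn_for_sentence_encoding(emb, mask, (3, 4, 5), 200,
                                     scope='demo_kim_cnn', is_train=is_train,
                                     keep_prob=0.8, wd=1e-4)  # bs, 600
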
def contextual_bi_rnn(tensor_rep, mask_rep, hn, cell_type, only_final=False,
                      wd=0., keep_prob=1., is_train=None, scope=None):
    """
    Fuse contextual information with a bi-directional RNN.
    :param tensor_rep: [..., sl, vec]
    :param mask_rep: [..., sl]
    :param hn: hidden size of each direction
    :param cell_type: 'gru', 'lstm', 'basic_lstm' or 'basic_rnn'
    :param only_final: if True, return only the last valid state
    :param wd: weight-decay coefficient; adds L2 regularization if > 0
    :param keep_prob: dropout keep probability
    :param is_train: boolean tensor switching dropout on/off
    :param scope: variable scope name
    :return: [..., sl, 2*hn] if only_final is False, else [..., 2*hn]
    """
    with tf.variable_scope(scope or 'contextual_bi_rnn'):
        reuse = None if not tf.get_variable_scope().reuse else True
        if cell_type == 'gru':
            cell_fw = tf.contrib.rnn.GRUCell(hn, reuse=reuse)
            cell_bw = tf.contrib.rnn.GRUCell(hn, reuse=reuse)
        elif cell_type == 'lstm':
            cell_fw = tf.contrib.rnn.LSTMCell(hn, reuse=reuse)
            cell_bw = tf.contrib.rnn.LSTMCell(hn, reuse=reuse)
        elif cell_type == 'basic_lstm':
            cell_fw = tf.contrib.rnn.BasicLSTMCell(hn, reuse=reuse)
            cell_bw = tf.contrib.rnn.BasicLSTMCell(hn, reuse=reuse)
        elif cell_type == 'basic_rnn':
            cell_fw = tf.contrib.rnn.BasicRNNCell(hn, reuse=reuse)
            cell_bw = tf.contrib.rnn.BasicRNNCell(hn, reuse=reuse)
        else:
            raise AttributeError('no cell type \'%s\'' % cell_type)
        cell_dp_fw = SwitchableDropoutWrapper(cell_fw, is_train, keep_prob)
        cell_dp_bw = SwitchableDropoutWrapper(cell_bw, is_train, keep_prob)

        tensor_len = tf.reduce_sum(tf.cast(mask_rep, tf.int32), -1)  # [bs]
        (outputs_fw, outputs_bw), _ = bidirectional_dynamic_rnn(
            cell_dp_fw, cell_dp_bw, tensor_rep, tensor_len, dtype=tf.float32)
        rnn_outputs = tf.concat([outputs_fw, outputs_bw], -1)  # [..., sl, 2*hn]

        if wd > 0:
            add_reg_without_bias()
        if not only_final:
            return rnn_outputs  # [..., sl, 2*hn]
        else:
            return get_last_state(rnn_outputs, mask_rep)  # [..., 2*hn]
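
# Usage sketch (hypothetical, not from the original code base): contextualize
# token embeddings with a bi-GRU and keep the per-token outputs.
def _demo_contextual_bi_rnn():  # hypothetical helper
    emb = tf.placeholder(tf.float32, [None, None, 300])  # bs, sl, vec
    mask = tf.placeholder(tf.bool, [None, None])         # bs, sl
    is_train = tf.placeholder(tf.bool, [])
    return contextual_bi_rnn(emb, mask, 100, 'gru', only_final=False,
                             wd=1e-4, keep_prob=0.8, is_train=is_train,
                             scope='demo_bi_gru')  # bs, sl, 200
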
def hierarchical_cnn_res_gate(
        rep_tensor, rep_mask, n_gram=5, layer_num=5, hn=None, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    # padding so a VALID convolution preserves the sequence length
    if n_gram % 2 == 1:
        padding_front = padding_back = int((n_gram - 1) / 2)
    else:
        padding_front = (n_gram - 1) // 2
        padding_back = padding_front + 1
    padding = [[0, 0], [padding_front, padding_back], [0, 0], [0, 0]]

    # lengths
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    org_ivec = rep_tensor.get_shape().as_list()[2]
    ivec = hn or org_ivec

    with tf.variable_scope(scope or 'hierarchical_cnn_res_gate'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)  # bs, sl, org_ivec

        iter_rep = rep_tensor
        layer_res_list = []
        for layer_idx in range(layer_num):
            with tf.variable_scope("conv_maxpool_%s" % layer_idx):
                iter_rep_etd = tf.expand_dims(iter_rep, 3)  # bs, sl, hn, 1
                iter_rep_etd_dp = dropout(iter_rep_etd, keep_prob, is_train)

                # Convolution layer: 2*ivec feature maps, so the output can be
                # split into a value half and a gate half
                feature_size = org_ivec if layer_idx == 0 else ivec
                filter_shape = [n_gram, feature_size, 1, 2 * ivec]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [2 * ivec], tf.float32)

                iter_rep_etd_pad = tf.pad(iter_rep_etd_dp, padding)
                conv = tf.nn.conv2d(
                    iter_rep_etd_pad, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                map_res = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs, sl, 1, 2*ivec
                map_res = tf.squeeze(map_res, [2])  # bs, sl, 2*ivec

                # gate: value * sigmoid(gate), as in gated linear units
                map_res_a, map_res_b = tf.split(map_res, num_or_size_splits=2, axis=2)
                iter_rep = map_res_a * tf.nn.sigmoid(map_res_b)

                # residual connection to the previous layer's output
                if len(layer_res_list) > 0:
                    iter_rep = iter_rep + layer_res_list[-1]
                layer_res_list.append(iter_rep)

        if wd > 0.:
            add_reg_without_bias()
        return iter_rep
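
# Usage sketch (hypothetical, not from the original code base): stack 5 gated
# residual conv layers, each with a 5-gram receptive field; the output keeps
# the input sequence length.
def _demo_hierarchical_cnn_res_gate():  # hypothetical helper
    emb = tf.placeholder(tf.float32, [None, None, 300])  # bs, sl, vec
    mask = tf.placeholder(tf.bool, [None, None])         # bs, sl
    is_train = tf.placeholder(tf.bool, [])
    return hierarchical_cnn_res_gate(emb, mask, n_gram=5, layer_num=5, hn=200,
                                     scope='demo_glu_cnn', is_train=is_train,
                                     keep_prob=0.8, wd=1e-4)  # bs, sl, 200
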
def cnn_for_context_fusion(
        rep_tensor, rep_mask, filter_sizes=(3, 4, 5), num_filters=200, scope=None,
        is_train=None, keep_prob=1., wd=0.):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'cnn_for_context_fusion'):
        rep_tensor = mask_for_high_rank(rep_tensor, rep_mask)
        rep_tensor_expand = tf.expand_dims(rep_tensor, 3)  # bs, sl, ivec, 1
        rep_tensor_expand_dp = dropout(rep_tensor_expand, keep_prob, is_train)

        # Create a convolution layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                # Convolution layer
                filter_shape = [filter_size, ivec, 1, num_filters]
                W = tf.get_variable('W', filter_shape, tf.float32)
                b = tf.get_variable('b', [num_filters], tf.float32)

                # pad along the sequence so the output length equals sl
                if filter_size % 2 == 1:
                    padding_front = padding_back = int((filter_size - 1) / 2)
                else:
                    padding_front = (filter_size - 1) // 2
                    padding_back = padding_front + 1
                padding = [[0, 0], [padding_front, padding_back], [0, 0], [0, 0]]
                rep_tensor_expand_dp_pad = tf.pad(rep_tensor_expand_dp, padding)

                conv = tf.nn.conv2d(
                    rep_tensor_expand_dp_pad, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")  # bs, sl, 1, fn
                h_squeeze = tf.squeeze(h, [2])  # bs, sl, fn
                pooled_outputs.append(h_squeeze)

        # Concatenate the per-filter-size feature maps
        result = tf.concat(pooled_outputs, 2)  # bs, sl, len(filter_sizes) * fn
        if wd > 0.:
            add_reg_without_bias()
        return result
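
# Usage sketch (hypothetical, not from the original code base): same-length
# context fusion; each position gets 3 x 200 = 600 features from the three
# padded convolutions.
def _demo_cnn_context_fusion():  # hypothetical helper
    emb = tf.placeholder(tf.float32, [None, None, 300])  # bs, sl, vec
    mask = tf.placeholder(tf.bool, [None, None])         # bs, sl
    is_train = tf.placeholder(tf.bool, [])
    return cnn_for_context_fusion(emb, mask, (3, 4, 5), 200,
                                  scope='demo_ctx_cnn', is_train=is_train,
                                  keep_prob=0.8, wd=1e-4)  # bs, sl, 600
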
def multi_head_attention(rep_tensor, rep_mask, head_num=8, hidden_units_num=64,
                         scope=None, is_train=None, keep_prob=1., wd=0.):
    bs, sl, vec = tf.shape(rep_tensor)[0], tf.shape(rep_tensor)[1], tf.shape(rep_tensor)[2]
    ivec = rep_tensor.get_shape().as_list()[2]
    with tf.variable_scope(scope or 'multi_head_attention'):

        with tf.variable_scope('positional_encoding'):
            # sinusoidal positional encoding: sin on even feature indices,
            # cos on odd ones
            seq_idxs = tf.tile(tf.expand_dims(tf.range(sl), 1), [1, ivec])  # sl, ivec
            feature_idxs = tf.tile(tf.expand_dims(tf.range(ivec), 0), [sl, 1])  # sl, ivec
            pos_enc = tf.where(
                tf.equal(tf.mod(feature_idxs, 2), 0),
                tf.sin(tf.cast(seq_idxs, tf.float32) /
                       tf.pow(10000., 2.0 * tf.cast(feature_idxs, tf.float32) / (1.0 * ivec))),
                tf.cos(tf.cast(seq_idxs, tf.float32) /
                       tf.pow(10000., 2.0 * tf.cast(feature_idxs - 1, tf.float32) / (1.0 * ivec))),
            )
            rep_tensor_pos = mask_for_high_rank(rep_tensor + pos_enc, rep_mask)  # bs, sl, ivec

        with tf.variable_scope('multi_head_attention'):
            W = tf.get_variable('W', [3, head_num, ivec, hidden_units_num], tf.float32)

            # project the input to queries, keys and values for all heads at once
            rep_tile = tf.tile(
                tf.expand_dims(tf.expand_dims(rep_tensor_pos, 0), 0),
                [3, head_num, 1, 1, 1])  # 3, head_num, bs, sl, ivec
            rep_tile_reshape = tf.reshape(
                rep_tile, [3, head_num, bs * sl, ivec])  # 3, head_num, bs*sl, ivec

            maps = tf.reshape(  # 3, head_num, bs*sl, hn -> 3, head_num, bs, sl, hn
                tf.matmul(dropout(rep_tile_reshape, keep_prob, is_train), W),
                [3, head_num, bs, sl, hidden_units_num])
            Q_map, K_map, V_map = tf.split(maps, 3, 0)
            Q_map = tf.squeeze(Q_map, [0])  # head_num, bs, sl, hn
            K_map = tf.squeeze(K_map, [0])  # head_num, bs, sl, hn
            V_map = tf.squeeze(V_map, [0])  # head_num, bs, sl, hn

            # scaled dot-product similarity: head_num, bs, sl, sl
            similarity_mat = tf.matmul(
                Q_map, tf.transpose(K_map, [0, 1, 3, 2])) / math.sqrt(1. * hidden_units_num)

            # mask: bs, sl -> head_num, bs, sl, sl
            multi_mask = tf.tile(tf.expand_dims(rep_mask, 0), [head_num, 1, 1])  # head_num, bs, sl
            multi_mask_tile_1 = tf.expand_dims(multi_mask, 2)  # head_num, bs, 1, sl
            multi_mask_tile_2 = tf.expand_dims(multi_mask, 3)  # head_num, bs, sl, 1
            multi_mask_tile = tf.logical_and(
                multi_mask_tile_1, multi_mask_tile_2)  # head_num, bs, sl, sl

            similarity_mat_masked = exp_mask(similarity_mat, multi_mask_tile)  # head_num, bs, sl, sl
            prob_dist = tf.nn.softmax(similarity_mat_masked)  # head_num, bs, sl, sl
            prob_dist_dp = dropout(prob_dist, keep_prob, is_train)

            attn_res = tf.matmul(prob_dist_dp, V_map)  # head_num, bs, sl, hn
            attn_res_tran = tf.transpose(attn_res, [1, 2, 0, 3])  # bs, sl, head_num, hn
            output = tf.reshape(attn_res_tran, [bs, sl, head_num * hidden_units_num])

            if wd > 0.:
                add_reg_without_bias()
            return output
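
# Usage sketch (hypothetical, not from the original code base): 8-head scaled
# dot-product self-attention with the sinusoidal positional encoding added
# above; the output concatenates the heads to bs, sl, 8*64.
def _demo_multi_head_attention():  # hypothetical helper
    emb = tf.placeholder(tf.float32, [None, None, 512])  # bs, sl, ivec
    mask = tf.placeholder(tf.bool, [None, None])         # bs, sl
    is_train = tf.placeholder(tf.bool, [])
    return multi_head_attention(emb, mask, head_num=8, hidden_units_num=64,
                                scope='demo_mh_attn', is_train=is_train,
                                keep_prob=0.9, wd=1e-4)  # bs, sl, 512
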