def __init__(self, FLAGS=None):
    self.FLAGS = FLAGS
    self.config = config
    self.diff_len = config.max_diff_len
    self.seq_len = config.max_sent_len
    self.embed_size = config.word_dim
    self.num_class = config.num_class
    self.lstm_size = config.lstm_size

    # Add Word Embedding
    self.we = tf.Variable(FLAGS.we, name='emb')

    # Add PlaceHolder
    # define the basic input layers - for warrant0, warrant1, reason, claim, debate
    self.input_warrant0 = tf.placeholder(tf.int32, (None, self.seq_len), name='warrant0')  # [batch_size, sent_len]
    self.input_warrant1 = tf.placeholder(tf.int32, (None, self.seq_len), name='warrant1')  # [batch_size, sent_len]
    self.input_reason = tf.placeholder(tf.int32, (None, self.seq_len), name='reason')      # [batch_size, sent_len]
    self.input_claim = tf.placeholder(tf.int32, (None, self.seq_len), name='claim')        # [batch_size, sent_len]
    self.input_debate = tf.placeholder(tf.int32, (None, self.seq_len), name='debate')      # [batch_size, sent_len]
    self.warrant0_len = tf.placeholder(tf.int32, (None,), name='warrant0_len')  # [batch_size]
    self.warrant1_len = tf.placeholder(tf.int32, (None,), name='warrant1_len')  # [batch_size]
    self.reason_len = tf.placeholder(tf.int32, (None,), name='reason_len')      # [batch_size]
    self.claim_len = tf.placeholder(tf.int32, (None,), name='claim_len')        # [batch_size]
    self.debate_len = tf.placeholder(tf.int32, (None,), name='debate_len')      # [batch_size]
    self.target_label = tf.placeholder(tf.int32, (None, self.num_class), name='label')  # [batch_size, num_class]
    self.drop_keep_rate = tf.placeholder(tf.float32)
    self.learning_rate = tf.placeholder(tf.float32)

    self.input_diff_warrant0 = tf.placeholder(tf.int32, (None, self.diff_len), name='diff_warrant0')  # [batch_size, diff_len]
    self.input_diff_warrant1 = tf.placeholder(tf.int32, (None, self.diff_len), name='diff_warrant1')  # [batch_size, diff_len]
    self.diff_warrant0_len = tf.placeholder(tf.int32, (None,), name='diff_warrant0_len')  # [batch_size]
    self.diff_warrant1_len = tf.placeholder(tf.int32, (None,), name='diff_warrant1_len')  # [batch_size]
    self.input_diff_claim = tf.placeholder(tf.int32, (None, self.diff_len), name='diff_claim')
    self.diff_claim_len = tf.placeholder(tf.int32, (None,), name='diff_claim_len')

    # now define embedded layers of the input
    embedded_warrant0 = tf.nn.embedding_lookup(self.we, self.input_warrant0)
    embedded_warrant1 = tf.nn.embedding_lookup(self.we, self.input_warrant1)
    embedded_reason = tf.nn.embedding_lookup(self.we, self.input_reason)
    embedded_claim = tf.nn.embedding_lookup(self.we, self.input_claim)
    embedded_debate = tf.nn.embedding_lookup(self.we, self.input_debate)
    embedded_diff_warrant0 = tf.nn.embedding_lookup(self.we, self.input_diff_warrant0)
    embedded_diff_warrant1 = tf.nn.embedding_lookup(self.we, self.input_diff_warrant1)
    embedded_diff_claim = tf.nn.embedding_lookup(self.we, self.input_diff_claim)

    def conv_ngram(input_x, filter_sizes=(1, 2, 3), num_filters=32):
        """Conv n-gram layer: per-position n-gram features, concatenated over filter sizes."""
        sent_len = input_x.get_shape()[1]
        embed_size = input_x.get_shape()[2]
        input_x = tf.expand_dims(input_x, axis=-1)
        outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope("conv-maxpool-%s" % filter_size):
                filter_shape = [filter_size, embed_size, 1, num_filters]
                W = tf.get_variable("W", filter_shape, initializer=tf.random_normal_initializer())
                b = tf.get_variable("b", [num_filters], initializer=tf.constant_initializer(0.0))
                conv = tf.nn.conv2d(input_x, W, strides=[1, 1, embed_size, 1], padding='SAME', name="conv")
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                h = tf.squeeze(h, axis=2)
                outputs.append(h)
        outputs = tf.concat(outputs, axis=2)
        return outputs

    with tf.variable_scope("conv") as s:
        conv_warrant0 = conv_ngram(embedded_warrant0)
        s.reuse_variables()
        conv_warrant1 = conv_ngram(embedded_warrant1)
        conv_reason = conv_ngram(embedded_reason)
        conv_claim = conv_ngram(embedded_claim)
        conv_debate = conv_ngram(embedded_debate)
        conv_diff_warrant0 = conv_ngram(embedded_diff_warrant0)
        conv_diff_warrant1 = conv_ngram(embedded_diff_warrant1)
        conv_diff_claim = conv_ngram(embedded_diff_claim)

    def AttBiLSTM(attention_vector, input_x, input_x_len, hidden_size, rnn_type='lstm', return_sequence=True):
        """Attention-augmented bidirectional RNN layer."""
        if rnn_type == 'lstm':
            Cell = AttBasicLSTMCell
        elif rnn_type == 'gru':
            Cell = AttGRUCell
        else:
            raise NotImplementedError
        cell_fw = Cell(attention_vector, num_units=hidden_size)
        cell_bw = Cell(attention_vector, num_units=hidden_size)
        b_outputs, b_states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, input_x,
                                                              sequence_length=input_x_len, dtype=tf.float32)
        if return_sequence:
            outputs = tf.concat(b_outputs, axis=2)
        else:
            # lstm states: (c, h) per direction
            if rnn_type == 'lstm':
                outputs = tf.concat([b_states[0][1], b_states[1][1]], axis=-1)
            elif rnn_type == 'gru':
                outputs = tf.concat(b_states, axis=-1)
            else:
                raise NotImplementedError
        return outputs

    pooling_diff_warrant0 = tf_utils.MaxPooling(conv_diff_warrant0, self.diff_warrant0_len)
    pooling_diff_warrant1 = tf_utils.MaxPooling(conv_diff_warrant1, self.diff_warrant1_len)
    pooling_diff_claim = tf_utils.MaxPooling(conv_diff_claim, self.diff_claim_len)

    with tf.variable_scope("att_warrant_lstm") as s:
        bilstm_warrant0 = AttBiLSTM(pooling_diff_warrant0, conv_warrant0, self.warrant0_len,
                                    self.lstm_size, rnn_type=FLAGS.rnn_type)
        s.reuse_variables()
        bilstm_warrant1 = AttBiLSTM(pooling_diff_warrant1, conv_warrant1, self.warrant1_len,
                                    self.lstm_size, rnn_type=FLAGS.rnn_type)
        # pass rnn_type here as well so that variable reuse also works when FLAGS.rnn_type != 'lstm'
        bilstm_claim = AttBiLSTM(pooling_diff_claim, conv_claim, self.claim_len,
                                 self.lstm_size, rnn_type=FLAGS.rnn_type)

    with tf.variable_scope("bi_lstm") as s:
        bilstm_reason = tf_utils.BiLSTM(conv_reason, self.reason_len, self.lstm_size, rnn_type=FLAGS.rnn_type)
        s.reuse_variables()
        # bilstm_claim = tf_utils.BiLSTM(conv_claim, self.claim_len, self.lstm_size, rnn_type=FLAGS.rnn_type)
        bilstm_debate = tf_utils.BiLSTM(conv_debate, self.debate_len, self.lstm_size, rnn_type=FLAGS.rnn_type)

    with tf.variable_scope("pooling"):
        ''' Pooling Layer '''
        pooling_warrant0 = tf_utils.MaxPooling(bilstm_warrant0, self.warrant0_len)
        pooling_warrant1 = tf_utils.MaxPooling(bilstm_warrant1, self.warrant1_len)
        pooling_reason = tf_utils.MaxPooling(bilstm_reason, self.reason_len)
        pooling_claim = tf_utils.MaxPooling(bilstm_claim, self.claim_len)
        pooling_debate = tf_utils.MaxPooling(bilstm_debate, self.debate_len)

    attention_vector_for_W0 = tf.concat([pooling_debate, pooling_reason, pooling_warrant0,
                                         pooling_claim, pooling_diff_warrant0], axis=-1)
    attention_vector_for_W1 = tf.concat([pooling_debate, pooling_reason, pooling_warrant1,
                                         pooling_claim, pooling_diff_warrant1], axis=-1)

    with tf.variable_scope("att_lstm") as s:
        attention_warrant0 = AttBiLSTM(attention_vector_for_W0, bilstm_warrant0, self.warrant0_len,
                                       self.lstm_size, rnn_type=FLAGS.rnn_type, return_sequence=False)
        s.reuse_variables()
        attention_warrant1 = AttBiLSTM(attention_vector_for_W1, bilstm_warrant1, self.warrant1_len,
                                       self.lstm_size, rnn_type=FLAGS.rnn_type, return_sequence=False)

    self.attention_warrant0 = attention_warrant0
    self.attention_warrant1 = attention_warrant1

    # concatenate the warrant representations and their interactions
    merge_warrant = tf.concat([pooling_reason * pooling_claim,
                               attention_warrant0, attention_warrant1,
                               attention_warrant0 - attention_warrant1,
                               attention_warrant0 * attention_warrant1], axis=-1)
    dropout_warrant = tf.nn.dropout(merge_warrant, self.drop_keep_rate)

    # and add one extra layer with ReLU
    with tf.variable_scope("linear"):
        dense1 = tf.nn.relu(tf_utils.linear(dropout_warrant, int(self.lstm_size / 2), bias=True, scope='dense'))
        logits = tf_utils.linear(dense1, self.num_class, bias=True, scope='logit')

    # Obtain the Predict, Loss, Train_op
    predict_prob = tf.nn.softmax(logits, name='predict_prob')
    predict_label = tf.cast(tf.argmax(logits, axis=1), tf.int32)
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.target_label)
    loss = tf.reduce_mean(loss)

    # Build the train op
    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_op = tf_utils.optimize(loss, 'adam', FLAGS.lambda_l2, self.learning_rate, global_step, FLAGS.clipper)

    self.predict_prob = predict_prob
    self.predict_label = predict_label
    self.loss = loss
    self.train_op = train_op
    self.global_step = global_step

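# Usage sketch (illustrative, not part of the original repo): one training step for the
# graph above in TF1.x. The class name `ArgModel` and the batch arrays (`w0_ids`, ...)
# are assumptions; the placeholders and fetches are the ones defined in __init__.
#
# model = ArgModel(FLAGS)
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     feed = {
#         model.input_warrant0: w0_ids, model.warrant0_len: w0_lens,
#         model.input_warrant1: w1_ids, model.warrant1_len: w1_lens,
#         model.input_reason: r_ids,    model.reason_len: r_lens,
#         model.input_claim: c_ids,     model.claim_len: c_lens,
#         model.input_debate: d_ids,    model.debate_len: d_lens,
#         model.input_diff_warrant0: dw0_ids, model.diff_warrant0_len: dw0_lens,
#         model.input_diff_warrant1: dw1_ids, model.diff_warrant1_len: dw1_lens,
#         model.input_diff_claim: dc_ids,     model.diff_claim_len: dc_lens,
#         model.target_label: labels,
#         model.drop_keep_rate: 0.8,
#         model.learning_rate: 1e-3,
#     }
#     _, step, batch_loss = sess.run([model.train_op, model.global_step, model.loss], feed_dict=feed)
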
def __init__(self, FLAGS=None):
    self.FLAGS = FLAGS
    self.config = config
    self.epoch_step = tf.Variable(0, trainable=False, name="Epoch_Step")
    self.epoch_increment = tf.assign(self.epoch_step, tf.add(self.epoch_step, tf.constant(1)))
    self.seq_len = config.max_sent_len
    self.embed_size = config.word_dim
    self.num_class = config.num_class
    self.filter_sizes = [1, 2, 3, 4]
    self.num_filters = FLAGS.num_filters
    self.initializer = tf.random_normal_initializer(stddev=0.1)

    # Add PlaceHolder
    self.input_x = tf.placeholder(tf.int32, (None, self.seq_len))  # [batch_size, sent_len]
    self.input_x_len = tf.placeholder(tf.int32, (None,))
    self.input_y = tf.placeholder(tf.int32, (None, self.num_class))
    # self.mlp_h1_size = 200
    # self.mlp_h2_size = 140
    self.drop_keep_rate = tf.placeholder(tf.float32)
    self.drop_hidden1 = tf.placeholder(tf.float32)
    self.drop_hidden2 = tf.placeholder(tf.float32)
    self.learning_rate = tf.placeholder(tf.float32)

    # Add Word Embedding
    self.we = tf.Variable(FLAGS.we, name='emb')

    # Build the Computation Graph
    def CNN(input_x, seq_len, filter_sizes, num_filters=1, dropout_rate=None):
        """
        CNN Layer
        Args:
            input_x: [batch, sent_len, emb_size, 1]
            seq_len: int
            filter_sizes: list
            num_filters: int
            dropout_rate: float
        Returns:
            outputs: [batch, num_filters * len(filter_sizes)]
        """
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("convolution-pooling-%s" % filter_size):
                # a. create the filter: [filter_height, filter_width, in_channels, out_channels]
                filter = tf.get_variable("filter-%s" % filter_size,
                                         [filter_size, self.embed_size, 1, num_filters],
                                         initializer=self.initializer)
                # b. convolution (NHWC input, VALID padding):
                #    output shape [batch_size, sent_len - filter_size + 1, 1, num_filters]
                conv = tf.nn.conv2d(input_x, filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                # c. apply non-linearity
                b = tf.get_variable("b-%s" % filter_size, [num_filters])
                h = tf.nn.relu(tf.nn.bias_add(conv, b), "relu")
                # d. max-pooling over the whole sequence: output [batch_size, 1, 1, num_filters]
                pooled = tf.nn.max_pool(h, ksize=[1, seq_len - filter_size + 1, 1, 1],
                                        strides=[1, 1, 1, 1], padding='VALID', name="pool")
                pooled_outputs.append(pooled)
        # combine all pooled features and flatten the result
        # e.g. x1 = tf.ones([3, 3]); x2 = tf.ones([3, 3]); x = [x1, x2]
        #      tf.concat(x, 0) -> shape [6, 3]; tf.concat(x, 1) -> shape [3, 6]
        # shape: [batch_size, 1, 1, num_filters_total], where num_filters_total = num_filters * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs, -1)
        num_filters_total = num_filters * len(filter_sizes)
        # flatten to [batch_size, num_filters_total]
        outputs = tf.reshape(h_pool, [-1, num_filters_total])
        # add dropout
        if dropout_rate is not None:
            outputs = tf.nn.dropout(outputs, keep_prob=dropout_rate)  # [batch_size, num_filters_total]
        return outputs

    # CNN encoder over the word embeddings
    inputs = tf.nn.embedding_lookup(self.we, self.input_x)  # [batch_size, sent_len, emb_size]
    inputs_embeddings_expanded = tf.expand_dims(inputs, -1)
    cnn_x = CNN(inputs_embeddings_expanded, self.seq_len, self.filter_sizes, self.num_filters, self.drop_keep_rate)

    # hidden1 = tf.nn.relu(tf_utils.linear(cnn_x, self.mlp_h1_size, bias=True, scope='h1'))
    # hidden1_drop = tf.nn.dropout(hidden1, keep_prob=self.drop_keep_rate)
    # hidden2 = tf.nn.relu(tf_utils.linear(hidden1_drop, self.mlp_h2_size, bias=True, scope='h2'))
    # hidden2_drop = tf.nn.dropout(hidden2, keep_prob=self.drop_keep_rate)
    logits = tf_utils.linear(cnn_x, self.num_class, bias=True, scope='softmax')

    # Obtain the Predict, Loss, Train_op
    predict_prob = tf.nn.softmax(logits, name='predict_prob')
    predict_label = tf.cast(tf.argmax(logits, 1), tf.int32)
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.input_y)
    loss = tf.reduce_mean(loss)
    l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if v.get_shape().ndims > 1])
    reg_loss = loss + FLAGS.lambda_l2 * l2_loss

    # Build the train op
    global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    # optimizer = tf.train.AdadeltaOptimizer(self.learning_rate)
    # optimizer = tf.train.AdagradOptimizer(self.learning_rate)
    if FLAGS.clipper:
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), FLAGS.clipper)
        # pass global_step so it is incremented on every update, as in the unclipped branch
        train_op = optimizer.apply_gradients(list(zip(grads, tvars)), global_step=global_step)
    else:
        train_op = optimizer.minimize(loss, global_step=global_step)

    self.predict_prob = predict_prob
    self.predict_label = predict_label
    self.seq_res = cnn_x
    self.logits = logits
    self.loss = loss
    self.reg_loss = reg_loss
    self.train_op = train_op
    self.global_step = global_step

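# Shape sketch (illustrative numbers, not from the repo): with seq_len = 30,
# embed_size = 300, filter_sizes = [1, 2, 3, 4] and num_filters = 64, each branch
# of the CNN layer above produces
#
#   conv   : [batch, 30 - k + 1, 1, 64]   for filter width k
#   pooled : [batch, 1, 1, 64]            (max over time)
#   concat : [batch, 1, 1, 256] -> reshape -> [batch, 256]
#
# so the encoder output fed to the softmax layer is [batch, num_filters * len(filter_sizes)].
# The real values come from config.max_sent_len, config.word_dim and FLAGS.num_filters.
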
def __init__(self, FLAGS=None):
    self.FLAGS = FLAGS
    self.config = config
    self.epoch_step = tf.Variable(0, trainable=False, name="Epoch_Step")
    self.epoch_increment = tf.assign(self.epoch_step, tf.add(self.epoch_step, tf.constant(1)))
    self.seq_len = config.max_sent_len
    self.word_len = config.max_word_len
    self.word_embed_size = config.word_dim
    self.char_embed_size = config.char_dim
    self.num_class = config.num_class
    self.num_vocab = FLAGS.num_vocab
    self.initializer = tf.random_normal_initializer(stddev=0.1)
    self.filter_sizes = [1, 2, 3, 4]
    self.num_filters = FLAGS.num_filters
    self.char_lstm_size = 50
    self.lstm_size = 512
    self.mlp_h1_size = 200
    self.layer_size = FLAGS.layer_size
    self.with_char = FLAGS.with_char
    self.char_type = FLAGS.char_type
    self.with_ner = FLAGS.with_ner
    self.with_pos = FLAGS.with_pos
    self.with_rf = FLAGS.with_rf
    self.with_pun = FLAGS.with_pun
    self.with_senti = FLAGS.with_senti
    self.with_attention = FLAGS.with_attention
    self.with_cnn = FLAGS.with_cnn
    self.with_cnn_lstm = FLAGS.with_cnn_lstm
    self.drop_keep_rate = tf.placeholder(tf.float32)
    self.drop_hidden1 = tf.placeholder(tf.float32)
    self.learning_rate = tf.placeholder(tf.float32)

    # Add PlaceHolder
    self.input_x = tf.placeholder(tf.int32, (None, self.seq_len))    # [batch_size, sent_len]
    self.input_x_len = tf.placeholder(tf.int32, (None,))             # [batch_size]
    self.input_y = tf.placeholder(tf.int32, (None, self.num_class))  # [batch_size, num_class]

    # Add Word Embedding
    self.we = tf.Variable(FLAGS.we, name='emb')
    if self.with_ner:
        self.input_x_ner = tf.placeholder(tf.int32, (None, self.seq_len))
        self.ner_we = tf.Variable(FLAGS.ner_we, name='ner_emb')
    if self.with_pos:
        self.input_x_pos = tf.placeholder(tf.int32, (None, self.seq_len))
        self.pos_we = tf.Variable(FLAGS.pos_we, name='pos_emb')
    if self.with_rf:
        self.input_rf = tf.placeholder(tf.float32, (None, self.num_vocab))
    if self.with_pun:
        self.input_x_pun = tf.placeholder(tf.float32, (None, 9))
    if self.with_senti:
        self.input_x_senti = tf.placeholder(tf.float32, (None, 110))
    if self.with_char:
        # [batch_size, sent_len, word_len]
        self.input_x_char = tf.placeholder(tf.int32, (None, self.seq_len, self.word_len))
        self.input_x_char_len = tf.placeholder(tf.int32, (None, self.seq_len))  # [batch_size, sent_len]
        # the char embedding is randomly initialized
        self.char_we = tf.Variable(FLAGS.char_we, name='char_emb')

    # attention process:
    # 1. get logits for each word in the sentence.
    # 2. get a probability distribution over the words in the sentence.
    # 3. get the weighted sum of the word states as the sentence representation.
    def attention_word_level(hidden_state, hidden_size, sequence_length, seq_len, scope=None, reuse=None):
        """
        hidden_state: [batch_size, sequence_length, hidden_size*2]
        Returns: sentence representation [batch_size, hidden_size*2]
        """
        with tf.variable_scope(scope or "attention", reuse=reuse):
            self.W_w_attention_word = tf.get_variable("W_w_attention_word",
                                                      shape=[hidden_size * 2, hidden_size * 2])
            self.W_b_attention_word = tf.get_variable("W_b_attention_word", shape=[hidden_size * 2])
            # TODO: is it OK to use batch_size in the first dimension?
            self.context_vecotor_word = tf.get_variable("what_is_the_informative_word",
                                                        shape=[hidden_size * 2])
            # 0) one layer of feed-forward network
            # shape: [batch_size*sequence_length, hidden_size*2]
            hidden_state_ = tf.reshape(hidden_state, shape=[-1, hidden_size * 2])
            # hidden_state_: [batch_size*sequence_length, hidden_size*2]
            # W_w_attention_word: [hidden_size*2, hidden_size*2]
            hidden_representation = tf.nn.tanh(tf.matmul(hidden_state_, self.W_w_attention_word)
                                               + self.W_b_attention_word)
            # shape: [batch_size, sequence_length, hidden_size*2]
            hidden_representation = tf.reshape(hidden_representation,
                                               shape=[-1, sequence_length, hidden_size * 2])
            # 1) get logits for each word in the sentence:
            #    element-wise product with the context vector, then sum -> one weight per word
            hidden_state_context_similiarity = tf.multiply(hidden_representation, self.context_vecotor_word)
            # shape: [batch_size, sequence_length]
            attention_logits = tf.reduce_sum(hidden_state_context_similiarity, axis=2)
            # subtract the max for numerical stability (softmax is shift invariant); shape: [batch_size, 1]
            attention_logits_max = tf.reduce_max(attention_logits, axis=1, keep_dims=True)
            # 2) probability distribution over the words (normalization); shape: [batch_size, sequence_length]
            p_attention = tf.nn.softmax(attention_logits - attention_logits_max)
            # 3) weight the hidden states by the attention vector; shape: [batch_size, sequence_length, 1]
            p_attention_expanded = tf.expand_dims(p_attention, axis=2)
            # [batch_size, sequence_length, hidden_size*2]
            sentence_representation = tf.multiply(p_attention_expanded, hidden_state)
            sentence_representation = tf_utils.Mask(sentence_representation, seq_len, config.max_sent_len)
            # shape: [batch_size, hidden_size*2]
            sentence_representation = tf.reduce_sum(sentence_representation, axis=1)
            return sentence_representation

    def BiLSTM(input_x, input_x_len, hidden_size, num_layers=1, dropout_rate=None, return_sequence=True):
        """
        Update 2017.11.21: fix a bug
        ref: https://stackoverflow.com/questions/44615147/valueerror-trying-to-share-variable-rnn-multi-rnn-cell-cell-0-basic-lstm-cell-k
        ======
        BiLSTM Layer
        Args:
            input_x: [batch, sent_len, emb_size]
            input_x_len: [batch, ]
            hidden_size: int
            num_layers: int
            dropout_rate: float
            return_sequence: True/False
        Returns:
            outputs: [batch, sent_len, hidden_size*2] if return_sequence else [batch, hidden_size*2]
        """
        def lstm_cell():
            return tf.contrib.rnn.BasicLSTMCell(hidden_size)
        # cell = tf.contrib.rnn.GRUCell(hidden_size)
        if num_layers >= 1:
            # Warning: each layer must get its own cell instance; stacking the same cell object
            # raises a variable-sharing error (see the referenced StackOverflow question).
            cell_fw = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(num_layers)])
            cell_bw = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(num_layers)])
        if dropout_rate is not None:
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, output_keep_prob=dropout_rate)
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, output_keep_prob=dropout_rate)
        b_outputs, b_states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, input_x,
                                                              sequence_length=input_x_len, dtype=tf.float32)
        if return_sequence:
            # b_outputs: ([b, sl, h], [b, sl, h])
            outputs = tf.concat(b_outputs, axis=2)
        else:
            # b_states: (([b, c], [b, h]), ([b, c], [b, h]))
            outputs = tf.concat([b_states[0][1], b_states[1][1]], axis=-1)
        return outputs

    def CNN(input_x, seq_len, filter_sizes, num_filters, embed_size, dropout_rate=None):
        """
        CNN Layer
        Args:
            input_x: [batch, sent_len, emb_size, 1]
            seq_len: int
            filter_sizes: list
            num_filters: int
            embed_size: int
            dropout_rate: float
        Returns:
            outputs: [batch, num_filters * len(filter_sizes)]
        """
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("convolution-pooling-%s" % filter_size):
                # a. create the filter: [filter_height, filter_width, in_channels, out_channels]
                filter = tf.get_variable("filter-%s" % filter_size,
                                         [filter_size, embed_size, 1, num_filters],
                                         initializer=self.initializer)
                # b. convolution (NHWC input, VALID padding):
                #    output shape [batch_size, seq_len - filter_size + 1, 1, num_filters]
                conv = tf.nn.conv2d(input_x, filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                # c. apply non-linearity
                b = tf.get_variable("b-%s" % filter_size, [num_filters])
                h = tf.nn.relu(tf.nn.bias_add(conv, b), "relu")
                # d. max-pooling over time: output [batch_size, 1, 1, num_filters]
                pooled = tf.nn.max_pool(h, ksize=[1, seq_len - filter_size + 1, 1, 1],
                                        strides=[1, 1, 1, 1], padding='VALID', name="pool")
                pooled_outputs.append(pooled)
        # combine all pooled features and flatten: [batch_size, num_filters * len(filter_sizes)]
        h_pool = tf.concat(pooled_outputs, -1)
        num_filters_total = num_filters * len(filter_sizes)
        outputs = tf.reshape(h_pool, [-1, num_filters_total])
        # add dropout
        if dropout_rate is not None:
            outputs = tf.nn.dropout(outputs, keep_prob=dropout_rate)
        return outputs

    # Build the Computation Graph
    embedded_x = tf.nn.embedding_lookup(self.we, self.input_x)  # [batch_size, sent_len, word_emb_size]
    batch_size = tf.shape(embedded_x)[0]

    if self.with_char:
        if self.char_type == 'lstm':
            # [batch_size, sent_len, word_len, char_emb_size]
            embedded_x_char = tf.nn.embedding_lookup(self.char_we, self.input_x_char)
            # [batch_size * sent_len, word_len, char_emb_size]
            embedded_x_char = tf.reshape(embedded_x_char, [-1, self.word_len, self.char_embed_size])
            input_x_char_lens = tf.reshape(self.input_x_char_len, [-1])
            with tf.variable_scope("char_bilstm"):
                # [batch_size * sent_len, word_len, char_lstm_size * 2]
                char_lstm_x = BiLSTM(embedded_x_char, input_x_char_lens, self.char_lstm_size,
                                     dropout_rate=1.0, return_sequence=True)
                char_lstm_x = char_lstm_x[:, -1, :]
                char_x = tf.reshape(char_lstm_x, [batch_size, self.seq_len, self.char_lstm_size * 2])
        if self.char_type == 'cnn':
            embedded_x_char = tf.nn.embedding_lookup(self.char_we, self.input_x_char)
            embedded_x_char = tf.reshape(embedded_x_char, [-1, self.word_len, self.char_embed_size])
            with tf.variable_scope("char_cnn"):
                inputs_char_embeddings_expanded = tf.expand_dims(embedded_x_char, -1)
                char_cnn_x = CNN(inputs_char_embeddings_expanded, self.word_len, self.filter_sizes,
                                 self.num_filters, self.char_embed_size, self.drop_keep_rate)
                num_filters_total = self.num_filters * len(self.filter_sizes)
                char_x = tf.reshape(char_cnn_x, [batch_size, self.seq_len, num_filters_total])

    if self.with_ner:
        embedded_x_ner = tf.nn.embedding_lookup(self.ner_we, self.input_x_ner)
    if self.with_pos:
        embedded_x_pos = tf.nn.embedding_lookup(self.pos_we, self.input_x_pos)

    with tf.variable_scope("seq_bilstm"):
        if self.with_ner:
            embedded_x = tf.concat([embedded_x, embedded_x_ner], axis=-1)
        if self.with_pos:
            embedded_x = tf.concat([embedded_x, embedded_x_pos], axis=-1)
        if self.with_char:
            embedded_x = tf.concat([embedded_x, char_x], axis=-1)
        lstm_x = BiLSTM(embedded_x, self.input_x_len, self.lstm_size, self.layer_size,
                        self.drop_keep_rate, return_sequence=True)

    if self.with_cnn:
        inputs_embeddings_expanded = tf.expand_dims(embedded_x, -1)
        cnn_x = CNN(inputs_embeddings_expanded, self.seq_len, self.filter_sizes, self.num_filters,
                    self.word_embed_size, self.drop_keep_rate)
    if self.with_cnn_lstm:
        inputs_hidden_expanded = tf.expand_dims(lstm_x, -1)
        cnn_x = CNN(inputs_hidden_expanded, self.seq_len, self.filter_sizes, self.num_filters,
                    self.lstm_size * 2, self.drop_keep_rate)

    avg_pooling = tf_utils.AvgPooling(lstm_x, self.input_x_len, self.seq_len)
    max_pooling = tf_utils.MaxPooling(lstm_x, self.input_x_len)
    last_lstm = lstm_x[:, -1, :]
    last_lstm = tf.reshape(last_lstm, [batch_size, self.lstm_size * 2])
    seq_distribution = tf.concat([avg_pooling, max_pooling, last_lstm], axis=-1)

    if self.with_attention:
        attention = attention_word_level(lstm_x, self.lstm_size, self.seq_len, self.input_x_len)
        seq_distribution = tf.concat([last_lstm, attention], axis=-1)
    if self.with_rf:
        seq_distribution = tf.concat([seq_distribution, self.input_rf], axis=-1)
    if self.with_pun:
        seq_distribution = tf.concat([seq_distribution, self.input_x_pun], axis=-1)
    if self.with_senti:
        seq_distribution = tf.concat([seq_distribution, self.input_x_senti], axis=-1)
    if self.with_cnn:
        seq_distribution = tf.concat([seq_distribution, cnn_x], axis=-1)
    if self.with_cnn_lstm:
        seq_distribution = tf.concat([seq_distribution, cnn_x], axis=-1)

    hidden1 = tf.nn.relu(tf_utils.linear(seq_distribution, self.mlp_h1_size, bias=True, scope='h1'))
    logits = tf_utils.linear(hidden1, self.num_class, bias=True, scope='softmax')

    # Obtain the Predict, Loss, Train_op
    predict_prob = tf.nn.softmax(logits, name='predict_prob')
    predict_label = tf.cast(tf.argmax(logits, 1), tf.int32)
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.input_y)
    loss = tf.reduce_mean(loss)
    l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if v.get_shape().ndims > 1])
    reg_loss = loss + FLAGS.lambda_l2 * l2_loss

    # Build the train op
    global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    # optimizer = tf.train.AdadeltaOptimizer(self.learning_rate)
    # optimizer = tf.train.AdagradOptimizer(self.learning_rate)
    if FLAGS.clipper:
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), FLAGS.clipper)
        train_op = optimizer.apply_gradients(list(zip(grads, tvars)), global_step=global_step)
    else:
        train_op = optimizer.minimize(loss, global_step=global_step)

    self.predict_prob = predict_prob
    self.predict_label = predict_label
    self.seq_res = hidden1
    self.logits = logits
    self.loss = loss
    self.reg_loss = reg_loss
    self.train_op = train_op
    self.global_step = global_step

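# Feature-composition sketch (illustrative, not part of the original repo): how the
# FLAGS switches above change the vector fed to the final MLP. Flag names are the
# ones read in __init__; the dimensions are examples only (lstm_size = 512, so the
# BiLSTM output is 1024-d; F = FLAGS.num_filters, filter_sizes = [1, 2, 3, 4]).
#
#   base                    : [avg_pool | max_pool | last_lstm]  -> 3 * 1024
#   with_attention          : [last_lstm | attention]            -> 2 * 1024 (replaces the base concat)
#   with_rf                 : + input_rf       (num_vocab dims)
#   with_pun                : + input_x_pun    (9 dims)
#   with_senti              : + input_x_senti  (110 dims)
#   with_cnn / with_cnn_lstm: + cnn_x          (F * 4 dims)
#
# e.g. with_attention=True and with_pun=True would feed a [batch, 2048 + 9] vector into 'h1'.
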
def __init__(self, FLAGS=None):
    self.FLAGS = FLAGS
    self.config = config
    self.epoch_step = tf.Variable(0, trainable=False, name="Epoch_Step")
    self.epoch_increment = tf.assign(self.epoch_step, tf.add(self.epoch_step, tf.constant(1)))
    self.seq_len = config.max_sent_len
    self.embed_size = config.word_dim
    self.num_class = config.num_class
    self.mlp_h1_size = 140
    self.mlp_h2_size = 140

    # Add PlaceHolder
    self.input_x = tf.placeholder(tf.int32, (None, self.seq_len))  # [batch_size, sent_len]
    self.input_x_len = tf.placeholder(tf.int32, (None,))
    self.input_y = tf.placeholder(tf.int32, (None, self.num_class))
    self.drop_keep_rate = tf.placeholder(tf.float32)
    self.drop_hidden1 = tf.placeholder(tf.float32)
    self.drop_hidden2 = tf.placeholder(tf.float32)
    self.learning_rate = tf.placeholder(tf.float32)

    # Add Word Embedding
    self.we = tf.Variable(FLAGS.we, name='emb')

    # Build the Computation Graph
    inputs = tf.nn.embedding_lookup(self.we, self.input_x)  # [batch_size, sent_len, emb_size]
    avg_pooling = tf_utils.AvgPooling(inputs, self.input_x_len, self.seq_len)
    hidden1 = tf.nn.relu(tf_utils.linear(avg_pooling, self.mlp_h1_size, bias=True, scope='h1'))
    hidden1_drop = tf.nn.dropout(hidden1, keep_prob=self.drop_hidden1)
    hidden2 = tf.nn.relu(tf_utils.linear(hidden1_drop, self.mlp_h2_size, bias=True, scope='h2'))
    hidden2_drop = tf.nn.dropout(hidden2, keep_prob=self.drop_hidden2)
    logits = tf_utils.linear(hidden2_drop, self.num_class, bias=True, scope='softmax')
    # logits = tf_utils.linear(avg_pooling, self.num_class, bias=True, scope='softmax')

    # Obtain the Predict, Loss, Train_op
    predict_prob = tf.nn.softmax(logits, name='predict_prob')
    predict_label = tf.cast(tf.argmax(logits, 1), tf.int32)
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.input_y)
    loss = tf.reduce_mean(loss)
    l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if v.get_shape().ndims > 1])
    reg_loss = loss + FLAGS.lambda_l2 * l2_loss

    # Build the train op
    global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    # optimizer = tf.train.AdagradOptimizer(self.learning_rate)
    if FLAGS.clipper:
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), FLAGS.clipper)
        # pass global_step so it is incremented on every update, as in the unclipped branch
        train_op = optimizer.apply_gradients(list(zip(grads, tvars)), global_step=global_step)
    else:
        train_op = optimizer.minimize(loss, global_step=global_step)

    self.predict_prob = predict_prob
    self.predict_label = predict_label
    self.seq_res = hidden2_drop
    self.logits = logits
    self.loss = loss
    self.reg_loss = reg_loss
    self.train_op = train_op
    self.global_step = global_step

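# Illustrative sketch (assumption, not part of the repo): tf_utils.AvgPooling is used
# above as a length-masked average over the word embeddings. The plain-numpy helper
# below only shows the computation such a pooling layer is expected to perform;
# `_avg_pooling_sketch` is a hypothetical name for illustration.
import numpy as np

def _avg_pooling_sketch(inputs, lengths):
    """inputs: [batch, sent_len, dim] float array; lengths: [batch] int array -> [batch, dim]."""
    _, sent_len, _ = inputs.shape
    # mask out padded positions, then average over the true length of each sentence
    mask = (np.arange(sent_len)[None, :] < lengths[:, None]).astype(inputs.dtype)
    summed = (inputs * mask[:, :, None]).sum(axis=1)
    return summed / np.maximum(lengths[:, None], 1).astype(inputs.dtype)
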
def __init__(self, init='xavier', num_inputs=None, input_dim=None, embed_size=None, l2_w=None, l2_v=None,
             norm=False, real_inputs=None, comb_mask=None, weight_base=0.6, third_prune=False,
             comb_mask_third=None, weight_base_third=0.6, retrain_stage=0):
    self.l2_w = l2_w
    self.l2_v = l2_v
    self.l2_ps = l2_v
    self.third_prune = third_prune
    self.retrain_stage = retrain_stage

    self.inputs, self.labels, self.training = create_placeholder(num_inputs, tf, True)
    inputs, mask, flag, num_inputs = split_data_mask(self.inputs, num_inputs, norm=norm, real_inputs=real_inputs)

    self.xw, self.xv, b, self.xps = embedding_lookup(init=init, input_dim=input_dim, factor=embed_size,
                                                     inputs=inputs, apply_mask=flag, mask=mask,
                                                     third_order=third_prune)
    l = linear(self.xw)

    # second-order (pairwise) feature interactions
    self.cols, self.rows = generate_pairs(range(self.xv.shape[1]), mask=comb_mask)
    t_embedding_matrix = tf.transpose(self.xv, perm=[1, 0, 2])
    left = tf.transpose(tf.gather(t_embedding_matrix, self.rows), perm=[1, 0, 2])
    right = tf.transpose(tf.gather(t_embedding_matrix, self.cols), perm=[1, 0, 2])
    level_2_matrix = tf.reduce_sum(tf.multiply(left, right), axis=-1)
    with tf.variable_scope("edge_weight", reuse=tf.AUTO_REUSE):
        self.edge_weights = tf.get_variable('weights', shape=[len(self.cols)],
                                            initializer=tf.random_uniform_initializer(
                                                minval=weight_base - 0.001,
                                                maxval=weight_base + 0.001))
        normed_wts = tf.identity(self.edge_weights, name="normed_wts")
        tf.add_to_collection("structure", self.edge_weights)
        tf.add_to_collection("edge_weights", self.edge_weights)
        mask = tf.identity(normed_wts, name="unpruned_mask")
        mask = tf.expand_dims(mask, axis=0)
    level_2_matrix = tf.layers.batch_normalization(level_2_matrix, axis=-1, training=self.training,
                                                   reuse=tf.AUTO_REUSE, scale=False, center=False,
                                                   name='prune_BN')
    level_2_matrix *= mask

    if third_prune:
        # third-order (triple) feature interactions
        self.first, self.second, self.third = generate_pairs(range(self.xps.shape[1]),
                                                             mask=comb_mask_third, order=3)
        t_embedding_matrix = tf.transpose(self.xps, perm=[1, 0, 2])
        first_embed = tf.transpose(tf.gather(t_embedding_matrix, self.first), perm=[1, 0, 2])
        second_embed = tf.transpose(tf.gather(t_embedding_matrix, self.second), perm=[1, 0, 2])
        third_embed = tf.transpose(tf.gather(t_embedding_matrix, self.third), perm=[1, 0, 2])
        level_3_matrix = tf.reduce_sum(tf.multiply(tf.multiply(first_embed, second_embed), third_embed), axis=-1)
        with tf.variable_scope("third_edge_weight", reuse=tf.AUTO_REUSE):
            self.third_edge_weights = tf.get_variable('third_weights', shape=[len(self.first)],
                                                      initializer=tf.random_uniform_initializer(
                                                          minval=weight_base_third - 0.001,
                                                          maxval=weight_base_third + 0.001))
            third_normed_wts = tf.identity(self.third_edge_weights, name="third_normed_wts")
            tf.add_to_collection("third_structure", self.third_edge_weights)
            tf.add_to_collection("third_edge_weights", self.third_edge_weights)
            third_mask = tf.identity(third_normed_wts, name="third_unpruned_mask")
            third_mask = tf.expand_dims(third_mask, axis=0)
        level_3_matrix = tf.layers.batch_normalization(level_3_matrix, axis=-1, training=self.training,
                                                       reuse=tf.AUTO_REUSE, scale=False, center=False,
                                                       name="level_3_matrix_BN")
        level_3_matrix *= third_mask

    fm_out = tf.reduce_sum(level_2_matrix, axis=-1)
    if third_prune:
        fm_out2 = tf.reduce_sum(level_3_matrix, axis=-1)
    if third_prune:
        self.logits, self.outputs = output([l, fm_out, fm_out2, b])
    else:
        self.logits, self.outputs = output([l, fm_out, b])

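# Interaction-enumeration sketch (assumption, not part of the original repo): the exact
# behaviour of generate_pairs is defined elsewhere in the codebase. The commented stand-in
# below only illustrates the common unmasked case, where every unordered field pair is
# enumerated so that level_2_matrix[b, k] is the inner product <v_i, v_j> for the k-th pair.
#
# from itertools import combinations
#
# def _all_pairs_sketch(num_fields):
#     """Hypothetical stand-in for generate_pairs(range(num_fields), mask=None)."""
#     rows, cols = zip(*combinations(range(num_fields), 2))
#     return list(cols), list(rows)
#
# # 4 embedding fields -> 6 pairwise interactions:
# # cols = [1, 2, 3, 2, 3, 3], rows = [0, 0, 0, 1, 1, 2]
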
def __init__(self, FLAGS=None):
    self.FLAGS = FLAGS
    self.config = config
    self.seq_len = config.max_sent_len
    self.embed_size = config.word_dim
    self.num_class = config.num_class
    self.lstm_size = config.lstm_size

    # Add Word Embedding
    self.we = tf.Variable(FLAGS.we, name='emb')

    # Add PlaceHolder
    # define the basic input layers - for warrant0, warrant1, reason, claim, debate
    self.input_warrant0 = tf.placeholder(tf.int32, (None, self.seq_len), name='warrant0')  # [batch_size, sent_len]
    self.input_warrant1 = tf.placeholder(tf.int32, (None, self.seq_len), name='warrant1')  # [batch_size, sent_len]
    self.input_reason = tf.placeholder(tf.int32, (None, self.seq_len), name='reason')      # [batch_size, sent_len]
    self.input_claim = tf.placeholder(tf.int32, (None, self.seq_len), name='claim')        # [batch_size, sent_len]
    self.input_debate = tf.placeholder(tf.int32, (None, self.seq_len), name='debate')      # [batch_size, sent_len]
    self.warrant0_len = tf.placeholder(tf.int32, (None,), name='warrant0_len')  # [batch_size]
    self.warrant1_len = tf.placeholder(tf.int32, (None,), name='warrant1_len')  # [batch_size]
    self.reason_len = tf.placeholder(tf.int32, (None,), name='reason_len')      # [batch_size]
    self.claim_len = tf.placeholder(tf.int32, (None,), name='claim_len')        # [batch_size]
    self.debate_len = tf.placeholder(tf.int32, (None,), name='debate_len')      # [batch_size]
    self.target_label = tf.placeholder(tf.int32, (None, self.num_class), name='label')  # [batch_size, num_class]
    self.drop_keep_rate = tf.placeholder(tf.float32)
    self.learning_rate = tf.placeholder(tf.float32)

    # now define embedded layers of the input
    embedded_warrant0 = tf.nn.embedding_lookup(self.we, self.input_warrant0)
    embedded_warrant1 = tf.nn.embedding_lookup(self.we, self.input_warrant1)
    embedded_reason = tf.nn.embedding_lookup(self.we, self.input_reason)
    embedded_claim = tf.nn.embedding_lookup(self.we, self.input_claim)
    embedded_debate = tf.nn.embedding_lookup(self.we, self.input_debate)

    def BiLSTM(input_x, input_x_len, hidden_size, num_layers=1, dropout_rate=None, return_sequence=True):
        """BiLSTM layer."""
        # cell = tf.contrib.rnn.GRUCell(hidden_size)
        cell_fw = tf.contrib.rnn.BasicLSTMCell(hidden_size)
        cell_bw = tf.contrib.rnn.BasicLSTMCell(hidden_size)
        if num_layers > 1:
            # Warning: stacking the same cell object in every layer shares variables across layers;
            # each layer should normally receive its own cell instance.
            cell_fw = tf.contrib.rnn.MultiRNNCell([cell_fw for _ in range(num_layers)])
            cell_bw = tf.contrib.rnn.MultiRNNCell([cell_bw for _ in range(num_layers)])
        if dropout_rate:
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, output_keep_prob=(1 - dropout_rate))
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, output_keep_prob=(1 - dropout_rate))
        b_outputs, b_states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, input_x,
                                                              sequence_length=input_x_len, dtype=tf.float32)
        if return_sequence:
            outputs = tf.concat(b_outputs, axis=2)
        else:
            # states: ((c, h), (c, h)) - concatenate the final hidden states of both directions
            outputs = tf.concat([b_states[0][1], b_states[1][1]], axis=-1)
        return outputs

    with tf.variable_scope("bi_lstm") as s:
        bilstm_warrant0 = BiLSTM(embedded_warrant0, self.warrant0_len, self.lstm_size)
        s.reuse_variables()
        bilstm_warrant1 = BiLSTM(embedded_warrant1, self.warrant1_len, self.lstm_size)
        bilstm_reason = BiLSTM(embedded_reason, self.reason_len, self.lstm_size)
        bilstm_claim = BiLSTM(embedded_claim, self.claim_len, self.lstm_size)
        bilstm_debate = BiLSTM(embedded_debate, self.debate_len, self.lstm_size)

    ''' MaxPooling Layer '''
    pooling_warrant0 = tf_utils.MaxPooling(bilstm_warrant0, self.warrant0_len)
    pooling_warrant1 = tf_utils.MaxPooling(bilstm_warrant1, self.warrant1_len)
    pooling_reason = tf_utils.MaxPooling(bilstm_reason, self.reason_len)
    pooling_claim = tf_utils.MaxPooling(bilstm_claim, self.claim_len)
    pooling_debate = tf_utils.MaxPooling(bilstm_debate, self.debate_len)

    attention_vector_for_W0 = tf.concat([pooling_debate, pooling_reason, pooling_warrant0, pooling_claim], axis=-1)
    attention_vector_for_W1 = tf.concat([pooling_debate, pooling_reason, pooling_warrant1, pooling_claim], axis=-1)

    def AttBiLSTM(attention_vector, input_x, input_x_len, hidden_size, return_sequence=True):
        """Attention-augmented BiLSTM layer."""
        cell_fw = AttBasicLSTMCell(attention_vector, num_units=hidden_size)
        cell_bw = AttBasicLSTMCell(attention_vector, num_units=hidden_size)
        b_outputs, b_states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, input_x,
                                                              sequence_length=input_x_len, dtype=tf.float32)
        if return_sequence:
            outputs = tf.concat(b_outputs, axis=2)
        else:
            # states: (c, h) per direction
            outputs = tf.concat([b_states[0][1], b_states[1][1]], axis=-1)
        return outputs

    with tf.variable_scope("att_lstm") as s:
        attention_warrant0 = AttBiLSTM(attention_vector_for_W0, bilstm_warrant0, self.warrant0_len,
                                       self.lstm_size, return_sequence=False)
        s.reuse_variables()
        attention_warrant1 = AttBiLSTM(attention_vector_for_W1, bilstm_warrant1, self.warrant1_len,
                                       self.lstm_size, return_sequence=False)

    self.attention_warrant0 = attention_warrant0
    self.attention_warrant1 = attention_warrant1

    # concatenate the warrant representations and their interactions
    # (the difference-only variant below is kept for reference but is overwritten by the full merge)
    warrant_0minus1 = attention_warrant0 - attention_warrant1
    warrant_1minus0 = attention_warrant1 - attention_warrant0
    merge_warrant = tf.concat([warrant_1minus0, warrant_0minus1], axis=-1)
    merge_warrant = tf.concat([attention_warrant0, attention_warrant1,
                               attention_warrant0 - attention_warrant1,
                               attention_warrant0 * attention_warrant1], axis=-1)
    dropout_warrant = tf.nn.dropout(merge_warrant, self.drop_keep_rate)

    # and add one extra layer with ReLU
    with tf.variable_scope("linear"):
        dense1 = tf.nn.relu(tf_utils.linear(dropout_warrant, int(self.lstm_size / 2), bias=True, scope='dense'))
        logits = tf_utils.linear(dense1, self.num_class, bias=True, scope='logit')

    # Obtain the Predict, Loss, Train_op
    predict_prob = tf.nn.softmax(logits, name='predict_prob')
    predict_label = tf.cast(tf.argmax(logits, axis=1), tf.int32)
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.target_label)
    loss = tf.reduce_mean(loss)

    # Build the train op
    global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    # optimizer = tf.train.AdagradOptimizer(self.learning_rate)
    if FLAGS.clipper:
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), FLAGS.clipper)
        # pass global_step so it is incremented on every update, as in the unclipped branch
        train_op = optimizer.apply_gradients(list(zip(grads, tvars)), global_step=global_step)
    else:
        train_op = optimizer.minimize(loss, global_step=global_step)

    self.predict_prob = predict_prob
    self.predict_label = predict_label
    self.loss = loss
    self.train_op = train_op
    self.global_step = global_step
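
# Inference sketch (illustrative, not part of the original repo): after training, the
# predicted warrant for a batch can be read from `predict_label` / `predict_prob`.
# `model`, `sess` and the *_ids / *_lens arrays are assumptions, and the 0/1 mapping
# assumes the usual warrant0/warrant1 label encoding of the dataset.
#
# probs, labels = sess.run(
#     [model.predict_prob, model.predict_label],
#     feed_dict={model.input_warrant0: w0_ids, model.warrant0_len: w0_lens,
#                model.input_warrant1: w1_ids, model.warrant1_len: w1_lens,
#                model.input_reason: r_ids,    model.reason_len: r_lens,
#                model.input_claim: c_ids,     model.claim_len: c_lens,
#                model.input_debate: d_ids,    model.debate_len: d_lens,
#                model.drop_keep_rate: 1.0})   # keep everything at test time
# # labels[i] == 0 -> warrant0 judged correct for example i; labels[i] == 1 -> warrant1.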