def __call__(self, inputs, state, scope=None): """ :param inputs: [N, d + JQ + JQ * d] :param state: [N, d] :param scope: :return: """ with tf.variable_scope(scope or self.__class__.__name__): c_prev, h_prev = state x = tf.slice(inputs, [0, 0], [-1, self._input_size]) q_mask = tf.slice(inputs, [0, self._input_size], [-1, self._q_len]) # [N, JQ] qs = tf.slice(inputs, [0, self._input_size + self._q_len], [-1, -1]) qs = tf.reshape(qs, [-1, self._q_len, self._input_size]) # [N, JQ, d] x_tiled = tf.tile(tf.expand_dims(x, 1), [1, self._q_len, 1]) # [N, JQ, d] h_prev_tiled = tf.tile(tf.expand_dims(h_prev, 1), [1, self._q_len, 1]) # [N, JQ, d] f = tf.tanh( linear([qs, x_tiled, h_prev_tiled], self._input_size, True, scope='f')) # [N, JQ, d] a = tf.nn.softmax( exp_mask(linear(f, 1, True, squeeze=True, scope='a'), q_mask)) # [N, JQ] q = tf.reduce_sum(qs * tf.expand_dims(a, -1), 1) z = tf.concat(1, [x, q]) # [N, 2d] return self._cell(z, state)
def __call__(self, inputs, state, scope=None):
    with tf.variable_scope(scope or "SHCell"):
        a_size = 1 if self._scalar else self._state_size
        h, u = tf.split(1, 2, inputs)
        if self._logit_func == 'mul_linear':
            args = [h * u]
            a = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='a'))
            r = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='r'))
        elif self._logit_func == 'linear':
            args = [h, u]
            a = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='a'))
            r = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='r'))
        elif self._logit_func == 'tri_linear':
            args = [h, u, h * u]
            a = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='a'))
            r = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='r'))
        elif self._logit_func == 'double':
            args = [h, u]
            # Distinct scopes keep the two two-layer stacks from colliding.
            a = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True, scope='a_in')),
                                     self._state_size, True, bias_start=self._bias, scope='a'))
            r = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True, scope='r_in')),
                                     self._state_size, True, bias_start=self._bias, scope='r'))
        else:
            raise Exception("Invalid logit_func: {}".format(self._logit_func))
        new_state = a * state + r * (1 - a) * h
        outputs = state
        return outputs, new_state
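# --- Sketch (illustration only): the SHCell update is a gated interpolation,
#     new_state = a * state + r * (1 - a) * h, where a is an update gate and r
#     scales how much of the candidate h leaks in. Plain NumPy, toy values.
import numpy as np

def sh_update(state, h, a, r):
    """All arrays broadcastable to [N, d]; a may be a scalar gate [N, 1]."""
    return a * state + r * (1 - a) * h

# a == 1 keeps the old state; a == 0, r == 1 replaces it with the candidate h.
s, h = np.zeros((1, 3)), np.ones((1, 3))
print(sh_update(s, h, a=1.0, r=1.0))  # [[0. 0. 0.]]
print(sh_update(s, h, a=0.0, r=1.0))  # [[1. 1. 1.]]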
def double_linear_controller(inputs, state, A_m):
    """
    :param inputs: [N, i]
    :param state: [N, d]
    :param A_m: memory prepared by linear(...) in AttentionCell.__init__  # [N, M, d]
    :return: [N, M]
    """
    # `size`, `bias`, `input_keep_prob`, and `is_train` are expected to be
    # bound by an enclosing getter (cf. AttentionCell.get_double_linear_controller).
    if isinstance(state, LSTMStateTuple):
        in_ = tf.concat([inputs, state.c, state.h], -1)
    else:
        in_ = tf.concat([inputs, state], -1)
    A_IS = linear(in_, size, bias, scope='first',
                  input_keep_prob=input_keep_prob, is_train=is_train)
    rank = len(A_m.get_shape())
    _memory_size = tf.shape(A_m)[rank - 2]
    tiled_A_IS = tf.tile(tf.expand_dims(A_IS, 1), [1, _memory_size, 1])
    in_ = tf.tanh(tf.add(tiled_A_IS, A_m))  # [N, M, d]
    out = linear(in_, 1, bias, squeeze=True, scope='second',
                 input_keep_prob=input_keep_prob, is_train=is_train)  # [N, M]
    return out
def __call__(self, inputs, state, scope=None): """Gated recurrent unit (GRU) with nunits cells.""" with tf.variable_scope(scope or type(self).__name__): # "GRUCell" with tf.variable_scope("Gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. state = tf.reshape(state, inputs.get_shape().as_list()[:-1] + state.get_shape().as_list()[-1:]) # explicit shape definition, to use my linaer function r, u = tf.split(1, 2, linear([inputs, state], 2 * self._num_units, True, 1.0)) r, u = tf.sigmoid(r), tf.sigmoid(u) with tf.variable_scope("Candidate"): c = tf.tanh(linear([inputs, r * state], self._num_units, True, var_on_cpu=self.var_on_cpu, wd=self.wd)) new_h = u * state + (1 - u) * c return new_h, new_h
def __call__(self, inputs, state, scope=None):
    scope = scope or type(self).__name__
    with tf.variable_scope(scope):
        tensors = self.tensors
        N, _ = state.get_shape().as_list()
        R, A, C = self._rel_size, self._arg_size, self._num_args
        with tf.name_scope("Split"):
            ru = tf.slice(state, [0, 0], [-1, R], name='ru')  # [N, d]
            au_flat = tf.slice(state, [0, R], [-1, -1], name='au_flat')
            au = tf.reshape(au_flat, [N, C, A], name='au')
            rf = tf.slice(inputs, [0, 0], [-1, R], name='rf')
            af_flat = tf.slice(inputs, [0, R], [-1, -1], name='af_flat')
            af = tf.reshape(af_flat, [N, C, A], name='af')
        with tf.variable_scope("Attention"):
            p_flat = tf.nn.softmax(linear([ru, rf], 2 * C**2, True), name='p_flat')
            p = tf.reshape(p_flat, [N, C, 2 * C], name='p')
            p_key = "{}/{}".format(scope, 'p')
            assert p_key not in tensors
            tensors[p_key] = p
        with tf.name_scope("Out"):
            ru_out, _ = self._cell(rf, ru)  # [N, R]
            a = tf.concat(1, [au, af], name='a')
            a_aug = tf.tile(tf.expand_dims(a, 1), [1, C, 1, 1], name='a_aug')
            au_out = tf.reduce_sum(a_aug * tf.expand_dims(p, -1), 2, name='au_out')  # [N, C, A]
            au_out_flat = tf.reshape(au_out, [N, C * A], name='au_out_flat')
            out = tf.concat(1, [ru_out, au_out_flat], name='out')  # [N, R+A*C]
        return out, out
def pre(self, inputs, scope=None):
    """Preprocess inputs to be used by the cell. Assumes [N, J, *] [x, u]"""
    is_train = self._is_train
    keep_prob = self._keep_prob
    gate_size = self._gate_size
    with tf.variable_scope(scope or "pre"):
        x, u, _, _ = tf.split(2, 4, tf.slice(inputs, [0, 0, gate_size], [-1, -1, -1]))  # [N, J, d]
        a_raw = linear([x * u], gate_size, True, scope='a_raw',
                       var_on_cpu=self._var_on_cpu, wd=self._wd, initializer=self._initializer)
        a = tf.sigmoid(a_raw - self._forget_bias, name='a')
        if keep_prob < 1.0:
            x = tf.cond(is_train, lambda: tf.nn.dropout(x, keep_prob), lambda: x)
            u = tf.cond(is_train, lambda: tf.nn.dropout(u, keep_prob), lambda: u)
        v_t = tf.nn.tanh(linear([x, u], self._num_units, True,
                                var_on_cpu=self._var_on_cpu, wd=self._wd, scope='v_raw'),
                         name='v')
        new_inputs = tf.concat(2, [a, x, u, v_t])  # [N, J, 3*d + 1]
    return new_inputs
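# --- Sketch (illustration only): subtracting a positive forget bias before
#     the sigmoid biases the gate toward 0, so early in training the cell
#     mostly keeps its previous state. Plain NumPy.
import numpy as np

def biased_gate(logits, forget_bias):
    return 1.0 / (1.0 + np.exp(-(logits - forget_bias)))

print(biased_gate(np.array(0.0), forget_bias=0.0))  # 0.5
print(biased_gate(np.array(0.0), forget_bias=2.5))  # ~0.076: gate starts nearly closed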
def __call__(self, inputs, state, scope=None): """ :param inputs: [N, d + JQ + JQ * d] :param state: [N, d] :param scope: :return: """ with tf.variable_scope(scope or self.__class__.__name__): c_prev, h_prev = state x = tf.slice(inputs, [0, 0], [-1, self._input_size]) q_mask = tf.slice(inputs, [0, self._input_size], [-1, self._q_len]) # [N, JQ] qs = tf.slice(inputs, [0, self._input_size + self._q_len], [-1, -1]) qs = tf.reshape(qs, [-1, self._q_len, self._input_size]) # [N, JQ, d] x_tiled = tf.tile(tf.expand_dims(x, 1), [1, self._q_len, 1]) # [N, JQ, d] h_prev_tiled = tf.tile(tf.expand_dims(h_prev, 1), [1, self._q_len, 1]) # [N, JQ, d] f = tf.tanh(linear([qs, x_tiled, h_prev_tiled], self._input_size, True, scope='f')) # [N, JQ, d] a = tf.nn.softmax(exp_mask(linear(f, 1, True, squeeze=True, scope='a'), q_mask)) # [N, JQ] q = tf.reduce_sum(qs * tf.expand_dims(a, -1), 1) z = tf.concat(1, [x, q]) # [N, 2d] return self._cell(z, state)
def __call__(self, inputs, state, name_scope=None):
    """Long short-term memory (LSTM) cell."""
    with tf.variable_scope(name_scope or type(self).__name__):  # "BasicLSTMCell"
        # Parameters of gates are concatenated into one multiply for efficiency.
        c, h = tf.split(1, 2, state)
        concat = linear([inputs, h], 4 * self._num_units, True,
                        var_on_cpu=self.var_on_cpu, wd=self.wd)
        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = tf.split(1, 4, concat)
        new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) * tf.tanh(j)
        new_h = tf.tanh(new_c) * tf.sigmoid(o)
        return new_h, tf.concat(1, [new_c, new_h])
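# --- Sketch (illustration only): the LSTM arithmetic above in NumPy, with one
#     fused weight matrix W: [i+d, 4d]. Toy parameter names, not repo names.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x, c, h, W, b, forget_bias=1.0):
    """x: [N, i]; c, h: [N, d]; returns (new_c, new_h)."""
    z = np.concatenate([x, h], 1) @ W + b
    i, j, f, o = np.split(z, 4, axis=1)  # input, new input, forget, output
    new_c = c * sigmoid(f + forget_bias) + sigmoid(i) * np.tanh(j)
    new_h = np.tanh(new_c) * sigmoid(o)
    return new_c, new_h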
def linear_controller(inputs, state, memory):
    # `bias`, `input_keep_prob`, and `is_train` are expected to be bound by an
    # enclosing getter (cf. AttentionCell.get_double_linear_controller).
    rank = len(memory.get_shape())
    _memory_size = tf.shape(memory)[rank - 2]
    tiled_inputs = tf.tile(tf.expand_dims(inputs, 1), [1, _memory_size, 1])
    if isinstance(state, tuple):
        tiled_states = [tf.tile(tf.expand_dims(each, 1), [1, _memory_size, 1])
                        for each in state]
    else:
        tiled_states = [tf.tile(tf.expand_dims(state, 1), [1, _memory_size, 1])]
    # [N, M, d]
    in_ = tf.concat(2, [tiled_inputs] + tiled_states + [memory])
    out = linear(in_, 1, bias, squeeze=True, input_keep_prob=input_keep_prob, is_train=is_train)
    return out
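# --- Sketch (illustration only): tiling a per-example vector against every
#     memory slot before a shared scoring layer, shown with NumPy.
import numpy as np

N, M, d = 2, 3, 4
state = np.arange(N * d, dtype=float).reshape(N, d)     # [N, d]
memory = np.ones((N, M, d))                             # [N, M, d]
tiled = np.tile(state[:, None, :], (1, M, 1))           # [N, M, d]
scored_input = np.concatenate([tiled, memory], axis=2)  # [N, M, 2d]
print(scored_input.shape)  # (2, 3, 8)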
def __call__(self, inputs, state, scope=None):
    with tf.variable_scope(scope or "SHCell"):
        a_size = 1 if self._scalar else self._state_size
        h, u = tf.split(axis=1, num_or_size_splits=2, value=inputs)
        if self._logit_func == 'mul_linear':
            args = [h * u, state * u]
            a = tf.nn.sigmoid(linear(args, a_size, True))
        elif self._logit_func == 'linear':
            args = [h, u, state]
            a = tf.nn.sigmoid(linear(args, a_size, True))
        elif self._logit_func == 'tri_linear':
            args = [h, u, state, h * u, state * u]
            a = tf.nn.sigmoid(linear(args, a_size, True))
        elif self._logit_func == 'double':
            args = [h, u, state]
            a = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True))
        else:
            raise Exception("Invalid logit_func: {}".format(self._logit_func))
        new_state = a * state + (1 - a) * h
        outputs = state
        return outputs, new_state
def __call__(self, inputs, state, scope=None):
    gate_size = self._gate_size
    with tf.variable_scope(scope or type(self).__name__):  # "RSMCell"
        with tf.name_scope("Split"):  # Reset gate and update gate.
            a = tf.slice(inputs, [0, 0], [-1, gate_size])
            x, u, v_t = tf.split(1, 3, tf.slice(inputs, [0, gate_size], [-1, -1]))
            o = tf.slice(state, [0, 0], [-1, 1])
            h, v = tf.split(1, 2, tf.slice(state, [0, gate_size], [-1, -1]))
        with tf.variable_scope("Main"):
            r_raw = linear([x * u], 1, True, scope='r_raw',
                           var_on_cpu=self._var_on_cpu, initializer=self._initializer)
            r = tf.sigmoid(r_raw, name='r')
            new_o = a * r + (1 - a) * o
            new_v = a * v_t + (1 - a) * v
            g = r * v_t
            new_h = a * g + (1 - a) * h
        with tf.name_scope("Concat"):
            new_state = tf.concat(1, [new_o, new_h, new_v])
            outputs = tf.concat(1, [a, r, x, new_h, new_v, g])
    return outputs, new_state
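# --- Sketch (illustration only): the RSMCell update rules in NumPy. Every
#     quantity is interpolated by the same gate a; r additionally resets the
#     candidate g = r * v_t before it enters the hidden state.
import numpy as np

def rsm_update(a, r, o, h, v, v_t):
    new_o = a * r + (1 - a) * o      # gated scalar output
    new_v = a * v_t + (1 - a) * v    # gated value memory
    g = r * v_t                      # reset candidate
    new_h = a * g + (1 - a) * h      # gated hidden state
    return new_o, new_h, new_v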
def __init__(self, cell, memory, size, mask=None, controller=None, mapper=None,
             input_keep_prob=1.0, is_train=None):
    """
    Early fusion attention cell: uses the (inputs, state) to control the current attention.

    :param cell:
    :param memory: [N, M, m]
    :param mask:
    :param controller: (inputs, prev_state, memory) -> memory_logits
    """
    self._cell = cell
    self._memory = memory
    self._mask = mask
    self._flat_memory = flatten(memory, 2)
    self._flat_mask = flatten(mask, 1)
    if controller is None:
        controller = AttentionCell.get_double_linear_controller(
            size, True, input_keep_prob=input_keep_prob, is_train=is_train)
    self.A_m = linear(self._memory, size, True, scope='memory_prepare',
                      input_keep_prob=input_keep_prob, is_train=is_train)  # [N, M, d]
    self._controller = controller
    if mapper is None:
        mapper = AttentionCell.get_concat_mapper()
    elif mapper == 'sim':
        mapper = AttentionCell.get_sim_mapper()
    self._mapper = mapper
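# --- Sketch (illustration only): the double-linear controller pipeline this
#     cell wires together, in NumPy: prepare the memory once (A_m), then per
#     step combine [inputs; state], broadcast against memory slots, and score.
#     W_mem / W_first / w_second are toy parameters, not repo names.
import numpy as np

def double_linear_scores(inputs, state, memory, W_mem, W_first, w_second):
    A_m = memory @ W_mem                                 # memory_prepare, [N, M, d]
    A_IS = np.concatenate([inputs, state], 1) @ W_first  # 'first' linear, [N, d]
    f = np.tanh(A_IS[:, None, :] + A_m)                  # tiled add, [N, M, d]
    return f @ w_second                                  # 'second' linear, [N, M]

N, M, m, i, s, d = 2, 3, 4, 5, 6, 7
rng = np.random.RandomState(0)
logits = double_linear_scores(rng.randn(N, i), rng.randn(N, s), rng.randn(N, M, m),
                              rng.randn(m, d), rng.randn(i + s, d), rng.randn(d))
print(logits.shape)  # (2, 3)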
def initialize(self):
    params = self.params
    placeholders = self.placeholders
    tensors = self.tensors
    variables_dict = self.variables_dict
    N, J, V, Q, M = params.batch_size, params.max_sent_size, params.vocab_size, \
        params.max_ques_size, params.mem_size
    d = params.hidden_size
    L = params.mem_num_layers
    att_forget_bias = params.att_forget_bias
    use_vector_gate = params.use_vector_gate
    wd = params.wd
    initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))

    with tf.name_scope("placeholders"):
        x = tf.placeholder('int32', shape=[N, M, J], name='x')
        x_mask = tf.placeholder('bool', shape=[N, M, J], name='x_mask')
        q = tf.placeholder('int32', shape=[N, J], name='q')
        q_mask = tf.placeholder('bool', shape=[N, J], name='q_mask')
        y = tf.placeholder('int32', shape=[N], name='y')
        is_train = tf.placeholder('bool', shape=[], name='is_train')
        placeholders['x'] = x
        placeholders['x_mask'] = x_mask
        placeholders['q'] = q
        placeholders['q_mask'] = q_mask
        placeholders['y'] = y
        placeholders['is_train'] = is_train

    with tf.variable_scope("embedding"):
        A = VariableEmbedder(params, wd=wd, initializer=initializer, name='A')
        Aq = A(q, name='Aq')  # [N, S, J, d]
        Ax = A(x, name='Ax')  # [N, S, J, d]

    with tf.name_scope("encoding"):
        encoder = PositionEncoder(J, d)
        u = encoder(Aq, q_mask)  # [N, d]
        m = encoder(Ax, x_mask)  # [N, M, d]

    with tf.variable_scope("networks"):
        m_mask = tf.reduce_max(tf.cast(x_mask, 'int64'), 2, name='m_mask')  # [N, M]
        gate_mask = tf.expand_dims(m_mask, -1)
        m_length = tf.reduce_sum(m_mask, 1, name='m_length')  # [N]
        prev_u = tf.tile(tf.expand_dims(u, 1), [1, M, 1])  # [N, M, d]
        reg_layer = VectorReductionLayer(N, M, d) if use_vector_gate else ReductionLayer(N, M, d)
        gate_size = d if use_vector_gate else 1
        h = None  # [N, M, d]
        as_, rfs, rbs = [], [], []
        hs = []
        for layer_idx in range(L):
            with tf.name_scope("layer_{}".format(layer_idx)):
                u_t = tf.tanh(linear([prev_u, m], d, True, wd=wd, scope='u_t'))
                a = tf.cast(gate_mask, 'float') * tf.sigmoid(
                    linear([prev_u * m], gate_size, True, initializer=initializer,
                           wd=wd, scope='a') - att_forget_bias)
                h = reg_layer(u_t, a, 1.0 - a, scope='h')
                if layer_idx + 1 < L:
                    if params.use_reset:
                        rf, rb = tf.split(2, 2, tf.cast(gate_mask, 'float') * tf.sigmoid(
                            linear([prev_u * m], 2 * gate_size, True,
                                   initializer=initializer, wd=wd, scope='r')))
                    else:
                        rf = rb = tf.ones(a.get_shape().as_list())
                    u_t_rev = tf.reverse_sequence(u_t, m_length, 1)
                    a_rev, rb_rev = tf.reverse_sequence(a, m_length, 1), tf.reverse_sequence(rb, m_length, 1)
                    uf = reg_layer(u_t, a * rf, 1.0 - a, scope='uf')
                    ub_rev = reg_layer(u_t_rev, a_rev * rb_rev, 1.0 - a_rev, scope='ub_rev')
                    ub = tf.reverse_sequence(ub_rev, m_length, 1)
                    prev_u = uf + ub
                else:
                    rf = rb = tf.zeros(a.get_shape().as_list())
                rfs.append(rf)
                rbs.append(rb)
                as_.append(a)
                hs.append(h)
            tf.get_variable_scope().reuse_variables()
        h_last = tf.squeeze(tf.slice(h, [0, M - 1, 0], [-1, -1, -1]), [1])  # [N, d]
        hs_last = [tf.squeeze(tf.slice(each, [0, M - 1, 0], [-1, -1, -1]), [1]) for each in hs]
        a = tf.transpose(tf.pack(as_, name='a'), [1, 0, 2, 3])
        rf = tf.transpose(tf.pack(rfs, name='rf'), [1, 0, 2, 3])
        rb = tf.transpose(tf.pack(rbs, name='rb'), [1, 0, 2, 3])
        tensors['a'] = a
        tensors['rf'] = rf
        tensors['rb'] = rb

    with tf.variable_scope("class"):
        class_mode = params.class_mode
        use_class_bias = params.use_class_bias
        if class_mode == 'h':
            # W = tf.transpose(A.emb_mat, name='W')
            logits = linear([h_last], V, use_class_bias, wd=wd)
        elif class_mode == 'uh':
            logits = linear([h_last, u], V, use_class_bias, wd=wd)
        elif class_mode == 'hs':
            logits = linear(hs_last, V, use_class_bias, wd=wd)
        elif class_mode == 'hss':
            logits = linear(sum(hs_last), V, use_class_bias, wd=wd)
        else:
            raise Exception("Invalid class mode: {}".format(class_mode))
        yp = tf.cast(tf.argmax(logits, 1), 'int32')
        correct = tf.equal(yp, y)
        tensors['yp'] = yp
        tensors['correct'] = correct

    with tf.name_scope("loss"):
        with tf.name_scope("ans_loss"):
            ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y, name='ce')
            avg_ce = tf.reduce_mean(ce, name='avg_ce')
            tf.add_to_collection('losses', avg_ce)
        losses = tf.get_collection('losses')
        loss = tf.add_n(losses, name='loss')
        tensors['loss'] = loss

    variables_dict['all'] = tf.trainable_variables()
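# --- Sketch (illustration only): one layer of the gated multi-hop update used
#     above, in NumPy. reg_layer is approximated here as the running gated
#     reduction h_t = a_t * u_t + (1 - a_t) * h_{t-1} over the memory axis;
#     this is an assumption about what ReductionLayer computes.
import numpy as np

def gated_reduction(u_t, a):
    """u_t: [N, M, d]; a: [N, M, 1] -> h: [N, M, d] (running reduction)."""
    h = np.zeros_like(u_t)
    prev = np.zeros_like(u_t[:, 0])
    for t in range(u_t.shape[1]):
        prev = a[:, t] * u_t[:, t] + (1 - a[:, t]) * prev
        h[:, t] = prev
    return h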
def initialize(self):
    params = self.params
    placeholders = self.placeholders
    tensors = self.tensors
    variables_dict = self.variables_dict
    N, J, V, Q, M = (
        params.batch_size,
        params.max_sent_size,
        params.vocab_size,
        params.max_ques_size,
        params.mem_size,
    )
    d = params.hidden_size
    L = params.mem_num_layers
    forget_bias = params.forget_bias
    wd = params.wd
    initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))

    with tf.name_scope("placeholders"):
        x = tf.placeholder("int32", shape=[N, M, J], name="x")
        x_mask = tf.placeholder("bool", shape=[N, M, J], name="x_mask")
        q = tf.placeholder("int32", shape=[N, J], name="q")
        q_mask = tf.placeholder("bool", shape=[N, J], name="q_mask")
        y = tf.placeholder("int32", shape=[N], name="y")
        is_train = tf.placeholder("bool", shape=[], name="is_train")
        placeholders["x"] = x
        placeholders["x_mask"] = x_mask
        placeholders["q"] = q
        placeholders["q_mask"] = q_mask
        placeholders["y"] = y
        placeholders["is_train"] = is_train

    with tf.variable_scope("embedding"):
        A = VariableEmbedder(params, wd=wd, initializer=initializer, name="A")
        Aq = A(q, name="Aq")  # [N, S, J, d]
        Ax = A(x, name="Ax")  # [N, S, J, d]

    with tf.name_scope("encoding"):
        encoder = PositionEncoder(J, d)
        u = encoder(Aq, q_mask)  # [N, d]
        m = encoder(Ax, x_mask)  # [N, M, d]

    with tf.variable_scope("networks"):
        m_mask = tf.reduce_max(tf.cast(x_mask, "int64"), 2, name="m_mask")  # [N, M]
        m_length = tf.reduce_sum(m_mask, 1, name="m_length")  # [N]
        initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))
        cell = RSMCell(d, forget_bias=forget_bias, wd=wd, initializer=initializer)
        us = tf.tile(tf.expand_dims(u, 1, name="u_prev_aug"), [1, M, 1])  # [N, d] -> [N, M, d]
        in_ = tf.concat(2, [tf.ones([N, M, 1]), m, us, tf.zeros([N, M, 2 * d])],
                        name="x_h_in")  # [N, M, 4*d + 1]
        out, fw_state, bw_state, bi_tensors = dynamic_bidirectional_rnn(
            cell, in_, sequence_length=m_length, dtype="float", num_layers=L)
        a = tf.slice(out, [0, 0, 0], [-1, -1, 1])  # [N, M, 1]
        _, _, v, g = tf.split(2, 4, tf.slice(out, [0, 0, 1], [-1, -1, -1]))
        fw_h, fw_v = tf.split(1, 2, tf.slice(fw_state, [0, 1], [-1, -1]))
        bw_h, bw_v = tf.split(1, 2, tf.slice(bw_state, [0, 1], [-1, -1]))
        _, fw_u_out, fw_v_out, _ = tf.split(2, 4, tf.squeeze(
            tf.slice(bi_tensors["fw_out"], [0, L - 1, 0, 2], [-1, -1, -1, -1]), [1]))
        _, bw_u_out, bw_v_out, _ = tf.split(2, 4, tf.squeeze(
            tf.slice(bi_tensors["bw_out"], [0, L - 1, 0, 2], [-1, -1, -1, -1]), [1]))
        tensors["a"] = tf.squeeze(tf.slice(bi_tensors["in"], [0, 0, 0, 0], [-1, -1, -1, 1]), [3])
        tensors["of"] = tf.squeeze(tf.slice(bi_tensors["fw_out"], [0, 0, 0, 1], [-1, -1, -1, 1]), [3])
        tensors["ob"] = tf.squeeze(tf.slice(bi_tensors["bw_out"], [0, 0, 0, 1], [-1, -1, -1, 1]), [3])

    with tf.variable_scope("selection"):
        # w = tf.nn.relu(linear([fw_v + 1e-9*(fw_h+bw_h)], d, True, wd=wd))
        w = fw_v + 1e-9 * (fw_h + bw_h)
        tensors["s"] = a

    with tf.variable_scope("class"):
        if params.use_ques:
            logits = linear([w, u], V, True, wd=wd)
        else:
            # W = tf.transpose(A.emb_mat, name='W')
            W = tf.get_variable("W", shape=[d, V])
            logits = tf.matmul(w, W, name="logits")
        yp = tf.cast(tf.argmax(logits, 1), "int32")
        correct = tf.equal(yp, y)
        tensors["yp"] = yp
        tensors["correct"] = correct

    with tf.name_scope("loss"):
        with tf.name_scope("ans_loss"):
            ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y, name="ce")
            avg_ce = tf.reduce_mean(ce, name="avg_ce")
            tf.add_to_collection("losses", avg_ce)
        losses = tf.get_collection("losses")
        loss = tf.add_n(losses, name="loss")
        tensors["loss"] = loss

    variables_dict["all"] = tf.trainable_variables()
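# --- Sketch (illustration only): tf.reverse_sequence, as used by the layers
#     above, reverses only the valid prefix of each row (given per-row lengths)
#     and leaves the padding in place. NumPy equivalent for a [N, M] batch:
import numpy as np

def reverse_sequence(x, lengths):
    out = x.copy()
    for i, n in enumerate(lengths):
        out[i, :n] = x[i, :n][::-1]
    return out

print(reverse_sequence(np.array([[1, 2, 3, 0]]), lengths=[3]))  # [[3 2 1 0]]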
def __init__(self, config, seq_length, emb_dim, hidden_dim, emb_train,
             embeddings=None, pred_size=3, context_seq_len=None, query_seq_len=None):
    ## Define hyperparameters
    # tf.reset_default_graph()
    self.embedding_dim = emb_dim
    self.dim = hidden_dim
    self.sequence_length = seq_length
    self.pred_size = pred_size
    self.context_seq_len = context_seq_len
    self.query_seq_len = query_seq_len
    # self.config = config

    ## Define the placeholders
    self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length], name='premise')
    self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length], name='hypothesis')
    self.premise_pos = tf.placeholder(tf.int32, [None, self.sequence_length, 47], name='premise_pos')
    self.hypothesis_pos = tf.placeholder(tf.int32, [None, self.sequence_length, 47], name='hypothesis_pos')
    self.premise_char = tf.placeholder(tf.int32, [None, self.sequence_length, config.char_in_word_size],
                                       name='premise_char')
    self.hypothesis_char = tf.placeholder(tf.int32, [None, self.sequence_length, config.char_in_word_size],
                                          name='hypothesis_char')
    self.premise_exact_match = tf.placeholder(tf.int32, [None, self.sequence_length, 1],
                                              name='premise_exact_match')
    self.hypothesis_exact_match = tf.placeholder(tf.int32, [None, self.sequence_length, 1],
                                                 name='hypothesis_exact_match')
    self.global_step = tf.Variable(0, name='global_step', trainable=False)

    self.dropout_keep_rate = tf.train.exponential_decay(
        config.keep_rate, self.global_step, config.dropout_decay_step,
        config.dropout_decay_rate, staircase=False, name='dropout_keep_rate')
    config.keep_rate = self.dropout_keep_rate
    tf.summary.scalar('dropout_keep_rate', self.dropout_keep_rate)

    self.y = tf.placeholder(tf.int32, [None], name='label_y')
    self.keep_rate_ph = tf.placeholder(tf.float32, [], name='keep_prob')
    self.is_train = tf.placeholder('bool', [], name='is_train')

    ## Function for embedding lookup and dropout at embedding layer
    def emb_drop(E, x):
        emb = tf.nn.embedding_lookup(E, x)
        emb_drop = tf.cond(self.is_train,
                           lambda: tf.nn.dropout(emb, config.keep_rate),
                           lambda: emb)
        return emb_drop

    # Get lengths of unpadded sentences
    prem_seq_lengths, prem_mask = blocks.length(self.premise_x)  # mask [N, L, 1]
    hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)
    self.prem_mask = prem_mask
    self.hyp_mask = hyp_mask

    ### Embedding layer ###
    with tf.variable_scope("emb"):
        with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
            self.E = tf.Variable(embeddings, trainable=emb_train)
            premise_in = emb_drop(self.E, self.premise_x)        # P
            hypothesis_in = emb_drop(self.E, self.hypothesis_x)  # H

    with tf.variable_scope("char_emb"):
        char_emb_mat = tf.get_variable("char_emb_mat",
                                       shape=[config.char_vocab_size, config.char_emb_size])
        with tf.variable_scope("char") as scope:
            char_pre = tf.nn.embedding_lookup(char_emb_mat, self.premise_char)
            char_hyp = tf.nn.embedding_lookup(char_emb_mat, self.hypothesis_char)

            filter_sizes = list(map(int, config.out_channel_dims.split(',')))  # [100]
            heights = list(map(int, config.filter_heights.split(',')))         # [5]
            assert sum(filter_sizes) == config.char_out_size, (filter_sizes, config.char_out_size)
            with tf.variable_scope("conv") as scope:
                conv_pre = multi_conv1d(char_pre, filter_sizes, heights, "VALID",
                                        self.is_train, config.keep_rate, scope='conv')
                scope.reuse_variables()
                conv_hyp = multi_conv1d(char_hyp, filter_sizes, heights, "VALID",
                                        self.is_train, config.keep_rate, scope='conv')
                conv_pre = tf.reshape(conv_pre, [-1, self.sequence_length, config.char_out_size])
                conv_hyp = tf.reshape(conv_hyp, [-1, self.sequence_length, config.char_out_size])
        premise_in = tf.concat([premise_in, conv_pre], axis=2)
        hypothesis_in = tf.concat([hypothesis_in, conv_hyp], axis=2)

    premise_in = tf.concat((premise_in, tf.cast(self.premise_pos, tf.float32)), axis=2)
    hypothesis_in = tf.concat((hypothesis_in, tf.cast(self.hypothesis_pos, tf.float32)), axis=2)

    premise_in = tf.concat([premise_in, tf.cast(self.premise_exact_match, tf.float32)], axis=2)
    hypothesis_in = tf.concat([hypothesis_in, tf.cast(self.hypothesis_exact_match, tf.float32)], axis=2)

    with tf.variable_scope("highway") as scope:
        premise_in = highway_network(premise_in, config.highway_num_layers, True,
                                     wd=config.wd, is_train=self.is_train)
        scope.reuse_variables()
        hypothesis_in = highway_network(hypothesis_in, config.highway_num_layers, True,
                                        wd=config.wd, is_train=self.is_train)

    with tf.variable_scope("prepro") as scope:
        pre = premise_in
        hyp = hypothesis_in
        for i in range(config.self_att_enc_layers):
            with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                p = self_attention_layer(config, self.is_train, pre, p_mask=prem_mask,
                                         scope="{}_layer_self_att_enc".format(i))  # [N, len, dim]
                h = self_attention_layer(config, self.is_train, hyp, p_mask=hyp_mask,
                                         scope="{}_layer_self_att_enc_h".format(i))
                pre = p
                hyp = h
                variable_summaries(p, "p_self_enc_summary_layer_{}".format(i))
                variable_summaries(h, "h_self_enc_summary_layer_{}".format(i))

    with tf.variable_scope("main") as scope:

        def model_one_side(config, main, support, main_length, support_length,
                           main_mask, support_mask, scope):
            bi_att_mx = bi_attention_mx(config, self.is_train, main, support,
                                        p_mask=main_mask, h_mask=support_mask)  # [N, PL, HL]
            bi_att_mx = tf.cond(self.is_train,
                                lambda: tf.nn.dropout(bi_att_mx, config.keep_rate),
                                lambda: bi_att_mx)
            out_final = dense_net(config, bi_att_mx, self.is_train)
            return out_final

        premise_final = model_one_side(config, p, h, prem_seq_lengths, hyp_seq_lengths,
                                       prem_mask, hyp_mask, scope="premise_as_main")
        f0 = premise_final
        print('f0:', f0.get_shape().as_list())

    self.logits = linear(f0, self.pred_size, True, bias_start=0.0, scope="logit",
                         squeeze=False, wd=config.wd, input_keep_prob=config.keep_rate,
                         is_train=self.is_train)
    tf.summary.histogram('logit_histogram', self.logits)

    # Define the cost function
    self.total_cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits))
    # Calculate accuracy
    self.acc = tf.reduce_mean(
        tf.cast(tf.equal(tf.arg_max(self.logits, dimension=1), tf.cast(self.y, tf.int64)),
                tf.float32))
    tf.summary.scalar('acc', self.acc)
    tf.summary.scalar('loss', self.total_cost)

    # L2 Loss
    if config.l2_loss:
        if config.sigmoid_growing_l2loss:
            weights_added = tf.add_n([
                tf.nn.l2_loss(tensor) for tensor in tf.trainable_variables()
                if tensor.name.endswith("weights:0")
                and not tensor.name.endswith("weighted_sum/weights:0")
                or tensor.name.endswith('kernel:0')
            ])
            full_l2_step = tf.constant(config.weight_l2loss_step_full_reg, dtype=tf.int32,
                                       shape=[], name='full_l2reg_step')
            full_l2_ratio = tf.constant(config.l2_regularization_ratio, dtype=tf.float32,
                                        shape=[], name='l2_regularization_ratio')
            gs_flt = tf.cast(self.global_step, tf.float32)
            half_l2_step_flt = tf.cast(full_l2_step / 2, tf.float32)
            # The ratio ramps up along a sigmoid centered at full_l2_step / 2.
            l2loss_ratio = tf.sigmoid(
                ((gs_flt - half_l2_step_flt) * 8) / half_l2_step_flt) * full_l2_ratio
            tf.summary.scalar('l2loss_ratio', l2loss_ratio)
            l2loss = weights_added * l2loss_ratio
        else:
            l2loss = tf.add_n([
                tf.nn.l2_loss(tensor) for tensor in tf.trainable_variables()
                if tensor.name.endswith("weights:0") or tensor.name.endswith('kernel:0')
            ]) * tf.constant(config.l2_regularization_ratio, dtype='float', shape=[],
                             name='l2_regularization_ratio')
        tf.summary.scalar('l2loss', l2loss)
        self.total_cost += l2loss

    if config.wo_enc_sharing or config.wo_highway_sharing_but_penalize_diff:
        diffs = []
        for i in range(config.self_att_enc_layers):
            for tensor in tf.trainable_variables():
                print(tensor.name)
                if tensor.name == "prepro/{}_layer_self_att_enc/self_attention/h_logits/first/kernel:0".format(i):
                    l_lg = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_attention/h_logits/first/kernel:0".format(i):
                    r_lg = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_1/kernel:0".format(i):
                    l_fg_lhs_1 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_1/kernel:0".format(i):
                    r_fg_lhs_1 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_1/kernel:0".format(i):
                    l_fg_rhs_1 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_1/kernel:0".format(i):
                    r_fg_rhs_1 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_2/kernel:0".format(i):
                    l_fg_lhs_2 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_2/kernel:0".format(i):
                    r_fg_lhs_2 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_2/kernel:0".format(i):
                    l_fg_rhs_2 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_2/kernel:0".format(i):
                    r_fg_rhs_2 = tensor
                if config.two_gate_fuse_gate:
                    if tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_3/kernel:0".format(i):
                        l_fg_lhs_3 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_3/kernel:0".format(i):
                        r_fg_lhs_3 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_3/kernel:0".format(i):
                        l_fg_rhs_3 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_3/kernel:0".format(i):
                        r_fg_rhs_3 = tensor
            diffs += [l_lg - r_lg, l_fg_lhs_1 - r_fg_lhs_1, l_fg_rhs_1 - r_fg_rhs_1,
                      l_fg_lhs_2 - r_fg_lhs_2, l_fg_rhs_2 - r_fg_rhs_2]
            if config.two_gate_fuse_gate:
                diffs += [l_fg_lhs_3 - r_fg_lhs_3, l_fg_rhs_3 - r_fg_rhs_3]

        diff_loss = tf.add_n([tf.nn.l2_loss(tensor) for tensor in diffs]) * tf.constant(
            config.diff_penalty_loss_ratio, dtype='float', shape=[], name='diff_penalty_loss_ratio')
        tf.summary.scalar('diff_penalty_loss', diff_loss)
        self.total_cost += diff_loss

    self.summary = tf.summary.merge_all()

    total_parameters = 0
    for v in tf.global_variables():
        if not v.name.endswith("weights:0") and not v.name.endswith("biases:0") \
                and not v.name.endswith('kernel:0') and not v.name.endswith('bias:0'):
            continue
        print(v.name)
        shape = v.get_shape().as_list()
        param_num = 1
        for dim in shape:
            param_num *= dim
        print(param_num)
        total_parameters += param_num
    print(total_parameters)
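# --- Sketch (illustration only): the sigmoid-growing L2 ratio above, evaluated
#     in NumPy for a few global steps so the ramp shape is visible. The step
#     and ratio values are made-up examples, not config defaults.
import numpy as np

def l2loss_ratio(step, full_l2_step, full_l2_ratio):
    half = full_l2_step / 2.0
    return full_l2_ratio / (1.0 + np.exp(-((step - half) * 8) / half))

for s in [0, 25000, 50000, 100000]:
    print(s, l2loss_ratio(s, full_l2_step=100000, full_l2_ratio=9e-5))
# ~0 early, half of full_l2_ratio at full_l2_step/2, ~full at full_l2_step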
def _build_forward(self):
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw],
                                                   initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    word_emb_mat = tf.concat(0, [word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(3, [xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(2, [qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd,
                                 is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd,
                                 is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell = BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(2, [fw_u, bw_u])
        if config.share_lstm_weights:
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell = AttentionCell(cell, u, mask=q_mask, mapper='sim',
                                       input_keep_prob=self.config.input_keep_prob,
                                       is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask,
                                 u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell
        self.p = p0

        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(3, [fw_g0, bw_g0])
        (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
            first_cell, first_cell, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(3, [fw_g1, bw_g1])

    with tf.variable_scope("output"):
        if config.model_name == "basic":
            logits = get_logits([g1, p0], d, True, wd=config.wd,
                                input_keep_prob=config.input_keep_prob, mask=self.x_mask,
                                is_train=self.is_train, func=config.answer_func, scope='logits1')
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell, d_cell, tf.concat(3, [p0, g1, a1i, g1 * a1i]),
                x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(3, [fw_g2, bw_g2])
            logits2 = get_logits([g2, p0], d, True, wd=config.wd,
                                 input_keep_prob=config.input_keep_prob, mask=self.x_mask,
                                 is_train=self.is_train, func=config.answer_func, scope='logits2')

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])

            self.tensor_dict['g1'] = g1
            self.tensor_dict['g2'] = g2
            self.logits = flat_logits
            self.logits2 = flat_logits2
            self.yp = yp
            self.yp2 = yp2
        elif config.model_name == "basic-class":
            C = 3 if config.data_dir.startswith('data/snli') else 2
            (fw_g2, bw_g2) = (fw_g1, bw_g1)
            if config.classifier == 'maxpool':
                g2 = tf.concat(3, [fw_g2, bw_g2])  # [N, M, JX, 2d]
                g2 = tf.reduce_max(g2, 2)  # [N, M, 2d]
                g2_dim = 2 * d
            elif config.classifier == 'sumpool':
                g2 = tf.concat(3, [fw_g2, bw_g2])
                g2 = tf.reduce_sum(g2, 2)
                g2_dim = 2 * d
            else:
                fw_g2_ = tf.gather(tf.transpose(fw_g2, [2, 0, 1, 3]), JX - 1)
                bw_g2_ = tf.gather(tf.transpose(bw_g2, [2, 0, 1, 3]), 0)
                g2 = tf.concat(2, [fw_g2_, bw_g2_])
                g2_dim = 2 * d

            g2_ = tf.reshape(g2, [N, g2_dim])
            logits0 = linear(g2_, C, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                             is_train=self.is_train, scope='classifier')
            flat_yp0 = tf.nn.softmax(logits0)
            yp0 = tf.reshape(flat_yp0, [N, M, C])

            self.tensor_dict['g1'] = g1
            self.logits0 = logits0
            self.yp0 = yp0
            self.logits = logits0
            self.yp = yp0
def _build_forward(self):
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    JQ = JX
    print('VW:{} NEW_EMB:{}'.format(VW, self.new_emb_mat.get_shape()))
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",
                                      self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID",
                                          self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw],
                                                   initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                if config.use_glove_for_unk:
                    word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq
        xx = tf.reshape(xx, [-1, M, JX, d])
        qq = tf.reshape(qq, [-1, JQ, d])

        if config.use_pos_emb:
            with tf.variable_scope("pos_onehot"), tf.device("/cpu:0"):
                pos_x = tf.one_hot(self.x_pos, depth=config.pos_tag_num)  # [N, M, JX, depth]
                pos_q = tf.one_hot(self.q_pos, depth=config.pos_tag_num)  # [N, JQ, depth]
                xx = tf.concat(axis=3, values=[xx, pos_x])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, pos_q])
        if config.use_sem_emb:
            with tf.variable_scope("sem_onehot"), tf.device("/cpu:0"):
                sem_x = tf.one_hot(self.x_sem, depth=3)  # [N, M, JX, 3]
                sem_q = tf.one_hot(self.q_sem, depth=3)  # [N, JQ, 3]
                xx = tf.concat(axis=3, values=[xx, sem_x])
                qq = tf.concat(axis=2, values=[qq, sem_q])
        if config.use_neg_emb:
            with tf.variable_scope("neg_onehot"), tf.device("/cpu:0"):
                neg_x = tf.one_hot(self.x_neg, depth=2)  # [N, M, JX, 2]
                neg_q = tf.one_hot(self.q_neg, depth=2)  # [N, JQ, 2]
                xx = tf.concat(axis=3, values=[xx, neg_x])
                qq = tf.concat(axis=2, values=[qq, neg_q])

    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd,
                                 is_train=self.is_train)
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd,
                                 is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell_fw2 = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw2 = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw2 = SwitchableDropoutWrapper(cell_fw2, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw2 = SwitchableDropoutWrapper(cell_bw2, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    if config.lstm:
        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
                d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
            print('fw_u_f shape: {}'.format(fw_u_f.get_shape()))
            u = tf.concat(axis=2, values=[fw_u, bw_u])  # [N, JQ, 2d]
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
                print('fw_u_f shape: {}'.format(fw_u_f.get_shape()))
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h
    else:
        h = xx
        u = qq

    h1 = h[:, 0, :, :]
    h2 = h[:, 1, :, :]
    h3 = h[:, 2, :, :]
    h4 = h[:, 3, :, :]
    n_1 = tf.reshape(self.x_mask[:, 0, :], [N, JX])
    n_2 = tf.reshape(self.x_mask[:, 1, :], [N, JX])
    n_3 = tf.reshape(self.x_mask[:, 2, :], [N, JX])
    n_4 = tf.reshape(self.x_mask[:, 3, :], [N, JX])

    if config.self_attention:
        with tf.variable_scope("h_self_weight"):
            print(h.get_shape())
            for i in range(2):
                with tf.variable_scope("self-attention"):
                    h1 = self_attention_layer(config, self.is_train, h1,
                                              p_mask=tf.expand_dims(n_1, -1),
                                              scope="{}_layer_self_att_enc_e".format(i))  # [N, len, dim]
                    tf.get_variable_scope().reuse_variables()
                    h2 = self_attention_layer(config, self.is_train, h2,
                                              p_mask=tf.expand_dims(n_2, -1),
                                              scope="{}_layer_self_att_enc_e".format(i))
                    tf.get_variable_scope().reuse_variables()
                    h3 = self_attention_layer(config, self.is_train, h3,
                                              p_mask=tf.expand_dims(n_3, -1),
                                              scope="{}_layer_self_att_enc_e".format(i))
                    tf.get_variable_scope().reuse_variables()
                    h4 = self_attention_layer(config, self.is_train, h4,
                                              p_mask=tf.expand_dims(n_4, -1),
                                              scope="{}_layer_self_att_enc_e".format(i))
                with tf.variable_scope("self-attention"):
                    u = self_attention_layer(config, self.is_train, u,
                                             p_mask=tf.expand_dims(self.q_mask, -1),
                                             scope="{}_layer_self_att_enc_p".format(i))

    if config.plot_encoder == "concate":
        h = tf.concat([h1, h2, h3, h4], axis=1)
        print("h concate shape {}".format(h.get_shape()))
        n_n = tf.concat([n_1, n_2, n_3, n_4], axis=1)
    elif config.plot_encoder == "sum":
        h1 = tf.expand_dims(h1, axis=1)
        h2 = tf.expand_dims(h2, axis=1)
        h3 = tf.expand_dims(h3, axis=1)
        h4 = tf.expand_dims(h4, axis=1)
        h = tf.concat([h1, h2, h3, h4], axis=1)
        h = tf.reduce_sum(h, axis=1)
        print("h sum shape {}".format(h.get_shape()))
    elif config.plot_encoder == "lstm":
        # h1 = tf.reduce_sum(h1, axis=1)
        h1 = tf.expand_dims(tf.reduce_sum(h1, axis=-1), axis=1)
        h2 = tf.expand_dims(tf.reduce_sum(h2, axis=-1), axis=1)
        h3 = tf.expand_dims(tf.reduce_sum(h3, axis=-1), axis=1)
        h4 = tf.expand_dims(tf.reduce_sum(h4, axis=-1), axis=1)
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw2, d_cell_bw2, tf.concat([h1, h2, h3, h4], axis=1),
            dtype='float', scope='1')  # [N, J, d], [N, d]
        print('fw_u_f shape: {}'.format(fw_u_f.get_shape()))
        h = tf.concat(axis=2, values=[fw_u, bw_u])  # [N, JQ, 2d]
        u = tf.expand_dims(tf.reduce_sum(u, axis=-1), axis=1)
        tf.get_variable_scope().reuse_variables()
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(
            d_cell_fw2, d_cell_bw2, tf.concat([u], axis=1),
            dtype='float', scope='1')  # [N, J, d], [N, d]
        print('fw_u_f shape: {}'.format(fw_u_f.get_shape()))
        u = tf.concat(axis=2, values=[fw_u, bw_u])  # [N, JQ, 2d]

    if config.interact:
        with tf.variable_scope("interact"):

            def get_attention(h, u, m):
                JX = tf.shape(h)[1]
                JQ = tf.shape(u)[1]
                h = tf.expand_dims(h, 2)
                u = tf.expand_dims(u, 1)
                h = tf.tile(h, [1, 1, JQ, 1])
                u = tf.tile(u, [1, JX, 1, 1])
                attention = h * u  # [N, JX, JQ, 2d]
                return attention

            if config.plot_encoder == "concate":
                attention = get_attention(h, u, M)
            else:
                attention = get_attention(h, u, 1)
        with tf.variable_scope('conv_dense'):
            if config.plot_encoder == "concate":
                out_final = dense_net(config, attention, self.is_train)
            else:
                out_final = tf.reshape(attention, shape=[N, -1])
    else:
        h = tf.reshape(h, [-1, M * 2 * d * JX])
        print("h shape {}".format(h.get_shape()))
        u = tf.reshape(u, [-1, 2 * d * JQ])
        print("U shape {}".format(u.get_shape()))
        attention = tf.concat([h, u], axis=-1)
        out_final = attention
        out_final = linear(tf.concat([attention], axis=-1), 1000, True, bias_start=0.0,
                           scope="logit8", squeeze=False, wd=config.wd,
                           input_keep_prob=config.output_keep_pro, is_train=self.is_train)
        out_final = tf.nn.relu(out_final)
        out_final = linear(tf.concat([out_final], axis=-1), 400, True, bias_start=0.0,
                           scope="logit9", squeeze=False, wd=config.wd,
                           input_keep_prob=config.output_keep_pro, is_train=self.is_train)
        out_final = tf.nn.relu(out_final)
        out_final = linear(out_final, 300, True, bias_start=0.0, scope="logit3",
                           squeeze=False, wd=config.wd,
                           input_keep_prob=config.output_keep_pro, is_train=self.is_train)
        out_final = tf.nn.relu(out_final)

    with tf.variable_scope('conv_dense'):
        if config.hao:
            out_final = linear(tf.concat([out_final, self.haoruopeng_feature], axis=-1),
                               200, True, bias_start=0.0, scope="logit", squeeze=False,
                               wd=config.wd, input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = tf.nn.relu(out_final)
            out_final = linear(out_final, 100, True, bias_start=0.0, scope="logit3",
                               squeeze=False, wd=config.wd,
                               input_keep_prob=config.output_keep_pro, is_train=self.is_train)
            out_final = tf.nn.relu(out_final)
        else:
            out_final = linear(tf.concat([out_final], axis=-1), 200, True, bias_start=0.0,
                               scope="logit", squeeze=False, wd=config.wd,
                               input_keep_prob=config.output_keep_pro, is_train=self.is_train)
            out_final = linear(out_final, 100, True, bias_start=0.0, scope="logit3",
                               squeeze=False, wd=config.wd,
                               input_keep_prob=config.output_keep_pro, is_train=self.is_train)
            out_final = tf.nn.relu(out_final)

    self.tensor_dict['outfinal'] = out_final
    self.prediction = linear(tf.concat([out_final], axis=-1), 1, True, bias_start=0.0,
                             scope="logit2", squeeze=False, wd=config.wd,
                             input_keep_prob=config.output_keep_pro, is_train=self.is_train)
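# --- Sketch (illustration only): the get_attention interaction above builds a
#     [N, JX, JQ, d] tensor of elementwise products via tiling; NumPy
#     broadcasting does the same without explicit tiles.
import numpy as np

N, JX, JQ, d = 2, 3, 4, 5
rng = np.random.RandomState(0)
h = rng.randn(N, JX, d)
u = rng.randn(N, JQ, d)
attention = h[:, :, None, :] * u[:, None, :, :]  # [N, JX, JQ, d]
print(attention.shape)  # (2, 3, 4, 5)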
def _build_forward(self):
    config = self.config
    N, M, JX, JQ, VW, VC, d, dc, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.char_emb_size, config.max_word_size
    H = config.max_tree_height

    x_mask = self.x > 0
    q_mask = self.q > 0
    tx_mask = self.tx > 0  # [N, M, H, JX]

    with tf.variable_scope("char_emb"):
        char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
        Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
        Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
        filter = tf.get_variable("filter", shape=[1, config.char_filter_height, dc, d], dtype='float')
        bias = tf.get_variable("bias", shape=[d], dtype='float')
        strides = [1, 1, 1, 1]
        Acx = tf.reshape(Acx, [-1, JX, W, dc])
        Acq = tf.reshape(Acq, [-1, JQ, W, dc])
        xxc = tf.nn.conv2d(Acx, filter, strides, "VALID") + bias  # [N*M, JX, W/filter_stride, d]
        qqc = tf.nn.conv2d(Acq, filter, strides, "VALID") + bias  # [N, JQ, W/filter_stride, d]
        xxc = tf.reshape(tf.reduce_max(tf.nn.relu(xxc), 2), [-1, M, JX, d])
        qqc = tf.reshape(tf.reduce_max(tf.nn.relu(qqc), 2), [-1, JQ, d])

    with tf.variable_scope("word_emb"):
        if config.mode == 'train':
            word_emb_mat = tf.get_variable("word_emb_mat", dtype='float',
                                           shape=[VW, config.word_emb_size],
                                           initializer=get_initializer(config.emb_mat))
        else:
            word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, config.word_emb_size],
                                           dtype='float')
        Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
        Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
        # Ax = linear([Ax], d, False, scope='Ax_reshape')
        # Aq = linear([Aq], d, False, scope='Aq_reshape')

    xx = tf.concat(3, [xxc, Ax])  # [N, M, JX, 2d]
    qq = tf.concat(2, [qqc, Aq])  # [N, JQ, 2d]
    D = d + config.word_emb_size

    with tf.variable_scope("pos_emb"):
        pos_emb_mat = tf.get_variable("pos_emb_mat", shape=[config.pos_vocab_size, d], dtype='float')
        Atx = tf.nn.embedding_lookup(pos_emb_mat, self.tx)  # [N, M, H, JX, d]

    cell = BasicLSTMCell(D, state_is_tuple=True)
    cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("rnn"):
        (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len,
                                                    dtype='float', scope='start')  # [N, M, JX, 2d]
        tf.get_variable_scope().reuse_variables()
        (fw_us, bw_us), (_, (fw_u, bw_u)) = bidirectional_dynamic_rnn(
            cell, cell, qq, q_len, dtype='float', scope='start')  # [N, J, d], [N, d]
        u = (fw_u + bw_u) / 2.0
        h = (fw_h + bw_h) / 2.0

    with tf.variable_scope("h"):
        no_op_cell = NoOpCell(D)
        tree_rnn_cell = TreeRNNCell(no_op_cell, d, tf.reduce_max)
        initial_state = tf.reshape(h, [N * M * JX, D])  # [N*M*JX, D]
        inputs = tf.concat(4, [Atx, tf.cast(self.tx_edge_mask, 'float')])  # [N, M, H, JX, d+JX]
        inputs = tf.reshape(tf.transpose(inputs, [0, 1, 3, 2, 4]),
                            [N * M * JX, H, d + JX])  # [N*M*JX, H, d+JX]
        length = tf.reshape(tf.reduce_sum(tf.cast(tx_mask, 'int32'), 2), [N * M * JX])
        # length = tf.reshape(tf.reduce_sum(tf.cast(tf.transpose(tx_mask, [0, 1, 3, 2]), 'float'), 3), [-1])
        h, _ = dynamic_rnn(tree_rnn_cell, inputs, length, initial_state=initial_state)  # [N*M*JX, H, D]
        h = tf.transpose(tf.reshape(h, [N, M, JX, H, D]), [0, 1, 3, 2, 4])  # [N, M, H, JX, D]

    u = tf.expand_dims(tf.expand_dims(tf.expand_dims(u, 1), 1), 1)  # [N, 1, 1, 1, 4d]
    dot = linear(h * u, 1, True, squeeze=True, scope='dot')  # [N, M, H, JX]
    # self.logits = tf.reshape(dot, [N, M * H * JX])
    self.logits = tf.reshape(exp_mask(dot, tx_mask), [N, M * H * JX])  # [N, M, H, JX]
    self.yp = tf.reshape(tf.nn.softmax(self.logits), [N, M, H, JX])
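# --- Sketch (illustration only): the final prediction flattens the [M, H, JX]
#     axes, masks invalid tree nodes, and softmaxes over all of them at once.
import numpy as np

N, M, H, JX = 1, 2, 2, 3
rng = np.random.RandomState(0)
dot = rng.randn(N, M, H, JX)
mask = rng.rand(N, M, H, JX) > 0.3
flat = np.where(mask, dot, -1e30).reshape(N, M * H * JX)  # exp_mask + flatten
e = np.exp(flat - flat.max(axis=1, keepdims=True))
yp = (e / e.sum(axis=1, keepdims=True)).reshape(N, M, H, JX)
print(yp.sum())  # ~1.0: one distribution over all candidate nodes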
def initialize(self):
    params = self.params
    placeholders = self.placeholders
    tensors = self.tensors
    variables_dict = self.variables_dict
    self.task = int(params.task)
    self.dstc = self.task % 10 == 6
    self.match = params.use_match
    self.rnn = params.use_rnn
    N, J, Q, M = params.batch_size, params.max_sent_size, params.max_ques_size, params.mem_size
    # params.vocab_size must be a (vocab size, per-answer candidate sizes) pair for this unpacking to work
    V, Alist = params.vocab_size
    d = params.hidden_size
    L = params.mem_num_layers
    att_forget_bias = params.att_forget_bias
    use_vector_gate = params.use_vector_gate
    wd = params.wd
    initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))
    self.ans_dic = {
        1: range(5),
        2: range(5),
        3: [0, 5],
        4: [0, 6, 7],
        5: range(8),
        6: range(11)
    }
    self.num_candidate = Alist[0] + 1
    data_task = self.task % 10 if not self.rnn else self.task
    self.ans = self.ans_dic.get(data_task, [0])
    self.num_ans = len(self.ans)
    if self.rnn and self.task == 3:
        self.num_ans = 3
    elif self.rnn and self.task == 4:
        self.num_ans = 4
    elif self.rnn:
        self.num_ans = 6

    with tf.name_scope("placeholders"):
        x = tf.placeholder('int32', shape=[N, M, J], name='x')
        x_mask = tf.placeholder('bool', shape=[N, M, J], name='x_mask')
        q = tf.placeholder('int32', shape=[N, J], name='q')
        q_mask = tf.placeholder('bool', shape=[N, J], name='q_mask')
        y = tf.placeholder('int32', shape=[N, self.num_ans], name='y')
        y_mask = tf.placeholder('bool', shape=[N, self.num_ans], name='y_mask')
        y_feats = []
        for i in self.ans[1:]:
            A = Alist[0] if self.rnn else Alist[i]
            y_feats.append(tf.placeholder('int32', shape=[N, 2, A], name='y_feat' + str(i)))
        self.y_state_dim = self.num_ans - 2 if self.rnn else self.num_ans - 1
        y_state = tf.placeholder('bool', shape=[N, self.y_state_dim], name='y_state')
        is_train = tf.placeholder('bool', shape=[], name='is_train')
        placeholders['x'] = x
        placeholders['x_mask'] = x_mask
        placeholders['q'] = q
        placeholders['q_mask'] = q_mask
        placeholders['y'] = y
        placeholders['y_mask'] = y_mask
        placeholders['y_feats'] = y_feats
        placeholders['y_state'] = y_state
        placeholders['is_train'] = is_train

    with tf.variable_scope("embedding"):
        A = VariableEmbedder(params, wd=wd, initializer=initializer, name='A')
        Aq = A(q, name='Aq')  # [N, J, d]
        Ax = A(x, name='Ax')  # [N, M, J, d]

    with tf.name_scope("encoding"):
        encoder = PositionEncoder(J, d)
        u = encoder(Aq, q_mask)  # [N, d]
        m = encoder(Ax, x_mask)  # [N, M, d]

    with tf.variable_scope("networks"):
        m_mask = tf.reduce_max(tf.cast(x_mask, 'int64'), 2, name='m_mask')  # [N, M]
        gate_mask = tf.expand_dims(m_mask, -1)
        m_length = tf.reduce_sum(m_mask, 1, name='m_length')  # [N]
        prev_u = tf.tile(tf.expand_dims(u, 1), [1, M, 1])  # [N, M, d]
        reg_layer = VectorReductionLayer(N, M, d) if use_vector_gate else ReductionLayer(N, M, d)
        gate_size = d if use_vector_gate else 1
        h = None  # [N, M, d]
        as_, rfs, rbs = [], [], []
        hs = []
        for layer_idx in range(L):
            with tf.name_scope("layer_{}".format(layer_idx)):
                u_t = tf.tanh(linear([prev_u, m], d, True, wd=wd, scope='u_t'))
                a = tf.cast(gate_mask, 'float') * tf.sigmoid(
                    linear([prev_u * m], gate_size, True,
                           initializer=initializer, wd=wd, scope='a') - att_forget_bias)
                h = reg_layer(u_t, a, 1.0 - a, scope='h')
                if layer_idx + 1 < L:
                    if params.use_reset:
                        rf, rb = tf.split(2, 2, tf.cast(gate_mask, 'float') * tf.sigmoid(
                            linear([prev_u * m], 2 * gate_size, True,
                                   initializer=initializer, wd=wd, scope='r')))
                    else:
                        rf = rb = tf.ones(a.get_shape().as_list())
                    u_t_rev = tf.reverse_sequence(u_t, m_length, 1)
                    a_rev = tf.reverse_sequence(a, m_length, 1)
                    rb_rev = tf.reverse_sequence(rb, m_length, 1)
                    uf = reg_layer(u_t, a * rf, 1.0 - a, scope='uf')
                    ub_rev = reg_layer(u_t_rev, a_rev * rb_rev, 1.0 - a_rev, scope='ub_rev')
                    ub = tf.reverse_sequence(ub_rev, m_length, 1)
                    prev_u = uf + ub
                else:
                    rf = rb = tf.zeros(a.get_shape().as_list())
                rfs.append(rf)
                rbs.append(rb)
                as_.append(a)
                hs.append(h)
                tf.get_variable_scope().reuse_variables()
        h_last = tf.squeeze(tf.slice(h, [0, M - 1, 0], [-1, -1, -1]), [1])  # [N, d]
        hs_last = [tf.squeeze(tf.slice(each, [0, M - 1, 0], [-1, -1, -1]), [1]) for each in hs]
        a = tf.transpose(tf.pack(as_, name='a'), [1, 0, 2, 3])
        rf = tf.transpose(tf.pack(rfs, name='rf'), [1, 0, 2, 3])
        rb = tf.transpose(tf.pack(rbs, name='rb'), [1, 0, 2, 3])
        tensors['a'] = a
        tensors['rf'] = rf
        tensors['rb'] = rb

    with tf.variable_scope("class"):
        class_mode = params.class_mode
        use_class_bias = params.use_class_bias
        logits = []
        # despite the name, this behaves as a dropout *keep* probability (1.0 at test time)
        drop_rate = tf.cond(is_train, lambda: tf.constant(0.5), lambda: tf.constant(1.0))
        if class_mode == 'h':
            if self.rnn:  # rnn decoder
                hiddens = []  # previous hidden vectors
                A = self.num_candidate
                for i in range(self.num_ans):
                    # inverse embedding matrix of answers, [A, A]
                    E_inv = tf.get_variable("E_inv", [A, A],
                                            initializer=tf.constant_initializer(0.0))
                    prev_h = h_last
                    if i == 0:
                        # for the first answer, use a learned initial y
                        prev_y = tf.reshape(tf.tile(tf.get_variable(
                            "Wx", A, initializer=tf.constant_initializer(0.0)), [N]), [N, A])
                    else:
                        # otherwise, look up the previous gold answer in the inverse embedding matrix
                        _prev_y = tf.reshape(tf.gather(tf.transpose(y), i - 1), [N])
                        prev_y = tf.nn.embedding_lookup(E_inv, _prev_y)
                    # prev_h = hiddens[-1]
                    _logit = linear([prev_h], A, use_class_bias, wd=wd, name='0')
                    logit = _logit * prev_y
                    # the original appended an undefined name (`S2`) in both lines below;
                    # the values computed above are the only plausible candidates
                    hiddens.append(prev_h)
                    logits.append(logit)
                    tf.get_variable_scope().reuse_variables()
            else:
                if self.match:
                    # input of softmax when using match
                    all_y_feats = [None] + y_feats
                    all_y_states = [y_state] + [None] * (len(all_y_feats) - 1)
                for i, j in enumerate(self.ans):
                    if self.match:
                        logits.append(linear([h_last], Alist[j], use_class_bias, wd=wd,
                                             name=str(i), feat=all_y_feats[i],
                                             state=all_y_states[i], drop_rate=drop_rate))
                    else:
                        logits.append(linear([h_last], Alist[j], use_class_bias, wd=wd,
                                             name=str(i)))
        elif class_mode == 'uh':
            # NOTE: the original passed `A` (at this point the embedder object) as the output
            # size and assigned a bare tensor to `logits`; the answer vocabulary size and a
            # one-element list are assumed here so the argmax loop below works
            logits = [linear([h_last, u], self.num_candidate, use_class_bias, wd=wd)]
        elif class_mode == 'hs':
            logits = [linear(hs_last, self.num_candidate, use_class_bias, wd=wd)]
        elif class_mode == 'hss':
            logits = [linear(sum(hs_last), self.num_candidate, use_class_bias, wd=wd)]
        else:
            raise Exception("Invalid class mode: {}".format(class_mode))

        for i in range(self.num_ans):
            yp_each = tf.cast(tf.expand_dims(tf.argmax(logits[i], 1), 1), 'int32')
            if i == 0:
                yp = yp_each
            else:
                yp = tf.concat(1, [yp, yp_each])
        correct_ = tf.cast(tf.equal(yp, y), 'float')
        correct_sum = tf.reduce_sum(correct_ * tf.cast(y_mask, 'float'), 1)
        mask_ = tf.reduce_sum(tf.cast(y_mask, 'float'), 1)
        correct = tf.truediv(correct_sum, mask_)
        tensors['yp'] = yp
        tensors['correct_'] = correct_
        tensors['mask_'] = mask_
        tensors['y_mask'] = y_mask
        tensors['y'] = y
        tensors['correct'] = correct
        tensors['q'] = q
        if self.task > 20:
            tensors['y_state'] = y_state
            for i, j in enumerate(self.ans[1:]):
                # flattens [N, 2, A] to [N, 2*A]; the original referenced an
                # undefined `self.ans_num[j]` for the last dimension
                tensors['y_feat' + str(i)] = tf.reshape(y_feats[i], [N, -1])

    with tf.name_scope("loss"):
        with tf.name_scope("ans_loss"):
            tot_ce = 0
            for i in range(self.num_ans):
                _y = tf.gather(tf.transpose(y), i)
                ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits[i], _y)
                m = tf.cast(tf.gather(tf.transpose(y_mask), i), 'float32')
                tot_ce += tf.reduce_sum(ce * m, name='avg_ce')  # masked sum, despite the op name
            tf.add_to_collection('losses', tot_ce)
        losses = tf.get_collection('losses')
        loss = tf.add_n(losses, name='loss')
        tensors['loss'] = loss

    variables_dict['all'] = tf.trainable_variables()
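The layer loop above hinges on ReductionLayer / VectorReductionLayer, which are defined elsewhere in the repo. A minimal NumPy sketch of the recurrence they are assumed to compute, a standard gated scan over the M memory slots (scalar gate per slot in the reduction case, d-dimensional gate in the vector case); `reduction_scan` and its argument names are illustrative only:

import numpy as np

def reduction_scan(u_t, a, b):
    """Sketch of the assumed reduction-layer recurrence.

    u_t: [N, M, d] candidate updates; a: gates broadcastable to [N, M, d]
    (scalar gate -> [N, M, 1], vector gate -> [N, M, d]); b: carry gates
    (the model passes b = 1 - a). Returns h of shape [N, M, d] with
    h[:, t] = a[:, t] * u_t[:, t] + b[:, t] * h[:, t-1], starting from zeros.
    """
    N, M, d = u_t.shape
    h = np.zeros_like(u_t)
    prev = np.zeros((N, d), dtype=u_t.dtype)
    for t in range(M):
        prev = a[:, t] * u_t[:, t] + b[:, t] * prev  # gated update of the running state
        h[:, t] = prev
    return h

With b = 1 - a this reduces to the familiar leaky-integration form h_t = a_t * u_t + (1 - a_t) * h_{t-1}, which is also why the forward/backward passes above can share the same layer with reversed sequences.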
def _build_forward(self):
    config = self.config
    N, M, JX, JQ, VW, VC, d, dc, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.char_emb_size, config.max_word_size
    H = config.max_tree_height

    x_mask = self.x > 0
    q_mask = self.q > 0
    tx_mask = self.tx > 0  # [N, M, H, JX]

    with tf.variable_scope("char_emb"):
        char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
        Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
        Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
        filter = tf.get_variable("filter", shape=[1, config.char_filter_height, dc, d], dtype='float')
        bias = tf.get_variable("bias", shape=[d], dtype='float')
        strides = [1, 1, 1, 1]
        Acx = tf.reshape(Acx, [-1, JX, W, dc])
        Acq = tf.reshape(Acq, [-1, JQ, W, dc])
        xxc = tf.nn.conv2d(Acx, filter, strides, "VALID") + bias  # [N*M, JX, W/filter_stride, d]
        qqc = tf.nn.conv2d(Acq, filter, strides, "VALID") + bias  # [N, JQ, W/filter_stride, d]
        xxc = tf.reshape(tf.reduce_max(tf.nn.relu(xxc), 2), [-1, M, JX, d])
        qqc = tf.reshape(tf.reduce_max(tf.nn.relu(qqc), 2), [-1, JQ, d])

    with tf.variable_scope("word_emb"):
        if config.mode == 'train':
            word_emb_mat = tf.get_variable("word_emb_mat", dtype='float',
                                           shape=[VW, config.word_emb_size],
                                           initializer=get_initializer(config.emb_mat))
        else:
            word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, config.word_emb_size], dtype='float')
        Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, word_emb_size]
        Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, word_emb_size]
        # Ax = linear([Ax], d, False, scope='Ax_reshape')
        # Aq = linear([Aq], d, False, scope='Aq_reshape')

    xx = tf.concat(3, [xxc, Ax])  # [N, M, JX, D]
    qq = tf.concat(2, [qqc, Aq])  # [N, JQ, D]
    D = d + config.word_emb_size

    with tf.variable_scope("pos_emb"):
        pos_emb_mat = tf.get_variable("pos_emb_mat", shape=[config.pos_vocab_size, d], dtype='float')
        Atx = tf.nn.embedding_lookup(pos_emb_mat, self.tx)  # [N, M, H, JX, d]

    cell = BasicLSTMCell(D, state_is_tuple=True)
    cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
    x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("rnn"):
        (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
            cell, cell, xx, x_len, dtype='float', scope='start')  # [N, M, JX, D]
        tf.get_variable_scope().reuse_variables()
        (fw_us, bw_us), (_, (fw_u, bw_u)) = bidirectional_dynamic_rnn(
            cell, cell, qq, q_len, dtype='float', scope='start')  # [N, JQ, D], [N, D]
        u = (fw_u + bw_u) / 2.0
        h = (fw_h + bw_h) / 2.0

    with tf.variable_scope("h"):
        no_op_cell = NoOpCell(D)
        tree_rnn_cell = TreeRNNCell(no_op_cell, d, tf.reduce_max)
        initial_state = tf.reshape(h, [N * M * JX, D])  # [N*M*JX, D]
        inputs = tf.concat(4, [Atx, tf.cast(self.tx_edge_mask, 'float')])  # [N, M, H, JX, d+JX]
        inputs = tf.reshape(tf.transpose(inputs, [0, 1, 3, 2, 4]), [N * M * JX, H, d + JX])  # [N*M*JX, H, d+JX]
        length = tf.reshape(tf.reduce_sum(tf.cast(tx_mask, 'int32'), 2), [N * M * JX])
        # length = tf.reshape(tf.reduce_sum(tf.cast(tf.transpose(tx_mask, [0, 1, 3, 2]), 'float'), 3), [-1])
        h, _ = dynamic_rnn(tree_rnn_cell, inputs, length, initial_state=initial_state)  # [N*M*JX, H, D]
        h = tf.transpose(tf.reshape(h, [N, M, JX, H, D]), [0, 1, 3, 2, 4])  # [N, M, H, JX, D]

    u = tf.expand_dims(tf.expand_dims(tf.expand_dims(u, 1), 1), 1)  # [N, 1, 1, 1, D]
    dot = linear(h * u, 1, True, squeeze=True, scope='dot')  # [N, M, H, JX]
    # the original first assigned an unmasked reshape of `dot` and immediately overwrote it;
    # only the masked version is kept
    self.logits = tf.reshape(exp_mask(dot, tx_mask), [N, M * H * JX])
    self.yp = tf.reshape(tf.nn.softmax(self.logits), [N, M, H, JX])
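exp_mask, applied to the tree logits above (and to the question mask in the attention cell earlier), is assumed to be the usual masking helper that drives padded positions toward negative infinity before a softmax. A minimal sketch, with VERY_NEGATIVE_NUMBER as an assumed module-level constant:

import tensorflow as tf

VERY_NEGATIVE_NUMBER = -1e30  # assumed constant; large enough to zero out softmax mass

def exp_mask(val, mask):
    # Adds a huge negative number wherever mask is False, so that a subsequent
    # softmax assigns those positions effectively zero probability.
    return val + (1.0 - tf.cast(mask, 'float')) * VERY_NEGATIVE_NUMBER

Adding a large negative constant, rather than multiplying by the mask, keeps the masked logits finite and the gradient well-defined at the unmasked positions.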
def initialize(self):
    params = self.params
    placeholders = self.placeholders
    tensors = self.tensors
    variables_dict = self.variables_dict
    N, J, V, Q, M = params.batch_size, params.max_sent_size, params.vocab_size, params.max_ques_size, params.mem_size
    d = params.hidden_size
    L = params.mem_num_layers
    forget_bias = params.forget_bias
    wd = params.wd
    initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))

    with tf.name_scope("placeholders"):
        x = tf.placeholder('int32', shape=[N, M, J], name='x')
        x_mask = tf.placeholder('bool', shape=[N, M, J], name='x_mask')
        q = tf.placeholder('int32', shape=[N, J], name='q')
        q_mask = tf.placeholder('bool', shape=[N, J], name='q_mask')
        y = tf.placeholder('int32', shape=[N], name='y')
        is_train = tf.placeholder('bool', shape=[], name='is_train')
        placeholders['x'] = x
        placeholders['x_mask'] = x_mask
        placeholders['q'] = q
        placeholders['q_mask'] = q_mask
        placeholders['y'] = y
        placeholders['is_train'] = is_train

    with tf.variable_scope("embedding"):
        A = VariableEmbedder(params, wd=wd, initializer=initializer, name='A')
        Aq = A(q, name='Aq')  # [N, J, d]
        Ax = A(x, name='Ax')  # [N, M, J, d]

    with tf.name_scope("encoding"):
        encoder = PositionEncoder(J, d)
        u = encoder(Aq, q_mask)  # [N, d]
        m = encoder(Ax, x_mask)  # [N, M, d]

    with tf.variable_scope("networks"):
        m_mask = tf.reduce_max(tf.cast(x_mask, 'int64'), 2, name='m_mask')  # [N, M]
        m_length = tf.reduce_sum(m_mask, 1, name='m_length')  # [N]
        initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))
        cell = RSMCell(d, forget_bias=forget_bias, wd=wd, initializer=initializer)
        us = tf.tile(tf.expand_dims(u, 1, name='u_prev_aug'), [1, M, 1])  # [N, d] -> [N, M, d]
        in_ = tf.concat(2, [tf.ones([N, M, 1]), m, us, tf.zeros([N, M, 2 * d])],
                        name='x_h_in')  # [N, M, 4*d + 1]
        out, fw_state, bw_state, bi_tensors = dynamic_bidirectional_rnn(
            cell, in_, sequence_length=m_length, dtype='float', num_layers=L)
        a = tf.slice(out, [0, 0, 0], [-1, -1, 1])  # [N, M, 1]
        _, _, v, g = tf.split(2, 4, tf.slice(out, [0, 0, 1], [-1, -1, -1]))
        fw_h, fw_v = tf.split(1, 2, tf.slice(fw_state, [0, 1], [-1, -1]))
        bw_h, bw_v = tf.split(1, 2, tf.slice(bw_state, [0, 1], [-1, -1]))
        _, fw_u_out, fw_v_out, _ = tf.split(2, 4, tf.squeeze(
            tf.slice(bi_tensors['fw_out'], [0, L - 1, 0, 2], [-1, -1, -1, -1]), [1]))
        _, bw_u_out, bw_v_out, _ = tf.split(2, 4, tf.squeeze(
            tf.slice(bi_tensors['bw_out'], [0, L - 1, 0, 2], [-1, -1, -1, -1]), [1]))
        tensors['a'] = tf.squeeze(tf.slice(bi_tensors['in'], [0, 0, 0, 0], [-1, -1, -1, 1]), [3])
        tensors['of'] = tf.squeeze(tf.slice(bi_tensors['fw_out'], [0, 0, 0, 1], [-1, -1, -1, 1]), [3])
        tensors['ob'] = tf.squeeze(tf.slice(bi_tensors['bw_out'], [0, 0, 0, 1], [-1, -1, -1, 1]), [3])

    with tf.variable_scope("selection"):
        # w = tf.nn.relu(linear([fw_v + 1e-9*(fw_h+bw_h)], d, True, wd=wd))
        w = fw_v + 1e-9 * (fw_h + bw_h)
        tensors['s'] = a

    with tf.variable_scope("class"):
        if params.use_ques:
            logits = linear([w, u], V, True, wd=wd)
        else:
            # W = tf.transpose(A.emb_mat, name='W')
            W = tf.get_variable('W', shape=[d, V])
            logits = tf.matmul(w, W, name='logits')
        yp = tf.cast(tf.argmax(logits, 1), 'int32')
        correct = tf.equal(yp, y)
        tensors['yp'] = yp
        tensors['correct'] = correct

    with tf.name_scope("loss"):
        with tf.name_scope("ans_loss"):
            ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y, name='ce')
            avg_ce = tf.reduce_mean(ce, name='avg_ce')
            tf.add_to_collection('losses', avg_ce)
        losses = tf.get_collection('losses')
        loss = tf.add_n(losses, name='loss')
        tensors['loss'] = loss

    variables_dict['all'] = tf.trainable_variables()
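Both initialize methods rely on PositionEncoder(J, d) to collapse a [..., J, d] block of word embeddings into a single [..., d] sentence vector. A sketch under the assumption that it applies the position-encoding weights of end-to-end memory networks, l[j, k] = (1 - j/J) - (k/d)(1 - 2j/J) with one-indexed j, k; the real class is defined elsewhere in the repo:

import numpy as np
import tensorflow as tf

class PositionEncoder(object):
    """Sketch of a position encoder (assumed weighting; not the repo's class).

    Builds a fixed [J, d] weight matrix and returns the masked, weighted sum
    of word embeddings over the sentence axis.
    """

    def __init__(self, max_sent_size, hidden_size):
        J, d = max_sent_size, hidden_size
        j = np.arange(1, J + 1).reshape(J, 1)  # word positions, one-indexed
        k = np.arange(1, d + 1).reshape(1, d)  # embedding dimensions, one-indexed
        self._l = ((1.0 - j / J) - (k / d) * (1.0 - 2.0 * j / J)).astype('float32')  # [J, d]

    def __call__(self, embeddings, mask):
        # embeddings: [..., J, d], mask: [..., J] -> [..., d]
        f_mask = tf.expand_dims(tf.cast(mask, 'float'), -1)  # [..., J, 1]
        sent_axis = len(embeddings.get_shape()) - 2  # the J axis
        return tf.reduce_sum(embeddings * self._l * f_mask, sent_axis)

Unlike a plain bag of words, the position-dependent weights let the encoder distinguish word order within a sentence while still producing a fixed-size vector per sentence or memory slot.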