def inference(self, X, length, reuse=False):
    """Build the bidirectional LSTM tagger graph and return unary tag scores.

    Two independent LSTMs are run: one over ``X`` as given and one over the
    length-aware reversal of ``X``. The backward outputs are reversed back
    before both directions are concatenated and projected with
    ``self.W`` / ``self.b``.

    Args:
        X: Input tensor with the sequence on axis 1 (presumably
            [batch, self.max_seq_len, features] -- TODO confirm with caller).
        length: Per-example sequence lengths.
        reuse: Variable-reuse flag for the "bilstm" scope and both cells.
            When falsy, dropout with keep probability 0.5 is applied, so the
            flag doubles as the train/eval switch.

    Returns:
        Scores reshaped to [-1, self.max_seq_len, self.num_tags].
    """
    # tf.reverse_sequence is fed int64 lengths here.
    seq_len_i64 = tf.cast(length, tf.int64)

    with tf.variable_scope("bilstm", reuse=reuse):
        fw_cell = tf.contrib.rnn.LSTMCell(self.num_hidden, reuse=reuse)
        fw_out, _ = tf.nn.dynamic_rnn(
            fw_cell,
            X,
            dtype=tf.float32,
            sequence_length=length,
            scope="RNN_forward")

        bw_cell = tf.contrib.rnn.LSTMCell(self.num_hidden, reuse=reuse)
        reversed_inputs = tf.reverse_sequence(X, seq_len_i64, seq_dim=1)
        bw_out_reversed, _ = tf.nn.dynamic_rnn(
            bw_cell,
            inputs=reversed_inputs,
            dtype=tf.float32,
            sequence_length=length,
            scope="RNN_backword")

    # Undo the reversal so both directions are aligned per time step.
    bw_out = tf.reverse_sequence(bw_out_reversed, seq_len_i64, seq_dim=1)

    features = tf.concat([fw_out, bw_out], 2)
    features = tf.reshape(features, [-1, self.num_hidden * 2])
    if not reuse:
        features = tf.nn.dropout(features, 0.5)

    flat_scores = tf.matmul(features, self.W) + self.b
    # The op only gets a fixed name when variables are being reused.
    reshape_name = "Reshape_7" if reuse else None
    return tf.reshape(
        flat_scores,
        [-1, self.max_seq_len, self.num_tags],
        name=reshape_name)
def _composition_function(self, inputs, length, init_state=None):
    """Run the configured recurrent composition over time-major inputs.

    Args:
        inputs: Time-major inputs handed to ``dynamic_rnn``
            (``time_major=True``).
        length: Per-example sequence lengths.
        init_state: Optional initial hidden state tensor. For "LSTM" it is
            used as the hidden half of the state and paired with a zero cell
            state; for "BiGRU" it is split in two along axis 1 for the
            forward and backward cells.

    Returns:
        The outputs (element ``[0]``) of ``dynamic_rnn`` for "GRU" and
        "LSTM"; for "BiGRU", a list of per-step concatenations of forward
        and backward outputs.

    Raises:
        NotImplementedError: If ``self._composition`` is not one of
            "GRU", "LSTM" or "BiGRU".
    """
    if self._composition == "GRU":
        cell = GRUCell(self._size)
        return dynamic_rnn(cell, inputs, sequence_length=length, time_major=True,
                           initial_state=init_state, dtype=tf.float32)[0]
    elif self._composition == "LSTM":
        cell = BasicLSTMCell(self._size)
        # Compare against None explicitly: evaluating a tf.Tensor as a Python
        # bool raises TypeError in graph mode.
        if init_state is not None:
            init_state = tf.concat(1, [tf.zeros_like(init_state, tf.float32), init_state])
        outs = dynamic_rnn(cell, inputs, sequence_length=length, time_major=True,
                           initial_state=init_state, dtype=tf.float32)[0]
        return outs
    elif self._composition == "BiGRU":
        cell = GRUCell(self._size // 2, self._size)
        if init_state is not None:
            init_state_fw, init_state_bw = tf.split(1, 2, init_state)
        else:
            init_state_fw, init_state_bw = None, None
        # NOTE(review): this branch treats `inputs` as a Python list of
        # per-step tensors (len(inputs), tf.pack) -- confirm callers pass a
        # list here rather than a single time-major tensor.
        with tf.variable_scope("forward"):
            fw_outs = dynamic_rnn(cell, inputs, sequence_length=length, time_major=True,
                                  initial_state=init_state_fw, dtype=tf.float32)[0]
        with tf.variable_scope("backward"):
            rev_inputs = tf.reverse_sequence(tf.pack(inputs), length, 0, 1)
            rev_inputs = [tf.reshape(x, [-1, self._size])
                          for x in tf.split(0, len(inputs), rev_inputs)]
            bw_outs = dynamic_rnn(cell, rev_inputs, sequence_length=length, time_major=True,
                                  initial_state=init_state_bw, dtype=tf.float32)[0]
            # Reverse back so backward outputs line up with forward steps.
            bw_outs = tf.reverse_sequence(tf.pack(bw_outs), length, 0, 1)
            bw_outs = [tf.reshape(x, [-1, self._size])
                       for x in tf.split(0, len(inputs), bw_outs)]
        return [tf.concat(1, [fw_out, bw_out]) for fw_out, bw_out in zip(fw_outs, bw_outs)]
    else:
        raise NotImplementedError("Other compositions not implemented yet.")
def __call__(self, inputs, seq_len, keep_prob=1.0, is_train=None, concat_layers=True):
    """Apply the stacked bidirectional GRU layers to ``inputs``.

    The input is transposed with perm [1, 0, 2] before the layers run and the
    result is transposed back the same way, so the RNNs operate time-major.

    Args:
        inputs: Rank-3 input tensor (presumably [batch, time, features] --
            TODO confirm).
        seq_len: Per-example lengths used to reverse the backward direction.
        keep_prob: Not read by this method.
        is_train: Not read by this method.
        concat_layers: If true, concatenate the outputs of every layer on the
            feature axis; otherwise return only the last layer's output.

    Returns:
        The layer output(s), transposed back with perm [1, 0, 2].
    """
    # Index 0 holds the (transposed) network input; layer i output is i + 1.
    layer_outputs = [tf.transpose(inputs, [1, 0, 2])]

    for layer in range(self.num_layers):
        gru_fw, gru_bw = self.grus[layer]
        init_fw, init_bw = self.inits[layer]
        mask_fw, mask_bw = self.dropout_mask[layer]
        layer_input = layer_outputs[-1]

        with tf.variable_scope('fw_{}'.format(layer), reuse=tf.AUTO_REUSE), \
                tf.variable_scope('cudnn_gru', reuse=tf.AUTO_REUSE):
            out_fw, _ = tf.nn.dynamic_rnn(
                cell=gru_fw,
                inputs=layer_input * mask_fw,
                time_major=True,
                initial_state=tuple(tf.unstack(init_fw, axis=0)))

        with tf.variable_scope('bw_{}'.format(layer), reuse=tf.AUTO_REUSE), \
                tf.variable_scope('cudnn_gru', reuse=tf.AUTO_REUSE):
            reversed_input = tf.reverse_sequence(
                layer_input * mask_bw,
                seq_lengths=seq_len, seq_dim=0, batch_dim=1)
            out_bw, _ = tf.nn.dynamic_rnn(
                cell=gru_bw,
                inputs=reversed_input,
                time_major=True,
                initial_state=tuple(tf.unstack(init_bw, axis=0)))
            # Restore original time order for the backward outputs.
            out_bw = tf.reverse_sequence(
                out_bw, seq_lengths=seq_len, seq_dim=0, batch_dim=1)

        layer_outputs.append(tf.concat([out_fw, out_bw], axis=2))

    combined = (tf.concat(layer_outputs[1:], axis=2)
                if concat_layers else layer_outputs[-1])
    return tf.transpose(combined, [1, 0, 2])
def set_observations(self, observations, seq_lengths):
    """Store the model's observations and precompute derived tensors.

    Inputs, targets and encoded targets are wrapped in TensorArrays. When a
    reverse RNN cell is configured, the encoded targets are additionally run
    through it in reversed order and the (re-reversed) outputs are stored.

    Args:
      observations: A tuple ``(inputs, targets)`` of two Tensors of shape
        [max_seq_len, batch_size, data_size].
      seq_lengths: An int Tensor of shape [batch_size] containing the length
        of each sequence in observations.
    """
    inputs, targets = observations
    self.seq_lengths = seq_lengths
    self.max_seq_len = tf.reduce_max(seq_lengths)

    self.inputs_ta = base.ta_for_tensor(inputs, clear_after_read=False)
    self.targets_ta = base.ta_for_tensor(targets, clear_after_read=False)

    encoded_targets = base.encode_all(targets, self.data_encoder)
    self.targets_encoded_ta = base.ta_for_tensor(encoded_targets,
                                                 clear_after_read=False)

    # Nothing more to precompute without a reverse RNN.
    if not self.rev_rnn_cell:
        return

    encoded_targets_rev = tf.reverse_sequence(
        encoded_targets, seq_lengths, seq_axis=0, batch_axis=1)
    # Run the reverse RNN over the reversed targets (time-major).
    rev_rnn_out, _ = tf.nn.dynamic_rnn(self.rev_rnn_cell,
                                       encoded_targets_rev,
                                       time_major=True,
                                       dtype=tf.float32)
    # Flip the outputs back so index t corresponds to original time step t.
    rev_rnn_out = tf.reverse_sequence(rev_rnn_out, seq_lengths,
                                      seq_axis=0, batch_axis=1)
    self.reverse_rnn_ta = base.ta_for_tensor(rev_rnn_out,
                                             clear_after_read=False)
def build(self):
    """Assemble the hierarchical char->word encoder / attention decoder graph.

    Creates the source and target embedding variables and the output
    projection, encodes characters, gathers per-word states, encodes words in
    both directions, decodes with attention, and stores the projected
    training and validation outputs on ``self``.
    """
    print('Building model')

    # Embedding tables for source and target alphabets.
    self.x_embeddings = tf.Variable(
        tf.random_normal([self.alphabet_src_size, self.embedd_dims], stddev=0.1),
        name='x_embeddings')
    self.t_embeddings = tf.Variable(
        tf.random_normal([self.alphabet_tar_size, self.embedd_dims], stddev=0.1),
        name='t_embeddings')

    src_embedded = tf.gather(self.x_embeddings, self.Xs, name='embed_X')
    tgt_embedded = tf.gather(self.t_embeddings, self.ts_go, name='embed_t')

    # Decoder states are twice the word-encoder width because the forward and
    # backward word encodings are concatenated below.
    dec_width = self.word_encoder_units*2

    with tf.variable_scope('dense_out'):
        W_out = tf.get_variable('W_out', [dec_width, self.alphabet_tar_size])
        b_out = tf.get_variable('b_out', [self.alphabet_tar_size])

    # forward encoding
    char_enc_state, char_enc_out = encoder(
        src_embedded, self.X_len, 'char_encoder', self.char_encoder_units)
    word_inputs = _grid_gather(char_enc_out, self.X_spaces)
    word_inputs.set_shape([None, None, self.char_encoder_units])
    word_enc_state, word_enc_out = encoder(
        word_inputs, self.X_spaces_len, 'word_encoder', self.word_encoder_units)

    # backward encoding words
    word_inputs_rev = tf.reverse_sequence(
        word_inputs, tf.to_int64(self.X_spaces_len), 1)
    word_inputs_rev.set_shape([None, None, self.char_encoder_units])
    word_enc_state_bck, word_enc_out_bck = encoder(
        word_inputs_rev, self.X_spaces_len, 'word_encoder_backwards',
        self.word_encoder_units)
    word_enc_out_bck = tf.reverse_sequence(
        word_enc_out_bck, tf.to_int64(self.X_spaces_len), 1)

    word_enc_state = tf.concat(1, [word_enc_state, word_enc_state_bck])
    word_enc_out = tf.concat(2, [word_enc_out, word_enc_out_bck])

    # decoding
    dec_state, dec_out, valid_dec_out, valid_attention_tracker = attention_decoder(
        word_enc_out, self.X_spaces_len, word_enc_state, tgt_embedded,
        self.t_len, self.attn_units, self.t_embeddings, W_out, b_out)

    flat_logits = tf.reshape(dec_out, [-1, dec_width])
    flat_logits = tf.matmul(flat_logits, W_out) + b_out

    # Dynamic output shape: one entry each from X_len's leading dimension,
    # tgt_embedded's second dimension, and the target alphabet size.
    batch_dim = tf.expand_dims(tf.shape(self.X_len)[0], 0)
    time_dim = tf.expand_dims(tf.shape(tgt_embedded)[1], 0)
    vocab_dim = tf.expand_dims(tf.constant(self.alphabet_tar_size), 0)
    out_shape = tf.concat(0, [batch_dim, time_dim, vocab_dim])

    self.valid_attention_tracker = valid_attention_tracker.pack()

    self.out_tensor = tf.reshape(flat_logits, out_shape)
    self.out_tensor.set_shape([None, None, self.alphabet_tar_size])

    valid_flat_logits = tf.reshape(valid_dec_out, [-1, dec_width])
    valid_flat_logits = tf.matmul(valid_flat_logits, W_out) + b_out
    self.valid_out_tensor = tf.reshape(valid_flat_logits, out_shape)

    self.out = None

    # add TensorBoard summaries for all variables
    tf.contrib.layers.summarize_variables()
def inference(self, wX, cX, reuse=None, trainMode=True):
    """Build the word+char BiLSTM tagger graph.

    Word embeddings are concatenated with per-word character-convolution
    features, fed through a forward and a backward LSTM, and projected to
    per-token tag scores.

    Args:
        wX: Word-id tensor; also passed to ``self.length`` to obtain the
            sequence lengths.
        cX: Character-id tensor, reshaped to
            [-1, max_sentence_len, max_chars_per_word, embedding_char_size]
            after lookup.
        reuse: Ignored -- the value is overwritten from ``trainMode`` before
            it is used.
        trainMode: When true, variables are created (reuse=None) and dropout
            with keep probability 0.5 is applied; otherwise variables are
            reused and no dropout is applied.

    Returns:
        A tuple ``(unary_scores, length)`` where ``unary_scores`` is reshaped
        to [-1, FLAGS.max_sentence_len, self.distinctTagNum].
    """
    word_emb = tf.nn.embedding_lookup(self.words, wX)

    char_emb = tf.nn.embedding_lookup(self.chars, cX)
    char_emb = tf.reshape(
        char_emb,
        [-1, FLAGS.max_sentence_len, FLAGS.max_chars_per_word,
         FLAGS.embedding_char_size])
    # Put the sentence-position axis first so map_fn iterates over positions.
    char_emb = tf.transpose(char_emb, perm=[1, 0, 2, 3])
    char_emb = tf.expand_dims(char_emb, -1)

    length = self.length(wX)
    length_64 = tf.cast(length, tf.int64)

    # Character convolution applied per sentence position.
    char_features = tf.map_fn(self.char_convolution, char_emb)
    char_features = tf.transpose(char_features, perm=[1, 0, 2])

    token_inputs = tf.concat([word_emb, char_features], axis=2)

    # The caller-supplied `reuse` is replaced here; only trainMode matters.
    reuse = None if trainMode else True
    with tf.variable_scope("rnn_fwbw", reuse=reuse):
        fw_cell = tf.contrib.rnn.LSTMCell(self.numHidden, reuse=reuse)
        fw_out, _ = tf.nn.dynamic_rnn(
            fw_cell,
            token_inputs,
            dtype=tf.float32,
            sequence_length=length,
            scope="RNN_forward")

        bw_cell = tf.contrib.rnn.LSTMCell(self.numHidden, reuse=reuse)
        reversed_inputs = tf.reverse_sequence(token_inputs, length_64, seq_dim=1)
        bw_out_reversed, _ = tf.nn.dynamic_rnn(
            bw_cell,
            inputs=reversed_inputs,
            dtype=tf.float32,
            sequence_length=length,
            scope="RNN_backword")

    # Re-align backward outputs with the original token order.
    bw_out = tf.reverse_sequence(bw_out_reversed, length_64, seq_dim=1)

    features = tf.concat([fw_out, bw_out], 2)
    features = tf.reshape(features, [-1, self.numHidden * 2])
    if trainMode:
        features = tf.nn.dropout(features, 0.5)

    flat_scores = tf.matmul(features, self.W) + self.b
    unary_scores = tf.reshape(
        flat_scores, [-1, FLAGS.max_sentence_len, self.distinctTagNum])
    return unary_scores, length
def cudnn_bi_gru(units,
                 n_hidden,
                 seq_lengths=None,
                 n_layers=1,
                 trainable_initial_states=False,
                 name='cudnn_bi_gru',
                 reuse=False):
    """Fast CuDNN Bi-GRU implementation.

    Args:
        units: tf.Tensor with dimensions [B x T x F], where
            B - batch size
            T - number of tokens
            F - features
        n_hidden: dimensionality of hidden state
        seq_lengths: number of tokens in each sample in the batch; when None,
            every sample is treated as having the full length T
        n_layers: number of layers
        trainable_initial_states: whether to create a special trainable
            variable to initialize the hidden states of the network or use
            just zeros
        name: name of the variable scope to use
        reuse: whether to reuse already initialized variables

    Returns:
        A pair of tuples ``(h_fw, h_bw), (h_last_fw, h_last_bw)``:
        the all-step outputs and the last-state outputs of
        ``cudnn_gru_wrapper`` for the forward and backward directions. The
        backward all-step output is reversed back to the input token order.
    """
    with tf.variable_scope(name, reuse=reuse):
        if seq_lengths is None:
            # Default: every sequence spans the whole time axis.
            seq_lengths = tf.ones([tf.shape(units)[0]], dtype=tf.int32) * tf.shape(units)[1]

        # Arguments shared by both directions.
        gru_kwargs = dict(n_layers=n_layers,
                          trainable_initial_states=trainable_initial_states,
                          seq_lengths=seq_lengths,
                          reuse=reuse)

        with tf.variable_scope('Forward'):
            h_fw, h_last_fw = cudnn_gru_wrapper(units, n_hidden, **gru_kwargs)

        with tf.variable_scope('Backward'):
            units_rev = tf.reverse_sequence(
                units, seq_lengths=seq_lengths, seq_dim=1, batch_dim=0)
            h_bw, h_last_bw = cudnn_gru_wrapper(units_rev, n_hidden, **gru_kwargs)
            h_bw = tf.reverse_sequence(
                h_bw, seq_lengths=seq_lengths, seq_dim=1, batch_dim=0)

    return (h_fw, h_bw), (h_last_fw, h_last_bw)
def testShapeFunctionEdgeCases(self):
    """Check that shape inference rejects malformed reverse_sequence calls."""
    # Batch size mismatched between input and seq_lengths.
    # seq_dim is kept in range (1) so that the ValueError can only come from
    # the 32-vs-33 batch mismatch, not from an out-of-bounds seq_dim.
    with self.assertRaises(ValueError):
        tf.reverse_sequence(
            tf.placeholder(tf.float32, shape=(32, 2, 3)),
            seq_lengths=tf.placeholder(tf.int64, shape=(33,)),
            seq_dim=1)

    # seq_dim out of bounds.
    with self.assertRaisesRegexp(ValueError, "seq_dim must be < input.dims()"):
        tf.reverse_sequence(
            tf.placeholder(tf.float32, shape=(32, 2, 3)),
            seq_lengths=tf.placeholder(tf.int64, shape=(32,)),
            seq_dim=3)
def lstm_seq2seq_internal(inputs, targets, hparams, train):
    """The basic LSTM seq2seq model, main step used for training.

    Args:
        inputs: Encoder input embeddings, or None to run the decoder alone.
        targets: Decoder target embeddings.
        hparams: Hyperparameters forwarded to ``lstm``.
        train: Training flag forwarded to ``lstm``.

    Returns:
        The decoder outputs with an extra axis inserted at position 2.
    """
    with tf.variable_scope("lstm_seq2seq"):
        final_encoder_state = None
        if inputs is not None:
            inputs_length = common_layers.length_from_embedding(inputs)
            # Flatten inputs.
            encoder_inputs = common_layers.flatten4d3d(inputs)
            # LSTM encoder reads the source in reversed order.
            encoder_inputs = tf.reverse_sequence(
                encoder_inputs, inputs_length, seq_axis=1)
            _, final_encoder_state = lstm(
                encoder_inputs, inputs_length, hparams, train, "encoder")

        # LSTM decoder.
        shifted_targets = common_layers.shift_right(targets)
        # Add 1 to account for the padding added to the left from shift_right
        targets_length = common_layers.length_from_embedding(shifted_targets) + 1
        decoder_inputs = common_layers.flatten4d3d(shifted_targets)
        decoder_outputs, _ = lstm(
            decoder_inputs,
            targets_length,
            hparams,
            train,
            "decoder",
            initial_state=final_encoder_state)
        return tf.expand_dims(decoder_outputs, axis=2)
def testFloatReverseSequenceGrad(self):
    """Check the numeric gradient of reverse_sequence on a float input."""
    # np.float64 is used explicitly: the `np.float` alias (which meant the
    # same dtype) has been removed from current NumPy releases.
    x = np.asarray([
        [[1, 2, 3, 4], [5, 6, 7, 8]],
        [[9, 10, 11, 12], [13, 14, 15, 16]],
        [[17, 18, 19, 20], [21, 22, 23, 24]]], dtype=np.float64)
    x = x.reshape(3, 2, 4, 1, 1)
    x = x.transpose([2, 1, 0, 3, 4])  # transpose axes 0 <=> 2

    # reverse dim 0 up to (0:3, none, 0:4) along dim=2
    seq_dim = 0
    batch_dim = 2
    seq_lengths = np.asarray([3, 0, 4], dtype=np.int64)

    with self.test_session():
        input_t = tf.constant(x, shape=x.shape)
        seq_lengths_t = tf.constant(seq_lengths, shape=seq_lengths.shape)
        reverse_sequence_out = tf.reverse_sequence(input_t,
                                                   batch_dim=batch_dim,
                                                   seq_dim=seq_dim,
                                                   seq_lengths=seq_lengths_t)
        err = tf.test.compute_gradient_error(input_t,
                                             x.shape,
                                             reverse_sequence_out,
                                             x.shape,
                                             x_init_value=x)
    print("ReverseSequence gradient error = %g" % err)
    self.assertLess(err, 1e-8)
def step(self, time_, inputs, state, name=None):
    """Run one decoding step.

    Args:
        time_: Current time step, forwarded to the helper.
        inputs: Inputs for the wrapped cell at this step.
        state: Cell state from the previous step.
        name: Not read by this method.

    Returns:
        A tuple ``(outputs, next_state, next_inputs, finished)`` where
        ``outputs`` is an ``AttentionDecoderOutput``.
    """
    raw_cell_output, cell_state = self.cell(inputs, state)
    (projected_output, logits,
     attention_scores, attention_context) = self.compute_output(raw_cell_output)

    # Optionally flip the attention scores along axis 1 per example.
    if self.reverse_scores_lengths is not None:
        attention_scores = tf.reverse_sequence(
            input=attention_scores,
            seq_lengths=self.reverse_scores_lengths,
            seq_dim=1,
            batch_dim=0)

    sample_ids = self.helper.sample(
        time=time_, outputs=logits, state=cell_state)

    step_output = AttentionDecoderOutput(
        logits=logits,
        predicted_ids=sample_ids,
        cell_output=projected_output,
        attention_scores=attention_scores,
        attention_context=attention_context)

    finished, next_inputs, next_state = self.helper.next_inputs(
        time=time_,
        outputs=step_output,
        state=cell_state,
        sample_ids=sample_ids)

    return (step_output, next_state, next_inputs, finished)
def __init__(self,
             embedding=None,
             hidden_state_d=100,
             max_length=80,
             learning_rate=0.001,
             dropout_rate=0.5,
             vocab_size=400001,
             embedding_d=300,
             num_classes=2):
    """Build the two-direction LSTM classifier graph and start a session.

    Args:
        embedding: Passed to ``self.init_embedding``.
        hidden_state_d: Number of units in each LSTM cell.
        max_length: Width of the ``data`` placeholder; lengths above it are
            clamped for the sequence reversal.
        learning_rate: Learning rate for the Adam optimizer.
        dropout_rate: Passed to ``tf.nn.dropout`` as ``keep_prob``.
        vocab_size: Passed to ``self.init_embedding``.
        embedding_d: Passed to ``self.init_embedding``.
        num_classes: Width of the output projection.
    """
    # --- placeholders -----------------------------------------------------
    self.data = tf.placeholder(dtype=tf.int32, shape=[None, max_length])
    self.len = tf.placeholder(dtype=tf.int32, shape=[None])
    self.label = tf.placeholder(dtype=tf.float32, shape=[None])

    # Two-column target: column 0 is `label`, column 1 is `1 - label`.
    self.neg_label = 1 - self.label
    stacked_labels = tf.concat(0, [self.label, self.neg_label])
    self.co_label = tf.transpose(tf.reshape(stacked_labels, [2, -1]))

    self.init_embedding(embedding, vocab_size, embedding_d)

    # filter len to maxlength
    self.maxlen = tf.cast(tf.fill([tf.shape(self.len)[0]], max_length), tf.int64)
    self.filter = tf.less_equal(tf.cast(self.len, tf.int64), self.maxlen)
    self.clean_len = tf.select(self.filter, tf.cast(self.len, tf.int64), self.maxlen)

    # --- embedded inputs, plain and reversed ------------------------------
    self.vec_data = tf.nn.embedding_lookup(self.embedding, self.data)
    self.reversed_vec_data = tf.reverse_sequence(
        self.vec_data, seq_dim=1, seq_lengths=self.clean_len)

    # --- recurrent encoders -----------------------------------------------
    with tf.variable_scope('left2right'):
        fw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_state_d, state_is_tuple=True)
        self.output, self.state = tf.nn.dynamic_rnn(
            fw_cell,
            self.vec_data,
            dtype=tf.float32,
            sequence_length=self.len,
        )

    with tf.variable_scope('right2left'):
        bw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_state_d, state_is_tuple=True)
        self.reversed_output, self.reversed_state = tf.nn.dynamic_rnn(
            bw_cell,
            self.reversed_vec_data,
            dtype=tf.float32,
            sequence_length=self.len,
        )

    # Last relevant output of each direction, joined on the feature axis.
    self.last = BiLSTM.last_relevant(self.output, self.len)
    self.reversed_last = BiLSTM.last_relevant(self.reversed_output, self.len)
    self.final_output = tf.concat(1, [self.last, self.reversed_last])
    self.dropout_last = tf.nn.dropout(self.final_output, keep_prob=dropout_rate)

    # --- output layer -----------------------------------------------------
    self.weight = tf.Variable(
        tf.truncated_normal([hidden_state_d * 2, num_classes], stddev=0.1))
    self.bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))

    # Prediction uses the features without dropout; the cost uses dropout.
    eval_logits = tf.matmul(self.final_output, self.weight) + self.bias
    self.prediction = tf.nn.softmax(eval_logits)
    train_logits = tf.matmul(self.dropout_last, self.weight) + self.bias
    self.cost = tf.nn.softmax_cross_entropy_with_logits(train_logits, self.co_label)

    self.train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
    # Created after the optimizer so its variables are initialised as well.
    self.init_op = tf.initialize_all_variables()

    # --- accuracy ---------------------------------------------------------
    self.prediction_a = tf.argmax(self.prediction, dimension=1)
    self.prediction_b = tf.argmax(self.co_label, dimension=1)
    num_correct = tf.reduce_sum(
        tf.cast(tf.equal(self.prediction_a, self.prediction_b), dtype=tf.int32))
    self.score = num_correct / tf.size(self.label)

    self.sess = tf.Session()
    self.sess.run(self.init_op)
def _bidirectional_rnn(self, data, length):
    """Run a forward and a backward RNN over ``data`` and concatenate them.

    Args:
        data: Input tensor with the sequence on axis 1.
        length: Per-example sequence lengths, used both to mask the RNNs and
            to reverse the sequences for the backward direction.

    Returns:
        Forward and backward outputs concatenated on axis 2.
    """
    length_64 = tf.cast(length, tf.int64)
    forward, _ = tf.nn.dynamic_rnn(
        cell=self.params.rnn_cell(self.params.rnn_hidden),
        inputs=data,
        dtype=tf.float32,
        sequence_length=length,
        scope='rnn-forward')
    # The backward pass must be masked with the same `length` that drives
    # the reversal; using a different source (previously `self.length`)
    # could desynchronise the two.
    backward, _ = tf.nn.dynamic_rnn(
        cell=self.params.rnn_cell(self.params.rnn_hidden),
        inputs=tf.reverse_sequence(data, length_64, seq_dim=1),
        dtype=tf.float32,
        sequence_length=length,
        scope='rnn-backward')
    # Restore original time order before joining with the forward outputs.
    backward = tf.reverse_sequence(backward, length_64, seq_dim=1)
    output = tf.concat(2, [forward, backward])
    return output
def __init__(self, input_, length_, hidden_state_d, name, cell=None,
             input_keep_rate=1.0, output_keep_rate=1.0, init_state=None):
    """Build a forward and a backward RNN over ``input_``.

    :param input_: input tensor with the sequence on axis 1 (documented by
        the original author as shape=[None, lstm_step, input_d], float32)
    :param length_: per-example lengths (documented by the original author
        as shape=[None], int32)
    :param hidden_state_d: number of units for the default LSTM cells
    :param name: variable scope name that wraps the whole model
    :param cell: None or an empty sequence to create two fresh
        BasicLSTMCells; a sequence with one cell to use that cell for both
        directions; a sequence with two cells ``(forward, backward)``
    :param input_keep_rate: keep probability for dropout on the RNN inputs
    :param output_keep_rate: keep probability for dropout on the last
        relevant outputs
    :param init_state: falsy for no initial state; a sequence with one state
        shared by both directions; or ``(forward_state, backward_state)``
    """
    with tf.variable_scope(name):
        self.input = input_
        self.length = length_
        self.reverse_input = tf.reverse_sequence(
            self.input, seq_dim=1, seq_lengths=tf.cast(self.length, tf.int64))

        # Handle the default (None) before calling len(): len(None) raises
        # TypeError, which made the default-cell branch unreachable.
        if cell is None or len(cell) == 0:
            cell_f = tf.nn.rnn_cell.BasicLSTMCell(hidden_state_d, state_is_tuple=True)
            cell_r = tf.nn.rnn_cell.BasicLSTMCell(hidden_state_d, state_is_tuple=True)
        elif len(cell) > 1:
            cell_f, cell_r = cell
        else:
            cell_f = cell[0]
            cell_r = cell[0]

        if not init_state:
            init_state_f = None
            init_state_b = None
        elif len(init_state) > 1:
            init_state_f = init_state[0]
            init_state_b = init_state[1]
        else:
            init_state_f = init_state[0]
            init_state_b = init_state[0]

        with tf.variable_scope('forward'):
            self.output, self.last_state = tf.nn.dynamic_rnn(
                cell_f,
                tf.nn.dropout(self.input, input_keep_rate),
                dtype=tf.float32,
                sequence_length=self.length,
                initial_state=init_state_f
            )
            self.last = tf.nn.dropout(
                BasicSeqModel.last_relevant(self.output, self.length),
                output_keep_rate)

        with tf.variable_scope('backward'):
            self.reverse_output, self.reverse_last_state = tf.nn.dynamic_rnn(
                cell_r,
                tf.nn.dropout(self.reverse_input, input_keep_rate),
                dtype=tf.float32,
                sequence_length=self.length,
                initial_state=init_state_b
            )
            self.reverse_last = tf.nn.dropout(
                BasicSeqModel.last_relevant(self.reverse_output, self.length),
                output_keep_rate)
def testShapeFunctionEdgeCases(self):
    """Check that reverse_sequence rejects malformed arguments."""
    # Batch size mismatched between input and seq_lengths.
    # seq_dim is kept in range (1) so that the ValueError can only come from
    # the 32-vs-33 batch mismatch, not from an out-of-bounds seq_dim.
    with self.assertRaises(ValueError):
        tf.reverse_sequence(
            tf.placeholder(tf.float32, shape=(32, 2, 3)),
            seq_lengths=tf.placeholder(tf.int64, shape=(33,)),
            seq_dim=1)

    # seq_dim out of bounds.
    with self.assertRaisesRegexp(ValueError, "seq_dim must be < input.dims()"):
        tf.reverse_sequence(
            tf.placeholder(tf.float32, shape=(32, 2, 3)),
            seq_lengths=tf.placeholder(tf.int64, shape=(32,)),
            seq_dim=3)

    # batch_dim out of bounds.
    with self.assertRaisesRegexp(
        ValueError, "batch_dim must be < input.dims()"):
        tf.reverse_sequence(
            tf.placeholder(tf.float32, shape=(32, 2, 3)),
            seq_lengths=tf.placeholder(tf.int64, shape=(32,)),
            seq_dim=0,
            batch_dim=3)

    # seq_dim equal to batch_dim is only detected when the op runs.
    with self.test_session():
        inputs = tf.placeholder(tf.float32, shape=(32, 2, 3))
        seq_lengths = tf.placeholder(tf.int64, shape=(32,))
        output = tf.reverse_sequence(
            inputs, seq_lengths=seq_lengths, seq_dim=0)  # batch_dim default is 0
        with self.assertRaisesOpError("batch_dim == seq_dim"):
            output.eval(feed_dict={inputs: np.random.rand(32, 2, 3),
                                   seq_lengths: xrange(32)})
def bidirectional_rnn(self, cell, inputs, lengths, scope=None):
    """Run ``cell`` forward and backward over time-major inputs and sum them.

    Args:
        cell: RNN cell used for both directions (each direction gets its own
            variable scope).
        inputs: Time-major input tensor (sequence on axis 0, batch on axis 1).
        lengths: Per-example sequence lengths.
        scope: Optional variable scope or scope name used as the prefix for
            the "_FW" / "_BW" scopes. Defaults to "BiRNN".

    Returns:
        A tuple ``(outputs, output_state)`` holding ``output_fw + output_bw``
        and ``output_state_fw + output_state_bw``.
    """
    # Accept None (the default), a plain string, or a scope object; the
    # previous `scope.name` access raised AttributeError for scope=None.
    name = getattr(scope, "name", scope) or "BiRNN"

    # Forward direction
    with vs.variable_scope(name + "_FW") as fw_scope:
        output_fw, output_state_fw = rnn.dynamic_rnn(
            cell, inputs, time_major=True, dtype=dtypes.float32,
            sequence_length=lengths, scope=fw_scope)

    # Backward direction
    inputs_bw = tf.reverse_sequence(inputs, tf.to_int64(lengths), seq_dim=0, batch_dim=1)
    with vs.variable_scope(name + "_BW") as bw_scope:
        output_bw, output_state_bw = rnn.dynamic_rnn(
            cell, inputs_bw, time_major=True, dtype=dtypes.float32,
            sequence_length=lengths, scope=bw_scope)
    # Flip the backward outputs back to the original time order.
    output_bw = tf.reverse_sequence(output_bw, tf.to_int64(lengths), seq_dim=0, batch_dim=1)

    outputs = output_fw + output_bw
    # NOTE(review): `+` sums tensors but concatenates Python tuples; confirm
    # the cell's state is a single tensor rather than a state tuple.
    output_state = output_state_fw + output_state_bw
    return (outputs, output_state)
def encode(self, features, labels):
    """Left-pad the source ids, embed them and run the encoder.

    ``features["source_ids"]`` is replaced in place by its re-ordered
    version.

    Args:
        features: Dict with "source_ids" and "source_len" entries.
        labels: Not read by this method.

    Returns:
        The result of calling the encoder on the embedded source and
        ``features["source_len"]``.
    """
    source_len = features["source_len"]

    # Step 1: reverse only the valid prefix of every row, e.g. with lengths
    # [4,2]:
    #   [[1,2,3,4,PAD,PAD,PAD],[2,3,PAD,PAD,PAD,PAD,PAD]]
    #   --> [[4,3,2,1,PAD,PAD,PAD],[3,2,PAD,PAD,PAD,PAD,PAD]]
    prefix_reversed = tf.reverse_sequence(
        features["source_ids"], source_len, batch_dim=0, seq_dim=1)
    # Step 2: reverse the whole row, which moves the padding to the front:
    #   --> [[PAD,PAD,PAD,1,2,3,4],[PAD,PAD,PAD,PAD,PAD,2,3]]
    features["source_ids"] = tf.reverse(prefix_reversed, [1])

    source_embedded = tf.nn.embedding_lookup(
        self.source_embedding_fairseq(), features["source_ids"])
    encoder_fn = self.encoder_class(
        self.params["encoder.params"],
        self.mode,
        self.source_pos_embedding_fairseq())
    return encoder_fn(source_embedded, source_len)
def reverse(self):
    """Return ``self.image`` with axis 1 fully reversed for every row.

    Each of the ``self.height`` entries along axis 0 is reversed over its
    first ``self.width`` elements along axis 1.

    Returns:
        The evaluated result of the reversal.
    """
    image_var = tf.Variable(self.image, name='x')
    init_op = tf.initialize_all_variables()

    with tf.Session() as session:
        # One length per row; using the full width reverses the entire row.
        row_lengths = [self.width] * self.height
        flipped = tf.reverse_sequence(image_var, row_lengths, 1, batch_dim=0)
        session.run(init_op)
        return session.run(flipped)
def bw_dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
                   dtype=None, parallel_iterations=None, swap_memory=False,
                   time_major=False, scope=None):
    """Run ``cell`` backwards in time over ``inputs``.

    The inputs are flattened to rank 3 (keeping the last two axes), reversed
    along axis 1, fed through ``_dynamic_rnn``, and the outputs are reversed
    back and restored to the shape prefix of ``inputs``.

    Args:
        cell: RNN cell.
        inputs: Batch-major inputs; leading axes beyond the last two are
            flattened into the batch.
        sequence_length: Optional lengths matching the leading axes of
            ``inputs``. When None the whole axis 1 is reversed.
        initial_state, dtype, parallel_iterations, swap_memory, scope:
            Forwarded to ``_dynamic_rnn``.
        time_major: Must be False (time-major is not implemented).

    Returns:
        A tuple ``(outputs, final_state)``.
    """
    assert not time_major  # TODO : to be implemented later!

    flat_inputs = flatten(inputs, 2)  # [-1, J, d]
    flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')

    # Reverse with the flattened int64 lengths so they line up with
    # flat_inputs' batch axis (the raw sequence_length may have extra axes).
    if sequence_length is None:
        flat_inputs = tf.reverse(flat_inputs, 1)
    else:
        flat_inputs = tf.reverse_sequence(flat_inputs, flat_len, 1)

    flat_outputs, final_state = _dynamic_rnn(
        cell, flat_inputs, sequence_length=flat_len,
        initial_state=initial_state, dtype=dtype,
        parallel_iterations=parallel_iterations, swap_memory=swap_memory,
        time_major=time_major, scope=scope)

    # Undo the reversal so outputs follow the original time order.
    if sequence_length is None:
        flat_outputs = tf.reverse(flat_outputs, 1)
    else:
        flat_outputs = tf.reverse_sequence(flat_outputs, flat_len, 1)

    outputs = reconstruct(flat_outputs, inputs, 2)
    return outputs, final_state
def create_graph(g):
    """Populate ``g`` with placeholders, variables and ops of the model.

    Args:
        g: Dict that receives every created tensor under string keys
            ('x', 'y', 's', 'seq_lengths', 'w', 'b', 'logits', 'loss', ...).
            It is modified in place; nothing is returned.
    """
    initer = tf.random_uniform_initializer(0.0, INIT_SCALE)
    with tf.variable_scope("graph", reuse=None, initializer=initer):
        g['x'], g['y'], g['s'] = [], [], []
        g['seq_lengths'] = tf.placeholder(tf.int64, shape=[BATCH_SIZE])

        # One (input, target, weight) placeholder triple per unrolled step.
        for _ in range(UNROLLS):
            g['x'].append(tf.placeholder(tf.float32, shape=[BATCH_SIZE, INPUT_SIZE]))
            g['y'].append(tf.placeholder(tf.float32, shape=[BATCH_SIZE, INPUT_SIZE]))
            g['s'].append(tf.placeholder(tf.float32, shape=[BATCH_SIZE]))

        # All unrolled inputs are concatenated into one feature vector.
        num_inputs = INPUT_SIZE * UNROLLS
        g['w'] = tf.get_variable("softmax_w", [num_inputs, OUTPUT_SIZE])
        g['b'] = tf.get_variable("softmax_b", [OUTPUT_SIZE])

        g['cat_x'] = tf.concat(1, g['x'])
        g['logits'] = tf.nn.xw_plus_b(g['cat_x'], g['w'], g['b'])

        # After reversing each example's valid prefix, index 0 on the step
        # axis holds the entry at that example's last valid step.
        targets_by_step = tf.reshape(tf.concat(1, g['y']),
                                     [BATCH_SIZE, UNROLLS, OUTPUT_SIZE])
        targets_reversed = tf.reverse_sequence(
            targets_by_step, g['seq_lengths'], 1, 0)
        g['cat_y'] = tf.unpack(targets_reversed, axis=1)[0]

        g['loss'] = tf.nn.softmax_cross_entropy_with_logits(g['logits'], g['cat_y'])

        # Same selection trick for the per-step weights.
        weights_by_step = tf.transpose(
            tf.reshape(tf.concat(0, g['s']), [UNROLLS, BATCH_SIZE]))
        weights_reversed = tf.reverse_sequence(
            weights_by_step, g['seq_lengths'], 1, 0)
        g['r_s'] = tf.unpack(weights_reversed, axis=1)[0]

        g['train_loss'] = tf.mul(g['loss'], g['r_s'])

        g['preds'] = tf.nn.softmax(g['logits'])
        # Round probabilities to {0, 1} class indicators.
        g['class_preds'] = tf.floor(g['preds'] + 0.5)
        g['accy'] = tf.mul(g['class_preds'], g['cat_y'])
        g['w_accy'] = tf.mul(g['accy'], tf.reshape(g['r_s'], shape=[BATCH_SIZE, 1]))
def dynamic_bidirectional_rnn(cell, pre_inputs, sequence_length=None, initial_state=None,
                              dtype=None, parallel_iterations=None, swap_memory=False,
                              time_major=False, scope=None, feed_prev_out=False,
                              num_layers=1, reuse_layers=True):
    """Stack ``num_layers`` bidirectional RNN layers built from ``cell``.

    Each layer maps its input through ``cell.pre``, runs a forward and a
    backward ``dynamic_rnn``, and merges both directions with ``cell.post``;
    the merged output feeds the next layer.

    Args:
        cell: Cell providing ``pre`` and ``post`` in addition to the RNN
            cell interface.
        pre_inputs: Input to the first layer.
        sequence_length: Per-example lengths, also used to reverse the
            sequences for the backward direction.
        initial_state, dtype, parallel_iterations, swap_memory, time_major,
        feed_prev_out: Forwarded to ``dynamic_rnn``.
        scope: Optional root variable scope (defaults to "Bi-RNN").
        num_layers: Number of stacked layers.
        reuse_layers: If true, layers share variables (layers only get a
            name scope and the root scope is switched to reuse); otherwise
            each layer gets its own variable scope.

    Returns:
        A tuple ``(last_outputs, last_state_fw, last_state_bw, tensors)``
        where ``tensors`` maps 'in', 'out', 'fw_out', 'bw_out', 'fw_state'
        and 'bw_state' to the per-layer values stacked and transposed so the
        layer axis comes second.
    """
    # NOTE(review): bare expression whose result is discarded; it does not
    # validate `cell`. Presumably an assertion was intended -- confirm.
    isinstance(cell, BiRNNCell)

    # Arguments shared by the forward and backward dynamic_rnn calls.
    rnn_kwargs = dict(sequence_length=sequence_length,
                      initial_state=initial_state,
                      dtype=dtype,
                      parallel_iterations=parallel_iterations,
                      swap_memory=swap_memory,
                      time_major=time_major,
                      feed_prev_out=feed_prev_out)

    with vs.variable_scope(scope or "Bi-RNN") as root_scope:
        inputs_list = []
        outputs_list = []
        outputs_fw_list = []
        outputs_bw_list = []
        state_fw_list = []
        state_bw_list = []

        layer_input = pre_inputs
        for layer_idx in range(num_layers):
            scope_name = "layer_{}".format(layer_idx)
            if reuse_layers:
                layer_scope = name_scope(scope_name)
            else:
                layer_scope = vs.variable_scope(scope_name)

            with layer_scope:
                inputs = cell.pre(layer_input)
                outputs_fw, state_fw = dynamic_rnn(cell, inputs, scope='FW', **rnn_kwargs)

                inputs_rev = reverse_sequence(inputs, sequence_length, 1)
                outputs_bw_rev, state_bw = dynamic_rnn(cell, inputs_rev, scope='BW',
                                                       **rnn_kwargs)
                # Flip backward outputs back to the original time order.
                outputs_bw = reverse_sequence(outputs_bw_rev, sequence_length, 1)

                outputs = cell.post(outputs_fw, outputs_bw)
                layer_input = outputs

                inputs_list.append(inputs)
                outputs_list.append(outputs)
                outputs_fw_list.append(outputs_fw)
                outputs_bw_list.append(outputs_bw)
                state_fw_list.append(state_fw)
                state_bw_list.append(state_bw)

                # After the first layer created the variables, later layers
                # reuse them.
                if reuse_layers:
                    root_scope.reuse_variables()

        tensors = dict()
        tensors['in'] = transpose(pack(inputs_list), [1, 0, 2, 3])
        tensors['out'] = transpose(pack(outputs_list), [1, 0, 2, 3])
        tensors['fw_out'] = transpose(pack(outputs_fw_list), [1, 0, 2, 3])  # [N, L, M, d]
        tensors['bw_out'] = transpose(pack(outputs_bw_list), [1, 0, 2, 3])  # [N, L, M, d]
        tensors['fw_state'] = transpose(pack(state_fw_list), [1, 0, 2])  # [N, L, d]
        tensors['bw_state'] = transpose(pack(state_bw_list), [1, 0, 2])  # [N, L, d]
        return outputs_list[-1], state_fw_list[-1], state_bw_list[-1], tensors
def encode_sentences(self, text_emb, text_len, text_len_mask):
    """Encode sentences with a forward and a backward custom LSTM.

    Args:
        text_emb: Sentence embeddings, [num_sentences, max_sentence_length, emb].
        text_len: Length of each sentence.
        text_len_mask: Mask forwarded to ``self.flatten_emb_by_sentence``.

    Returns:
        The result of ``self.flatten_emb_by_sentence`` applied to the
        concatenated forward/backward LSTM outputs.
    """
    num_sentences = tf.shape(text_emb)[0]
    max_sentence_length = tf.shape(text_emb)[1]

    # Transpose before and after for efficiency.
    inputs = tf.transpose(text_emb, [1, 0, 2])  # [max_sentence_length, num_sentences, emb]

    with tf.variable_scope("fw_cell"):
        cell_fw = util.CustomLSTMCell(self.config["lstm_size"], num_sentences, self.dropout)
        fw_inputs = cell_fw.preprocess_input(inputs)
    with tf.variable_scope("bw_cell"):
        cell_bw = util.CustomLSTMCell(self.config["lstm_size"], num_sentences, self.dropout)
        bw_inputs = cell_bw.preprocess_input(inputs)
        bw_inputs = tf.reverse_sequence(bw_inputs,
                                        seq_lengths=text_len,
                                        seq_dim=0,
                                        batch_dim=1)

    # Tile each cell's initial state across all sentences.
    tile_shape = [num_sentences, 1]
    state_fw = tf.contrib.rnn.LSTMStateTuple(
        tf.tile(cell_fw.initial_state.c, tile_shape),
        tf.tile(cell_fw.initial_state.h, tile_shape))
    state_bw = tf.contrib.rnn.LSTMStateTuple(
        tf.tile(cell_bw.initial_state.c, tile_shape),
        tf.tile(cell_bw.initial_state.h, tile_shape))

    with tf.variable_scope("lstm"):
        with tf.variable_scope("fw_lstm"):
            fw_outputs, fw_states = tf.nn.dynamic_rnn(cell=cell_fw,
                                                      inputs=fw_inputs,
                                                      sequence_length=text_len,
                                                      initial_state=state_fw,
                                                      time_major=True)
        with tf.variable_scope("bw_lstm"):
            bw_outputs, bw_states = tf.nn.dynamic_rnn(cell=cell_bw,
                                                      inputs=bw_inputs,
                                                      sequence_length=text_len,
                                                      initial_state=state_bw,
                                                      time_major=True)

    # Bring the backward outputs back into reading order.
    bw_outputs = tf.reverse_sequence(bw_outputs,
                                     seq_lengths=text_len,
                                     seq_dim=0,
                                     batch_dim=1)

    text_outputs = tf.concat([fw_outputs, bw_outputs], 2)
    text_outputs = tf.transpose(text_outputs, [1, 0, 2])  # [num_sentences, max_sentence_length, emb]
    return self.flatten_emb_by_sentence(text_outputs, text_len_mask)
def flip_randomly(inputs, horizontally, vertically, is_training, name=None):
    """Flip images randomly.

    Make separate flipping decision for each image.

    Args:
        inputs (4-D tensor): Input images (batch size, height, width, channels).
        horizontally (bool): If True, flip horizontally with 50% probability.
            Otherwise, don't.
        vertically (bool): If True, flip vertically with 50% probability.
            Otherwise, don't.
        is_training (bool): If False, no flip is performed.
        name: A name for the operation.

    Returns:
        A tensor like ``inputs`` with the selected images flipped.
    """
    with tf.name_scope(name, "flip_randomly") as scope:
        batch_size, height, width, _ = tf.unstack(tf.shape(inputs))

        def _coin_flips(enabled):
            # Per-image 0/1 draw, forced to 0 when the flip direction is
            # disabled or when not training.
            draws = tf.random_uniform([batch_size], 0, 2, tf.int32)
            return draws * tf.to_int32(enabled) * tf.to_int32(is_training)

        vertical_choices = _coin_flips(vertically)
        horizontal_choices = _coin_flips(horizontally)

        # A reversal length of 0 leaves the image untouched; the full
        # height/width reverses the whole axis.
        flipped_rows = tf.reverse_sequence(inputs, vertical_choices * height, 1)
        flipped_both = tf.reverse_sequence(flipped_rows, horizontal_choices * width, 2)
        return tf.identity(flipped_both, name=scope)
def rnn_layer(rnn_input: tf.Tensor, lengths: tf.Tensor, rnn_spec: RNNSpec) -> Tuple[tf.Tensor, tf.Tensor]:
    """Construct a RNN layer given its inputs and specs.

    Arguments:
        rnn_input: The input sequence to the RNN.
        lengths: Lengths of input sequences.
        rnn_spec: A valid RNNSpec tuple specifying the network architecture.

    Returns:
        A tuple ``(outputs, final_state)``. For the bidirectional case both
        directions are concatenated; for LSTM cells only the ``h`` part of
        the state is returned.
    """
    is_lstm = rnn_spec.cell_type == "LSTM"

    if rnn_spec.direction == "bidirectional":
        fw_cell = _make_rnn_cell(rnn_spec)
        bw_cell = _make_rnn_cell(rnn_spec)

        outputs_tup, states_tup = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, rnn_input, sequence_length=lengths,
            dtype=tf.float32)
        outputs = tf.concat(outputs_tup, 2)

        # LSTM states are (c, h) tuples; keep only the hidden part.
        direction_states = ([state.h for state in states_tup]
                            if is_lstm else list(states_tup))
        final_state = tf.concat(direction_states, 1)
        return outputs, final_state

    is_backward = rnn_spec.direction == "backward"
    if is_backward:
        rnn_input = tf.reverse_sequence(rnn_input, lengths, seq_axis=1)

    cell = _make_rnn_cell(rnn_spec)
    outputs, final_state = tf.nn.dynamic_rnn(
        cell, rnn_input, sequence_length=lengths, dtype=tf.float32)

    if is_backward:
        # Restore the original time order of the outputs.
        outputs = tf.reverse_sequence(outputs, lengths, seq_axis=1)
    if is_lstm:
        final_state = final_state.h

    return outputs, final_state
def body(self, features):
    """Encode ``features["inputs"]`` with the LSTM encoder.

    Args:
        features: Dict of feature tensors; only "inputs" is read.

    Returns:
        The encoder outputs with an extra axis inserted at position 2.

    Raises:
        ValueError: If the configured initializer is "orthogonal".
    """
    hparams = self._hparams
    if hparams.initializer == "orthogonal":
        raise ValueError("LSTM models fail with orthogonal initializer.")

    train = hparams.mode == tf.estimator.ModeKeys.TRAIN

    raw_inputs = features.get("inputs")
    inputs_length = common_layers.length_from_embedding(raw_inputs)
    # Flatten inputs.
    encoder_inputs = common_layers.flatten4d3d(raw_inputs)
    # LSTM encoder reads the source in reversed order.
    encoder_inputs = tf.reverse_sequence(encoder_inputs, inputs_length, seq_axis=1)
    encoder_output, _ = lstm(encoder_inputs, inputs_length, hparams, train, "encoder")
    return tf.expand_dims(encoder_output, axis=2)
def _testReverseSequence(self, x, seq_dim, seq_lengths, truth, use_gpu=False,
                         expected_err_re=None):
    """Run reverse_sequence and check its result or its expected failure.

    Args:
        x: Input value.
        seq_dim: Axis to reverse along.
        seq_lengths: Per-example lengths.
        truth: Expected output (only checked when no error is expected).
        use_gpu: Whether the test session may use a GPU.
        expected_err_re: If given, a regex the raised op error must match.
    """
    with self.test_session(use_gpu=use_gpu):
        ans = tf.reverse_sequence(x, seq_dim=seq_dim, seq_lengths=seq_lengths)

        if expected_err_re is not None:
            # Failure case: evaluating the op must raise a matching error.
            with self.assertRaisesOpError(expected_err_re):
                ans.eval()
            return

        tf_ans = ans.eval()
        self.assertAllClose(tf_ans, truth, atol=1e-10)
        self.assertShapeEqual(truth, ans)
def _comp_f(self):
    """
    Encodes all queries (including supporting queries).

    The context is embedded once, then composed twice with
    ``self._composition_function``: once on the length-aware reversal of the
    sequence (variable scope "backward", placed on ``self._device1``) and once
    on the original sequence (variable scope "forward", placed on
    ``self._device2``). For every span, one backward and one forward state is
    gathered; both are concatenated, passed through a linear layer and summed
    with the two gathered states.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source. In particular, whether the final
    ``fully_connected`` layer sits inside the "forward" variable scope
    affects its variable names -- confirm against the original file.

    :return: encoded queries
    """
    with tf.device("/cpu:0"):
        # Crop the (transposed, i.e. time-first) context to the longest
        # sequence in the batch before embedding it.
        max_length = tf.cast(tf.reduce_max(self._length), tf.int32)
        context_t = tf.transpose(self._context)
        context_t = tf.slice(context_t, [0, 0], tf.pack([max_length, -1]))
        embedded = tf.nn.embedding_lookup(self.input_embedding, context_t)
        embedded = tf.nn.dropout(embedded, self.keep_prob)
        batch_size = tf.shape(self._context)[0]
        # 1-element vector form (used as tf.tile multiples) and int64 form
        # (used in the gather index arithmetic below).
        batch_size_32 = tf.reshape(batch_size, [1])
        batch_size_64 = tf.cast(batch_size, tf.int64)

    with tf.device(self._device1):  #use other device for backward rnn
        with tf.variable_scope("backward"):
            min_end = tf.segment_min(self._ends, self._span_context)
            # Learned initial state, replicated once per batch element.
            init_state = tf.get_variable("init_state", [self._size], initializer=self._init)
            init_state = tf.reshape(tf.tile(init_state, batch_size_32), [-1, self._size])
            # Reverse along the time axis (seq_dim=0, batch_dim=1).
            rev_embedded = tf.reverse_sequence(embedded, self._length, 0, 1)
            # TIME-MAJOR: [T, B, S]
            outs_bw = self._composition_function(rev_embedded, self._length - min_end, init_state)
            # reshape to all possible queries for all sequences. Dim[0]=batch_size*(max_length+1).
            # "+1" because we include the initial state
            outs_bw = tf.reshape(tf.concat(0, [tf.expand_dims(init_state, 0), outs_bw]), [-1, self._size])
            # gather respective queries via their lengths-start (because reversed sequence)
            lengths_aligned = tf.gather(self._length, self._span_context)
            out_bw = tf.gather(outs_bw, (lengths_aligned - self._ends) * batch_size_64 + self._span_context)

    with tf.device(self._device2):
        with tf.variable_scope("forward"):
            #e_inputs = [tf.reshape(e, [-1, self._size]) for e in tf.split(1, self._max_length, embedded)]
            max_start = tf.segment_max(self._starts, self._span_context)
            # Separate learned initial state for the forward direction.
            init_state = tf.get_variable("init_state", [self._size], initializer=self._init)
            init_state = tf.reshape(tf.tile(init_state, batch_size_32), [-1, self._size])
            # TIME-MAJOR: [T, B, S]
            outs_fw = self._composition_function(embedded, max_start, init_state)
            # reshape to all possible queries for all sequences. Dim[0]=batch_size*(max_length+1).
            # "+1" because we include the initial state
            outs_fw = tf.reshape(tf.concat(0, [tf.expand_dims(init_state, 0), outs_fw]), [-1, self._size])
            # gather respective queries via their positions (with offset of batch_size*ends)
            out_fw = tf.gather(outs_fw, self._starts * batch_size_64 + self._span_context)
        # form query from forward and backward compositions
        query = tf.contrib.layers.fully_connected(tf.concat(1, [out_fw, out_bw]), self._size,
                                                  activation_fn=None, weights_initializer=None,
                                                  biases_initializer=None)
        # Residual-style sum of the projection with both directional states.
        query = tf.add_n([query, out_bw, out_fw])

    return query
def create_model(self, model_input, vocab_size, num_frames, is_training=True,
                 **unused_params):
    """Creates a model which uses a stack of LSTMs to represent the video.

    Behaviour is controlled by these flags: ``lstm_cells`` (units per layer),
    ``lstm_layers`` (stack depth), ``lstm_random_sequence`` together with
    ``iterations`` (random frame sampling before the RNN), ``lstm_backward``
    (feed the frames in reverse order) and ``video_level_classifier_model``
    (classifier applied to the final RNN state).

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).
      is_training: Forwarded unchanged to the video-level classifier.
      **unused_params: Forwarded unchanged to the video-level classifier.

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers
    random_frames = FLAGS.lstm_random_sequence
    iterations = FLAGS.iterations
    backward = FLAGS.lstm_backward

    if random_frames:
        num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        model_input = utils.SampleRandomFrames(model_input, num_frames_2,
                                               iterations)
        # NOTE(review): `num_frames` is still the pre-sampling frame count
        # when it is used as a sequence length below. Presumably the sampled
        # input has `iterations` steps -- confirm the lengths remain valid.
    if backward:
        model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1)

    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [
            tf.contrib.rnn.BasicLSTMCell(
                lstm_size, forget_bias=1.0, state_is_tuple=False)
            for _ in range(number_of_layers)
        ],
        state_is_tuple=False)

    with tf.variable_scope("RNN"):
        # Only the final state is classified; per-step outputs are unused.
        _, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
                                     sequence_length=num_frames,
                                     dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(
        model_input=state,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
def _create_position_embedding(self, lengths, maxlen):
    """Build per-sequence position embeddings with padding moved to the front.

    Rows ``2 .. maxlen + 1`` of ``self.pos_embed`` are replicated for every
    entry of ``lengths``, positions at or beyond each length are zeroed, and
    the valid entries are then shifted to the end of the time axis while
    keeping their original order.
    """
    table_rows = self.pos_embed[2:maxlen + 2, :]

    # One copy of the sliced table per batch element.
    num_sequences = tf.shape(lengths)[0]
    tiled = tf.tile([table_rows], [num_sequences, 1, 1])

    # Zero out everything past each sequence's length.
    valid = tf.sequence_mask(lengths=lengths, maxlen=maxlen, dtype=tf.float32)
    masked = tiled * tf.expand_dims(valid, 2)

    # Two-step right alignment, e.g. for a row of length 4 in 7 slots:
    #   [1,2,3,4,PAD,PAD,PAD]
    #   -> reverse the valid prefix:  [4,3,2,1,PAD,PAD,PAD]
    #   -> reverse the whole axis:    [PAD,PAD,PAD,1,2,3,4]
    prefix_reversed = tf.reverse_sequence(masked, lengths,
                                          batch_dim=0, seq_dim=1)
    return tf.reverse(prefix_reversed, [1])
def testScanSumEquivalenceWithSeqLen(self):
    """Passing ``sequence_lengths`` must equal scanning zero-padded inputs.

    The forward scan is compared with a scan over inputs whose entries past
    each length are zero. The reverse scan is compared with a scan over
    inputs whose valid prefix is reversed by hand, with the result reversed
    back through ``tf.reverse_sequence``.
    """

    def time_major(rows):
        # Data is written batch x time (BxT) because that is easier to read;
        # scan_discounted_sum assumes time x batch (TxB).
        return tf.transpose(tf.constant(rows, dtype=tf.float32))

    with self.test_session() as sess:
        sequence_lengths = [0, 2]
        bootstrap = tf.constant([0.5, 1.5], dtype=tf.float32)

        sequence_in = time_major([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
        decays_in = time_major([[.1, .2, .3, .4, .5], [.6, .7, .8, .9, .10]])

        # Same data with everything past the sequence length zeroed.
        eq_sequence_in = time_major([[0, 0, 0, 0, 0], [6, 7, 0, 0, 0]])
        eq_decays_in = time_major([[0, 0, 0, 0, 0], [.6, .7, 0, 0, 0]])

        # Zeroed data with the valid prefix reversed.
        eq_reverse_sequence_in = time_major(
            [[0, 0, 0, 0, 0], [7, 6, 0, 0, 0]])
        eq_reverse_decays_in = time_major(
            [[0, 0, 0, 0, 0], [.7, .6, 0, 0, 0]])

        eq_result = sequence_ops.scan_discounted_sum(
            sequence_in, decays_in, bootstrap,
            reverse=False, sequence_lengths=sequence_lengths)
        exp_eq_result = sequence_ops.scan_discounted_sum(
            eq_sequence_in, eq_decays_in, bootstrap)

        eq_reverse_result = sequence_ops.scan_discounted_sum(
            sequence_in, decays_in, bootstrap,
            reverse=True, sequence_lengths=sequence_lengths)
        exp_eq_reverse_result = tf.reverse_sequence(
            sequence_ops.scan_discounted_sum(
                eq_reverse_sequence_in, eq_reverse_decays_in, bootstrap),
            sequence_lengths, seq_axis=0, batch_axis=1)

        forward_actual = sess.run(eq_result)
        forward_expected = sess.run(exp_eq_result)
        self.assertAllClose(forward_actual, forward_expected)

        reverse_actual = sess.run(eq_reverse_result)
        reverse_expected = sess.run(exp_eq_reverse_result)
        self.assertAllClose(reverse_actual, reverse_expected)
def construct(self, args, source_chars, target_chars, bow, eow):
    """Build the lemmatizer graph: placeholders, decoders, loss and summaries.

    NOTE: this is an exercise skeleton. Several right-hand sides are still
    `# TODO` placeholders, so the function is not valid Python until they are
    filled in.

    Args:
        args: Options object; `args.recodex` and `args.logdir` are read here
            (the TODO comments also refer to `args.char_dim` and
            `args.rnn_dim`).
        source_chars: Only referenced by the encoder TODOs (size of the
            source embedding table).
        target_chars: Used as the logit size of the training decoder.
        bow: Only referenced by the decoder TODOs (begin-of-word character).
        eow: Id padded after every target sequence; the prediction decoder
            TODO stops on it.
    """
    with self.session.graph.as_default():
        if args.recodex:
            tf.get_variable_scope().set_initializer(tf.glorot_uniform_initializer(seed=42))

        # Inputs
        self.sentence_lens = tf.placeholder(tf.int32, [None], name="sentence_lens")
        self.source_ids = tf.placeholder(tf.int32, [None, None], name="source_ids")
        self.source_seqs = tf.placeholder(tf.int32, [None, None], name="source_seqs")
        self.source_seq_lens = tf.placeholder(tf.int32, [None], name="source_seq_lens")
        self.target_ids = tf.placeholder(tf.int32, [None, None], name="target_ids")
        self.target_seqs = tf.placeholder(tf.int32, [None, None], name="target_seqs")
        self.target_seq_lens = tf.placeholder(tf.int32, [None], name="target_seq_lens")

        # Append EOW after target_seqs
        # (reverse the valid prefix, pad one EOW at the front, then reverse the
        # prefix that is now one longer, so EOW follows the last valid char).
        target_seqs = tf.reverse_sequence(self.target_seqs, self.target_seq_lens, 1)
        target_seqs = tf.pad(target_seqs, [[0, 0], [1, 0]], constant_values=eow)
        target_seq_lens = self.target_seq_lens + 1
        target_seqs = tf.reverse_sequence(target_seqs, target_seq_lens, 1)

        # Encoder
        # TODO: Generate source embeddings for source chars, of shape [source_chars, args.char_dim].

        # TODO: Embed the self.source_seqs using the source embeddings.

        # TODO: Using a GRU with dimension args.rnn_dim, process the embedded self.source_seqs
        # using forward RNN and store the resulting states into `source_states`.

        # Index the unique words using self.source_ids and self.target_ids.
        sentence_mask = tf.sequence_mask(self.sentence_lens)
        source_states = tf.boolean_mask(tf.nn.embedding_lookup(source_states, self.source_ids), sentence_mask)
        source_lens = tf.boolean_mask(tf.nn.embedding_lookup(self.source_seq_lens, self.source_ids), sentence_mask)

        target_seqs = tf.boolean_mask(tf.nn.embedding_lookup(target_seqs, self.target_ids), sentence_mask)
        target_lens = tf.boolean_mask(tf.nn.embedding_lookup(target_seq_lens, self.target_ids), sentence_mask)

        # Decoder
        # TODO: Generate target embeddings for target chars, of shape [target_chars, args.char_dim].

        # TODO: Embed the target_seqs using the target embeddings.

        # TODO: Generate a decoder GRU with dimension args.rnn_dim.

        # TODO: Create a `decoder_layer` -- a fully connected layer with
        # target_chars neurons used in the decoder to classify into target characters.

        # The DecoderTraining will be used during training. It will output logits for each
        # target character.
        class DecoderTraining(tf.contrib.seq2seq.Decoder):
            @property
            def batch_size(self): return # TODO: Return size of the batch, using for example source_states size
            @property
            def output_dtype(self): return tf.float32 # Type for logits of target characters
            @property
            def output_size(self): return target_chars # Length of logits for every output

            def initialize(self, name=None):
                finished = # TODO: False if target_lens > 0, True otherwise
                states = # TODO: Initial decoder state to use
                inputs = # TODO: embedded BOW characters of shape [self.batch_size] using target embeddings.
                         # You can use tf.fill to generate BOWs of appropriate size.
                return finished, inputs, states

            def step(self, time, inputs, states, name=None):
                outputs, states = # TODO: Run the decoder GRU cell using inputs and states.
                outputs = # TODO: Apply the decoder_layer on outputs.
                next_input = # TODO: Next input are character embeddings with index `time` in target_embedded.
                finished = # TODO: False if target_lens > time + 1, True otherwise.
                return outputs, states, next_input, finished
        output_layer, _, _ = tf.contrib.seq2seq.dynamic_decode(DecoderTraining())
        self.predictions_training = tf.argmax(output_layer, axis=2, output_type=tf.int32)

        # The DecoderPrediction will be used during prediction. It will
        # directly output the predicted target characters.
        class DecoderPrediction(tf.contrib.seq2seq.Decoder):
            @property
            def batch_size(self): return # TODO: Return size of the batch, using for example source_states size
            @property
            def output_dtype(self): return tf.int32 # Type for predicted target characters
            @property
            def output_size(self): return 1 # Will return just one output

            def initialize(self, name=None):
                finished = # TODO: False of shape [self.batch_size].
                states = # TODO: Initial decoder state to use.
                inputs = # TODO: embedded BOW characters of shape [self.batch_size] using target embeddings.
                         # You can use tf.fill to generate BOWs of appropriate size.
                return finished, inputs, states

            def step(self, time, inputs, states, name=None):
                outputs, states = # TODO: Run the decoder GRU cell using inputs and states.
                outputs = # TODO: Apply the decoder_layer on outputs.
                outputs = # TODO: Use tf.argmax to choose most probable class (supply parameter `output_type=tf.int32`).
                next_input = # TODO: Embed `outputs` using target_embeddings
                finished = # TODO: True where outputs==eow, False otherwise
                           # Use tf.equal for the comparison, Python's '==' is not overloaded
                return outputs, states, next_input, finished
        # Decoding is capped at the longest source word plus 10 steps.
        self.predictions, _, self.prediction_lens = tf.contrib.seq2seq.dynamic_decode(
            DecoderPrediction(), maximum_iterations=tf.reduce_max(source_lens) + 10)

        # Training
        # Positions beyond each target length get zero weight in the loss.
        weights = tf.sequence_mask(target_lens, dtype=tf.float32)
        loss = tf.losses.sparse_softmax_cross_entropy(target_seqs, output_layer, weights=weights)
        global_step = tf.train.create_global_step()
        self.training = tf.train.AdamOptimizer().minimize(loss, global_step=global_step, name="training")

        # Summaries
        # A word counts as correct only if every unmasked position matches.
        accuracy_training = tf.reduce_all(tf.logical_or(
            tf.equal(self.predictions_training, target_seqs),
            tf.logical_not(tf.sequence_mask(target_lens))), axis=1)
        self.current_accuracy_training, self.update_accuracy_training = tf.metrics.mean(accuracy_training)

        # Predictions and targets may differ in width, so compare their common
        # prefix and additionally require equal lengths.
        minimum_length = tf.minimum(tf.shape(self.predictions)[1], tf.shape(target_seqs)[1])
        accuracy = tf.logical_and(
            tf.equal(self.prediction_lens, target_lens),
            tf.reduce_all(tf.logical_or(
                tf.equal(self.predictions[:, :minimum_length], target_seqs[:, :minimum_length]),
                tf.logical_not(tf.sequence_mask(target_lens, maxlen=minimum_length))), axis=1))
        self.current_accuracy, self.update_accuracy = tf.metrics.mean(accuracy)

        self.current_loss, self.update_loss = tf.metrics.mean(loss, weights=tf.reduce_sum(weights))
        self.reset_metrics = tf.variables_initializer(tf.get_collection(tf.GraphKeys.METRIC_VARIABLES))

        summary_writer = tf.contrib.summary.create_file_writer(args.logdir, flush_millis=10 * 1000)
        self.summaries = {}
        with summary_writer.as_default(), tf.contrib.summary.record_summaries_every_n_global_steps(10):
            self.summaries["train"] = [tf.contrib.summary.scalar("train/loss", self.update_loss),
                                       tf.contrib.summary.scalar("train/accuracy", self.update_accuracy_training)]
        with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
            for dataset in ["dev", "test"]:
                self.summaries[dataset] = [tf.contrib.summary.scalar(dataset + "/loss", self.current_loss),
                                           tf.contrib.summary.scalar(dataset + "/accuracy", self.current_accuracy)]

        # Initialize variables
        self.session.run(tf.global_variables_initializer())
        with summary_writer.as_default():
            tf.contrib.summary.initialize(session=self.session, graph=self.session.graph)
def build_model(self):
    """Build the sense-classification graph (encoder, attentive decoder, loss).

    ``arg1`` is encoded by a forward and a backward LSTM whose outputs are
    concatenated. ``arg2`` is decoded with an attention loop function over
    the encoder outputs. The length-normalised sums of decoder and encoder
    outputs are concatenated and projected to sense logits.

    Sets the placeholders ``sense``, ``arg1``, ``arg2``, ``arg1_len``,
    ``arg2_len``, ``keep_prob`` and ``mask``, and the tensors/ops ``output``,
    ``accuracy``, ``loss``, ``optimizer`` and ``train_op`` on ``self``.

    NOTE(review): the block nesting was reconstructed from a
    whitespace-collapsed source. The extent of the '/cpu:0' device block and
    of the 'decoder' variable scope should be confirmed against the original.
    """
    with tf.variable_scope('RNNTEST'):
        self.sense = tf.placeholder(tf.int32, [None])
        self.arg1 = tf.placeholder(tf.int32, [None, None])
        self.arg2 = tf.placeholder(tf.int32, [None, None])
        self.arg1_len = tf.placeholder(tf.int32, [None])
        self.arg2_len = tf.placeholder(tf.int32, [None])
        self.keep_prob = tf.placeholder(tf.float32)

        with tf.device('/cpu:0'):
            if self.use_pre_trained_embedding:
                word_W = tf.get_variable(
                    'word_embed',
                    initializer=tf.convert_to_tensor(
                        self.data_loader.pre_trained_word_embeddings,
                        dtype=tf.float32))
            else:
                word_W = tf.get_variable(
                    'word_embed',
                    shape=[self.data_loader.word_vocab_size,
                           self.word_embed_size])
            arg1 = tf.nn.dropout(
                tf.nn.embedding_lookup(word_W, self.arg1), self.keep_prob)
            arg2 = tf.nn.dropout(
                tf.nn.embedding_lookup(word_W, self.arg2), self.keep_prob)

        encoder_lstm_unit = rnn_cell.BasicLSTMCell(self.encoder_size)
        decoder_lstm_unit = rnn_cell.BasicLSTMCell(self.decoder_size)

        with tf.variable_scope('forward_encoder'):
            forward_encoder_outputs, forward_encoder_state = rnn.dynamic_rnn(
                encoder_lstm_unit, arg1, self.arg1_len, dtype=tf.float32)
        with tf.variable_scope('backward_encoder'):
            # The backward pass must receive the sequence lengths as well.
            # Without them the RNN keeps running over the padding, so padded
            # steps get non-zero outputs that leak into the encoder sum in
            # `judge`, and the final state is taken after the padding.
            backward_encoder_outputs, backward_encoder_state = rnn.dynamic_rnn(
                encoder_lstm_unit,
                tf.reverse_sequence(
                    arg1, tf.cast(self.arg1_len, tf.int64), 1),
                sequence_length=self.arg1_len,
                dtype=tf.float32)

        # Re-align the backward outputs with the forward time order.
        encoder_outputs = tf.concat(
            2,
            [forward_encoder_outputs,
             tf.reverse_sequence(backward_encoder_outputs,
                                 tf.cast(self.arg1_len, tf.int64), 1)])
        encoder_state = tf.concat(
            1, [forward_encoder_state, backward_encoder_state])

        # batch_size x source_len x 1 x source_depth(2*encoder_size)
        source = tf.expand_dims(encoder_outputs, 2)
        attention_W = tf.get_variable(
            'attention_W',
            [1, 1, 2 * self.encoder_size, self.attention_judge_size])
        attention_V = tf.get_variable(
            'attention_V', [self.attention_judge_size])
        # 1x1 convolution = per-position projection of the encoder outputs.
        # batch_size x source_len x 1 x attention
        WxH = tf.nn.conv2d(source, attention_W, [1, 1, 1, 1], 'SAME')
        self.mask = tf.placeholder(tf.float32, [None, None])

        def attention(input_t, output_t_minus_1, time):
            """Mix the decoder input with an attention read of the encoder."""
            with tf.variable_scope('attention'):
                # batch_size x 1 x 1 x attention
                VxS = tf.reshape(
                    rnn_cell.linear(output_t_minus_1,
                                    self.attention_judge_size, True),
                    [-1, 1, 1, self.attention_judge_size])
                # batch_size x source_len x 1
                _exp = tf.exp(tf.reduce_sum(
                    attention_V * tf.tanh(WxH + VxS), [3]))
                # Masked positions get zero weight before normalisation.
                _exp = _exp * tf.expand_dims(self.mask, -1)
                attention_weight = _exp / tf.reduce_sum(
                    _exp, [1], keep_dims=True)
                attention_t = tf.reduce_sum(
                    encoder_outputs * attention_weight, [1])
                feed_in_t = tf.tanh(rnn_cell.linear(
                    [attention_t, input_t], self.embedding_size, True))
            return feed_in_t

        with tf.variable_scope('decoder'):
            decoder_outputs, decoder_state = dynamic_rnn_decoder(
                arg2, decoder_lstm_unit,
                sequence_length=self.arg2_len,
                loop_function=attention)

        # Length-normalised sums (means over valid steps) of both sides.
        judge = tf.concat(
            1,
            [tf.reduce_sum(decoder_outputs, [1])
             / tf.expand_dims(tf.cast(self.arg2_len, tf.float32), -1),
             tf.reduce_sum(encoder_outputs, [1])
             / tf.expand_dims(tf.cast(self.arg1_len, tf.float32), -1)])
        unscaled_log_distribution = rnn_cell.linear(
            judge, self.data_loader.sense_vocab_size, True)

        self.output = tf.cast(
            tf.argmax(unscaled_log_distribution, 1), tf.int32)
        self.accuracy = tf.reduce_mean(
            tf.cast(tf.equal(self.output, self.sense), tf.float32))

        #max-margin method
        #self._MM = tf.placeholder(tf.int32,[None])
        #margin = tf.sub(tf.reduce_max(unscaled_log_distribution,[1]),tf.gather(tf.reshape(unscaled_log_distribution,[-1]),self._MM))
        #self.loss = tf.reduce_mean(margin)

        #maximum likelihood method
        self.loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                unscaled_log_distribution, self.sense))
        self.optimizer = tf.train.AdagradOptimizer(self.lr)
        self.train_op = self.optimizer.minimize(self.loss)
def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for source, source-candidate and
      target vocab and stores them in the "vocab_tables" graph collection
    - Truncates token sequences and lengths to the configured maximum
    - Converts tokens into vocabulary ids
    - Optionally reverses source and source-candidate ids
      (``source.reverse``)
    - Keeps a running count of processed tokens

    Args:
        features: dict with "source_tokens", "source_len",
            "source_candidate_tokens" and "source_candidate_len";
            modified in place.
        labels: dict with "target_tokens" and "target_len", or None.
            A shallow copy is modified and returned.

    Returns:
        ``(features, labels)``, or ``(features, None)`` when ``labels`` is
        None. In that case the function returns before the token counter and
        the "features" graph collection are set up.
    """
    # Create vocabulary lookup for source
    source_vocab_to_id, source_id_to_vocab, source_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

    source_candidate_vocab_to_id, source_candidate_id_to_vocab, source_candidate_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.source_candidate_vocab_info.path)

    # Create vocabulary look for target
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Add vocab tables to graph collection so that we can access them in
    # other places.
    graph_utils.add_dict_to_collection(
        {
            "source_vocab_to_id": source_vocab_to_id,
            "source_id_to_vocab": source_id_to_vocab,
            "source_word_to_count": source_word_to_count,
            "source_candidate_vocab_to_id": source_candidate_vocab_to_id,
            "source_candidate_id_to_vocab": source_candidate_id_to_vocab,
            "source_candidate_word_to_count": source_candidate_word_to_count,
            "target_vocab_to_id": target_vocab_to_id,
            "target_id_to_vocab": target_id_to_vocab,
            "target_word_to_count": target_word_to_count
        }, "vocab_tables")

    # Slice source to max_len
    if self.params["source.max_seq_len"] is not None:
        features["source_tokens"] = features[
            "source_tokens"][:, :self.params["source.max_seq_len"]]
        features["source_len"] = tf.minimum(
            features["source_len"], self.params["source.max_seq_len"])

    # Slice source_candidate to max_len
    if self.params["source_candidate.max_seq_len"] is not None:
        features["source_candidate_tokens"] = features[
            "source_candidate_tokens"][:, :self.params[
                "source_candidate.max_seq_len"]]
        features["source_candidate_len"] = tf.minimum(
            features["source_candidate_len"],
            self.params["source_candidate.max_seq_len"])

    # Look up the source ids in the vocabulary
    features["source_ids"] = source_vocab_to_id.lookup(
        features["source_tokens"])
    features["source_candidate_ids"] = source_candidate_vocab_to_id.lookup(
        features["source_candidate_tokens"])

    # Maybe reverse the source
    if self.params["source.reverse"] is True:
        features["source_ids"] = tf.reverse_sequence(
            input=features["source_ids"],
            seq_lengths=features["source_len"],
            seq_dim=1,
            batch_dim=0,
            name=None)
        features["source_candidate_ids"] = tf.reverse_sequence(
            input=features["source_candidate_ids"],
            seq_lengths=features["source_candidate_len"],
            seq_dim=1,
            batch_dim=0,
            name=None)

    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))
    features["source_candidate_len"] = tf.to_int32(
        features["source_candidate_len"])
    tf.summary.histogram("source_candidate_len",
                         tf.to_float(features["source_candidate_len"]))

    if labels is None:
        return features, None

    labels = labels.copy()

    # Slices targets to max length
    if self.params["target.max_seq_len"] is not None:
        labels["target_tokens"] = labels[
            "target_tokens"][:, :self.params["target.max_seq_len"]]
        labels["target_len"] = tf.minimum(
            labels["target_len"], self.params["target.max_seq_len"])

    # Look up the target ids in the vocabulary
    labels["target_ids"] = target_vocab_to_id.lookup(
        labels["target_tokens"])

    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Keep track of the number of processed tokens
    num_tokens = tf.reduce_sum(labels["target_len"])
    num_tokens += tf.reduce_sum(features["source_len"])
    num_tokens += tf.reduce_sum(features["source_candidate_len"])
    # The second positional argument of tf.Variable is `trainable`, not
    # `name`. Passing the string positionally created an unnamed, trainable
    # counter, so the name is passed by keyword and the counter is kept out
    # of the trainable set.
    # NOTE(review): checkpoints written before this fix store the counter
    # under its auto-generated name.
    token_counter_var = tf.Variable(0, trainable=False, name="tokens_counter")
    total_tokens = tf.assign_add(token_counter_var, num_tokens)
    tf.summary.scalar("num_tokens", total_tokens)

    # Tie the counter update to tensors that are always consumed, so it runs
    # whenever the tokens are fetched.
    with tf.control_dependencies([total_tokens]):
        features["source_tokens"] = tf.identity(features["source_tokens"])
        features["source_candidate_tokens"] = tf.identity(
            features["source_candidate_tokens"])

    # Add to graph collection for later use
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    print("attention_biseqseq features:{} labels:{}".format(
        features, labels))
    return features, labels
def build(self):
    """Assemble the char-to-word encoder / attention decoder graph.

    Characters are embedded and encoded, the encoder outputs at the positions
    in ``self.X_spaces`` are gathered into a word-level sequence, and that
    sequence is encoded forwards and backwards. The concatenated word
    encodings feed ``attention_decoder``; its training and validation outputs
    are projected to the target alphabet and stored on ``self``.
    """
    print('Building model')

    # Embedding tables for source and target alphabets.
    self.x_embeddings = tf.Variable(
        tf.random_normal([self.alphabet_src_size, self.embedd_dims],
                         stddev=0.1),
        name='x_embeddings')
    self.t_embeddings = tf.Variable(
        tf.random_normal([self.alphabet_tar_size, self.embedd_dims],
                         stddev=0.1),
        name='t_embeddings')

    src_embedded = tf.gather(self.x_embeddings, self.Xs, name='embed_X')
    tgt_embedded = tf.gather(self.t_embeddings, self.ts_go, name='embed_t')

    # Output projection shared by the training and validation paths.
    decoder_width = self.word_encoder_units * 2
    with tf.variable_scope('dense_out'):
        W_out = tf.get_variable(
            'W_out', [self.word_encoder_units * 2, self.alphabet_tar_size])
        b_out = tf.get_variable('b_out', [self.alphabet_tar_size])

    # forward encoding
    _char_state, char_out = encoder(
        src_embedded, self.X_len, 'char_encoder', self.char_encoder_units)
    words_in = _grid_gather(char_out, self.X_spaces)
    words_in.set_shape([None, None, self.char_encoder_units])
    fw_state, fw_out = encoder(
        words_in, self.X_spaces_len, 'word_encoder',
        self.word_encoder_units)

    # backward encoding words
    words_in_rev = tf.reverse_sequence(
        words_in, tf.to_int64(self.X_spaces_len), 1)
    words_in_rev.set_shape([None, None, self.char_encoder_units])
    bw_state, bw_out = encoder(
        words_in_rev, self.X_spaces_len, 'word_encoder_backwards',
        self.word_encoder_units)
    # Bring the backward outputs back into forward time order.
    bw_out = tf.reverse_sequence(
        bw_out, tf.to_int64(self.X_spaces_len), 1)

    word_state = tf.concat(1, [fw_state, bw_state])
    word_out = tf.concat(2, [fw_out, bw_out])

    # decoding
    decoded = attention_decoder(
        word_out, self.X_spaces_len, word_state, tgt_embedded, self.t_len,
        self.attn_units, self.t_embeddings, W_out, b_out)
    _dec_state, dec_out, valid_dec_out, valid_attention_tracker = decoded

    train_logits = tf.reshape(dec_out, [-1, decoder_width])
    train_logits = tf.matmul(train_logits, W_out) + b_out

    # Dynamic [batch, target steps, target alphabet] shape for the logits.
    batch_dim = tf.expand_dims(tf.shape(self.X_len)[0], 0)
    steps_dim = tf.expand_dims(tf.shape(tgt_embedded)[1], 0)
    alphabet_dim = tf.expand_dims(tf.constant(self.alphabet_tar_size), 0)
    out_shape = tf.concat(0, [batch_dim, steps_dim, alphabet_dim])

    self.valid_attention_tracker = valid_attention_tracker.pack()
    self.out_tensor = tf.reshape(train_logits, out_shape)
    self.out_tensor.set_shape([None, None, self.alphabet_tar_size])

    valid_logits = tf.reshape(valid_dec_out, [-1, decoder_width])
    valid_logits = tf.matmul(valid_logits, W_out) + b_out
    self.valid_out_tensor = tf.reshape(valid_logits, out_shape)

    self.out = None

    # add TensorBoard summaries for all variables
    tf.contrib.layers.summarize_variables()
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import tensorflow as tf

# Load the picture; it must be a 3-D (height x width x depth) array.
filename = "MarshOrchid.jpg"
image = mpimg.imread(filename)
height, width, depth = image.shape

x = tf.Variable(image, name='x')
model = tf.global_variables_initializer()

with tf.Session() as session:
    # Each pass reverses axis 1 over its full length for every row and then
    # swaps the first two axes; doing it 2 times makes it upside down.
    for _ in range(2):
        current_shape = session.run(tf.shape(x))
        full_lengths = [current_shape[1]] * current_shape[0]
        x = tf.reverse_sequence(x, full_lengths, 1, batch_dim=0)
        x = tf.transpose(x, perm=[1, 0, 2])

    session.run(model)
    result = session.run(x)

#print(image.shape)
plt.imshow(result)
plt.show()
def construct(self, args, source_chars, target_chars, bow, eow): with self.session.graph.as_default(): # Inputs self.sentence_lens = tf.placeholder(tf.int32, [None], name="sentence_lens") self.source_ids = tf.placeholder(tf.int32, [None, None], name="source_ids") self.source_seqs = tf.placeholder(tf.int32, [None, None], name="source_seqs") self.source_seq_lens = tf.placeholder(tf.int32, [None], name="source_seq_lens") self.target_ids = tf.placeholder(tf.int32, [None, None], name="target_ids") self.target_seqs = tf.placeholder(tf.int32, [None, None], name="target_seqs") self.target_seq_lens = tf.placeholder(tf.int32, [None], name="target_seq_lens") # Training. The rest of the code assumes that # - when training the decoder, the output layer with logis for each generated # character is in `output_layer` and the corresponding predictions are in # `self.predictions_training`. # - the `target_ids` contains the gold generated characters # - the `target_lens` contains number of valid characters for each lemma # - when running decoder inference, the predictions are in `self.predictions` # and their lengths in `self.prediction_lens`. # Append EOW after target_seqs target_ids = tf.reverse_sequence(self.target_seqs, self.target_seq_lens, 1) target_ids = tf.pad(target_ids, [[0, 0], [1, 0]], constant_values=eow) target_seq_lens = self.target_seq_lens + 1 target_ids = tf.reverse_sequence(target_ids, target_seq_lens, 1) # Encoder # Generate source embeddings for source chars, of shape [source_chars, args.char_dim]. source_embeddings = tf.get_variable("source_embeddings", [source_chars, args.char_dim]) # Embed the self.source_seqs using the source embeddings. embedded_source_seqs = tf.nn.embedding_lookup( source_embeddings, self.source_seqs) # Using a GRU with dimension args.rnn_dim, process the embedded self.source_seqs # using bidirectional RNN. Store the summed fwd and bwd outputs in `source_encoded` # and the summed fwd and bwd states into `source_states`. 
rnn_cell_fwd = tf.nn.rnn_cell.GRUCell(num_units=args.rnn_dim) rnn_cell_bwd = tf.nn.rnn_cell.GRUCell(num_units=args.rnn_dim) outputs, states = tf.nn.bidirectional_dynamic_rnn( rnn_cell_fwd, rnn_cell_bwd, embedded_source_seqs, sequence_length=self.source_seq_lens, dtype=tf.float32) source_encoded = tf.add(outputs[0], outputs[1]) source_states = tf.add(states[0], states[1]) # Index the unique words using self.source_ids and self.target_ids. sentence_mask = tf.sequence_mask(self.sentence_lens) source_encoded = tf.boolean_mask( tf.nn.embedding_lookup(source_encoded, self.source_ids), sentence_mask) source_states = tf.boolean_mask( tf.nn.embedding_lookup(source_states, self.source_ids), sentence_mask) source_lens = tf.boolean_mask( tf.nn.embedding_lookup(self.source_seq_lens, self.source_ids), sentence_mask) target_ids = tf.boolean_mask( tf.nn.embedding_lookup(target_ids, self.target_ids), sentence_mask) target_lens = tf.boolean_mask( tf.nn.embedding_lookup(target_seq_lens, self.target_ids), sentence_mask) # Decoder # Generate target embeddings for target chars, of shape [target_chars, args.char_dim]. target_embeddings = tf.get_variable("target_embeddings", [target_chars, args.char_dim]) # Embed the target_seqs using the target embeddings. embedded_target_seqs = tf.nn.embedding_lookup( target_embeddings, target_ids) # Generate a decoder GRU with dimension args.rnn_dim. rnn_decoder = tf.nn.rnn_cell.GRUCell(num_units=args.rnn_dim) # Create a `decoder_layer` -- a fully connected layer with # target_chars neurons used in the decoder to classify into target characters. 
decoder_layer = tf.layers.Dense(units=target_chars, activation=None) # Attention # Generate three fully connected layers without activations: # - `source_layer` with args.rnn_dim units # - `state_layer` with args.rnn_dim units # - `weight_layer` with 1 unit source_layer = tf.layers.Dense(args.rnn_dim, activation=None) state_layer = tf.layers.Dense(args.rnn_dim, activation=None) weight_layer = tf.layers.Dense(1, activation=None) def with_attention(inputs, states): # Generate the attention # Project source_encoded using source_layer. source_projection = source_layer(source_encoded) # Change shape of states from [a, b] to [a, 1, b] and project it using state_layer. states = tf.expand_dims(states, 1) state_projection = state_layer(states) # Sum the two above projections, apply tf.tanh and project the result using weight_layer. # The result has shape [x, y, 1]. weight_projection = weight_layer( tf.tanh(tf.add(source_projection, state_projection))) # Apply tf.nn.softmax to the latest result, using axis corresponding to source characters. softmax_layer = tf.nn.softmax(weight_projection, axis=1) # Multiply the source_encoded by the latest result, and sum the results with respect # to the axis corresponding to source characters. This is the final attention. tmp = tf.multiply(source_encoded, softmax_layer) attention = tf.reduce_sum(tmp, axis=1) # Return concatenation of inputs and the computed attention. return tf.concat([inputs, attention], 1) # The DecoderTraining will be used during training. It will output logits for each # target character. 
class DecoderTraining(tf.contrib.seq2seq.Decoder): @property def batch_size(self): return tf.shape( source_states )[0] # Return size of the batch, using for example source_states size @property def output_dtype(self): return tf.float32 # Type for logits of target characters @property def output_size(self): return target_chars # Length of logits for every output def initialize(self, name=None): finished = target_lens <= 0 # False if target_lens > 0, True otherwise states = source_states # Initial decoder state to use inputs = with_attention( tf.nn.embedding_lookup(target_embeddings, tf.fill([self.batch_size], bow)), states) # Call with_attention on the embedded BOW characters of shape [self.batch_size]. # You can use tf.fill to generate BOWs of appropriate size. return finished, inputs, states def step(self, time, inputs, states, name=None): outputs, states = rnn_decoder( inputs, states ) # Run the decoder GRU cell using inputs and states. outputs = decoder_layer( outputs) # Apply the decoder_layer on outputs. next_input = with_attention( embedded_target_seqs[:, time], states ) # Next input is with_attention called on words with index `time` in target_embedded. finished = tf.less_equal( target_lens, time + 1) # False if target_lens > time + 1, True otherwise. return outputs, states, next_input, finished output_layer, _, _ = tf.contrib.seq2seq.dynamic_decode( DecoderTraining()) self.predictions_training = tf.argmax(output_layer, axis=2, output_type=tf.int32) # The DecoderPrediction will be used during prediction. It will # directly output the predicted target characters. 
class DecoderPrediction(tf.contrib.seq2seq.Decoder): @property def batch_size(self): return tf.shape( source_states )[0] # Return size of the batch, using for example source_states size @property def output_dtype(self): return tf.int32 # Type for predicted target characters @property def output_size(self): return 1 # Will return just one output def initialize(self, name=None): finished = tf.fill( [self.batch_size], False) # False of shape [self.batch_size]. states = source_states # Initial decoder state to use. inputs = with_attention( tf.nn.embedding_lookup(target_embeddings, tf.fill([self.batch_size], bow)), states) # Call with_attention on the embedded BOW characters of shape [self.batch_size]. # You can use tf.fill to generate BOWs of appropriate size. return finished, inputs, states def step(self, time, inputs, states, name=None): outputs, states = rnn_decoder( inputs, states ) # Run the decoder GRU cell using inputs and states. outputs = decoder_layer( outputs) # Apply the decoder_layer on outputs. # Use tf.argmax to choose most probable class (supply parameter `output_type=tf.int32`). outputs = tf.argmax(outputs, axis=1, output_type=tf.int32) next_input = with_attention( tf.nn.embedding_lookup(target_embeddings, outputs), states ) # Embed `outputs` using target_embeddings and pass it to with_attention. 
finished = tf.equal( outputs, eow) # True where outputs==eow, False otherwise # Use tf.equal for the comparison, Python's '==' is not overloaded return outputs, states, next_input, finished self.predictions, _, self.prediction_lens = tf.contrib.seq2seq.dynamic_decode( DecoderPrediction(), maximum_iterations=tf.reduce_max(source_lens) + 10) # - the `target_ids` contains the gold generated characters # - the `target_lens` contains number of valid characters for each lemma # Training weights = tf.sequence_mask(target_lens, dtype=tf.float32) loss = tf.losses.sparse_softmax_cross_entropy(target_ids, output_layer, weights=weights) global_step = tf.train.create_global_step() self.training = tf.train.AdamOptimizer().minimize( loss, global_step=global_step, name="training") # Summaries accuracy_training = tf.reduce_all(tf.logical_or( tf.equal(self.predictions_training, target_ids), tf.logical_not(tf.sequence_mask(target_lens))), axis=1) self.current_accuracy_training, self.update_accuracy_training = tf.metrics.mean( accuracy_training) minimum_length = tf.minimum( tf.shape(self.predictions)[1], tf.shape(target_ids)[1]) accuracy = tf.logical_and( tf.equal(self.prediction_lens, target_lens), tf.reduce_all(tf.logical_or( tf.equal(self.predictions[:, :minimum_length], target_ids[:, :minimum_length]), tf.logical_not( tf.sequence_mask(target_lens, maxlen=minimum_length))), axis=1)) self.current_accuracy, self.update_accuracy = tf.metrics.mean( accuracy) self.current_loss, self.update_loss = tf.metrics.mean( loss, weights=tf.reduce_sum(weights)) self.reset_metrics = tf.variables_initializer( tf.get_collection(tf.GraphKeys.METRIC_VARIABLES)) summary_writer = tf.contrib.summary.create_file_writer( args.logdir, flush_millis=10 * 1000) self.summaries = {} with summary_writer.as_default( ), tf.contrib.summary.record_summaries_every_n_global_steps(10): self.summaries["train"] = [ tf.contrib.summary.scalar("train/loss", self.update_loss), tf.contrib.summary.scalar("train/accuracy", 
self.update_accuracy_training) ] with summary_writer.as_default( ), tf.contrib.summary.always_record_summaries(): for dataset in ["dev", "test"]: self.summaries[dataset] = [ tf.contrib.summary.scalar(dataset + "/loss", self.current_loss), tf.contrib.summary.scalar(dataset + "/accuracy", self.current_accuracy) ] # Initialize variables self.session.run(tf.global_variables_initializer()) with summary_writer.as_default(): tf.contrib.summary.initialize(session=self.session, graph=self.session.graph)
def build_graph(self, input_network_outputs={}, reuse=True):
    """Build the embedding, recurrent and classifier parts of the graph.

    Args:
        input_network_outputs: iterable that is unpacked into
            ``(input_network, output)`` pairs; each pair contributes one
            extra input tensor.
            NOTE(review): the default is a shared mutable ``{}``. It is not
            mutated here, but iterating a dict yields only its keys --
            confirm what callers actually pass.
        reuse: forwarded to the vocab/classifier builders. When truthy, the
            dropout keep/include probabilities are forced to 1.

    Returns:
        A tuple ``(outputs, tokens)``: ``outputs`` maps each handled output
        vocab's ``field`` to its classifier result, and ``tokens`` is a dict
        with ``n_tokens``, ``tokens_per_sequence``, ``token_weights`` and
        ``n_sequences``.
    """
    # NOTE(review): this dict is rebound before the classifiers are built,
    # so this first assignment is never read.
    outputs = {}
    with tf.variable_scope('Embeddings'):
        input_tensors = [
            input_vocab.get_input_tensor(reuse=reuse)
            for input_vocab in self.input_vocabs
        ]
        for input_network, output in input_network_outputs:
            with tf.variable_scope(input_network.classname):
                input_tensors.append(
                    input_network.get_input_tensor(output, reuse=reuse))
        # All input tensors are joined along axis 2.
        layer = tf.concat(input_tensors, 2)
    # Rescale by input_size / (number of non-zero entries on the last axis);
    # the 1e-12 guards against division by zero.
    n_nonzero = tf.to_float(
        tf.count_nonzero(layer, axis=-1, keep_dims=True))
    batch_size, bucket_size, input_size = nn.get_sizes(layer)
    layer *= input_size / (n_nonzero + tf.constant(1e-12))

    # Positions whose id placeholder is > 0 count as tokens.
    token_weights = nn.greater(self.id_vocab.placeholder, 0, dtype=tf.int32)
    tokens_per_sequence = tf.reduce_sum(token_weights, axis=1)
    n_tokens = tf.reduce_sum(tokens_per_sequence)
    n_sequences = tf.count_nonzero(tokens_per_sequence)
    # One more than the token count per sequence.
    # NOTE(review): presumably accounts for an extra boundary position --
    # confirm.
    seq_lengths = tokens_per_sequence + 1
    tokens = {
        'n_tokens': n_tokens,
        'tokens_per_sequence': tokens_per_sequence,
        'token_weights': token_weights,
        'n_sequences': n_sequences
    }

    # Dropout is disabled (probabilities of 1) whenever `reuse` is truthy.
    conv_keep_prob = 1. if reuse else self.conv_keep_prob
    recur_keep_prob = 1. if reuse else self.recur_keep_prob
    recur_include_prob = 1. if reuse else self.recur_include_prob

    # NOTE(review): seq_axis=2 is used for every reverse_sequence call in
    # this method although the sizes above are read as
    # (batch, bucket, input) -- confirm the intended axis.
    rev_layer = tf.reverse_sequence(layer, seq_lengths, seq_axis=2)
    for i in six.moves.range(self.n_layers):
        # The first layer may use a different convolution width.
        conv_width = self.first_layer_conv_width if not i else self.conv_width
        with tf.variable_scope('RNN_FW-{}'.format(i)):
            layer, _ = recurrent.directed_RNN(
                layer,
                self.recur_size,
                seq_lengths,
                bidirectional=False,
                recur_cell=self.recur_cell,
                conv_width=conv_width,
                recur_func=self.recur_func,
                conv_keep_prob=conv_keep_prob,
                recur_include_prob=recur_include_prob,
                recur_keep_prob=recur_keep_prob,
                cifg=self.cifg,
                highway=self.highway,
                highway_func=self.highway_func)
        if self.bidirectional:
            with tf.variable_scope('RNN_BW-{}'.format(i)):
                rev_layer, _ = recurrent.directed_RNN(
                    rev_layer,
                    self.recur_size,
                    seq_lengths,
                    bidirectional=False,
                    recur_cell=self.recur_cell,
                    conv_width=conv_width,
                    recur_func=self.recur_func,
                    conv_keep_prob=conv_keep_prob,
                    recur_keep_prob=recur_keep_prob,
                    recur_include_prob=recur_include_prob,
                    cifg=self.cifg,
                    highway=self.highway,
                    highway_func=self.highway_func)
    ones = tf.ones([batch_size, 1, 1])
    # `i` is the index left over from the loop above, i.e. the last layer;
    # its existing 'Initial_state' variable is fetched with reuse=True.
    with tf.variable_scope('RNN_FW-{}/RNN/Loop'.format(i), reuse=True):
        fw_initial_state = tf.get_variable('Initial_state')
        # Keep only the first recur_size-wide slice of the initial state.
        n_splits = fw_initial_state.get_shape().as_list(
        )[-1] / self.recur_size
        fw_initial_state = tf.split(fw_initial_state, int(n_splits), -1)[0]
        start_token = ones * fw_initial_state
        # Reverse, drop the first step, reverse back with the shortened
        # length, then prepend the start token along axis 1.
        layer = tf.reverse_sequence(layer, seq_lengths, seq_axis=2)
        layer = layer[:, 1:]
        layer = tf.reverse_sequence(layer, seq_lengths - 1, seq_axis=2)
        layer = tf.concat([start_token, layer], axis=1)
    if self.bidirectional:
        with tf.variable_scope('RNN_BW-{}/RNN/Loop'.format(i), reuse=True):
            bw_initial_state = tf.get_variable('Initial_state')
            n_splits = bw_initial_state.get_shape().as_list(
            )[-1] / self.recur_size
            bw_initial_state = tf.split(bw_initial_state, int(n_splits), -1)[0]
            stop_token = ones * bw_initial_state
            # NOTE(review): this concatenates the forward `layer`, not the
            # backward RNN output `rev_layer` -- confirm this is intended.
            rev_layer = tf.concat([stop_token, layer], axis=1)
            rev_layer = tf.reverse_sequence(rev_layer, seq_lengths + 1,
                                            seq_axis=2)[:, 1:]
        if self.bilin:
            # Element-wise product plus both directions.
            layer = tf.concat([layer * rev_layer, layer, rev_layer], axis=2)
        else:
            layer = tf.concat([layer, rev_layer], axis=2)

    output_vocabs = {vocab.field: vocab for vocab in self.output_vocabs}
    outputs = {}
    with tf.variable_scope('Classifiers'):
        # Only the 'form', 'upos' and 'xpos' fields are handled; each one
        # present is also registered in self._evals.
        if 'form' in output_vocabs:
            vocab = output_vocabs['form']
            outputs[vocab.field] = vocab.get_sampled_linear_classifier(
                layer,
                self.n_samples,
                token_weights=token_weights,
                reuse=reuse)
            self._evals.add('form')
        if 'upos' in output_vocabs:
            vocab = output_vocabs['upos']
            outputs[vocab.field] = vocab.get_linear_classifier(
                layer, token_weights=token_weights, reuse=reuse)
            self._evals.add('upos')
        if 'xpos' in output_vocabs:
            vocab = output_vocabs['xpos']
            outputs[vocab.field] = vocab.get_linear_classifier(
                layer, token_weights=token_weights, reuse=reuse)
            self._evals.add('xpos')
    return outputs, tokens
def reverse_sequence(self, inputs, mask=None):
    """Reverse every tensor in ``inputs`` along axis 1.

    Args:
        inputs: iterable of tensors/arrays that support ``[:, ::-1]``
            slicing.
        mask: optional mask. When given, each row is reversed only over its
            own length, taken as the sum of the mask along axis 1 cast to
            int32; without it the whole axis is flipped.

    Returns:
        A list with one reversed tensor per element of ``inputs``.
    """
    if mask is not None:
        # Per-row length = number of "on" mask entries.
        row_lengths = K.cast(K.sum(mask, 1), 'int32')
        return [
            tf.reverse_sequence(tensor, row_lengths, seq_axis=1)
            for tensor in inputs
        ]
    # No mask: plain flip of axis 1.
    return [tensor[:, ::-1] for tensor in inputs]
def rnn(inputs,
        input_lengths,
        cell_type,
        num_layers,
        num_units,
        keep_prob,
        is_training,
        bidirectional=False,
        debug=False,
        regular_output=False):
    """Run a stack of (optionally bidirectional) dynamic RNN layers.

    Args:
        inputs: input tensor, batch x time x depth.
        input_lengths: per-example sequence lengths passed to
            ``tf.nn.dynamic_rnn`` and ``tf.reverse_sequence``.
        cell_type: RNN cell class. ``BasicLSTMCell`` / ``LSTMCell`` are
            built with ``state_is_tuple=True``.
        num_layers: number of stacked layers (must be >= 1).
        num_units: units per cell.
        keep_prob: output keep probability; a ``DropoutWrapper`` is applied
            only when ``is_training`` and ``keep_prob < 1``.
        is_training: whether dropout may be applied.
        bidirectional: run a forward and a reversed pass per layer and
            concatenate their outputs on the depth axis.
        debug: unused; kept for interface compatibility.
        regular_output: return the raw final state(s) together with the
            outputs instead of the reduced final output.

    Returns:
        Bidirectional: ``(outputs, final)`` where ``final`` is the
        concatenation of the forward and reverse final states (element 1 of
        the state tuple for tuple-state cells), or
        ``final_state_fwd + final_state_rev`` when ``regular_output``.
        Unidirectional: the final state only (element 1 of the state tuple
        for tuple-state cells), or ``(outputs, final_state)`` when
        ``regular_output``.
    """
    assert num_layers >= 1

    need_tuple_state = cell_type in (tf.nn.rnn_cell.BasicLSTMCell,
                                     tf.nn.rnn_cell.LSTMCell)
    if need_tuple_state:
        cell = cell_type(num_units, state_is_tuple=True)
    else:
        cell = cell_type(num_units)

    if is_training and keep_prob < 1:
        cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)

    if bidirectional:
        input_lengths_64 = tf.cast(input_lengths, tf.int64)
        prev_layer_fwd = inputs
        prev_layer_rev = tf.reverse_sequence(inputs, input_lengths_64, 1)
        # `range` instead of the Python-2-only `xrange`.
        for i in range(num_layers):
            with tf.variable_scope("Layer%d" % i):
                with tf.variable_scope("Fwd"):
                    outputs_fwd, final_state_fwd = tf.nn.dynamic_rnn(
                        cell,
                        prev_layer_fwd,
                        input_lengths,
                        dtype=tf.float32)
                with tf.variable_scope("Rev"):
                    outputs_rev, final_state_rev = tf.nn.dynamic_rnn(
                        cell,
                        prev_layer_rev,
                        input_lengths,
                        dtype=tf.float32)
                    # Bring the reverse outputs back to forward time order.
                    outputs_rev = tf.reverse_sequence(outputs_rev,
                                                      input_lengths_64, 1)
                prev_layer_fwd = tf.concat([outputs_fwd, outputs_rev], 2)
                prev_layer_rev = tf.reverse_sequence(prev_layer_fwd,
                                                     input_lengths_64, 1)

        if regular_output:
            return prev_layer_fwd, final_state_fwd + final_state_rev

        if need_tuple_state:
            final_state_fwd = final_state_fwd[1]
            final_state_fwd.set_shape(
                [inputs.get_shape()[0], cell.state_size[1]])
            final_state_rev = final_state_rev[1]
            final_state_rev.set_shape(
                [inputs.get_shape()[0], cell.state_size[1]])
        else:
            final_state_fwd.set_shape([inputs.get_shape()[0], cell.state_size])
            final_state_rev.set_shape([inputs.get_shape()[0], cell.state_size])

        final_output = tf.concat([final_state_fwd, final_state_rev], 1)
        return prev_layer_fwd, final_output

    # Not bidirectional.
    # The running input is initialised once, outside the loop, so that each
    # layer consumes the previous layer's outputs. Resetting it to `inputs`
    # on every iteration would make all layers but the last dead
    # computation.
    prev_layer = inputs
    for i in range(num_layers):
        with tf.variable_scope("Layer%d" % i):
            outputs, final_state = tf.nn.dynamic_rnn(cell,
                                                     prev_layer,
                                                     input_lengths,
                                                     dtype=tf.float32)
            prev_layer = outputs

    if regular_output:
        return outputs, final_state

    if need_tuple_state:
        final_state[1].set_shape([inputs.get_shape()[0], cell.state_size[1]])
        return final_state[1]
    final_state.set_shape([inputs.get_shape()[0], cell.state_size])
    return final_state
def cabasc(self):
    """Assemble the Keras model with left/right context attention.

    Five inputs are declared: the full text, its left and right context
    token sequences (each of length ``self.max_len``), one aspect id, and a
    per-position mask. The left/right contexts are encoded by GRUs, turned
    into per-position sigmoid attention scores, combined and masked, and
    used to weight the embedded text. The weighted memory, the aspect
    embedding and the mean of the memory are passed to ``ContentAttention``.

    Returns:
        A ``Model`` mapping
        ``[input_text, input_text_l, input_text_r, input_aspect, input_mask]``
        to the ``ContentAttention`` output.
    """

    def sequence_mask(sequence):
        # 1 where a timestep has any non-zero feature, 0 otherwise.
        return K.sign(K.max(K.abs(sequence), 2))

    def sequence_length(sequence):
        return K.cast(K.sum(sequence_mask(sequence), 1), tf.int32)

    def new_elmo_layer():
        return ELMoEmbedding(
            output_mode=self.config.elmo_output_mode,
            idx2word=self.config.idx2token,
            mask_zero=True,
            hub_url=self.config.elmo_hub_url,
            elmo_trainable=self.config.elmo_trainable)

    def new_word_layer():
        return Embedding(
            input_dim=self.text_embeddings.shape[0],
            output_dim=self.config.word_embed_dim,
            weights=[self.text_embeddings],
            trainable=self.config.word_embed_trainable,
            mask_zero=True)

    def spatial_dropout(tensor):
        return SpatialDropout1D(0.2)(tensor)

    input_text = Input(shape=(self.max_len, ))
    input_text_l = Input(shape=(self.max_len, ))
    input_text_r = Input(shape=(self.max_len, ))
    input_aspect = Input(shape=(1, ))
    input_mask = Input(shape=(self.max_len, ))

    # Embed the three token inputs (full text, left context, right context).
    token_inputs = (input_text, input_text_l, input_text_r)
    if self.use_elmo:
        # One separate ELMo layer per token input.
        elmo_layers = [new_elmo_layer() for _ in token_inputs]
        if self.config.use_elmo_alone:
            embedded = [
                spatial_dropout(elmo(tokens))
                for elmo, tokens in zip(elmo_layers, token_inputs)
            ]
        else:
            # A single word-embedding layer is shared by all three inputs.
            word_embedding = new_word_layer()
            embedded = [
                spatial_dropout(
                    concatenate([word_embedding(tokens),
                                 elmo(tokens)]))
                for elmo, tokens in zip(elmo_layers, token_inputs)
            ]
    else:
        word_embedding = new_word_layer()
        embedded = [
            spatial_dropout(word_embedding(tokens)) for tokens in token_inputs
        ]
    text_embed, text_l_embed, text_r_embed = embedded

    # Aspect embedding, flattened to 2d.
    # NOTE(review): the non-random branch sizes the layer from
    # self.aspect_embeddings but passes no weights -- confirm intended.
    if self.config.aspect_embed_type == 'random':
        aspect_kwargs = dict(input_dim=self.n_aspect)
    else:
        aspect_kwargs = dict(
            input_dim=self.aspect_embeddings.shape[0],
            trainable=self.config.aspect_embed_trainable)
    asp_embedding = Embedding(output_dim=self.config.aspect_embed_dim,
                              **aspect_kwargs)
    aspect_embed = Flatten()(asp_embedding(input_aspect))

    # regarding aspect string as the first unit
    left_gru = GRU(self.config.lstm_units,
                   go_backwards=True,
                   return_sequences=True)
    hidden_l = left_gru(text_l_embed)
    right_gru = GRU(self.config.lstm_units, return_sequences=True)
    hidden_r = right_gru(text_r_embed)

    # left context attention; the left GRU ran backwards, so its scores are
    # flipped back per sequence with tf.reverse_sequence.
    left_scores = TimeDistributed(Dense(1, activation='sigmoid'))(hidden_l)
    left_scores = Lambda(lambda x: tf.reverse_sequence(
        x, sequence_length(x), 1, 0))(left_scores)
    context_attend_l = Lambda(lambda x: K.squeeze(x, -1))(left_scores)

    # right context attention
    right_scores = TimeDistributed(Dense(1, activation='sigmoid'))(hidden_r)
    context_attend_r = Lambda(lambda x: K.squeeze(x, -1))(right_scores)

    # combine context attention: sum of both sides, gated by input_mask
    summed_attention = add([context_attend_l, context_attend_r])
    context_attend = multiply([summed_attention, input_mask])

    # apply context attention
    context_attend_expand = Lambda(lambda x: K.expand_dims(x))(context_attend)
    memory = multiply([text_embed, context_attend_expand])

    # sentence-level content attention
    sentence = Lambda(lambda x: K.mean(x, axis=1))(memory)
    final_output = ContentAttention()([memory, aspect_embed, sentence])

    model_inputs = [
        input_text, input_text_l, input_text_r, input_aspect, input_mask
    ]
    return Model(model_inputs, final_output)
def __init__(self,
             sequence_length,
             num_classes,
             vocab_size,
             embedding_size,
             hidden_size,
             user_size,
             user_embedding_size,
             num_utters,
             l2_reg_lambda=0.0):
    """Build the two-level (word / utterance) bidirectional GRU classifier.

    Args:
        sequence_length: width of the ``input_text`` placeholder; it is
            split into ``num_utters`` equal parts along axis 1.
        num_classes: width of ``input_y`` and of the logits.
        vocab_size: number of rows in the word embedding table.
        embedding_size: word embedding width.
        hidden_size: stored on the instance; the output layer expects
            ``hidden_size * 4`` input features.
        user_size: number of rows in the user embedding table.
        user_embedding_size: user embedding width.
        num_utters: number of utterances; width of ``input_user``.
        l2_reg_lambda: weight of the L2 penalty on the output layer's
            ``W`` and ``b``.
    """
    # Placeholders
    self.input_text = tf.placeholder(tf.int32,
                                     shape=[None, sequence_length],
                                     name='input_text')
    self.input_user = tf.placeholder(tf.int32,
                                     shape=[None, num_utters],
                                     name='input_user')
    self.input_y = tf.placeholder(tf.float32,
                                  shape=[None, num_classes],
                                  name='input_y')
    self.dropout_keep_prob = tf.placeholder(tf.float32,
                                            name='dropout_keep_prob')

    self.num_utters = num_utters
    # Tokens per utterance (truncating division).
    self.utter_length = int(sequence_length / num_utters)
    self.hidden_size = hidden_size
    self.batch_size = tf.shape(self.input_text)[0]
    self.initializer = tf.random_normal_initializer(stddev=0.1)
    self.embedding_size = embedding_size
    self.user_embedding_size = user_embedding_size
    self._instantiate_weights()

    l2_loss = tf.constant(0.0)

    # Split the text into num_utters pieces along axis 1 and stack them on
    # a new axis 1.
    input_text = tf.split(self.input_text, self.num_utters, axis=1)
    input_text = tf.stack(input_text, axis=1)
    input_user = self.input_user

    with tf.name_scope("text-embedding"):
        self.W_text = tf.Variable(tf.random_uniform(
            [vocab_size, embedding_size], -1.0, 1.0),
                                  name="W_text")
        self.embedded_words = tf.nn.embedding_lookup(
            self.W_text, input_text)
        # Fold the utterance axis into the leading axis.
        self.embedded_words_reshaped = tf.reshape(
            self.embedded_words,
            shape=[-1, self.utter_length, embedding_size])

    with tf.name_scope("user-embedding"):
        self.W_user = tf.Variable(tf.random_uniform(
            [user_size, user_embedding_size], -1.0, 1.0),
                                  name="W_user")
        self.embedded_users = tf.nn.embedding_lookup(
            self.W_user, input_user)

    with tf.name_scope("rnn"):
        input_text_reshaped = tf.reshape(input_text,
                                         shape=[-1, self.utter_length])
        # sign(abs(id)) is 1 for every non-zero token id and 0 for id 0;
        # summing it gives the per-utterance length.
        # NOTE(review): assumes id 0 is padding -- confirm.
        relevant_input_text = tf.sign(tf.abs(input_text_reshaped))
        length_input_text = tf.cast(
            tf.reduce_sum(relevant_input_text, axis=1), tf.int32)
        # Input for the backward word-level GRU: each utterance reversed
        # over its own length.
        reversed_embedded_words = tf.reverse_sequence(
            self.embedded_words_reshaped,
            length_input_text,
            batch_dim=0,
            seq_dim=1)
        hidden_state_forward_list = self.gru_forward_word_level(
            self.embedded_words_reshaped, relevant_input_text)
        hidden_state_backward_list = self.gru_backward_word_level(
            reversed_embedded_words, relevant_input_text, length_input_text)
        # Pair up forward/backward states and join each pair on axis 1.
        self.hidden_state = [
            tf.concat([h_forward, h_backward], axis=1)
            for h_forward, h_backward in zip(hidden_state_forward_list,
                                             hidden_state_backward_list)
        ]

        # Word-level attention.
        utter_representation = self.hidden_state
        utter_representation, p_attention_word = self.attention_word_level(
            utter_representation)
        self.p_attention_word = tf.reshape(
            p_attention_word, shape=[-1, self.num_utters, self.utter_length])
        utter_representation = tf.reshape(
            utter_representation,
            shape=[-1, self.num_utters, self.hidden_size * 2])

        # Utterance-level GRUs also receive the user embeddings.
        hidden_state_forward_utters = self.gru_forward_utter_level(
            utter_representation, self.embedded_users)
        hidden_state_backward_utters = self.gru_backward_utter_level(
            utter_representation, self.embedded_users)
        self.hidden_state_utter = [
            tf.concat([h_forward, h_backward], axis=1)
            for h_forward, h_backward in zip(hidden_state_forward_utters,
                                             hidden_state_backward_utters)
        ]

        # Utterance-level attention, then dropout.
        conv_representation = self.hidden_state_utter
        conv_representation, p_attention_utter = self.attention_utter_level(
            conv_representation)
        self.p_attention_utter = tf.reshape(p_attention_utter,
                                            shape=[-1, self.num_utters])
        self.h_outputs = tf.nn.dropout(conv_representation,
                                       keep_prob=self.dropout_keep_prob)

    with tf.name_scope("output"):
        W = tf.get_variable(
            "W",
            shape=[hidden_size * 4, num_classes],
            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        self.logits = tf.nn.xw_plus_b(self.h_outputs, W, b, name="logits")
        self.predictions = tf.argmax(self.logits,
                                     axis=1,
                                     name="predictions")

    with tf.name_scope("loss"):
        # Mean softmax cross-entropy plus the weighted L2 penalty.
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.logits, labels=self.input_y)
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

    with tf.name_scope("accuracy"):
        correct_predictions = tf.equal(self.predictions,
                                       tf.argmax(self.input_y, axis=1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                               tf.float32),
                                       name="accuracy")
def cudnn_bi_gru(units,
                 n_hidden,
                 seq_lengths=None,
                 n_layers=1,
                 trainable_initial_states=False,
                 name='cudnn_bi_gru',
                 reuse=False):
    """ Fast CuDNN Bi-GRU implementation

    Args:
        units: tf.Tensor with dimensions [B x T x F], where
            B - batch size
            T - number of tokens
            F - features
        n_hidden: dimensionality of hidden state
        seq_lengths: number of tokens in each sample in the batch; when
            omitted, every sample is treated as spanning all T steps
        n_layers: number of layers
        trainable_initial_states: forwarded to ``cudnn_gru_wrapper``
        name: name of the variable scope to use
        reuse: whether to reuse already initialized variables

    Returns:
        ``((h_fw, h_bw), (h_last_fw, h_last_bw))``: the per-step outputs and
        the last states of the forward and backward passes, each exactly as
        returned by ``cudnn_gru_wrapper``; ``h_bw`` is flipped back to
        forward time order.
    """
    with tf.variable_scope(name, reuse=reuse):
        if seq_lengths is None:
            # Default: full length T for each of the B samples.
            batch = tf.shape(units)[0]
            n_steps = tf.shape(units)[1]
            seq_lengths = tf.ones([batch], dtype=tf.int32) * n_steps

        # Keyword arguments common to both directions.
        gru_kwargs = dict(n_layers=n_layers,
                          trainable_initial_states=trainable_initial_states,
                          seq_lengths=seq_lengths,
                          reuse=reuse)

        with tf.variable_scope('Forward'):
            h_fw, h_last_fw = cudnn_gru_wrapper(units, n_hidden, **gru_kwargs)

        with tf.variable_scope('Backward'):
            flipped_units = tf.reverse_sequence(units,
                                                seq_lengths=seq_lengths,
                                                seq_dim=1,
                                                batch_dim=0)
            h_bw_flipped, h_last_bw = cudnn_gru_wrapper(
                flipped_units, n_hidden, **gru_kwargs)
            # Undo the reversal so both directions share one time axis.
            h_bw = tf.reverse_sequence(h_bw_flipped,
                                       seq_lengths=seq_lengths,
                                       seq_dim=1,
                                       batch_dim=0)
    return (h_fw, h_bw), (h_last_fw, h_last_bw)
def _build_model(self):
    """Build the RNN language-model graph.

    Creates the ``inputs`` / ``inputs_length`` placeholders, a next-token
    prediction loss over ``inputs`` shifted by one position, a feature
    extractor (``self.features``: last valid RNN output concatenated with
    the max over time), and the training / init ops.

    Raises:
        ValueError: if ``self.cell_type`` is not one of "LSTM", "RNN",
            "GRU", or if ``self.rnn_layers`` is empty.
    """
    # initializing the cell type
    # Strings are compared with `==`; `is` tests object identity and only
    # works by accident of interning.
    if self.cell_type == 'RNN':
        cell_element = tf.nn.rnn_cell.BasicRNNCell
    elif self.cell_type == 'LSTM':
        cell_element = tf.nn.rnn_cell.BasicLSTMCell
    elif self.cell_type == 'GRU':
        cell_element = tf.nn.rnn_cell.GRUCell
    else:
        raise ValueError('cell_type must be one of "LSTM", "RNN", "GRU"')

    # set the depth of cell
    if len(self.rnn_layers) == 1:
        cell = cell_element(self.rnn_layers[0])
    elif len(self.rnn_layers) > 1:
        cell_elements = [
            cell_element(rnn_layer) for rnn_layer in self.rnn_layers
        ]
        cell = tf.nn.rnn_cell.MultiRNNCell(cell_elements)
    else:
        # Without this branch `cell` would be undefined further down.
        raise ValueError('rnn_layers must have at least one element')

    self.inputs = tf.placeholder(tf.int32,
                                 [self.batch_size, self.max_length])
    self.inputs_length = tf.placeholder(tf.int32, [self.batch_size])

    # Predict token t+1 from tokens up to t.
    previous_tokens = self.inputs[:, :-1]
    next_tokens = self.inputs[:, 1:]
    tokens_length = self.inputs_length - 1
    previous_tokens_one_hot = tf.one_hot(previous_tokens, self.num_tokens)

    with tf.variable_scope("RNN_LM"):
        # training
        outputs, _ = tf.nn.dynamic_rnn(cell,
                                       previous_tokens_one_hot,
                                       tokens_length,
                                       dtype=tf.float32,
                                       swap_memory=True,
                                       time_major=False)

        # extracting features (same weights, full-length input)
        tf.get_variable_scope().reuse_variables()
        outputs_test, _ = tf.nn.dynamic_rnn(cell,
                                            tf.one_hot(
                                                self.inputs,
                                                self.num_tokens),
                                            self.inputs_length,
                                            dtype=tf.float32,
                                            swap_memory=True,
                                            time_major=False)
        # After reversing each sequence over its own length, index 0 holds
        # the output at the last valid step.
        last_state = tf.reverse_sequence(outputs_test,
                                         self.inputs_length,
                                         seq_dim=1,
                                         batch_dim=0)[:, 0, :]
        max_pooling = tf.reduce_max(outputs_test, 1)
        self.features = tf.concat([last_state, max_pooling], 1)

    # Flatten (batch, time) so one matmul scores every position.
    flat_size = self.batch_size * (self.max_length - 1)
    outputs = tf.reshape(outputs, [flat_size, self.rnn_layers[-1]])

    W_output = tf.Variable(tf.random_uniform(
        [self.rnn_layers[-1], self.num_tokens], -0.1, 0.1),
                           name='W_output')
    b_output = tf.Variable(tf.zeros([self.num_tokens]), name='b_output')
    outputs = tf.matmul(outputs, W_output) + b_output

    next_tokens = tf.reshape(next_tokens, [flat_size])
    # Keyword arguments are required: TF >= 1.0 rejects positional
    # labels/logits for this op.
    self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=next_tokens, logits=outputs)

    # Zero out the loss at padded positions, then average over the batch.
    tokens_mask = tf.sequence_mask(tokens_length,
                                   self.max_length - 1,
                                   dtype=tf.float32)
    self.loss = tf.multiply(self.loss, tf.reshape(tokens_mask, [flat_size]))
    self.loss = tf.reduce_sum(self.loss) / self.batch_size

    opt = tf.train.AdamOptimizer(self.learning_rate)
    grads_and_vars = opt.compute_gradients(self.loss)
    # Clip every gradient element to [-1, 1].
    grads_and_vars = [(tf.clip_by_value(gv[0], -1.0, 1.0), gv[1])
                      for gv in grads_and_vars]
    self.train_op = opt.apply_gradients(grads_and_vars)
    self.init_op = tf.global_variables_initializer()
def inference(X, weights, bias, reuse = None, trainMode = True):
    """Compute per-position unary tag scores for a batch of sentences.

    The embedded input runs through a forward and a backward LSTM; every
    position is then grouped with its ``FLAGS.mrank`` left and right
    neighbours, reduced by ``char_convolution`` and projected with
    ``weights`` / ``bias``.

    Args:
        X: token ids, looked up in ``WORDS``.
        weights: projection matrix applied to the convolved features.
        bias: projection bias.
        reuse: ignored -- the value is recomputed from ``trainMode`` below;
            the parameter is kept for interface compatibility.
        trainMode: when true, variables are created and dropout (0.5) is
            applied; otherwise existing variables are reused.

    Returns:
        ``(unary_scores, length)`` where ``unary_scores`` is reshaped to
        ``[-1, FLAGS.max_sentence_len, FLAGS.num_tags]`` and ``length`` is
        the result of ``GetLength(X)``.
    """
    word_vectors = tf.nn.embedding_lookup(WORDS, X)
    # [batch_size, 80, 50]
    length = GetLength(X)
    length_64 = tf.cast(length, tf.int64)
    reuse = None if trainMode else True

    with tf.variable_scope("rnn_fwbw", reuse = reuse):
        forward_output, _ = tf.nn.dynamic_rnn(
            tf.contrib.rnn.LSTMCell(FLAGS.num_hidden, reuse = reuse),
            word_vectors,
            dtype = tf.float32,
            sequence_length = length,
            scope = "RNN_forward")
        backward_output_, _ = tf.nn.dynamic_rnn(
            tf.contrib.rnn.LSTMCell(FLAGS.num_hidden, reuse = reuse),
            inputs = tf.reverse_sequence(word_vectors, length_64, seq_dim = 1),
            dtype = tf.float32,
            sequence_length = length,
            scope = "RNN_backword")

    # Put the backward outputs back into forward time order.
    backward_output = tf.reverse_sequence(backward_output_,
                                          length_64,
                                          seq_dim = 1)
    output = tf.concat([forward_output, backward_output], 2)
    # [batch_size, 80, 200]
    output = tf.expand_dims(output, -1)
    # [batch_size, 80, 200, 1]
    output = tf.transpose(output, perm = [1, 0, 3, 2])
    # [80, batch_size, 1, 200]

    char_blocks = output
    no_pad = [0, 0]
    for i in range(FLAGS.mrank):
        shift = i + 1
        # Left neighbour: position t must see output[t - shift], so drop
        # the LAST `shift` steps and zero-pad at the FRONT. Slicing from
        # the front and padding the front would only restore the original
        # alignment and feed the convolution unshifted copies.
        ileft = tf.pad(output[ : FLAGS.max_sentence_len - shift],
                       [[shift, 0], no_pad, no_pad, no_pad], "CONSTANT")
        # Right neighbour: position t sees output[t + shift]; drop the
        # first `shift` steps and zero-pad at the end.
        iright = tf.pad(output[shift : ],
                        [[0, shift], no_pad, no_pad, no_pad], "CONSTANT")
        char_blocks = tf.concat([ileft, char_blocks, iright], 3)
    # char_blocks.shape = [80, batch_size, 1, 200 * (2*mrank + 1)]

    char_blocks = tf.reshape(char_blocks, [
        FLAGS.max_sentence_len, -1, 2 * FLAGS.mrank + 1, 2 * FLAGS.num_hidden
    ])
    char_blocks = tf.expand_dims(char_blocks, -1)
    # [80, batch_size, 2 * mrank + 1, 200, 1]
    # Namely, [80, batch_size, 3, 200, 1] for 1-rank Markov assumption

    # do conv, one time step at a time
    abstract_chars = tf.map_fn(char_convolution, char_blocks)
    # [80, batch_size, 200]
    abstract_chars = tf.transpose(abstract_chars, perm = [1, 0, 2])
    abstract_chars = tf.reshape(abstract_chars, [-1, FLAGS.num_hidden * 2])

    if trainMode:
        abstract_chars = tf.nn.dropout(abstract_chars, 0.5)

    matricized_unary_scores = tf.matmul(abstract_chars, weights) + bias
    unary_scores = tf.reshape(
        matricized_unary_scores,
        [-1, FLAGS.max_sentence_len, FLAGS.num_tags])
    # [batch_size, 80, 4]
    return unary_scores, length
def _build_model(self):
    """Build the RNN sequence-classifier graph.

    Creates the ``inputs`` / ``inputs_length`` / ``outputs`` placeholders,
    an optionally bidirectional RNN, an optional attention pooling stack,
    the feed-forward classifier (``self.probability``, ``self.loss``,
    ``self.train_op``, ``self.init_op``) and a per-step classification
    (``self.per_step_result``) that applies the same feed-forward weights
    to every time step.

    Raises:
        ValueError: if ``self.cell_type`` is not one of "LSTM", "RNN",
            "GRU", or if ``self.rnn_layers`` is empty.
    """
    # initializing the cell type
    # Strings are compared with `==`; `is` tests object identity and only
    # works by accident of interning.
    if self.cell_type == 'RNN':
        cell_element = tf.contrib.rnn.BasicRNNCell
    elif self.cell_type == 'LSTM':
        cell_element = tf.contrib.rnn.BasicLSTMCell
    elif self.cell_type == 'GRU':
        cell_element = tf.contrib.rnn.GRUCell
    else:
        raise ValueError('cell_type must be one of "LSTM", "RNN", "GRU"')

    # set the depth of cell
    if len(self.rnn_layers) == 1:
        cell = cell_element(self.rnn_layers[0])
    elif len(self.rnn_layers) > 1:
        cell_elements = [
            cell_element(rnn_layer) for rnn_layer in self.rnn_layers
        ]
        cell = tf.contrib.rnn.MultiRNNCell(cell_elements)
    else:
        raise ValueError('rnn_layers must have at least one element')

    self.inputs = tf.placeholder(tf.int32,
                                 [self.batch_size, self.max_length])
    self.inputs_length = tf.placeholder(tf.int32, [self.batch_size])
    self.outputs = tf.placeholder(tf.int32, [self.batch_size])
    inputs_one_hot = tf.one_hot(self.inputs, self.num_tokens)

    # Width of the RNN output per time step.
    rnn_width = self.rnn_layers[-1] * (2 if self.bidirectional else 1)

    # bidirectional
    if self.bidirectional:
        # NOTE(review): the same cell object is passed for both
        # directions -- confirm that is intended.
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell,
                                                     cell,
                                                     inputs_one_hot,
                                                     self.inputs_length,
                                                     dtype=tf.float32,
                                                     swap_memory=True,
                                                     time_major=False)
        output_fw, output_bw = outputs
        per_step_outputs = tf.concat(outputs, 2)
        if self.attention_layers is None:
            # Reversing each sequence over its own length moves the last
            # valid forward output to index 0; the backward direction's
            # summary is already at index 0.
            output_fw = tf.reverse_sequence(output_fw,
                                            self.inputs_length,
                                            seq_dim=1,
                                            batch_dim=0)
            ff_input = tf.concat([output_fw[:, 0, :], output_bw[:, 0, :]],
                                 1)
        else:
            attention_input = tf.concat([output_fw, output_bw], 2)
    else:
        outputs, _ = tf.nn.dynamic_rnn(cell,
                                       inputs_one_hot,
                                       self.inputs_length,
                                       dtype=tf.float32,
                                       swap_memory=True,
                                       time_major=False)
        per_step_outputs = outputs
        if self.attention_layers is None:
            outputs = tf.reverse_sequence(outputs,
                                          self.inputs_length,
                                          seq_dim=1,
                                          batch_dim=0)
            ff_input = outputs[:, 0, :]
        else:
            attention_input = outputs

    # attention
    if self.attention_layers is not None:
        last_attention_hidden = self.rnn_layers[-1] * (
            2 if self.bidirectional else 1)
        attention_hidden = tf.reshape(
            attention_input,
            [self.batch_size * self.max_length, last_attention_hidden])
        for layer, attention_layer in enumerate(self.attention_layers):
            W_attention = tf.Variable(tf.random_uniform(
                [last_attention_hidden, attention_layer], -0.1, 0.1),
                                      name='W_attention_%d' % (layer, ))
            last_attention_hidden = attention_layer
            b_attention = tf.Variable(tf.zeros([attention_layer]),
                                      name='b_attention_%d' % (layer, ))
            attention_hidden = tf.matmul(attention_hidden,
                                         W_attention) + b_attention
            # tanh between layers, none after the last one
            if layer < len(self.attention_layers) - 1:
                attention_hidden = tf.nn.tanh(attention_hidden)
        # Masked softmax over time: exponentiate, zero padded steps,
        # normalise. The reshape requires the last attention layer to have
        # a single unit.
        attention_weights = tf.exp(
            tf.reshape(attention_hidden,
                       [self.batch_size, self.max_length]))
        inputs_mask = tf.sequence_mask(self.inputs_length,
                                       self.max_length,
                                       dtype=tf.float32)
        attention_weights *= inputs_mask
        attention_weights_sum = tf.reduce_sum(attention_weights,
                                              1,
                                              keep_dims=True)
        attention_weights /= attention_weights_sum
        # Attention-weighted sum of the RNN outputs over time.
        ff_input = tf.reduce_sum(
            tf.multiply(attention_input,
                        tf.expand_dims(attention_weights, 2)), 1)

    # feed forwards
    last_ff_hidden = rnn_width
    ff_hidden = ff_input
    params_ff = []
    n_ff_layers = len(self.ff_layers)
    for layer, ff_layer in enumerate(self.ff_layers):
        W_ff = tf.Variable(tf.random_uniform([last_ff_hidden, ff_layer],
                                             -0.1, 0.1),
                           name='W_ff_%d' % (layer, ))
        last_ff_hidden = ff_layer
        b_ff = tf.Variable(tf.zeros([ff_layer]), name='b_ff_%d' % (layer, ))
        ff_hidden = tf.matmul(ff_hidden, W_ff) + b_ff
        params_ff.append((W_ff, b_ff))
        if layer < n_ff_layers - 1:
            ff_hidden = tf.nn.tanh(ff_hidden)

    self.probability = tf.nn.softmax(ff_hidden)
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.outputs,
                                                       logits=ff_hidden))

    opt = tf.train.AdamOptimizer(self.learning_rate)
    grads_and_vars = opt.compute_gradients(self.loss)
    # Clip every gradient element to [-1, 1].
    grads_and_vars = [(tf.clip_by_value(gv[0], -1.0, 1.0), gv[1])
                      for gv in grads_and_vars]
    self.train_op = opt.apply_gradients(grads_and_vars)
    self.init_op = tf.global_variables_initializer()

    # classify the sequence per step, reusing the feed-forward weights
    per_step_hidden = tf.reshape(
        per_step_outputs, [self.batch_size * self.max_length, rnn_width])
    for layer, (W_ff, b_ff) in enumerate(params_ff):
        per_step_hidden = tf.matmul(per_step_hidden, W_ff) + b_ff
        if layer < n_ff_layers - 1:
            per_step_hidden = tf.nn.tanh(per_step_hidden)
    self.per_step_result = tf.reshape(tf.argmax(per_step_hidden, axis=1),
                                      [self.batch_size, self.max_length])
def _append_eow(self, sequences):
    """Append the EOW character right after the end of every sequence.

    A row's length is taken to be the number of its non-zero entries.
    Each row is reversed over that length, EOW is prepended as a new first
    column, and the row is reversed back over its new non-zero count, which
    leaves EOW directly behind the original content.
    """

    def nonzero_count(rows):
        # Number of entries != 0 in every row.
        return tf.reduce_sum(tf.cast(tf.not_equal(rows, 0), tf.int32),
                             axis=1)

    flipped = tf.reverse_sequence(sequences, nonzero_count(sequences), 1)
    flipped_with_eow = tf.pad(flipped, [[0, 0], [1, 0]],
                              constant_values=MorphoDataset.Factor.EOW)
    return tf.reverse_sequence(flipped_with_eow,
                               nonzero_count(flipped_with_eow), 1)
def cudnn_lstm_layer(inputs,
                     batch_size,
                     num_units,
                     lengths=None,
                     stack_size=1,
                     rnn_dropout_drop_amt=0,
                     is_training=True,
                     bidirectional=True):
    """Create a LSTM layer that uses cudnn.

    ``inputs`` is transposed with ``[1, 0, 2]`` before the LSTM(s) run and
    the result is transposed back the same way.

    With ``lengths`` given, the stack is built one layer at a time from
    unidirectional LSTMs; the backward pass of each layer sees the input
    reversed per sequence (``tf.reverse_sequence``) and its output is
    reversed back before being concatenated with the forward output on
    axis 2. Without ``lengths`` a single ``CudnnLSTM`` with
    ``num_layers=stack_size`` handles the whole stack.
    """

    def new_lstm(num_layers, direction):
        return tf.contrib.cudnn_rnn.CudnnLSTM(
            num_layers=num_layers,
            num_units=num_units,
            direction=direction,
            dropout=rnn_dropout_drop_amt,
            kernel_initializer=tf.contrib.layers.
            variance_scaling_initializer(),
            bias_initializer=tf.zeros_initializer(),
        )

    def zero_state(leading_dim):
        # Returns (h, c); c is created first.
        c = tf.zeros([leading_dim, batch_size, num_units], tf.float32)
        h = tf.zeros([leading_dim, batch_size, num_units], tf.float32)
        return h, c

    inputs_t = tf.transpose(inputs, [1, 0, 2])

    if lengths is None:
        direction = 'bidirectional' if bidirectional else 'unidirectional'
        lstm = new_lstm(stack_size, direction)
        stack_multiplier = 2 if bidirectional else 1
        initial_state = zero_state(stack_multiplier * stack_size)
        outputs, _ = lstm(inputs_t, initial_state, training=is_training)
        return tf.transpose(outputs, [1, 0, 2])

    layer_input = inputs_t
    for layer_idx in range(stack_size):
        with tf.variable_scope('stack_' + str(layer_idx)):
            with tf.variable_scope('forward'):
                lstm_fw = new_lstm(1, 'unidirectional')
                outputs_fw, _ = lstm_fw(layer_input,
                                        zero_state(1),
                                        training=is_training)

            if bidirectional:
                with tf.variable_scope('backward'):
                    lstm_bw = new_lstm(1, 'unidirectional')
                    state_bw = zero_state(1)
                    inputs_reversed = tf.reverse_sequence(layer_input,
                                                          lengths,
                                                          seq_axis=0,
                                                          batch_axis=1)
                    outputs_bw, _ = lstm_bw(inputs_reversed,
                                            state_bw,
                                            training=is_training)
                    outputs_bw = tf.reverse_sequence(outputs_bw,
                                                     lengths,
                                                     seq_axis=0,
                                                     batch_axis=1)
                layer_output = tf.concat([outputs_fw, outputs_bw], axis=2)
            else:
                layer_output = outputs_fw

            layer_input = layer_output

    # for consistency with cudnn, here we just return the top of the stack,
    # although this can easily be altered to do other things, including be
    # more resnet like
    return tf.transpose(layer_input, [1, 0, 2])
def _build_lstms(self):
    """Build the stateful forward and backward LSTM stacks.

    Reads the 'lstm' section of `self.options`, derives the token mask and
    sequence lengths from `self.ids_placeholder`, and runs `n_layers` LSTM
    layers per direction over `self.embedding`. The backward direction runs
    on the length-reversed input and its outputs are reversed back before
    being stored.

    Sets `self.lstm_outputs`, `self.lstm_state_sizes`,
    `self.lstm_init_states`, `self.lstm_final_states` (each a dict keyed by
    'forward' / 'backward' with one entry per layer), `self.mask`,
    `self.sequence_lengths` and `self.update_state_op`.
    """
    # parse the options
    lstm_options = self.options['lstm']
    lstm_dim = lstm_options['dim']
    projection_dim = lstm_options['projection_dim']
    n_lstm_layers = lstm_options.get('n_layers', 1)
    cell_clip = lstm_options.get('cell_clip')
    proj_clip = lstm_options.get('proj_clip')
    use_skip_connections = lstm_options['use_skip_connections']

    # the sequence lengths from input mask
    if self.use_character_inputs:
        mask = tf.reduce_any(self.ids_placeholder > 0, axis=2)
    else:
        mask = self.ids_placeholder > 0
    sequence_lengths = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)
    batch_size = tf.shape(sequence_lengths)[0]

    # for each direction, we'll store tensors for each layer
    self.lstm_outputs = {'forward': [], 'backward': []}
    self.lstm_state_sizes = {'forward': [], 'backward': []}
    self.lstm_init_states = {'forward': [], 'backward': []}
    self.lstm_final_states = {'forward': [], 'backward': []}

    update_ops = []
    for i_direction, direction in enumerate(['forward', 'backward']):
        is_forward = direction == 'forward'
        if is_forward:
            layer_input = self.embedding
        else:
            layer_input = tf.reverse_sequence(
                self.embedding,
                sequence_lengths,
                seq_axis=1,
                batch_axis=0
            )

        for i in range(n_lstm_layers):
            cell_kwargs = {'cell_clip': cell_clip, 'proj_clip': proj_clip}
            if projection_dim < lstm_dim:
                # are projecting down output
                cell_kwargs['num_proj'] = projection_dim
            lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim, **cell_kwargs)

            # ResidualWrapper adds inputs to outputs; no skip connection
            # from the token embedding to the 1st layer output.
            if use_skip_connections and i != 0:
                lstm_cell = tf.nn.rnn_cell.ResidualWrapper(lstm_cell)

            # the LSTMs are stateful.  To support multiple batch sizes,
            # we'll allocate size for states up to max_batch_size,
            # then use the first batch_size entries for each batch
            init_states = [
                tf.Variable(
                    tf.zeros([self._max_batch_size, dim]),
                    trainable=False
                )
                for dim in lstm_cell.state_size
            ]
            batch_init_states = [
                state[:batch_size, :] for state in init_states
            ]

            variable_scope_name = 'RNN_{0}/RNN/MultiRNNCell/Cell{1}'.format(
                i_direction, i)
            with tf.compat.v1.variable_scope(variable_scope_name):
                layer_output, final_state = tf.nn.dynamic_rnn(
                    lstm_cell,
                    layer_input,
                    sequence_length=sequence_lengths,
                    initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                        *batch_init_states),
                )

            self.lstm_state_sizes[direction].append(lstm_cell.state_size)
            self.lstm_init_states[direction].append(init_states)
            self.lstm_final_states[direction].append(final_state)
            if is_forward:
                self.lstm_outputs[direction].append(layer_output)
            else:
                # store backward outputs in the original time order
                self.lstm_outputs[direction].append(
                    tf.reverse_sequence(
                        layer_output,
                        sequence_lengths,
                        seq_axis=1,
                        batch_axis=0
                    )
                )

            with tf.control_dependencies([layer_output]):
                # update the initial states; a separate index name is used
                # so the layer index `i` is not clobbered.
                for state_idx in range(2):
                    new_state = tf.concat(
                        [final_state[state_idx][:batch_size, :],
                         init_states[state_idx][batch_size:, :]], axis=0)
                    state_update_op = tf.assign(
                        init_states[state_idx], new_state)
                    update_ops.append(state_update_op)

            # the next layer consumes this layer's (still reversed, for the
            # backward direction) output
            layer_input = layer_output

    self.mask = mask
    self.sequence_lengths = sequence_lengths
    self.update_state_op = tf.group(*update_ops)
def backward_dgcnn(x, input_mask, num_layers=2, dilation_rates=[1,2],
                   strides=[1,1], num_filters=[64,64], kernel_sizes=[3,3],
                   is_training=False, scope_name="textcnn", reuse=False,
                   activation=tf.nn.relu, is_casual=False, padding='same'):
    """Run a gated-conv stack over the length-reversed version of `x`.

    `x` is reversed along axis 1 up to each example's length (the sum of
    `input_mask` over its last axis), passed through one `gated_conv1d_op`
    and then one `residual_gated_conv1d_op` per zipped
    (dilation_rate, layer, kernel_size, stride, num_filter) tuple.

    When `is_casual` is true the input of every convolution is left-padded
    with `dilation * (kernel_size - 1)` steps and `padding` is forced to
    'valid'. When `padding` is 'same' the result of every convolution is
    multiplied by the float mask.

    NOTE(review): the list defaults are never mutated here, but they are
    shared between calls; tuples would be safer.
    NOTE(review): `activation` is accepted but both conv calls pass
    `activation=None`.

    Returns:
      The output of the last convolution, still in reversed time order (see
      the note at the bottom of the function).
    """
    # input_mask: batch_size, seq
    initializer = create_initializer(initializer_range=0.02)

    # per-example number of valid positions
    input_len = tf.reduce_sum(tf.cast(input_mask, tf.int32), axis=-1)

    # float mask with a trailing axis so it broadcasts over channels
    input_mask = tf.expand_dims(input_mask, axis=-1)
    input_mask = tf.cast(input_mask, dtype=tf.float32)

    inverse_x = tf.reverse_sequence(x, input_len, seq_axis=1, batch_axis=0)

    if is_casual:
        # NOTE(review): the pad width uses dilation_rates[0] while the first
        # conv below hard-codes dilation_rate=1; these only agree when
        # dilation_rates[0] == 1 -- confirm.
        left_pad = dilation_rates[0] * (kernel_sizes[0] - 1)
        inputs = tf.pad(inverse_x, [[0, 0, ], [left_pad, 0], [0, 0]])
        padding = 'valid'
        tf.logging.info("==casual valid padding==")
    else:
        inputs = inverse_x

    with tf.variable_scope(scope_name, reuse=reuse):
        inputs = gated_conv1d_op(inputs,
                        filters=num_filters[0],
                        kernel_size=kernel_sizes[0],
                        padding=padding,
                        activation=None,
                        strides=1,
                        reuse=reuse,
                        dilation_rate=1,
                        name="gated_conv",
                        kernel_initializer=initializer,
                        is_training=is_training)
        if padding == 'same':
            inputs *= input_mask
        residual_inputs = inputs

    # zip() stops at the shortest argument, so `num_layers` only caps the
    # number of layers at the length of the per-layer lists.
    for (dilation_rate,
            layer,
            kernel_size,
            stride,
            num_filter) in zip(dilation_rates,
                            range(num_layers),
                            kernel_sizes,
                            strides,
                            num_filters):
        layer_scope_name = "%s_layer_%s"%(str(scope_name), str(layer))
        # NOTE(review): output_shape is computed but not used below.
        output_shape = bert_utils.get_shape_list(inputs, expected_rank=3)
        with tf.variable_scope(layer_scope_name, reuse=reuse):
            if dilation_rate > 1:
                stride = 1
            if not is_casual:
                padding = padding  # no-op; keeps the caller's padding
                tf.logging.info("==none-casual same padding==")
            else:
                left_pad = dilation_rate * (kernel_size - 1)
                inputs = tf.pad(inputs, [[0, 0, ], [left_pad, 0], [0, 0]])
                padding = 'valid'
                tf.logging.info("==casual valid padding==")

            tf.logging.info("==kernel_size:%s, num_filter:%s, stride:%s, dilation_rate:%s==", str(kernel_size),
                                        str(num_filter), str(stride), str(dilation_rate))
            # NOTE(review): reuse=False is hard-coded here although the
            # first conv and the enclosing scope use `reuse` -- confirm.
            inputs = residual_gated_conv1d_op(inputs,
                            residual_inputs,
                            filters=num_filter,
                            kernel_size=kernel_size,
                            padding=padding,
                            activation=None,
                            strides=stride,
                            reuse=False,
                            dilation_rate=dilation_rate,
                            name="residual_gated_conv",
                            kernel_initializer=initializer,
                            is_training=is_training)
            if padding == 'same':
                inputs *= input_mask
            residual_inputs = inputs

    # NOTE(review): the re-reversed tensor is computed but discarded; the
    # function returns `inputs`, i.e. the outputs in reversed time order.
    # Confirm whether callers expect that or whether `inverse_x` was meant
    # to be returned.
    inverse_x = tf.reverse_sequence(inputs, input_len, seq_axis=1, batch_axis=0)
    return inputs
def _build_ops(self, lm_graph):
    """Assemble the per-layer LM embeddings with BOS/EOS positions removed.

    All ops are created under a control dependency on
    `lm_graph.update_state_op`.

    Returns:
      dict with keys
        'lm_embeddings': the token layer (embedding concatenated with itself
          on axis 2) and every LSTM layer (forward and backward concatenated
          on the last axis), stacked on a new axis 1;
        'lengths': `lm_graph.sequence_lengths - 2`;
        'token_embeddings': `lm_graph.embedding`;
        'mask': boolean mask with the BOS/EOS positions removed.
    """
    with tf.control_dependencies([lm_graph.update_state_op]):
        lengths_wo_bos_eos = lm_graph.sequence_lengths - 2

        def drop_last_valid(without_bos):
            # `without_bos` already lost its first time step. Reverse the
            # valid part so the last valid step comes first, slice it off,
            # then reverse back to the original order.
            flipped = tf.reverse_sequence(
                without_bos,
                lm_graph.sequence_lengths - 1,
                seq_axis=1,
                batch_axis=0,
            )
            flipped = flipped[:, 1:]
            return tf.reverse_sequence(
                flipped,
                lengths_wo_bos_eos,
                seq_axis=1,
                batch_axis=0,
            )

        # Layer 0 is the token embedding, duplicated so its width matches
        # the concatenated forward/backward LSTM layers.
        embedding = lm_graph.embedding
        layers = [tf.concat([embedding, embedding], axis=2)]
        layers.extend(
            tf.concat([fw, bw], axis=-1)
            for fw, bw in zip(lm_graph.lstm_outputs['forward'],
                              lm_graph.lstm_outputs['backward'])
        )

        # The layers include the BOS/EOS tokens. Remove them.
        trimmed_layers = [drop_last_valid(layer[:, 1:, :]) for layer in layers]

        # Stack the layers on a new axis 1.
        lm_embeddings = tf.concat(
            [tf.expand_dims(layer, axis=1) for layer in trimmed_layers],
            axis=1
        )

        # tf doesn't support reversing boolean tensors, so cast to int for
        # the trimming and back to bool afterwards.
        int_mask = tf.cast(lm_graph.mask[:, 1:], 'int32')
        mask_wo_bos_eos = tf.cast(drop_last_valid(int_mask), 'bool')

        return {
            'lm_embeddings': lm_embeddings,
            'lengths': lengths_wo_bos_eos,
            'token_embeddings': lm_graph.embedding,
            'mask': mask_wo_bos_eos,
        }
def bidirectional_GRU(inputs, inputs_len, cell=None,
                      cell_fn=tf.contrib.rnn.GRUCell, units=0, layers=1,
                      scope="Bidirectional_GRU", output=0, is_training=True,
                      reuse=None, dr_input_keep_prob=1.0,
                      dr_output_keep_prob=1.0, is_bidir=False):
    '''Run a (optionally bidirectional, optionally stacked) RNN over `inputs`.

    Args:
        inputs: rnn input of shape (batch_size, timestep, dim); inputs with
            more than three dimensions are reshaped so the first two
            dimensions are merged.
        inputs_len: rnn input lengths of shape (batch_size, ).
        cell: optional pre-built `(cell_fw, cell_bw)` pair; when omitted the
            cells are created from `cell_fn(units)` wrapped by
            `apply_dropout`.
        cell_fn: cell constructor, called as `cell_fn(units)`.
        units: number of units per cell.
        layers: number of stacked cells per direction.
        scope: variable scope name (also passed on to the rnn call).
        output: unused in this function.
        is_training: forwarded to `apply_dropout`.
        reuse: forwarded to `tf.variable_scope`.
        dr_input_keep_prob, dr_output_keep_prob: forwarded to
            `apply_dropout` on the multi-layer path only.
            NOTE(review): the single-layer path does not pass them --
            confirm this is intended.
        is_bidir: run both directions when true.

    Returns:
        bidirectional: (outputs concatenated on axis 2,
                        states concatenated on axis 1);
        otherwise the `(outputs, states)` pair of `tf.nn.dynamic_rnn`.
    '''

    def make_cell(input_dim):
        # One direction's cell: a MultiRNNCell for stacks, a single wrapped
        # cell otherwise.
        if layers > 1:
            return MultiRNNCell([
                apply_dropout(cell_fn(units),
                              size=input_dim if idx == 0 else units,
                              is_training=is_training,
                              input_keep_prob=dr_input_keep_prob,
                              output_keep_prob=dr_output_keep_prob)
                for idx in range(layers)
            ])
        return apply_dropout(cell_fn(units), size=input_dim,
                             is_training=is_training)

    with tf.variable_scope(scope, reuse=reuse,
                           initializer=tf.orthogonal_initializer()):
        if cell is None:
            dims = inputs.get_shape().as_list()
            if len(dims) > 3:
                print('input reshaped!!!')
                merged = dims[0] * dims[1]
                inputs = tf.reshape(inputs, (merged, dims[2], -1))
                inputs_len = tf.reshape(inputs_len, (merged, ))

            # if no cells are provided, use standard GRU cell implementation
            feature_dim = inputs.shape[-1]
            cell_fw = make_cell(feature_dim)
            if is_bidir:
                cell_bw = make_cell(feature_dim)
        else:
            cell_fw, cell_bw = cell

        if not is_bidir:
            outputs, states = tf.nn.dynamic_rnn(cell=cell_fw,
                                                inputs=inputs,
                                                dtype=tf.float32,
                                                sequence_length=inputs_len,
                                                scope=scope,
                                                time_major=False)
            return outputs, states

        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            inputs=inputs,
            sequence_length=inputs_len,
            dtype=tf.float32,
            scope=scope,
            time_major=False)
        if Params.reverse_bw:
            reversed_bw = tf.reverse_sequence(outputs[1],
                                              seq_lengths=inputs_len,
                                              seq_axis=1)
            outputs = (outputs[0], reversed_bw)
        return tf.concat(outputs, 2), tf.concat(states, axis=1)
def reverse_sequence(self, x, mask):
    """Reverse the unmasked prefix of every sequence in `x`.

    Here `mask.shape` is [batch_size, seq_len, 1]; the per-example length is
    the rounded sum of the mask over axis 1.

    Returns:
        `x` with the first `seq_len[i]` steps of example `i` reversed along
        axis 1.
    """
    seq_len = K.round(K.sum(mask, 1)[:, 0])
    seq_len = K.cast(seq_len, 'int32')
    # `seq_dim` is a deprecated alias that TF2's tf.reverse_sequence no
    # longer accepts; `seq_axis` is the supported spelling.
    return tf.reverse_sequence(x, seq_len, seq_axis=1)
import numpy as np
import tensorflow as tf
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

# Read the picture from disk and record its dimensions.
filename = "MarshOrchid.jpg"
image = mpimg.imread(filename)
height, width, depth = image.shape

# Wrap the pixel data in a TensorFlow Variable and prepare its initializer.
x = tf.Variable(image, name='x')
model = tf.global_variables_initializer()

with tf.Session() as session:
    # Every row is treated as one sequence of `width` pixels and reversed in
    # full along axis 1.
    row_lengths = [width] * height
    x = tf.reverse_sequence(x, row_lengths, 1, batch_dim=0)
    session.run(model)
    result = session.run(x)

# Show what came back.
print(result.shape)
plt.imshow(result)
plt.show()
def __init__(self, is_training, config):
    """Build the two-view graph of two-layer bidirectional LSTM encoders.

    Every input is encoded by the same recipe: a left-to-right and a
    right-to-left LSTM (layer 1), their concatenated outputs fed to a second
    pair of LSTMs (layer 2), and the layer-2 outputs at the last valid step
    of both directions concatenated and passed to `self.normalization`.
    View-1 inputs (x1, x2) share the `*_view1` variable scopes, view-2
    inputs (c1, c2) share the `*_view2` scopes. Only view-1 inputs get
    dropout on the raw input.

    This keeps the `tf.concat(axis, values, name)` argument order already
    used by this code.

    Args:
        is_training: when false only the x1 encoder is built and the
            constructor returns right after setting `self._final_state`.
        config: object providing hidden_size, view1_input_size,
            view2_input_size, margin, learning_rate, keep_prob, objective.

    Sets `_input_x1`, `_input_x1_lengths`, `_final_state`; when training
    also `_input_c1`, `_input_c1_lengths`, `_word_state`, `_loss`,
    `_train_step`, plus `_input_x2*` when objective contains 2 or 3 and
    `_input_c2*` when it contains 0 or 1.
    """
    hidden_size = config.hidden_size
    view1_input_size = config.view1_input_size
    view2_input_size = config.view2_input_size
    margin = config.margin
    lr = config.learning_rate
    kp = config.keep_prob
    obj = config.objective
    use_dropout = is_training and kp < 1

    def new_cell():
        # One LSTM cell with the settings shared by every layer/direction.
        return tf.nn.rnn_cell.BasicLSTMCell(
            hidden_size, forget_bias=1.0, state_is_tuple=True)

    def bilstm(inputs, lengths, lengths_64, layer, view, reuse, reverse_back):
        # Run one l2r and one r2l LSTM inside their layer/view scopes.
        suffix = 'layer%d_view%d' % (layer, view)
        with tf.variable_scope('l2r_' + suffix, reuse=reuse):
            l2r, _ = tf.nn.dynamic_rnn(
                new_cell(), inputs, dtype=tf.float32,
                sequence_length=lengths)
        with tf.variable_scope('r2l_' + suffix, reuse=reuse):
            r2l, _ = tf.nn.dynamic_rnn(
                new_cell(),
                tf.reverse_sequence(inputs, lengths_64, 1),
                dtype=tf.float32, sequence_length=lengths)
            if reverse_back:
                r2l = tf.reverse_sequence(r2l, lengths_64, 1)
        return l2r, r2l

    def last_valid_step(outputs, inputs, lengths):
        # Pick, per example, the output at index `length - 1`.
        flat = tf.reshape(tf.concat(1, outputs), [-1, hidden_size])
        index = (tf.range(tf.shape(inputs)[0]) * tf.shape(inputs)[1]
                 + lengths - 1)
        return tf.gather(flat, index)

    def encode(input_size, view, tag, reuse):
        # Create placeholders for one input and build its encoder.
        raw = tf.placeholder(tf.float32, [None, None, input_size])
        lengths = tf.placeholder(tf.int32, [None])
        lengths_64 = tf.to_int64(lengths)

        inputs = raw
        if view == 1 and use_dropout:
            inputs = tf.nn.dropout(raw, keep_prob=kp)

        l2r_1, r2l_1 = bilstm(inputs, lengths, lengths_64, 1, view, reuse,
                              reverse_back=True)
        layer2_in = tf.concat(
            2, [l2r_1, r2l_1], 'concat_layer1_view%d_%s' % (view, tag))
        if use_dropout:
            layer2_in = tf.nn.dropout(layer2_in, keep_prob=kp)

        # Layer-2 r2l outputs stay reversed: their last valid step is then
        # the state after reading the whole sequence backwards.
        l2r_2, r2l_2 = bilstm(layer2_in, lengths, lengths_64, 2, view, reuse,
                              reverse_back=False)
        l2r_last = last_valid_step(l2r_2, inputs, lengths)
        r2l_last = last_valid_step(r2l_2, inputs, lengths)
        embedding = self.normalization(
            tf.concat(1, [l2r_last, r2l_last],
                      'concat_view%d_%s' % (view, tag)))
        return raw, lengths, embedding

    # View 1, x1 (creates the view-1 variables)
    self._input_x1, self._input_x1_lengths, x1 = encode(
        view1_input_size, 1, 'x1', None)
    self._final_state = x1

    if not is_training:
        return

    # View 1, x2 (reuses the view-1 variables)
    if 2 in obj or 3 in obj:
        self._input_x2, self._input_x2_lengths, x2 = encode(
            view1_input_size, 1, 'x2', True)

    # View 2, c1 (creates the view-2 variables)
    self._input_c1, self._input_c1_lengths, c1 = encode(
        view2_input_size, 2, 'c1', None)
    self._word_state = c1

    # View 2, c2 (reuses the view-2 variables)
    if 0 in obj or 1 in obj:
        self._input_c2, self._input_c2_lengths, c2 = encode(
            view2_input_size, 2, 'c2', True)

    num_objectives = len(obj)
    loss = 0
    if 0 in obj:
        loss += self.contrastive_loss(margin, x1, c1, c2)
    if 1 in obj:
        loss += self.contrastive_loss(margin, c1, x1, c2)
    if 2 in obj:
        loss += self.contrastive_loss(margin, c1, x1, x2)
    if 3 in obj:
        loss += self.contrastive_loss(margin, x1, c1, x2)
    loss /= num_objectives

    self._loss = loss
    self._train_step = tf.train.AdamOptimizer(lr).minimize(loss)
def _construct_network(self, a_input, a_seqlens, n_samples, p_input, p_seqlens, maxlen, p_ids, batch_size, is_training=False, run_prompt_encoder=False, keep_prob=1.0):
    """Build the prompt-attentive response grader graph.

    Embeds responses (`a_input`) and prompts (`p_input`), encodes prompts
    with forward/backward fused LSTMs into attention keys, attends over the
    prompt embeddings, encodes responses with a second pair of fused LSTMs,
    attends over the response states via `self._bahdanau_attention`, and
    feeds the result through the fully connected 'Grader' layers.

    Args:
      a_input, a_seqlens: response ids and lengths.
      n_samples: the response batch is tiled `n_samples + 1` times.
      p_input, p_seqlens: prompt ids and lengths.
      maxlen: forwarded to `self._bahdanau_attention`.
      p_ids: prompt ids, used only when `is_training` to mask the attention.
      batch_size: batch size before tiling.
      is_training: when true, the attention entry whose index equals
        `p_ids` is zeroed for each row.
      run_prompt_encoder: when true the prompt embeddings are computed here
        from `p_input`; otherwise `self.prompt_embeddings` is used.
      keep_prob: dropout keep probability used throughout.

    Returns:
      predictions, probabilities, logits, attention
    """
    L2 = self.network_architecture['L2']
    initializer = self.network_architecture['initializer']

    # Embedding lookup for responses and prompts (shared, non-trainable
    # embedding matrix).
    with tf.variable_scope('Embeddings', initializer=initializer(self._seed)) as scope:
        embedding = slim.model_variable(
            'word_embedding',
            trainable=False,
            shape=[
                self.network_architecture['n_in'],
                self.network_architecture['n_ehid']
            ],
            initializer=tf.truncated_normal_initializer(stddev=0.1),
            regularizer=slim.l2_regularizer(L2),
            device='/GPU:0')
        a_inputs = tf.nn.dropout(tf.nn.embedding_lookup(
            embedding, a_input, name='embedded_data'), keep_prob=keep_prob, seed=self._seed + 1)
        p_inputs = tf.nn.dropout(tf.nn.embedding_lookup(
            embedding, p_input, name='embedded_data'), keep_prob=keep_prob, seed=self._seed + 2)

    # The fused LSTM cells are fed the [1, 0, 2] transpose of the inputs;
    # the backward copies are length-reversed first.
    p_inputs_fw = tf.transpose(p_inputs, [1, 0, 2])
    p_inputs_bw = tf.transpose(
        tf.reverse_sequence(p_inputs, seq_lengths=p_seqlens, seq_axis=1, batch_axis=0), [1, 0, 2])
    a_inputs_fw = tf.transpose(a_inputs, [1, 0, 2])
    a_inputs_bw = tf.transpose(
        tf.reverse_sequence(a_inputs, seq_lengths=a_seqlens, seq_axis=1, batch_axis=0), [1, 0, 2])

    if run_prompt_encoder == True:
        # Prompt Encoder RNN
        with tf.variable_scope('RNN_Q_FW', initializer=initializer(
                self._seed)) as scope:
            rnn_fw = tf.contrib.rnn.LSTMBlockFusedCell(
                num_units=self.network_architecture['n_phid'])
            _, state_fw = rnn_fw(p_inputs_fw, sequence_length=p_seqlens, dtype=tf.float32)
        with tf.variable_scope('RNN_Q_BW', initializer=initializer(
                self._seed)) as scope:
            rnn_bw = tf.contrib.rnn.LSTMBlockFusedCell(
                num_units=self.network_architecture['n_phid'])
            _, state_bw = rnn_bw(p_inputs_bw, sequence_length=p_seqlens, dtype=tf.float32)
        # element [1] of each final state is used; presumably the hidden
        # output h of the (c, h) pair -- TODO confirm
        prompt_embeddings = tf.concat([state_fw[1], state_bw[1]], axis=1)
        prompt_embeddings = tf.nn.dropout(prompt_embeddings, keep_prob=keep_prob, seed=self._seed)
    else:
        prompt_embeddings = tf.nn.dropout(self.prompt_embeddings, keep_prob=keep_prob, seed=self._seed)

    # Attention keys: a second prompt encoder with its own variables.
    with tf.variable_scope('RNN_KEY_FW', initializer=initializer(self._seed)) as scope:
        rnn_fw = tf.contrib.rnn.LSTMBlockFusedCell(
            num_units=self.network_architecture['n_phid'])
        _, state_fw = rnn_fw(p_inputs_fw, sequence_length=p_seqlens, dtype=tf.float32)
    with tf.variable_scope('RNN_KEY_BW', initializer=initializer(self._seed)) as scope:
        rnn_bw = tf.contrib.rnn.LSTMBlockFusedCell(
            num_units=self.network_architecture['n_phid'])
        _, state_bw = rnn_bw(p_inputs_bw, sequence_length=p_seqlens, dtype=tf.float32)
    keys = tf.nn.dropout(tf.concat([state_fw[1], state_bw[1]], axis=1), keep_prob=keep_prob, seed=self._seed + 10)

    with tf.variable_scope('PROMPT_ATN', initializer=initializer(self._seed)) as scope:
        # Compute Attention over known questions
        mems = slim.fully_connected(
            prompt_embeddings,
            2 * self.network_architecture['n_phid'],
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(L2),
            scope="mem")
        mems = tf.expand_dims(mems, axis=0, name='expanded_mems')
        tkeys = slim.fully_connected(
            keys,
            2 * self.network_architecture['n_phid'],
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(L2),
            scope="tkeys")
        # NOTE(review): this op reuses the name 'expanded_mems' given to the
        # mems expansion above.
        tkeys = tf.expand_dims(tkeys, axis=1, name='expanded_mems')
        v = slim.model_variable(
            'v',
            shape=[2 * self.network_architecture['n_phid'], 1],
            regularizer=slim.l2_regularizer(L2),
            device='/GPU:0')

        # additive score: tanh(mems + tkeys) projected onto v
        tmp = tf.nn.tanh(mems + tkeys)
        # Python 2 print statement (debug output of the static shape).
        print tmp.get_shape()
        tmp = tf.nn.dropout(tf.reshape(
            tmp, shape=[-1, 2 * self.network_architecture['n_phid']]), keep_prob=keep_prob, seed=self._seed + 3)
        a = tf.exp(
            tf.reshape(tf.matmul(tmp, v), [batch_size * (n_samples + 1), -1]))

        if is_training:
            # 0 where the topic index equals p_ids for that row, 1 elsewhere
            mask = tf.where(
                tf.equal(
                    tf.expand_dims(p_ids, axis=1),
                    tf.tile(
                        tf.expand_dims(
                            tf.range(0, self.network_architecture['n_topics'], dtype=tf.int32), axis=0),
                        [batch_size * (n_samples + 1), 1])),
                tf.zeros(shape=[
                    batch_size * (n_samples + 1),
                    self.network_architecture['n_topics']
                ], dtype=tf.float32),
                tf.ones(shape=[
                    batch_size * (n_samples + 1),
                    self.network_architecture['n_topics']
                ], dtype=tf.float32))
            a = a * mask

        # normalise the (optionally masked) exponentiated scores per row
        attention = a / tf.reduce_sum(a, axis=1, keep_dims=True)
        attended_prompt_embedding = tf.matmul(attention, prompt_embeddings)

    # Response Encoder RNN
    with tf.variable_scope('RNN_A_FW', initializer=initializer(self._seed)) as scope:
        rnn_fw = tf.contrib.rnn.LSTMBlockFusedCell(
            num_units=self.network_architecture['n_phid'])
        outputs_fw, _ = rnn_fw(a_inputs_fw, sequence_length=a_seqlens, dtype=tf.float32)
    with tf.variable_scope('RNN_A_BW', initializer=initializer(self._seed)) as scope:
        rnn_bw = tf.contrib.rnn.LSTMBlockFusedCell(
            num_units=self.network_architecture['n_phid'])
        outputs_bw, _ = rnn_bw(a_inputs_bw, sequence_length=a_seqlens, dtype=tf.float32)

    # NOTE(review): outputs_bw was computed on length-reversed inputs and is
    # concatenated here without being reversed back, so step t pairs the
    # forward state at t with the backward state of the reversed sequence
    # at t -- confirm this is intended.
    outputs = tf.concat([outputs_fw, outputs_bw], axis=2)
    outputs = tf.transpose(outputs, [1, 0, 2])
    outputs = tf.nn.dropout(outputs, keep_prob=keep_prob, seed=self._seed)

    # repeat the responses once per (sample + 1)
    a_seqlens = tf.tile(a_seqlens, [n_samples + 1])
    outputs = tf.tile(outputs, [1 + n_samples, 1, 1])

    hidden, attention = self._bahdanau_attention(
        memory=outputs,
        seq_lens=a_seqlens,
        maxlen=maxlen,
        query=attended_prompt_embedding,
        size=2 * self.network_architecture['n_rhid'],
        batch_size=batch_size * (n_samples + 1))

    with tf.variable_scope('Grader') as scope:
        for layer in xrange(self.network_architecture['n_flayers']):
            hidden = slim.fully_connected(
                hidden,
                self.network_architecture['n_fhid'],
                activation_fn=self.network_architecture['f_activation_fn'],
                weights_regularizer=slim.l2_regularizer(L2),
                scope="hidden_layer_" + str(layer))
            hidden = tf.nn.dropout(hidden, keep_prob=keep_prob, seed=self._seed + layer)

        logits = slim.fully_connected(hidden,
                                      self.network_architecture['n_out'],
                                      activation_fn=None,
                                      scope="output_layer")
        probabilities = self.network_architecture['output_fn'](logits)
        predictions = tf.cast(tf.round(probabilities), dtype=tf.float32)

    return predictions, probabilities, logits, attention
def reverse_sequence(x):
    """Reverse each sequence up to its own length along axis 1.

    Args:
        x: a 3-element sequence `(values, mask, seq_len)`. Only `values` and
            `seq_len` are used; `mask` is accepted so the caller's packing
            stays unchanged.

    Returns:
        `values` with the first `seq_len[i]` steps of example `i` reversed.
    """
    values, _mask, seq_len = x
    # `seq_dim` is a deprecated alias that TF2's tf.reverse_sequence no
    # longer accepts; `seq_axis` is the supported spelling.
    return tf.reverse_sequence(values, seq_len, seq_axis=1)
def __init__(self, vocab_size, tok_emb_mat, emb_dim=256, n_hidden=512,
             n_layers=1, n_unroll=70, model_name='test_model', gpu=1,
             bidirectional=False, dropout_keep_prob=0.7):
    """Build the (optionally bidirectional) LSTM language model graph.

    Resets the default graph, creates the placeholders, the forward (and,
    when `bidirectional`, backward) LSTM stacks with output weights tied to
    the embedding matrix, the masked loss, predictions and train op, then
    opens a session on the GPU given by `gpu`, initialises the variables and
    creates a saver and a summary writer under 'model/<model_name>'.
    """
    tf.reset_default_graph()

    # Plain attributes
    self.model_name = model_name
    self.vocab_size = vocab_size
    self.n_unroll = n_unroll
    self.dropout_keep_prob = dropout_keep_prob

    # Placeholders (created in this order)
    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[],
                                           name='lr')
    self._dropout_ph = tf.placeholder_with_default(1.0, shape=[],
                                                   name='drop')
    self.tok_ph = tf.placeholder(dtype=tf.int32, shape=[None, None],
                                 name='tok_idxs')
    default_mask = tf.ones_like(self.tok_ph, dtype=tf.float32)
    self.mask_ph = tf.placeholder_with_default(default_mask,
                                               shape=[None, None])

    # Embeddings
    emb_mat = tf.Variable(tok_emb_mat, name='Embeddings_Mat', trainable=True)
    embs = tf.nn.embedding_lookup(emb_mat, self.tok_ph)

    def lm_logits(units):
        # LSTM stack -> optional projection to emb_dim -> dropout -> logits
        # against the embedding matrix. Must be called inside the
        # direction's variable scope.
        last = n_layers - 1
        for n in range(n_layers):
            with tf.variable_scope('LSTM_' + str(n)):
                units, _ = cudnn_lstm(units, n_hidden)
                if n != last:
                    units = self._variational_dropout(units,
                                                      self._dropout_ph)
        if n_hidden != emb_dim:
            units = tf.layers.dense(units, emb_dim, name='Output_Projection')
        units = self._variational_dropout(units, self._dropout_ph)
        return tf.tensordot(units, emb_mat, (2, 1))

    loss_mask = self.mask_ph[:, 1:]

    # Forward LSTM: every step predicts the next token.
    with tf.variable_scope('Forward'):
        fw_inputs = self._variational_dropout(embs[:, :-1, :],
                                              self._dropout_ph)
        logits_fw = lm_logits(fw_inputs)
        targets = tf.one_hot(self.tok_ph, self.vocab_size)
        fw_loss = tf.losses.softmax_cross_entropy(
            targets[:, 1:, :], logits_fw,
            reduction=tf.losses.Reduction.NONE)
        fw_loss = loss_mask * fw_loss
    self.loss = fw_loss

    if bidirectional:
        # Backward LSTM: same recipe on the length-reversed sequence.
        # Lengths assumed to be equal to n_unroll + n_hist
        lengths = tf.cast(tf.reduce_sum(self.mask_ph, 1), tf.int32)
        embs_bw = tf.reverse_sequence(embs, lengths, seq_axis=1,
                                      batch_axis=0)
        with tf.variable_scope('Backward'):
            # NOTE(review): unlike the forward branch, no dropout is applied
            # to the backward inputs -- confirm this is intended.
            logits_bw = lm_logits(embs_bw[:, :-1, :])
            reversed_toks = tf.reverse_sequence(self.tok_ph, lengths,
                                                seq_axis=1, batch_axis=0)
            targets_bw = tf.one_hot(reversed_toks, self.vocab_size)
            bw_loss = tf.losses.softmax_cross_entropy(
                targets_bw[:, 1:, :], logits_bw,
                reduction=tf.losses.Reduction.NONE)
            bw_loss = loss_mask * bw_loss
        self.loss = (self.loss + bw_loss) / 2

    self.loss = tf.reduce_sum(self.loss) / tf.reduce_sum(self.mask_ph)

    # Summary
    tf.summary.scalar('log_loss', self.loss)
    self.summary = tf.summary.merge_all()

    # Predictions
    self.pred = tf.argmax(logits_fw, axis=-1)
    if bidirectional:
        # NOTE(review): logits_bw has one time step fewer than the input,
        # while `lengths` counts the full sequence; for full-length rows the
        # requested reverse length exceeds the time axis -- confirm.
        self.pred_bw = tf.argmax(
            tf.reverse_sequence(logits_bw, lengths, seq_axis=1,
                                batch_axis=0),
            axis=-1)

    # Train ops
    self.train_op = self.get_train_op(self.loss,
                                      self.learning_rate_ph,
                                      clip_norm=5.0,
                                      optimizer_scope_name='Optimizer')

    # The session, restricted to the requested GPU
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(gpu)
    self.sess = tf.Session(config=config)

    # Init variables
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver()
    self.summary_writer = tf.summary.FileWriter('model/' + self.model_name,
                                                self.sess.graph)
def _build_unidi_rnn_cudnn(self, inputs, state, sequence_length, dtype,
                           hparams, num_layers, is_fwd):
    """Run a unidirectional cuDNN LSTM, optionally over reversed sequences.

    Fixes relative to the previous version:
      * `tf.transpose` takes `perm`, not `axis`;
      * for the backward direction the LSTM *outputs* (not the inputs) are
        reversed back;
      * with a single layer, input dropout is applied with
        `hparams.dropout` before the cuDNN dropout is set to 0 (previously
        the rate was zeroed first, so no dropout was applied at all).

    Args:
        inputs: rank-3 input; transposed with [1, 0, 2] when
            `self.time_major` is false, since cuDNN only supports
            time-major input.
        state: initial state passed to the cuDNN layer.
        sequence_length: per-example lengths, used only to reverse the
            sequences when `is_fwd` is false.
        dtype: unused here; `self.dtype` decides `use_fp16`.
        hparams: provides `dropout` and `num_units`.
        num_layers: number of stacked LSTM layers.
        is_fwd: false to process each sequence back to front.

    Returns:
        (outputs, state). Outputs are left time-major (not transposed
        back). For one layer the state is a single `LSTMStateTuple`,
        otherwise a tuple with one `LSTMStateTuple` per layer.
    """
    # cudnn inputs only support time-major
    if not self.time_major:
        inputs = tf.transpose(inputs, perm=[1, 0, 2])

    if num_layers == 1 and not np.isclose(hparams.dropout, 0.):
        # Special case when dropout is used and there is only one layer:
        # apply it to the inputs here and switch the cuDNN dropout off.
        inputs = tf.nn.dropout(inputs, keep_prob=1 - hparams.dropout)
        dropout = 0.
    else:
        dropout = hparams.dropout

    sequence_length = tf.transpose(sequence_length)

    if not is_fwd:
        inputs = tf.reverse_sequence(inputs, sequence_length,
                                     batch_axis=1, seq_axis=0)

    cell = tf.contrib.cudnn_rnn.CudnnLSTM(
        num_layers=num_layers,
        num_units=hparams.num_units,
        direction=cudnn_rnn.CUDNN_RNN_UNIDIRECTION,
        use_fp16=(self.dtype == tf.float16),
        dropout=dropout)
    outputs, (h, c) = cell(inputs, initial_state=state)

    # Outputs past each sequence's length are not masked here.
    if not is_fwd:
        # Bring the backward outputs back to the original time order.
        outputs = tf.reverse_sequence(outputs, sequence_length,
                                      batch_axis=1, seq_axis=0)

    # NOTICE! There's no way to get the "correct" masked cell state in cudnn
    # rnn.
    if num_layers == 1:
        h = tf.squeeze(h, axis=0)
        c = tf.squeeze(c, axis=0)
        return outputs, tf.nn.rnn_cell.LSTMStateTuple(c=c, h=h)

    # Split h and c per layer.
    h.set_shape((num_layers, None, hparams.num_units))
    c.set_shape((num_layers, None, hparams.num_units))
    hs = tf.unstack(h)
    cs = tf.unstack(c)
    # The cell passed to bidi-dynamic-rnn is a MultiRNNCell of regular
    # LSTMs, each with a simple LSTMStateTuple state, so the state here is a
    # tuple of LSTMStateTuple.
    states = tuple(
        tf.nn.rnn_cell.LSTMStateTuple(c=layer_c, h=layer_h)
        for layer_h, layer_c in zip(hs, cs))
    # No need to transpose back
    return outputs, states
def _build_forward(self):
    """Build the forward graph from the embeddings through to the pointer decoder.

    Stages, in order: char-CNN and word embeddings (optionally with
    exact-match features and a highway network), a bidirectional
    "prepro" encoder for question and context, the attention layer, the
    modeling layer, an optional self-match layer, and the pointer decoder.

    Reads sizes and switches from ``self.config`` and input tensors from
    ``self`` (``x``, ``q``, ``cx``, ``cq``, ``x_mask``, ``q_mask``, ``emx``,
    ``emq``, ``new_emb_mat``, ``is_train``).

    Side effects: fills ``self.tensor_dict``; sets ``self.word_emb_scope``
    (when word embeddings are used), ``self.answer_string*`` placeholders,
    ``self.decoder_train_logits``, ``self.decoder_train_softmax``,
    ``self.decoder_inference``, ``self.yp`` and ``self.yp2``. Returns nothing.

    NOTE(review): the bracketed shape comments are carried over from the
    original annotations and are not verifiable from this method alone.
    NOTE(review): block nesting (which statements sit inside which
    ``variable_scope``) was inferred from statement order -- confirm the
    scope boundaries against the checkpoint's variable names.
    """
    config = self.config
    N, M, JX, JQ, VW, VC, d, W = \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
        config.max_word_size
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    JA = config.max_answer_length
    # From here on JX, JQ and M are dynamic tensors taken from the input
    # shapes, replacing the static config values read above.
    JX = tf.shape(self.x)[2]
    JQ = tf.shape(self.q)[1]
    M = tf.shape(self.x)[1]
    print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        # Char-CNN Embedding
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                filter_sizes = list(map(int, config.out_channel_dims.split(',')))  # [100]
                heights = list(map(int, config.filter_heights.split(',')))  # [5]
                assert sum(filter_sizes) == dco, (filter_sizes, dco)  # Make sure filter channels = char_cnn_out size
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Reuse the context CNN variables (scope "xx") for the question.
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        # Word Embedding
        if config.use_word_emb:
            with tf.variable_scope("emb_var") as scope, tf.device("/cpu:0"):
                if config.mode == 'train':
                    # In train mode the matrix is initialized from config.emb_mat.
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                tf.get_variable_scope().reuse_variables()
                # Keep the scope object on self so it stays reachable after this method.
                self.word_emb_scope = scope
                if config.use_glove_for_unk:
                    # Append the extra embedding rows held in self.new_emb_mat.
                    word_emb_mat = tf.concat([word_emb_mat, self.new_emb_mat], 0)

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            # Concat Char-CNN Embedding and Word Embedding
            if config.use_char_emb:
                xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

        # exact match: append self.emx / self.emq as one extra float feature
        if config.use_exact_match:  # TODO: What does it mean?
            emx = tf.expand_dims(tf.cast(self.emx, tf.float32), -1)
            xx = tf.concat([xx, emx], 3)  # [N, M, JX, di+1]
            emq = tf.expand_dims(tf.cast(self.emq, tf.float32), -1)
            qq = tf.concat([qq, emq], 2)  # [N, JQ, di+1]

    # 2 layer highway network on Concat Embedding
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            # Question side shares the highway variables created for the context.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq

    # Bidirection-LSTM (3rd layer on paper)
    cell = GRUCell(d) if config.GRU else BasicLSTMCell(d, state_is_tuple=True)
    d_cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths are derived by summing the masks along the token axis.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
    flat_x_len = flatten(x_len, 0)  # [N * M]

    with tf.variable_scope("prepro"):
        if config.use_fused_lstm:  #yes
            # Fused path: a bidirectional encoder built by hand from two
            # LSTMBlockFusedCell instances fed time-major inputs; the backward
            # direction is obtained by reversing before and after the cell.
            with tf.variable_scope("u1"):
                fw_inputs = tf.transpose(qq, [1, 0, 2])  #[time_len, batch_size, input_size]
                bw_inputs = tf.reverse_sequence(fw_inputs, q_len, batch_dim=1, seq_dim=0)
                fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                fw_outputs, fw_final = prep_fw_cell(fw_inputs, dtype=tf.float32, sequence_length=q_len, scope="fw")
                bw_outputs, bw_final = prep_bw_cell(bw_inputs, dtype=tf.float32, sequence_length=q_len, scope="bw")
                bw_outputs = tf.reverse_sequence(bw_outputs, q_len, batch_dim=1, seq_dim = 0)
                current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                # Back to batch-major.
                output = tf.transpose(current_inputs, [1, 0, 2])
                u = output
            flat_xx = flatten(xx, 2)  # [N * M, JX, d]
            if config.share_lstm_weights:  # Yes
                # Context encoder reuses the question encoder's cells and scope "u1".
                tf.get_variable_scope().reuse_variables()
                with tf.variable_scope("u1"):
                    fw_inputs = tf.transpose(flat_xx, [1, 0, 2])  #[time_len, batch_size, input_size]
                    bw_inputs = tf.reverse_sequence(fw_inputs, flat_x_len, batch_dim=1, seq_dim=0)
                    # fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                    # bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                    fw_outputs, fw_final = prep_fw_cell(fw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="fw")
                    bw_outputs, bw_final = prep_bw_cell(bw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="bw")
                    bw_outputs = tf.reverse_sequence(bw_outputs, flat_x_len, batch_dim=1, seq_dim=0)
                    current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                    output = tf.transpose(current_inputs, [1, 0, 2])
            else:  # No
                # Separate cells under scope "h1" for the context encoder.
                with tf.variable_scope("h1"):
                    fw_inputs = tf.transpose(flat_xx, [1, 0, 2])  #[time_len, batch_size, input_size]
                    bw_inputs = tf.reverse_sequence(fw_inputs, flat_x_len, batch_dim=1, seq_dim=0)
                    # fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                    # bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                    prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    fw_outputs, fw_final = prep_fw_cell(fw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="fw")
                    bw_outputs, bw_final = prep_bw_cell(bw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="bw")
                    bw_outputs = tf.reverse_sequence(bw_outputs, flat_x_len, batch_dim=1, seq_dim=0)
                    current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                    output = tf.transpose(current_inputs, [1, 0, 2])
            # Re-insert the sentence axis dropped by flatten(xx, 2).
            h = tf.expand_dims(output, 1)  # [N, M, JX, 2d]
        else:
            (fw_u, bw_u), _ = bidirectional_dynamic_rnn(d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
            u = tf.concat([fw_u, bw_u], 2)
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u  # hidden state of Q = u
        self.tensor_dict['h'] = h  # hidden state of C = h

    # Attention Flow Layer (4th layer on paper)
    with tf.variable_scope("main"):
        if config.dynamic_att:
            p0 = h
            # Tile the question encoding and mask across the M sentence axis,
            # then fold that axis into the batch.
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell = AttentionCell(cell, u, size=d, mask=q_mask, mapper='sim',
                                       input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
            first_cell = d_cell  # a GRU cell with dropout wrapper
        # Keep the attention output: p0 is overwritten inside the modeling
        # loop below, and the decoder consumes tp0.
        tp0 = p0  # Output of Attention layer

        # Modeling layer (5th layer on paper)
        with tf.variable_scope('modeling_layer'):
            if config.use_fused_lstm:
                g1, encoder_state_final = build_fused_bidirectional_rnn(inputs=p0,
                                                                        num_units=config.hidden_size,
                                                                        num_layers=config.num_modeling_layers,
                                                                        inputs_length=flat_x_len,
                                                                        input_keep_prob=config.input_keep_prob,
                                                                        scope='modeling_layer_g')
            else:
                # All but the last modeling layer; each layer's output feeds the next.
                for layer_idx in range(config.num_modeling_layers-1):
                    (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell, first_cell, p0, x_len, dtype='float', scope="g_{}".format(layer_idx))  # [N, M, JX, 2d]
                    p0 = tf.concat([fw_g0, bw_g0], 3)
                # Last layer also yields the final states used to seed the decoder.
                (fw_g1, bw_g1), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(first_cell, first_cell, p0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
                g1 = tf.concat([fw_g1, bw_g1], 3)  # [N, M, JX, 2d]

        # Self match layer
        if config.use_self_match:
            s0 = tf.reshape(g1, [N * M, JX, 2 * d])  # [N * M, JX, 2d]
            x_mask = tf.reshape(self.x_mask, [N * M, JX])  # [N * M, JX]
            if config.use_static_self_match:
                with tf.variable_scope("StaticSelfMatch"):  # implemented follow r-net section 3.3
                    W_x_Vj = tf.contrib.layers.fully_connected(  # [N * M, JX, d]
                        s0, int(d / 2), scope='row_first', activation_fn=None, biases_initializer=None
                    )
                    W_x_Vt = tf.contrib.layers.fully_connected(  # [N * M, JX, d]
                        s0, int(d / 2), scope='col_first', activation_fn=None, biases_initializer=None
                    )
                    # Broadcast-add the two projections to score every position pair.
                    sum_rc = tf.add(  # [N * M, JX, JX, d]
                        tf.expand_dims(W_x_Vj, 1),
                        tf.expand_dims(W_x_Vt, 2)
                    )
                    v = tf.get_variable('second', shape=[1, 1, 1, int(d / 2)], dtype=tf.float32)
                    Sj = tf.reduce_sum(tf.multiply(v, tf.tanh(sum_rc)), -1)  # [N * M, JX, JX]
                    Ai = softmax(Sj, mask = tf.expand_dims(x_mask, 1))  # [N * M, JX, JX]
                    Ai = tf.expand_dims(Ai, -1)  # [N * M, JX, JX, 1]
                    Vi = tf.expand_dims(s0, 1)  # [N * M, 1, JX, 2d]
                    # Attention-weighted sum of s0 over axis 2.
                    Ct = tf.reduce_sum(  # [N * M, JX, 2d]
                        tf.multiply(Ai, Vi),
                        axis = 2
                    )
                    inputs_Vt_Ct = tf.concat([s0, Ct], 2)  # [N * M, JX, 4d]
                    if config.use_fused_lstm:
                        fw_inputs = tf.transpose(inputs_Vt_Ct, [1, 0, 2])  # [time_len, batch_size, input_size]
                        bw_inputs = tf.reverse_sequence(fw_inputs, flat_x_len, batch_dim=1, seq_dim=0)
                        fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                        bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                        prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        fw_outputs, fw_s_f = prep_fw_cell(fw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="fw")
                        bw_outputs, bw_s_f = prep_bw_cell(bw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="bw")
                        # Wrap the fused cell's final state pair so the
                        # isinstance(LSTMStateTuple) check below applies.
                        fw_s_f = LSTMStateTuple(c=fw_s_f[0], h=fw_s_f[1])
                        bw_s_f = LSTMStateTuple(c=bw_s_f[0], h=bw_s_f[1])
                        bw_outputs = tf.reverse_sequence(bw_outputs, flat_x_len, batch_dim=1, seq_dim=0)
                        current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                        s1 = tf.transpose(current_inputs, [1, 0, 2])
                    else:
                        (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(first_cell, first_cell, inputs_Vt_Ct, flat_x_len, dtype='float', scope='s')  # [N, M, JX, 2d]
                        s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1
            else:
                with tf.variable_scope("DynamicSelfMatch"):
                    first_cell = AttentionCell(cell, s0, size=d, mask=x_mask, is_train=self.is_train)
                    (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(first_cell, first_cell, s0, x_len, dtype='float', scope='s')  # [N, M, JX, 2d]
                    s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1
            g1 = tf.expand_dims(s1, 1)  # [N, M, JX, 2d]

        # prepare for PtrNet
        encoder_output = g1  # [N, M, JX, 2d]
        # Multiply by the float mask so masked positions become zero.
        encoder_output = tf.expand_dims(tf.cast(self.x_mask, tf.float32), -1) * encoder_output  # [N, M, JX, 2d]

        # In the remaining case (fused LSTM without self-match),
        # encoder_state_final was already returned by build_fused_bidirectional_rnn.
        if config.use_self_match or not config.use_fused_lstm:
            if config.GRU:
                encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1, name='encoder_concat')
            else:
                if isinstance(fw_s_f, LSTMStateTuple):
                    # Concatenate forward/backward c and h separately.
                    encoder_state_c = tf.concat(
                        (fw_s_f.c, bw_s_f.c), 1, name='encoder_concat_c')
                    encoder_state_h = tf.concat(
                        (fw_s_f.h, bw_s_f.h), 1, name='encoder_concat_h')
                    encoder_state_final = LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)
                elif isinstance(fw_s_f, tf.Tensor):
                    encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1, name='encoder_concat')
                else:
                    encoder_state_final = None
                    tf.logging.error("encoder_state_final not set")

        print("encoder_state_final:", encoder_state_final)

        with tf.variable_scope("output"):
            # eos_symbol = config.eos_symbol
            # next_symbol = config.next_symbol

            # NOTE(review): the op returned by tf.assert_equal is neither kept
            # nor used as a control dependency, so in graph mode it likely
            # never executes -- confirm.
            tf.assert_equal(M, 1)  # currently dynamic M is not supported, thus we assume M==1
            answer_string = tf.placeholder(
                shape=(N, 1, JA + 1),
                dtype=tf.int32,
                name='answer_string'
            )  # [N, M, JA + 1]
            answer_string_mask = tf.placeholder(
                shape=(N, 1, JA + 1),
                dtype=tf.bool,
                name='answer_string_mask'
            )  # [N, M, JA + 1]
            answer_string_length = tf.placeholder(
                shape=(N, 1),
                dtype=tf.int32,
                name='answer_string_length',
            )  # [N, M]
            # Expose the answer placeholders both via tensor_dict and as attributes.
            self.tensor_dict['answer_string'] = answer_string
            self.tensor_dict['answer_string_mask'] = answer_string_mask
            self.tensor_dict['answer_string_length'] = answer_string_length
            self.answer_string = answer_string
            self.answer_string_mask = answer_string_mask
            self.answer_string_length = answer_string_length

            answer_string_flattened = tf.reshape(answer_string, [N * M, JA + 1])
            self.answer_string_flattened = answer_string_flattened  # [N * M, JA+1]
            print("answer_string_flattened:", answer_string_flattened)

            answer_string_length_flattened = tf.reshape(answer_string_length, [N * M])
            self.answer_string_length_flattened = answer_string_length_flattened  # [N * M]
            print("answer_string_length_flattened:", answer_string_length_flattened)

            # Decoder cell is 2*d wide, matching the concatenated
            # forward/backward encoder state built above.
            decoder_cell = GRUCell(2 * d) if config.GRU else BasicLSTMCell(2 * d, state_is_tuple=True)

            with tf.variable_scope("Decoder"):
                decoder_train_logits = ptr_decoder(decoder_cell,
                                                   tf.reshape(tp0, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                                                   tf.reshape(encoder_output, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                                                   flat_x_len,
                                                   encoder_final_state=encoder_state_final,
                                                   max_encoder_length=config.sent_size_th,
                                                   decoder_output_length=answer_string_length_flattened,  # [N * M]
                                                   batch_size=N,  # N * M (M=1)
                                                   attention_proj_dim=self.config.decoder_proj_dim,
                                                   scope='ptr_decoder')  # [batch_size, dec_len*, enc_seq_len + 1]

                self.decoder_train_logits = decoder_train_logits
                print("decoder_train_logits:", decoder_train_logits)
                self.decoder_train_softmax = tf.nn.softmax(self.decoder_train_logits)
                self.decoder_inference = tf.argmax(decoder_train_logits, axis=2,
                                                   name='decoder_inference')  # [N, JA + 1]

                # yp / yp2 are filled with -1 here rather than with real predictions.
                self.yp = tf.ones([N, M, JX], dtype=tf.int32) * -1
                self.yp2 = tf.ones([N, M, JX], dtype=tf.int32) * -1
def construct(self, args, num_words, num_chars, lem_num_chars, num_tags, num_senses, bow, eow):
    """Assemble the tagger/lemmatizer graph, training op, metrics and summaries.

    Creates the input placeholders, runs the encoder, tag decoder, lemma
    decoder and sense predictor, combines their losses with the weights in
    ``args``, builds the optimizer step, and registers streaming metrics plus
    train/dev/test summaries. Finally initializes global variables and the
    summary writer in ``self.session``.

    Args:
        args: Parsed options (cell type, dimensions, dropout, loss weights,
            gradient clipping, log directory, ...).
        num_words, num_chars, lem_num_chars: Vocabulary sizes forwarded to the
            encoder / lemma decoder.
        num_tags: Sequence whose length sets the last dimension of ``tags``;
            also forwarded to the tag decoder.
        num_senses: Forwarded to the sense predictor.
        bow, eow: Begin/end-of-word symbols; ``eow`` is appended to every
            target sequence.
    """
    graph = self.session.graph
    with graph.as_default():

        def int_input(shape, name):
            # All integer feeds share dtype int32; only shape and name vary.
            return tf.placeholder(tf.int32, shape, name=name)

        # --- Inputs -------------------------------------------------------
        # Training switches
        self.is_training = tf.placeholder(tf.bool, [])
        self.learning_rate = tf.placeholder(tf.float32, [], name="learning_rate")

        # Per-sentence lengths and the total word count derived from them
        self.sentence_lens = int_input([None], "sentence_lens")
        words_count = tf.reduce_sum(self.sentence_lens)
        self.words_count = words_count

        # Sentence -> flat word-list index pairs
        self.word_indexes = int_input([None, 2], 'word_indexes')

        # Gold tags
        self.tags = int_input([None, None, len(num_tags)], "tags")

        # Word form ids and character sequences
        self.word_ids = int_input([None, None], "word_ids")
        self.charseqs = int_input([None, None], "charseqs")
        self.charseq_lens = int_input([None], "charseq_lens")
        self.charseq_ids = int_input([None, None], "charseq_ids")

        # Lemma targets
        self.target_senses = int_input([None, None], "target_senses")
        self.target_ids = int_input([None, None], "target_ids")
        self.target_seqs = int_input([None, None], "target_seqs")
        self.target_seq_lens = int_input([None], "target_seq_lens")

        # --- Derived tensors ----------------------------------------------
        # Mask over real tokens and its sum (used to weight the metrics)
        token_weights = tf.sequence_mask(self.sentence_lens, dtype=tf.float32)
        total_weight = tf.reduce_sum(token_weights)

        # Character length of each source form, per sentence then per word
        form_len_by_sentence = tf.nn.embedding_lookup(self.charseq_lens, self.charseq_ids)
        form_len_by_word = tf.gather_nd(form_len_by_sentence, self.word_indexes)

        # Target sequences looked up per sentence position (2D) ...
        lens_by_sentence = tf.nn.embedding_lookup(self.target_seq_lens, self.target_ids)
        seqs_by_sentence = tf.nn.embedding_lookup(self.target_seqs, self.target_ids)
        # ... then flattened to the word list
        gold_lens = tf.gather_nd(lens_by_sentence, self.word_indexes)
        gold_seqs = tf.gather_nd(seqs_by_sentence, self.word_indexes)
        gold_senses = tf.gather_nd(self.target_senses, self.word_indexes)

        # Append eow: reverse, pad one symbol at the front, reverse back
        gold_seqs = tf.reverse_sequence(gold_seqs, gold_lens, 1)
        gold_seqs = tf.pad(gold_seqs, [[0, 0], [1, 0]], constant_values=eow)
        gold_lens = gold_lens + 1
        gold_seqs = tf.reverse_sequence(gold_seqs, gold_lens, 1)

        # --- RNN cell type --------------------------------------------------
        cell_types = {
            "LSTM": tf.nn.rnn_cell.LSTMCell,
            "GRU": tf.nn.rnn_cell.GRUCell,
        }
        if args.rnn_cell not in cell_types:
            raise ValueError("Unknown rnn_cell {}".format(args.rnn_cell))
        rnn_cell = cell_types[args.rnn_cell]

        # --- Encoder --------------------------------------------------------
        encoded = encoder_network(self.word_indexes, self.word_ids, self.charseqs, self.charseq_ids,
                                  self.charseq_lens, self.sentence_lens, num_words, num_chars, args.we_dim,
                                  args.cle_dim, rnn_cell, args.rnn_cell_dim, args.rnn_layers, args.dropout,
                                  self.is_training, args.separate_embed, args.separate_rnn)
        (_rnn_inputs_tags, word_rnn_outputs, sentence_rnn_outputs_tags,
         word_cle_states, word_cle_outputs) = encoded

        # --- Tagger ---------------------------------------------------------
        tagger_out = tag_decoder(
            self.tags, sentence_rnn_outputs_tags, token_weights, total_weight, num_tags, args.tags,
            args.label_smoothing)
        loss_tag, tag_outputs, self.predictions, correct_tag, correct_tags_compositional = tagger_out

        # Tag-derived features handed to the lemmatizer
        tag_feats = tag_features(tag_outputs, self.word_indexes, words_count, args.rnn_cell_dim, args.dropout,
                                 self.is_training, args.no_tags_to_lemmas, args.tag_signal_dropout)

        tag_metric = tf.metrics.mean(correct_tag, weights=total_weight)
        self.current_accuracy_tag, self.update_accuracy_tag = tag_metric
        compositional_metric = tf.metrics.mean(correct_tags_compositional)
        (self.current_accuracy_tags_compositional,
         self.update_accuracy_tags_compositional) = compositional_metric

        # --- Lemmatizer -----------------------------------------------------
        loss_lem, lemma_out = lemma_decoder(word_rnn_outputs, tag_feats, word_cle_states, word_cle_outputs,
                                            form_len_by_word, gold_seqs, gold_lens, self.charseq_lens,
                                            words_count, lem_num_chars, rnn_cell, args.rnn_cell,
                                            args.rnn_cell_dim, args.cle_dim, args.beams, args.beam_len_penalty,
                                            args.lem_smoothing, bow, eow)
        (self.lemma_predictions_training, self.lemma_predictions,
         self.lemma_prediction_lengths) = lemma_out

        # Sense predictor on top of the lemmatizer inputs
        loss_sense, self.sense_prediction = sense_predictor(word_rnn_outputs, tag_feats, gold_senses,
                                                            num_senses, words_count, args.predict_sense,
                                                            args.sense_smoothing)

        # Lemma predictions, loss and accuracy
        self._lemma_stats(gold_seqs, gold_lens, gold_senses)

        # --- Loss and optimizer step ---------------------------------------
        # Weighted combination of tag, lemma and sense losses
        loss = loss_tag + loss_lem * args.loss_lem_w + loss_sense * args.loss_sense_w
        self.global_step = tf.train.create_global_step()
        self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(self.update_ops):
            optimizer = tf.contrib.opt.LazyAdamOptimizer(learning_rate=self.learning_rate, beta2=args.beta_2)
            grads_and_vars = optimizer.compute_gradients(loss)
            gradients, variables = zip(*grads_and_vars)
            self.gradient_norm = tf.global_norm(gradients)
            if args.grad_clip:
                gradients, _ = tf.clip_by_global_norm(gradients, args.grad_clip)
            self.training = optimizer.apply_gradients(zip(gradients, variables),
                                                      global_step=self.global_step, name="training")

        # --- Saver ----------------------------------------------------------
        self.saver = tf.train.Saver(max_to_keep=2)

        # --- Streaming loss metrics ----------------------------------------
        self.current_loss_tag, self.update_loss_tag = tf.metrics.mean(loss_tag, weights=total_weight)
        self.current_loss_lem, self.update_loss_lem = tf.metrics.mean(loss_lem, weights=total_weight)
        self.current_loss_sense, self.update_loss_sense = tf.metrics.mean(loss_sense, weights=total_weight)
        self.current_loss, self.update_loss = tf.metrics.mean(loss, weights=total_weight)
        metric_vars = tf.get_collection(tf.GraphKeys.METRIC_VARIABLES)
        self.reset_metrics = tf.variables_initializer(metric_vars)

        # --- Summaries ------------------------------------------------------
        summary_writer = tf.contrib.summary.create_file_writer(args.logdir, flush_millis=1 * 1000)
        self.summaries = {}

        # Training summaries are driven by the metric update ops
        with summary_writer.as_default(), tf.contrib.summary.record_summaries_every_n_global_steps(1):
            train_scalars = (
                ("train/loss_tag", self.update_loss_tag),
                ("train/loss_sense", self.update_loss_sense),
                ("train/loss_lem", self.update_loss_lem),
                ("train/loss", self.update_loss),
                ("train/gradient", self.gradient_norm),
                ("train/accuracy_tag", self.update_accuracy_tag),
                ("train/accuracy_compositional_tags", self.update_accuracy_tags_compositional),
                ("train/accuracy_lem", self.update_accuracy_lem_train),
                ("train/accuracy_lemsense", self.update_accuracy_lemsense_train),
                ("train/learning_rate", self.learning_rate),
            )
            self.summaries["train"] = [tf.contrib.summary.scalar(tag, value)
                                       for tag, value in train_scalars]

        # Evaluation summaries read the accumulated metric values
        with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
            eval_scalars = (
                ("/loss", self.current_loss),
                ("/accuracy_tag", self.current_accuracy_tag),
                ("/accuracy_compositional_tags", self.current_accuracy_tags_compositional),
                ("/accuracy_lem", self.current_accuracy_lem),
                ("/accuracy_lemsense", self.current_accuracy_lemsense),
            )
            for dataset in ["dev", "test"]:
                self.summaries[dataset] = [tf.contrib.summary.scalar(dataset + suffix, value)
                                           for suffix, value in eval_scalars]

        # --- Initialization -------------------------------------------------
        self.session.run(tf.global_variables_initializer())
        with summary_writer.as_default():
            tf.contrib.summary.initialize(session=self.session, graph=graph)