def h_gru(model_input, vocab_size, is_training=True):
  with tf.variable_scope("EncLayer0"):
    first_enc_cell = core_rnn_cell.DeviceWrapper(
        gru_ops.GRUBlockCell(1024), device='/gpu:1')
    runtime_batch_size = tf.shape(model_input)[0]
    enc_init_state = tf.zeros((runtime_batch_size, 1024), dtype=tf.float32)
    num_splits = 15
    model_input_splits = tf.split(
        model_input, num_or_size_splits=num_splits, axis=1)
    enc_state = None
    first_layer_outputs = []
    for i in xrange(num_splits):
      if i == 0:
        initial_state = enc_init_state
      else:
        initial_state = enc_state
        tf.get_variable_scope().reuse_variables()
      initial_state = tf.stop_gradient(initial_state)
      enc_outputs, enc_state = tf.nn.dynamic_rnn(
          first_enc_cell, model_input_splits[i],
          initial_state=initial_state, scope="enc0")
      # TODO
      enc_state = moe_layer(enc_state, 1024, 4,
                            act_func=None, l2_penalty=1e-12)
      if is_training:
        enc_state = tf.nn.dropout(enc_state, 0.5)
      first_layer_outputs.append(enc_state)

  with tf.variable_scope("EncLayer1"):
    second_enc_cell = core_rnn_cell.DeviceWrapper(
        gru_ops.GRUBlockCell(1024), device='/gpu:1')
    first_layer_outputs = tf.stack(first_layer_outputs, axis=1)
    enc_outputs, enc_state = tf.nn.dynamic_rnn(
        second_enc_cell, first_layer_outputs, dtype=tf.float32, scope="enc1")
  # TODO
  if is_training:
    enc_state = tf.nn.dropout(enc_state, 0.8)
  logits = moe_layer(enc_state, vocab_size, 2,
                     act_func=tf.nn.sigmoid, l2_penalty=1e-8)
  return logits

def do_job(self):
  # `model_input`, `num_frames`, `vocab_size`, and `is_training` are assumed
  # to come from the enclosing class; this method is a fragment of a larger
  # model definition.
  first_layer_outputs = []
  num_splits = 15
  context_frames = SampleRandomSequence(model_input, num_frames, 50)
  cell = gru_ops.GRUBlockCell(1024)
  cell = core_rnn_cell.OutputProjectionWrapper(cell, vocab_size)  # unused: overwritten below
  with tf.variable_scope("EncLayer0"):
    cell = gru_ops.GRUBlockCell(1024)
    for i in xrange(num_splits):
      if i > 0:
        tf.get_variable_scope().reuse_variables()
      enc_outputs, enc_state = tf.nn.dynamic_rnn(
          cell, context_frames, dtype=tf.float32, scope="enc0")
      enc_state = moe_layer(enc_state, 1024, 4,
                            act_func=None, l2_penalty=1e-12)
      if is_training:
        enc_state = tf.nn.dropout(enc_state, 0.5)
      first_layer_outputs.append(enc_state)
  with tf.variable_scope("EncLayer1"):
    cell = gru_ops.GRUBlockCell(1024)
    first_layer_outputs = tf.stack(first_layer_outputs, axis=1)
    enc_outputs, enc_state = tf.nn.dynamic_rnn(
        cell, first_layer_outputs, dtype=tf.float32, scope="enc1")
  flatten_outputs = tf.reduce_mean(enc_outputs, axis=1)
  with tf.variable_scope("FC0"):
    flatten_outputs = moe_layer(flatten_outputs, 1024, 2,
                                act_func=tf.nn.relu, l2_penalty=1e-8)
    if is_training:
      flatten_outputs = tf.nn.dropout(flatten_outputs, 0.5)
  with tf.variable_scope("FC1"):
    logits = moe_layer(flatten_outputs, vocab_size, 2,
                       act_func=tf.nn.sigmoid, l2_penalty=1e-8)
  logits = tf.clip_by_value(logits, 0., 1.)
  return {"predictions": logits}

def get_enc_cell(self, cell_size, vocab_size):
  # cell = cudnn_rnn_ops.CudnnGRU(1, cell_size, (1024+128))
  cells = []
  cell = gru_ops.GRUBlockCell(cell_size)
  cell = core_rnn_cell.OutputProjectionWrapper(cell, cell_size)
  cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=0.5)
  cells.append(cell)
  cell = gru_ops.GRUBlockCell(cell_size)
  cells.append(cell)
  cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=False)
  return cell

def testBlockGRUToGRUCellMultiStep(self):
  with self.test_session(use_gpu=self._use_gpu, graph=tf.Graph()) as sess:
    batch_size = 2
    cell_size = 3
    input_size = 3
    time_steps = 4

    # Random initializers.
    seed = 1994
    initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=seed)
    np.random.seed(seed)

    # Inputs
    concat_x = tf.placeholder(
        tf.float32, shape=(time_steps, batch_size, input_size))
    h = tf.zeros([batch_size, cell_size])

    # Values for the inputs.
    x_values = np.random.rand(time_steps, batch_size, input_size)
    h_value = np.random.rand(batch_size, cell_size)

    # Output from the block GRU cell implementation.
    with tf.variable_scope("block", initializer=initializer):
      cell = gru_ops.GRUBlockCell(cell_size)
      outputs_dynamic, state_dynamic = tf.nn.dynamic_rnn(
          cell, inputs=concat_x, initial_state=h, time_major=True,
          dtype=tf.float32)
      feeds = {concat_x: x_values, h: h_value}
      sess.run([tf.initialize_all_variables()])
      block_res = sess.run([outputs_dynamic, state_dynamic], feeds)

    # Output from the basic GRU cell implementation.
    with tf.variable_scope("basic", initializer=initializer):
      cell = tf.nn.rnn_cell.GRUCell(cell_size)
      outputs_dynamic, state_dynamic = tf.nn.dynamic_rnn(
          cell, inputs=concat_x, initial_state=h, time_major=True,
          dtype=tf.float32)
      feeds = {concat_x: x_values, h: h_value}
      sess.run([tf.initialize_all_variables()])
      basic_res = sess.run([outputs_dynamic, state_dynamic], feeds)

    # Check the lengths of the outputs_dynamic and states.
    self.assertEqual(len(block_res), len(basic_res))
    self.assertEqual(len(block_res[0]), len(basic_res[0]))
    self.assertEqual(len(block_res[1]), len(basic_res[1]))

    # Check the outputs_dynamic values.
    for block_output, basic_output in zip(block_res[0], basic_res[0]):
      self.assertAllClose(block_output, basic_output)

    # Check the state_dynamic value.
    self.assertAllClose(block_res[1], basic_res[1])

def testBlockGRUToGRUCellSingleStep(self):
  with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
    batch_size = 4
    cell_size = 5
    input_size = 6
    seed = 1994
    initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)

    # Inputs
    x = array_ops.zeros([batch_size, input_size])
    h = array_ops.zeros([batch_size, cell_size])

    # Values for the inputs.
    x_value = np.random.rand(batch_size, input_size)
    h_value = np.random.rand(batch_size, cell_size)

    # Output from the basic GRU cell implementation.
    with vs.variable_scope("basic", initializer=initializer):
      output = core_rnn_cell_impl.GRUCell(cell_size)(x, h)
      sess.run([variables.global_variables_initializer()])
      basic_res = sess.run([output], {x: x_value, h: h_value})

    # Output from the block GRU cell implementation.
    with vs.variable_scope("block", initializer=initializer):
      output = gru_ops.GRUBlockCell(cell_size)(x, h)
      sess.run([variables.global_variables_initializer()])
      block_res = sess.run([output], {x: x_value, h: h_value})

    self.assertEqual(len(block_res), len(basic_res))
    for block, basic in zip(block_res, basic_res):
      self.assertAllClose(block, basic)

def get_enc_cell(self, cell_size):
  # cell = cudnn_rnn_ops.CudnnGRU(1, cell_size, (1024+128))
  cell = gru_ops.GRUBlockCell(cell_size)
  return cell

def do_reconstruction(enc_inputs, enc_outputs, enc_last_state, input_weights,
                      seq_lengths):
  num_units = 100
  # attn_mech = attention_wrapper.LuongAttention(
  #     num_units=num_units,
  #     memory=enc_outputs,
  #     memory_sequence_length=seq_lengths,
  #     scale=True)
  attn_mech = tf.contrib.seq2seq.BahdanauAttention(
      num_units=num_units,
      memory=enc_outputs,
      memory_sequence_length=seq_lengths,
      normalize=True,
      name='attention_mechanism')
  cell = gru_ops.GRUBlockCell(1024)
  cell = core_rnn_cell.DropoutWrapper(cell, 0.5, 0.5)
  attn_cell = tf.contrib.seq2seq.AttentionWrapper(
      cell=cell,
      attention_mechanism=attn_mech,
      attention_layer_size=1024,
      output_attention=False,
      initial_cell_state=enc_last_state,
      name="attention_wrapper")

  # Decoder targets are the encoder inputs reversed in time; decoder inputs
  # are the targets shifted right by one step (teacher forcing).
  decoder_target = tf.reverse_sequence(
      enc_inputs, seq_lengths, seq_dim=1, batch_dim=0)
  decoder_inputs = tf.pad(decoder_target[:, :-1, :], [[0, 0], [1, 0], [0, 0]])
  helper = tf.contrib.seq2seq.TrainingHelper(
      inputs=decoder_inputs,          # decoder inputs
      sequence_length=seq_lengths,    # decoder input length
      name="decoder_training_helper")

  # Decoder setup
  decoder = tf.contrib.seq2seq.BasicDecoder(
      cell=attn_cell,
      helper=helper,
      initial_state=attn_cell.zero_state(
          tf.shape(enc_inputs)[0], dtype=tf.float32),
      output_layer=Dense(1024 + 128))

  # Perform dynamic decoding with the decoder object.
  dec_outputs, final_state, final_sequence_lengths = (
      tf.contrib.seq2seq.dynamic_decode(decoder, swap_memory=True))

  loss = reconstruct_loss(logit=dec_outputs.rnn_output, target=decoder_target)
  # input_weights = tf.cast(input_weights, tf.float32)
  loss = tf.reduce_sum(loss * input_weights, axis=1) / tf.cast(
      seq_lengths, tf.float32)
  loss = tf.reduce_mean(loss)
  # loss = tf.contrib.seq2seq.sequence_loss(
  #     dec_outputs.rnn_output, decoder_target, input_weights,
  #     softmax_loss_function=reconstruct_loss)
  predictions = tf.no_op()
  return predictions, loss

def inference_gru_block_vs_gru_cell(batch_size,
                                    cell_size,
                                    input_size,
                                    time_steps,
                                    use_gpu=False,
                                    iters=30):
  """Benchmark inference speed between GRUBlockCell vs GRUCell."""
  tf.reset_default_graph()
  with tf.Session(graph=tf.Graph()) as sess:
    with tf.device("/cpu:0" if not use_gpu else "/gpu:0"):
      # Random initializers.
      seed = 1994
      initializer = tf.random_uniform_initializer(-1, 1, seed=seed)
      np.random.seed(seed)

      # Inputs
      concat_x = vs.get_variable("concat_x",
                                 [time_steps, batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])

      # Output from the basic GRU cell implementation.
      with tf.variable_scope("basic", initializer=initializer):
        cell = tf.nn.rnn_cell.GRUCell(cell_size)
        outputs_dynamic, _ = tf.nn.dynamic_rnn(
            cell, inputs=concat_x, initial_state=h, time_major=True,
            dtype=tf.float32)
        sess.run([tf.initialize_all_variables()])
        basic_time_inference = time_taken_by_op(outputs_dynamic, sess, iters)

      # Output from the block GRU cell implementation.
      with tf.variable_scope("block", initializer=initializer):
        cell = gru_ops.GRUBlockCell(cell_size)
        outputs_dynamic, _ = tf.nn.dynamic_rnn(
            cell, inputs=concat_x, initial_state=h, time_major=True,
            dtype=tf.float32)
        sess.run([tf.initialize_all_variables()])
        block_time_inference = time_taken_by_op(outputs_dynamic, sess, iters)

    performance_inference = (
        basic_time_inference - block_time_inference
    ) * 100 / basic_time_inference
    print(",".join([
        str(batch_size), str(cell_size), str(input_size), str(time_steps),
        str(use_gpu), str(basic_time_inference), str(block_time_inference),
        str(performance_inference)
    ]))

  return basic_time_inference, block_time_inference

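# A hypothetical driver (not part of the snippets above) that sweeps
# inference_gru_block_vs_gru_cell over a few shapes. The header mirrors the
# CSV columns the function prints; the specific batch/cell/input sizes and
# iteration count are assumptions for illustration only.
def run_inference_benchmarks():
  print("batch_size,cell_size,input_size,time_steps,use_gpu,"
        "basic_time,block_time,performance_gain_pct")
  for batch_size in (32, 128):
    for cell_size in (128, 512):
      inference_gru_block_vs_gru_cell(
          batch_size=batch_size,
          cell_size=cell_size,
          input_size=128,
          time_steps=50,
          use_gpu=False,
          iters=30)
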
def get_pretrain_enc_cell(self):
  cell = gru_ops.GRUBlockCell(1024)
  if self.is_training:
    cell = core_rnn_cell.DropoutWrapper(cell, 0.5, 0.5)
  cell = core_rnn_cell.InputProjectionWrapper(cell, 1024)
  cell = core_rnn_cell.OutputProjectionWrapper(cell, 1024)
  cell = core_rnn_cell.DeviceWrapper(cell, device='/gpu:0')
  return cell

def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 is_training=True,
                 dense_labels=None,
                 **unused_params):
  # output_ranges = 9 + tf.range(0, 300, 10)
  # second_inputs = sample_sequence(model_input, output_ranges, 20)
  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  first_layer_outputs = []
  num_splits = 6
  with tf.variable_scope("EncLayer0"):
    cell = gru_ops.GRUBlockCell(1024)
    for i in xrange(num_splits):
      frames = SampleRandomSequence(model_input, num_frames, 50)
      if i > 0:
        tf.get_variable_scope().reuse_variables()
      enc_outputs, enc_state = tf.nn.dynamic_rnn(
          cell, frames, dtype=tf.float32, scope="enc0")
      enc_state = moe_layer(enc_state, 1024, 4,
                            act_func=None, l2_penalty=1e-12)
      if is_training:
        enc_state = tf.nn.dropout(enc_state, 0.5)
      first_layer_outputs.append(enc_state)
  with tf.variable_scope("EncLayer1"):
    cell = gru_ops.GRUBlockCell(1024)
    first_layer_outputs = tf.stack(first_layer_outputs, axis=1)
    enc_outputs, enc_state = tf.nn.dynamic_rnn(
        cell, first_layer_outputs, dtype=tf.float32, scope="enc1")
  # flatten_outputs = attn_new.attn(enc_outputs, fea_size=1024, seq_len=num_splits)
  flatten_outputs = tf.reduce_mean(enc_outputs, axis=1)
  with tf.variable_scope("FC0"):
    flatten_outputs = slim.fully_connected(
        flatten_outputs, 1024, activation_fn=tf.nn.relu,
        weights_regularizer=slim.l2_regularizer(1e-8), scope="fc0")
    # flatten_outputs = moe_layer(flatten_outputs, 1024, 2,
    #                             act_func=tf.nn.relu, l2_penalty=1e-8)
    if is_training:
      flatten_outputs = tf.nn.dropout(flatten_outputs, 0.5)
  with tf.variable_scope("FC1"):
    logits = moe_layer(flatten_outputs, vocab_size, 2,
                       act_func=tf.nn.sigmoid, l2_penalty=1e-8)
  logits = tf.clip_by_value(logits, 0., 1.)
  return {"predictions": logits}

def testNoneDimsWithDynamicRNN(self):
  with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
    batch_size = 4
    cell_size = 5
    input_size = 6
    num_steps = 7

    cell = gru_ops.GRUBlockCell(cell_size)

    x = array_ops.placeholder(dtypes.float32, shape=(None, None, input_size))
    _, output = rnn.dynamic_rnn(
        cell, x, time_major=True, dtype=dtypes.float32)
    sess.run(variables.global_variables_initializer())

    feed = {}
    feed[x] = np.random.randn(num_steps, batch_size, input_size)
    sess.run(output, feed)

def single_bprop_step_gru_block_vs_gru_cell(batch_size, cell_size, input_size, use_gpu=False, iters=30): """Benchmark single bprop step speed between GRUBlockCell vs GRUCell.""" ops.reset_default_graph() with session.Session(graph=ops.Graph()) as sess: with benchmarking.device(use_gpu): initializer = init_ops.random_uniform_initializer(-1, 1, seed=1989) # Inputs x = vs.get_variable("x", [batch_size, input_size]) h = vs.get_variable("h", [batch_size, cell_size]) # Output from the basic GRU cell implementation. with vs.variable_scope("basic", initializer=initializer): output = rnn_cell.GRUCell(cell_size)(array_ops.identity(x), array_ops.identity(h)) sess.run([variables.global_variables_initializer()]) grad_output_wrt_input = gradients_impl.gradients([output], h) basic_time_bprop = benchmarking.seconds_per_run( grad_output_wrt_input, sess, iters) # Output from the block GRU cell implementation. with vs.variable_scope("block", initializer=initializer): output = gru_ops.GRUBlockCell(cell_size)(array_ops.identity(x), array_ops.identity(h)) sess.run([variables.global_variables_initializer()]) grad_output_wrt_input = gradients_impl.gradients([output], h) block_time_bprop = benchmarking.seconds_per_run( grad_output_wrt_input, sess, iters) performance_inference = (basic_time_bprop - block_time_bprop) * 100 / basic_time_bprop print(",".join([ str(batch_size), str(cell_size), str(input_size), str(use_gpu), str(basic_time_bprop), str(block_time_bprop), str(performance_inference) ])) return basic_time_bprop, block_time_bprop
def single_bprop_step_gru_block_vs_gru_cell(batch_size, cell_size, input_size, use_gpu=False, iters=30): """Benchmark single bprop step speed between GRUBlockCell vs GRUCell.""" tf.reset_default_graph() with tf.Session(graph=tf.Graph()) as sess: with tf.device("/cpu:0" if not use_gpu else "/gpu:0"): initializer = tf.random_uniform_initializer(-1, 1, seed=1989) # Inputs x = vs.get_variable("x", [batch_size, input_size]) h = vs.get_variable("h", [batch_size, cell_size]) # Output from the basic GRU cell implementation. with tf.variable_scope("basic", initializer=initializer): output = tf.nn.rnn_cell.GRUCell(cell_size)(tf.identity(x), tf.identity(h)) sess.run([tf.initialize_all_variables()]) grad_output_wrt_input = tf.gradients([output], h) basic_time_bprop = time_taken_by_op(grad_output_wrt_input, sess, iters) # Output from the block GRU cell implementation. with tf.variable_scope("block", initializer=initializer): output = gru_ops.GRUBlockCell(cell_size)(tf.identity(x), tf.identity(h)) sess.run([tf.initialize_all_variables()]) grad_output_wrt_input = tf.gradients([output], h) block_time_bprop = time_taken_by_op(grad_output_wrt_input, sess, iters) performance_inference = (basic_time_bprop - block_time_bprop) * 100 / basic_time_bprop print(",".join([ str(batch_size), str(cell_size), str(input_size), str(use_gpu), str(basic_time_bprop), str(block_time_bprop), str(performance_inference) ])) return basic_time_bprop, block_time_bprop
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 is_training=True,
                 dense_labels=None,
                 **unused_params):
  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  runtime_batch_size = tf.shape(model_input)[0]
  initial_state = tf.zeros((runtime_batch_size, self.cell_size),
                           dtype=tf.float32)
  with tf.variable_scope("EncLayer0"):
    enc_cell = gru_ops.GRUBlockCell(1024)
    enc_outputs, enc_state = tf.nn.dynamic_rnn(
        enc_cell, model_input, dtype=tf.float32, scope="enc0")
  with tf.variable_scope("EncLayer1"):
    enc_cell = self.get_enc_cell(self.cell_size, vocab_size)
    enc_outputs, enc_state = dynamic_rnn.dynamic_rnn(
        enc_cell, enc_outputs, initial_state=initial_state, scope="enc1")
  if is_training:
    enc_state = tf.nn.dropout(enc_state, 0.8)
  enc_state = slim.fully_connected(
      enc_state, 1024, activation_fn=None, biases_initializer=None,
      weights_regularizer=slim.l2_regularizer(1e-8), scope="outputLayers")
  if is_training:
    enc_state = tf.nn.dropout(enc_state, 0.8)
  logits = moe_layer(enc_state, vocab_size, 2,
                     act_func=tf.nn.sigmoid, l2_penalty=1e-8)
  return {"predictions": logits}

def testGradient(self):
  with self.test_session(use_gpu=self._use_gpu, graph=tf.Graph()) as sess:
    batch_size = 1
    cell_size = 3
    input_size = 2

    # Inputs
    x = tf.zeros([batch_size, input_size])
    h = tf.zeros([batch_size, cell_size])

    output = gru_ops.GRUBlockCell(cell_size)(x, h)
    sess.run([tf.initialize_all_variables()])

    all_variables = tf.all_variables()
    [w_ru, b_ru, w_c, b_c] = all_variables[:4]

    error_x = tf.test.compute_gradient_error(
        x, (batch_size, input_size), output[0], (batch_size, cell_size))
    error_h = tf.test.compute_gradient_error(
        h, (batch_size, cell_size), output[0], (batch_size, cell_size))
    error_w_ru = tf.test.compute_gradient_error(
        w_ru, (input_size + cell_size, 2 * cell_size), output[0],
        (batch_size, cell_size))
    error_w_c = tf.test.compute_gradient_error(
        w_c, (input_size + cell_size, cell_size), output[0],
        (batch_size, cell_size))
    error_b_ru = tf.test.compute_gradient_error(
        b_ru, (2 * cell_size,), output[0], (batch_size, cell_size))
    error_b_c = tf.test.compute_gradient_error(
        b_c, (cell_size,), output[0], (batch_size, cell_size))

    eps = 1e-4
    self.assertLess(error_x, eps)
    self.assertLess(error_h, eps)
    self.assertLess(error_w_ru, eps)
    self.assertLess(error_w_c, eps)
    self.assertLess(error_b_ru, eps)
    self.assertLess(error_b_c, eps)

def testGradient(self):
  with self.session(use_gpu=True, graph=ops.Graph()) as sess:
    batch_size = 1
    cell_size = 3
    input_size = 2

    # Inputs
    x = array_ops.zeros([batch_size, input_size])
    h = array_ops.zeros([batch_size, cell_size])

    output = gru_ops.GRUBlockCell(cell_size)(x, h)
    sess.run([variables.global_variables_initializer()])

    all_variables = variables.global_variables()
    [w_ru, b_ru, w_c, b_c] = all_variables[:4]

    error_x = gradient_checker.compute_gradient_error(
        x, (batch_size, input_size), output[0], (batch_size, cell_size))
    error_h = gradient_checker.compute_gradient_error(
        h, (batch_size, cell_size), output[0], (batch_size, cell_size))
    error_w_ru = gradient_checker.compute_gradient_error(
        w_ru, (input_size + cell_size, 2 * cell_size), output[0],
        (batch_size, cell_size))
    error_w_c = gradient_checker.compute_gradient_error(
        w_c, (input_size + cell_size, cell_size), output[0],
        (batch_size, cell_size))
    error_b_ru = gradient_checker.compute_gradient_error(
        b_ru, (2 * cell_size,), output[0], (batch_size, cell_size))
    error_b_c = gradient_checker.compute_gradient_error(
        b_c, (cell_size,), output[0], (batch_size, cell_size))

    eps = 1e-4
    self.assertLess(error_x, eps)
    self.assertLess(error_h, eps)
    self.assertLess(error_w_ru, eps)
    self.assertLess(error_w_c, eps)
    self.assertLess(error_b_ru, eps)
    self.assertLess(error_b_c, eps)

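# A minimal NumPy sketch (my own reference, not taken from the tests above) of
# the single-step GRU computation that GRUBlockCell fuses into one op. The
# parameters w_ru, b_ru, w_c, b_c correspond to the four variables the
# gradient tests unpack, with shapes (input_size + cell_size, 2 * cell_size),
# (2 * cell_size,), (input_size + cell_size, cell_size), and (cell_size,).
import numpy as np

def sigmoid(z):
  return 1.0 / (1.0 + np.exp(-z))

def gru_block_step(x, h, w_ru, b_ru, w_c, b_c):
  # Reset and update gates from the concatenated input and previous state.
  xh = np.concatenate([x, h], axis=1)
  r, u = np.split(sigmoid(xh.dot(w_ru) + b_ru), 2, axis=1)
  # Candidate state uses the reset-gated previous state.
  xrh = np.concatenate([x, r * h], axis=1)
  c = np.tanh(xrh.dot(w_c) + b_c)
  # New state is a per-unit convex combination of old state and candidate.
  return u * h + (1.0 - u) * c
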
def training_gru_block_vs_gru_cell(batch_size,
                                   cell_size,
                                   input_size,
                                   time_steps,
                                   use_gpu=False,
                                   iters=30):
  """Benchmark training speed between GRUBlockCell vs GRUCell."""
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    # Specify the device to use.
    with ops.device("/cpu:0" if not use_gpu else "/gpu:0"):
      # Random initializers.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
      np.random.seed(seed)

      # Inputs
      concat_x = vs.get_variable("concat_x",
                                 [time_steps, batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])
      y = vs.get_variable("y", [time_steps, batch_size, cell_size])

      # Output from the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        cell = core_rnn_cell_impl.GRUCell(cell_size)
        outputs_dynamic, _ = rnn.dynamic_rnn(
            cell, inputs=concat_x, initial_state=h, time_major=True,
            dtype=dtypes.float32)
        sess.run([variables.global_variables_initializer()])
        cost = math_ops.reduce_mean(math_ops.square(outputs_dynamic - y))
        learning_rate = 0.01
        optimizer = gradient_descent.GradientDescentOptimizer(
            learning_rate).minimize(cost)

        # Time for a training step.
        basic_time_training = time_taken_by_op(optimizer, sess, iters)

      # Output from the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        cell = gru_ops.GRUBlockCell(cell_size)
        outputs_dynamic, _ = rnn.dynamic_rnn(
            cell, inputs=concat_x, initial_state=h, time_major=True,
            dtype=dtypes.float32)
        sess.run([variables.global_variables_initializer()])
        cost = math_ops.reduce_mean(math_ops.square(outputs_dynamic - y))
        learning_rate = 0.01
        optimizer = gradient_descent.GradientDescentOptimizer(
            learning_rate).minimize(cost)

        # Time for a training step.
        block_time_training = time_taken_by_op(optimizer, sess, iters)

    performance_training = (
        basic_time_training - block_time_training) * 100 / basic_time_training
    print(",".join([
        str(batch_size), str(cell_size), str(input_size), str(time_steps),
        str(use_gpu), str(basic_time_training), str(block_time_training),
        str(performance_training)
    ]))

  return basic_time_training, block_time_training

def testDerivativeOfBlockGRUToGRUCellMultiSteps(self):
  batch_size = 2
  cell_size = 3
  input_size = 4
  time_steps = 2
  with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
    # Random initializers.
    seed = 1994
    initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
    np.random.seed(seed)

    # Inputs
    concat_x = array_ops.placeholder(
        dtypes.float32, shape=(time_steps, batch_size, input_size))
    h = array_ops.zeros([batch_size, cell_size])

    # Values for the inputs.
    x_values = np.random.rand(time_steps, batch_size, input_size)
    h_value = np.random.rand(batch_size, cell_size)
    feeds = {concat_x: x_values, h: h_value}

    # Gradients from the block GRU cell implementation.
    with vs.variable_scope("block", initializer=initializer):
      cell = gru_ops.GRUBlockCell(cell_size)

      outputs_dynamic, _ = rnn.dynamic_rnn(
          cell, inputs=concat_x, initial_state=h, time_major=True,
          dtype=dtypes.float32)
      grad_output_wrt_x = gradients_impl.gradients([outputs_dynamic[0]],
                                                   concat_x)
      grad_output_wrt_h = gradients_impl.gradients([outputs_dynamic[0]], h)

      sess.run([variables.global_variables_initializer()])
      block_grad_res_x, block_grad_res_h = sess.run(
          [grad_output_wrt_x, grad_output_wrt_h], feeds)

    # Gradients from the basic GRU cell implementation.
    with vs.variable_scope("basic", initializer=initializer):
      cell = core_rnn_cell_impl.GRUCell(cell_size)

      outputs_dynamic, _ = rnn.dynamic_rnn(
          cell, inputs=concat_x, initial_state=h, time_major=True,
          dtype=dtypes.float32)
      grad_output_wrt_x = gradients_impl.gradients([outputs_dynamic[0]],
                                                   concat_x)
      grad_output_wrt_h = gradients_impl.gradients([outputs_dynamic[0]], h)

      sess.run([variables.global_variables_initializer()])
      basic_grad_res_x, basic_grad_res_h = sess.run(
          [grad_output_wrt_x, grad_output_wrt_h], feeds)

    # Check the derivatives of the outputs w.r.t. x.
    self.assertEqual(len(block_grad_res_x), len(basic_grad_res_x))
    for block, basic in zip(block_grad_res_x, basic_grad_res_x):
      self.assertAllClose(block, basic)

    # Check the derivatives of the outputs w.r.t. h.
    self.assertEqual(len(block_grad_res_h), len(basic_grad_res_h))
    for block, basic in zip(block_grad_res_h, basic_grad_res_h):
      self.assertAllClose(block, basic)

def testDerivativeOfBlockGRUToGRUCellSingleStep(self):
  with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
    batch_size = 2
    cell_size = 3
    input_size = 4

    seed = 1994
    initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
    np.random.seed(seed)

    # Inputs
    x = array_ops.zeros([batch_size, input_size])
    h = array_ops.zeros([batch_size, cell_size])

    # Values for the inputs.
    x_value = np.random.rand(batch_size, input_size)
    h_value = np.random.rand(batch_size, cell_size)

    # Gradients from the block GRU cell implementation.
    with vs.variable_scope("block", initializer=initializer):
      output = gru_ops.GRUBlockCell(cell_size)(x, h)
      sess.run([variables.global_variables_initializer()])

      all_variables = variables.global_variables()[0:4]
      [w_ru, b_ru, w_c, b_c] = all_variables

      d_new_h_wrt_x = gradients_impl.gradients([output], x)
      d_new_h_wrt_h = gradients_impl.gradients([output], h)
      d_new_h_wrt_w_ru = gradients_impl.gradients([output], w_ru)
      d_new_h_wrt_w_c = gradients_impl.gradients([output], w_c)
      d_new_h_wrt_b_ru = gradients_impl.gradients([output], b_ru)
      d_new_h_wrt_b_c = gradients_impl.gradients([output], b_c)

      d_block_res = sess.run([
          d_new_h_wrt_x, d_new_h_wrt_h, d_new_h_wrt_w_ru, d_new_h_wrt_w_c,
          d_new_h_wrt_b_ru, d_new_h_wrt_b_c
      ], {x: x_value, h: h_value})

    # Gradients from the basic GRU cell implementation.
    with vs.variable_scope("basic", initializer=initializer):
      output = core_rnn_cell_impl.GRUCell(cell_size)(x, h)
      sess.run([variables.global_variables_initializer()])

      all_variables = variables.global_variables()[4:8]
      [w_ru, b_ru, w_c, b_c] = all_variables

      d_new_h_wrt_x = gradients_impl.gradients([output], x)
      d_new_h_wrt_h = gradients_impl.gradients([output], h)
      d_new_h_wrt_w_ru = gradients_impl.gradients([output], w_ru)
      d_new_h_wrt_w_c = gradients_impl.gradients([output], w_c)
      d_new_h_wrt_b_ru = gradients_impl.gradients([output], b_ru)
      d_new_h_wrt_b_c = gradients_impl.gradients([output], b_c)

      d_basic_res = sess.run([
          d_new_h_wrt_x, d_new_h_wrt_h, d_new_h_wrt_w_ru, d_new_h_wrt_w_c,
          d_new_h_wrt_b_ru, d_new_h_wrt_b_c
      ], {x: x_value, h: h_value})

    # Check lengths of derivative results.
    self.assertEqual(len(d_block_res), len(d_basic_res))
    # Check the value of every derivative result.
    for block, basic in zip(d_block_res, d_basic_res):
      self.assertAllClose(block, basic)

def get_enc_cell(self, cell_size, vocab_size):
  cell = gru_ops.GRUBlockCell(cell_size)
  return cell

def get_enc_cell1(self, cell_size):
  cell = gru_ops.GRUBlockCell(cell_size)
  cell = core_rnn_cell.OutputProjectionWrapper(cell, 1024)
  return cell

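# Illustrative wiring only: plugging the wrapped cell from get_enc_cell1 into
# tf.nn.dynamic_rnn. The instance name `model` and the input feature size are
# assumptions for the sketch, not taken from the snippets above.
inputs = tf.placeholder(tf.float32, shape=(None, None, 1152))
cell = model.get_enc_cell1(cell_size=1024)
outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32, scope="enc")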