Esempio n. 1
0
def h_gru(model_input, vocab_size, is_training=True):
    """Build a two-level GRU encoder over `model_input` and return its scores.

    The input is cut into 15 equal pieces along axis 1. One shared GRU
    ("EncLayer0/enc0") reads the pieces in order; the state entering each
    piece is the previous piece's post-processed state wrapped in
    `tf.stop_gradient` (zeros for the first piece). Each piece's final state
    is passed through `moe_layer` (and dropout when training), and the 15
    results are stacked along axis 1 to form the input sequence of a second
    GRU ("EncLayer1/enc1").

    Args:
        model_input: Input tensor. Axis 0 is used as the batch size and axis 1
            is split 15 ways, so its size there must be divisible by 15.
        vocab_size: Output size requested from the final `moe_layer`.
        is_training: Python bool; the dropout ops are only added when true.

    Returns:
        The tensor produced by the final `moe_layer` call.
    """
    hidden_units = 1024
    num_splits = 15

    with tf.variable_scope("EncLayer0"):
        first_enc_cell = core_rnn_cell.DeviceWrapper(
            gru_ops.GRUBlockCell(hidden_units), device='/gpu:1')
        batch = tf.shape(model_input)[0]
        # State handed to the next chunk; starts out as all zeros.
        carried_state = tf.zeros((batch, hidden_units), dtype=tf.float32)
        chunks = tf.split(model_input,
                          num_or_size_splits=num_splits,
                          axis=1)
        chunk_summaries = []
        for i in xrange(num_splits):
            if i > 0:
                # Every chunk after the first shares the variables that the
                # first chunk created in this scope.
                tf.get_variable_scope().reuse_variables()
            _, chunk_state = tf.nn.dynamic_rnn(
                first_enc_cell,
                chunks[i],
                initial_state=tf.stop_gradient(carried_state),
                scope="enc0")
            # TODO
            carried_state = moe_layer(chunk_state,
                                      hidden_units,
                                      4,
                                      act_func=None,
                                      l2_penalty=1e-12)
            if is_training:
                carried_state = tf.nn.dropout(carried_state, 0.5)
            chunk_summaries.append(carried_state)

    with tf.variable_scope("EncLayer1"):
        second_enc_cell = core_rnn_cell.DeviceWrapper(
            gru_ops.GRUBlockCell(hidden_units), device='/gpu:1')
        summary_sequence = tf.stack(chunk_summaries, axis=1)
        _, final_state = tf.nn.dynamic_rnn(second_enc_cell,
                                           summary_sequence,
                                           dtype=tf.float32,
                                           scope="enc1")
    # TODO
    if is_training:
        final_state = tf.nn.dropout(final_state, 0.8)
    return moe_layer(final_state,
                     vocab_size,
                     2,
                     act_func=tf.nn.sigmoid,
                     l2_penalty=1e-8)
Esempio n. 2
0
    def do_job(self):
        """Build the two-level GRU model graph and return its predictions.

        A shared GRU ("EncLayer0/enc0") is run 15 times over `frames`; each
        final state goes through `moe_layer` (plus dropout when training) and
        the results are stacked along axis 1 as the input of a second GRU
        ("EncLayer1/enc1"). The second GRU's outputs are averaged over axis 1
        and fed through two `moe_layer` heads ("FC0", "FC1").

        NOTE(review): `model_input`, `num_frames`, `vocab_size`, `frames` and
        `is_training` are not defined inside this method; they are presumably
        provided by an enclosing scope -- confirm against the surrounding
        code.

        Returns:
            A dict with the single key "predictions" holding the output of the
            last `moe_layer`, clipped to [0, 1].
        """
        first_layer_outputs = []
        num_splits = 15
        # NOTE(review): neither `context_frames` nor the projected cell built
        # on the next two lines is used again below; they are kept so that
        # graph construction stays exactly as it was.
        context_frames = SampleRandomSequence(model_input, num_frames, 50)
        cell = gru_ops.GRUBlockCell(1024)
        cell = core_rnn_cell.OutputProjectionWrapper(cell, vocab_size)

        with tf.variable_scope("EncLayer0"):
            cell = gru_ops.GRUBlockCell(1024)
            for i in xrange(num_splits):
                if i > 0:
                    # Later passes share the variables created by the first.
                    tf.get_variable_scope().reuse_variables()
                # No initial_state is supplied, so dynamic_rnn needs an
                # explicit dtype to build its zero state.
                enc_outputs, enc_state = tf.nn.dynamic_rnn(cell,
                                                           frames,
                                                           dtype=tf.float32,
                                                           scope="enc0")
                enc_state = moe_layer(enc_state,
                                      1024,
                                      4,
                                      act_func=None,
                                      l2_penalty=1e-12)
                if is_training:
                    enc_state = tf.nn.dropout(enc_state, 0.5)
                first_layer_outputs.append(enc_state)

        with tf.variable_scope("EncLayer1"):
            cell = gru_ops.GRUBlockCell(1024)
            first_layer_outputs = tf.stack(first_layer_outputs, axis=1)
            # Same requirement as above: dtype stands in for initial_state.
            enc_outputs, enc_state = tf.nn.dynamic_rnn(cell,
                                                       first_layer_outputs,
                                                       dtype=tf.float32,
                                                       scope="enc1")

        # Average the second-layer outputs over axis 1.
        flatten_outputs = tf.reduce_mean(enc_outputs, axis=1)

        with tf.variable_scope("FC0"):
            flatten_outputs = moe_layer(flatten_outputs,
                                        1024,
                                        2,
                                        act_func=tf.nn.relu,
                                        l2_penalty=1e-8)
        if is_training:
            flatten_outputs = tf.nn.dropout(flatten_outputs, 0.5)
        with tf.variable_scope("FC1"):
            logits = moe_layer(flatten_outputs,
                               vocab_size,
                               2,
                               act_func=tf.nn.sigmoid,
                               l2_penalty=1e-8)
        logits = tf.clip_by_value(logits, 0., 1.)
        return {"predictions": logits}
Esempio n. 3
0
  def get_enc_cell(self, cell_size, vocab_size):
    """Return a two-layer stacked GRU cell with a non-tuple state.

    The bottom layer is a GRU block cell wrapped in an output projection of
    size `cell_size` and then in a dropout wrapper created with
    `output_keep_prob=0.5`; the top layer is a plain GRU block cell.

    Args:
      cell_size: Number of units of both GRU layers and of the projection.
      vocab_size: Not used by this method.

    Returns:
      A `tf.contrib.rnn.MultiRNNCell` built with `state_is_tuple=False`.
    """
    bottom_layer = tf.contrib.rnn.DropoutWrapper(
        core_rnn_cell.OutputProjectionWrapper(
            gru_ops.GRUBlockCell(cell_size), cell_size),
        output_keep_prob=0.5)
    top_layer = gru_ops.GRUBlockCell(cell_size)
    return tf.contrib.rnn.MultiRNNCell(
        [bottom_layer, top_layer], state_is_tuple=False)
Esempio n. 4
0
    def testBlockGRUToGRUCellMultiStep(self):
        """Compare GRUBlockCell with GRUCell over a multi-step dynamic_rnn.

        Both cells are built under the same seeded initializer and run on the
        same fed inputs; their per-step outputs and their final states are
        asserted to be close.
        """
        with self.test_session(use_gpu=self._use_gpu,
                               graph=tf.Graph()) as sess:
            batch_size = 2
            cell_size = 3
            input_size = 3
            time_steps = 4

            # Random initializers.
            seed = 1994
            initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=seed)
            np.random.seed(seed)

            # Inputs
            concat_x = tf.placeholder(tf.float32,
                                      shape=(time_steps, batch_size,
                                             input_size))
            h = tf.zeros([batch_size, cell_size])

            # Values for the inputs (h is overridden through the feed dict).
            x_values = np.random.rand(time_steps, batch_size, input_size)
            h_value = np.random.rand(batch_size, cell_size)

            # Output from the block GRU cell implementation.
            with tf.variable_scope("block", initializer=initializer):
                cell = gru_ops.GRUBlockCell(cell_size)
                outputs_dynamic, state_dynamic = tf.nn.dynamic_rnn(
                    cell,
                    inputs=concat_x,
                    initial_state=h,
                    time_major=True,
                    dtype=tf.float32)
                feeds = {concat_x: x_values, h: h_value}
                sess.run([tf.initialize_all_variables()])
                block_res = sess.run([outputs_dynamic, state_dynamic], feeds)

            # Output from the basic GRU cell implementation.
            with tf.variable_scope("basic", initializer=initializer):
                cell = tf.nn.rnn_cell.GRUCell(cell_size)
                outputs_dynamic, state_dynamic = tf.nn.dynamic_rnn(
                    cell,
                    inputs=concat_x,
                    initial_state=h,
                    time_major=True,
                    dtype=tf.float32)
                feeds = {concat_x: x_values, h: h_value}
                sess.run([tf.initialize_all_variables()])
                basic_res = sess.run([outputs_dynamic, state_dynamic], feeds)

            # Check the lengths of the outputs_dynamic, and states.
            self.assertEqual(len(block_res), len(basic_res))
            self.assertEqual(len(block_res[0]), len(basic_res[0]))
            self.assertEqual(len(block_res[1]), len(basic_res[1]))

            # Check the outputs_dynamic values.
            for block_output, basic_output in zip(block_res[0], basic_res[0]):
                self.assertAllClose(block_output, basic_output)

            # Check the state_dynamic value against the basic cell's state;
            # comparing block_res[1] with itself could never fail.
            self.assertAllClose(block_res[1], basic_res[1])
Esempio n. 5
0
  def testBlockGRUToGRUCellSingleStep(self):
    """Check one GRUBlockCell step against one GRUCell step on equal inputs."""
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
      batch_size, cell_size, input_size = 4, 5, 6

      initializer = init_ops.random_uniform_initializer(
          -0.01, 0.01, seed=1994)

      # Graph inputs; the concrete values are substituted via the feed dict.
      x = array_ops.zeros([batch_size, input_size])
      h = array_ops.zeros([batch_size, cell_size])
      feed = {
          x: np.random.rand(batch_size, input_size),
          h: np.random.rand(batch_size, cell_size),
      }

      # Reference result from the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        basic_out = core_rnn_cell_impl.GRUCell(cell_size)(x, h)
        sess.run([variables.global_variables_initializer()])
        basic_res = sess.run([basic_out], feed)

      # Result from the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        block_out = gru_ops.GRUBlockCell(cell_size)(x, h)
        sess.run([variables.global_variables_initializer()])
        block_res = sess.run([block_out], feed)

      self.assertEqual(len(block_res), len(basic_res))
      for block, basic in zip(block_res, basic_res):
        self.assertAllClose(block, basic)
Esempio n. 6
0
 def get_enc_cell(self, cell_size):
     """Return a new `gru_ops.GRUBlockCell` with `cell_size` units."""
     return gru_ops.GRUBlockCell(cell_size)
Esempio n. 7
0
        def do_reconstruction(enc_inputs, enc_outputs, enc_last_state,
                              input_weights, seq_lengths):
            """Build an attention decoder that reconstructs the reversed input.

            Args:
                enc_inputs: Encoder input tensor. It is reversed per example
                    (batch on axis 0, sequence on axis 1) to form the target.
                enc_outputs: Encoder outputs, used as the attention memory.
                enc_last_state: Passed to the attention wrapper as its
                    initial cell state.
                input_weights: Weights multiplied into the loss before it is
                    summed over axis 1.
                seq_lengths: Per-example lengths, used as the attention
                    memory length, for the sequence reversal, as the training
                    helper's sequence length and to normalise the loss.

            Returns:
                A `(predictions, loss)` pair: `predictions` is a `tf.no_op()`
                and `loss` is the mean over the batch of the weighted,
                length-normalised `reconstruct_loss`.
            """
            attention = tf.contrib.seq2seq.BahdanauAttention(
                num_units=100,
                memory=enc_outputs,
                memory_sequence_length=seq_lengths,
                normalize=True,
                name='attention_mechanism')
            dec_cell = core_rnn_cell.DropoutWrapper(
                gru_ops.GRUBlockCell(1024), 0.5, 0.5)
            attn_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell=dec_cell,
                attention_mechanism=attention,
                attention_layer_size=1024,
                output_attention=False,
                initial_cell_state=enc_last_state,
                name="attention_wrapper")

            # Target is the input read backwards; the decoder input is the
            # target shifted right by one step with a zero frame in front.
            target = tf.reverse_sequence(enc_inputs,
                                         seq_lengths,
                                         seq_dim=1,
                                         batch_dim=0)
            shifted_target = tf.pad(target[:, :-1, :],
                                    [[0, 0], [1, 0], [0, 0]])

            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=shifted_target,
                sequence_length=seq_lengths,
                name="decoder_training_helper")

            start_state = attn_cell.zero_state(tf.shape(enc_inputs)[0],
                                               dtype=tf.float32)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=attn_cell,
                helper=helper,
                initial_state=start_state,
                output_layer=Dense(1024 + 128))
            dec_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder,
                swap_memory=True,
            )

            step_loss = reconstruct_loss(logit=dec_outputs.rnn_output,
                                         target=target)
            example_loss = tf.reduce_sum(
                step_loss * input_weights, axis=1) / tf.cast(
                    seq_lengths, tf.float32)
            loss = tf.reduce_mean(example_loss)
            return tf.no_op(), loss
Esempio n. 8
0
def inference_gru_block_vs_gru_cell(batch_size,
                                    cell_size,
                                    input_size,
                                    time_steps,
                                    use_gpu=False,
                                    iters=30):
    """Time the forward pass of GRUCell and of GRUBlockCell and report both.

    Each cell is run through a time-major `tf.nn.dynamic_rnn` on the same
    variables `concat_x` / `h`, and timed with `time_taken_by_op`. One
    comma-separated line is printed with the arguments, both timings and the
    relative improvement of the block cell in percent.

    Args:
        batch_size: Batch dimension of the inputs and of the initial state.
        cell_size: Number of units of both cells.
        input_size: Size of the last input dimension.
        time_steps: Size of the leading (time) input dimension.
        use_gpu: Places the graph on "/gpu:0" when true, else "/cpu:0".
        iters: Forwarded to `time_taken_by_op`.

    Returns:
        A `(basic_time_inference, block_time_inference)` tuple.
    """
    tf.reset_default_graph()
    with tf.Session(graph=tf.Graph()) as sess:
        device_name = "/gpu:0" if use_gpu else "/cpu:0"
        with tf.device(device_name):

            # Seeded initializer shared by both variable scopes.
            seed = 1994
            initializer = tf.random_uniform_initializer(-1, 1, seed=seed)
            np.random.seed(seed)

            # Inputs are variables, so no feed dict is needed when timing.
            concat_x = vs.get_variable("concat_x",
                                       [time_steps, batch_size, input_size])
            h = vs.get_variable("h", [batch_size, cell_size])

            # Basic GRU cell implementation.
            with tf.variable_scope("basic", initializer=initializer):
                basic_outputs, _ = tf.nn.dynamic_rnn(
                    tf.nn.rnn_cell.GRUCell(cell_size),
                    inputs=concat_x,
                    initial_state=h,
                    time_major=True,
                    dtype=tf.float32)
                sess.run([tf.initialize_all_variables()])
                basic_time_inference = time_taken_by_op(
                    basic_outputs, sess, iters)

            # Block GRU cell implementation.
            with tf.variable_scope("block", initializer=initializer):
                block_outputs, _ = tf.nn.dynamic_rnn(
                    gru_ops.GRUBlockCell(cell_size),
                    inputs=concat_x,
                    initial_state=h,
                    time_major=True,
                    dtype=tf.float32)
                sess.run([tf.initialize_all_variables()])
                block_time_inference = time_taken_by_op(
                    block_outputs, sess, iters)

        # Relative improvement of the block cell, in percent of basic time.
        performance_inference = (basic_time_inference - block_time_inference
                                 ) * 100 / basic_time_inference
        report_fields = (batch_size, cell_size, input_size, time_steps,
                         use_gpu, basic_time_inference, block_time_inference,
                         performance_inference)
        print(",".join([str(field) for field in report_fields]))

        return basic_time_inference, block_time_inference
Esempio n. 9
0
    def get_pretrain_enc_cell(self):
        """Return the wrapped GRU cell used for the pre-training encoder.

        A 1024-unit GRU block cell is wrapped, innermost first, in: a dropout
        wrapper with arguments (0.5, 0.5) when `self.is_training` is true, an
        input projection of size 1024, an output projection of size 1024, and
        a device wrapper pinning it to '/gpu:0'.

        Returns:
            The outermost `core_rnn_cell.DeviceWrapper`.
        """
        base_cell = gru_ops.GRUBlockCell(1024)
        if self.is_training:
            base_cell = core_rnn_cell.DropoutWrapper(base_cell, 0.5, 0.5)
        projected_cell = core_rnn_cell.OutputProjectionWrapper(
            core_rnn_cell.InputProjectionWrapper(base_cell, 1024), 1024)
        return core_rnn_cell.DeviceWrapper(projected_cell, device='/gpu:0')
Esempio n. 10
0
  def create_model(self, model_input, vocab_size, num_frames,
                   is_training=True, dense_labels=None, **unused_params):
    """Build a two-level GRU model over randomly sampled frame sequences.

    Six sequences are drawn with `SampleRandomSequence(model_input, ..., 50)`
    and each is read by the same GRU ("EncLayer0/enc0"). Every final state
    goes through `moe_layer` (plus dropout when training); the six results
    are stacked along axis 1 and read by a second GRU ("EncLayer1/enc1"),
    whose outputs are averaged over axis 1 and passed through a fully
    connected ReLU layer and a final `moe_layer`.

    Args:
      model_input: Input tensor handed to `SampleRandomSequence`.
      vocab_size: Output size requested from the final `moe_layer`.
      num_frames: Frame counts; expanded on axis 1 and cast to float32
        before being passed to `SampleRandomSequence`.
      is_training: Python bool; the dropout ops are only added when true.
      dense_labels: Not used by this method.
      **unused_params: Ignored.

    Returns:
      A dict with the single key "predictions" holding the final
      `moe_layer` output clipped to [0, 1].
    """
    frame_counts = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)

    num_splits = 6
    sample_states = []
    with tf.variable_scope("EncLayer0"):
      sample_cell = gru_ops.GRUBlockCell(1024)
      for i in xrange(num_splits):
        sampled_frames = SampleRandomSequence(model_input, frame_counts, 50)
        if i > 0:
          # Later samples share the variables created for the first one.
          tf.get_variable_scope().reuse_variables()
        _, sample_state = tf.nn.dynamic_rnn(
            sample_cell, sampled_frames, dtype=tf.float32, scope="enc0")
        sample_state = moe_layer(
            sample_state, 1024, 4, act_func=None, l2_penalty=1e-12)
        if is_training:
          sample_state = tf.nn.dropout(sample_state, 0.5)
        sample_states.append(sample_state)

    with tf.variable_scope("EncLayer1"):
      top_cell = gru_ops.GRUBlockCell(1024)
      stacked_states = tf.stack(sample_states, axis=1)
      top_outputs, _ = tf.nn.dynamic_rnn(
          top_cell, stacked_states, dtype=tf.float32, scope="enc1")

    # Average the second-layer outputs over axis 1.
    pooled = tf.reduce_mean(top_outputs, axis=1)

    with tf.variable_scope("FC0"):
      hidden = slim.fully_connected(
          pooled,
          1024,
          activation_fn=tf.nn.relu,
          weights_regularizer=slim.l2_regularizer(1e-8),
          scope="fc0")
    if is_training:
      hidden = tf.nn.dropout(hidden, 0.5)
    with tf.variable_scope("FC1"):
      scores = moe_layer(
          hidden, vocab_size, 2, act_func=tf.nn.sigmoid, l2_penalty=1e-8)
    return {"predictions": tf.clip_by_value(scores, 0., 1.)}
Esempio n. 11
0
  def testNoneDimsWithDynamicRNN(self):
    """Run GRUBlockCell in dynamic_rnn with the first two input dims unset."""
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
      batch_size, cell_size, input_size, num_steps = 4, 5, 6, 7

      cell = gru_ops.GRUBlockCell(cell_size)

      # Only the feature dimension is known statically.
      inputs = array_ops.placeholder(
          dtypes.float32, shape=(None, None, input_size))
      _, final_state = rnn.dynamic_rnn(
          cell, inputs, time_major=True, dtype=dtypes.float32)
      sess.run(variables.global_variables_initializer())
      # The test passes as long as evaluating the final state does not raise.
      sess.run(
          final_state,
          {inputs: np.random.randn(num_steps, batch_size, input_size)})
Esempio n. 12
0
def single_bprop_step_gru_block_vs_gru_cell(batch_size,
                                            cell_size,
                                            input_size,
                                            use_gpu=False,
                                            iters=30):
    """Benchmark one backward step of GRUCell against GRUBlockCell.

    For each cell a single step is built on identity copies of the variables
    `x` and `h`, and the gradient of its output with respect to `h` is timed
    with `benchmarking.seconds_per_run`. One comma-separated line is printed
    with the arguments, both timings and the relative improvement of the
    block cell in percent.

    Args:
        batch_size: Leading dimension of `x` and `h`.
        cell_size: Number of units of both cells.
        input_size: Second dimension of `x`.
        use_gpu: Forwarded to `benchmarking.device`.
        iters: Forwarded to `benchmarking.seconds_per_run`.

    Returns:
        A `(basic_time_bprop, block_time_bprop)` tuple.
    """
    ops.reset_default_graph()
    with session.Session(graph=ops.Graph()) as sess:
        with benchmarking.device(use_gpu):
            initializer = init_ops.random_uniform_initializer(-1, 1, seed=1989)
            # Inputs
            x = vs.get_variable("x", [batch_size, input_size])
            h = vs.get_variable("h", [batch_size, cell_size])

            # Basic GRU cell implementation.
            with vs.variable_scope("basic", initializer=initializer):
                basic_cell = rnn_cell.GRUCell(cell_size)
                basic_output = basic_cell(array_ops.identity(x),
                                          array_ops.identity(h))
                sess.run([variables.global_variables_initializer()])
                basic_grad = gradients_impl.gradients([basic_output], h)
                basic_time_bprop = benchmarking.seconds_per_run(
                    basic_grad, sess, iters)

            # Block GRU cell implementation.
            with vs.variable_scope("block", initializer=initializer):
                block_cell = gru_ops.GRUBlockCell(cell_size)
                block_output = block_cell(array_ops.identity(x),
                                          array_ops.identity(h))
                sess.run([variables.global_variables_initializer()])
                block_grad = gradients_impl.gradients([block_output], h)
                block_time_bprop = benchmarking.seconds_per_run(
                    block_grad, sess, iters)

    # Relative improvement of the block cell, in percent of the basic time.
    performance_inference = (basic_time_bprop -
                             block_time_bprop) * 100 / basic_time_bprop

    report_fields = (batch_size, cell_size, input_size, use_gpu,
                     basic_time_bprop, block_time_bprop,
                     performance_inference)
    print(",".join([str(field) for field in report_fields]))

    return basic_time_bprop, block_time_bprop
Esempio n. 13
0
def single_bprop_step_gru_block_vs_gru_cell(batch_size,
                                            cell_size,
                                            input_size,
                                            use_gpu=False,
                                            iters=30):
    """Benchmark one backward step of GRUCell against GRUBlockCell.

    For each cell a single step is built on identity copies of the variables
    `x` and `h`, and the gradient of its output with respect to `h` is timed
    with `time_taken_by_op`. One comma-separated line is printed with the
    arguments, both timings and the relative improvement of the block cell
    in percent.

    Args:
        batch_size: Leading dimension of `x` and `h`.
        cell_size: Number of units of both cells.
        input_size: Second dimension of `x`.
        use_gpu: Places the graph on "/gpu:0" when true, else "/cpu:0".
        iters: Forwarded to `time_taken_by_op`.

    Returns:
        A `(basic_time_bprop, block_time_bprop)` tuple.
    """
    tf.reset_default_graph()
    with tf.Session(graph=tf.Graph()) as sess:
        device_name = "/gpu:0" if use_gpu else "/cpu:0"
        with tf.device(device_name):
            initializer = tf.random_uniform_initializer(-1, 1, seed=1989)
            # Inputs
            x = vs.get_variable("x", [batch_size, input_size])
            h = vs.get_variable("h", [batch_size, cell_size])

            # Basic GRU cell implementation.
            with tf.variable_scope("basic", initializer=initializer):
                basic_cell = tf.nn.rnn_cell.GRUCell(cell_size)
                basic_output = basic_cell(tf.identity(x), tf.identity(h))
                sess.run([tf.initialize_all_variables()])
                basic_grad = tf.gradients([basic_output], h)
                basic_time_bprop = time_taken_by_op(basic_grad, sess, iters)

            # Block GRU cell implementation.
            with tf.variable_scope("block", initializer=initializer):
                block_cell = gru_ops.GRUBlockCell(cell_size)
                block_output = block_cell(tf.identity(x), tf.identity(h))
                sess.run([tf.initialize_all_variables()])
                block_grad = tf.gradients([block_output], h)
                block_time_bprop = time_taken_by_op(block_grad, sess, iters)

    # Relative improvement of the block cell, in percent of the basic time.
    performance_inference = (basic_time_bprop -
                             block_time_bprop) * 100 / basic_time_bprop

    report_fields = (batch_size, cell_size, input_size, use_gpu,
                     basic_time_bprop, block_time_bprop,
                     performance_inference)
    print(",".join([str(field) for field in report_fields]))

    return basic_time_bprop, block_time_bprop
Esempio n. 14
0
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     is_training=True,
                     dense_labels=None,
                     **unused_params):
        """Build a stacked GRU encoder and return its predictions.

        A 1024-unit GRU ("EncLayer0/enc0") reads `model_input`; its outputs
        are read by the cell from `self.get_enc_cell` ("EncLayer1/enc1"),
        starting from a zero state of width `self.cell_size`. The final state
        goes through dropout (training only), a bias-free linear layer of
        size 1024, dropout again (training only) and a final `moe_layer`.

        Args:
            model_input: Input tensor; axis 0 is used as the batch size.
            vocab_size: Output size requested from `moe_layer`; also
                forwarded to `self.get_enc_cell`.
            num_frames: Expanded on axis 1 and cast to float32; the result is
                not used afterwards.
            is_training: Python bool; the dropout ops are only added when
                true.
            dense_labels: Not used by this method.
            **unused_params: Ignored.

        Returns:
            A dict with the single key "predictions" holding the `moe_layer`
            output.
        """
        # The converted frame counts are not consumed below; the ops are kept
        # so the constructed graph is unchanged.
        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)

        batch = tf.shape(model_input)[0]
        zero_state = tf.zeros((batch, self.cell_size), dtype=tf.float32)

        with tf.variable_scope("EncLayer0"):
            lower_cell = gru_ops.GRUBlockCell(1024)
            lower_outputs, _ = tf.nn.dynamic_rnn(lower_cell,
                                                 model_input,
                                                 dtype=tf.float32,
                                                 scope="enc0")

        with tf.variable_scope("EncLayer1"):
            upper_cell = self.get_enc_cell(self.cell_size, vocab_size)
            _, final_state = dynamic_rnn.dynamic_rnn(
                upper_cell,
                lower_outputs,
                initial_state=zero_state,
                scope="enc1")

        if is_training:
            final_state = tf.nn.dropout(final_state, 0.8)
        # Linear projection: no activation and no bias term.
        projected = slim.fully_connected(
            final_state,
            1024,
            activation_fn=None,
            biases_initializer=None,
            weights_regularizer=slim.l2_regularizer(1e-8),
            scope="outputLayers")
        if is_training:
            projected = tf.nn.dropout(projected, 0.8)
        scores = moe_layer(projected,
                           vocab_size,
                           2,
                           act_func=tf.nn.sigmoid,
                           l2_penalty=1e-8)
        return {"predictions": scores}
Esempio n. 15
0
    def testGradient(self):
        """Check GRUBlockCell gradient errors for inputs, state and weights.

        `tf.test.compute_gradient_error` is evaluated for `output[0]` with
        respect to x, h and the first four variables in the graph (unpacked
        as w_ru, b_ru, w_c, b_c); every error must be below 1e-4.
        """
        with self.test_session(use_gpu=self._use_gpu,
                               graph=tf.Graph()) as sess:
            batch_size, cell_size, input_size = 1, 3, 2
            out_shape = (batch_size, cell_size)

            # Inputs
            x = tf.zeros([batch_size, input_size])
            h = tf.zeros([batch_size, cell_size])
            output = gru_ops.GRUBlockCell(cell_size)(x, h)

            sess.run([tf.initialize_all_variables()])

            w_ru, b_ru, w_c, b_c = tf.all_variables()[:4]

            # (tensor, its shape) pairs, in the order they are checked.
            checked = [
                (x, (batch_size, input_size)),
                (h, (batch_size, cell_size)),
                (w_ru, (input_size + cell_size, 2 * cell_size)),
                (w_c, (input_size + cell_size, cell_size)),
                (b_ru, (2 * cell_size, )),
                (b_c, (cell_size, )),
            ]
            errors = []
            for tensor, tensor_shape in checked:
                errors.append(
                    tf.test.compute_gradient_error(tensor, tensor_shape,
                                                   output[0], out_shape))

        eps = 1e-4
        for error in errors:
            self.assertLess(error, eps)
Esempio n. 16
0
  def testGradient(self):
    """Check GRUBlockCell gradient errors for inputs, state and weights.

    `gradient_checker.compute_gradient_error` is evaluated for `output[0]`
    with respect to x, h and the first four global variables (unpacked as
    w_ru, b_ru, w_c, b_c); every error must be below 1e-4.
    """
    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
      batch_size, cell_size, input_size = 1, 3, 2
      out_shape = (batch_size, cell_size)

      # Inputs
      x = array_ops.zeros([batch_size, input_size])
      h = array_ops.zeros([batch_size, cell_size])
      output = gru_ops.GRUBlockCell(cell_size)(x, h)

      sess.run([variables.global_variables_initializer()])

      w_ru, b_ru, w_c, b_c = variables.global_variables()[:4]

      def grad_error(tensor, tensor_shape):
        # Gradient error of the cell's first output w.r.t. `tensor`.
        return gradient_checker.compute_gradient_error(
            tensor, tensor_shape, output[0], out_shape)

      error_x = grad_error(x, (batch_size, input_size))
      error_h = grad_error(h, (batch_size, cell_size))
      error_w_ru = grad_error(
          w_ru, (input_size + cell_size, 2 * cell_size))
      error_w_c = grad_error(w_c, (input_size + cell_size, cell_size))
      error_b_ru = grad_error(b_ru, (2 * cell_size,))
      error_b_c = grad_error(b_c, (cell_size,))

    eps = 1e-4
    for error in (error_x, error_h, error_w_ru, error_w_c, error_b_ru,
                  error_b_c):
      self.assertLess(error, eps)
Esempio n. 17
0
def training_gru_block_vs_gru_cell(batch_size,
                                   cell_size,
                                   input_size,
                                   time_steps,
                                   use_gpu=False,
                                   iters=30):
  """Benchmark a training step of GRUCell against GRUBlockCell.

  For each cell type a `dynamic_rnn` is built over the same variables, a
  mean-squared-error cost against `y` is minimized with gradient descent
  (learning rate 0.01), and the resulting op is timed with
  `time_taken_by_op`. A comma-separated summary line is printed.

  Args:
    batch_size: batch dimension of the input, state and target variables.
    cell_size: size passed to both cell constructors.
    input_size: last dimension of the input variable.
    time_steps: leading (time-major) dimension of the input and target.
    use_gpu: if true the graph is placed on "/gpu:0", otherwise "/cpu:0".
    iters: forwarded to `time_taken_by_op`.

  Returns:
    A tuple `(basic_time_training, block_time_training)` holding the values
    returned by `time_taken_by_op` for GRUCell and GRUBlockCell respectively.
  """
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    # Pin every op below to the requested device.
    device_name = "/gpu:0" if use_gpu else "/cpu:0"
    with ops.device(device_name):

      # One seed for both the variable initializer and numpy.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
      np.random.seed(seed)

      # Input sequence, initial state and regression target, shared by both
      # cell implementations.
      concat_x = vs.get_variable("concat_x",
                                 [time_steps, batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])
      y = vs.get_variable("y", [time_steps, batch_size, cell_size])

      def _time_training_step(scope_name, cell_type):
        # Builds the RNN and its training op under `scope_name`, then times
        # that op.
        with vs.variable_scope(scope_name, initializer=initializer):
          rnn_outputs, _ = rnn.dynamic_rnn(
              cell_type(cell_size),
              inputs=concat_x,
              initial_state=h,
              time_major=True,
              dtype=dtypes.float32)
          sess.run([variables.global_variables_initializer()])
          loss = math_ops.reduce_mean(math_ops.square(rnn_outputs - y))
          train_op = gradient_descent.GradientDescentOptimizer(
              0.01).minimize(loss)
          return time_taken_by_op(train_op, sess, iters)

      # Basic GRU cell implementation.
      basic_time_training = _time_training_step(
          "basic", core_rnn_cell_impl.GRUCell)

      # Block GRU cell implementation.
      block_time_training = _time_training_step(
          "block", gru_ops.GRUBlockCell)

    # Difference between the two timings, as a percentage of the basic one.
    performance_training = (
        (basic_time_training - block_time_training) * 100 /
        basic_time_training)

    report = (batch_size, cell_size, input_size, time_steps, use_gpu,
              basic_time_training, block_time_training, performance_training)
    print(",".join(str(field) for field in report))

    return basic_time_training, block_time_training
Esempio n. 18
0
  def testDerivativeOfBlockGRUToGRUCellMultiSteps(self):
    """Compares multi-step input gradients of GRUBlockCell and GRUCell.

    Both cells are unrolled with `dynamic_rnn` over the same fed inputs. The
    gradients of `outputs[0]` with respect to the input sequence and to the
    initial state must have the same length and be all-close between the
    two implementations.
    """
    batch_size = 2
    cell_size = 3
    input_size = 4
    time_steps = 2
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
      # One seed for both the variable initializer and numpy.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
      np.random.seed(seed)

      # Graph inputs; `h` is a zeros tensor whose value is overridden through
      # the feed dict below.
      concat_x = array_ops.placeholder(
          dtypes.float32, shape=(time_steps, batch_size, input_size))
      h = array_ops.zeros([batch_size, cell_size])

      # Concrete values fed for the inputs.
      x_values = np.random.rand(time_steps, batch_size, input_size)
      h_value = np.random.rand(batch_size, cell_size)
      feeds = {concat_x: x_values, h: h_value}

      def _input_gradients(scope_name, cell_type):
        # Unrolls `cell_type` under `scope_name` and evaluates the gradients
        # of outputs[0] with respect to `concat_x` and `h`.
        with vs.variable_scope(scope_name, initializer=initializer):
          rnn_outputs, _ = rnn.dynamic_rnn(
              cell_type(cell_size),
              inputs=concat_x,
              initial_state=h,
              time_major=True,
              dtype=dtypes.float32)
          grad_wrt_x = gradients_impl.gradients([rnn_outputs[0]], concat_x)
          grad_wrt_h = gradients_impl.gradients([rnn_outputs[0]], h)

          sess.run([variables.global_variables_initializer()])
          return sess.run([grad_wrt_x, grad_wrt_h], feeds)

      # Gradients from the block GRU cell implementation.
      block_grad_res_x, block_grad_res_h = _input_gradients(
          "block", gru_ops.GRUBlockCell)

      # Gradients from the basic GRU cell implementation.
      basic_grad_res_x, basic_grad_res_h = _input_gradients(
          "basic", core_rnn_cell_impl.GRUCell)

    # Derivatives with respect to x are checked first, then those with
    # respect to h: equal count, then element-wise closeness.
    compared = (
        (block_grad_res_x, basic_grad_res_x),
        (block_grad_res_h, basic_grad_res_h),
    )
    for block_grads, basic_grads in compared:
      self.assertEqual(len(block_grads), len(basic_grads))
      for block, basic in zip(block_grads, basic_grads):
        self.assertAllClose(block, basic)
Esempio n. 19
0
  def testDerivativeOfBlockGRUToGRUCellSingleStep(self):
    """Compares single-step gradients of GRUBlockCell and GRUCell.

    Each cell is applied once to the same fed `x` and `h`. The gradients of
    the cell output with respect to `x`, `h` and four of the global
    variables (indices 0-3 for the block cell, 4-7 for the basic cell) must
    be all-close between the two implementations.
    """
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
      batch_size, cell_size, input_size = 2, 3, 4

      # One seed for both the variable initializer and numpy.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
      np.random.seed(seed)

      # Graph inputs; their values are overridden through the feed dict.
      x = array_ops.zeros([batch_size, input_size])
      h = array_ops.zeros([batch_size, cell_size])

      # Concrete values fed for the inputs.
      x_value = np.random.rand(batch_size, input_size)
      h_value = np.random.rand(batch_size, cell_size)
      feeds = {x: x_value, h: h_value}

      def _all_gradients(scope_name, cell_type, first_var):
        # Applies `cell_type` once under `scope_name` and evaluates the
        # gradients of its output with respect to x, h, w_ru, w_c, b_ru and
        # b_c, where the four parameters are the global variables starting
        # at index `first_var`.
        with vs.variable_scope(scope_name, initializer=initializer):
          output = cell_type(cell_size)(x, h)
          sess.run([variables.global_variables_initializer()])

          w_ru, b_ru, w_c, b_c = (
              variables.global_variables()[first_var:first_var + 4])

          grads = [
              gradients_impl.gradients([output], wrt)
              for wrt in (x, h, w_ru, w_c, b_ru, b_c)
          ]
          return sess.run(grads, feeds)

      # Gradients from the block GRU cell implementation.
      d_block_res = _all_gradients("block", gru_ops.GRUBlockCell, 0)

      # Gradients from the basic GRU cell implementation.
      d_basic_res = _all_gradients("basic", core_rnn_cell_impl.GRUCell, 4)

      # Same number of derivative results, and each pair must be close.
      self.assertEqual(len(d_block_res), len(d_basic_res))
      for block, basic in zip(d_block_res, d_basic_res):
        self.assertAllClose(block, basic)
Esempio n. 20
0
 def get_enc_cell(self, cell_size, vocab_size):
     """Return a new `gru_ops.GRUBlockCell` constructed with `cell_size`.

     `vocab_size` is accepted but not used by this method.
     """
     return gru_ops.GRUBlockCell(cell_size)
Esempio n. 21
0
 def get_enc_cell1(self, cell_size):
     """Return a GRUBlockCell wrapped in an OutputProjectionWrapper.

     The inner cell is built with `cell_size`; the wrapper is constructed
     with the fixed second argument 1024 (presumably the projected output
     size; confirm against the wrapper's signature).
     """
     gru = gru_ops.GRUBlockCell(cell_size)
     return core_rnn_cell.OutputProjectionWrapper(gru, 1024)