Exemple #1
0
 def testGetOrCreateGlobalStep(self):
   """Checks that the global step is one shared object across scopes.

   The step is looked up from inside a nested variable/name scope, from the
   enclosing variable scope and from the top level, both through
   py_utils.GetOrCreateGlobalStep() and tf.train.get_global_step(). Every
   lookup must return the very same object, and its name must carry no
   scope prefix.
   """
   with tf.variable_scope('s1'):
     with tf.name_scope('s2'):
       gs1 = py_utils.GetOrCreateGlobalStep()
       gs2 = tf.train.get_global_step()
     gs3 = py_utils.GetOrCreateGlobalStep()
     gs4 = tf.train.get_global_step()
   gs5 = py_utils.GetOrCreateGlobalStep()
   gs6 = tf.train.get_global_step()
   for gs in [gs2, gs3, gs4, gs5, gs6]:
     # assertIs reports both operands on failure, unlike assertTrue(a is b).
     self.assertIs(gs1, gs)
   self.assertEqual(gs1.name, 'global_step:0')
Exemple #2
0
    def _testGPipeTransformerStackTrainEncoderTransparentFProp(
            self, splits=1, num_micro_batches=1):
        """Compares transparent-stack encoder and decoder outputs to goldens.

        A GPipeTransformerStack with two encoder layers, two decoder layers
        and a single transparent output is built; the encoder-only output and
        the full FProp output are checked against recorded values.

        Args:
          splits: forwarded to self._TransformerParams.
          num_micro_batches: forwarded to self._TransformerParams.
        """
        # time = 2,
        batch_size = 4
        with self.session() as sess:
            stack_params = self._TransformerParams(
                num_encoder_layers=2,
                num_decoder_layers=2,
                splits=splits,
                num_micro_batches=num_micro_batches)
            stack_params.is_transparent = True
            stack_params.num_transparent_outputs = 1
            stack_params.transparent_merger_dropout_prob = 0.0
            stack = GPipeTransformerStack(stack_params)

            (inputs, paddings, tgt_inputs,
             tgt_paddings) = self._random_inputs(batch=batch_size)
            py_utils.GetOrCreateGlobalStep()
            tf.set_random_seed(1234)
            tf.global_variables_initializer().run()
            enc_tensor = stack.EncoderFPropDefaultTheta(inputs, paddings)
            dec_tensor = stack.FProp(stack.theta, inputs, paddings,
                                     tgt_inputs, tgt_paddings)
            # Fetched in two separate runs, encoder first.
            enc_out = sess.run(enc_tensor)
            dec_out = sess.run(dec_tensor)

            # Each golden row is repeated once per batch element.
            golden_enc = [
                [[-0.118476, 1.031626]] * batch_size,
                [[0.643884, -1.02581167]] * batch_size,
            ]
            golden_dec = [
                [[-2.8764534, 1.00808454]] * batch_size,
                [[1.02129495, -0.78406084]] * batch_size,
                [[1.02129495, -0.78406084]] * batch_size,
            ]
            self.assertAllClose(enc_out, golden_enc)
            self.assertAllClose(dec_out, golden_dec)
Exemple #3
0
    def _testGPipeTransformerStackTrainTransparentFProp(
            self, splits=1, num_micro_batches=1):
        """Checks a transparent stack that emits three encoder outputs.

        A GPipeTransformerStack with one encoder layer, three decoder layers
        and three transparent outputs is built. The three encoder outputs must
        agree with each other, and both the encoder and decoder outputs must
        match recorded values.

        Args:
          splits: forwarded to self._TransformerParams.
          num_micro_batches: forwarded to self._TransformerParams.
        """
        # time = 2,
        batch_size = 4
        with self.session() as sess:
            stack_params = self._TransformerParams(
                num_encoder_layers=1,
                num_decoder_layers=3,
                splits=splits,
                num_micro_batches=num_micro_batches)
            stack_params.is_transparent = True
            stack_params.num_transparent_outputs = 3
            stack_params.transparent_merger_dropout_prob = 0.0
            stack = GPipeTransformerStack(stack_params)

            (inputs, paddings, tgt_inputs,
             tgt_paddings) = self._random_inputs(batch=batch_size)
            py_utils.GetOrCreateGlobalStep()
            tf.set_random_seed(1234)
            tf.global_variables_initializer().run()
            enc_tensors = stack.EncoderFPropDefaultTheta(inputs, paddings)
            dec_tensor = stack.FProp(stack.theta, inputs, paddings,
                                     tgt_inputs, tgt_paddings)
            # Fetched in two separate runs, encoder first.
            enc_first, enc_second, enc_third = sess.run(enc_tensors)
            dec_out = sess.run(dec_tensor)

            # Each golden row is repeated once per batch element.
            golden_enc = [
                [[-0.27896273, 1.46589136]] * batch_size,
                [[1.03141928, -0.847896]] * batch_size,
            ]
            golden_dec = [
                [[2.926736, -4.090812]] * batch_size,
                [[-1.69508219, 1.75891459]] * batch_size,
                [[-1.6950829, 1.75891507]] * batch_size,
            ]
            self.assertAllClose(enc_first, enc_second)
            self.assertAllClose(enc_second, enc_third)
            self.assertAllClose(enc_first, golden_enc)
            self.assertAllClose(dec_out, golden_dec)
  def testTransformerAttentionLayerDeterministicDropout(self):
    """Checks TransformerAttentionLayer outputs with deterministic dropout.

    The layer uses a DeterministicDropoutLayer template for residual dropout
    (probability 0.1). Its context vectors and attention probabilities are
    printed and compared against recorded values.
    """
    # pylint: disable=bad-whitespace
    # pyformat: disable
    expected_ctx = np.array([
        [[-1.45762944,  1.5337404 ,  0.34037334, -0.97208667],
         [-1.35992002, -1.06530988,  1.53705895,  2.79370689]],
        [[ 0.00657134,  1.12030125, -1.32564592, -1.73569465],
         [-0.80793667, -0.10877949, -0.80295694,  2.25494242]],
        [[ 1.76956046, -0.50777751, -1.19745886, -1.46751583],
         [-1.79178905, -0.77374339,  1.31586027,  2.98173356]],
        [[-0.85498607, -0.37413225,  1.25707364, -0.50043333],
         [ 1.62276983,  0.50820369, -1.52967572, -2.02076197]],
        [[-0.66754031, -0.68657839, -0.51643699,  1.96581018],
         [-1.4816376 ,  0.89419198, -0.57226259,  1.90177512]]
    ], dtype=np.float32)
    expected_probs = np.array([
        [[ 0.21387868,  0.22080734,  0.        ,  0.        ,  0.56531399],
         [ 0.        ,  0.30584112,  0.24723588,  0.44692296,  0.        ]],
        [[ 0.25358215,  0.50932312,  0.        ,  0.        ,  0.23709476],
         [ 0.        ,  0.56834149,  0.2632803 ,  0.16837817,  0.        ]],
        [[ 0.38519409,  0.55454361,  0.        ,  0.        ,  0.06026226],
         [ 0.        ,  0.33708778,  0.21976741,  0.4431448 ,  0.        ]],
        [[ 0.27139962,  0.12790371,  0.        ,  0.        ,  0.60069668],
         [ 0.        ,  0.31849149,  0.28174096,  0.39976761,  0.        ]],
        [[ 0.16272782,  0.15781289,  0.        ,  0.        ,  0.67945927],
         [ 0.        ,  0.55003977,  0.26049581,  0.18946445,  0.        ]]
    ], dtype=np.float32)
    # pyformat: enable
    # pylint: enable=bad-whitespace

    depth = 4
    with self.session(use_gpu=True) as sess:
      # Needed to generate a seed pair.
      py_utils.ResetStepSeed()
      py_utils.GetOrCreateGlobalStep()

      atten_params = layers_with_attention.TransformerAttentionLayer.Params()
      atten_params.name = 'transformer_atten'
      atten_params.source_dim = depth
      atten_params.is_masked = False
      atten_params.num_attention_heads = 2
      atten_params.residual_dropout_tpl = (
          layers.DeterministicDropoutLayer.Params())
      atten_params.residual_dropout_prob = 0.1

      atten_layer = layers_with_attention.TransformerAttentionLayer(
          atten_params)

      layer_inputs = self._testTransformerAttentionLayerInputs(depth=depth)
      source_vecs, source_padding, _, _ = layer_inputs

      ctx, probs = atten_layer.FProp(atten_layer.theta, source_vecs,
                                     source_padding)

      tf.global_variables_initializer().run()
      actual_ctx, actual_probs = sess.run([ctx, probs])

      print(np.array_repr(actual_ctx))
      print(np.array_repr(actual_probs))
      self.assertAllClose(expected_ctx, actual_ctx, rtol=1e-05, atol=1e-05)
      self.assertAllClose(expected_probs, actual_probs, rtol=1e-05, atol=1e-05)
Exemple #5
0
 def _verify_timestep_counts(self, num_splits):
     """Checks gradient norm and accumulator values of the dummy pipeline CNN.

     Builds _BuildDummyPipelineCnn in a fresh graph with 8 micro batches,
     then verifies the gradient norm against a recorded value and that every
     accumulator equals the number of micro batches (or 1 when num_splits is
     not greater than 1). If the network returns auxiliary logits, their
     shape is checked as well.

     Args:
       num_splits: forwarded to _BuildDummyPipelineCnn.
     """
     num_micro_batches = 8
     batch_size = 16
     # Accumulator values should be equal to number of time steps in pipeline.
     expected_ts = num_micro_batches if num_splits > 1 else 1

     graph = tf.Graph()
     with graph.as_default():
         py_utils.GetOrCreateGlobalStep()
         tf.set_random_seed(1245)
         inputs = tf.random_uniform([batch_size, 8, 8, 1])
         net = _BuildDummyPipelineCnn(
             num_splits=num_splits, num_micro_batches=num_micro_batches)
         endpoints = net.FPropDefaultTheta(inputs)
         if not isinstance(endpoints, (list, tuple)):
             logits, aux_logits = endpoints, None
         else:
             logits, aux_logits = endpoints
         loss = tf.reduce_mean(logits)
         grads = tf.gradients(loss, tf.trainable_variables())
         grad_norm = tf.sqrt(py_utils.SumSquared(grads))
         ts = net.GetAccumulatorValues().Flatten()

     with self.session(graph=graph) as sess:
         sess.run(tf.global_variables_initializer())
         grad_norm_val, ts_vals = sess.run([grad_norm, ts])
         self.assertNear(grad_norm_val, 0.269997, err=1.0e-6)
         for ts_val in ts_vals:
             self.assertEqual(ts_val, expected_ts)
         if aux_logits is None:
             return
         aux_logit_tensor = sess.run(aux_logits)
         self.assertEqual(aux_logit_tensor.shape, (batch_size, 8, 8, 1))
Exemple #6
0
def GetOverWriteGlobalStep(graph=None):
    """Returns the overwrite global-step tensor, else the regular global step.

    Args:
      graph: graph whose _OVERWRITE_GLOBAL_STEP_COLLECTION is consulted; the
        default graph is used when this is falsy.

    Returns:
      The only entry of the collection when it holds exactly one item,
      otherwise the result of py_utils.GetOrCreateGlobalStep().
    """
    if not graph:
        graph = tf.get_default_graph()
    overrides = graph.get_collection_ref(_OVERWRITE_GLOBAL_STEP_COLLECTION)
    if len(overrides) != 1:
        # NOTE(review): the fallback does not receive `graph` — confirm that
        # py_utils.GetOrCreateGlobalStep() targets the intended graph.
        return py_utils.GetOrCreateGlobalStep()
    return overrides[0]
Exemple #7
0
  def _InputBatchFromCKPT(self):
    """Builds one batch of ids/labels from a tensor stored in a checkpoint.

    The tensor p.data is restored from checkpoint p.ckpt, trimmed to
    total_batches * p.batch_size elements and reshaped to
    [p.batch_size, total_batches]. Each step selects up to p.num_steps
    consecutive columns as `ids`; `labels` are the same columns shifted by
    one. The step index comes from a repeating dataset counter when p.eval is
    set, and from (global step - 1) otherwise.

    Returns:
      A NestedMap with:
        ids: the selected columns, shape [batch_size, end - start].
        labels: the columns shifted by one, zero-padded to the shape of ids.
        weights: 1.0 where a real label exists, 0.0 on padded positions.
        paddings: 1.0 - weights.
        word_count: batch_size times the number of real label columns.
        take_last_state: whether this is not the first chunk of a pass.
    """
    p = self.params

    @function.Defun()
    def ReadData():
      x, = io_ops.restore_v2(p.ckpt, [p.data], [''], [p.data_dtype])
      return x

    # Loads data and label into memory and keep it around.
    data, = py_x_ops.cached_call(f=ReadData, T=[p.data_dtype])

    b = p.batch_size
    total_length = p.data_shape[0]
    total_batches = total_length // b
    # Number of chunks of p.num_steps columns, counting a trailing partial
    # chunk as one more step.
    total_steps = total_batches // p.num_steps
    if total_batches % p.num_steps > 0:
      total_steps += 1

    if p.eval:
      dataset = tf.data.Dataset.range(total_steps).repeat()
      iterator = dataset.make_one_shot_iterator()
      global_step = iterator.get_next()
    else:
      global_step = py_utils.GetOrCreateGlobalStep() - 1

    batch_id = tf.to_int32(global_step % total_steps)

    data = data[:total_batches * b]
    data = tf.reshape(data, [b, total_batches])

    start = p.num_steps * batch_id
    end = tf.minimum(tf.to_int32(total_batches), start + p.num_steps)
    raw = tf.gather(
        data, tf.range(start, end, dtype=tf.int32), axis=1, name='ids')
    # Labels are the ids shifted by one column; the final chunk has one label
    # fewer because there is no column after the last one.
    label_end = tf.minimum(end + 1, tf.to_int32(total_batches))
    label = tf.gather(
        data,
        tf.range(start + 1, label_end, dtype=tf.int32),
        axis=1,
        name='labels')
    num_labels = label_end - start - 1
    raw = py_utils.PadOrTrimTo(raw, [b, end - start])

    ret = py_utils.NestedMap()
    ret.ids = raw
    ret.labels = py_utils.PadOrTrimTo(label, [b, end - start])
    # Weights cover exactly the real label columns. Using label_end - start
    # here would put weight 1 on the zero-padded label of the final chunk and
    # disagree with word_count below.
    ret.weights = py_utils.PadOrTrimTo(
        tf.ones([b, num_labels], dtype=tf.float32), [b, end - start])
    ret.paddings = 1.0 - ret.weights
    ret.word_count = b * num_labels
    ret.take_last_state = batch_id > 0

    return ret
Exemple #8
0
  def __init__(self, params):
    """Sets up the global step and, when enabled, the EMA helper.

    Args:
      params: model params; params.cls must be a subclass of BaseModel.
    """
    assert issubclass(params.cls, BaseModel)
    super(BaseModel, self).__init__(params)
    self._global_step = py_utils.GetOrCreateGlobalStep()
    # tasks are not yet instantiated.
    self._total_examples_sum = None

    # An exponential moving average is only created for a decay in (0, 1).
    self._ema = None
    train_params = self.params.train
    tf.logging.info('Training parameters for %s: %s', params.cls, train_params)
    if train_params.ema_decay > 0:
      assert train_params.ema_decay < 1.0
      self._ema = tf.train.ExponentialMovingAverage(
          num_updates=self._global_step, decay=train_params.ema_decay)
Exemple #9
0
    def testDecoderFPropDeterministicAttentionDropout(self):
        """Checks deterministic attention dropout over two global steps."""
        accumulator_names = [
            'decoder_1/accumulated_global_steps:0',
            'decoder_1/accumulated_time_steps:0',
        ]
        with self.session(use_gpu=False) as sess:
            tf.set_random_seed(8372749040)
            vn_params = py_utils.VariationalNoiseParams(
                None, True, False, seed=1792)
            p = self._DecoderParams(vn_params)

            p.use_while_loop_based_unrolling = False
            p.attention.atten_dropout_prob = 0.5
            p.attention.atten_dropout_deterministic = True

            loss, per_sequence_loss = self._testDecoderFPropHelper(params=p)
            global_step = py_utils.GetOrCreateGlobalStep()
            tf.global_variables_initializer().run()
            fetches = [loss, per_sequence_loss] + accumulator_names

            def _RunAndReport():
                """Fetches losses and accumulators, printing the losses."""
                values = sess.run(fetches)
                print('loss = ', values[0], 'per sequence loss = ',
                      values[1])
                return values

            (loss_val, per_sequence_loss_val, global_steps_val,
             time_steps_val) = _RunAndReport()
            self.assertAllClose([3.473008, 15.0], loss_val)
            self.assertAllClose([13.563036, 10.053869, 10.362661, 18.115553],
                                per_sequence_loss_val)
            self.assertAllEqual([0, 0, 0, 0, 0], global_steps_val)
            self.assertAllEqual([1, 2, 3, 4, 5], time_steps_val)

            # Run another step to test global_step and time_step are incremented
            # correctly.
            sess.run(tf.assign_add(global_step, 1))
            (loss_val, per_sequence_loss_val, global_steps_val,
             time_steps_val) = _RunAndReport()
            self.assertAllClose([3.567736, 15.0], loss_val)
            self.assertAllClose([14.730419, 10.176270, 10.73501, 17.87434578],
                                per_sequence_loss_val)
            self.assertAllEqual([1, 1, 1, 1, 1], global_steps_val)
            self.assertAllEqual([1, 2, 3, 4, 5], time_steps_val)
Exemple #10
0
    def testDecoderFPropDeterministicAttentionDropout(self):
        """Checks deterministic attention dropout in a fresh graph, two steps."""
        accumulator_names = [
            'decoder_1/accumulated_global_steps:0',
            'decoder_1/accumulated_time_steps:0',
        ]
        with self.session(use_gpu=False, graph=tf.Graph()) as sess:
            tf.set_random_seed(8372749040)
            vn_params = py_utils.VariationalNoiseParams(
                None, True, False, seed=1792)
            p = self._DecoderParams(vn_params)

            p.use_while_loop_based_unrolling = False
            p.attention.atten_dropout_prob = 0.5
            p.attention.atten_dropout_deterministic = True

            loss, per_sequence_loss = self._testDecoderFPropHelper(params=p)
            global_step = py_utils.GetOrCreateGlobalStep()
            tf.global_variables_initializer().run()
            fetches = [loss, per_sequence_loss] + accumulator_names

            def _RunAndReport():
                """Fetches losses and accumulators, printing the losses."""
                values = sess.run(fetches)
                print('loss = ', values[0], 'per sequence loss = ',
                      values[1])
                return values

            (loss_val, per_sequence_loss_val, global_steps_val,
             time_steps_val) = _RunAndReport()
            self.assertAllClose([3.567466, 15.0], loss_val)
            self.assertAllClose([13.762117, 10.278571, 10.660231, 18.811079],
                                per_sequence_loss_val)
            self.assertAllEqual([0, 0, 0, 0, 0], global_steps_val)
            self.assertAllEqual([2, 3, 4, 5, 6], time_steps_val)

            # Run another step to test global_step and time_step are incremented
            # correctly.
            sess.run(tf.assign_add(global_step, 1))
            (loss_val, per_sequence_loss_val, global_steps_val,
             time_steps_val) = _RunAndReport()
            self.assertAllClose([3.56244, 15.0], loss_val)
            self.assertAllClose([14.180107, 10.391582, 10.460568, 18.40435],
                                per_sequence_loss_val)
            self.assertAllEqual([1, 1, 1, 1, 1], global_steps_val)
            self.assertAllEqual([2, 3, 4, 5, 6], time_steps_val)
Exemple #11
0
  def InputBatch(self):
    """Returns a fixed pseudo-random src/tgt batch as a NestedMap.

    Ids and labels are drawn with numpy (seed 1) for a 10 x 7 batch; weights
    are all ones and paddings all zeros. When self.params.split is set, every
    tensor is split in two along axis 0 and the half is chosen by the parity
    of the global step: first half on even steps, second half on odd steps.
    """
    np.random.seed(1)
    bs, sl = 10, 7

    def _RandomIds():
      """Draws one [bs, sl] int32 id tensor from the seeded numpy RNG."""
      return tf.constant(
          np.random.randint(
              low=0, high=8192 - 1, size=[bs, sl], dtype=np.int32))

    def _PickHalfByStepParity(even_half, odd_half):
      """Selects even_half when the global step is even, else odd_half."""
      step_is_even = tf.equal(tf.mod(py_utils.GetOrCreateGlobalStep(), 2), 0)
      return tf.cond(step_is_even, lambda: even_half, lambda: odd_half)

    # The three draws consume the RNG in this order.
    src_ids = _RandomIds()
    tgt_ids = _RandomIds()
    tgt_labels = _RandomIds()
    tgt_weights = tf.constant(np.ones(shape=[bs, sl], dtype=np.float32))

    src_paddings = tf.zeros([bs, sl])
    tgt_paddings = tf.zeros([bs, sl])

    # (sub-map, field, tensor) in the order the fields are populated.
    fields = [
        ('src', 'ids', src_ids),
        ('src', 'paddings', src_paddings),
        ('tgt', 'ids', tgt_ids),
        ('tgt', 'labels', tgt_labels),
        ('tgt', 'paddings', tgt_paddings),
        ('tgt', 'weights', tgt_weights),
    ]

    ret = py_utils.NestedMap()
    ret.src = py_utils.NestedMap()
    ret.tgt = py_utils.NestedMap()

    if not self.params.split:
      for side, field, tensor in fields:
        setattr(getattr(ret, side), field, tensor)
      return ret

    # Split everything first, then build the selections, field by field.
    halves = [tf.split(tensor, 2, 0) for _, _, tensor in fields]
    for (side, field, _), (first_half, second_half) in zip(fields, halves):
      setattr(
          getattr(ret, side), field,
          _PickHalfByStepParity(first_half, second_half))
    return ret
Exemple #12
0
 def testDeterministicDropoutInsideFunctionalWhile(self):
     """Checks DeterministicDropoutLayer output when run in a PipeliningLayer.

     A single cell containing one dropout layer (keep_prob 0.7) is applied to
     a 2 x 3 tensor of ones; kept entries must equal 1 / 0.7 and dropped
     entries 0, in the recorded pattern.
     """
     with self.session() as sess:
         cell_params = FeatureExtractionLayer.Params().Set(
             name='cell',
             sub=[
                 DeterministicDropoutLayer.Params().Set(
                     name='dropout', keep_prob=0.7),
             ])
         pipe_params = PipeliningLayer.Params().Set(
             name='pipe', cell_tpl=[cell_params])
         ones = tf.ones([2, 3], dtype=tf.float32)
         pipeline = pipe_params.cls(pipe_params)
         dropped = pipeline.FPropDefaultTheta(ones)
         py_utils.GetOrCreateGlobalStep()
         tf.global_variables_initializer().run()
         y_val = sess.run(dropped)

         # Kept entries are scaled by 1 / keep_prob.
         kept = 1.0 / 0.7
         expected = [
             [kept, kept, kept],
             [0.0, 0.0, kept],
         ]
         self.assertAllClose(expected, y_val)
         self.assertAllClose(5.7142859, np.sum(y_val))
  def testGPipeTransformerStackTrainTransparentFPropWithEmbeddings(
      self, splits=1, num_micro_batches=1):
    """Checks a transparent stack with embeddings against recorded values.

    The stack has one encoder layer, three decoder layers and three
    transparent outputs. The encoder is fed vector inputs while the full
    FProp is fed id inputs. The three encoder outputs must agree with each
    other, and the encoder and decoder outputs must match the goldens.

    Args:
      splits: forwarded to self._TransformerParamsWithEmbeddings.
      num_micro_batches: forwarded to self._TransformerParamsWithEmbeddings.
    """
    # time = 2,
    batch_size = 4
    with self.session() as sess:
      stack_params = self._TransformerParamsWithEmbeddings(
          num_encoder_layers=1,
          num_decoder_layers=3,
          splits=splits,
          num_micro_batches=num_micro_batches)
      stack_params.is_transparent = True
      stack_params.num_transparent_outputs = 3
      stack_params.transparent_merger_dropout_prob = 0.0
      stack = GPipeTransformerStack(stack_params)

      (input_ids, id_paddings, tgt_inputs,
       tgt_paddings) = self._random_inputs_ids(batch=batch_size)
      inputs, paddings, _, _ = self._random_inputs_vecs(batch=batch_size)
      py_utils.GetOrCreateGlobalStep()
      tf.set_random_seed(1234)
      tf.global_variables_initializer().run()
      enc_tensors = stack.EncoderFPropDefaultTheta(inputs, paddings)
      dec_tensor = stack.FProp(stack.theta, input_ids, id_paddings,
                               tgt_inputs, tgt_paddings)
      # Fetched in two separate runs, encoder first.
      enc_first, enc_second, enc_third = sess.run(enc_tensors)
      dec_out = sess.run(dec_tensor)

      # Each golden row is repeated once per batch element.
      golden_enc = [
          [[0.68660116, 0.947429, 0.78953624, -1.20142817]] * batch_size,
          [[0.57919669, 1.12979364, 4.29336643, 0.45106331]] * batch_size,
      ]
      golden_dec = [
          [[-0.46651918, -1.62957835, 1.15657926, 1.08397353]] * batch_size,
          [[-0.34674695, -1.65999401, 1.08431196, 1.07384491]] * batch_size,
          [[-0.41073492, -1.60431314, 1.04607999, 1.08858371]] * batch_size,
      ]
      self.assertAllClose(enc_first, enc_second)
      self.assertAllClose(enc_second, enc_third)
      self.assertAllClose(golden_enc, enc_first)
      self.assertAllClose(golden_dec, dec_out)
 def testDropoutInRecurrent(self, splits=1, num_micro_batches=1):
   """Checks that dropout is identical in the forward and backward pass.

   Four deterministic dropout layers are spread evenly over `splits`
   pipeline cells. The loss is the sum of the output, so the gradient with
   respect to the all-ones weight must equal the final activation.

   Args:
     splits: number of pipeline partitions; must be 1, 2 or 4.
     num_micro_batches: forwarded to PipeliningLayer.Params.
   """
   assert splits in [1, 2, 4]
   num_layers = 4
   with self.session() as sess:
     tf.set_random_seed(12345)
     py_utils.GetOrCreateGlobalStep()
     # Build a model with 4 dropout layers.
     dropout_tpls = [
         DeterministicDropoutLayer.Params().Set(
             name='dropout_{}'.format(layer_idx), keep_prob=0.7)
         for layer_idx in range(num_layers)
     ]
     # Divide the model into splits partitions.
     per_split = num_layers // splits
     cell_tpl = [
         FeatureExtractionLayer.Params().Set(
             name='cell_{}'.format(split_idx),
             sub=dropout_tpls[split_idx * per_split:
                              (split_idx + 1) * per_split])
         for split_idx in range(splits)
     ]
     # Parallelize partitions using pipeline.
     pipeline_params = PipeliningLayer.Params().Set(
         name='pipeline',
         num_micro_batches=num_micro_batches,
         cell_tpl=cell_tpl)
     # Fake input
     fake_input = tf.ones([2, 3])
     # Construct weights.
     w = tf.get_variable(
         'w', shape=[2, 3], initializer=tf.constant_initializer([[1] * 3] * 2))
     pipeline = pipeline_params.cls(pipeline_params)
     y = pipeline.FPropDefaultTheta(fake_input * w)
     # Construct loss function such that gradients = final activation.
     loss = tf.reduce_sum(y)
     grads = py_utils.ComputeGradients(loss, py_utils.NestedMap(w=w))
     tf.global_variables_initializer().run()
     y_val = sess.run(y)
     # Entry 'w' holds a pair; index 1 is the gradient part.
     grads_val = sess.run(grads)['w'][1]
     self.assertAllClose(y_val, grads_val)
Exemple #15
0
    def Apply(self, lr, var_grad):
        """Accumulates gradients and applies them once per p.accum_steps steps.

        Every call adds each gradient into a non-trainable 'grad_accumulator'
        variable created under the variable's own scope. When the global step
        modulo p.accum_steps equals p.accum_steps - 1, the accumulated
        gradients are scaled by 1 / p.accum_steps, handed to the wrapped
        optimizer, and the accumulators are reset to zero afterwards.

        Args:
          lr: learning rate forwarded to the wrapped optimizer's Apply.
          var_grad: structure of (variable, gradient) pairs supporting
            Transform and Flatten.

        Returns:
          The tf.cond op that either applies-and-resets or does nothing.
        """
        p = self.params

        def _Accumulate(pair):
            """Adds the gradient into the variable's accumulator."""
            var, grad = pair
            with tf.variable_scope(var.op.name):
                accumulator_params = py_utils.WeightParams(
                    var.get_shape(), py_utils.WeightInit.Constant(0.0),
                    p.dtype)
                _, accumulator = py_utils.CreateVariable(
                    'grad_accumulator', accumulator_params, trainable=False)
                accumulator = tf.assign_add(accumulator, grad)
            return var, accumulator

        accumulated = var_grad.Transform(_Accumulate)

        def _ApplyThenClear():
            """Applies the averaged gradients, then zeroes the accumulators."""
            averaged = py_utils.ApplyGradMultiplier(accumulated,
                                                    1. / p.accum_steps)
            apply_op = self._opt.Apply(lr, averaged)
            # The resets must only run after the update has been applied.
            with tf.control_dependencies([apply_op]):
                clear_ops = [
                    tf.assign(acc, tf.zeros_like(acc))
                    for _, acc in accumulated.Flatten()
                ]
                return tf.group(*clear_ops)

        def _Skip():
            """No update on steps that only accumulate."""
            return tf.group(tf.no_op())

        step_in_cycle = tf.mod(py_utils.GetOrCreateGlobalStep(),
                               p.accum_steps)
        is_last_in_cycle = tf.equal(step_in_cycle, p.accum_steps - 1)
        return tf.cond(is_last_in_cycle, _ApplyThenClear, _Skip)
Exemple #16
0
  def __init__(self, params):
    """Creates the input generator, step counters and training children.

    Args:
      params: task params; params.cls must be a subclass of BaseTask.
    """
    assert issubclass(params.cls, BaseTask)
    super(BaseTask, self).__init__(params)

    p = self.params

    if p.input:
      # TODO(zhifengc): Consider a simpler way to ensure the input
      # generator stops after one epoch.
      if p.is_eval and p.eval:
        input_p = p.input
        eval_p = p.eval
        reads_from_files = issubclass(
            input_p.cls, base_input_generator.BaseInputGeneratorFromFiles)
        if input_p.num_samples == 0:
          # Dataset size is unknown. Computes eval summary based on num_samples.
          assert eval_p.samples_per_summary > 0
        elif (eval_p.samples_per_summary == 0 or
              input_p.num_samples < eval_p.samples_per_summary):
          # If we know the dataset size and we want to evaluate the full
          # set, we need to coordinate the input generator to flush out
          # all samples so the evaler and decoder compute metrics on the
          # whole set for each summary step.
          if reads_from_files:
            input_p.flush_every_n = input_p.num_samples
          eval_p.samples_per_summary = input_p.num_samples
        if reads_from_files and input_p.num_batcher_threads > 1:
          tf.logging.warning('input.num_batcher_threads > 1 inside eval mode.  '
                             'The input generator may not iterate over exactly '
                             'one epoch per run')

      with tf.device(
          self.cluster.input_device), py_utils.outside_all_rewrites():
        self.CreateChild('input', p.input)

    # Model pieces and results; nothing is assigned to them here.
    self._var_grads = None
    self._encoder = None
    self._online_encoder = None
    self._decoder = None
    self._total_examples = None
    self._total_nans_and_infs = None
    self._loss = None
    self._num_predictions = None
    self._train_op = None

    self._eval_metrics = {}
    self._trainer_verbose_tensors = {}

    # Create the gradient mask,
    self._per_input_gradient_mask = None
    self._shared_global_step = py_utils.GetOrCreateGlobalStep()

    train_p = p.train
    if train_p:
      # A task either owns its own step counter or uses the shared one.
      if not train_p.task_global_step:
        self._task_global_step = None
        self._global_step = self._shared_global_step
      else:
        self._task_global_step = CreateTaskGlobalStep(p, p.name)
        self._global_step = self._task_global_step
      if train_p.grad_norm_tracker:
        with tf.variable_scope(p.name):
          self.CreateChild('grad_norm_tracker', train_p.grad_norm_tracker)

      self.CreateChild('lr_schedule', train_p.lr_schedule)
      self.CreateChild('optimizer', train_p.optimizer)
    self._UpdateVnConfig()
Exemple #17
0
    def FProp(self, theta, *args):
        """Run multiple cells in different devices in a pipelining manner.

        In eval mode (p.is_eval) the before-layers and the cells are simply
        applied one after another. Otherwise the inputs are split into
        p.num_micro_batches micro batches along p.batch_dim, the cells are
        run through recurrent.StackedRecurrent, and the per-micro-batch
        outputs are merged back along p.batch_dim.

        Note: p.num_micro_batches is lowered in place when it exceeds the
        static mini-batch size of the first input.

        Args:
          theta: A NestedMap object containing weights' values of this layer
            and its children layers.
          *args: Non-keyworded variable length argument list of input tensors.

        Returns:
          In eval mode, whatever the last applied layer's FProp returns.
          Otherwise a single output tensor when there is exactly one output,
          else a tuple of output tensors (entries are None where the
          corresponding output shape is None).
        """
        # TODO(huangyp): handle optional None inputs.
        p = self.params
        if p.is_eval:
            # Eval path: plain sequential application, no micro-batching.
            outputs = _ToTuple(args)
            for (name, l) in self._before_layers:
                outputs = _ToTuple(outputs)
                outputs = l.FProp(theta[name], *outputs)
            for (name, l) in self._cells:
                outputs = _ToTuple(outputs)
                outputs = l.FProp(theta[name], *outputs)
            return outputs

        num_cells = len(p.cell_tpl)
        cluster = self.cluster

        # Compute shapes of input and output tensors.
        input_tenors = _ToTuple(args)
        # The batch size and dtype are taken from the first input only.
        mini_batch_size = input_tenors[0].get_shape().as_list()[p.batch_dim]
        input_dtype = input_tenors[0].dtype
        # Cannot have more micro batches than examples; this mutates p.
        if p.num_micro_batches > mini_batch_size:
            p.num_micro_batches = mini_batch_size
        micro_batch_size = mini_batch_size // p.num_micro_batches

        # Per-input static shapes with the batch dimension replaced by the
        # micro-batch size; None inputs keep a None placeholder.
        input_shapes = ()
        for input_tensor in input_tenors:
            if input_tensor is not None:
                input_shape = input_tensor.get_shape().as_list()
                input_shape[p.batch_dim] = micro_batch_size
                input_shapes += (tf.TensorShape(input_shape), )
            else:
                input_shapes += (None, )

        # state_shapes[i] is used below as the input shapes of cell i and
        # state_shapes[i + 1] as its output shapes.
        state_shapes = self._CalculateOutputShapes(input_shapes)

        def GetCellFn(i):
            """Get the ith feature extraction layer."""
            def CellFn(theta, state0, inputs):
                """A cell fn is executed inside of StackedRecurrent."""
                del state0
                # Gather the positional inputs 's0', 's1', ... of this cell,
                # restoring their static shapes; None shapes map to None.
                frop_inputs = []
                for input_idx in range(len(state_shapes[i])):
                    name = 's{}'.format(input_idx)
                    if state_shapes[i][input_idx] is not None:
                        inputs[name].set_shape(state_shapes[i][input_idx])
                        frop_inputs.append(inputs[name])
                    else:
                        frop_inputs.append(None)

                with CellFnFropOpReplacementWrapper():
                    tf.logging.info('cell {} input {}'.format(i, frop_inputs))
                    # The micro-batch state is registered as the overwrite
                    # global step before the cell's FProp is built.
                    mb_tensor = inputs[_MICRO_BATCH_STATE_NAME]
                    SetOverWriteGlobalStep(mb_tensor)
                    _, cell = self._cells[i]
                    outputs = cell.FProp(theta, *frop_inputs)

                # The new state carries the micro-batch tensor through
                # unchanged plus every non-None output as 's<idx>'.
                state1 = py_utils.NestedMap()
                state1[_MICRO_BATCH_STATE_NAME] = mb_tensor
                outputs = _ToTuple(outputs)
                assert len(outputs) == len(state_shapes[i + 1])
                for output_idx in range(len(outputs)):
                    if outputs[output_idx] is not None:
                        name = 's{}'.format(output_idx)
                        state1[name] = outputs[output_idx]
                return state1, py_utils.NestedMap()

            return CellFn

        # Per-cell arguments for StackedRecurrent, all indexed by cell.
        cell_fns = []
        accumulator_layers = []
        thetas = []
        init_states = []
        devices = []
        for cell_idx in range(num_cells):
            cell_name, cell = self._cells[cell_idx]
            accumulator_layers.append(cell)
            cell_fns.append(GetCellFn(cell_idx))
            thetas.append(theta[cell_name])
            # Initial state: a zero micro-batch scalar and zero tensors shaped
            # like each non-None output of this cell.
            init_state = py_utils.NestedMap()
            init_state[_MICRO_BATCH_STATE_NAME] = tf.cast(0, dtype=input_dtype)
            for output_idx in range(len(state_shapes[cell_idx + 1])):
                name = 's{}'.format(output_idx)
                if state_shapes[cell_idx + 1][output_idx] is not None:
                    init_state[name] = tf.zeros(state_shapes[cell_idx +
                                                             1][output_idx],
                                                dtype=input_dtype)
            init_states.append(init_state)
            devices.append(cluster.WorkerDeviceInModelSplit(cell_idx))

        # NOTE(review): presumably None lets StackedRecurrent derive the cell
        # gradients itself — confirm against recurrent.StackedRecurrent.
        cell_grads = [None] * num_cells
        # Identity functions: states and output gradients pass through as is.
        cell_outs = [lambda x: x] * num_cells
        cell_out_grads = [lambda x: x] * num_cells

        with tf.device(devices[0]):
            # The before-layers are built on the first cell's device.
            previous = input_tenors
            for (name, l) in self._before_layers:
                previous = l.FProp(theta[name], *previous)
                previous = _ToTuple(previous)
            inputs = py_utils.NestedMap()
            gs_tensor = py_utils.GetOrCreateGlobalStep()
            # One value per micro batch:
            # global_step * num_micro_batches + micro-batch index.
            inputs[_MICRO_BATCH_STATE_NAME] = tf.stack([
                tf.cast(gs_tensor * p.num_micro_batches + t, dtype=input_dtype)
                for t in range(p.num_micro_batches)
            ])

            # TODO(huangyp, dehao): apply dehao's trick to reshape the input tensor
            # to [p.num_micro_batches, -1, 128].
            # Split every non-None input along the batch dimension and stack
            # the pieces on a new leading micro-batch axis.
            for output_idx, output_tenor in enumerate(previous):
                name = 's{}'.format(output_idx)
                if output_tenor is not None:
                    output_tenor = tf.stack(
                        tf.split(output_tenor,
                                 p.num_micro_batches,
                                 axis=p.batch_dim))
                    inputs[name] = output_tenor

        output, _ = recurrent.StackedRecurrent(
            devices=devices,
            cell_fns=cell_fns,
            cell_grads=cell_grads,
            cell_outs=cell_outs,
            cell_out_grads=cell_out_grads,
            thetas=thetas,
            init_states=init_states,
            inputs=inputs,
            accumulator_layers=accumulator_layers,
            unused_acc_state=True)

        with tf.device(devices[-1]):
            # Merge the micro batches back into full-batch outputs on the
            # last cell's device.
            output_tensors = []
            for output_idx in range(len(state_shapes[-1])):
                state_shape = state_shapes[-1][output_idx]
                if state_shape is None:
                    output_tensors.append(None)
                    continue
                output_name = 's{}'.format(output_idx)
                output_tensor = output[output_name]
                if p.batch_dim != 0:
                    # Move axis 0 (assumed to be the micro-batch axis, as in
                    # the stacked inputs above) to sit directly before the
                    # batch axis so the reshape below merges the two.
                    perm = list(range(1, p.batch_dim + 1)) + [0]
                    perm += list(range(p.batch_dim + 1, len(state_shape) + 1))
                    output_tensor = tf.transpose(output_tensor, perm=perm)
                # In-place update of the shape entry held in state_shapes[-1].
                state_shape[p.batch_dim] *= p.num_micro_batches
                output_tensor = tf.reshape(output_tensor, state_shape)
                output_tensors.append(output_tensor)
            tf.logging.info('pipeline output = {}'.format(output_tensors))
            if len(output_tensors) == 1:
                return output_tensors[0]
            return tuple(output_tensors)
Exemple #18
0
  def testBPropWithAccumComparison(self):
    # Trains the same model twice in fresh graphs -- two steps through an
    # Accumulator(accum_steps=2) wrapped SGD with `input.split` enabled, and
    # one step of plain SGD with `input.split` disabled -- and checks that the
    # resulting decoder softmax weights are close.

    def _MakeDeterministic(model_params):
      """Pins seeds and initializers, zeroes dropout, and selects plain SGD."""
      model_params.random_seed = 12345
      model_params.decoder.input_dropout_prob = 0.0
      merger_tpl = model_params.encoder.transformer_stack.transparent_merger_tpl
      merger_tpl.weighted_merger_dropout_prob = 0.0
      no_noise = py_utils.VariationalNoiseParams(1.0, False, False)
      for layer_params in base_layer.RecursiveFindLayerParams(model_params):
        # TODO(lepikhin): lp.dtype = dtype
        layer_params.params_init = py_utils.WeightInit.Gaussian(0.1, 12345)
        layer_params.vn = no_noise

      train_params = model_params.train
      assert train_params.l2_regularizer_weight is None
      train_params.clip_gradient_norm_to_value = False
      train_params.grad_norm_to_clip_to_zero = False
      train_params.optimizer = optimizer.SGD.Params()
      train_params.learning_rate = 1e-2
      train_params.lr_schedule = (
          lr_schedule.ContinuousLearningRateSchedule.Params())
      # Dump the resolved params so they show up in the test log.
      for line in model_params.ToText().split('\n'):
        print(line)
      return model_params

    def _TrainAndFetchWeight(split_input, accum_steps, num_steps):
      """Builds the model in a new graph, trains it, returns softmax weight_0.

      Args:
        split_input: Value assigned to `input.split` on the model params.
        accum_steps: If not None, the SGD optimizer is wrapped in an
          Accumulator with this many accumulation steps.
        num_steps: Number of times the train op is run.
      """
      with self.session(use_gpu=False, graph=tf.Graph()) as sess:
        tf.set_random_seed(_TF_RANDOM_SEED)
        model_params = self._testParams()
        model_params.input = TestInputGenerator.Params()
        model_params.input.split = split_input
        model_params = _MakeDeterministic(model_params)
        if accum_steps is not None:
          model_params.train.optimizer = optimizer.Accumulator.Params().Set(
              accum_steps=accum_steps,
              optimizer_tpl=model_params.train.optimizer)
        model = model_params.cls(model_params)
        model.FPropDefaultTheta()
        model.BProp()
        loss = model.loss
        log_pplx = model.eval_metrics['log_pplx'][0]

        tf.global_variables_initializer().run()

        for _ in range(num_steps):
          sess.run(
              (py_utils.GetOrCreateGlobalStep(), loss, log_pplx,
               model.train_op))

        return sess.run(model.dec.softmax.vars['weight_0'])

    expected = _TrainAndFetchWeight(
        split_input=True, accum_steps=2, num_steps=2)
    actual = _TrainAndFetchWeight(
        split_input=False, accum_steps=None, num_steps=1)

    self.assertAllClose(expected, actual, rtol=1e-2, atol=1e-2)
Exemple #19
0
  def FPropTower(self, theta, input_batch):
    """Runs the language model forward on one batch and assembles metrics.

    When `p.contiguous` is set, the initial RNN state of every layer is chosen
    with `tf.cond` on `input_batch.take_last_state` between the stored
    `last_state_*` values and the zero state, and `self.last_state_group_op`
    is created as a side effect to write the final state back into the
    `last_state_*` variables.

    Args:
      theta: Weights for this layer; `theta.lm` is forwarded to `self.lm.FProp`
        and also inspected for the optional isometric-constraint terms.
      input_batch: Batch providing `ids`, `paddings`, `labels`, `weights` and
        `word_count`. `chunk_ids` is read only when `p.lm.use_chunks` is set;
        `take_last_state` only when `p.contiguous` is set.

    Returns:
      A dict mapping metric names to 2-tuples. The second element looks like
      the averaging weight (e.g. `num_preds`, or 1) -- confirm against the
      caller that consumes these metrics.
    """
    p = self.params
    chunk_ids = input_batch.chunk_ids if p.lm.use_chunks else None
    ids, paddings, labels_ids, weights, chunk_ids = self._TrimIfPossibleThenTranspose(
        input_batch.ids, input_batch.paddings, input_batch.labels,
        input_batch.weights, chunk_ids=chunk_ids)

    # After the helper above, dim 0 is treated as time and dim 1 as batch.
    seqlen = tf.shape(ids)[0]
    batch_size = tf.shape(ids)[1]
    zero_state = self.lm.zero_state(batch_size)

    with tf.name_scope('prepare_state'):
      if p.contiguous:
        state0 = py_utils.NestedMap(rnn=[])
        for i in range(p.lm.rnns.num_layers):
          if p.is_eval:
            last_m = tf.reshape(self.theta['last_state_%d_m' % i], [p.batch_size, p.lm.emb.embedding_dim])
            last_c = tf.reshape(self.theta['last_state_%d_c' % i], [p.batch_size, p.lm.emb.embedding_dim])
          else:
            last_m = self.theta['last_state_%d_m' % i]
            last_c = self.theta['last_state_%d_c' % i]
          # Resume from the stored state only when the batch asks for it.
          m = tf.cond(input_batch.take_last_state, lambda: last_m, lambda: zero_state.rnn[i].m)
          c = tf.cond(input_batch.take_last_state, lambda: last_c, lambda: zero_state.rnn[i].c)
          # c = tf.Print(c, [c])
          state0.rnn.append(py_utils.NestedMap(c=c, m=m))
      else:
        state0 = zero_state
    labels = py_utils.NestedMap(class_ids=labels_ids, class_weights=weights)

    xent_output, state1 = self.lm.FProp(theta.lm, ids, paddings, state0, labels=labels, chunk_ids=chunk_ids)

    # self.state1 = state1

    if p.contiguous:
      # Persist the final RNN state so that a later batch can resume from it.
      assign_ops = list()
      for i in range(p.lm.rnns.num_layers):
        m = tf.reshape(state1.rnn[i].m, [1, p.batch_size, p.lm.emb.embedding_dim])
        c = tf.reshape(state1.rnn[i].c, [1, p.batch_size, p.lm.emb.embedding_dim])
        if not p.is_eval:
          state1.rnn[i].m = m
          state1.rnn[i].c = c
        assign_ops.append(tf.assign(self.vars['last_state_%d_m' % i], m))
        assign_ops.append(tf.assign(self.vars['last_state_%d_c' % i], c))
      self.last_state_group_op = tf.group(*assign_ops)

    # +1 to account for the end of sequence symbol.
    # The +1 is only applied when p.contiguous is falsy: (1 - p.contiguous).
    div = 2 if p.input.use_chunks else 1 # tags shouldn't be counted as words
    num_words = tf.cast(
        tf.reduce_sum(input_batch.word_count // div + tf.constant(1, dtype=tf.int32) * (1 - p.contiguous)),
        tf.float32)
    predicted_labels = tf.cast(xent_output.per_example_argmax, labels_ids.dtype)

    num_preds = xent_output.total_weight
    # Weighted accuracy; the 1e-4 guards against a zero total weight.
    mean_acc = tf.reduce_sum(
        tf.cast(tf.equal(labels_ids, predicted_labels), tf.float32) *
        weights) / (
            num_preds + 1e-4)

    # Optional auxiliary losses. They stay None unless the corresponding
    # feature is enabled, which is what the metric-reporting code below checks
    # (explicit sentinels instead of probing `locals()`).
    isometric_loss = None
    chunk_loss = None

    if p.lm.emb.cls == layers.HRREmbeddingLayer:
      if p.train.isometric > 0.0:
        isometric_constraint = 0.0
        nr = p.lm.emb.num_roles
        # TODO(jmluo) rearrange it to divide the code according to three modes
        if 'F' in theta.lm.emb:
          F_wm = theta.lm.emb.F
          # Rebinds nr from the static shape of F.
          nr, nf, d = F_wm.get_shape().as_list()
          # F2d leads to overspefication of parameters in F
          F2d = tf.reshape(F_wm, [nr * nf, d])
          diff = tf.matmul(F2d, tf.transpose(F2d)) - tf.eye(nr * nf)
          # diff = tf.matmul(F_wm, tf.transpose(F_wm, perm=[0, 2, 1])) - tf.eye(nf)
          isometric_constraint += tf.reduce_sum(diff**2)
        if 'A' in theta.lm:
          d = theta.lm.A.get_shape().as_list()[0]
          A = tf.reshape(theta.lm.A, [d, 2, d])
          A1 = A[:, 0]
          A2 = A[:, 1]
          # NOTE: this `diff` is currently not added to the constraint; the
          # accumulation below is commented out.
          diff = tf.matmul(A1, tf.transpose(A2)) / 2
          # isometric_constraint += tf.reduce_sum(diff ** 2)

        if nr > 1 and 'r' in theta.lm.emb:
          r_wm = theta.lm.emb.r
          diff = tf.matmul(r_wm, tf.transpose(r_wm)) - tf.eye(nr)
          isometric_constraint += tf.reduce_sum(diff**2)
        if 'R' in theta.lm:
          R_wm = theta.lm.R
          diff = tf.matmul(R_wm, tf.transpose(R_wm)) - tf.eye(p.lm.num_sent_roles)
          isometric_constraint += tf.reduce_sum(diff**2)
        if p.lm.emb.mode == 'rs':
          assert 'rR' in theta.lm.emb
          rR = theta.lm.emb.rR
          diff = tf.matmul(rR, tf.transpose(rR)) - tf.eye(2)
          isometric_constraint += tf.reduce_sum(diff ** 2)

          rs_all = theta.lm.emb.rs.wm
          for rs in rs_all:
            rs = tf.reshape(rs, [-1, 2, 2])
            norm = tf.reduce_sum(rs ** 2, axis=-1)
            isometric_constraint += tf.reduce_sum((norm - 1.0) ** 2) + tf.reduce_sum((rs ** 2) * ((1 - rs) ** 2))

            normalized_rs = tf.nn.l2_normalize(rs, axis=-1)
            dot = tf.matmul(normalized_rs, tf.transpose(normalized_rs, perm=[0, 2, 1]))
            # Penalize only the off-diagonal entries of the 2x2 products.
            isometric_constraint += tf.reduce_sum(((dot * (tf.ones([2, 2]) - tf.eye(2))) ** 2) * 0.5)
          tf.summary.histogram('rs', tf.stack(rs_all))
        isometric_loss = isometric_constraint * p.train.isometric

    if p.lm.use_chunks:# and not p.is_eval:
      with tf.name_scope('global_decode'):
        assert p.lm.num_sent_roles > 0
        total_chunk_loss = -tf.reduce_sum(xent_output.chunk_log_probs)
        avg_chunk_loss = total_chunk_loss / xent_output.num_chunks
        global_step = tf.to_float(py_utils.GetOrCreateGlobalStep())
        # Ramps linearly from 0 to 1 over the first `chunk_loss_anneal` steps.
        temperature = tf.minimum(tf.constant(p.train.chunk_loss_anneal), global_step) / p.train.chunk_loss_anneal
        tf.summary.scalar('chunk/temperature', temperature)
        annealed_total_chunk_loss = temperature * total_chunk_loss
        annealed_avg_chunk_loss = temperature * avg_chunk_loss
        chunk_loss = annealed_avg_chunk_loss

    loss = xent_output.avg_xent
    if p.train.sum_loss_across_tokens_in_batch:
      loss = xent_output.total_xent
      if chunk_loss is not None:
        chunk_loss = annealed_total_chunk_loss

    metrics = {
        'fraction_of_correct_next_step_preds': (mean_acc, num_preds),
        'log_pplx': (xent_output.avg_xent, num_preds),
        'log_pplx_per_word': (xent_output.total_xent / num_words, num_words),
        'num_predictions': (num_preds, 1),
        'num_words': (num_words, 1)
    }
    # NOTE: the isometric and chunk losses are reported as separate metrics
    # only; they are not added into `loss` here.
    if isometric_loss is not None:
      metrics['isometric'] = (isometric_loss, 1)
    if chunk_loss is not None:
      metrics['chunk_loss'] = (chunk_loss, 1)
      metrics['annealed_total_chunk_loss'] = (annealed_total_chunk_loss, 1)
      metrics['annealed_avg_chunk_loss'] = (annealed_avg_chunk_loss, xent_output.num_chunks)
      metrics['total_chunk_loss'] = (total_chunk_loss, 1)
      metrics['avg_chunk_loss'] = (avg_chunk_loss, xent_output.num_chunks)
      metrics['num_chunks'] = (xent_output.num_chunks, 1)
    if p.train.sum_loss_across_tokens_in_batch:
      metrics['loss'] = (loss, 1)
    else:
      metrics['loss'] = (loss, num_preds)
    metrics['batch_size'] = (tf.cast(batch_size, tf.float32), 1)

    return metrics
Exemple #20
0
  def testAccumulator(self):
    # testAccumulator compares
    #   - explicit averaging of independently computed var_grads1 and
    #     var_grads2,
    #   - Accumulator(SGD) optimizer effectively doing this over 2 steps.
    # Two fixed, differently seeded input batches shared by both graphs.
    np.random.seed(12345)
    np_input1 = np.random.normal(0.1, 0.5, [2, 4, 3])
    np.random.seed(12346)
    np_input2 = np.random.normal(0.1, 0.5, [2, 4, 3])

    # Graph 1 (reference): one projection layer fed both batches, with two
    # half-weighted SGD updates applied one after the other.
    g1 = tf.Graph()
    with g1.as_default():
      tf.set_random_seed(123456)
      params = layers.ProjectionLayer.Params()
      params.name = 'proj'
      params.dtype = tf.float64
      params.input_dim = 3
      params.output_dim = 2
      params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)
      params.is_eval = False
      params.batch_norm = False
      proj_layer = layers.ProjectionLayer(params)
      inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
      in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
      inputs2 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
      in_padding2 = tf.zeros([2, 4, 1], dtype=tf.float64)
      output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
      output2 = proj_layer.FPropDefaultTheta(inputs2, in_padding2)
      loss1 = tf.reduce_sum(output1)
      loss2 = tf.reduce_sum(output2)
      var_grads1 = py_utils.ComputeGradients(loss1, proj_layer.vars)
      var_grads2 = py_utils.ComputeGradients(loss2, proj_layer.vars)
      op = optimizer.SGD.Params().Set(add_summary=False)
      opt = op.cls(op)
      lr = 1e-1
      # Both update ops are created under a control dependency on the losses;
      # var_update_op2 additionally depends on var_update_op1, so fetching
      # var_update_op2 alone (below) also triggers the first update.
      with tf.control_dependencies([loss1, loss2]):
        var_update_op1 = opt.Apply(
            lr, py_utils.ApplyGradMultiplier(var_grads1, 1. / 2.))
        with tf.control_dependencies([var_update_op1]):
          var_update_op2 = opt.Apply(
              lr, py_utils.ApplyGradMultiplier(var_grads2, 1. / 2.))
      init_op = tf.global_variables_initializer()

    with self.session(use_gpu=True, graph=g1) as sess:
      sess.run(init_op)
      # vars1: variable values before any update.
      vars1 = sess.run(proj_layer.vars.Flatten())
      loss1_1, grads1_1, loss1_2, grads1_2 = sess.run(
          [loss1, var_grads1, loss2, var_grads2],
          feed_dict={
              inputs1: np_input1,
              inputs2: np_input2,
          })
      sess.run(
          [var_update_op2], feed_dict={
              inputs1: np_input1,
              inputs2: np_input2,
          })
      # vars1_1: variable values after both reference updates.
      vars1_1 = sess.run(proj_layer.vars.Flatten())

    # Graph 2 (under test): the same layer configuration with a single input
    # placeholder, updated through Accumulator(accum_steps=2) wrapping SGD.
    g2 = tf.Graph()
    with g2.as_default():
      tf.set_random_seed(123456)
      params = layers.ProjectionLayer.Params()
      params.name = 'proj'
      params.dtype = tf.float64
      params.input_dim = 3
      params.output_dim = 2
      params.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)
      params.is_eval = False
      params.batch_norm = False
      proj_layer = layers.ProjectionLayer(params)
      in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
      inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
      output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
      loss = tf.reduce_sum(output1)
      var_grads = py_utils.ComputeGradients(loss, proj_layer.vars)
      op = optimizer.Accumulator.Params().Set(
          accum_steps=2,
          dtype=tf.float64,
          optimizer_tpl=optimizer.SGD.Params().Set(add_summary=False))
      opt = op.cls(op)
      lr = 1e-1
      var_update_op = opt.Apply(lr, var_grads)
      init_op = tf.global_variables_initializer()
      global_step = py_utils.GetOrCreateGlobalStep()
      # The test advances the global step by hand between the two updates.
      increment_global_step_op = tf.assign_add(global_step, 1)
    with self.session(use_gpu=True, graph=g2) as sess:
      sess.run(init_op)
      # NOTE: this rebinds `global_step` from the variable to its fetched
      # value; increment_global_step_op was already built from the variable.
      vars2, global_step = sess.run([proj_layer.vars.Flatten(), global_step])
      loss2_1, grads2_1 = sess.run(
          [loss, var_grads], feed_dict={
              inputs1: np_input1,
          })
      loss2_2, grads2_2 = sess.run(
          [loss, var_grads], feed_dict={
              inputs1: np_input2,
          })
      # acc_*: value of the first global variable whose name contains
      # 'grad_accumulator', sampled before, between and after the two updates.
      acc_0 = sess.run(
          [v for v in tf.global_variables() if 'grad_accumulator' in v.name])[0]
      sess.run(
          [var_update_op], feed_dict={
              inputs1: np_input1,
          })
      acc_1 = sess.run(
          [v for v in tf.global_variables() if 'grad_accumulator' in v.name])[0]
      # vars2_intermediate: variable values after the first update op only.
      vars2_intermediate = sess.run(proj_layer.vars.Flatten())
      sess.run(increment_global_step_op)
      sess.run(
          [var_update_op], feed_dict={
              inputs1: np_input2,
          })
      acc_2 = sess.run(
          [v for v in tf.global_variables() if 'grad_accumulator' in v.name])[0]
      # vars2_1: variable values after the second update op.
      vars2_1 = sess.run(proj_layer.vars.Flatten())

    # Both graphs start from identical variable values.
    self.assertAllClose(vars1, vars2)

    # The accumulator is zero initially, equals the first gradient after the
    # first update op, and is zero again after the second.
    self.assertAllClose(acc_0, np.zeros_like(acc_0))
    self.assertAllClose(acc_1, grads2_1['w'][1])
    self.assertAllClose(acc_2, np.zeros_like(acc_0))

    # Losses and gradients agree between the two graphs for each batch.
    self.assertAllClose(loss1_1, loss2_1)
    self.assertAllClose(loss1_2, loss2_2)
    self.assertAllClose(grads1_1, grads2_1)
    self.assertAllClose(grads1_2, grads2_2)

    # The first Accumulator update leaves the variables untouched.
    self.assertAllClose(vars1, vars2_intermediate)

    # Index 0 of a var_grads entry is compared with the variable value and
    # index 1 is used as the gradient -- presumably entries are (var, grad)
    # pairs; confirm against py_utils.ComputeGradients.
    self.assertAllClose(vars2[0], grads2_1['w'][0])
    self.assertAllClose(vars2[0], grads2_2['w'][0])

    # Each final weight equals the initial weight minus lr times the mean of
    # the two gradients.
    self.assertAllClose(
        vars1[0] - 0.5 * lr * (grads1_1['w'][1] + grads1_2['w'][1]), vars1_1[0])

    self.assertAllClose(
        vars2[0] - 0.5 * lr * (grads2_1['w'][1] + grads2_2['w'][1]), vars2_1[0])

    self.assertAllClose(vars2, vars2_intermediate)
    # Explicit averaging and the Accumulator end at the same variable values.
    self.assertAllClose(vars1_1, vars2_1)