Example #1
 def ConstructFPropBPropGraph(self):
     # We need to override this since constructing the BPropGraph
     # creates slot variables.
     with py_utils.OpportunisticVariableReuseScope():
         with py_utils.VariableRenameScope(
                 self.params.variable_renaming_rules):
             super().ConstructFPropBPropGraph()
Example #2
    def __init__(self, params):
        super().__init__(params)
        p = params
        if p.input_symbols:
            assert p.input_symbols.num_symbols() == p.input_vocab_size
        if p.output_symbols:
            assert p.output_symbols.num_symbols() == p.output_vocab_size

        if p.share_embeddings:
            renames = [("(.*)/token_emb/(.*)", "%s/shared_emb/token_emb/%s")]
        else:
            renames = [("(.*)/(?:encoder|spell_encoder)/token_emb/(.*)",
                        "%s/shared_inp_emb/token_emb/%s"),
                       ("(.*)/(?:decoder|pron_encoder)/token_emb/(.*)",
                        "%s/shared_out_emb/token_emb/%s")]

        # Enable variable sharing.
        with py_utils.OpportunisticVariableReuseScope():
            with py_utils.VariableRenameScope(renames):
                self.CreateChild("encoder", p.encoder)
                self.CreateChild("decoder", p.decoder)

                if p.use_neighbors:
                    self.CreateChild("spell_encoder", p.spell_encoder)
                    if p.pron_encoder:
                        self.CreateChild("pron_encoder", p.pron_encoder)
Example #3
 def _DecodeFn():
   with py_utils.OpportunisticVariableReuseScope(True):
     self._model = self._task_params.Instantiate()
     self._model_task = self._model.GetTask()
     input_batch = self._model_task.GetInputBatch()
     metrics_dict = self._model_task.Decode(input_batch)
     self.metrics_nm = py_utils.NestedMap(metrics_dict)
     return self.metrics_nm.Flatten()
Example #4
 def _DecodeFn():
     """Decode call to be compiled for TPU."""
     with py_utils.OpportunisticVariableReuseScope(True):
         self._model.InstantiateVariables()
         input_batch = self._task.input.TpuDequeueBatch()
         metrics_dict = self._task.Decode(input_batch)
     self.metrics_nm = py_utils.NestedMap(metrics_dict)
     return self.metrics_nm.Flatten()
Example #5
 def ConstructFPropBPropGraph(self):
     # We need to override this since constructing the BPropGraph
     # creates slot variables.
     p = self._params
     with py_utils.OpportunisticVariableReuseScope():
         with py_utils.VariableRenameScope(p.variable_renaming_rules):
             super(RegExSharedVariableModel,
                   self).ConstructFPropBPropGraph()
Example #6
 def _DecodeStep():
   """Decode call to be compiled for TPU."""
   with py_utils.OpportunisticVariableReuseScope(True):
     self._model.InstantiateVariables()
     input_batch = self._task.input.TpuDequeueBatch()
     decode_dict = self._task.Decode(input_batch)
   self.decode_nm = py_utils.NestedMap(decode_dict)
   return [self._OutfeedEnqueue(decode_dict)]
Example #7
    def BuildTpuSubgraph(self):
        tf.logging.info('TrainProgram BuildTpuSubGraph')

        with py_utils.OpportunisticVariableReuseScope(True):
            self._eval_metrics = metrics.TpuEvalMetrics()
            data_parallelism = self.data_parallelism

            # Instantiate input generator first.
            self._input = self._task_params.input.Instantiate()
            self._input.CreateTpuEnqueueOps()
            self.SkipCreateChild(self._task_params)

            def TpuTrainStep(*args):
                """Train a shard of a batch on a single TPU core.

                Args:
                  *args: metrics values from previous steps.

                Returns:
                  New summed metrics values and a train_op.
                """
                self._model = self._task_params.Instantiate()
                self._task = self._model.GetTask()
                self._task.AddChild('input', self._input)
                self._model.ConstructFPropBPropGraph()
                per_step_eval_metrics = self._eval_metrics.SetMetrics(
                    self._task.eval_metrics, args)
                outfeed_op = self._OutfeedEnqueue(
                    self._task.per_example_tensors)
                summed_metrics = []
                assert len(per_step_eval_metrics) == len(args)
                with tf.control_dependencies([outfeed_op]):
                    for x, y in zip(per_step_eval_metrics, args):
                        summed_metrics.append(x + y)
                return summed_metrics + [self._task.train_op]

            @tpu_function.on_device_training_loop
            def TpuTrain():
                loop_result = tpu_training_loop.repeat(
                    self._steps_per_loop,
                    TpuTrainStep,
                    inputs=self._eval_metrics.initial_values,
                    name='train_loop')
                # Final metrics are the avg across self._steps_per_loop steps.
                return self._eval_metrics.FinalizeMetrics(loop_result)

            self._compile_op, batch_parallel_res = tpu.split_compile_and_shard(
                TpuTrain,
                num_shards=data_parallelism,
                device_assignment=py_utils.GetTpuDeviceAssignment())
            outfeed_dequeue_op = self._OutfeedDequeueLoop(
                self._task.per_example_tensors, self._steps_per_loop,
                self.num_splits_per_client)
            # Get metric results from a single replica; they are all the same here.
            self.tpu_ops = [[t[0] for t in batch_parallel_res],
                            outfeed_dequeue_op]

        return self.tpu_ops
Example #8
 def _DecodeFn():
   """Decode call to be compiled for TPU."""
   with py_utils.OpportunisticVariableReuseScope(True):
     with cluster_factory.SetEval(True):
       self._decode_model.InstantiateVariables()
       input_batch = self._decode_task.input.TpuDequeueBatch()
       decode_dict = self._decode_task.Decode(input_batch)
   self.decode_nm = py_utils.NestedMap(decode_dict)
   return self.decode_nm.Flatten()
Example #9
  def testOpportunisticReuse(self):
    pc = py_utils.WeightParams([3, 3])
    _, v1 = py_utils.CreateVariable('v1', pc)
    with self.assertRaises(Exception):
      _ = py_utils.CreateVariable('v1', pc)
    with py_utils.OpportunisticVariableReuseScope(True):
      _, v2 = py_utils.CreateVariable('v1', pc)
      _, x1 = py_utils.CreateVariable('x1', pc)
      with py_utils.OpportunisticVariableReuseScope(False):
        with self.assertRaises(Exception):
          _ = py_utils.CreateVariable('v1', pc)
      _, v3 = py_utils.CreateVariable('v1', pc)
    with self.assertRaises(Exception):
      _ = py_utils.CreateVariable('v1', pc)

    for v in [v2, v3]:
      self.assertTrue(v1 is v)
    self.assertTrue(v1 is not x1)
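The test above pins down the semantics: creating a variable under a name that already exists normally raises, but inside an opportunistic reuse scope the existing variable is returned instead, and a nested scope with False restores the strict behavior. Below is a minimal sketch of that pattern, not taken verbatim from any example on this page; it assumes lingvo's py_utils is importable from lingvo.core and a TF1-style graph context like the one the test runs in.

 # Hedged sketch: the same calls as in the test above, outside a test class.
 from lingvo.core import py_utils

 pc = py_utils.WeightParams([3, 3])
 _, v1 = py_utils.CreateVariable('v1', pc)        # first creation of 'v1'
 with py_utils.OpportunisticVariableReuseScope(True):
   _, v2 = py_utils.CreateVariable('v1', pc)      # duplicate name: reuses v1
 assert v2 is v1                                  # same underlying variable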
Example #10
 def _DecodeFn():
     """Decode call to be compiled for TPU."""
     with py_utils.OpportunisticVariableReuseScope(True):
         with cluster_factory.SetEval(True):
             self._decode_model = self._decode_task_params.Instantiate()
             self._decode_task = self._decode_model.GetTask()
             self._decode_task.AddChild('input', self._decode_input)
             input_batch = self._decode_task.input.TpuDequeueBatch()
             metrics_dict = self._decode_task.Decode(input_batch)
             self.metrics_nm = py_utils.NestedMap(metrics_dict)
             return self.metrics_nm.Flatten()
Example #11
 def _DecodeStep():
     """Decode call to be compiled for TPU."""
     with py_utils.OpportunisticVariableReuseScope(True):
         self._model.InstantiateVariables()
         input_batch = self._task.input.TpuDequeueBatch()
         metrics_dict = self._task.Decode(input_batch)
     self.metrics_nm = py_utils.NestedMap(metrics_dict)
     device = tpu.core(0) if self.spmd else ''
     with tf.device(device):
         outfeed_enqueue = tpu_ops.outfeed_enqueue_tuple(
             self.metrics_nm.Flatten())
         return [outfeed_enqueue]
Example #12
        def TpuTrainStep():
            """Train a shard of a batch on a single TPU core.

            Do not calculate loss metrics.

            Returns:
              [train_op].
            """
            with py_utils.OpportunisticVariableReuseScope(True):
                self._train_model.InstantiateVariables()
                self._train_model.ConstructFPropBPropGraph()
            return [self._train_task.train_op]
Example #13
    def BuildTpuSubgraph(self):
        with py_utils.OpportunisticVariableReuseScope(True):
            self._eval_metrics = metrics.TpuEvalMetrics()
            data_parallelism = self.data_parallelism

            def TpuTrainStep(*args):
                """Train a shard of a batch on a single TPU core.

                Args:
                  *args: metrics values from previous steps.

                Returns:
                  New summed metrics values and a train_op.
                """
                self._model = self._task_params.Instantiate()
                self._model.ConstructFPropBPropGraph()
                per_step_eval_metrics = self._eval_metrics.SetMetrics(
                    self._model.GetTask().eval_metrics, args)
                outfeed_op = self._OutfeedEnqueue(
                    self._model.GetTask().per_example_tensors)
                summed_metrics = []
                assert len(per_step_eval_metrics) == len(args)
                with tf.control_dependencies([outfeed_op]):
                    for x, y in zip(per_step_eval_metrics, args):
                        summed_metrics.append(x + y)
                return summed_metrics + [self._model.GetTask().train_op]

            @tpu_function.on_device_training_loop
            def TpuTrain():
                loop_result = tpu_training_loop.repeat(
                    self._steps_per_loop,
                    TpuTrainStep,
                    inputs=self._eval_metrics.initial_values,
                    name='train_loop')
                # Final metrics are the avg across self._steps_per_loop steps.
                return self._eval_metrics.FinalizeMetrics(loop_result)

            batch_parallel_res = tf.tpu.batch_parallel(
                TpuTrain,
                num_shards=data_parallelism,
                device_assignment=py_utils.GetTpuDeviceAssignment())
            outfeed_dequeue_op = self._OutfeedDequeueLoop(
                self._model.GetTask().per_example_tensors,
                self._steps_per_loop, self.num_splits_per_client)
            # Get metric results from a single replica; they are all the same here.
            self.tpu_ops = [[t[0] for t in batch_parallel_res],
                            outfeed_dequeue_op]

            # TODO(blee): This is going to need to be fixed for multiple-model
            # execution. Need to get only the vars associated with the model.
            self._checkpointer = self._CreateCheckpointer(
                self._checkpoint_dir, self._model)
        return self.tpu_ops
Example #14
    def BuildTpuSubgraph(self):
        tf.logging.info('EvalProgram BuildTpuSubGraph')
        with py_utils.OpportunisticVariableReuseScope(True):
            self._eval_metrics = metrics.TpuEvalMetrics()
            data_parallelism = self.data_parallelism

            self._input = self._task_params.input.Instantiate()
            self._input.CreateTpuEnqueueOps()
            self.SkipCreateChild(self._task_params)

            def TpuEvalStep(*args):
                """Eval a shard of a batch on a single TPU core.

                Args:
                  *args: metrics values from previous steps.

                Returns:
                  Summed eval metrics.
                """
                with cluster_factory.SetEval(True):
                    self._model = self._task_params.Instantiate()
                    self._task = self._model.GetTask()
                    self._task.AddChild('input', self._input)

                    self._model.ConstructFPropGraph()
                    per_step_eval_metrics = self._eval_metrics.SetMetrics(
                        self._task.eval_metrics, args)
                    summed_metrics = []
                    for x, y in zip(per_step_eval_metrics, args):
                        summed_metrics.append(x + y)
                    return summed_metrics

            @tpu_function.on_device_training_loop
            def TpuEval():
                loop_result = tpu_training_loop.repeat(
                    self._steps_per_loop,
                    TpuEvalStep,
                    inputs=self._eval_metrics.initial_values,
                    name='eval_loop')
                # Final metrics are the avg across self._steps_per_loop steps.
                return self._eval_metrics.FinalizeMetrics(loop_result)

            self._compile_op, batch_parallel_res = tpu.split_compile_and_shard(
                TpuEval,
                num_shards=data_parallelism,
                device_assignment=py_utils.GetTpuDeviceAssignment())
            # Get metric results from a single replica; they are all the same here.
            self.tpu_ops = [[t[0] for t in batch_parallel_res]]

            return self.tpu_ops
Example #15
 def _DecodeFn():
   """Decode call to be compiled for TPU."""
   with py_utils.OpportunisticVariableReuseScope(True):
     with cluster_factory.SetEval(True):
       self._model = self._task_params.Instantiate()
       self._model_task = self._model.GetTask()
       if py_utils.use_tpu():
         input_batch = self._model_task.input_generator.CreateTpuFeeds()
       else:
         input_batch = self._model_task.input_generator.SplitInputBatch(
             self.cluster.num_splits_per_client)
       metrics_dict = self._model_task.Decode(input_batch)
       self.metrics_nm = py_utils.NestedMap(metrics_dict)
       return self.metrics_nm.Flatten()
Example #16
            def TpuEvalStep(*args):
                """Eval a shard of a batch on a single TPU core.

                Args:
                  *args: metrics values from previous steps.

                Returns:
                  Summed eval metrics.
                """
                with py_utils.OpportunisticVariableReuseScope(True):
                    self._model.InstantiateVariables()
                    self._model.ConstructFPropGraph()
                per_step_eval_metrics = self._eval_metrics.SetMetrics(
                    self._task.eval_metrics, args)
                summed_metrics = []
                for x, y in zip(per_step_eval_metrics, args):
                    summed_metrics.append(x + y)
                return summed_metrics
Example #17
    def BuildTpuSubgraph(self):
        tf.logging.info('EvalProgram BuildTpuSubGraph')
        with py_utils.OpportunisticVariableReuseScope(True):
            self._eval_metrics = metrics.TpuEvalMetrics()
            data_parallelism = self.data_parallelism

            def TpuEvalStep(*args):
                """Eval a shard of a batch on a single TPU core.

                Args:
                  *args: metrics values from previous steps.

                Returns:
                  Per-step eval metrics.
                """
                self._model = self._task_params.Instantiate()
                self._model.ConstructFPropGraph()
                per_step_eval_metrics = self._eval_metrics.SetMetrics(
                    self._model.GetTask().eval_metrics, args)
                return per_step_eval_metrics

            @tpu_function.on_device_training_loop
            def TpuEval():
                loop_result = tpu_training_loop.repeat(
                    self._steps_per_loop,
                    TpuEvalStep,
                    inputs=self._eval_metrics.initial_values,
                    name='eval_loop')
                # Final metrics are the avg across self._steps_per_loop steps.
                return self._eval_metrics.FinalizeMetrics(loop_result)

            batch_parallel_res = tf.tpu.batch_parallel(
                TpuEval,
                num_shards=data_parallelism,
                device_assignment=py_utils.GetTpuDeviceAssignment())
            # Get metric results from a single replica; they are all the same here.
            self.tpu_ops = [[t[0] for t in batch_parallel_res]]
            self._checkpointer = checkpointer.Checkpointer(
                self._checkpoint_dir, self._model)

            return self.tpu_ops
Example #18
        def TpuTrainStep(*args):
            """Train a shard of a batch on a single TPU core.

            Args:
              *args: metrics values from previous steps.

            Returns:
              New summed metrics values and a train_op.
            """
            with py_utils.OpportunisticVariableReuseScope(True):
                self._model.InstantiateVariables()
                self._model.ConstructFPropBPropGraph()
            per_step_eval_metrics = self._eval_metrics.SetMetrics(
                self._task.eval_metrics, args)
            outfeed_op = self._OutfeedEnqueue(self._task.per_example_tensors)
            summed_metrics = []
            assert len(per_step_eval_metrics) == len(args)
            with tf.control_dependencies([outfeed_op]):
                for x, y in zip(per_step_eval_metrics, args):
                    summed_metrics.append(x + y)
            return summed_metrics + [self._task.train_op]
Example #19
    def BuildTpuSubgraph(self):
        tf.logging.info('DecodeProgram BuildTpuSubGraph')
        py_utils.ResetStepSeed()
        device_assignment = py_utils.GetTpuDeviceAssignment()
        self.spmd = self._task_params.input.use_partitioned_infeed_queue
        with py_utils.OpportunisticVariableReuseScope(True):
            with cluster_factory.SetEval(True):
                self._model = self._task_params.Instantiate()
                self._model_task = self._model.GetTask()
                self._model_task.input.CreateTpuEnqueueOps()

                def _DecodeStep():
                    """Decode call to be compiled for TPU."""
                    input_batch = self._model_task.input_generator.TpuDequeueBatch(
                    )
                    metrics_dict = self._model_task.Decode(input_batch)
                    self.metrics_nm = py_utils.NestedMap(metrics_dict)
                    device = tpu.core(0) if self.spmd else ''
                    with tf.device(device):
                        outfeed_enqueue = tpu_ops.outfeed_enqueue_tuple(
                            self.metrics_nm.Flatten())
                        return [outfeed_enqueue]

        @tpu_function.on_device_training_loop
        def DecodeLoopFn():
            return tpu_training_loop.repeat(self._steps_per_loop,
                                            _DecodeStep,
                                            inputs=[])

        self._compile_op, self.decode_loop = tpu.split_compile_and_shard(
            DecodeLoopFn,
            num_shards=self.data_parallelism,
            device_assignment=device_assignment)
        # Get a list of outfeed ops.
        self.metrics = self._OutfeedDequeue()
        # Pack the list of outfeed ops with structure in self.metrics_nm.
        self.metrics = tf.nest.pack_sequence_as(self.metrics_nm, self.metrics)
        return
Example #20
 def __init__(self, params):
     # Enable variable sharing.
     p = params
     with py_utils.OpportunisticVariableReuseScope():
         with py_utils.VariableRenameScope(p.variable_renaming_rules):
             super(RegExSharedVariableModel, self).__init__(params)
Example #21
    def BuildTpuSubgraph(self):
        if self._ml_perf_log:
            mlp_log.mlperf_print('global_batch_size',
                                 self._ml_perf.global_batch_size)
            mlp_log.mlperf_print('max_sequence_length',
                                 self._ml_perf.max_sequence_length)
            mlp_log.mlperf_print('opt_name', self._ml_perf.optimizer_name)
            mlp_log.mlperf_print('opt_base_learning_rate',
                                 self._ml_perf.base_learning_rate)
            mlp_log.mlperf_print('opt_learning_rate_warmup_steps',
                                 self._ml_perf.warmup_steps)

        with py_utils.OpportunisticVariableReuseScope(True):
            self._eval_metrics = metrics.TpuEvalMetrics()
            data_parallelism = self.data_parallelism

            def TpuTrainStep():
                """Train a shard of a batch on a single TPU core.

                Do not calculate loss metrics.

                Returns:
                  [train_op].
                """
                self._train_model = self._train_task_params.Instantiate()
                self._model = self._train_model
                self._train_model.ConstructFPropBPropGraph()
                return [self._train_model.GetTask().train_op]

            def TpuTrain():
                loop_result = tpu_training_loop.repeat(
                    self._train_steps_per_loop,
                    TpuTrainStep,
                    inputs=[],
                    name='train_loop')
                return loop_result

        py_utils.ResetStepSeed()

        def _DecodeFn():
            """Decode call to be compiled for TPU."""
            with py_utils.OpportunisticVariableReuseScope(True):
                with cluster_factory.SetEval(True):
                    self._decode_model = self._decode_task_params.Instantiate()
                    self._decode_model_task = self._decode_model.GetTask()
                    if py_utils.use_tpu():
                        input_batch = self._decode_model_task.input_generator.CreateTpuFeeds(
                        )
                    else:
                        input_batch = self._decode_model_task.input_generator.SplitInputBatch(
                            self.cluster.num_splits_per_client)
                    metrics_dict = self._decode_model_task.Decode(input_batch)
                    self.metrics_nm = py_utils.NestedMap(metrics_dict)
                    return self.metrics_nm.Flatten()

        @tpu_function.on_device_training_loop
        def TrainAndDecode():
            with tf.control_dependencies([TpuTrain()]):
                return _DecodeFn()

        self._compile_op, batch_parallel_res = tpu.split_compile_and_shard(
            TrainAndDecode,
            num_shards=data_parallelism,
            device_assignment=py_utils.GetTpuDeviceAssignment())

        self.metrics = py_utils.NestedMap(self.metrics_nm)
        self.metrics = self.metrics.Pack(batch_parallel_res)
        return None
Example #22
 def __init__(self, params):
     # Enable variable sharing.
     with py_utils.OpportunisticVariableReuseScope():
         with py_utils.VariableRenameScope(params.variable_renaming_rules):
             super().__init__(params)
Example #23
    def BuildTpuSubgraph(self):
        tf.logging.info('TrainProgram BuildTpuSubGraph')

        with py_utils.OpportunisticVariableReuseScope(True):
            self._eval_metrics = metrics.TpuEvalMetrics()
            data_parallelism = self.data_parallelism

            # Instantiate input generator first.
            self._input = self._task_params.input.Instantiate()
            self._input.CreateTpuEnqueueOps()
            self.SkipCreateChild(self._task_params)

            def TpuTrainStep(*args):
                """Train a shard of a batch on a single TPU core.

                Args:
                  *args: metrics values from previous steps.

                Returns:
                  New summed metrics values and a train_op.
                """
                self._model = self._task_params.Instantiate()
                self._task = self._model.GetTask()
                self._task.AddChild('input', self._input)
                self._model.ConstructFPropBPropGraph()
                per_step_eval_metrics = self._eval_metrics.SetMetrics(
                    self._task.eval_metrics, args)
                outfeed_op = self._OutfeedEnqueue(
                    self._task.per_example_tensors)
                summed_metrics = []
                assert len(per_step_eval_metrics) == len(args)
                with tf.control_dependencies([outfeed_op]):
                    for x, y in zip(per_step_eval_metrics, args):
                        summed_metrics.append(x + y)
                return summed_metrics + [self._task.train_op]

            @tpu_function.on_device_training_loop
            def TpuTrain():
                loop_result = tpu_training_loop.repeat(
                    self._steps_per_loop,
                    TpuTrainStep,
                    inputs=self._eval_metrics.initial_values,
                    name='train_loop')
                # Final metrics are the avg across self._steps_per_loop steps.
                return self._eval_metrics.FinalizeMetrics(loop_result)

            self._compile_op, batch_parallel_res = tpu.split_compile_and_shard(
                TpuTrain,
                num_shards=data_parallelism,
                device_assignment=py_utils.GetTpuDeviceAssignment())
            outfeed_dequeue_op = self._OutfeedDequeueLoop(
                self._task.per_example_tensors, self._steps_per_loop,
                self.num_splits_per_client)

            # Get metric results from a single replica; they are all the same here.

            def _ConstructPostTrainingLoop(train_loop_op, outfeed_dequeue_op):
                """Returns the op for tpu training with tail cpu computation."""
                # Adds a tail computation that runs after the tpu_training loop
                # step finishes. This allows us to run computation that acts on
                # the variables between tpu_train_loop iterations, amortizing
                # the cost of those operations. The alternative of running
                # tpu.outside_compilation & using tf.cond is expensive.
                with tf.control_dependencies(train_loop_op):
                    self._model.ConstructPostTrainingLoop()
                    with tf.control_dependencies(
                        [self._task.post_training_loop_op]):
                        return ([[tf.identity(o) for o in train_loop_op],
                                 outfeed_dequeue_op])

            # Get metric results from a single replica; they are all the same here.
            all_tpu_ops = [t[0] for t in batch_parallel_res]
            self.tpu_ops = (_ConstructPostTrainingLoop(all_tpu_ops,
                                                       outfeed_dequeue_op))

        return self.tpu_ops
Example #24
 def CreateVariables(self):
     # Enable variable sharing.
     with py_utils.OpportunisticVariableReuseScope():
         with py_utils.VariableRenameScope(
                 self.params.variable_renaming_rules):
             super().CreateVariables()