Example #1
  def training_loop(self, train_ds, test_ds):
    """Custom training and testing loop.

    Args:
      train_ds: Training dataset
      test_ds: Testing dataset

    Returns:
      train_loss, test_loss
    """

    if self.enable_function:
      self.train_step = tf.function(self.train_step)
      self.test_step = tf.function(self.test_step)

    template = 'Epoch: {}, Train Loss: {}, Test Loss: {}'

    for epoch in range(self.epochs):
      self.train_loss_metric.reset_states()
      self.test_loss_metric.reset_states()

      for inp, targ in train_ds:
        self.train_step((inp, targ))

      for inp_test, targ_test in test_ds:
        self.test_step((inp_test, targ_test))

      print(template.format(epoch,
                            self.train_loss_metric.result().numpy(),
                            self.test_loss_metric.result().numpy()))

    return (self.train_loss_metric.result().numpy(),
            self.test_loss_metric.result().numpy())
Example #2
 def test_tf_saved_model_save_multiple_signatures(self):
   base_path = os.path.join(self.get_temp_dir(), 'tf_saved_model_save')
   export_path = os.path.join(base_path, '00000123')
   root = tf.train.Checkpoint()
   root.f = tf.function(lambda x: {'y': 1.},
                        input_signature=[tf.TensorSpec(None, tf.float32)])
   root.g = tf.function(lambda x: {'y': 2.},
                        input_signature=[tf.TensorSpec(None, tf.float32)])
   tf.saved_model.experimental.save(
       root, export_path,
       signatures={
           signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: root.f,
           'custom_signature_key': root.g})
   _, model_server_address, _ = TensorflowModelServerTest.RunServer(
       'default', base_path)
   expected_version = self._GetModelVersion(base_path)
   self.VerifyPredictRequest(
       model_server_address,
       expected_output=2.0,
       expected_version=expected_version,
       signature_name='custom_signature_key')
   self.VerifyPredictRequest(
       model_server_address,
       expected_output=1.0,
       expected_version=expected_version)
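   # For reference (a sketch, not part of the original test): in TF 2.x the
   # stable tf.saved_model.save API accepts the same signatures dictionary.
   # tf.saved_model.save(
   #     root, export_path,
   #     signatures={
   #         tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY: root.f,
   #         'custom_signature_key': root.g})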
Example #3
  def custom_loop(self, train_iterator, test_iterator,
                  num_train_steps_per_epoch, num_test_steps_per_epoch,
                  strategy):
    """Custom training and testing loop.

    Args:
      train_iterator: Training iterator created using strategy
      test_iterator: Testing iterator created using strategy
      num_train_steps_per_epoch: number of training steps in an epoch.
      num_test_steps_per_epoch: number of test steps in an epoch.
      strategy: Distribution strategy

    Returns:
      train_loss, train_accuracy, test_loss, test_accuracy
    """

    # this code is expected to change.
    def distributed_train():
      return strategy.experimental_run(
          self.train_step, train_iterator)

    def distributed_test():
      return strategy.experimental_run(
          self.test_step, test_iterator)

    if self.enable_function:
      distributed_train = tf.function(distributed_train)
      distributed_test = tf.function(distributed_test)

    for epoch in range(self.epochs):
      self.optimizer.learning_rate = self.decay(epoch)

      train_iterator.initialize()
      for _ in range(num_train_steps_per_epoch):
        distributed_train()

      test_iterator.initialize()
      for _ in range(num_test_steps_per_epoch):
        distributed_test()

      template = ('Epoch: {}, Train Loss: {}, Train Accuracy: {}, '
                  'Test Loss: {}, Test Accuracy: {}')

      print(
          template.format(epoch, self.train_loss_metric.result(),
                          self.train_acc_metric.result(),
                          self.test_loss_metric.result(),
                          self.test_acc_metric.result()))

      if epoch != self.epochs - 1:
        self.train_loss_metric.reset_states()
        self.train_acc_metric.reset_states()
        self.test_loss_metric.reset_states()
        self.test_acc_metric.reset_states()

    return (self.train_loss_metric.result().numpy(),
            self.train_acc_metric.result().numpy(),
            self.test_loss_metric.result().numpy(),
            self.test_acc_metric.result().numpy())
Example #4
def main(argv):
  del argv

  root = tf.train.Checkpoint()
  # Create a cell and attach to our checkpointable.
  root.rnn_cell = tf.keras.layers.LSTMCell(units=10, recurrent_initializer=None)

  # Wrap the rnn_cell.__call__ function and assign to next_state.
  root.next_state = tf.function(root.rnn_cell.__call__, autograph=False)

  # Wrap rnn_cell.get_initial_state using a decorator and assign it to an
  # attribute with the same name.
  @tf.function(input_signature=[tf.TensorSpec([None, None], tf.float32)])
  def get_initial_state(tensor):
    return root.rnn_cell.get_initial_state(tensor, None, None)

  root.get_initial_state = get_initial_state

  # Construct an initial_state, then call next_state explicitly to trigger a
  # trace for serialization (we need an explicit call, because next_state has
  # not been annotated with an input_signature).
  initial_state = root.get_initial_state(
      tf.constant(np.random.uniform(size=[3, 10]).astype(np.float32)))
  root.next_state(
      tf.constant(np.random.uniform(size=[3, 19]).astype(np.float32)),
      initial_state)

  tf.saved_model.save(root, FLAGS.export_dir)
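  # Hedged sketch (not part of the original example): load the SavedModel back
  # and call the exported functions, using the same shapes as the traces above.
  loaded = tf.saved_model.load(FLAGS.export_dir)
  initial_state = loaded.get_initial_state(tf.zeros([3, 10], tf.float32))
  output_and_state = loaded.next_state(tf.zeros([3, 19], tf.float32),
                                       initial_state)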
Example #5
  def train(self, dataset, checkpoint_pr):
    """Train the GAN for x number of epochs.

    Args:
      dataset: train dataset.
      checkpoint_pr: prefix in which the checkpoints are stored.

    Returns:
      Time for each epoch.
    """
    time_list = []
    if self.enable_function:
      self.train_step = tf.function(self.train_step)

    for epoch in range(self.epochs):
      start_time = time.time()
      for input_image, target_image in dataset:
        gen_loss, disc_loss = self.train_step(input_image, target_image)

      wall_time_sec = time.time() - start_time
      time_list.append(wall_time_sec)

      # saving (checkpoint) the model every 20 epochs
      if (epoch + 1) % 20 == 0:
        self.checkpoint.save(file_prefix=checkpoint_pr)

      template = 'Epoch {}, Generator loss {}, Discriminator Loss {}'
      print(template.format(epoch, gen_loss, disc_loss))

    return time_list
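  # Hedged sketch (not part of the original class): one possible pix2pix-style
  # train_step matching the loop above. The generator/discriminator models,
  # their optimizers and the loss helpers are assumed attributes of `self`.
  def train_step(self, input_image, target_image):
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
      gen_output = self.generator(input_image, training=True)
      disc_real = self.discriminator([input_image, target_image], training=True)
      disc_fake = self.discriminator([input_image, gen_output], training=True)
      gen_loss = self.generator_loss(disc_fake, gen_output, target_image)
      disc_loss = self.discriminator_loss(disc_real, disc_fake)
    gen_grads = gen_tape.gradient(gen_loss,
                                  self.generator.trainable_variables)
    disc_grads = disc_tape.gradient(disc_loss,
                                    self.discriminator.trainable_variables)
    self.generator_optimizer.apply_gradients(
        zip(gen_grads, self.generator.trainable_variables))
    self.discriminator_optimizer.apply_gradients(
        zip(disc_grads, self.discriminator.trainable_variables))
    return gen_loss, disc_loss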
Example #6
  def custom_loop(self, train_dataset, test_dataset):
    """Custom training and testing loop.

    Args:
      train_dataset: Training dataset
      test_dataset: Testing dataset

    Returns:
      train_loss, train_accuracy, test_loss, test_accuracy
    """
    if self.enable_function:
      self.train_step = tf.function(self.train_step)
      self.test_step = tf.function(self.test_step)

    for epoch in range(self.epochs):
      self.optimizer.learning_rate = self.decay(epoch)

      for image, label in train_dataset:
        self.train_step(image, label)

      for test_image, test_label in test_dataset:
        self.test_step(test_image, test_label)

      template = ('Epoch: {}, Train Loss: {}, Train Accuracy: {}, '
                  'Test Loss: {}, Test Accuracy: {}')

      print(
          template.format(epoch, self.train_loss_metric.result(),
                          self.train_acc_metric.result(),
                          self.test_loss_metric.result(),
                          self.test_acc_metric.result()))

      if epoch != self.epochs - 1:
        self.train_loss_metric.reset_states()
        self.train_acc_metric.reset_states()
        self.test_loss_metric.reset_states()
        self.test_acc_metric.reset_states()

    return (self.train_loss_metric.result().numpy(),
            self.train_acc_metric.result().numpy(),
            self.test_loss_metric.result().numpy(),
            self.test_acc_metric.result().numpy())
Example #7
 def test_tf_saved_model_save(self):
   base_path = os.path.join(self.get_temp_dir(), 'tf_saved_model_save')
   export_path = os.path.join(base_path, '00000123')
   root = tf.train.Checkpoint()
   root.v1 = tf.Variable(3.)
   root.v2 = tf.Variable(2.)
   root.f = tf.function(
       lambda x: {'y': root.v1 * root.v2 * x})
   to_save = root.f.get_concrete_function(tf.TensorSpec(None, tf.float32))
   tf.saved_model.experimental.save(root, export_path, to_save)
   _, model_server_address, _ = TensorflowModelServerTest.RunServer(
       'default', base_path)
   expected_version = self._GetModelVersion(base_path)
   self.VerifyPredictRequest(
       model_server_address,
       expected_output=12.0,
       specify_output=False,
       expected_version=expected_version)
Example #8
    def _check_sharding_annotations(self,
                                    f_jax,
                                    args: Sequence[Any],
                                    *,
                                    expected: Sequence[str],
                                    expected_opt: Sequence[str],
                                    num_partitions=2):
        """Check expected patterns in the HLO generated from f_jax and its conversion.

    We run this check on CPU also, which is useful for debugging locally.
    We currently check the unoptimized HLO against `expected` on CPU and TPU,
    and we check the optimized HLO against `expected_opt` on TPU only and
    only for JAX.

    See `self.AssertShardingAnnotations` for documentation of `expected`
    and `expected_opt`.
    """
        if jtu.device_under_test() == "gpu":
            raise unittest.SkipTest("Sharding HLO tests not useful for GPU")

        jax_comp = jax.xla_computation(f_jax)(*args)
        jax_hlo = jax_comp.as_hlo_text()
        if LOG_HLO:
            logging.info("[%s] got JAX HLO %s", self._testMethodName, jax_hlo)
        self.AssertShardingAnnotations("JAX before optimizations", jax_hlo,
                                       expected)

        if jtu.device_under_test() == "tpu":
            backend = jax._src.lib.xla_bridge.get_backend()
            num_replicas = 1
            device_assignment = np.arange(num_partitions * num_replicas)
            device_assignment = np.reshape(device_assignment,
                                           (-1, num_partitions))
            use_spmd_partitioning = num_partitions > 1
            compile_options = jax._src.lib.xla_bridge.get_compile_options(
                num_replicas=num_replicas,
                num_partitions=num_partitions,
                device_assignment=device_assignment,
                use_spmd_partitioning=use_spmd_partitioning,
            )
            jax_optimized_hlo = backend.compile(
                jax_comp, compile_options).hlo_modules()[0].to_string()
            if LOG_HLO:
                logging.info("[%s] got JAX optimized HLO for platform %s %s",
                             self._testMethodName, backend.platform,
                             jax_optimized_hlo)
            self.AssertShardingAnnotations("JAX after optimizations",
                                           jax_optimized_hlo, expected_opt)

        f_tf = jax2tf.convert(f_jax)
        device_name = f"/device:{jtu.device_under_test().upper()}:0"
        tf_hlo = (tf.function(f_tf, jit_compile=True,
                              autograph=False).experimental_get_compiler_ir(
                                  *args)(stage="hlo", device_name=device_name))
        if LOG_HLO:
            logging.info("[%s] got TF HLO %s", self._testMethodName, tf_hlo)
        self.AssertShardingAnnotations("TF before optimizations", tf_hlo,
                                       expected)
        tf_optimized_hlo = (tf.function(
            f_tf, jit_compile=True).experimental_get_compiler_ir(*args)(
                stage="optimized_hlo", device_name=device_name))
        if LOG_HLO:
            logging.info("[%s] got TF optimized HLO for %s: %s",
                         self._testMethodName, device_name, tf_optimized_hlo)
Example #9
 def test_is_defun(self):
   self.assertTrue(function_utils.is_defun(tf.function(lambda x: None)))
   fn = tf.function(lambda x: None, (tf.TensorSpec(None, tf.int32),))
   self.assertTrue(function_utils.is_defun(fn))
   self.assertFalse(function_utils.is_defun(lambda x: None))
   self.assertFalse(function_utils.is_defun(None))
Example #10
def export_saved_model(model: tf.keras.Model,
                       input_shape: Tuple[int, int, int, int, int],
                       export_path: str = '/tmp/movinet/',
                       causal: bool = False,
                       bundle_input_init_states_fn: bool = True,
                       checkpoint_path: Optional[str] = None) -> None:
    """Exports a MoViNet model to a saved model.

  Args:
    model: the tf.keras.Model to export.
    input_shape: The 5D spatiotemporal input shape of size
      [batch_size, num_frames, image_height, image_width, num_channels].
      Set the field or a shape position in the field to None for dynamic input.
    export_path: Export path to save the saved_model file.
    causal: Run the model in causal mode.
    bundle_input_init_states_fn: Add init_states as a function signature to the
      saved model. This is not necessary if the input shape is static (e.g.,
      for TF Lite).
    checkpoint_path: Checkpoint path to load. Leave blank to keep the model's
      initialization.
  """

    # Use dimensions of 1 for everything except the channels to export faster,
    # since we only really need the last dimension to build and get the output
    # states. These dimensions can be set to `None` once the model is built.
    input_shape_concrete = [1 if s is None else s for s in input_shape]
    model.build(input_shape_concrete)

    # Compile model to generate some internal Keras variables.
    model.compile()

    if checkpoint_path:
        checkpoint = tf.train.Checkpoint(model=model)
        status = checkpoint.restore(checkpoint_path)
        status.assert_existing_objects_matched()

    if causal:
        # Call the model once to get the output states. Call again with the
        # `states` input to ensure that the layers taking the `states` argument
        # are built with the full output state shapes.
        input_image = tf.ones(input_shape_concrete)
        _, states = model({
            **model.init_states(input_shape_concrete), 'image':
            input_image
        })
        _ = model({**states, 'image': input_image})

        # Create a function to explicitly set the names of the outputs
        def predict(inputs):
            outputs, states = model(inputs)
            return {**states, 'logits': outputs}

        specs = {
            name: tf.TensorSpec(spec.shape, name=name, dtype=spec.dtype)
            for name, spec in model.initial_state_specs(input_shape).items()
        }
        specs['image'] = tf.TensorSpec(input_shape,
                                       dtype=model.dtype,
                                       name='image')

        predict_fn = tf.function(predict, jit_compile=True)
        predict_fn = predict_fn.get_concrete_function(specs)

        init_states_fn = tf.function(model.init_states, jit_compile=True)
        init_states_fn = init_states_fn.get_concrete_function(
            tf.TensorSpec([5], dtype=tf.int32))

        if bundle_input_init_states_fn:
            signatures = {'call': predict_fn, 'init_states': init_states_fn}
        else:
            signatures = predict_fn

        tf.keras.models.save_model(model, export_path, signatures=signatures)
    else:
        _ = model(tf.ones(input_shape_concrete))
        tf.keras.models.save_model(model, export_path)
Example #11
    def convert_and_save_model(
            jax_fn: tp.Callable[[tp.Any, tp.Any], tp.Any],
            params,
            model_dir: str,
            *,
            input_signatures: tp.Sequence[tf.TensorSpec],
            shape_polymorphic_input_spec: tp.Optional[str] = None,
            with_gradient: bool = False,
            enable_xla: bool = True,
            compile_model: bool = True,
            save_model_options: tp.Optional[
                tf.saved_model.SaveOptions] = None):
        """Convert a JAX function and saves a SavedModel.
        This is an example, for serious uses you will likely want to copy and
        expand it as needed (see note at the top of the model).
        Use this function if you have a trained ML model that has both a prediction
        function and trained parameters, which you want to save separately from the
        function graph as variables (e.g., to avoid limits on the size of the
        GraphDef, or to enable fine-tuning.) If you don't have such parameters,
        you can still use this library function but probably don't need it
        (see jax2tf/README.md for some simple examples).
        In order to use this wrapper you must first convert your model to a function
        with two arguments: the parameters and the input on which you want to do
        inference. Both arguments may be np.ndarray or (nested)
        tuples/lists/dictionaries thereof.
        See the README.md for a discussion of how to prepare Flax and Haiku models.
        Args:
        jax_fn: a JAX function taking two arguments, the parameters and the inputs.
            Both arguments may be (nested) tuples/lists/dictionaries of np.ndarray.
        params: the parameters, to be used as first argument for `jax_fn`. These
            must be (nested) tuples/lists/dictionaries of np.ndarray, and will be
            saved as the variables of the SavedModel.
        model_dir: the directory where the model should be saved.
        input_signatures: the input signatures for the second argument of `jax_fn`
            (the input). A signature must be a `tensorflow.TensorSpec` instance, or a
            (nested) tuple/list/dictionary thereof with a structure matching the
            second argument of `jax_fn`. The first input_signature will be saved as
            the default serving signature. The additional signatures will be used
            only to ensure that the `jax_fn` is traced and converted to TF for the
            corresponding input shapes.
        shape_polymorphic_input_spec: if given then it will be used as the
            `in_shapes` argument to jax2tf.convert for the second parameter of
            `jax_fn`. In this case, a single `input_signatures` is supported, and
            should have `None` in the polymorphic dimensions. Should be a string, or a
            (nested) tuple/list/dictionary thereof with a structure matching the
            second argument of `jax_fn`.
        with_gradient: whether the SavedModel should support gradients. If True,
            then a custom gradient is saved. If False, then a
            tf.raw_ops.PreventGradient is saved to error if a gradient is attempted.
            (At the moment due to a bug in SavedModel, custom gradients are not
            supported.)
        enable_xla: whether the jax2tf converter is allowed to use TFXLA ops. If
            False, the conversion tries harder to use purely TF ops and raises an
            exception if it is not possible. (default: True)
        compile_model: use TensorFlow jit_compiler on the SavedModel. This
            is needed if the SavedModel will be used for TensorFlow serving.
        save_model_options: options to pass to savedmodel.save.
        """

        if not input_signatures:
            raise ValueError("At least one input_signature must be given")
        if shape_polymorphic_input_spec is not None:
            if len(input_signatures) > 1:
                raise ValueError("For shape-polymorphic conversion a single "
                                 "input_signature is supported.")
        tf_fn = jax2tf.convert(
            jax_fn,
            with_gradient=with_gradient,
            in_shapes=[None, shape_polymorphic_input_spec],
            enable_xla=enable_xla,
        )

        # Create tf.Variables for the parameters. If you want more useful variable
        # names, you can use `tree.map_structure_with_path` from the `dm-tree` package
        param_vars = tf.nest.map_structure(
            # Due to a bug in SavedModel it is not possible to use tf.GradientTape on
            # a function converted with jax2tf and loaded from SavedModel. Thus, we
            # mark the variables as non-trainable to ensure that users of the
            # SavedModel will not try to fine tune them.
            lambda param: tf.Variable(param, trainable=with_gradient),
            params,
        )
        tf_fun = tf.function(
            lambda inputs: tf_fn(param_vars, inputs),
            autograph=False,
            experimental_compile=compile_model,
        )

        signatures = {}
        # This signature is needed for TensorFlow Serving use.
        signatures[
            tf.saved_model.
            DEFAULT_SERVING_SIGNATURE_DEF_KEY] = tf_fun.get_concrete_function(
                input_signatures[0])

        for input_signature in input_signatures[1:]:
            # If there are more signatures, trace and cache a TF function for each one
            tf_fun.get_concrete_function(input_signature)

        wrapper = _ReusableSavedModelWrapper(tf_fun, param_vars)
        tf.saved_model.save(wrapper,
                            model_dir,
                            signatures=signatures,
                            options=save_model_options)
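# Hedged usage sketch (not part of the original function): converting a toy JAX
# prediction function. The parameter values, shapes and export path are
# illustrative only.
import numpy as np
import jax.numpy as jnp

toy_params = {'w': np.zeros((3, 1), np.float32),
              'b': np.zeros((1,), np.float32)}
toy_predict = lambda p, x: jnp.dot(x, p['w']) + p['b']
convert_and_save_model(
    toy_predict, toy_params, '/tmp/jax2tf_example',
    input_signatures=[tf.TensorSpec([None, 3], tf.float32)])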
Example #12
def run_customized_training_loop(
        # pylint: disable=invalid-name
        _sentinel=None,
        # pylint: enable=invalid-name
        strategy=None,
        model_fn=None,
        loss_fn=None,
        scale_loss=True,
        model_dir=None,
        train_input_fn=None,
        steps_per_epoch=None,
        steps_per_loop=1,
        epochs=1,
        eval_input_fn=None,
        eval_steps=None,
        metric_fn=None,
        init_checkpoint=None,
        custom_callbacks=None,
        run_eagerly=False,
        sub_model_export_name=None,
        explicit_allreduce=False,
        pre_allreduce_callbacks=None,
        post_allreduce_callbacks=None,
        train_summary_interval=0):
    """Run BERT pretrain model training using low-level API.

  Arguments:
      _sentinel: Used to prevent positional parameters. Internal, do not use.
      strategy: Distribution strategy on which to run low level training loop.
      model_fn: Function that returns a tuple (model, sub_model). Caller of this
        function should add optimizer to the `model` via calling
        `model.compile()` API or manually setting `model.optimizer` attribute.
        Second element of the returned tuple (sub_model) is an optional sub
        model to be used for loading the initial checkpoint, if provided.
      loss_fn: Function with signature func(labels, logits) that returns a loss
        tensor.
      scale_loss: Whether to divide the raw loss by number of replicas before
        gradients calculation.
      model_dir: Model directory used during training for restoring/saving model
        weights.
      train_input_fn: Function that returns a tf.data.Dataset used for training.
      steps_per_epoch: Number of steps to run per epoch. At the end of each
        epoch, model checkpoint will be saved and evaluation will be conducted
        if evaluation dataset is provided.
      steps_per_loop: Number of steps per graph-mode loop. In order to reduce
        communication in eager context, training logs are printed every
        `steps_per_loop` steps.
      epochs: Number of epochs to train.
      eval_input_fn: Function that returns evaluation dataset. If none,
        evaluation is skipped.
      eval_steps: Number of steps to run evaluation. Required if `eval_input_fn`
        is not none.
      metric_fn: A function that returns a Keras Metric object used to record
        evaluation results on the evaluation dataset (or on the training
        dataset) after every epoch.
      init_checkpoint: Optional checkpoint to load to `sub_model` returned by
        `model_fn`.
      custom_callbacks: A list of Keras Callbacks objects to run during
        training. More specifically, `on_batch_begin()`, `on_batch_end()`,
        `on_epoch_begin()`, `on_epoch_end()` methods are invoked during
        training.  Note that some metrics may be missing from `logs`.
      run_eagerly: Whether to run model training in pure eager execution. This
        should be disabled for TPUStrategy.
      sub_model_export_name: If not None, will export `sub_model` returned by
        `model_fn` into checkpoint files. The name of intermediate checkpoint
        file is {sub_model_export_name}_step_{step}.ckpt and the last
        checkpoint's name is {sub_model_export_name}.ckpt;
        if None, `sub_model` will not be exported as checkpoint.
      explicit_allreduce: Whether to explicitly perform gradient allreduce,
        instead of relying on implicit allreduce in optimizer.apply_gradients().
        default is False. For now, if training using FP16 mixed precision,
        explicit allreduce will aggregate gradients in FP16 format. For TPU and
        GPU training using FP32, explicit allreduce will aggregate gradients in
        FP32 format.
      pre_allreduce_callbacks: A list of callback functions that take gradient
        and model variable pairs as input, manipulate them, and return new
        gradient and model variable pairs. The callback functions will be
        invoked in the list order and before gradients are allreduced.
        With mixed precision training, the pre_allreduce_callbacks will be
        applied to scaled_gradients. Default is no callbacks.
        Only used when explicit_allreduce=True.
      post_allreduce_callbacks: A list of callback functions that take gradient
        and model variable pairs as input, manipulate them, and return new
        gradient and model variable pairs. The callback functions will be
        invoked in the list order and right before gradients are applied to
        variables for updates. Default is no callbacks. Only used when
        explicit_allreduce=True.
      train_summary_interval: Step interval for training summaries. If the value
        is a negative number, then training summaries are not enabled.

  Returns:
      Trained model.

  Raises:
      ValueError: (1) When model returned by `model_fn` does not have optimizer
        attribute or when required parameters are set to none. (2) eval args are
        not specified correctly. (3) metric_fn must be a callable if specified.
        (4) sub_model_checkpoint_name is specified, but `sub_model` returned
        by `model_fn` is None.
  """

    if _sentinel is not None:
        raise ValueError('only call `run_customized_training_loop()` '
                         'with named arguments.')

    required_arguments = [
        strategy, model_fn, loss_fn, model_dir, steps_per_epoch, train_input_fn
    ]
    if [arg for arg in required_arguments if arg is None]:
        raise ValueError('`strategy`, `model_fn`, `loss_fn`, `model_dir`, '
                         '`steps_per_epoch` and `train_input_fn` are required '
                         'parameters.')
    if steps_per_loop > steps_per_epoch:
        logging.error(
            'steps_per_loop: %d is specified to be greater than '
            ' steps_per_epoch: %d, we will use steps_per_epoch as'
            ' steps_per_loop.', steps_per_loop, steps_per_epoch)
        steps_per_loop = steps_per_epoch
    assert tf.executing_eagerly()

    if run_eagerly:
        if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
            raise ValueError(
                'TPUStrategy should not run eagerly as it heavily relies on graph'
                ' optimization for the distributed system.')

    if eval_input_fn and (eval_steps is None or metric_fn is None):
        raise ValueError(
            '`eval_steps` and `metric_fn` are required when `eval_input_fn` '
            'is not none.')
    if metric_fn and not callable(metric_fn):
        raise ValueError(
            'if `metric_fn` is specified, metric_fn must be a callable.')

    callback_list = tf.keras.callbacks.CallbackList(custom_callbacks)

    total_training_steps = steps_per_epoch * epochs
    train_iterator = _get_input_iterator(train_input_fn, strategy)

    with distribution_utils.get_strategy_scope(strategy):
        # To correctly place the model weights on accelerators,
        # model and optimizer should be created in scope.
        model, sub_model = model_fn()
        if not hasattr(model, 'optimizer'):
            raise ValueError('User should set optimizer attribute to model '
                             'inside `model_fn`.')
        if sub_model_export_name and sub_model is None:
            raise ValueError('sub_model_export_name is specified as %s, but '
                             'sub_model is None.' % sub_model_export_name)

        optimizer = model.optimizer

        if init_checkpoint:
            logging.info(
                'Checkpoint file %s found and restoring from '
                'initial checkpoint for core model.', init_checkpoint)
            checkpoint = tf.train.Checkpoint(model=sub_model)
            checkpoint.restore(
                init_checkpoint).assert_existing_objects_matched()
            logging.info('Loading from checkpoint file completed')

        train_loss_metric = tf.keras.metrics.Mean('training_loss',
                                                  dtype=tf.float32)
        eval_metrics = [metric_fn()] if metric_fn else []
        # If evaluation is required, make a copy of metric as it will be used by
        # both train and evaluation.
        train_metrics = [
            metric.__class__.from_config(metric.get_config())
            for metric in eval_metrics
        ]

        # Create summary writers
        if _should_export_summary(strategy):
            summary_dir = os.path.join(model_dir, 'summaries')
        else:
            # In multi-worker training we need every worker to write a summary,
            # because variables can trigger synchronization on read and
            # synchronization needs all workers to participate.
            summary_dir = tempfile.mkdtemp()
        eval_summary_writer = tf.summary.create_file_writer(
            os.path.join(summary_dir, 'eval'))
        last_summary_step = 0
        if steps_per_loop >= _MIN_SUMMARY_STEPS and train_summary_interval >= 0:
            # Only writes summary when the stats are collected sufficiently over
            # enough steps.
            train_summary_writer = tf.summary.create_file_writer(
                os.path.join(summary_dir, 'train'))
        else:
            train_summary_writer = tf.summary.create_noop_writer()

        # Collects training variables.
        training_vars = model.trainable_variables

        def _replicated_step(inputs):
            """Replicated training step."""

            inputs, labels = inputs
            with tf.GradientTape() as tape:
                model_outputs = model(inputs, training=True)
                loss = loss_fn(labels, model_outputs)
                # Raw loss is used for reporting in metrics/logs.
                raw_loss = loss
                if scale_loss:
                    # Scales down the loss for gradients to be invariant from replicas.
                    loss = loss / strategy.num_replicas_in_sync

            if explicit_allreduce:
                grad_utils.minimize_using_explicit_allreduce(
                    tape, optimizer, loss, training_vars,
                    pre_allreduce_callbacks, post_allreduce_callbacks)
            else:
                if isinstance(
                        optimizer, tf.keras.mixed_precision.experimental.
                        LossScaleOptimizer):
                    with tape:
                        scaled_loss = optimizer.get_scaled_loss(loss)
                    scaled_grads = tape.gradient(scaled_loss, training_vars)
                    grads = optimizer.get_unscaled_gradients(scaled_grads)
                else:
                    grads = tape.gradient(loss, training_vars)
                optimizer.apply_gradients(zip(grads, training_vars))
            # For reporting, the metric takes the mean of losses.
            train_loss_metric.update_state(raw_loss)
            for metric in train_metrics:
                metric.update_state(labels, model_outputs)

        @tf.function
        def train_steps(iterator, steps):
            """Performs distributed training steps in a loop.

      Args:
        iterator: the distributed iterator of training datasets.
        steps: a tf.int32 integer tensor specifying the number of steps to run
          inside the host training loop.

      Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
      """
            if not isinstance(steps, tf.Tensor):
                raise ValueError(
                    'steps should be a Tensor. A Python object may cause '
                    'retracing.')

            for _ in tf.range(steps):
                strategy.run(_replicated_step, args=(next(iterator), ))

        def train_single_step(iterator):
            """Performs a distributed training step.

      Args:
        iterator: the distributed iterator of training datasets.

      Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
      """
            strategy.run(_replicated_step, args=(next(iterator), ))

        def test_step(iterator):
            """Calculates evaluation metrics on distributed devices."""
            def _test_step_fn(inputs):
                """Replicated accuracy calculation."""

                inputs, labels = inputs
                model_outputs = model(inputs, training=False)
                for metric in eval_metrics:
                    metric.update_state(labels, model_outputs)

            strategy.run(_test_step_fn, args=(next(iterator), ))

        if not run_eagerly:
            train_single_step = tf.function(train_single_step)
            test_step = tf.function(test_step)

        def _run_evaluation(current_training_step, test_iterator):
            """Runs validation steps and aggregate metrics.

      Args:
        current_training_step: tf.int32 tensor containing the current step.
        test_iterator: distributed iterator of test datasets.

      Returns:
        A dict of metric names and values.
      """
            for _ in range(eval_steps):
                test_step(test_iterator)

            logs = {}
            with eval_summary_writer.as_default():
                for metric in eval_metrics + model.metrics:
                    metric_value = _float_metric_value(metric)
                    logs[metric.name] = metric_value
                    logging.info('Step: [%d] Validation %s = %f',
                                 current_training_step, metric.name,
                                 metric_value)
                    tf.summary.scalar(metric.name,
                                      metric_value,
                                      step=current_training_step)
                eval_summary_writer.flush()

            return logs

        # Training loop starts here.
        checkpoint = tf.train.Checkpoint(model=model,
                                         optimizer=optimizer,
                                         global_step=optimizer.iterations)
        sub_model_checkpoint = tf.train.Checkpoint(
            model=sub_model, global_step=optimizer.iterations
        ) if sub_model_export_name else None

        latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
        if latest_checkpoint_file:
            logging.info(
                'Checkpoint file %s found and restoring from '
                'checkpoint', latest_checkpoint_file)
            checkpoint.restore(latest_checkpoint_file)
            logging.info('Loading from checkpoint file completed')

        current_step = optimizer.iterations.numpy()
        checkpoint_name = 'ctl_step_{step}.ckpt'

        while current_step < total_training_steps:
            if current_step % steps_per_epoch == 0:
                callback_list.on_epoch_begin(
                    int(current_step / steps_per_epoch) + 1)

            # Training loss/metrics take the average over steps inside the
            # micro training loop. We reset their values before each round.
            train_loss_metric.reset_states()
            for metric in train_metrics + model.metrics:
                metric.reset_states()

            callback_list.on_batch_begin(current_step)
            # Runs several steps in the host while loop.
            steps = steps_to_run(current_step, steps_per_epoch, steps_per_loop)

            if tf.config.list_physical_devices('GPU'):
                # TODO(zongweiz): merge with train_steps once tf.while_loop
                # GPU performance bugs are fixed.
                for _ in range(steps):
                    train_single_step(train_iterator)
            else:
                # Converts steps to a Tensor to avoid tf.function retracing.
                train_steps(train_iterator,
                            tf.convert_to_tensor(steps, dtype=tf.int32))
            train_loss = _float_metric_value(train_loss_metric)
            current_step += steps
            callback_list.on_batch_end(current_step - 1, {'loss': train_loss})

            # Updates training logging.
            training_status = 'Train Step: %d/%d  / loss = %s' % (
                current_step, total_training_steps, train_loss)

            if current_step >= last_summary_step + train_summary_interval:
                summary_writer = train_summary_writer
                last_summary_step = current_step
            else:
                summary_writer = tf.summary.create_noop_writer()

            with summary_writer.as_default():
                tf.summary.scalar(train_loss_metric.name,
                                  train_loss,
                                  step=current_step)
                for metric in train_metrics + model.metrics:
                    metric_value = _float_metric_value(metric)
                    training_status += '  %s = %f' % (metric.name,
                                                      metric_value)
                    tf.summary.scalar(metric.name,
                                      metric_value,
                                      step=current_step)
                summary_writer.flush()
            logging.info(training_status)

            if current_step % steps_per_epoch == 0:
                # Save a submodel with the step in the file name after each epoch.
                if sub_model_export_name:
                    _save_checkpoint(
                        strategy, sub_model_checkpoint, model_dir,
                        '%s_step_%d.ckpt' %
                        (sub_model_export_name, current_step))

                # Save model checkpoints and run validation steps after each epoch
                # (with the exception of the final epoch which is handled after the
                # training loop).
                if current_step < total_training_steps:
                    _save_checkpoint(strategy, checkpoint, model_dir,
                                     checkpoint_name.format(step=current_step))
                    logs = None
                    if eval_input_fn:
                        logging.info('Running evaluation after step: %s.',
                                     current_step)
                        logs = _run_evaluation(
                            current_step,
                            _get_input_iterator(eval_input_fn, strategy))
                        # Re-initialize evaluation metric.
                        for metric in eval_metrics + model.metrics:
                            metric.reset_states()

                    callback_list.on_epoch_end(
                        int(current_step / steps_per_epoch), logs)

        if sub_model_export_name:
            _save_checkpoint(strategy, sub_model_checkpoint, model_dir,
                             '%s.ckpt' % sub_model_export_name)

        _save_checkpoint(strategy, checkpoint, model_dir,
                         checkpoint_name.format(step=current_step))
        logs = None
        if eval_input_fn:
            logging.info(
                'Running final evaluation after training is complete.')
            logs = _run_evaluation(
                current_step, _get_input_iterator(eval_input_fn, strategy))

        callback_list.on_epoch_end(int(current_step / steps_per_epoch), logs)

        training_summary = {
            'total_training_steps': total_training_steps,
            'train_loss': _float_metric_value(train_loss_metric),
        }
        for metric in model.metrics:
            training_summary[metric.name] = _float_metric_value(metric)
        if eval_metrics:
            # TODO(hongkuny): Cleans up summary reporting in text.
            training_summary['last_train_metrics'] = _float_metric_value(
                train_metrics[0])
            training_summary['eval_metrics'] = _float_metric_value(
                eval_metrics[0])

        write_txt_summary(training_summary, summary_dir)

        if not _should_export_summary(strategy):
            tf.io.gfile.rmtree(summary_dir)

        return model
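# Hedged usage sketch (not part of the original function): a minimal invocation
# with toy model/loss/input functions; names, sizes and the model_dir are
# illustrative only.
def toy_model_fn():
  model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
  model.optimizer = tf.keras.optimizers.SGD(0.01)
  return model, None  # no sub_model

def toy_loss_fn(labels, logits):
  return tf.reduce_mean(
      tf.keras.losses.sparse_categorical_crossentropy(
          labels, logits, from_logits=True))

def toy_train_input_fn(ctx=None):
  features = tf.random.uniform([64, 4])
  labels = tf.random.uniform([64], maxval=2, dtype=tf.int32)
  return tf.data.Dataset.from_tensor_slices((features, labels)).repeat().batch(8)

trained = run_customized_training_loop(
    strategy=tf.distribute.MirroredStrategy(),
    model_fn=toy_model_fn,
    loss_fn=toy_loss_fn,
    model_dir='/tmp/ctl_example',
    train_input_fn=toy_train_input_fn,
    steps_per_epoch=10,
    steps_per_loop=5,
    epochs=1)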
Example #13
 def jit_compile(self, f: Callable) -> Callable:
     return tf.function(f)
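# Usage sketch (not part of the original snippet): plain tf.function only stages
# the callable into a graph; tf.function(f, jit_compile=True) would additionally
# request XLA compilation.
staged = tf.function(lambda x: x * x)
print(staged(tf.constant(3.0)))  # tf.Tensor(9.0, shape=(), dtype=float32)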
Example #14
  def infer(self,
            features_file,
            predictions_file=None,
            checkpoint_path=None,
            log_time=False):
    """Runs inference.

    Args:
      features_file: The file(s) to infer from.
      predictions_file: If set, predictions are saved in this file.
      checkpoint_path: Path of a specific checkpoint to predict. If ``None``,
        the latest is used.
      log_time: If ``True``, several time metrics will be printed in the logs at
        the end of the inference loop.
    """
    checkpoint, config = self._init_run()
    checkpoint.restore(checkpoint_path=checkpoint_path, weights_only=True)
    model = checkpoint.model
    infer_config = config["infer"]
    dataset = model.examples_inputter.make_inference_dataset(
        features_file,
        infer_config["batch_size"],
        length_bucket_width=infer_config["length_bucket_width"],
        prefetch_buffer_size=infer_config.get("prefetch_buffer_size"))

    if predictions_file:
      stream = io.open(predictions_file, encoding="utf-8", mode="w")
    else:
      stream = sys.stdout

    ordered_writer = None
    infer_fn = tf.function(model.infer, input_signature=(dataset.element_spec,))
    write_fn = lambda prediction: (
        model.print_prediction(prediction, params=infer_config, stream=stream))

    total_time = 0
    total_tokens = 0
    total_examples = 0
    start_time = time.time()

    for source in dataset:
      predictions = infer_fn(source)
      predictions = tf.nest.map_structure(lambda t: t.numpy(), predictions)
      end_time = time.time()
      if log_time:
        total_time += end_time - start_time
        batch_size = next(six.itervalues(predictions)).shape[0]
        total_examples += batch_size
        length = predictions.get("length")
        if length is not None:
          if len(length.shape) == 2:
            length = length[:, 0]
          total_tokens += sum(length)
      for prediction in misc.extract_batches(predictions):
        if "index" in prediction:
          if ordered_writer is None:
            ordered_writer = misc.OrderRestorer(
                index_fn=lambda prediction: prediction["index"], callback_fn=write_fn)
          ordered_writer.push(prediction)
        else:
          write_fn(prediction)
      start_time = time.time()

    if log_time:
      tf.get_logger().info("Total prediction time (s): %f", total_time)
      tf.get_logger().info(
          "Average prediction time (s): %f", total_time / total_examples)
      if total_tokens > 0:
        tf.get_logger().info("Tokens per second: %f", total_tokens / total_time)
    if predictions_file:
      stream.close()
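    # Hedged usage sketch (not part of the original method): a typical call,
    # with placeholder file names and an assumed `runner` instance.
    # runner.infer('src-test.txt', predictions_file='predictions.txt',
    #              log_time=True)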
Example #15
    logits_aux4 = aux4(feat4)
    logits_aux5_4 = aux5_4(feat5_4)
    return tf.keras.Model(
        inputs=x,
        outputs=[logits, logits_aux2, logits_aux3, logits_aux4, logits_aux5_4],
        name="BiSeNetV2",
    )


if __name__ == "__main__":
    import time

    input_shape = (360, 640, 3)
    model = get_bisenetv2(input_shape, n_classes=2)
    model.summary()
    model.compile("adam", "mse")
    model = tf.function(model)
    image = tf.random.normal((1, *input_shape))
    # warm up
    for i in range(10):
        model(image)

    iters = 200
    init = time.time()
    for i in range(iters):
        model(image)
    end = time.time() - init

    print(f"FPS {1/(end/iters)}")
    print(f"Time {end/iters}")
Example #16
sigm25_1 = sigmoid(conv25_3)

concat25_1 = Concatenate()([conv24_3, sigm25_1])



model = Model(inputs=inputs, outputs=[concat23_1, concat25_1, concat50_3, concat50_5])

model.summary()

tf.saved_model.save(model, 'saved_model_{}_{}x{}'.format(ds, height, width))
# model.save('footprints_{}_{}x{}_float32.h5'.format(ds, height, width).format(height, width))


full_model = tf.function(lambda inputs: model(inputs))
full_model = full_model.get_concrete_function(inputs=[tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype)])
frozen_func = convert_variables_to_constants_v2(full_model, lower_control_flow=False)
frozen_func.graph.as_graph_def()
tf.io.write_graph(graph_or_graph_def=frozen_func.graph,
                    logdir=".",
                    name="footprints_{}_{}x{}_float32.pb".format(ds, height, width),
                    as_text=False)


# No Quantization - Input/Output=float32
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
with open('footprints_{}_{}x{}_float32.tflite'.format(ds, height, width), 'wb') as w:
    w.write(tflite_model)
print("tflite convert complete! - footprints_{}_{}x{}_float32.tflite".format(ds, height, width))
Example #17
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using custom training loops.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
    keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                   enable_xla=flags_obj.enable_xla)

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
            'mixed_float16')
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)
    elif dtype == tf.bfloat16:
        policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

    # This only affects GPU.
    common.set_cudnn_batchnorm_mode()

    # TODO(anj-s): Set data_format without using Keras.
    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_obj.num_gpus,
        num_workers=distribution_utils.configure_cluster(),
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs,
        tpu_address=flags_obj.tpu)

    train_ds, test_ds = get_input_dataset(flags_obj, strategy)
    per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
        flags_obj)
    steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
    logging.info(
        "Training %d epochs, each epoch has %d steps, "
        "total steps: %d; Eval %d steps", train_epochs, per_epoch_steps,
        train_epochs * per_epoch_steps, eval_steps)

    time_callback = keras_utils.TimeHistory(flags_obj.batch_size,
                                            flags_obj.log_steps)

    with distribution_utils.get_strategy_scope(strategy):
        resnet_model.change_keras_layer(flags_obj.use_tf_keras_layers)
        model = resnet_model.resnet50(
            num_classes=imagenet_preprocessing.NUM_CLASSES,
            batch_size=flags_obj.batch_size,
            use_l2_regularizer=not flags_obj.single_l2_loss_op)

        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)
        optimizer = common.get_optimizer(lr_schedule)

        if dtype == tf.float16:
            loss_scale = flags_core.get_loss_scale(flags_obj,
                                                   default_for_fp16=128)
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer, loss_scale)
        elif flags_obj.fp16_implementation == 'graph_rewrite':
            # `dtype` is still float32 in this case. We built the graph in float32 and
            # let the graph rewrite change parts of it to float16.
            if not flags_obj.use_tf_function:
                raise ValueError(
                    '--fp16_implementation=graph_rewrite requires '
                    '--use_tf_function to be true')
            loss_scale = flags_core.get_loss_scale(flags_obj,
                                                   default_for_fp16=128)
            optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
                optimizer, loss_scale)

        current_step = 0
        checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
        latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
        if latest_checkpoint:
            checkpoint.restore(latest_checkpoint)
            logging.info("Load checkpoint %s", latest_checkpoint)
            current_step = optimizer.iterations.numpy()

        train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
        training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'training_accuracy', dtype=tf.float32)
        test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
        test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'test_accuracy', dtype=tf.float32)

        trainable_variables = model.trainable_variables

        def step_fn(inputs):
            """Per-Replica StepFn."""
            images, labels = inputs
            with tf.GradientTape() as tape:
                logits = model(images, training=True)

                prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
                    labels, logits)
                loss = tf.reduce_sum(prediction_loss) * (1.0 /
                                                         flags_obj.batch_size)
                num_replicas = tf.distribute.get_strategy(
                ).num_replicas_in_sync

                if flags_obj.single_l2_loss_op:
                    l2_loss = resnet_model.L2_WEIGHT_DECAY * 2 * tf.add_n([
                        tf.nn.l2_loss(v)
                        for v in trainable_variables if 'bn' not in v.name
                    ])

                    loss += (l2_loss / num_replicas)
                else:
                    loss += (tf.reduce_sum(model.losses) / num_replicas)

                # Scale the loss
                if flags_obj.dtype == "fp16":
                    loss = optimizer.get_scaled_loss(loss)

            grads = tape.gradient(loss, trainable_variables)

            # Unscale the grads
            if flags_obj.dtype == "fp16":
                grads = optimizer.get_unscaled_gradients(grads)

            optimizer.apply_gradients(zip(grads, trainable_variables))
            train_loss.update_state(loss)
            training_accuracy.update_state(labels, logits)

        @tf.function
        def train_steps(iterator, steps):
            """Performs distributed training steps in a loop."""
            for _ in tf.range(steps):
                strategy.experimental_run_v2(step_fn, args=(next(iterator), ))

        def train_single_step(iterator):
            if strategy:
                strategy.experimental_run_v2(step_fn, args=(next(iterator), ))
            else:
                return step_fn(next(iterator))

        def test_step(iterator):
            """Evaluation StepFn."""
            def step_fn(inputs):
                images, labels = inputs
                logits = model(images, training=False)
                loss = tf.keras.losses.sparse_categorical_crossentropy(
                    labels, logits)
                loss = tf.reduce_sum(loss) * (1.0 / flags_obj.batch_size)
                test_loss.update_state(loss)
                test_accuracy.update_state(labels, logits)

            if strategy:
                strategy.experimental_run_v2(step_fn, args=(next(iterator), ))
            else:
                step_fn(next(iterator))

        if flags_obj.use_tf_function:
            train_single_step = tf.function(train_single_step)
            test_step = tf.function(test_step)

        if flags_obj.enable_tensorboard:
            summary_writer = tf.summary.create_file_writer(flags_obj.model_dir)
        else:
            summary_writer = None

        train_iter = iter(train_ds)
        time_callback.on_train_begin()
        for epoch in range(current_step // per_epoch_steps, train_epochs):
            train_loss.reset_states()
            training_accuracy.reset_states()

            steps_in_current_epoch = 0
            while steps_in_current_epoch < per_epoch_steps:
                time_callback.on_batch_begin(steps_in_current_epoch +
                                             epoch * per_epoch_steps)
                steps = _steps_to_run(steps_in_current_epoch, per_epoch_steps,
                                      steps_per_loop)
                if steps == 1:
                    train_single_step(train_iter)
                else:
                    # Converts steps to a Tensor to avoid tf.function retracing.
                    train_steps(train_iter,
                                tf.convert_to_tensor(steps, dtype=tf.int32))
                time_callback.on_batch_end(steps_in_current_epoch +
                                           epoch * per_epoch_steps)
                steps_in_current_epoch += steps

            logging.info('Training loss: %s, accuracy: %s at epoch %d',
                         train_loss.result().numpy(),
                         training_accuracy.result().numpy(), epoch + 1)

            if (not flags_obj.skip_eval
                    and (epoch + 1) % flags_obj.epochs_between_evals == 0):
                test_loss.reset_states()
                test_accuracy.reset_states()

                test_iter = iter(test_ds)
                for _ in range(eval_steps):
                    test_step(test_iter)

                logging.info('Test loss: %s, accuracy: %s%% at epoch: %d',
                             test_loss.result().numpy(),
                             test_accuracy.result().numpy(), epoch + 1)

            if flags_obj.enable_checkpoint_and_export:
                checkpoint_name = checkpoint.save(
                    os.path.join(flags_obj.model_dir,
                                 'model.ckpt-{}'.format(epoch + 1)))
                logging.info('Saved checkpoint to %s', checkpoint_name)

            if summary_writer:
                current_steps = steps_in_current_epoch + (epoch *
                                                          per_epoch_steps)
                with summary_writer.as_default():
                    tf.summary.scalar('train_loss', train_loss.result(),
                                      current_steps)
                    tf.summary.scalar('train_accuracy',
                                      training_accuracy.result(),
                                      current_steps)
                    tf.summary.scalar('eval_loss', test_loss.result(),
                                      current_steps)
                    tf.summary.scalar('eval_accuracy', test_accuracy.result(),
                                      current_steps)

        time_callback.on_train_end()
        if summary_writer:
            summary_writer.close()

        eval_result = None
        train_result = None
        if not flags_obj.skip_eval:
            eval_result = [
                test_loss.result().numpy(),
                test_accuracy.result().numpy()
            ]
            train_result = [
                train_loss.result().numpy(),
                training_accuracy.result().numpy()
            ]

        stats = build_stats(train_result, eval_result, time_callback)
        return stats
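# The "Converts steps to a Tensor to avoid tf.function retracing" comment above
# refers to the fact that tf.function re-traces for every new Python value but
# reuses the trace for tensors of the same dtype/shape. A minimal, standalone
# sketch of that behaviour (names here are purely illustrative):
import tensorflow as tf

@tf.function
def double(x):
    print('tracing for', x)  # executes only while a new graph is being traced
    return x * 2

double(tf.constant(1))  # traced once for int32 scalar tensors
double(tf.constant(2))  # reuses the existing trace, no print
double(3)               # Python int: a new trace for each distinct value
double(4)               # retraced again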
import os
import tensorflow as tf
from tensorflow import keras


# tf.function and auto-graph.
def scaled_elu(z, scale=1.0, alpha=1.0):
    # z >= 0 ? scale * z : scale * alpha * tf.nn.elu(z)
    is_positive = tf.greater_equal(z, 0.0)
    return scale * tf.where(is_positive, z, alpha * tf.nn.elu(z))


print(scaled_elu(tf.constant(-3.)))
print(scaled_elu(tf.constant([-3, -2.5])))

scaled_elu_tf = tf.function(scaled_elu)

print(scaled_elu_tf(tf.constant(-3.)))
print(scaled_elu_tf(tf.constant([-3, -2.5])))

print(scaled_elu_tf.python_function is scaled_elu)

# The advantage of the tf.function-converted version is speed.


def converge_to_2(n_iters):
    total = tf.constant(0.)
    increment = tf.constant(1.)
    for _ in range(n_iters):
        total += increment
        increment /= 2.0
    return total
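
# To see the speed advantage mentioned above, time the eager and converted
# versions of scaled_elu on a larger input; a small sketch (timings vary by machine):
import timeit

x = tf.random.uniform([10000])
print('eager      :', timeit.timeit(lambda: scaled_elu(x), number=1000))
print('tf.function:', timeit.timeit(lambda: scaled_elu_tf(x), number=1000))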
Example #19
0

def policy_vtest():
    """Autoaugment test policy for debugging."""
    # Each tuple is an augmentation operation of the form
    # (operation, probability, magnitude). Each element in policy is a
    # sub-policy that will be applied sequentially on the image.
    policy = [
        [('TranslateX', 1.0, 4), ('Equalize', 1.0, 10)],
    ]
    return policy


# pylint: disable=g-long-lambda
blend = tf.function(lambda i1, i2, factor: tf.cast(
    tfa_image.blend(tf.cast(i1, tf.float32), tf.cast(i2, tf.float32), factor),
    tf.uint8))
# pylint: enable=g-long-lambda


def random_erase(image,
                 prob,
                 min_area=0.02,
                 max_area=1 / 3,
                 min_aspect=1 / 3,
                 max_aspect=10 / 3,
                 mode='pixel'):
    """The random erasing augmentations: https://arxiv.org/pdf/1708.04896.pdf.

  This augmentation is applied after image normalization.
    """
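# The body of random_erase is not included in this excerpt. A minimal,
# hypothetical sketch of the 'pixel' mode (overwrite one random rectangle with
# noise), assuming a statically shaped HWC uint8 image, might look like:
import numpy as np
import tensorflow as tf

def random_erase_sketch(image, prob=0.5, erase_frac=0.2):
    """Hypothetical single-rectangle random erasing in 'pixel' mode."""
    if np.random.uniform() > prob:
        return image
    h, w, c = image.shape
    eh, ew = int(h * erase_frac), int(w * erase_frac)
    top = np.random.randint(0, h - eh + 1)
    left = np.random.randint(0, w - ew + 1)
    pad = [[top, h - eh - top], [left, w - ew - left], [0, 0]]
    noise = tf.cast(tf.random.uniform([eh, ew, c], 0, 256, dtype=tf.int32), image.dtype)
    mask = tf.pad(tf.ones([eh, ew, c], dtype=image.dtype), pad)
    return image * (1 - mask) + tf.pad(noise, pad)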
  def custom_loop(self, epoch, optimizer, train_dist_dataset, test_dist_dataset,
                  strategy):
    """Custom training and testing loop.

    Args:
      train_dist_dataset: Training dataset created using strategy.
      test_dist_dataset: Testing dataset created using strategy.
      strategy: Distribution strategy.

    Returns:
      train_loss, train_accuracy, test_loss, test_accuracy
    """

    def distributed_train_step(dataset_inputs):
        per_replica_losses = strategy.experimental_run_v2(self.train_step,
                                                          args=(dataset_inputs, optimizer,))
        return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                               axis=None)

    def distributed_test_step(dataset_inputs):
        per_replica_losses = strategy.experimental_run_v2(self.test_step, args=(dataset_inputs,))
        return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                               axis=None)
    if self.enable_function:
        distributed_train_step = tf.function(distributed_train_step)
        distributed_test_step = tf.function(distributed_test_step)

    self.train_top1_metric.reset_states()
    self.train_top5_metric.reset_states()
    self.val_top1_metric.reset_states()
    self.val_top5_metric.reset_states()
    self.batch_time.reset()

    optimizer.learning_rate = self.decay(epoch)
    print('learningRate: {:.4f}'.format(optimizer.learning_rate.numpy()))
    train_total_loss = 0.0
    num_train_batches = 0.0
    for one_batch in train_dist_dataset:
        end = time.time()
        if args.WarmingUp:
            if epoch < args.learning_rate_schedule[0]:
                batch_learning_rate = self.decay(epoch) + float(num_train_batches / np.ceil(args.train_num/args.batchSize))\
                                      * args.learning_rate / args.learning_rate_schedule[0]
                optimizer.learning_rate = batch_learning_rate
                # print('learningRate: {:.4f}'.format(optimizer.learning_rate.numpy()))

        train_total_loss += distributed_train_step(one_batch)
        num_train_batches += 1
        self.batch_time.update(time.time() - end)
        if num_train_batches % args.print_freq == 0:
            print('learningRate: {:.4f}'.format(optimizer.learning_rate.numpy()))
            template = ('Epoch: {}({}/{})\tTime:{:.4f}({:.4f})\tLoss: {:.4f}\tTop1_Accuracy: {:.4f}\tTop5_Accuracy: {:.4f}')
            print(template.format(epoch, int(num_train_batches), int(np.ceil(args.train_num/args.batchSize)),
                                  self.batch_time.val, self.batch_time.avg,
                                  train_total_loss / num_train_batches,
                                  100 * self.train_top1_metric.result(),
                                  100 * self.train_top5_metric.result()))
    self.batch_time.reset()

    val_total_loss = 0.0
    num_val_batches = 0.0
    for one_batch in test_dist_dataset:
        end = time.time()
        val_total_loss += distributed_test_step(one_batch)
        num_val_batches += 1
        self.batch_time.update(time.time() - end)
        if num_val_batches % args.print_freq == 0:
            template = ('Val: {}({}/{})\tTime:{:.4f}({:.4f})\tLoss: {:.4f}\tTop1_Accuracy: {:.4f}\tTop5_Accuracy: {:.4f}')
            print(template.format(epoch, int(num_val_batches), int(np.ceil(args.val_num/args.batchSize)),
                                  self.batch_time.val, self.batch_time.avg,
                                  val_total_loss / num_val_batches,
                                  100 * self.val_top1_metric.result(),
                                  100 * self.val_top5_metric.result()))

    return (train_total_loss / num_train_batches,
            100 * self.train_top1_metric.result().numpy(),
            100 * self.train_top5_metric.result().numpy(),
            val_total_loss / num_val_batches,
            100 * self.val_top1_metric.result().numpy(),
            100 * self.val_top5_metric.result().numpy())
Example #21
0
    def run_test_case(self, func, feed_dict, input_names_with_port, output_names_with_port, rtol=1e-07, atol=1e-5,
                      convert_var_to_const=True, constant_fold=True, check_value=True, check_shape=True,
                      check_dtype=True, process_args=None, onnx_feed_dict=None, graph_validator=None, as_session=False,
                      large_model=False):
        # optional - passed to process_tf_graph
        if process_args is None:
            process_args = {}
        # optional - pass distinct feed_dict to onnx runtime
        if onnx_feed_dict is None:
            onnx_feed_dict = feed_dict
        input_names_with_port = list(feed_dict)
        tf_reset_default_graph()
        graph_def = None
        initialized_tables = None

        np.random.seed(1)  # Make it reproducible.
        clean_feed_dict = {utils.node_name(k): v for k, v in feed_dict.items()}
        if is_tf2() and not as_session:
            #
            # use eager to execute the tensorflow func
            #
            # numpy doesn't work for all ops, make it tf.Tensor()
            input_tensors = [tf.TensorSpec(shape=v.shape, dtype=tf.as_dtype(v.dtype), name=utils.node_name(k))
                             for k, v in feed_dict.items()]
            input_list = [tf.convert_to_tensor(v, dtype=tf.as_dtype(v.dtype), name=utils.node_name(k))
                          for k, v in feed_dict.items()]
            tf.random.set_seed(1)
            expected = func(*input_list)
            if isinstance(expected, (list, tuple)):
                # list or tuple
                expected = [x.numpy() for x in expected]
            else:
                # single result
                expected = [expected.numpy()]

            # now make the eager functions a graph
            concrete_func = tf.function(func, input_signature=tuple(input_tensors))
            concrete_func = concrete_func.get_concrete_function()
            graph_def = from_function(concrete_func,
                                      input_names=list(feed_dict.keys()),
                                      output_names=output_names_with_port,
                                      large_model=large_model)
        else:
            #
            # use graph to execute the tensorflow func
            #
            with tf_session() as sess:
                tf_set_random_seed(1)
                input_list = []
                for k, v in clean_feed_dict.items():
                    input_list.append(tf_placeholder(name=k, shape=v.shape, dtype=tf.as_dtype(v.dtype)))
                func(*input_list)
                variables_lib.global_variables_initializer().run()
                tf_tables_initializer().run()

                output_dict = []
                for out_name in output_names_with_port:
                    output_dict.append(sess.graph.get_tensor_by_name(out_name))
                expected = sess.run(output_dict, feed_dict=feed_dict)
                graph_def = freeze_session(sess,
                                           input_names=list(feed_dict.keys()),
                                           output_names=output_names_with_port)
                table_names, key_dtypes, value_dtypes = get_hash_table_info(graph_def)
                initialized_tables = {}
                for n, k_dtype, val_dtype in zip(table_names, key_dtypes, value_dtypes):
                    h = lookup_ops.hash_table_v2(k_dtype, val_dtype, shared_name=n)
                    k, v = lookup_ops.lookup_table_export_v2(h, k_dtype, val_dtype)
                    initialized_tables[n] = (sess.run(k), sess.run(v))

            tf_reset_default_graph()
            with tf_session() as sess:
                tf.import_graph_def(graph_def, name='')
                graph_def = tf_optimize(list(feed_dict.keys()), output_names_with_port,
                                        graph_def, fold_constant=constant_fold)

        tf_reset_default_graph()
        with tf_session() as sess:
            const_node_values = None
            if large_model:
                const_node_values = compress_graph_def(graph_def)
            tf.import_graph_def(graph_def, name='')

            if self.config.is_debug_mode:
                model_path = os.path.join(self.test_data_directory, self._testMethodName + "_after_tf_optimize.pb")
                utils.save_protobuf(model_path, graph_def)
                self.logger.debug("created file  %s", model_path)

            g = process_tf_graph(sess.graph, opset=self.config.opset,
                                 input_names=list(feed_dict.keys()),
                                 output_names=output_names_with_port,
                                 target=self.config.target,
                                 const_node_values=const_node_values,
                                 initialized_tables=initialized_tables,
                                 **process_args)
            g = optimizer.optimize_graph(g)
            actual = self.run_backend(g, output_names_with_port, onnx_feed_dict, large_model)

        for expected_val, actual_val in zip(expected, actual):
            if check_value:
                self.assertAllClose(expected_val, actual_val, rtol=rtol, atol=atol)
            if check_dtype:
                self.assertEqual(expected_val.dtype, actual_val.dtype)
            # why we need the shape check: there is an issue when comparing [] with a scalar
            # https://github.com/numpy/numpy/issues/11071
            if check_shape:
                self.assertEqual(expected_val.shape, actual_val.shape)

        if graph_validator:
            self.assertTrue(graph_validator(g))

        return g
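# A standalone sketch of the eager-to-graph pattern used above. tf2onnx's
# from_function helper is project-specific, so this only shows the TF side:
# wrap the callable in tf.function with an explicit input signature, then ask
# for the concrete function and its GraphDef.
spec = (tf.TensorSpec(shape=[None, 4], dtype=tf.float32, name='x'),)
concrete = tf.function(lambda x: tf.nn.relu(x), input_signature=spec).get_concrete_function()
graph_def = concrete.graph.as_graph_def()
print(len(graph_def.node), 'nodes in the traced graph')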
    def run_test(self,
                 name,
                 backend="onnxruntime",
                 onnx_file=None,
                 opset=None,
                 extra_opset=None,
                 perf=None):
        """Run complete test against backend."""
        self.perf = perf

        # get the model
        if self.url:
            _, dir_name = self.download_model()
            logger.info("Downloaded to %s", dir_name)
            model_path = os.path.join(
                dir_name, self.local) if self.local != "." else dir_name
        else:
            model_path = self.local

        logger.info("Load model from %s", model_path)
        input_names = list(self.input_names.keys())
        initialized_tables = {}
        outputs = self.output_names
        tflite_path = None
        to_rename = None
        if self.model_type in ["checkpoint"]:
            graph_def, input_names, outputs = tf_loader.from_checkpoint(
                model_path, input_names, outputs)
        elif self.model_type in ["saved_model"]:
            loaded = tf_loader.from_saved_model(
                model_path,
                None,
                None,
                self.tag,
                self.signatures,
                self.concrete_function,
                self.large_model,
                return_concrete_func=not self.run_tf_frozen,
                return_initialized_tables=True,
                return_tensors_to_rename=True)
            if not self.run_tf_frozen:
                # Must maintain ref to imported since concrete_func uses weak refs
                # pylint: disable=unused-variable
                graph_def, input_names, outputs, concrete_func, imported, initialized_tables, to_rename = loaded
            else:
                graph_def, input_names, outputs, initialized_tables, to_rename = loaded
        elif self.model_type in ["keras"]:
            graph_def, input_names, outputs = tf_loader.from_keras(
                model_path, input_names, outputs)
        elif self.model_type in ["tflite"]:
            tflite_path = model_path
            graph_def = None
        else:
            graph_def, input_names, outputs = tf_loader.from_graphdef(
                model_path, input_names, outputs)

        if utils.is_debug_mode():
            utils.save_protobuf(
                os.path.join(TEMP_DIR, name + "_after_tf_optimize.pb"),
                graph_def)

        if tflite_path is not None:
            inputs = {}
            for k in input_names:
                v = self.input_names[k]
                inputs[k] = self.make_input(v)

            interpreter = tf.lite.Interpreter(tflite_path)
            input_details = interpreter.get_input_details()
            output_details = interpreter.get_output_details()
            input_name_to_index = {
                n['name'].split(':')[0]: n['index']
                for n in input_details
            }
            for k, v in inputs.items():
                interpreter.resize_tensor_input(input_name_to_index[k],
                                                v.shape)
            interpreter.allocate_tensors()

            def run_tflite():
                for k, v in inputs.items():
                    interpreter.set_tensor(input_name_to_index[k], v)
                interpreter.invoke()
                result = [
                    interpreter.get_tensor(output['index'])
                    for output in output_details
                ]
                return result

            tf_results = run_tflite()
            if self.perf:
                logger.info("Running TFLite perf")
                n = 0
                start = time.time()
                stop = start + PERF_TIME
                while time.time() < stop:
                    for _ in range(PERF_STEP):
                        _ = run_tflite()
                    n += PERF_STEP
                self.tf_runtime = 1000 * (time.time() - start) / n
                logger.info("TFLite perf {:.2f}ms/inference, n={}".format(
                    self.tf_runtime, n))
            logger.info("TFLite OK")

        if not self.run_tf_frozen:
            inputs = {}
            for k in input_names:
                v = self.input_names[k]
                inputs[k.split(":")[0]] = tf.constant(self.make_input(v))
            tf_func = tf.function(concrete_func)
            logger.info("Running TF")
            tf_results_d = tf_func(**inputs)
            # If there is only a single output a dict might not be returned
            if isinstance(tf_results_d, tf.Tensor):
                tf_results = [tf_results_d]
            else:
                tf_results = [
                    tf_results_d[k] for k in sorted(tf_results_d.keys())
                ]
            tf_results = [tf_res.numpy() for tf_res in tf_results]
            if self.perf:
                logger.info("Running TF perf")
                n = 0
                start = time.time()
                stop = start + PERF_TIME
                if self.tf_profile is not None:
                    tf.profiler.experimental.start(self.tf_profile)
                while time.time() < stop:
                    for _ in range(PERF_STEP):
                        _ = concrete_func(**inputs)
                    n += PERF_STEP
                if self.tf_profile is not None:
                    tf.profiler.experimental.stop()
                self.tf_runtime = 1000 * (time.time() - start) / n
                logger.info("TF perf {:.2f}ms/inference, n={}".format(
                    self.tf_runtime, n))
            logger.info("TensorFlow OK")

        shape_override = {}
        const_node_values = None
        tf_graph = None

        if graph_def is not None:
            inputs = {}
            tf_reset_default_graph()

            with tf.Graph().as_default() as tf_graph:
                from tf2onnx.tf_utils import compress_graph_def
                if self.large_model:
                    const_node_values = compress_graph_def(graph_def)
                tf.import_graph_def(graph_def, name='')

            with tf_session(graph=tf_graph) as sess:
                # create the input data
                for k in input_names:
                    v = self.input_names[k]
                    t = sess.graph.get_tensor_by_name(k)
                    expected_dtype = tf.as_dtype(t.dtype).name
                    if isinstance(v, six.text_type) and v.startswith("np."):
                        np_value = eval(v)  # pylint: disable=eval-used
                        if expected_dtype != np_value.dtype:
                            logger.warning(
                                "dtype mismatch for input %s: expected=%s, actual=%s",
                                k, expected_dtype, np_value.dtype)
                        inputs[k] = np_value.astype(expected_dtype)
                    else:
                        if expected_dtype == "string":
                            inputs[k] = self.make_input(v).astype(
                                np.str).astype(np.object)
                        else:
                            inputs[k] = self.make_input(v).astype(
                                expected_dtype)

                if self.force_input_shape:
                    for k, v in inputs.items():
                        shape_override[k] = list(v.shape)

                # run the model with tensorflow
                if self.skip_tensorflow:
                    logger.info("TensorFlow SKIPPED")
                elif self.run_tf_frozen:
                    if self.tf_profile is not None:
                        tf.profiler.experimental.start(self.tf_profile)
                    tf_results = self.run_tensorflow(sess, inputs)
                    if self.tf_profile is not None:
                        tf.profiler.experimental.stop()
                    logger.info("TensorFlow OK")
                tf_graph = sess.graph

        model_proto = None
        if self.skip_conversion:
            if self.large_model:
                external_tensor_storage = ExternalTensorStorage()
                model_proto = utils.model_proto_from_zip(
                    self.converted_model, external_tensor_storage)
            else:
                external_tensor_storage = None
                model_proto = utils.model_proto_from_file(self.converted_model)
            logger.info("ONNX loaded from file")
        else:
            try:
                # convert model to onnx
                onnx_graph = self.to_onnx(
                    tf_graph,
                    opset=opset,
                    extra_opset=extra_opset,
                    shape_override=shape_override,
                    input_names=inputs.keys(),
                    const_node_values=const_node_values,
                    initialized_tables=initialized_tables,
                    tflite_path=tflite_path,
                    tensors_to_rename=to_rename)
                onnx_graph = optimizer.optimize_graph(onnx_graph)
                print("ONNX", onnx_graph.dump_node_statistics())
                external_tensor_storage = ExternalTensorStorage(
                ) if self.large_model else None
                model_proto = onnx_graph.make_model(
                    "converted from tf2onnx",
                    external_tensor_storage=external_tensor_storage)
                logger.info("To_ONNX, OK")
                if onnx_file:
                    self.create_onnx_file(name, model_proto, inputs, onnx_file,
                                          external_tensor_storage)
                if self.converted_model:
                    if self.large_model:
                        utils.save_onnx_zip(self.converted_model, model_proto,
                                            external_tensor_storage)
                    else:
                        utils.save_protobuf(self.converted_model, model_proto)
                    logger.info("Created %s", self.converted_model)

            except Exception:
                logger.error("To_ONNX FAIL", exc_info=1)
                return False

        try:
            onnx_results = None
            if backend == "onnxruntime":
                if to_rename is None:
                    struc_outputs = self.output_names
                else:
                    struc_outputs = [
                        to_rename.get(k, k) for k in self.output_names
                    ]
                onnx_results = self.run_onnxruntime(name, model_proto, inputs,
                                                    struc_outputs,
                                                    external_tensor_storage)
            else:
                raise ValueError("unknown backend")
            logger.info("Run_ONNX OK")

            try:
                if self.skip_tensorflow:
                    logger.info("Results: skipped tensorflow")
                else:
                    if self.check_only_shape:
                        for tf_res, onnx_res in zip(tf_results, onnx_results):
                            np.testing.assert_array_equal(
                                tf_res.shape, onnx_res.shape)
                    else:
                        for tf_res, onnx_res in zip(tf_results, onnx_results):
                            good_cnt = np.count_nonzero(
                                np.isclose(tf_res,
                                           onnx_res,
                                           rtol=self.rtol,
                                           atol=self.atol))
                            bad_cnt = tf_res.size - good_cnt
                            if bad_cnt > self.ptol / 100 * tf_res.size:
                                # Prints a nice error message with stats
                                np.testing.assert_allclose(tf_res,
                                                           onnx_res,
                                                           rtol=self.rtol,
                                                           atol=self.atol)
                    logger.info("Results: OK")
                return True
            except Exception:
                logger.error("Results", exc_info=1)

        except Exception:
            logger.error("Run_ONNX FAIL", exc_info=1)

        return False
Example #23
0
import numpy as np
import tensorflow as tf

root = tf.train.Checkpoint()
root.f = tf.function(lambda x, y: tf.matmul(x, y))

new_input_data = np.random.randn(2, 2, 2, 2).astype(np.float32)
new_w = np.random.randn(2, 2, 2, 2).astype(np.float32)

input_data = tf.convert_to_tensor(new_input_data)
input_w = tf.convert_to_tensor(new_w)

concrete_func = root.f.get_concrete_function(input_data, input_w)

converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
tflite_model = converter.convert()
tflite_filename = "matmul.tflite"
with open(tflite_filename, "wb") as f:
  f.write(tflite_model)
  print("Converted %s." % tflite_filename)
Example #24
0
 def wrapper(fn):
     wrapped_fn = tf.function(fn, input_signature)
     return _eager_function_handler(input_signature)(wrapped_fn)
Example #25
0
def make_tf_opt_epoch_fn(
        inputs: GraphsTuple, target: np.ndarray, batch_size: int, model: snt.Module,
        optimizer: snt.Optimizer, loss_fn: templates.LossFunction,
        l2_reg: float = 0.0) -> Callable[[tf.Tensor, tf.Tensor], tf.Tensor]:
    """Make a tf.function of (inputs, target) for optimization.

    This function is useful for basic inference training of GNN models. It uses
    all variables to create a function with a tf.function-optimized input
    signature, and pure tf.functions to build batches and aggregate losses. The
    result is a heavily optimized function that is at least 2x faster than a
    basic tf.function with experimental_relax_shapes=True.

    Args:
      inputs: graphs used for training.
      target: values to predict for training.
      batch_size: batch size.
      model: a GNN model.
      optimizer: optimizer, probably Adam or SGD.
      loss_fn: a loss function to optimize.
      l2_reg: l2 regularization weight.

    Returns:
      optimize_one_epoch(inputs, target), a tf.function-optimized
      callable.

    """
    # Explicit input signature is faster than experimental relax shapes.
    input_signature = [
        graph_nets.utils_tf.specs_from_graphs_tuple(inputs),
        tf.TensorSpec.from_tensor(tf.convert_to_tensor(target))
    ]
    n = graph_utils.get_num_graphs(inputs)
    n_batches = tf.cast(n // batch_size, tf.float32)

    if l2_reg > 0.0:
        regularizer = snt.regularizers.L2(l2_reg)
        linear_variables = gnn_models.get_linear_variables(model)

    if batch_size == 1 or n == 1:
        def optimize_one_epoch(inputs, target):
            """One epoch single-batch optimization."""
            with tf.GradientTape() as tape:
                loss = loss_fn(target, model(inputs))
                if l2_reg > 0.0:
                    loss += regularizer(linear_variables)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply(grads, model.trainable_variables)
            return loss
    else:
        def optimize_one_epoch(inputs, target):
            """One epoch optimization."""
            loss = tf.constant(0.0, tf.float32)
            for batch in get_batch_indices(n, batch_size):
                x_batch = graph_utils.get_graphs_tf(inputs, batch)
                y_batch = tf.gather(target, batch)
                with tf.GradientTape() as tape:
                    batch_loss = loss_fn(y_batch, model(x_batch))
                    if l2_reg > 0.0:
                        batch_loss += regularizer(linear_variables)

                grads = tape.gradient(batch_loss, model.trainable_variables)
                optimizer.apply(grads, model.trainable_variables)
                loss += batch_loss
            return loss / n_batches

    return tf.function(optimize_one_epoch, input_signature=input_signature)
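# Hypothetical call site for the factory above; `graphs`, `labels` and `gnn`
# are placeholders for a GraphsTuple, a target array and an snt.Module:
#
#     opt_epoch = make_tf_opt_epoch_fn(graphs, labels, batch_size=32, model=gnn,
#                                      optimizer=snt.optimizers.Adam(1e-3),
#                                      loss_fn=tf.keras.losses.MeanSquaredError())
#     for epoch in range(10):
#         epoch_loss = opt_epoch(graphs, tf.convert_to_tensor(labels))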
Example #26
0
def run(experiment_name: str, run_name: str, config: Config) -> None:
    mlflow.set_experiment(experiment_name)
    train, val, test = __load_data(config.path, config.window_size,
                                   config.batch_size)

    model = __model_fn(config.window_size, config.hidden_size, config.dropout)
    loss_fn = losses.WeightedBinaryCrossEntropy(config.pos_weight)
    optimizer = __optimizer_fn(config.learning_rate)

    input_spec = (tf.TensorSpec((None, config.window_size, len(SENSORS)),
                                dtype=tf.float32),
                  tf.TensorSpec((None, 1), dtype=tf.float32))

    ckpt = tf.train.Checkpoint(model=model, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              config.output,
                                              max_to_keep=5)

    train_loss = tf.keras.metrics.Mean(name='loss')
    train_metrics = __training_metrics_fn()
    train_step = tf.function(partial(__train_step,
                                     model=model,
                                     optimizer=optimizer,
                                     loss_fn=loss_fn,
                                     loss=train_loss,
                                     metrics=train_metrics),
                             input_signature=input_spec)

    val_loss = tf.keras.metrics.Mean(name='loss')
    val_metrics = __training_metrics_fn()
    val_step = tf.function(partial(__validation_step,
                                   model=model,
                                   loss_fn=loss_fn,
                                   loss=val_loss,
                                   metrics=val_metrics),
                           input_signature=input_spec)

    eval_state = tf.zeros((2, 2), dtype=tf.int32)
    eval_metrics = __evaluation_metrics_fn()
    eval_step = partial(__evaluation_step, model=model, metrics=eval_metrics)

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(config._asdict())

        # Fitting
        for epoch in range(1, config.epochs + 1):
            train_loss.reset_states()
            for metrics in train_metrics:
                metrics.reset_states()

            val_loss.reset_states()
            for metric in val_metrics:
                metric.reset_states()

            # Training
            for X, y in train:
                train_step(X, y)

            mlflow.log_metric(train_loss.name,
                              train_loss.result().numpy(),
                              step=epoch)
            mlflow.log_metrics(
                {
                    metric.name: metric.result().numpy()
                    for metric in train_metrics
                },
                step=epoch)

            # Validation
            for X, y in val:
                val_step(X, y)

            mlflow.log_metric(f'val_{val_loss.name}',
                              val_loss.result().numpy(),
                              step=epoch)
            mlflow.log_metrics(
                {
                    f'val_{metric.name}': metric.result().numpy()
                    for metric in val_metrics
                },
                step=epoch)

            # Checkpoint
            if epoch % config.checkpoint_rate == 0:
                ckpt_manager.save()

        # Evaluation
        def evaluate(confusion_matrix, client):
            # Reset PR-AUC and Accuracy metrics
            eval_metrics[0].reset_states()
            eval_metrics[3].reset_states()

            results = test[client].reduce(eval_state, eval_step)
            mlflow.log_metric(f'client_{client}_val_auc',
                              eval_metrics[0].result().numpy())
            mlflow.log_metric(f'client_{client}_val_acc',
                              eval_metrics[3].result().numpy())

            return confusion_matrix + results

        confusion_matrix = reduce(evaluate, test.clients,
                                  tf.zeros((2, 2), dtype=tf.int32))

        # Confusion matrix
        fig, ax = plt.subplots(figsize=(16, 8))

        sns.heatmap(confusion_matrix,
                    annot=True,
                    fmt='d',
                    cmap=sns.color_palette("Blues"),
                    ax=ax)

        ax.set_xlabel('Predicted')
        ax.set_ylabel('Ground Truth')

        mlflow.log_figure(fig, 'confusion_matrix.png')
        plt.close(fig)

        # Precision Recall
        fig, ax = plt.subplots(figsize=(16, 8))
        sns.lineplot(x=eval_metrics[2].result().numpy(),
                     y=eval_metrics[1].result().numpy(),
                     ax=ax)

        ax.set_xlabel('Recall')
        ax.set_xlim(0., 1.)

        ax.set_ylabel('Precision')
        ax.set_ylim(0., 1.)

        mlflow.log_figure(fig, 'precision_recall.png')
        plt.close(fig)
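# A hypothetical __evaluation_step compatible with the reduce() calls above:
# it updates the evaluation metrics and accumulates a 2x2 confusion matrix.
def __evaluation_step(state, batch, model, metrics):
    X, y = batch
    y_pred = model(X, training=False)
    for metric in metrics:
        metric.update_state(y, y_pred)
    cm = tf.math.confusion_matrix(
        labels=tf.cast(tf.reshape(y, [-1]), tf.int32),
        predictions=tf.cast(tf.reshape(y_pred, [-1]) > 0.5, tf.int32),
        num_classes=2)
    return state + cm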
Example #27
0
 def __call__(self):
     """
     Assigns the values of the parameters of the main network to the
     parameters of the target network
     """
     tf.function(self._update_target_vars())
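# Note: tf.function(self._update_target_vars()) above calls the method eagerly
# and wraps its (None) return value; wrapping the bound method itself, e.g.
# tf.function(self._update_target_vars)(), is usually what is intended.
# The wrapped helper is not shown; a hypothetical _update_target_vars, assuming
# parallel lists of tf.Variable on the instance, might look like:
def _update_target_vars(self):
    for main_var, target_var in zip(self.main_vars, self.target_vars):
        target_var.assign(main_var)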
Example #28
0
def run_customized_training_loop(
        # pylint: disable=invalid-name
        _sentinel=None,
        # pylint: enable=invalid-name
        strategy=None,
        model_fn=None,
        loss_fn=None,
        model_dir=None,
        train_input_fn=None,
        steps_per_epoch=None,
        steps_per_loop=1,
        epochs=1,
        eval_input_fn=None,
        eval_steps=None,
        metric_fn=None,
        init_checkpoint=None,
        use_remote_tpu=False,
        custom_callbacks=None,
        run_eagerly=False):
    """Run BERT pretrain model training using low-level API.

  Arguments:
      _sentinel: Used to prevent positional parameters. Internal, do not use.
      strategy: Distribution strategy on which to run low level training loop.
      model_fn: Function that returns a tuple (model, sub_model). Caller of this
        function should add optimizer to the `model` via calling
        `model.compile()` API or manually setting `model.optimizer` attribute.
        Second element of the returned tuple(sub_model) is an optional sub model
        to be used for initial checkpoint -- if provided.
      loss_fn: Function with signature func(labels, logits) and returns a loss
        tensor.
      model_dir: Model directory used during training for restoring/saving model
        weights.
      train_input_fn: Function that returns a tf.data.Dataset used for training.
      steps_per_epoch: Number of steps to run per epoch. At the end of each
        epoch, model checkpoint will be saved and evaluation will be conducted
        if evaluation dataset is provided.
      steps_per_loop: Number of steps per graph-mode loop. In order to reduce
        communication in eager context, training logs are printed every
        steps_per_loop.
      epochs: Number of epochs to train.
      eval_input_fn: Function that returns evaluation dataset. If none,
        evaluation is skipped.
      eval_steps: Number of steps to run evaluation. Required if `eval_input_fn`
        is not none.
      metric_fn: A metrics function that returns a Keras Metric object to record
        evaluation result using evaluation dataset or with training dataset
        after every epoch.
      init_checkpoint: Optional checkpoint to load to `sub_model` returned by
        `model_fn`.
      use_remote_tpu: Ignored, will be removed in the future.
      custom_callbacks: A list of Keras Callbacks objects to run during
        training. More specifically, `on_batch_begin()`, `on_batch_end()`,
        methods are invoked during training.
      run_eagerly: Whether to run model training in pure eager execution. This
        should be disabled for TPUStrategy.

  Returns:
      Trained model.

  Raises:
      ValueError: (1) When model returned by `model_fn` does not have optimizer
        attribute or when required parameters are set to none. (2) eval args are
        not specified correctly. (3) metric_fn must be a callable if specified.
  """
    # TODO(bfontain): Remove use_remote_tpu once there are no models using it.
    del use_remote_tpu

    if _sentinel is not None:
        raise ValueError('only call `run_customized_training_loop()` '
                         'with named arguments.')

    required_arguments = [
        strategy, model_fn, loss_fn, model_dir, steps_per_epoch, train_input_fn
    ]
    if [arg for arg in required_arguments if arg is None]:
        raise ValueError('`strategy`, `model_fn`, `loss_fn`, `model_dir`, '
                         '`steps_per_loop` and `steps_per_epoch` are required '
                         'parameters.')
    if steps_per_loop > steps_per_epoch:
        logging.error(
            'steps_per_loop: %d is specified to be greater than '
            ' steps_per_epoch: %d, we will use steps_per_epoch as'
            ' steps_per_loop.', steps_per_loop, steps_per_epoch)
        steps_per_loop = steps_per_epoch
    assert tf.executing_eagerly()

    if run_eagerly:
        if steps_per_loop > 1:
            raise ValueError(
                'steps_per_loop is used for performance optimization. When you want '
                'to run eagerly, you cannot leverage graph mode loop.')
        if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
            raise ValueError(
                'TPUStrategy should not run eagerly as it heavily relies on graph'
                ' optimization for the distributed system.')

    if eval_input_fn and (eval_steps is None or metric_fn is None):
        raise ValueError(
            '`eval_steps` and `metric_fn` are required when `eval_input_fn` '
            'is not none.')
    if metric_fn and not callable(metric_fn):
        raise ValueError(
            'if `metric_fn` is specified, metric_fn must be a callable.')

    total_training_steps = steps_per_epoch * epochs

    # To reduce unnecessary send/receive input pipeline operation, we place input
    # pipeline ops in worker task.
    train_iterator = _get_input_iterator(train_input_fn, strategy)

    with distribution_utils.get_strategy_scope(strategy):
        # To correctly place the model weights on accelerators,
        # model and optimizer should be created in scope.
        model, sub_model = model_fn()
        if not hasattr(model, 'optimizer'):
            raise ValueError('User should set optimizer attribute to model '
                             'inside `model_fn`.')
        optimizer = model.optimizer
        use_float16 = isinstance(
            optimizer,
            tf.keras.mixed_precision.experimental.LossScaleOptimizer)

        if init_checkpoint:
            logging.info(
                'Checkpoint file %s found and restoring from '
                'initial checkpoint for core model.', init_checkpoint)
            checkpoint = tf.train.Checkpoint(model=sub_model)
            checkpoint.restore(init_checkpoint).assert_consumed()
            logging.info('Loading from checkpoint file completed')

        train_loss_metric = tf.keras.metrics.Mean('training_loss',
                                                  dtype=tf.float32)
        eval_metrics = [metric_fn()] if metric_fn else []
        # If evaluation is required, make a copy of metric as it will be used by
        # both train and evaluation.
        train_metrics = [
            metric.__class__.from_config(metric.get_config())
            for metric in eval_metrics
        ]

        # Create summary writers
        eval_summary_writer = tf.summary.create_file_writer(
            os.path.join(model_dir, 'summaries/eval'))
        if steps_per_loop >= _MIN_SUMMARY_STEPS:
            # Only writes summary when the stats are collected sufficiently over
            # enough steps.
            train_summary_writer = tf.summary.create_file_writer(
                os.path.join(model_dir, 'summaries/train'))
        else:
            train_summary_writer = None

        # Collects training variables.
        training_vars = model.trainable_variables

        def _replicated_step(inputs):
            """Replicated training step."""

            inputs, labels = inputs
            with tf.GradientTape() as tape:
                model_outputs = model(inputs, training=True)
                loss = loss_fn(labels, model_outputs)
                if use_float16:
                    scaled_loss = optimizer.get_scaled_loss(loss)

            if use_float16:
                scaled_grads = tape.gradient(scaled_loss, training_vars)
                grads = optimizer.get_unscaled_gradients(scaled_grads)
            else:
                grads = tape.gradient(loss, training_vars)
            optimizer.apply_gradients(zip(grads, training_vars))
            # For reporting, the metric takes the mean of losses.
            train_loss_metric.update_state(loss)
            for metric in train_metrics:
                metric.update_state(labels, model_outputs)

        @tf.function
        def train_steps(iterator, steps):
            """Performs distributed training steps in a loop.

      Args:
        iterator: the distributed iterator of training datasets.
        steps: an tf.int32 integer tensor to specify number of steps to run
          inside host training loop.

      Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
      """
            if not isinstance(steps, tf.Tensor):
                raise ValueError(
                    'steps should be a Tensor. A Python object may cause '
                    'retracing.')

            for _ in tf.range(steps):
                strategy.experimental_run_v2(_replicated_step,
                                             args=(next(iterator), ))

        def train_single_step(iterator):
            """Performs a distributed training step.

      Args:
        iterator: the distributed iterator of training datasets.

      Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
      """
            strategy.experimental_run_v2(_replicated_step,
                                         args=(next(iterator), ))

        def test_step(iterator):
            """Calculates evaluation metrics on distributed devices."""
            def _test_step_fn(inputs):
                """Replicated accuracy calculation."""

                inputs, labels = inputs
                model_outputs = model(inputs, training=False)
                for metric in eval_metrics:
                    metric.update_state(labels, model_outputs)

            strategy.experimental_run_v2(_test_step_fn,
                                         args=(next(iterator), ))

        if not run_eagerly:
            train_single_step = tf.function(train_single_step)
            test_step = tf.function(test_step)

        def _run_evaluation(current_training_step, test_iterator):
            """Runs validation steps and aggregate metrics."""
            for _ in range(eval_steps):
                test_step(test_iterator)

            with eval_summary_writer.as_default():
                for metric in eval_metrics + model.metrics:
                    metric_value = _float_metric_value(metric)
                    logging.info('Step: [%d] Validation %s = %f',
                                 current_training_step, metric.name,
                                 metric_value)
                    tf.summary.scalar(metric.name,
                                      metric_value,
                                      step=current_training_step)
                eval_summary_writer.flush()

        def _run_callbacks_on_batch_begin(batch):
            """Runs custom callbacks at the start of every step."""
            if not custom_callbacks:
                return
            for callback in custom_callbacks:
                callback.on_batch_begin(batch)

        def _run_callbacks_on_batch_end(batch):
            """Runs custom callbacks at the end of every step."""
            if not custom_callbacks:
                return
            for callback in custom_callbacks:
                callback.on_batch_end(batch)

        # Training loop starts here.
        checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
        latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
        if latest_checkpoint_file:
            logging.info(
                'Checkpoint file %s found and restoring from '
                'checkpoint', latest_checkpoint_file)
            checkpoint.restore(latest_checkpoint_file)
            logging.info('Loading from checkpoint file completed')

        current_step = optimizer.iterations.numpy()
        checkpoint_name = 'ctl_step_{step}.ckpt'

        while current_step < total_training_steps:
            # Training loss/metrics take an average over the steps inside the
            # micro training loop. We reset their values before each round.
            train_loss_metric.reset_states()
            for metric in train_metrics + model.metrics:
                metric.reset_states()

            _run_callbacks_on_batch_begin(current_step)
            # Runs several steps in the host while loop.
            steps = _steps_to_run(current_step, steps_per_epoch,
                                  steps_per_loop)

            if steps == 1:
                # TODO(zongweiz): merge with train_steps once tf.while_loop
                # GPU performance bugs are fixed.
                train_single_step(train_iterator)
            else:
                # Converts steps to a Tensor to avoid tf.function retracing.
                train_steps(train_iterator,
                            tf.convert_to_tensor(steps, dtype=tf.int32))
            _run_callbacks_on_batch_end(current_step)
            current_step += steps

            train_loss = _float_metric_value(train_loss_metric)
            # Updates training logging.
            training_status = 'Train Step: %d/%d  / loss = %s' % (
                current_step, total_training_steps, train_loss)

            if train_summary_writer:
                with train_summary_writer.as_default():
                    tf.summary.scalar(train_loss_metric.name,
                                      train_loss,
                                      step=current_step)
                    for metric in train_metrics + model.metrics:
                        metric_value = _float_metric_value(metric)
                        training_status += '  %s = %f' % (metric.name,
                                                          metric_value)
                        tf.summary.scalar(metric.name,
                                          metric_value,
                                          step=current_step)
                    train_summary_writer.flush()
            logging.info(training_status)

            # Saves model checkpoints and run validation steps at every epoch end.
            if current_step % steps_per_epoch == 0:
                # To avoid repeated model saving, we do not save after the last
                # step of training.
                if current_step < total_training_steps:
                    _save_checkpoint(checkpoint, model_dir,
                                     checkpoint_name.format(step=current_step))

                if eval_input_fn:
                    logging.info('Running evaluation after step: %s.',
                                 current_step)
                    _run_evaluation(
                        current_step,
                        _get_input_iterator(eval_input_fn, strategy))
                    # Re-initialize evaluation metric.
                    for metric in eval_metrics + model.metrics:
                        metric.reset_states()

        _save_checkpoint(checkpoint, model_dir,
                         checkpoint_name.format(step=current_step))

        if eval_input_fn:
            logging.info(
                'Running final evaluation after training is complete.')
            _run_evaluation(current_step,
                            _get_input_iterator(eval_input_fn, strategy))

        training_summary = {
            'total_training_steps': total_training_steps,
            'train_loss': _float_metric_value(train_loss_metric),
        }
        if eval_metrics:
            # TODO(hongkuny): Cleans up summary reporting in text.
            training_summary['last_train_metrics'] = _float_metric_value(
                train_metrics[0])
            training_summary['eval_metrics'] = _float_metric_value(
                eval_metrics[0])

        write_txt_summary(training_summary, model_dir)

        return model
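# _steps_to_run is referenced above but not defined in this excerpt. A sketch
# consistent with how it is called (never run past the epoch boundary, never
# more than steps_per_loop steps at a time) might look like:
def _steps_to_run(current_step, steps_per_epoch, steps_per_loop):
    """Returns how many steps the next inner loop should execute."""
    if steps_per_loop <= 0:
        raise ValueError('steps_per_loop should be a positive integer.')
    if steps_per_loop == 1:
        return steps_per_loop
    remainder_in_epoch = current_step % steps_per_epoch
    if remainder_in_epoch != 0:
        return min(steps_per_epoch - remainder_in_epoch, steps_per_loop)
    return min(steps_per_epoch, steps_per_loop)

# Similarly, _float_metric_value presumably just unwraps a Keras metric:
def _float_metric_value(metric):
    return metric.result().numpy().astype(float)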
Example #29
0
def benchmark(
    gen_displace, N, num, batchsize, dataset=False, graph=False, check_err=True
):
    """Runs a randomized benchmark of displacement operator generation, D(alpha)

    Args:
        gen_displace (callable): Factory that takes N and returns a function
            mapping a batch of alphas to displacement operators.
        N (int): Dimension of Hilbert space
        num (int): Total number of alphas to benchmark
        batchsize (int): Number of alphas per batch (only used with Dataset)
        dataset (bool, optional): Enable tf.data.Dataset API. Defaults to False.
        graph (bool, optional): Enable tf.function compilation. Defaults to False.
        check_err (bool, optional): Check error against analytic coherent state. Defaults to True.

    Returns:
        (float x 4): (total time per alpha, loop time per alpha, mean error, max error)
    """
    alphas = random_alphas(num)

    # Dataset API adds some overhead, we can compare to the in-memory case
    if dataset:
        b_alphas = tf.data.Dataset.from_tensor_slices(alphas).batch(batchsize)

    # Initialize the constants used in the displacement generation
    # For the direct expm method, this is just creating a, a_dag
    # For BCH, we diagonalize the q, p operators
    start_time = time.perf_counter()
    f = gen_displace(N)
    init_time = time.perf_counter() - start_time

    if graph:  # Enable tf.function and tf.autograph
        f = tf.function(f)
        f = f.get_concrete_function(tf.TensorSpec(shape=[num], dtype=tf.complex64))

    # Repeat batch 3x to iron out timing fluctuations
    repeat_times = []
    for _ in range(3):
        if dataset:  # Loop through each Dataset batch
            start_time = time.perf_counter()
            for l_alpha in b_alphas:  # Be careful of memory limitations here
                results = f(l_alpha)  # Calculate error of last batch only
                alphas = l_alpha
            loop_time = time.perf_counter() - start_time
        else:
            start_time = time.perf_counter()
            results = f(alphas)
            loop_time = time.perf_counter() - start_time
        repeat_times.append(loop_time)

    # We take the minimum time from above. This is typically representative of
    # a lower bound, as higher times are often caused by other processes
    # interfering with timing accuracy. See Python's timeit.repeat docs.
    total_time = (min(repeat_times) + init_time) / num
    loop_time = min(repeat_times) / num

    if check_err:
        mean_err, max_err = err_checks.coeff_err(results, alphas)
    else:
        mean_err, max_err = float("inf"), float("inf")

    return total_time, loop_time, mean_err, max_err
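# Hypothetical invocation; `make_displacement_generator` stands in for one of
# the project's generator factories (e.g. an expm- or BCH-based one):
#
#     total, per_loop, mean_err, max_err = benchmark(
#         make_displacement_generator, N=100, num=1000, batchsize=1000, graph=True)
#     print('time per alpha: {:.3e} s, max error: {:.2e}'.format(per_loop, max_err))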
    def custom_loop(self, train_dist_dataset, test_dist_dataset, strategy):
        """Custom training and testing loop.
    Args:
      train_dist_dataset: Training dataset created using strategy.
      test_dist_dataset: Testing dataset created using strategy.
      strategy: Distribution strategy.
    Returns:
      train_loss, train_accuracy, test_loss, test_accuracy
    """
        def distributed_train_epoch(ds, epoch_num):
            total_loss = 0.0
            num_train_batches = 0.0
            for one_batch in ds:

                start = time.time()
                per_replica_loss = strategy.experimental_run_v2(
                    self.train_step, args=(one_batch, ))
                current_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                               per_replica_loss,
                                               axis=None)
                total_loss += current_loss
                num_train_batches += 1
                self.iter_num += 1
                time_cost_per_batch = time.time() - start

                images_per_sec = cfg.TRAIN.batch_size / time_cost_per_batch

                if self.iter_num % cfg.TRAIN.log_interval == 0:
                    logger.info('epoch_num: %d, '
                                'iter_num: %d, '
                                'loss_value: %.6f,  '
                                'speed: %d images/sec ' %
                                (epoch_num, self.iter_num, current_loss,
                                 images_per_sec))

            return total_loss, num_train_batches

        def distributed_test_epoch(ds, epoch_num):
            total_loss = 0.
            num_test_batches = 0.0
            for one_batch in ds:
                per_replica_loss = strategy.experimental_run_v2(
                    self.test_step, args=(one_batch, ))

                current_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                               per_replica_loss,
                                               axis=None)
                total_loss += current_loss
                num_test_batches += 1
            return total_loss, num_test_batches

        if self.enable_function:
            distributed_train_epoch = tf.function(distributed_train_epoch)
            distributed_test_epoch = tf.function(distributed_test_epoch)

        for epoch in range(self.epochs):

            start = time.time()
            self.optimizer.learning_rate = self.decay(epoch)

            train_total_loss, num_train_batches = distributed_train_epoch(
                train_dist_dataset, epoch)
            test_total_loss, num_test_batches = distributed_test_epoch(
                test_dist_dataset, epoch)

            time_consume_per_epoch = time.time() - start
            training_message = 'Epoch: %d, ' \
                               'Train Loss: %.6f, ' \
                               'Test Loss: %.6f, ' \
                               'Time consumed: %.2f' % (epoch,
                                                        train_total_loss / num_train_batches,
                                                        test_total_loss / num_test_batches,
                                                        time_consume_per_epoch)

            logger.info(training_message)

            #### save the model at the end of every epoch
            current_model_saved_name = os.path.join(
                cfg.MODEL.model_path, 'epoch_%d_val_loss%.6f' %
                (epoch, test_total_loss / num_test_batches))

            logger.info('A model saved to %s' % current_model_saved_name)

            if not os.access(cfg.MODEL.model_path, os.F_OK):
                os.mkdir(cfg.MODEL.model_path)

            tf.saved_model.save(self.model, current_model_saved_name)

        return (train_total_loss / num_train_batches,
                test_total_loss / num_test_batches)
Example #31
0
    def __init__(self,
                 layer_sizes: Sequence[int],
                 input_size: int,
                 num_classes: int = 2,
                 context_map_size: int = 4,
                 bias: bool = True,
                 context_bias: bool = False,
                 base_predictor: Optional[Callable[[np.ndarray],
                                                   np.ndarray]] = None,
                 learning_rate: Union[float, DynamicParameter] = 1e-3,
                 pred_clipping: float = 1e-3,
                 weight_clipping: float = 5.0):

        tf.Module.__init__(self, name='GLN')
        GLNBase.__init__(self, layer_sizes, input_size, num_classes,
                         context_map_size, bias, context_bias, base_predictor,
                         learning_rate, pred_clipping, weight_clipping)

        # Learning rate as dynamic parameter
        if self.learning_rate == 'paper':
            self.learning_rate = PaperLearningRate(name='learning_rate')
        else:
            self.learning_rate = ConstantParameter(self.learning_rate,
                                                   name='learning_rate')

        # Initialize layers
        self.layers = list()
        previous_size = self.base_pred_size
        for size in (self.layer_sizes + (1, )):
            self.layers.append(
                Linear(size=size,
                       input_size=previous_size,
                       context_size=self.input_size,
                       context_map_size=self.context_map_size,
                       num_classes=self.num_classes,
                       learning_rate=self.learning_rate,
                       pred_clipping=self.pred_clipping,
                       weight_clipping=self.weight_clipping,
                       bias=self.bias,
                       context_bias=self.context_bias))
            previous_size = size

        # TF-compiled predict function
        self._tf_predict = tf.function(
            func=self._predict,
            input_signature=[
                tf.TensorSpec(shape=(None, self.base_pred_size),
                              dtype=tf.dtypes.float32),
                tf.TensorSpec(shape=(None, self.input_size),
                              dtype=tf.dtypes.float32)
            ],
            autograph=False)

        # TF-compiled update function
        self.target_dtype = tf.dtypes.int64
        self._tf_update = tf.function(
            func=self._update,
            input_signature=[
                tf.TensorSpec(shape=(None, self.base_pred_size),
                              dtype=tf.dtypes.float32),
                tf.TensorSpec(shape=(None, self.input_size),
                              dtype=tf.dtypes.float32),
                tf.TensorSpec(shape=(None, ), dtype=self.target_dtype)
            ],
            autograph=False)
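Both compiled functions above use an explicit input_signature so tf.function traces once and accepts any batch size, instead of retracing for every new shape. A minimal self-contained sketch of that pattern (the names and shapes are illustrative, not part of the GLN code):

import tensorflow as tf

# Illustrative only: a fixed input_signature pins dtype and rank, so the
# function is traced a single time and reused for all batch sizes.
@tf.function(input_signature=[tf.TensorSpec(shape=(None, 4), dtype=tf.float32)])
def predict(x):
    return tf.nn.sigmoid(tf.reduce_sum(x, axis=-1))

predict(tf.zeros((2, 4)))   # traced here
predict(tf.zeros((16, 4)))  # reuses the same concrete function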
Example #32
0
    def __init__(
        self,
        model,
        features_file,
        labels_file,
        batch_size,
        batch_type="examples",
        length_bucket_width=None,
        scorers=None,
        save_predictions=False,
        early_stopping=None,
        model_dir=None,
        export_on_best=None,
        exporter=None,
        max_exports_to_keep=5,
    ):
        """Initializes the evaluator.

        Args:
          model: A :class:`opennmt.models.Model` to evaluate.
          features_file: Path to the evaluation features.
          labels_file: Path to the evaluation labels.
          batch_size: The evaluation batch size.
          batch_type: The batching strategy to use: can be "examples" or "tokens".
          length_bucket_width: The width of the length buckets to select batch
            candidates from (for efficiency). Set ``None`` to not constrain batch
            formation.
          scorers: A list of scorers, callables taking the path to the reference and
            the hypothesis and returning one or more scores.
          save_predictions: Save evaluation predictions to a file. This is ``True``
            when :obj:`scorers` is set.
          early_stopping: An ``EarlyStopping`` instance.
          model_dir: The active model directory.
          export_on_best: Export a model when this evaluation metric has the
            best value so far.
          exporter: A :class:`opennmt.utils.Exporter` instance to export the model.
            Defaults to :class:`opennmt.utils.SavedModelExporter`.
          max_exports_to_keep: Maximum number of exports to keep. Older exports will
            be garbage collected. Set to ``None`` to keep all exports.

        Raises:
          ValueError: If :obj:`save_predictions` is set but the model is not compatible.
          ValueError: If :obj:`save_predictions` is set but :obj:`model_dir` is ``None``.
          ValueError: If :obj:`export_on_best` is set but :obj:`model_dir` is ``None``.
          ValueError: If the :obj:`early_stopping` configuration is invalid.
        """
        if model_dir is not None:
            export_dir = os.path.join(model_dir, "export")
            eval_dir = os.path.join(model_dir, "eval")
        else:
            if save_predictions:
                raise ValueError(
                    "Saving evaluation predictions requires model_dir to be set"
                )
            if export_on_best is not None:
                raise ValueError("Exporting models requires model_dir to be set")
            export_dir = None
            eval_dir = None

        if scorers is None:
            scorers = []
        if scorers:
            save_predictions = True
        if save_predictions:
            if model.unsupervised:
                raise ValueError(
                    "This model does not support saving evaluation predictions"
                )
            if not tf.io.gfile.exists(eval_dir):
                tf.io.gfile.makedirs(eval_dir)
        self._model = model
        self._labels_file = labels_file
        self._save_predictions = save_predictions
        self._scorers = scorers
        self._eval_dir = eval_dir
        self._metrics_history = []
        if eval_dir is not None:
            self._summary_writer = tf.summary.create_file_writer(eval_dir)
            summaries = misc.read_summaries(eval_dir)
            for step, values in summaries:
                metrics = misc.extract_prefixed_keys(values, _SUMMARIES_SCOPE + "/")
                self._metrics_history.append((step, metrics))
        else:
            self._summary_writer = tf.summary.create_noop_writer()
        dataset = model.examples_inputter.make_evaluation_dataset(
            features_file,
            labels_file,
            batch_size,
            batch_type=batch_type,
            length_bucket_width=length_bucket_width,
            num_threads=1,
            prefetch_buffer_size=1,
        )

        self._eval_fn = tf.function(
            model.evaluate, input_signature=dataset.element_spec
        )
        self._dataset = dataset

        self._metrics_name = {"loss", "perplexity"}
        for scorer in self._scorers:
            self._metrics_name.update(scorer.scores_name)
        model_metrics = self._model.get_metrics()
        if model_metrics:
            self._metrics_name.update(set(model_metrics.keys()))

        if early_stopping is not None:
            if early_stopping.metric not in self._metrics_name:
                raise ValueError(
                    "Invalid early stopping metric '%s', expected one in %s"
                    % (early_stopping.metric, str(self._metrics_name))
                )
            if early_stopping.steps <= 0:
                raise ValueError("Early stopping steps should greater than 0")
        self._early_stopping = early_stopping

        self._export_on_best = export_on_best
        self._exporter = exporter
        self._export_dir = export_dir
        self._max_exports_to_keep = max_exports_to_keep
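The key line above wraps model.evaluate in tf.function with dataset.element_spec as the input signature, so the evaluation graph is traced once for the dataset's exact batch structure. A small self-contained sketch of that pattern with a toy dataset (illustrative only, not OpenNMT's actual API):

import tensorflow as tf

# Toy stand-in for model.evaluate; the real signature comes from the dataset.
def evaluate(features, labels):
    logits = tf.reduce_mean(features, axis=-1)
    return tf.reduce_mean(tf.abs(tf.cast(labels, tf.float32) - logits))

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform((32, 8)),
     tf.random.uniform((32,), maxval=2, dtype=tf.int32))).batch(4)

# element_spec supplies the TensorSpec of every batch component, so the
# wrapped function is traced exactly once.
eval_fn = tf.function(evaluate, input_signature=dataset.element_spec)
for features, labels in dataset:
    loss = eval_fn(features, labels)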
Example #33
0
import os
import random
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

keras_model_path = os.path.join('.', 'Models', 'fsw_best_model')
model = tf.keras.models.load_model(keras_model_path)

run_model = tf.function(lambda x: model(x))
# This is important, let's fix the input size.
concrete_func = run_model.get_concrete_function(
    tf.TensorSpec([1, 60, 7, 1], model.inputs[0].dtype))

# model directory.
MODEL_DIR = os.path.join('.', 'Models', 'Saved_Model')
model.save(MODEL_DIR, save_format="tf", signatures=concrete_func)

converter = tf.lite.TFLiteConverter.from_saved_model(MODEL_DIR)
tflite_model = converter.convert()

if not os.path.exists('./tflite_models'):
    os.mkdir('./tflite_models')

with open('./tflite_models/keras_tflite', 'wb') as f:
    f.write(tflite_model)

interpreter = tf.lite.Interpreter(model_content=tflite_model)
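The snippet stops after constructing the interpreter. As a hedged continuation (the float32 dtype and the [1, 60, 7, 1] shape follow the concrete function fixed above), running a single inference would look roughly like this:

# Continuation sketch: run one inference with the interpreter created above.
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

sample = np.random.rand(1, 60, 7, 1).astype(np.float32)  # input dtype assumed float32
interpreter.set_tensor(input_details[0]['index'], sample)
interpreter.invoke()
prediction = interpreter.get_tensor(output_details[0]['index'])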
Example #34
0
def _run(func, *args, **kwargs):
    """funcをグラフモードで実行する。"""
    return tf.function(func)(*args, **kwargs)
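A usage sketch for the helper above (arguments are illustrative): the callable is wrapped in tf.function and immediately invoked, so each call pays the tracing cost.

import tensorflow as tf

# Illustrative call: traces and runs the lambda in graph mode once.
result = _run(lambda a, b: a + b, tf.constant(1.0), tf.constant(2.0))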
Example #35
0
    def test_all_tf_functions_work_together_high_threshold(self):
        clients = 3
        num_sub_rounds = 4
        max_rounds = 6
        max_num_prefixes = 3
        threshold = 100
        max_user_contribution = 100
        roots = (string.ascii_lowercase + string.digits + "'@#-;*:./" +
                 triehh_tf.DEFAULT_TERMINATOR)
        possible_prefix_extensions = list(roots)
        possible_prefix_extensions_num = len(possible_prefix_extensions)
        possible_prefix_extensions = tf.constant(possible_prefix_extensions,
                                                 dtype=tf.string)

        server_state = triehh_tf.ServerState(
            discovered_heavy_hitters=tf.constant([], dtype=tf.string),
            heavy_hitter_frequencies=tf.constant([], dtype=tf.float64),
            discovered_prefixes=tf.constant([''], dtype=tf.string),
            round_num=tf.constant(0, dtype=tf.int32),
            accumulated_votes=tf.zeros(
                dtype=tf.int32,
                shape=[max_num_prefixes, possible_prefix_extensions_num]),
            accumulated_weights=tf.constant(0, dtype=tf.int32))

        def create_dataset_fn(client_id):
            del client_id
            return tf.data.Dataset.from_tensor_slices(['hello', 'hey', 'hi'])

        client_ids = list(range(100))

        client_data = tff.simulation.ClientData.from_clients_and_fn(
            client_ids=client_ids,
            create_tf_dataset_for_client_fn=create_dataset_fn)

        for round_num in range(max_rounds * num_sub_rounds):
            sampled_clients = list(range(clients))
            sampled_datasets = [
                client_data.create_tf_dataset_for_client(client_id)
                for client_id in sampled_clients
            ]
            accumulated_votes = tf.zeros(
                dtype=tf.int32,
                shape=[max_num_prefixes, possible_prefix_extensions_num])
            accumulated_weights = tf.constant(0, dtype=tf.int32)

            # This is a workaround to clear the graph cache in the `tf.function`; this
            # is necessary because we need to construct a new lookup table every round
            # based on new prefixes.
            client_update = tf.function(
                triehh_tf.client_update.python_function)

            for dataset in sampled_datasets:
                client_output = client_update(
                    dataset, server_state.discovered_prefixes,
                    possible_prefix_extensions, round_num,
                    tf.constant(num_sub_rounds),
                    tf.constant(max_num_prefixes, dtype=tf.int32),
                    tf.constant(max_user_contribution, dtype=tf.int32))
                accumulated_votes += client_output.client_votes
                accumulated_weights += client_output.client_weight

            server_state = triehh_tf.server_update(
                server_state, possible_prefix_extensions, accumulated_votes,
                accumulated_weights, tf.constant(num_sub_rounds,
                                                 dtype=tf.int32),
                tf.constant(max_num_prefixes, dtype=tf.int32),
                tf.constant(threshold, dtype=tf.int32))

        expected_discovered_heavy_hitters = tf.constant([], dtype=tf.string)
        expected_heavy_hitter_frequencies = tf.constant([], dtype=tf.float64)
        expected_discovered_prefixes = tf.constant([], dtype=tf.string)

        self.assertSetAllEqual(server_state.discovered_heavy_hitters,
                               expected_discovered_heavy_hitters)
        self.assertHistogramsEqual(server_state.discovered_heavy_hitters,
                                   server_state.heavy_hitter_frequencies,
                                   expected_discovered_heavy_hitters,
                                   expected_heavy_hitter_frequencies)
        self.assertSetAllEqual(server_state.discovered_prefixes,
                               expected_discovered_prefixes)
Example #36
0
    def freeze_and_run_tf(self, func, feed_dict, outputs, as_session,
                          premade_placeholders, large_model, constant_fold):
        np.random.seed(1)  # Make it reproducible.
        clean_feed_dict = {utils.node_name(k): v for k, v in feed_dict.items()}
        if is_tf2() and not as_session:
            #
            # use eager to execute the tensorflow func
            #
            # numpy arrays don't work for all ops, so convert them to tf.Tensor
            input_tensors = [
                tf.TensorSpec(shape=v.shape,
                              dtype=tf.as_dtype(v.dtype),
                              name=utils.node_name(k))
                for k, v in feed_dict.items()
            ]
            input_list = [
                tf.convert_to_tensor(v,
                                     dtype=tf.as_dtype(v.dtype),
                                     name=utils.node_name(k))
                for k, v in feed_dict.items()
            ]
            tf.random.set_seed(1)
            result = func(*input_list)
            if isinstance(result, (list, tuple)):
                # list or tuple
                result = [x.numpy() for x in result]
            else:
                # single result
                result = [result.numpy()]

            # now turn the eager function into a graph
            concrete_func = tf.function(func,
                                        input_signature=tuple(input_tensors))
            concrete_func = concrete_func.get_concrete_function()
            graph_def = from_function(concrete_func,
                                      input_names=list(feed_dict.keys()),
                                      output_names=outputs,
                                      large_model=large_model)
            initialized_tables = None
        else:
            #
            # use graph to execute the tensorflow func
            #
            with tf_session() as sess:
                tf_set_random_seed(1)
                input_list = []
                if not premade_placeholders:
                    for k, v in clean_feed_dict.items():
                        input_list.append(
                            tf_placeholder(name=k,
                                           shape=v.shape,
                                           dtype=tf.as_dtype(v.dtype)))
                func(*input_list)
                variables_lib.global_variables_initializer().run()
                tf_tables_initializer().run()

                output_dict = []
                for out_name in outputs:
                    output_dict.append(sess.graph.get_tensor_by_name(out_name))
                result = sess.run(output_dict, feed_dict=feed_dict)
                graph_def = freeze_session(sess,
                                           input_names=list(feed_dict.keys()),
                                           output_names=outputs)
                table_names, key_dtypes, value_dtypes = get_hash_table_info(
                    graph_def)
                initialized_tables = {}
                for n, k_dtype, val_dtype in zip(table_names, key_dtypes,
                                                 value_dtypes):
                    h = lookup_ops.hash_table_v2(k_dtype,
                                                 val_dtype,
                                                 shared_name=n)
                    k, v = lookup_ops.lookup_table_export_v2(
                        h, k_dtype, val_dtype)
                    initialized_tables[n] = (sess.run(k), sess.run(v))

            tf_reset_default_graph()
            with tf_session() as sess:
                tf.import_graph_def(graph_def, name='')
                graph_def = tf_optimize(list(feed_dict.keys()),
                                        outputs,
                                        graph_def,
                                        fold_constant=constant_fold)

        model_path = os.path.join(
            self.test_data_directory,
            self._testMethodName + "_after_tf_optimize.pb")
        utils.save_protobuf(model_path, graph_def)
        self.logger.debug("created file  %s", model_path)
        return result, graph_def, initialized_tables
Example #37
0
def _create_eval_loop_fn(eval_step_fn, options: StandardEvaluatorOptions):
  if options.use_tf_function:
    eval_step_fn = tf.function(eval_step_fn)
  return loop_fns.create_loop_fn(eval_step_fn)
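The helper simply compiles the step function when options.use_tf_function is set before handing it to the loop builder. A self-contained sketch of that conditional-compile pattern (illustrative names, not the library's actual API):

import tensorflow as tf

def make_eval_loop(step_fn, use_tf_function=True):
    # Optionally compile the step into a graph; leave it eager for easier
    # debugging when use_tf_function is False.
    if use_tf_function:
        step_fn = tf.function(step_fn)

    def loop_fn(iterator, num_steps):
        for _ in range(num_steps):
            step_fn(next(iterator))

    return loop_fn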