Python GradientAccumulator.reset Examples

Programming Language: Python

Namespace/Package Name: transformers

Method/Function: reset

Examples at hotexamples.com: 4

Python GradientAccumulator.reset - 4 examples found. These are the top rated real world Python examples of transformers.GradientAccumulator.reset extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

GradientAccumulator(8)

reset(4)

Frequently Used Methods

GradientAccumulator (8)

reset (4)

Example #1

Show file

File: test_optimization_tf.py Project: manybodycpa/transformers-huggingface-fork

 def testGradientAccumulator(self):
     accumulator = GradientAccumulator()
     accumulator([tf.constant([1.0, 2.0])])
     accumulator([tf.constant([-2.0, 1.0])])
     accumulator([tf.constant([-1.0, 2.0])])
     with self.assertRaises(ValueError):
         accumulator([tf.constant([1.0, 1.0]), tf.constant([2.0, 2.0])])
     self.assertEqual(accumulator.step, 3)
     self.assertEqual(len(accumulator.gradients), 1)
     self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(),
                                [-2.0, 5.0],
                                tol=1e-2)
     accumulator.reset()
     self.assertEqual(accumulator.step, 0)
     self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(),
                                [0.0, 0.0],
                                tol=1e-2)

Example #2

Show file

File: optimization_tf_test.py Project: jingjingli01/TGLS

    def testGradientAccumulatorDistributionStrategy(self):
        context._context = None
        ops.enable_eager_execution_internal()
        physical_devices = tf.config.experimental.list_physical_devices("CPU")
        tf.config.experimental.set_virtual_device_configuration(
            physical_devices[0], [
                tf.config.experimental.VirtualDeviceConfiguration(),
                tf.config.experimental.VirtualDeviceConfiguration()
            ])

        devices = tf.config.experimental.list_logical_devices(
            device_type="CPU")
        strategy = tf.distribute.MirroredStrategy(
            devices=[device.name for device in devices])

        with strategy.scope():
            accumulator = GradientAccumulator()
            variable = tf.Variable([4.0, 3.0])
            optimizer = create_optimizer(5e-5, 10, 5)
            gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)

        def accumulate_on_replica(gradient):
            accumulator([gradient])

        def apply_on_replica():
            optimizer.apply_gradients(
                list(zip(accumulator.gradients, [variable])), 1.0)

        @tf.function
        def accumulate(grad1, grad2):
            with strategy.scope():
                gradient_placeholder.values[0].assign(grad1)
                gradient_placeholder.values[1].assign(grad2)
                strategy.experimental_run_v2(accumulate_on_replica,
                                             args=(gradient_placeholder, ))

        @tf.function
        def apply_grad():
            with strategy.scope():
                strategy.experimental_run_v2(apply_on_replica)

        accumulate([1.0, 2.0], [-1.0, 1.0])
        accumulate([3.0, -1.0], [-1.0, -1.0])
        accumulate([-2.0, 2.0], [3.0, -2.0])
        self.assertEqual(accumulator.step, 3)
        self.assertListAlmostEqual(
            accumulator._gradients[0].values[0].value().numpy().tolist(),
            [2.0, 3.0],
            tol=1e-2)
        self.assertListAlmostEqual(
            accumulator._gradients[0].values[1].value().numpy().tolist(),
            [1.0, -2.0],
            tol=1e-2)
        apply_grad()
        self.assertListAlmostEqual(variable.value().numpy().tolist(),
                                   [4.0, 3.0],
                                   tol=1e-2)
        accumulator.reset()
        self.assertEqual(accumulator.step, 0)
        self.assertListAlmostEqual(
            accumulator._gradients[0].values[0].value().numpy().tolist(),
            [0.0, 0.0],
            tol=1e-2)
        self.assertListAlmostEqual(
            accumulator._gradients[0].values[1].value().numpy().tolist(),
            [0.0, 0.0],
            tol=1e-2)

Example #3

Show file

File: test_optimization_tf.py Project: LuCeHe/transformers-plus-performers

    def testGradientAccumulatorDistributionStrategy(self):
        context._context = None
        ops.enable_eager_execution_internal()
        physical_devices = tf.config.list_physical_devices("CPU")
        if len(physical_devices) == 1:
            tf.config.set_logical_device_configuration(physical_devices[0], [
                tf.config.LogicalDeviceConfiguration(),
                tf.config.LogicalDeviceConfiguration()
            ])
        devices = tf.config.list_logical_devices(device_type="CPU")
        strategy = tf.distribute.MirroredStrategy(devices=devices[:2])

        with strategy.scope():
            accumulator = GradientAccumulator()
            variable = tf.Variable([4.0, 3.0])
            optimizer, _ = create_optimizer(5e-5, 10, 5)
            gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)

        def accumulate_on_replica(gradient):
            accumulator([gradient])

        def apply_on_replica():
            optimizer.apply_gradients(
                list(zip(accumulator.gradients, [variable])))

        @tf.function
        def accumulate(grad1, grad2):
            with strategy.scope():
                local_variables = strategy.experimental_local_results(
                    gradient_placeholder)
                local_variables[0].assign(grad1)
                local_variables[1].assign(grad2)
                if version.parse(tf.version.VERSION) >= version.parse("2.2"):
                    strategy.run(accumulate_on_replica,
                                 args=(gradient_placeholder, ))
                else:
                    strategy.experimental_run_v2(accumulate_on_replica,
                                                 args=(gradient_placeholder, ))

        @tf.function
        def apply_grad():
            with strategy.scope():
                if version.parse(tf.version.VERSION) >= version.parse("2.2"):
                    strategy.run(apply_on_replica)
                else:
                    strategy.experimental_run_v2(apply_on_replica)

        def _check_local_values(grad1, grad2):
            values = strategy.experimental_local_results(
                accumulator._gradients[0])
            self.assertListAlmostEqual(values[0].value(), grad1, tol=1e-2)
            self.assertListAlmostEqual(values[1].value(), grad2, tol=1e-2)

        accumulate([1.0, 2.0], [-1.0, 1.0])
        accumulate([3.0, -1.0], [-1.0, -1.0])
        accumulate([-2.0, 2.0], [3.0, -2.0])
        self.assertEqual(accumulator.step, 3)
        _check_local_values([2.0, 3.0], [1.0, -2.0])
        apply_grad()
        self.assertListAlmostEqual(variable.value(), [4.0, 3.0], tol=1e-2)
        accumulator.reset()
        self.assertEqual(accumulator.step, 0)
        _check_local_values([0.0, 0.0], [0.0, 0.0])

Example #4

Show file

File: trainer_tf_barebones.py Project: julien-c/tf-trainer-proposal

class TFTrainer:
    model: TFPreTrainedModel
    args: TFTrainingArguments
    train_dataset: Optional[tf.data.Dataset]
    eval_dataset: Optional[tf.data.Dataset]
    test_dataset: Optional[tf.data.Dataset]
    dataset_info: DatasetInfo

    strategy: Strategy

    def __init__(
        self,
        model: TFPreTrainedModel,
        args: TFTrainingArguments,
        train_dataset: Optional[tf.data.Dataset] = None,
        eval_dataset: Optional[tf.data.Dataset] = None,
        test_dataset: Optional[tf.data.Dataset] = None,
        dataset_info: Optional[DatasetInfo] = None,
    ):
        self.model = model
        self.args = args
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.test_dataset = test_dataset
        self.dataset_info = dataset_info

        self.gradient_accumulator = GradientAccumulator()
        self.accum_steps = 1

        if self.args.strategy_name == "mirrored":
            self.strategy = tf.distribute.MirroredStrategy()
        elif self.args.strategy_name == "onedevice":
            if len(tf.config.list_physical_devices('GPU')) >= 1:
                self.strategy = tf.distribute.OneDeviceStrategy(
                    device="/gpu:0")
            else:
                self.strategy = tf.distribute.OneDeviceStrategy(
                    device="/cpu:0")
        else:
            raise ValueError("The strategy {} does not exists.".format(
                self.args.strategy_name))

        # To conform with Trainer's API we call this from here.
        # All args should be in the `args` already.
        self._setup_training()

    def _setup_training(self,
                        checkpoint_path: str = "checkpoints",
                        log_path: str = "logs") -> None:
        """
        Setup the different steps to train a model:
          - check if all the data are given
          - create the proper strategy
          - create the features
          - prepare the model settings

        Args:
          checkpoint_path: the directory path where the model checkpoints will be saved, "./checkpoints" folder by default.
          log_path: the directory path where the Tensorboard logs will be saved, "./logs" folder by default.
          data_cache_dir: the directory path where the data will be cached, "./cache" folder by default.
          model_cache_dir (optional): the directory path where the pretrained model will be cached.
        """
        self._prepare_dataset()

        with self.strategy.scope():
            self._create_optimizer()
            _ = self.optimizer.iterations
            self._set_loss_and_metric()
            self._create_checkpoint_manager(checkpoint_path)
            self._create_summary_writer(log_path)

    def _set_loss_and_metric(self) -> None:
        """
        Create the training loss and metric with their name. Allowed names are those listed
        in the Tensorflow documentation and those contained in the transformers library.
        """
        try:
            self.loss = tf.keras.losses.get({
                "class_name": self.args.loss_name,
                "config": {
                    "from_logits": True,
                    "reduction": tf.keras.losses.Reduction.NONE
                }
            })
        except TypeError:
            self.loss = tf.keras.losses.get({
                "class_name": self.args.loss_name,
                "config": {
                    "reduction": tf.keras.losses.Reduction.NONE
                }
            })

        self.train_acc_metric = tf.keras.metrics.get({
            "class_name": self.args.metric_name,
            "config": {
                "name": "train_accuracy"
            }
        })
        self.test_acc_metric = tf.keras.metrics.get({
            "class_name": self.args.metric_name,
            "config": {
                "name": "test_accuracy"
            }
        })

    def _create_summary_writer(self, log_path: str) -> None:
        """
        Create a summary writer to be able to read the logs in Tensorboard.
        Args:
          log_path: the directory path where the Tensorboard logs will be saved.
        """
        self.log_path = log_path
        self.train_writer = tf.summary.create_file_writer(log_path + "/train")
        self.test_writer = tf.summary.create_file_writer(log_path + "/test")

    def _prepare_dataset(self) -> None:
        """
        Prepare the training, validation and test data.
        Args:
          data_cache_dir: the directory path where the cached data are / should be saved.
        """
        train_batch = self.args.per_gpu_train_batch_size * self.strategy.num_replicas_in_sync
        eval_batch = self.args.per_gpu_eval_batch_size * self.strategy.num_replicas_in_sync
        test_batch = self.args.per_gpu_eval_batch_size
        self.train_steps = math.ceil(self.dataset_info.sizes["train"] /
                                     train_batch)
        self.train_dataset = self.train_dataset.shuffle(128).batch(
            train_batch).repeat(-1)
        self.train_dataset = self.strategy.experimental_distribute_dataset(
            self.train_dataset)
        self.validation_steps = math.ceil(
            self.dataset_info.sizes["validation"] / eval_batch)
        self.eval_dataset = self.eval_dataset.batch(eval_batch)
        self.eval_dataset = self.strategy.experimental_distribute_dataset(
            self.eval_dataset)
        self.test_steps = math.ceil(self.dataset_info.sizes["test"] /
                                    test_batch)
        self.test_dataset = self.test_dataset.batch(test_batch)

    def _create_optimizer(self) -> None:
        """
        Create the training optimizer with its name. Allowed names are those listed
        in the Tensorflow documentation and those contained in the transformers library.
        """
        if self.args.optimizer_name == "adamw":
            learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
                initial_learning_rate=self.args.learning_rate,
                decay_steps=self.train_steps,
                end_learning_rate=0.0)
            if self.args.warmup_steps:
                learning_rate_fn = WarmUp(
                    initial_learning_rate=self.args.learning_rate,
                    decay_schedule_fn=learning_rate_fn,
                    warmup_steps=self.args.warmup_steps)

            self.optimizer = AdamWeightDecay(
                learning_rate=learning_rate_fn,
                weight_decay_rate=0.01,
                epsilon=self.args.adam_epsilon,
                exclude_from_weight_decay=["layer_norm", "bias"])
        else:
            try:
                self.optimizer = tf.keras.optimizers.get({
                    "class_name":
                    self.args.optimizer_name,
                    "config": {
                        "learning_rate": self.args.learning_rate,
                        "epsilon": self.args.adam_epsilon
                    }
                })
            except TypeError:
                # This is for the case where the optimizer is not Adam-like such as SGD
                self.optimizer = tf.keras.optimizers.get({
                    "class_name":
                    self.args.optimizer_name,
                    "config": {
                        "learning_rate": self.args.learning_rate
                    }
                })

    def _create_checkpoint_manager(self,
                                   checkpoint_path: str,
                                   max_to_keep: int = 5,
                                   load_model: bool = True) -> None:
        """
        Create a checkpoint manager in order to be able to make the training
        fault-tolerant.
        Args:
          checkpoint_path: the directory path where the model checkpoints will be saved.
          max_to_keep: the maximum number of checkpoints to keep in the checkpoint path.
          load_model: if we want to start the training from the latest checkpoint.
        """
        ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model)
        self.model.ckpt_manager = tf.train.CheckpointManager(
            ckpt, checkpoint_path, max_to_keep=max_to_keep)

        if load_model:
            ckpt.restore(self.model.ckpt_manager.latest_checkpoint)

    def _evaluate_steps(self, per_replica_features, per_replica_labels):
        """
        One step evaluation across replica.
        Args:
          features: the batched features.
          labels: the batched labels.
        Returns:
          The loss corresponding to the given batch.
        """
        per_replica_loss = self.strategy.experimental_run_v2(
            self._run_model,
            args=(per_replica_features, per_replica_labels, False))

        return self.strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                    per_replica_loss, None)

    def _evaluate(self) -> None:
        """
        Evaluate the model during the training at the end of each epoch.
        """
        step = 1
        loss = 0.0

        for features, labels in self.eval_dataset:
            step = tf.convert_to_tensor(step, dtype=tf.int64)
            loss = self._evaluate_steps(features, labels)
            loss = tf.reduce_mean(loss)

            with self.test_writer.as_default():
                tf.summary.scalar("loss", loss, step=step)

            if step % self.validation_steps == 0:
                break

            step += 1

        return loss

    def train(self) -> None:
        """
        Train method to train the model.
        """
        tf.summary.trace_on(graph=True, profiler=True)
        self.gradient_accumulator.reset()

        iterations = self.optimizer.iterations
        tf.summary.experimental.set_step(iterations)

        for epoch in range(int(self.args.num_train_epochs)):
            for training_loss in self._training_steps():
                step = iterations.numpy()
                training_loss = tf.reduce_mean(training_loss)

                with self.train_writer.as_default():
                    tf.summary.scalar("loss", training_loss, step=step)

                if step == 1:
                    with self.train_writer.as_default():
                        tf.summary.trace_export(name="training",
                                                step=step,
                                                profiler_outdir=self.log_path)

                if step % 10 == 0:
                    logger.info(
                        "Epoch {} Step {} Loss {:.4f} Train Accuracy {:.4f}".
                        format(epoch, step, training_loss.numpy(),
                               self.train_acc_metric.result()))

                if step % 100 == 0:
                    ckpt_save_path = self.model.ckpt_manager.save()
                    logger.info("Saving checkpoint for step {} at {}".format(
                        step, ckpt_save_path))

                if step % self.train_steps == 0:
                    break

            test_loss = self._evaluate()

            logger.info(
                "Epoch {} Step {} Train Loss {:.4f} Train Accuracy {:.4f}".
                format(epoch, step, training_loss.numpy(),
                       self.train_acc_metric.result()))
            logger.info(
                "Epoch {} Validation Loss {:.4f} Validation Accuracy {:.4f}".
                format(epoch, test_loss.numpy(),
                       self.test_acc_metric.result()))

            self.train_acc_metric.reset_states()
            self.test_acc_metric.reset_states()

    def _training_steps(self):
        """
        Returns a generator over training steps (i.e. parameters update).
        Args:
          dataset: The training dataset.
        Returns:
          A generator that yields a loss value to report for this step.
        """
        for i, loss in enumerate(self._accumulate_next_gradients()):
            if i % self.accum_steps == 0:
                self._apply_gradients()
                yield loss

    @tf.function
    def _apply_gradients(self):
        """Applies the gradients (cross-replica)."""
        self.strategy.experimental_run_v2(self._step)

    def _step(self):
        """Applies gradients and resets accumulation."""
        gradient_scale = self.gradient_accumulator.step * self.strategy.num_replicas_in_sync
        gradients = [
            gradient / tf.cast(gradient_scale, gradient.dtype)
            for gradient in self.gradient_accumulator.gradients
        ]
        gradients = [(tf.clip_by_value(grad, -self.args.max_grad_norm,
                                       self.args.max_grad_norm))
                     for grad in gradients]
        vars = self.model.trainable_variables
        if self.args.mode == "labelling":
            vars = [
                var for var in self.model.trainable_variables
                if "pooler" not in var.name
            ]
        self.optimizer.apply_gradients(list(zip(gradients, vars)))
        self.gradient_accumulator.reset()

    def _accumulate_next_gradients(self):
        """Accumulates the gradients from the next element in dataset."""
        iterator = iter(self.train_dataset)

        @tf.function
        def _accumulate_next():
            per_replica_features, per_replica_labels = next(iterator)

            return self._accumulate_gradients(per_replica_features,
                                              per_replica_labels)

        while True:
            try:
                yield _accumulate_next()
            except tf.errors.OutOfRangeError:
                break

    def _accumulate_gradients(self, per_replica_features, per_replica_labels):
        """Accumulates the gradients across all the replica."""
        per_replica_loss = self.strategy.experimental_run_v2(
            self._forward, args=(per_replica_features, per_replica_labels))

        return self.strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                    per_replica_loss, None)

    def _forward(self, features, labels):
        """Forwards a training example and accumulates the gradients."""
        per_example_loss = self._run_model(features, labels, True)
        loss = tf.nn.compute_average_loss(
            per_example_loss,
            global_batch_size=self.args.per_gpu_train_batch_size)
        vars = self.model.trainable_variables
        if self.args.mode == "labelling":
            vars = [
                var for var in self.model.trainable_variables
                if "pooler" not in var.name
            ]
        gradients = self.optimizer.get_gradients(loss, vars)

        self.gradient_accumulator(gradients)

        return per_example_loss

    def _run_model(self, features, labels, training):
        """
        Computes the loss of the given features and labels pair.
        Args:
          features: the batched features.
          labels: the batched labels.
        """
        if self.args.mode == "classification" or self.args.mode == "labelling":
            logits = self.model(features, training=training)[0]
        else:
            logits = self.model(features, training=training)

        if self.args.mode == "labelling":
            active_loss = tf.reshape(labels, (-1, )) != -1
            logits = tf.boolean_mask(
                tf.reshape(logits, (-1, len(self.dataset_info.labels))),
                active_loss)
            labels = tf.boolean_mask(tf.reshape(labels, (-1, )), active_loss)

        loss = self.loss(labels, logits)

        if training:
            self.train_acc_metric(labels, logits)
        else:
            self.test_acc_metric(labels, logits)

        return loss

    def test(self) -> None:
        """
        Test the model over the test dataset and print a report.
        """
        y_true = []
        results = self.model.predict(self.test_dataset, steps=self.test_steps)

        if self.args.mode == "classification":
            for batch in self.test_dataset:
                y_true.extend(batch[1].numpy().tolist())

            y_pred = np.reshape(np.argmax(results, axis=-1), (-1, 1)).tolist()
            y_true = list(itertools.chain.from_iterable(y_true))
            y_pred = list(itertools.chain.from_iterable(y_pred))

            logger.info(
                classification_report(y_true,
                                      y_pred,
                                      target_names=self.dataset_info.labels))

    def save_model(self, save_path: str) -> None:
        """
        Save the pretrained model and create a Tensorflow saved model.
        Args:
          save_path: directory path where the pretrained model and
            Tensorflow saved model will be saved
        """
        logger.info("Saving model in {}".format(save_path))

        path = os.path.join(save_path, "saved_model")

        os.makedirs(path, exist_ok=True)
        self.model.save_pretrained(save_path)
        tf.saved_model.save(self.model, path)