def batch_loss(self, batch_group: List[TensorDict],
                   for_training: bool) -> torch.Tensor:
        """
		Does a forward pass on the given batches and returns the ``loss`` value in the result.
		If ``for_training`` is `True` also applies regularization penalty.

		This function has been modified for gradient accumulation and half precision training. 
		The ability to do multi-GPU training has also been taken out since I never use it.
		"""
        assert len(batch_group) == 1
        batch = batch_group[0]
        batch_size = training_util.get_batch_size(batch)
        accumulated_loss = 0

        for start_idx in range(0, batch_size, self._accumulation_steps):
            mini_batch = self.get_mini_batch(batch, start_idx,
                                             self._accumulation_steps)
            mini_batch = nn_util.move_to_device(mini_batch,
                                                self._cuda_devices[0])
            mini_batch_size = training_util.get_batch_size(mini_batch)
            output_dict = self.model(**mini_batch)

            try:
                loss = output_dict["loss"]
                if for_training:
                    loss += self.model.get_regularization_penalty()

                # For Gradient Accumulation: scales loss by multiplying the mini_batch_size and scaling down by the total batch size
                gradient_accumulation_scaling_factor = mini_batch_size / batch_size
                loss *= gradient_accumulation_scaling_factor

                accumulated_loss += loss

                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")

                if for_training:
                    if self._half_precision:
                        with amp.scale_loss(loss,
                                            self.optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
            except KeyError:
                if for_training:
                    raise RuntimeError(
                        "The model you are trying to optimize does not contain a"
                        " 'loss' key in the output of model.forward(inputs).")
                return None

        return accumulated_loss
    def log_batch(
        self,
        model: Model,
        optimizer: Optimizer,
        batch_grad_norm: Optional[float],
        metrics: Dict[str, float],
        batch_group: List[List[TensorDict]],
        param_updates: Optional[Dict[str, torch.Tensor]],
    ) -> None:
        if self.should_log_this_batch():
            self.log_parameter_and_gradient_statistics(model, batch_grad_norm)
            self.log_learning_rates(model, optimizer)

            self.add_train_scalar("loss/loss_train", metrics["loss"])
            self.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self.should_log_histograms_this_batch():
            self.log_histograms(model)
            self.log_gradient_updates(model, param_updates)

        if self._batch_size_interval:
            # We're assuming here that `log_batch` will get called every batch, and only every
            # batch.  This is true with our current usage of this code (version 1.0); if that
            # assumption becomes wrong, this code will break.
            batch_group_size = sum(training_util.get_batch_size(batch) for batch in batch_group)
            self._batches_this_epoch += 1
            self._cumulative_batch_group_size += batch_group_size

            if (self._batches_this_epoch - 1) % self._batch_size_interval == 0:
                average = self._cumulative_batch_group_size / self._batches_this_epoch
                logger.info(f"current batch size: {batch_group_size} mean batch size: {average}")
                self.add_train_scalar("current_batch_size", batch_group_size)
                self.add_train_scalar("mean_batch_size", average)
Exemple #3
0
    def log_batch(
        self,
        batch_grad_norm: Optional[float],
        metrics: Dict[str, float],
        batch_group: List[TensorDict],
        param_updates: Optional[Dict[str, torch.Tensor]],
        batch_number: int,
    ) -> None:
        """
        Called every batch to perform all of the logging that is due.
        """
        if batch_number <= 1:  # batch_number is usually 1-indexed
            self._cumulative_batch_group_size = 0
            self.log_inputs(batch_group)

        if self._should_log_this_batch():
            if self._should_log_parameter_statistics:
                self._log_parameter_and_gradient_statistics(batch_grad_norm)

            if self._should_log_learning_rate:
                self._log_learning_rates()

            # Now collect per-batch metrics to log.
            metrics_to_log: Dict[str, float] = {}
            for key in ("batch_loss", "batch_reg_loss"):
                if key not in metrics:
                    continue
                value = metrics[key]
                metrics_to_log[key] = value
                # Update and add moving average.
                self._batch_loss_moving_sum[key] += value
                self._batch_loss_moving_items[key].append(value)
                if len(self._batch_loss_moving_items[key]) > self._batch_loss_moving_average_count:
                    self._batch_loss_moving_sum[key] -= self._batch_loss_moving_items[key].popleft()
                metrics_to_log[f"{key}_mov_avg"] = self._batch_loss_moving_sum[key] / len(
                    self._batch_loss_moving_items[key]
                )

            self.log_scalars(
                metrics_to_log,
                log_prefix="train",
            )

        if self._should_log_distributions_this_batch():
            assert param_updates is not None
            self._log_distributions()
            self._log_gradient_updates(param_updates)

        if self._batch_size_interval:
            # We're assuming here that `log_batch` will get called every batch, and only every
            # batch.  This is true with our current usage of this code (version 1.0); if that
            # assumption becomes wrong, this code will break.
            batch_group_size = sum(get_batch_size(batch) for batch in batch_group)  # type: ignore
            self._cumulative_batch_group_size += batch_group_size
            if batch_number % self._batch_size_interval == 0:
                average = self._cumulative_batch_group_size / batch_number
                self.log_scalars(
                    {"batch_size": batch_group_size, "mean_batch_size": average}, log_prefix="train"
                )
Exemple #4
0
    def forward(self,  # type: ignore
                task_index: torch.IntTensor,
                reverse: torch.ByteTensor,
                epoch_trained: torch.IntTensor,
                for_training: torch.ByteTensor,
                tokens: Dict[str, torch.LongTensor],
                label: torch.IntTensor = None,
                text_id: torch.IntTensor = None) -> Dict[str, torch.Tensor]:

        embeddeds = self._encoder(task_index, tokens, epoch_trained, self._valid_discriminator, reverse, for_training,
                                  text_id)
        batch_size = get_batch_size(embeddeds["embedded_text"])

        sentiment_logits = self._sentiment_discriminator(embeddeds["embedded_text"])

        p_domain_logits = self._p_domain_discriminator(embeddeds["private_embedding"])

        # TODO set reverse = true
        s_domain_logits = self._s_domain_discriminator(embeddeds["share_embedding"], reverse=reverse)

        logits = [sentiment_logits, p_domain_logits, s_domain_logits]

        # domain_logits = self._domain_discriminator(embedded_text)
        output_dict = {'logits': sentiment_logits}
        if label is not None:
            loss = self._loss(sentiment_logits, label)
            # task_index = task_index.unsqueeze(0)
            task_index = task_index.expand(batch_size)
            # targets = [label, label, label, task_index, task_index]
            # print(p_domain_logits.shape, task_index, task_index.shape)
            p_domain_loss = self._domain_loss(p_domain_logits, task_index)
            s_domain_loss = self._domain_loss(s_domain_logits, task_index)
            # logger.info("Share domain logits standard variation is {}",
            #             torch.mean(torch.std(F.softmax(s_domain_logits), dim=-1)))
            output_dict["tokens"] = tokens
            output_dict['stm_loss'] = loss
            output_dict['p_d_loss'] = p_domain_loss
            output_dict['s_d_loss'] = s_domain_loss
            # TODO add share domain logits std loss
            output_dict['loss'] = loss + 0.06 * p_domain_loss + 0.04 * s_domain_loss

            for (metric_name, metric) in zip(self.metrics.keys(), self.metrics.values()):
                if "auc" in metric_name:
                    metric(self.decode(output_dict)["label"], label)
                    continue
                metric(sentiment_logits, label)
        print("for training", for_training)
        if not for_training:
            with open("class_probabilities.txt", "a", encoding="utf8") as f:
                f.write(f"Task: {TASKS_NAME[task_index[0].detach()]}\nLine ID: ")
                f.write(" ".join(list(map(str, text_id.cpu().detach().numpy()))))
                f.write("\nProb: ")
                f.write(" ".join(list(map(str, F.softmax(sentiment_logits, dim=-1).cpu().detach().numpy()))))
                f.write("\nLabel: " + " ".join(list(map(str, label.cpu().detach().numpy()))) + "\n")
                f.write("\n\n\n")
        return output_dict
Exemple #5
0
    def forward(self, task_index: torch.IntTensor,
                tokens: Dict[str,
                             torch.LongTensor], epoch_trained: torch.IntTensor,
                valid_discriminator: Discriminator, reverse: torch.ByteTensor,
                for_training: torch.ByteTensor) -> Dict[str, torch.Tensor]:
        embedded_text_input = self._text_field_embedder(tokens)
        tokens_mask = util.get_text_field_mask(tokens)
        batch_size = get_batch_size(tokens)
        # TODO
        if np.random.rand() < -1 and for_training.all():
            logger.info("Domain Embedding with Perturbation")
            domain_embeddings = self._domain_embeddings(
                torch.arange(0, len(TASKS_NAME)).cuda())
            domain_embedding = get_perturbation_domain_embedding(
                domain_embeddings, task_index, epoch_trained)
            # domain_embedding = FGSM(self._domain_embeddings, task_index, valid_discriminator)
            output_dict = {"valid": torch.tensor(0)}
        else:
            logger.info("Domain Embedding without Perturbation")
            domain_embedding = self._domain_embeddings(task_index)
            output_dict = {"valid": torch.tensor(1)}

        output_dict["domain_embedding"] = domain_embedding
        embedded_text_input = self._input_dropout(embedded_text_input)
        if self._with_domain_embedding:
            domain_embedding = domain_embedding.expand(batch_size, 1, -1)
            embedded_text_input = torch.cat(
                (domain_embedding, embedded_text_input), 1)
            tokens_mask = torch.cat(
                [tokens_mask.new_ones(batch_size, 1), tokens_mask], 1)

        shared_encoded_text = self._shared_encoder(embedded_text_input,
                                                   tokens_mask)
        # shared_encoded_text = self._seq2vec(shared_encoded_text, tokens_mask)
        shared_encoded_text = get_final_encoder_states(shared_encoded_text,
                                                       tokens_mask,
                                                       bidirectional=True)
        output_dict["share_embedding"] = shared_encoded_text

        private_encoded_text = self._private_encoder(embedded_text_input,
                                                     tokens_mask)
        # private_encoded_text = self._seq2vec(private_encoded_text)
        private_encoded_text = get_final_encoder_states(private_encoded_text,
                                                        tokens_mask,
                                                        bidirectional=True)
        output_dict["private_embedding"] = private_encoded_text

        embedded_text = torch.cat([shared_encoded_text, private_encoded_text],
                                  -1)
        output_dict["embedded_text"] = embedded_text
        return output_dict
Exemple #6
0
    def forward(self,
                task_index: torch.IntTensor,
                tokens: Dict[str, torch.LongTensor]) -> Dict[str, torch.Tensor]:
        embedded_text_input = self._text_field_embedder(tokens)
        tokens_mask = util.get_text_field_mask(tokens)
        batch_size = get_batch_size(tokens)
        embedded_text_input = self._input_dropout(embedded_text_input)

        shared_encoded_text = self._shared_encoder(embedded_text_input, tokens_mask)
        private_encoded_text = self._private_encoder(embedded_text_input, tokens_mask)
        output_dict = dict()
        output_dict["share_embedding"] = shared_encoded_text
        output_dict["private_embedding"] = private_encoded_text

        embedded_text = torch.cat([shared_encoded_text, private_encoded_text], -1)
        output_dict["embedded_text"] = embedded_text
        return output_dict
Exemple #7
0
def log_iterable(iterable, assume_multiprocess_types):
    start = time.perf_counter()
    last = start
    periodic_logging_process = None
    have_started_periodic_process = False

    batch_count = 0
    cumulative_batch_size = 0
    cumulative_token_count = 0
    for batch in iterable:
        batch_count += 1
        cumulative_batch_size += get_batch_size(batch)
        tokens_size = batch['source']['tokens'].size()
        cumulative_token_count += tokens_size[0] * tokens_size[1]

        if assume_multiprocess_types and not have_started_periodic_process:
            have_started_periodic_process = True
            periodic_logging_process = Process(
                    target=run_periodically,
                    # Pass the queues directly. Passing the iterable naively
                    # won't work because the forked process (in contrast with
                    # threads) has an entirely separate address space.
                    # Presumably this could be worked around with
                    # multiprocessing.managers or similar.
                    args=(iterable.gi_frame.f_locals['qiterable'].output_queue,
                          iterable.gi_frame.f_locals['output_queue']
                    )
                    )
            periodic_logging_process.start()

        if batch_count % BATCH_INTERVAL == 0:
            end = time.perf_counter()

            msg = (f"s/b total: {(end - start) / batch_count:.3f} " +
                   f"s/b last: {(end - last) / BATCH_INTERVAL:.3f} " +
                   f"batch count: {batch_count} " +
                   f"batch size: {cumulative_batch_size / batch_count:.1f} " +
                   f"total tokens {cumulative_token_count}")
            print(msg)

            last = end

    if periodic_logging_process:
        periodic_logging_process.terminate()
    def batch_end_logging(self, trainer: "CallbackTrainer"):
        # Log parameter values to tensorboard
        if self.tensorboard.should_log_this_batch():
            self.tensorboard.log_parameter_and_gradient_statistics(
                trainer.model, trainer.batch_grad_norm)
            self.tensorboard.log_learning_rates(trainer.model,
                                                trainer.optimizer)

            self.tensorboard.add_train_scalar("loss/loss_train",
                                              trainer.train_metrics["loss"])
            self.tensorboard.log_metrics({
                "epoch_metrics/" + k: v
                for k, v in trainer.train_metrics.items()
            })

        if self.log_batch_size_period:
            cur_batch = sum([
                training_util.get_batch_size(batch)
                for batch in trainer.batch_group
            ])
            self.cumulative_batch_size += cur_batch
            if (trainer.batches_this_epoch -
                    1) % self.log_batch_size_period == 0:
                average = self.cumulative_batch_size / trainer.batches_this_epoch
                logger.debug(
                    f"current batch size: {cur_batch} mean batch size: {average}"
                )
                self.tensorboard.add_train_scalar("current_batch_size",
                                                  cur_batch)
                self.tensorboard.add_train_scalar("mean_batch_size", average)

        if self.tensorboard.should_log_histograms_this_batch():
            for name, param in trainer.model.named_parameters():
                self.param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(self.param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                self.tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))
            self.param_updates.clear()
            self.tensorboard.log_histograms(trainer.model,
                                            self.histogram_parameters)
Exemple #9
0
    def forward(
            self, task_index: torch.IntTensor,
            tokens: Dict[str, torch.LongTensor]) -> Dict[str, torch.Tensor]:
        embedded_text_input = self._text_field_embedder(tokens)
        tokens_mask = util.get_text_field_mask(tokens)
        batch_size = get_batch_size(tokens)
        domain_embedding = self._domain_embeddings(task_index)
        output_dict = {"domain_embedding": domain_embedding}
        embedded_text_input = self._input_dropout(embedded_text_input)

        shared_encoded_text = self._shared_encoder(embedded_text_input,
                                                   tokens_mask)
        private_encoded_text = self._private_encoder(embedded_text_input,
                                                     tokens_mask)

        if self._with_domain_embedding:
            domain_embedding = domain_embedding.expand(batch_size, -1)
            shared_encoded_text = torch.cat(
                [domain_embedding, shared_encoded_text], -1)
            private_encoded_text = torch.cat(
                [domain_embedding, shared_encoded_text], -1)

            # shared_encoded_text = shared_encoded_text.view(batch_size, 4, -1)
            # scores = F.softmax(torch.matmul(shared_encoded_text, domain_embedding), -1)
            # shared_encoded_text = torch.matmul(shared_encoded_text.transpose(1, 2), scores.unsqueeze(2))
            # shared_encoded_text = shared_encoded_text.view(batch_size, -1)
            # private_encoded_text = private_encoded_text.view(batch_size, 4, -1)
            # scores = F.softmax(torch.matmul(private_encoded_text, domain_embedding), -1)
            # private_encoded_text = torch.matmul(private_encoded_text.transpose(1, 2), scores.unsqueeze(2))
            # private_encoded_text = private_encoded_text.view(batch_size, -1)

            # domain_embedding = domain_embedding.expand(batch_size, -1)
            # shared_encoded_text = torch.cat([domain_embedding, shared_encoded_text], -1)
            # private_encoded_text = torch.cat([domain_embedding, private_encoded_text], -1)
        output_dict["share_embedding"] = shared_encoded_text
        output_dict["private_embedding"] = private_encoded_text

        # embedded_text = torch.cat([domain_embedding, shared_encoded_text, private_encoded_text], -1)
        embedded_text = torch.cat([shared_encoded_text, private_encoded_text],
                                  -1)
        output_dict["embedded_text"] = embedded_text
        return output_dict
Exemple #10
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / num_gpus)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            images = []
            text = []
            segment_ids = []
            labels = []
            for i in range(len(batch_group[0]['images'])):
                positive_index = random.randint(0, self.num_negative_samples)
                labels.append(positive_index)
                if self.retrieve_text:
                    instance_text = []
                    instance_segment_ids = []
                    for j in range(self.num_negative_samples + 1):
                        if j == positive_index:
                            instance_text.append(batch_group[0]['token_ids']
                                                 ['tokens'][i, :].tolist())
                            instance_segment_ids.append(
                                batch_group[0]['segment_ids'][i].tolist())
                        else:
                            negative_sample_index = random.choice(
                                self.train_indices)
                            text_field = TextField(
                                self.train_text_db[negative_sample_index],
                                self.train_token_indexers)
                            text_field.index(self.model.vocab)
                            padding_lengths = text_field.get_padding_lengths()
                            instance_text.append(
                                text_field.as_tensor(
                                    padding_lengths=padding_lengths)
                                ['tokens'].tolist())
                            instance_segment_ids.append(
                                self.train_segment_ids_db[
                                    negative_sample_index].tolist())
                    text += instance_text
                    segment_ids += instance_segment_ids
                else:
                    instance_images = [
                        None for _ in range(self.num_negative_samples + 1)
                    ]
                    for j in range(self.num_negative_samples + 1):
                        if j == positive_index:
                            instance_images[j] = np.expand_dims(
                                batch_group[0]['images'][i].numpy(), 0)
                        else:
                            instance_images[j] = np.expand_dims(
                                random.choice(self.train_image_db), 0)
                    images += instance_images
            matching_label_field_name = "labels"
            if self.retrieve_text:
                max_text_len = max([len(sequence) for sequence in text])
                text = [
                    sequence +
                    [0 for _ in range(max_text_len - len(sequence))]
                    for sequence in text
                ]
                batch_group[0]['token_ids'] = {
                    'tokens': torch.LongTensor(text)
                }
                segment_ids = [
                    sequence +
                    [0 for _ in range(max_text_len - len(sequence))]
                    for sequence in segment_ids
                ]
                batch_group[0]['segment_ids'] = torch.from_numpy(
                    np.array(segment_ids, dtype=np.int64))
            else:
                batch_group[0]['images'] = torch.from_numpy(np.vstack(images))
            batch_group[0][matching_label_field_name] = torch.from_numpy(
                np.array(labels, dtype=np.int64))
            loss = self.batch_loss(batch_group, for_training=True)

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
Exemple #11
0
    def forward(
            self,  # type: ignore
            task_index: torch.IntTensor,
            reverse: torch.ByteTensor,
            epoch_trained: torch.IntTensor,
            for_training: torch.ByteTensor,
            tokens: Dict[str, torch.LongTensor],
            label: torch.IntTensor = None) -> Dict[str, torch.Tensor]:

        embeddeds = self._encoder(task_index, tokens, epoch_trained,
                                  self._valid_discriminator, reverse,
                                  for_training)
        batch_size = get_batch_size(embeddeds["embedded_text"])

        sentiment_logits = self._sentiment_discriminator(
            embeddeds["embedded_text"])

        p_domain_logits = self._p_domain_discriminator(
            embeddeds["private_embedding"])

        # TODO set reverse = true
        s_domain_logits = self._s_domain_discriminator(
            embeddeds["share_embedding"], reverse=reverse)
        # TODO set reverse = true
        # TODO use share_embedding instead of domain_embedding
        valid_logits = self._valid_discriminator(embeddeds["domain_embedding"],
                                                 reverse=reverse)

        valid_label = embeddeds['valid']

        logits = [
            sentiment_logits, p_domain_logits, s_domain_logits, valid_logits
        ]

        # domain_logits = self._domain_discriminator(embedded_text)
        output_dict = {'logits': sentiment_logits}
        if label is not None:
            loss = self._loss(sentiment_logits, label)
            # task_index = task_index.unsqueeze(0)
            task_index = task_index.expand(batch_size)
            targets = [label, task_index, task_index, valid_label]
            # print(p_domain_logits.shape, task_index, task_index.shape)
            p_domain_loss = self._domain_loss(p_domain_logits, task_index)
            s_domain_loss = self._domain_loss(s_domain_logits, task_index)
            logger.info(
                "Share domain logits standard variation is {}",
                torch.mean(torch.std(F.softmax(s_domain_logits), dim=-1)))
            if self._label_smoothing is not None and self._label_smoothing > 0.0:
                valid_loss = sequence_cross_entropy_with_logits(
                    valid_logits,
                    valid_label.unsqueeze(0).cuda(),
                    torch.tensor(1).unsqueeze(0).cuda(),
                    average="token",
                    label_smoothing=self._label_smoothing)
            else:
                valid_loss = self._valid_loss(
                    valid_logits,
                    torch.zeros(2).scatter_(0, valid_label,
                                            torch.tensor(1.0)).cuda())
            output_dict['stm_loss'] = loss
            output_dict['p_d_loss'] = p_domain_loss
            output_dict['s_d_loss'] = s_domain_loss
            output_dict['valid_loss'] = valid_loss
            # TODO add share domain logits std loss
            output_dict['loss'] = loss + p_domain_loss + 0.005 * s_domain_loss\
                                  # + 0.005 * valid_loss

            # + torch.mean(torch.std(s_domain_logits, dim=1))
            # output_dict['loss'] = loss + p_domain_loss + 0.005 * s_domain_loss

            for (metric, logit, target) in zip(self.metrics.values(), logits,
                                               targets):
                metric(logit, target)

        return output_dict
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        train_loss_lang1 = 0.0
        train_loss_lang2 = 0.0
        train_loss_cm = 0.0

        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / num_gpus)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0

        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()
            self.optimizer_lang1.zero_grad()
            self.optimizer_lang2.zero_grad()
            self.optimizer_cm.zero_grad()

            loss, loss_cm, loss_lang1, loss_lang2 = self.batch_loss(
                batch_group, for_training=True)

            if torch.isnan(loss):
                # if either on of loss_%s is nan, loss will be nan
                raise ValueError("nan loss encountered")

            #######
            # lang1
            #######
            loss_lang1.backward()
            train_loss_lang1 += loss_lang1.item()
            self.rescale_gradients()

            if self._learning_rate_scheduler_lang1:
                self._learning_rate_scheduler_lang1.step_batch(batch_num_total)
            if self._momentum_scheduler_lang1:
                self._momentum_scheduler_lang1.step_batch(batch_num_total)

            self.optimizer_lang1.step()
            self.optimizer_lang1.zero_grad()

            #######
            # cm
            #######
            loss_lang2.backward()
            train_loss_lang2 += loss_lang2.item()
            batch_grad_norm = self.rescale_gradients()

            if self._learning_rate_scheduler_lang2:
                self._learning_rate_scheduler_lang2.step_batch(batch_num_total)
            if self._momentum_scheduler_lang2:
                self._momentum_scheduler_lang2.step_batch(batch_num_total)

            self.optimizer_lang2.step()
            self.optimizer_lang2.zero_grad()

            #######
            # lang2
            #######
            loss_cm.backward()
            train_loss_cm += loss_cm.item()
            self.rescale_gradients()

            if self._learning_rate_scheduler_cm:
                self._learning_rate_scheduler_cm.step_batch(batch_num_total)
            if self._momentum_scheduler_cm:
                self._momentum_scheduler_cm.step_batch(batch_num_total)

            self.optimizer_cm.step()
            self.optimizer_cm.zero_grad()

            train_loss += loss.item()

            # Update the description with the latest metrics
            # metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
            metrics = self.model.get_metrics(False)
            metrics["loss"] = float(
                train_loss /
                batches_this_epoch) if batches_this_epoch > 0 else 0.0
            metrics["cm_loss"] = float(
                train_loss_cm /
                batches_this_epoch) if batches_this_epoch > 0 else 0.0
            metrics["lang1_loss"] = float(
                train_loss_lang1 /
                batches_this_epoch) if batches_this_epoch > 0 else 0.0
            metrics["lang2_loss"] = float(
                train_loss_lang2 /
                batches_this_epoch) if batches_this_epoch > 0 else 0.0
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer_lang1)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer_lang2)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer_cm)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.add_train_scalar("loss/cm_loss_train",
                                                   metrics["cm_loss"])
                self._tensorboard.add_train_scalar("loss/lang1_loss_train",
                                                   metrics["lang1_loss"])
                self._tensorboard.add_train_scalar("loss/lang2_loss_train",
                                                   metrics["lang2_loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        # metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
        metrics = self.model.get_metrics(reset=True)
        metrics["loss"] = float(
            train_loss / batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["cm_loss"] = float(
            train_loss_cm /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["lang1_loss"] = float(
            train_loss_lang1 /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["lang2_loss"] = float(
            train_loss_lang2 /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
Exemple #13
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        batch_group_generator = lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) /
            self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")

        cumulative_batch_group_size = 0
        for batch_group in batch_group_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            for batch in batch_group:
                loss = self.batch_loss(batch, for_training=True)
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                loss.backward()
                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)

            # Log parameter values to Tensorboard (only from the master)
            if self._tensorboard.should_log_this_batch() and self._master:
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                batch_group_size = sum(
                    training_util.get_batch_size(batch)
                    for batch in batch_group)
                cumulative_batch_group_size += batch_group_size
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_group_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {batch_group_size} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       batch_group_size)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if (self._model_save_interval is not None and
                (time.time() - last_save_time > self._model_save_interval)
                    and self._master):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
    def semi_train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self.trainer._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.trainer.model.train()

        num_gpus = len(self.trainer._cuda_devices)

        self.trainer._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self.trainer._batch_num_total is None:
            self.trainer._batch_num_total = 0

        histogram_parameters = set(
            self.trainer.model.
            get_parameters_for_histogram_tensorboard_logging())
        #Pdb().set_trace()
        mixed_generator, num_training_batches = get_mixer(
            self.trainer.iterator, self.trainer.train_data,
            self.trainer.iterator, self.unlabelled_dataset, num_gpus,
            self.labelled_id, self.which_mixer, self.min_pct_of_unlabelled)
        #mixed_generator, num_training_batches = get_mixer(self.trainer.iterator, self.trainer.train_data, self.trainer._validation_iterator,  self.unlabelled_dataset,num_gpus, self.labelled_id, self.which_mixer)

        #generator for lambda update
        mixed_generator_for_lambda, _ = get_mixer(self.trainer.iterator,
                                                  self.trainer.train_data,
                                                  self.trainer.iterator,
                                                  self.unlabelled_dataset,
                                                  num_gpus, self.labelled_id,
                                                  'cm', 1.0)
        #mixed_generator_for_lambda, _ = get_mixer(self.trainer._validation_iterator, self.trainer.train_data, self.trainer._validation_iterator,  self.unlabelled_dataset, num_gpus, self.labelled_id, 'cm')

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(mixed_generator,
                                         total=num_training_batches)
        #train_generator_tqdm = Tqdm.tqdm(zip(train_generator,unlabelled_train_generator),
        #                                 total=num_training_batches)
        cumulative_batch_size = 0
        unlabelled_loss = 0
        unlabelled_batches_this_epoch = 0

        batches_since_last_step = 0
        agg_loss = 0.0
        flag = False
        batch_grad_norm = None
        for batch_group, group_id in train_generator_tqdm:
            #print(batch_group[0]['sentence']['tokens'].shape)
            if self.total_supervised_iters < self.dd_semi_warmup_iters and group_id != self.labelled_id:
                continue
            output_dict = self.batch_loss(
                batch_group,
                for_training=True,
                eval_metric=(group_id == self.labelled_id))
            penalties = defaultdict(float)

            if self.constraints_model is not None:
                penalties = self.constraints_model(
                    output_dict['task1_tag_logits'],
                    output_dict['task2_tag_logits'], output_dict['mask'])

            loss = 0.0
            if 'loss' in output_dict:
                loss = output_dict['loss']
                train_loss += loss.item()
            loss += output_dict.get('regularization_penalty', 0.0)

            loss += self.constraints_wt * penalties['loss']

            unlabelled_loss += penalties['loss'].item() if torch.is_tensor(
                penalties['loss']) else penalties['loss']

            agg_loss += loss
            batches_since_last_step += 1

            if batches_since_last_step == self.backprop_after_xbatches:
                #print("STEP THROUGH! : {}. loss: {} agg_loss: {}".format(group_id, loss, agg_loss))
                batch_grad_norm = self.step(agg_loss)
                batches_since_last_step = 0
                agg_loss = 0.0
                flag = False
            else:
                flag = True
                #print("skipp : {}. loss: {} agg_loss: {}".format(group_id, loss, agg_loss))

            if (group_id != self.labelled_id):
                unlabelled_batches_this_epoch += 1
                #self.trainer.optimizer.zero_grad()
                #loss.backward()
                #batch_grad_norm = self.trainer.rescale_gradients()
                #self.trainer.optimizer.step()
            else:
                self.total_supervised_iters += 1.0
                batches_this_epoch += 1
                self.trainer._batch_num_total += 1
                batch_num_total = self.trainer._batch_num_total

                #self.trainer.optimizer.zero_grad()
                #loss.backward()
                #batch_grad_norm = self.trainer.rescale_gradients()

                # This does nothing if batch_num_total is None or you are using an
                # LRScheduler which doesn't update per batch.
                if self.trainer._learning_rate_scheduler:
                    self.trainer._learning_rate_scheduler.step_batch(
                        batch_num_total)

                if self.trainer._tensorboard.should_log_histograms_this_batch(
                ):
                    # get the magnitude of parameter updates for logging
                    # We need a copy of current parameters to compute magnitude of updates,
                    # and copy them to CPU so large models won't go OOM on the GPU.
                    param_updates = {
                        name: param.detach().cpu().clone()
                        for name, param in
                        self.trainer.model.named_parameters()
                    }
                    #self.trainer.optimizer.step()
                    for name, param in self.trainer.model.named_parameters():
                        param_updates[name].sub_(param.detach().cpu())
                        update_norm = torch.norm(param_updates[name].view(
                            -1, ))
                        param_norm = torch.norm(param.view(-1, )).cpu()
                        self.trainer._tensorboard.add_train_scalar(
                            "gradient_update/" + name,
                            update_norm / (param_norm + 1e-7))
                else:
                    pass
                    #self.trainer.optimizer.step()

                # Update moving averages
                if self.trainer._moving_average is not None:
                    self.trainer._moving_average.apply(batch_num_total)
            #
                metrics = training_util.get_metrics(self.trainer.model,
                                                    train_loss,
                                                    batches_this_epoch)
                metrics["uloss"] = float(
                    unlabelled_loss /
                    (batches_this_epoch + unlabelled_batches_this_epoch))
                # Update the description with the latest metrics
                description = training_util.description_from_metrics(metrics)
                train_generator_tqdm.set_description(description,
                                                     refresh=False)

                # Log parameter values to Tensorboard
                if self.trainer._tensorboard.should_log_this_batch(
                ) and batch_grad_norm is not None:
                    self.trainer._tensorboard.log_parameter_and_gradient_statistics(
                        self.trainer.model, batch_grad_norm)
                    self.trainer._tensorboard.log_learning_rates(
                        self.trainer.model, self.trainer.optimizer)

                    self.trainer._tensorboard.add_train_scalar(
                        "loss/loss_train", metrics["loss"])
                    self.trainer._tensorboard.log_metrics(
                        {"epoch_metrics/" + k: v
                         for k, v in metrics.items()})

                if self.trainer._tensorboard.should_log_histograms_this_batch(
                ):
                    self.trainer._tensorboard.log_histograms(
                        self.trainer.model, histogram_parameters)

                if self.trainer._log_batch_size_period:
                    cur_batch = sum([
                        training_util.get_batch_size(batch)
                        for batch in batch_group
                    ])
                    cumulative_batch_size += cur_batch
                    if (batches_this_epoch -
                            1) % self.trainer._log_batch_size_period == 0:
                        average = cumulative_batch_size / batches_this_epoch
                        logger.info(
                            f"current batch size: {cur_batch} mean batch size: {average}"
                        )
                        self.trainer._tensorboard.add_train_scalar(
                            "current_batch_size", cur_batch)
                        self.trainer._tensorboard.add_train_scalar(
                            "mean_batch_size", average)

                # Save model if needed.
                if self.trainer._model_save_interval is not None and (
                        time.time() - last_save_time >
                        self.trainer._model_save_interval):
                    last_save_time = time.time()
                    self.trainer._save_checkpoint('{0}.{1}'.format(
                        epoch, training_util.time_to_str(int(last_save_time))))

            #lambda update
            #if  (self.constraints_model is not None) and (self.dd_optimizer is not None) and (self.total_supervised_iters >= self.dd_warmup_iters) and (batches_this_epoch % self.dd_update_freq == 0):
            if (self.constraints_model
                    is not None) and (self.dd_optimizer is not None) and (
                        self.total_supervised_iters >= self.dd_warmup_iters
                    ) and (self.total_supervised_iters -
                           self.last_lambda_update >= self.dd_update_freq):
                for batch_group, group_id in mixed_generator_for_lambda:
                    self.lambda_update(batch_group)
                    self.last_lambda_update = self.total_supervised_iters
                    break

                self.count_lambda_updates += 1
                if (self.dd_increase_freq_after
                        is not None) and (self.count_lambda_updates %
                                          self.dd_increase_freq_after == 0):
                    self.dd_update_freq += self.dd_increase_freq_by
        if flag:
            batch_grad_norm = self.step(agg_loss)
            batches_since_last_step = 0
            agg_loss = 0.0
            flag = False

        #lambda update
        #if (self.constraints_model is not None) and (self.dd_optimizer is not None) and (self.total_supervised_iters >= self.dd_warmup_iters):
        if (self.constraints_model
                is not None) and (self.dd_optimizer is not None) and (
                    self.total_supervised_iters >= self.dd_warmup_iters) and (
                        self.total_supervised_iters - self.last_lambda_update
                        >= self.dd_update_freq):
            for batch_group, group_id in mixed_generator_for_lambda:
                self.lambda_update(batch_group)
                self.last_lambda_update = self.total_supervised_iters
                break

            self.count_lambda_updates += 1
            if (self.dd_increase_freq_after
                    is not None) and (self.count_lambda_updates %
                                      self.dd_increase_freq_after == 0):
                self.dd_update_freq += self.dd_increase_freq_by

        metrics = training_util.get_metrics(self.trainer.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        metrics['lb'] = batches_this_epoch
        metrics['ub'] = unlabelled_batches_this_epoch
        metrics["uloss"] = float(
            unlabelled_loss /
            (batches_this_epoch + unlabelled_batches_this_epoch))
        if self.constraints_model is not None:
            lambda_stats_dict = self.constraints_model.lambda_stats()
            metrics.update(lambda_stats_dict)
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
Exemple #15
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(
            raw_train_generator,
            num_gpus * self._num_gradient_accumulation_steps)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) /
            (num_gpus * self._num_gradient_accumulation_steps))
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        for batch_group in train_generator_tqdm:
            if not self._graph_added and self._require_graph:
                model_copy = deepcopy(self.model)
                model_copy.log_graph()
                wrapped_model = ModelWrapper(model_copy)
                graph_inputs = wrapped_model.process_inputs(batch_group[0])
                # print(deepcopy(wrapped_model)(graph_inputs))
                self._tensorboard.add_graph(wrapped_model, [graph_inputs])
                self._graph_added = True

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            num_batch = len(batch_group) // num_gpus
            for i in range(num_batch):
                if (i + 1) * num_gpus > len(batch_group):
                    batch_i = batch_group[i * num_gpus:]
                else:
                    batch_i = batch_group[i * num_gpus:(i + 1) * num_gpus]

                loss = self.batch_loss(batch_i, for_training=True)
                if loss is None or torch.isnan(loss):
                    print("nan loss")
                    continue
                    # raise ValueError("nan loss encountered")
                loss = loss / num_batch
                # try:
                #     loss.backward()
                # except Exception:
                #     print("loss: ", loss)
                #     print(batch_group)
                # with torch.autograd.set_detect_anomaly(True):  This can potentially lead to slower training
                #     loss.backward()

                # loss = loss.half()
                loss.backward()
                # with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                #    try:
                #        scaled_loss.backward()
                #    except RuntimeError:
                #        print("CUDA out of memory during backward()")
                #        continue

                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
Exemple #16
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains on one epoch. Differs from base trainer in that 
        it utilizes
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)
        raw_generators = []

        # fix max number of batches
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")

        cumulative_batch_size = 0
        for i in range(0, self.meta_batches):
            train_generators = []
            for i, train_info in enumerate(self.train_data):
                raw_train_generator = self.iterator(train_info,
                                                    num_epochs=1,
                                                    shuffle=self.shuffle)
                train_generators.append(
                    lazy_groups_of(raw_train_generator, num_gpus))

            loss_batch = self.reptile_outer_update(train_generators, i,
                                                   num_gpus)

            # TODO figure out if is important
            train_loss = loss_batch
            print('[info] train_loss is:{}'.format(train_loss))

            # TODO figure out BATCH NORM MAML https://openreview.net/pdf?id=HygBZnRctX
            if self.batch_norm:
                batch_grad_norm = self.rescale_gradients()
            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            # TODO investigate learning rate scheduling for meta learning
            #if self._learning_rate_scheduler:
            #self._learning_rate_scheduler.step_batch(batch_num_total)
            #if self._momentum_scheduler:
            #self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
Exemple #17
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        # 使训练数据可迭代
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        # 将可迭代的单实例批处理到list中
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        # 向上取整 获取batch数 (总batch/gpu数)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / num_gpus)
        # 默认的accumulated batch count 为4,此处是求accumulate的尾巴
        residue = num_training_batches % self.accumulated_batch_count
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        # 训练进度条
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        # 梯度清零 常规操作
        self.optimizer.zero_grad()
        # 开始训练
        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total
            # 一个batch为accumulated_batch_count个iteration,梯度累积
            iter_len = self.accumulated_batch_count \
                if batches_this_epoch <= (num_training_batches - residue) else residue

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'Before forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'Before forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )
            try:  # 平均loss
                loss = self.batch_loss(batch_group,
                                       for_training=True) / iter_len
            except RuntimeError as e:
                print(e)
                for x in batch_group:
                    all_words = [len(y['words']) for y in x['metadata']]
                    print(f"Total sents: {len(all_words)}. "
                          f"Min {min(all_words)}. Max {max(all_words)}")
                    for elem in ['labels', 'd_tags']:
                        tt = x[elem]
                        print(
                            f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}"
                        )
                    for elem in ["bert", "mask", "bert-offsets"]:
                        tt = x['tokens'][elem]
                        print(
                            f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}"
                        )
                raise e

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'After forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'After forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")
            # 反向传播
            loss.backward()

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'After backprop - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'After backprop - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )
            # 计算loss
            train_loss += loss.item() * iter_len
            # 删除两个变量
            del batch_group, loss
            # pytorch 训练时无用的临时变量可能会越来越多,导致 out of memory ,可以使用下面语句来清理这些不需要的变量。
            torch.cuda.empty_cache()

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'After collecting garbage - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'After collecting garbage - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )
            # 正则化梯度
            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            # lr会在epoch变大的同时予以调整,一般是逐渐变小
            # momentum 动量 防止损失函数陷入局部极小值,跳出鞍点
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # copy参数 防止爆内存
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                if batches_this_epoch % self.accumulated_batch_count == 0 or \
                        batches_this_epoch == num_training_batches:
                    # 自动计算梯度 optimizer.step()
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    # 求l1范数
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                if batches_this_epoch % self.accumulated_batch_count == 0 or \
                        batches_this_epoch == num_training_batches:
                    self.optimizer.step()
                    self.optimizer.zero_grad()

            # Update moving averages 在adam或SGD优化中为了平衡模型更新速度一般设置滑动平均来提高模型在测试数据上的健壮性
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))

        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        # Get tqdm for the training batches
        train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        num_training_batches = self.iterator.get_num_batches(self.train_data)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0

        self.optimizer.zero_grad()
        for batch_id, batch in enumerate(train_generator_tqdm):
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            loss = self.batch_loss(batch, for_training=True)
            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                if (batch_id + 1) % self._accumulation_steps == 0:
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                if (batch_id + 1) % self._accumulation_steps == 0:
                    self.optimizer.step()
                    self.optimizer.zero_grad()

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = training_util.get_batch_size(batch)
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics
Exemple #19
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)  # 如果没有gpu ,也返回1.

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / num_gpus)
        residue = num_training_batches % self.accumulated_batch_count
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(
            train_generator, total=num_training_batches)  # 打印一个进度条而已.
        cumulative_batch_size = 0
        self.optimizer.zero_grad()
        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            iter_len = self.accumulated_batch_count \
                if batches_this_epoch <= (num_training_batches - residue) else residue

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'Before forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'Before forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )
            try:
                loss = self.batch_loss(
                    batch_group,
                    for_training=True) / iter_len  # 输入的数据里面去除了全部都是keep的情况
            except RuntimeError as e:
                print(e)
                for x in batch_group:
                    all_words = [len(y['words']) for y in x['metadata']]
                    print(f"Total sents: {len(all_words)}. "
                          f"Min {min(all_words)}. Max {max(all_words)}")
                    for elem in ['labels', 'd_tags']:
                        tt = x[elem]
                        print(
                            f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}"
                        )
                    for elem in ["bert", "mask", "bert-offsets"]:
                        tt = x['tokens'][elem]
                        print(
                            f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}"
                        )
                raise e

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'After forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'After forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'After backprop - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'After backprop - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )

            train_loss += loss.item() * iter_len

            del batch_group, loss
            torch.cuda.empty_cache()  # 节省内存,显存

            if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
                print(
                    f'After collecting garbage - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
                )
                print(
                    f'After collecting garbage - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
                )

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                if batches_this_epoch % self.accumulated_batch_count == 0 or \
                        batches_this_epoch == num_training_batches:
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                if batches_this_epoch % self.accumulated_batch_count == 0 or \
                        batches_this_epoch == num_training_batches:
                    self.optimizer.step()  #多个batch才进行bp算法.
                    self.optimizer.zero_grad()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss,
                                                batches_this_epoch)  # 计算准确率
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed. 取一个间隔来存
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))

        metrics = training_util.get_metrics(self.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
Exemple #20
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = common_util.peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) /
            self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")

        cumulative_batch_group_size = 0
        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing training early! "
                        "This implies that there is an imbalance in your training "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            for batch in batch_group:
                loss = self.batch_loss(batch, for_training=True)
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                loss.backward()
                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)

            # Log parameter values to Tensorboard (only from the master)
            if self._tensorboard.should_log_this_batch() and self._master:
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                batch_group_size = sum(
                    training_util.get_batch_size(batch)
                    for batch in batch_group)
                cumulative_batch_group_size += batch_group_size
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_group_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {batch_group_size} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       batch_group_size)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if (self._model_save_interval is not None and
                (time.time() - last_save_time > self._model_save_interval)
                    and self._master):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
            )
            # Indicate that we're done so that any workers that have remaining data stop the epoch early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
Exemple #21
0
    def forward(self,  # type: ignore
                task_index: torch.IntTensor,
                reverse: torch.ByteTensor,
                for_training: torch.ByteTensor,
                train_stage: torch.IntTensor,
                tokens: Dict[str, torch.LongTensor],
                label: torch.IntTensor = None) -> Dict[str, torch.Tensor]:
        """
        :param task_index:
        :param reverse:
        :param for_training:
        :param train_stage: ["share_senti", "share_classify",
        "share_classify_adversarial", "domain_valid", "domain_valid_adversarial"]
        :param tokens:
        :param label:
        :return:
        """
        embedded_text = self._text_field_embedder(tokens)
        mask = get_text_field_mask(tokens).float()
        embed_tokens = self._encoder(embedded_text, mask)
        batch_size = get_batch_size(embed_tokens)
        # bs * (25*4)
        seq_vec = self._seq_vec(embed_tokens, mask)
        # TODO add linear layer

        domain_embeddings = self._domain_embeddings(torch.arange(self._de_dim).cuda())

        de_scores = F.softmax(
            self._de_attention(seq_vec, domain_embeddings.expand(batch_size, *domain_embeddings.size())), dim=1)
        de_valid = False
        if np.random.rand() < 0.3:
            de_valid = True
            noise = 0.01 * torch.normal(mean=0.5,
                                        # std=torch.std(domain_embeddings).sign_())
                                        std=torch.empty(*de_scores.size()).fill_(1.0))
            de_scores = de_scores + noise.cuda()
        domain_embedding = torch.matmul(de_scores, domain_embeddings)
        domain_embedding = self._de_feedforward(domain_embedding)
        # train sentiment classify
        if train_stage.cpu() == torch.tensor(0) or not for_training:

            de_representation = torch.tanh(torch.add(domain_embedding, seq_vec))

            sentiment_logits = self._sentiment_discriminator(de_representation)
            if label is not None:
                loss = self._loss(sentiment_logits, label)
                self.metrics["{}_stm_acc".format(TASKS_NAME[task_index.cpu()])](sentiment_logits, label)

        if train_stage.cpu() == torch.tensor(1) or not for_training:
            s_domain_logits = self._s_domain_discriminator(seq_vec, reverse=reverse)
            task_index = task_index.expand(batch_size)
            loss = self._domain_loss(s_domain_logits, task_index)
            self.metrics["s_domain_acc"](s_domain_logits, task_index)

        if train_stage.cpu() == torch.tensor(2) or not for_training:
            valid_logits = self._valid_discriminator(domain_embedding, reverse=reverse)
            valid_label = torch.ones(batch_size).cuda()
            if de_valid:
                valid_label = torch.zeros(batch_size).cuda()
            if self._label_smoothing is not None and self._label_smoothing > 0.0:
                loss = sequence_cross_entropy_with_logits(valid_logits,
                                                          valid_label.unsqueeze(0).cuda(),
                                                          torch.tensor(1).unsqueeze(0).cuda(),
                                                          average="token",
                                                          label_smoothing=self._label_smoothing)
            else:
                loss = self._valid_loss(valid_logits,
                                        torch.zeros(2).scatter_(0, valid_label, torch.tensor(1.0)).cuda())
            self.metrics["valid_acc"](valid_logits, valid_label)
        # TODO add orthogonal loss
        output_dict = {"loss": loss}

        return output_dict
Exemple #22
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data)/num_gpus)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())


        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        for batch_group in train_generator_tqdm:
            self.model.train()
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            loss = self.batch_loss(batch_group, for_training=True)

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().cpu().clone()
                                 for name, param in self.model.named_parameters()}
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                       update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model, self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
                self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model, histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size/batches_this_epoch
                    logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                    self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size", average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                        '{0}.{1}'.format(epoch, training_util.time_to_str(int(last_save_time)))
                )
            if self._early_stopping_by_batch and self._batch_num_total % 10 == 0:
                if self._validation_data is not None:
                    with torch.no_grad():
                        # We have a validation set, so compute all the metrics on it.
                        val_loss, num_batches = self._validation_loss()
                        val_metrics = training_util.get_metrics(self.model, val_loss, num_batches, reset=True)

                        # Check validation metric for early stopping
                        this_epoch_val_metric = val_metrics[self._validation_metric]
                        self._metric_tracker.add_metric(this_epoch_val_metric)

                        if self._metric_tracker.is_best_so_far():
                            metrics['best_batch'] = self._batch_num_total
                            for key, value in val_metrics.items():
                                metrics["best_validation_" + key] = value
                            self._metric_tracker.best_epoch_metrics = val_metrics

                        self._save_checkpoint(self._batch_num_total)

                        if self.callbacks is not None:
                            for callback in self.callbacks:
                                callback.on_batch_end(self._batch_num_total)

        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_'+str(gpu_num)+'_memory_MB'] = memory
        return metrics
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """ 
    Trains one epoch and returns metrics. 
    only report system utils when we are local rank 0 at each machine. 
    """
        logger.info("Rank %d: Epoch %d/%d", self._rank, epoch,
                    self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        if self._is_chief:
            logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        # should be 1 anyway, because we are only dealing with nprocess_with_ngpus
        num_gpus = len(self._cuda_device)

        # TODO: Implementation of whether the generator should take into account of worldsize.
        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / num_gpus)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        # NOTE: only work in nprocess_ngpus
        device = torch.device("cuda:%d" % self._cuda_device[0])
        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            loss = self.batch_loss(batch_group, for_training=True)

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()
            train_loss += loss.item()
            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._is_chief:
                # only chief do tensorboard
                if self._tensorboard.should_log_histograms_this_batch():
                    # get the magnitude of parameter updates for logging
                    # We need a copy of current parameters to compute magnitude of updates,
                    # and copy them to CPU so large models won't go OOM on the GPU.
                    param_updates = {
                        name: param.detach().cpu().clone()
                        for name, param in self.model.named_parameters()
                    }
                    self.optimizer.step()
                    for name, param in self.model.named_parameters():
                        param_updates[name].sub_(param.detach().cpu())
                        update_norm = torch.norm(param_updates[name].view(
                            -1, ))
                        param_norm = torch.norm(param.view(-1, )).cpu()
                        self._tensorboard.add_train_scalar(
                            "gradient_update/" + name,
                            update_norm / (param_norm + 1e-7))
                else:
                    self.optimizer.step()
            else:
                self.optimizer.step()

            # Update moving averages
            # NOTE: not sure whether this need to be average
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            metrics = get_metrics(self.model, device, self._worldsize,
                                  train_loss, batches_this_epoch)

            description = training_util.description_from_metrics(metrics)
            train_generator_tqdm.set_description(
                ("Rank %d: " % self._rank) + description, refresh=False)

            if self._is_chief:
                # Log parameter values to Tensorboard
                if self._tensorboard.should_log_this_batch():
                    self._tensorboard.log_parameter_and_gradient_statistics(
                        self.model, batch_grad_norm)
                    self._tensorboard.log_learning_rates(
                        self.model, self.optimizer)

                    self._tensorboard.add_train_scalar("loss/loss_train",
                                                       metrics["loss"])
                    self._tensorboard.log_metrics(
                        {"epoch_metrics/" + k: v
                         for k, v in metrics.items()})

                if self._tensorboard.should_log_histograms_this_batch():
                    self._tensorboard.log_histograms(self.model,
                                                     histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"rank {self._rank}, current batch size: {cur_batch} mean batch size: {average}"
                    )
                    if self._is_chief:
                        self._tensorboard.add_train_scalar(
                            "current_batch_size", cur_batch)
                        self._tensorboard.add_train_scalar(
                            "mean_batch_size", average)

            if self._is_chief:
                # Save model if needed.
                if self._model_save_interval is not None and (
                        time.time() - last_save_time >
                        self._model_save_interval):
                    last_save_time = time.time()
                    self._save_checkpoint('{0}.{1}'.format(
                        epoch, training_util.time_to_str(int(last_save_time))))

            metrics = get_metrics(self.model, device, self._worldsize,
                                  train_loss, batches_this_epoch)
            metrics['cpu_memory_MB'] = peak_cpu_usage
            return metrics