    def get_train_dataloader(self) -> DataLoader:
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        if not self.alternate:
            if is_tpu_available():
                train_sampler = get_tpu_sampler(self.train_dataset)
            else:
                train_sampler = (RandomSampler(self.train_dataset)
                                 if self.args.local_rank == -1 else
                                 DistributedSampler(self.train_dataset))

            data_loader = DataLoader(
                self.train_dataset,
                batch_size=self.args.train_batch_size,
                sampler=train_sampler,
                collate_fn=self.data_collator.collate_batch,
            )
        else:
            data_loader = DataLoader(
                self.train_dataset,
                batch_size=self.args.train_batch_size,
                shuffle=False,
                sampler=None,
                collate_fn=self.data_collator.collate_batch,
            )

        if is_tpu_available():
            data_loader = pl.ParallelLoader(
                data_loader,
                [self.args.device]).per_device_loader(self.args.device)

        return data_loader
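The get_tpu_sampler helper used in the TPU branch above is not shown in these snippets. A minimal sketch of it, assuming the usual torch_xla alias xm and the torch samplers already imported here:

def get_tpu_sampler(dataset: Dataset):
    # Sketch: with a single XLA device, plain random sampling is enough;
    # with several devices, shard the dataset across TPU ordinals.
    if xm.xrt_world_size() <= 1:
        return RandomSampler(dataset)
    return DistributedSampler(dataset,
                              num_replicas=xm.xrt_world_size(),
                              rank=xm.get_ordinal())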
Example #2
    def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
        # We use the same batch_size as for eval.
        if is_tpu_available():
            sampler = SequentialDistributedSampler(
                test_dataset,
                num_replicas=xm.xrt_world_size(),
                rank=xm.get_ordinal())
        elif self.args.local_rank != -1:
            sampler = SequentialDistributedSampler(test_dataset)
        else:
            sampler = SequentialSampler(test_dataset)

        data_loader = DataLoader(
            test_dataset,
            sampler=sampler,
            batch_size=self.args.eval_batch_size,
            collate_fn=self.data_collator.collate_batch,
        )

        if is_tpu_available():
            data_loader = pl.ParallelLoader(
                data_loader,
                [self.args.device]).per_device_loader(self.args.device)

        return data_loader
    def get_eval_dataloader(self,
                            eval_dataset: Optional[Dataset] = None
                            ) -> DataLoader:
        if eval_dataset is None and self.eval_dataset is None:
            raise ValueError("Trainer: evaluation requires an eval_dataset.")

        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset

        sampler = get_tpu_sampler(eval_dataset) if is_tpu_available() else None

        batch_size = 1 if self.alternate else self.args.eval_batch_size
        data_loader = DataLoader(
            eval_dataset,
            sampler=sampler,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=self.data_collator.collate_batch,
        )

        if is_tpu_available():
            data_loader = pl.ParallelLoader(
                data_loader,
                [self.args.device]).per_device_loader(self.args.device)

        return data_loader
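SequentialDistributedSampler is likewise referenced but not defined in these snippets. A minimal sketch of such a sampler, assuming it pads the dataset so every process receives an equally sized, contiguous, in-order shard (the padding detail is an assumption):

import math

from torch.utils.data.sampler import Sampler


class SequentialDistributedSampler(Sampler):
    # Sketch: sequential (non-shuffled) sharding for distributed evaluation.
    def __init__(self, dataset, num_replicas=None, rank=None):
        if num_replicas is None:
            num_replicas = torch.distributed.get_world_size()
        if rank is None:
            rank = torch.distributed.get_rank()
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        self.num_samples = int(math.ceil(len(dataset) / num_replicas))
        self.total_size = self.num_samples * num_replicas

    def __iter__(self):
        indices = list(range(len(self.dataset)))
        # Pad with the leading indices so the total divides evenly.
        indices += indices[: self.total_size - len(indices)]
        # Each rank takes one contiguous slice, preserving dataset order.
        start = self.rank * self.num_samples
        return iter(indices[start:start + self.num_samples])

    def __len__(self):
        return self.num_samples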
Example #4
    def get_eval_dataloader(self,
                            eval_dataset: Optional[Dataset] = None
                            ) -> DataLoader:
        if eval_dataset is None and self.eval_dataset is None:
            raise ValueError("Trainer: evaluation requires an eval_dataset.")

        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset

        if self.args.use_bucket_iterator:

            bucket_boundaries = [0, 20, 30, 40, 50, 60, 70, 80, 90, 101]
            eval_sampler = BySequenceLengthSampler(
                eval_dataset,
                bucket_boundaries,
                batch_size=self.args.eval_batch_size,
                drop_last=False)

            data_loader = DataLoader(
                eval_dataset,
                batch_size=1,
                batch_sampler=eval_sampler,
                collate_fn=self.data_collator.collate_batch,
                num_workers=0,
                pin_memory=False)
        else:

            if is_tpu_available():
                sampler = SequentialDistributedSampler(
                    eval_dataset,
                    num_replicas=xm.xrt_world_size(),
                    rank=xm.get_ordinal())
            elif self.args.local_rank != -1:
                sampler = SequentialDistributedSampler(eval_dataset)
            else:
                sampler = SequentialSampler(eval_dataset)

            data_loader = DataLoader(
                eval_dataset,
                sampler=sampler,
                batch_size=self.args.eval_batch_size,
                collate_fn=self.data_collator.collate_batch,
            )

        if is_tpu_available():
            data_loader = pl.ParallelLoader(
                data_loader,
                [self.args.device]).per_device_loader(self.args.device)

        return data_loader
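BySequenceLengthSampler is project-specific and not included in these examples. Purely as an illustration (the class name is taken from the calls above, but the implementation below is an assumption, not the original), a length-bucketing batch sampler could look like this: indices are grouped into the given bucket_boundaries by sequence length and whole batches are drawn from a single bucket, which keeps per-batch padding small.

import random

from torch.utils.data.sampler import Sampler


class BySequenceLengthSampler(Sampler):
    # Illustrative sketch only: yields batches of indices whose sequence
    # lengths fall into the same bucket.
    def __init__(self, dataset, bucket_boundaries, batch_size, drop_last=False):
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.buckets = {upper: [] for upper in bucket_boundaries[1:]}
        for idx in range(len(dataset)):
            # Assumes each example exposes an "input_ids" sequence.
            length = len(dataset[idx]["input_ids"])
            for upper in bucket_boundaries[1:]:
                if length < upper:
                    self.buckets[upper].append(idx)
                    break

    def __iter__(self):
        for bucket in self.buckets.values():
            random.shuffle(bucket)
            for start in range(0, len(bucket), self.batch_size):
                batch = bucket[start:start + self.batch_size]
                if self.drop_last and len(batch) < self.batch_size:
                    continue
                yield batch

    def __len__(self):
        total = sum(len(bucket) for bucket in self.buckets.values())
        if self.drop_last:
            return total // self.batch_size
        return -(-total // self.batch_size)

Note that when a batch_sampler is supplied, the DataLoader above keeps batch_size=1, since batching is handled entirely by the sampler.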
Example #5
    def get_eval_dataloader(self,
                            eval_dataset: Optional[Dataset] = None
                            ) -> DataLoader:
        if eval_dataset is None and self.eval_dataset is None:
            raise ValueError("Trainer: evaluation requires an eval_dataset.")

        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset

        if is_tpu_available():
            sampler = SequentialDistributedSampler(
                eval_dataset,
                num_replicas=xm.xrt_world_size(),
                rank=xm.get_ordinal())
        elif self.args.local_rank != -1:
            sampler = SequentialDistributedSampler(eval_dataset)
        else:
            sampler = SequentialSampler(eval_dataset)

        data_loader = DataLoader(
            eval_dataset,
            sampler=sampler,
            batch_size=self.args.eval_batch_size,
            collate_fn=self.data_collator.collate_batch,
        )

        return data_loader
    def __init__(
            self,
            model: PreTrainedModel,
            args: TrainingArguments,
            model_args: ModelArguments,
            data_args: DataTrainingArguments,
            data_collator: Optional[DataCollator] = None,
            train_dataset: Optional[Dataset] = None,
            eval_dataset: Optional[Dataset] = None,
            compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
            prediction_loss_only=False,
            tb_writer: Optional["SummaryWriter"] = None,
            optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
    ):
        """
        Trainer is a simple but feature-complete training and eval loop for PyTorch,
        optimized for Transformers.

        Args:
            prediction_loss_only:
                (Optional) in evaluation and prediction, only return the loss
        """
        self.model = model.to(args.device)
        self.args = args
        self.model_args = model_args
        self.data_args = data_args
        if data_collator is not None:
            self.data_collator = data_collator
        else:
            self.data_collator = DefaultDataCollator()
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.compute_metrics = compute_metrics
        self.prediction_loss_only = prediction_loss_only
        self.optimizers = optimizers
        self.best_model_path = None
        if tb_writer is not None:
            self.tb_writer = tb_writer
        elif is_tensorboard_available() and self.is_world_master():
            self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
        else:
            self.tb_writer = None
        if not is_tensorboard_available():
            logger.warning(
                "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it."
            )
        if is_wandb_available():
            self._setup_wandb()
        else:
            logger.info(
                "You are instantiating a Trainer but W&B is not installed. To use wandb logging, "
                "run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface."
            )
        set_seed(self.args.seed)
        # Create output directory if needed
        if self.is_world_master():
            os.makedirs(self.args.output_dir, exist_ok=True)
        if is_tpu_available():
            # Set an xla_device flag on the model's config.
            # We'll find a more elegant way and not need to do this in the future.
            self.model.config.xla_device = True
    def is_world_master(self) -> bool:
        """
        This will be True only in one process, even in distributed mode,
        even when training on multiple machines.
        """
        if is_tpu_available():
            return xm.is_master_ordinal(local=False)
        else:
            return self.args.local_rank == -1 or torch.distributed.get_rank() == 0
Example #8
    def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
        # We use the same batch_size as for eval.
        sampler = get_tpu_sampler(test_dataset) if is_tpu_available() else None

        data_loader = DataLoader(
            test_dataset,
            sampler=sampler,
            batch_size=self.args.eval_batch_size,
            shuffle=False,
            collate_fn=self.data_collator.collate_batch,
        )

        if is_tpu_available():
            data_loader = pl.ParallelLoader(
                data_loader,
                [self.args.device]).per_device_loader(self.args.device)

        return data_loader
Example #9
    def num_examples(
            self, dataloader: Union[DataLoader, "pl.PerDeviceLoader"]) -> int:
        """
        Helper to get num of examples from a DataLoader, by accessing its Dataset.
        """
        if is_tpu_available():
            assert isinstance(dataloader, pl.PerDeviceLoader)
            return len(dataloader._loader._loader.dataset)
        else:
            return len(dataloader.dataset)
Example #10
    def get_train_dataloader(self) -> DataLoader:
        if self.args.use_bucket_iterator:

            print("\n\n\n\n USING BUCKET ITERATOR \n\n\n\n")

            bucket_boundaries = [0, 20, 30, 40, 50, 60, 70, 80, 90, 101]
            train_sampler = BySequenceLengthSampler(
                self.train_dataset,
                bucket_boundaries,
                batch_size=self.args.train_batch_size,
                drop_last=False)

            data_loader = DataLoader(
                self.train_dataset,
                batch_size=1,
                batch_sampler=train_sampler,
                collate_fn=self.data_collator.collate_batch,
                num_workers=0,
                pin_memory=False)

        else:
            if self.train_dataset is None:
                raise ValueError("Trainer: training requires a train_dataset.")
            if is_tpu_available():
                train_sampler = get_tpu_sampler(self.train_dataset)
            else:
                train_sampler = (RandomSampler(self.train_dataset)
                                 if self.args.local_rank == -1 else
                                 DistributedSampler(self.train_dataset))

            data_loader = DataLoader(
                self.train_dataset,
                batch_size=self.args.train_batch_size,
                sampler=train_sampler,
                collate_fn=self.data_collator.collate_batch,
            )

        if is_tpu_available():
            data_loader = pl.ParallelLoader(
                data_loader,
                [self.args.device]).per_device_loader(self.args.device)

        return data_loader
Example #11
    def save_model(self, output_dir: Optional[str] = None):
        """
        Saving best-practices: if you use default names for the model,
        you can reload it using from_pretrained().

        Will only save from the world_master process (unless in TPUs).
        """

        if is_tpu_available():
            self._save_tpu(output_dir)
        elif self.is_world_master():
            self._save(output_dir)
Example #12
    def get_train_dataloader(self) -> DataLoader:
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        if is_tpu_available():
            train_sampler = get_tpu_sampler(self.train_dataset)
        else:
            train_sampler = (RandomSampler(self.train_dataset)
                             if self.args.local_rank == -1 else
                             DistributedSampler(self.train_dataset))

        data_loader = DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
            collate_fn=self.data_collator.collate_batch,
        )

        return data_loader
Example #13
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput, TrainOutput
from transformers.training_args import TrainingArguments, is_tpu_available

try:
    from apex import amp

    _has_apex = True
except ImportError:
    _has_apex = False


def is_apex_available():
    return _has_apex


if is_tpu_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met
    import torch_xla.distributed.parallel_loader as pl

try:
    from torch.utils.tensorboard import SummaryWriter

    _has_tensorboard = True
except ImportError:
    try:
        from tensorboardX import SummaryWriter

        _has_tensorboard = True
    except ImportError:
        _has_tensorboard = False
    def _prediction_loop(
            self,
            dataloader: DataLoader,
            description: str,
            prediction_loss_only: Optional[bool] = None) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

        Works both with or without labels.
        """

        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

        # multi-gpu eval
        if self.args.n_gpu > 1 and not isinstance(self.model,
                                                  torch.nn.DataParallel):
            model = torch.nn.DataParallel(self.model)
        else:
            model = self.model
        model.to(self.args.device)

        if is_tpu_available():
            batch_size = dataloader._loader._loader.batch_size
        else:
            batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", self.num_examples(dataloader))
        logger.info("  Batch size = %d", batch_size)
        eval_losses: List[float] = []
        preds_t1: Optional[np.ndarray] = None
        preds_t2: Optional[np.ndarray] = None
        label_ids_t1: Optional[np.ndarray] = None
        label_ids_t2: Optional[np.ndarray] = None
        model.eval()

        for inputs in tqdm(dataloader, desc=description):
            has_labels = any(
                inputs.get(k) is not None for k in [
                    "labels", "labels_t1", "labels_t2", "lm_labels",
                    "masked_lm_labels"
                ])

            for k, v in inputs.items():
                inputs[k] = v.to(self.args.device)

            with torch.no_grad():
                outputs = model(**inputs)

                if has_labels:
                    if self.alternate:
                        step_eval_loss, logits, task = outputs[:3]
                    else:
                        step_eval_loss, logits_t1, logits_t2 = outputs[:3]
                    eval_losses += [step_eval_loss.mean().item()]
                else:
                    logits = outputs[0]

            if self.alternate:
                if not prediction_loss_only:
                    if task == 0:
                        if preds_t1 is None:
                            preds_t1 = logits.detach().cpu().numpy()
                        else:
                            preds_t1 = np.append(preds_t1,
                                                 logits.detach().cpu().numpy(),
                                                 axis=0)
                        if inputs.get("labels") is not None:
                            if label_ids_t1 is None:
                                label_ids_t1 = inputs["labels"].detach().cpu(
                                ).numpy()
                            else:
                                label_ids_t1 = np.append(
                                    label_ids_t1,
                                    inputs["labels"].detach().cpu().numpy(),
                                    axis=0)

                    elif task == 1:
                        if preds_t2 is None:
                            preds_t2 = logits.detach().cpu().numpy()
                        else:
                            preds_t2 = np.append(preds_t2,
                                                 logits.detach().cpu().numpy(),
                                                 axis=0)
                        if inputs.get("labels") is not None:
                            if label_ids_t2 is None:
                                label_ids_t2 = inputs["labels"].detach().cpu(
                                ).numpy()
                            else:
                                label_ids_t2 = np.append(
                                    label_ids_t2,
                                    inputs["labels"].detach().cpu().numpy(),
                                    axis=0)

            else:
                if not prediction_loss_only:
                    if preds_t1 is None or preds_t2 is None:
                        preds_t1 = logits_t1.detach().cpu().numpy()
                        preds_t2 = logits_t2.detach().cpu().numpy()
                    else:
                        preds_t1 = np.append(preds_t1,
                                             logits_t1.detach().cpu().numpy(),
                                             axis=0)
                        preds_t2 = np.append(preds_t2,
                                             logits_t2.detach().cpu().numpy(),
                                             axis=0)
                    if inputs.get("labels_t1") is not None:
                        if label_ids_t1 is None or label_ids_t2 is None:
                            label_ids_t1 = inputs["labels_t1"].detach().cpu(
                            ).numpy()
                            label_ids_t2 = inputs["labels_t2"].detach().cpu(
                            ).numpy()
                        else:
                            label_ids_t1 = np.append(
                                label_ids_t1,
                                inputs["labels_t1"].detach().cpu().numpy(),
                                axis=0)
                            label_ids_t2 = np.append(
                                label_ids_t2,
                                inputs["labels_t2"].detach().cpu().numpy(),
                                axis=0)

        # if is_tpu_available() and preds is not None and label_ids is not None:
        #     # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
        #     preds = xm.mesh_reduce("eval_preds", preds, np.concatenate)
        #     label_ids = xm.mesh_reduce("eval_out_label_ids", label_ids, np.concatenate)

        metrics = {}
        if self.compute_metrics is not None:
            if preds_t1 is not None and label_ids_t1 is not None:
                metrics["task 1"] = self.compute_metrics(
                    EvalPrediction(predictions=preds_t1,
                                   label_ids=label_ids_t1))
            if preds_t2 is not None and label_ids_t2 is not None:
                metrics["task 2"] = self.compute_metrics(
                    EvalPrediction(predictions=preds_t2,
                                   label_ids=label_ids_t2))

        if len(eval_losses) > 0:
            metrics["eval_loss"] = np.mean(eval_losses)

        # Prefix all keys with eval_
        for key in list(metrics.keys()):
            if not key.startswith("eval_"):
                metrics[f"eval_{key}"] = metrics.pop(key)

        return (PredictionOutput(predictions=preds_t1,
                                 label_ids=label_ids_t1,
                                 metrics=metrics),
                PredictionOutput(predictions=preds_t2,
                                 label_ids=label_ids_t2,
                                 metrics=metrics))
    def _prediction_loop(
            self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None
    ) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

        Works both with or without labels.
        """

        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

        model = self.model
        # multi-gpu eval
        if self.args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        else:
            model = self.model
        # Note: in torch.distributed mode, there's no point in wrapping the model
        # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", self.num_examples(dataloader))
        logger.info("  Batch size = %d", batch_size)
        eval_losses: List[float] = []
        task_probs: Dict[str, torch.Tensor] = {}
        preds: Dict[str, torch.Tensor] = {}
        label_ids: Dict[str, torch.Tensor] = {}
        model.eval()

        if is_tpu_available():
            dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device)

        for inputs in tqdm(dataloader, desc=description):
            if description == 'Prediction':
                inputs.pop('labels', None)
            has_labels = any(inputs.get(k) is not None for k in ["labels", "lm_labels", "masked_lm_labels"])

            task_id = inputs.pop('task_id')
            for k, v in inputs.items():
                inputs[k] = v.to(self.args.device)
            inputs['task_id'] = task_id

            with torch.no_grad():
                outputs = model(**inputs)
                if has_labels:
                    step_eval_loss, logits = outputs[:2]
                    eval_losses += [step_eval_loss.mean().item()]
                else:
                    logits = outputs[0]

            if not prediction_loss_only:
                probs = nn.functional.softmax(logits.detach(), dim=-1)
                pred_labels = logits.detach().argmax(dim=-1)
                if task_id not in preds:
                    preds[task_id] = pred_labels
                    task_probs[task_id] = probs
                else:
                    task_probs[task_id] = torch.cat((task_probs[task_id], probs), dim=0)
                    preds[task_id] = torch.cat((preds[task_id], pred_labels), dim=0)
                if inputs.get("labels") is not None:
                    labels = inputs["labels"].detach()
                    if task_id not in label_ids:
                        label_ids[task_id] = labels
                    else:
                        label_ids[task_id] = torch.cat((label_ids[task_id], labels), dim=0)

        if self.args.local_rank != -1:
            # In distributed mode, concatenate all results from all nodes:
            if preds:
                preds = self.distributed_concat(preds, num_total_examples=self.num_examples(dataloader))
            if label_ids:
                label_ids = self.distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader))
        elif is_tpu_available():
            # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
            if preds:
                preds = xm.mesh_reduce("eval_preds", preds, torch.cat)
            if label_ids:
                label_ids = xm.mesh_reduce("eval_label_ids", label_ids, torch.cat)

        metrics = {}
        if self.compute_metrics is not None and preds and label_ids:
            for task_id, task_preds in preds.items():
                task_preds = task_preds.cpu().numpy()
                task_label_ids = label_ids[task_id].cpu().numpy()
                metrics[task_id] = self.compute_metrics(
                    EvalPrediction(predictions=task_preds, label_ids=task_label_ids)
                )
            metrics['eval_avg'] = sum(metrics.values()) / len(metrics)

        if len(eval_losses) > 0:
            metrics["eval_loss"] = np.mean(eval_losses)

        # Prefix all keys with eval_
        for key in list(metrics.keys()):
            if not key.startswith("eval_"):
                metrics[f"eval_{task_id_to_name[key]}"] = metrics.pop(key)

        return PredictionOutput(predictions=preds, task_probs=task_probs, label_ids=label_ids, metrics=metrics)
Example #16
    def train(self, model_path: Optional[str] = None):
        """
        Main training entry point.

        Args:
            model_path:
                (Optional) Local path to model if model to train has been instantiated from a local path
                If present, we will try reloading the optimizer/scheduler states from there.
        """
        train_dataloader = self.get_train_dataloader()
        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            num_train_epochs = (self.args.max_steps //
                                (len(train_dataloader) //
                                 self.args.gradient_accumulation_steps) + 1)
        else:
            t_total = int(
                (len(self.train_dataset) // self.args.train_batch_size) //
                self.args.gradient_accumulation_steps *
                self.args.num_train_epochs)
            num_train_epochs = self.args.num_train_epochs

        optimizer, scheduler = self.get_optimizers(num_training_steps=t_total)

        # Check if saved optimizer or scheduler states exist
        if (model_path is not None
                and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
                and os.path.isfile(os.path.join(model_path, "scheduler.pt"))):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(
                torch.load(os.path.join(model_path, "optimizer.pt"),
                           map_location=self.args.device))
            scheduler.load_state_dict(
                torch.load(os.path.join(model_path, "scheduler.pt")))

        model = self.model
        if self.args.fp16:
            if not is_apex_available():
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=self.args.fp16_opt_level)

        # multi-gpu training (should be after apex fp16 initialization)
        if self.args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Distributed training (should be after apex fp16 initialization)
        if self.args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[self.args.local_rank],
                output_device=self.args.local_rank,
                find_unused_parameters=True,
            )

        if self.tb_writer is not None:
            self.tb_writer.add_text("args", self.args.to_json_string())
            self.tb_writer.add_hparams(self.args.to_sanitized_dict(),
                                       metric_dict={})

        # Train!
        if is_tpu_available():
            total_train_batch_size = (self.args.train_batch_size *
                                      xm.xrt_world_size())
        else:
            total_train_batch_size = (self.args.train_batch_size *
                                      self.args.gradient_accumulation_steps *
                                      (torch.distributed.get_world_size()
                                       if self.args.local_rank != -1 else 1))
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", self.num_examples(train_dataloader))
        logger.info("  Num Epochs = %d", num_train_epochs)
        logger.info("  Instantaneous batch size per device = %d",
                    self.args.per_gpu_train_batch_size)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            total_train_batch_size)
        logger.info("  Gradient Accumulation steps = %d",
                    self.args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        self.global_step = 0
        self.epoch = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        # Check if continuing training from a checkpoint
        if model_path is not None:
            # set global_step to global_step of last saved checkpoint from model path
            try:
                self.global_step = int(model_path.split("-")[-1].split("/")[0])
                epochs_trained = self.global_step // (
                    len(train_dataloader) //
                    self.args.gradient_accumulation_steps)
                steps_trained_in_current_epoch = self.global_step % (
                    len(train_dataloader) //
                    self.args.gradient_accumulation_steps)

                logger.info(
                    "  Continuing training from checkpoint, will skip to saved global_step"
                )
                logger.info("  Continuing training from epoch %d",
                            epochs_trained)
                logger.info("  Continuing training from global step %d",
                            self.global_step)
                logger.info(
                    "  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)
            except ValueError:
                self.global_step = 0
                logger.info("  Starting fine-tuning.")

        if self.args.evaluate_step_zero:
            self.evaluate()

        tr_loss = 0.0
        logging_loss = 0.0
        model.zero_grad()
        train_iterator = trange(epochs_trained,
                                int(num_train_epochs),
                                desc="Epoch",
                                disable=not self.is_local_master())
        for epoch in train_iterator:
            if isinstance(train_dataloader, DataLoader) and isinstance(
                    train_dataloader.sampler, DistributedSampler):
                train_dataloader.sampler.set_epoch(epoch)

            epoch_iterator = tqdm(train_dataloader,
                                  desc="Iteration",
                                  disable=not self.is_local_master())

            for step, inputs in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                temp_loss = tr_loss
                tr_loss += self._training_step(model, inputs, optimizer)

                if self.global_step % 100 == 0 and self.tb_writer is not None:
                    self.tb_writer.add_scalar('train loss',
                                              tr_loss - temp_loss,
                                              self.global_step)

                if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                        # last step in epoch but step is always smaller than gradient_accumulation_steps
                        len(epoch_iterator) <=
                        self.args.gradient_accumulation_steps and
                    (step + 1) == len(epoch_iterator)):
                    if self.args.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer),
                            self.args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       self.args.max_grad_norm)

                    if is_tpu_available():
                        xm.optimizer_step(optimizer)
                    else:
                        optimizer.step()

                    scheduler.step()
                    model.zero_grad()
                    self.global_step += 1
                    self.epoch = epoch + (step + 1) / len(epoch_iterator)

                    if (self.args.logging_steps > 0
                            and self.global_step % self.args.logging_steps
                            == 0) or (self.global_step == 1
                                      and self.args.logging_first_step):
                        logs: Dict[str, float] = {}
                        logs["loss"] = (tr_loss -
                                        logging_loss) / self.args.logging_steps
                        # backward compatibility for pytorch schedulers
                        logs["learning_rate"] = (
                            scheduler.get_last_lr()[0]
                            if version.parse(torch.__version__) >=
                            version.parse("1.4") else scheduler.get_lr()[0])
                        logging_loss = tr_loss

                        self._log(logs)

                        if self.args.evaluate_during_training:
                            self.evaluate()

                    if self.is_world_master():
                        if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0:
                            # In all cases (even distributed/parallel), self.model is always a reference
                            # to the model we want to save.
                            if hasattr(model, "module"):
                                assert model.module is self.model
                            else:
                                assert model is self.model
                            # Save model checkpoint
                            output_dir = os.path.join(
                                self.args.output_dir,
                                f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}")

                            self.save_model(output_dir)
                            self._rotate_checkpoints()
                            torch.save(
                                optimizer.state_dict(),
                                os.path.join(output_dir, "optimizer.pt"))
                            torch.save(
                                scheduler.state_dict(),
                                os.path.join(output_dir, "scheduler.pt"))
                            logger.info(
                                "Saving optimizer and scheduler states to %s",
                                output_dir)

                if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
                    epoch_iterator.close()
                    break
            if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
                train_iterator.close()
                break
            if self.args.tpu_metrics_debug:
                # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
                xm.master_print(met.metrics_report())

        if self.tb_writer:
            self.tb_writer.close()

        logger.info(
            "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n"
        )
        return TrainOutput(self.global_step, tr_loss / self.global_step)
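Two methods called by train() above are not reproduced in these snippets: get_optimizers and _training_step. A sketch of what they typically look like in this Trainer family, assuming transformers' AdamW and get_linear_schedule_with_warmup (details may differ from the original project):

    def get_optimizers(self, num_training_steps: int):
        # Sketch: reuse injected optimizers if provided, otherwise AdamW with
        # weight-decay groups plus a linear warmup/decay schedule.
        if self.optimizers is not None:
            return self.optimizers
        no_decay = ["bias", "LayerNorm.weight"]
        grouped_parameters = [
            {"params": [p for n, p in self.model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             "weight_decay": self.args.weight_decay},
            {"params": [p for n, p in self.model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0},
        ]
        optimizer = AdamW(grouped_parameters,
                          lr=self.args.learning_rate,
                          eps=self.args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.args.warmup_steps,
            num_training_steps=num_training_steps)
        return optimizer, scheduler

    def _training_step(self, model, inputs, optimizer) -> float:
        # Sketch: one forward/backward pass; the loss is assumed to be the
        # first element of the model outputs.
        model.train()
        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)
        outputs = model(**inputs)
        loss = outputs[0]
        if self.args.n_gpu > 1:
            loss = loss.mean()
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps
        if self.args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        return loss.item()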
Example #17
    def is_local_master(self) -> bool:
        if is_tpu_available():
            return xm.is_master_ordinal(local=True)
        else:
            return self.args.local_rank in [-1, 0]
Example #18
    def _prediction_loop(
            self,
            dataloader: DataLoader,
            description: str,
            prediction_loss_only: Optional[bool] = None) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

        Works both with or without labels.
        """

        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

        # multi-gpu eval
        if self.args.n_gpu > 1 and not isinstance(self.model,
                                                  torch.nn.DataParallel):
            model = torch.nn.DataParallel(self.model)
        else:
            model = self.model
        model.to(self.args.device)

        if is_tpu_available():
            batch_size = dataloader._loader._loader.batch_size
        else:
            batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", self.num_examples(dataloader))
        logger.info("  Batch size = %d", batch_size)
        eval_losses: List[float] = []

        eval_tag_losses = []
        eval_gen_losses = []
        eval_cov_losses = []

        preds = []
        label_ids = []
        model.eval()

        for inputs in tqdm(dataloader, desc=description):
            for k, v in inputs.items():
                inputs[k] = v.to(self.args.device)

            with torch.no_grad():
                outputs = model(**inputs)
                step_eval_loss, logits = outputs[:2]
                other_loss = outputs[-1]

                eval_losses += [step_eval_loss.mean().item()]
                eval_tag_losses += [other_loss['tag_loss'].mean().item()]
                eval_gen_losses += [other_loss['gen_loss'].mean().item()]
                eval_cov_losses += [other_loss['cov_loss'].mean().item()]

            if not prediction_loss_only:

                preds.append(logits.detach().cpu().numpy().argmax(-1))

                if inputs.get("tgt_token") is not None:
                    label_ids.append(
                        inputs["tgt_token"][:, 1:].detach().cpu().numpy())

        if is_tpu_available():
            # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
            preds = xm.mesh_reduce("eval_preds", preds, np.concatenate)
            label_ids = xm.mesh_reduce("eval_out_label_ids", label_ids,
                                       np.concatenate)

        if self.compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = self.compute_metrics(
                EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics = {}
        if len(eval_losses) > 0:
            metrics["eval_loss"] = np.mean(eval_losses)
        if len(eval_tag_losses) > 0:
            metrics["eval_tag_loss"] = np.mean(eval_tag_losses)
        if len(eval_gen_losses) > 0:
            metrics["eval_gen_loss"] = np.mean(eval_gen_losses)
        if len(eval_cov_losses) > 0:
            metrics["eval_cov_loss"] = np.mean(eval_cov_losses)
        if metrics["eval_cov_loss"] != 0:
            metrics["eval_loss"] = metrics["eval_tag_loss"] + metrics[
                "eval_gen_loss"]

        # Prefix all keys with eval_
        for key in list(metrics.keys()):
            if not key.startswith("eval_"):
                metrics[f"eval_{key}"] = metrics.pop(key)

        return PredictionOutput(predictions=preds,
                                label_ids=label_ids,
                                metrics=metrics)
Example #19
    def _prediction_loop(
            self,
            dataloader: DataLoader,
            description: str,
            prediction_loss_only: Optional[bool] = None) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

        Works both with or without labels.
        """

        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

        model = self.model
        # multi-gpu eval
        if self.args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        else:
            model = self.model
        # Note: in torch.distributed mode, there's no point in wrapping the model
        # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

        # if is_tpu_available():
        #     batch_size = dataloader._loader._loader.batch_size
        # else:
        #     batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", self.num_examples(dataloader))
        # logger.info("  Batch size = %d", batch_size)
        eval_losses: List[float] = []
        preds: Optional[torch.Tensor] = None
        label_ids: Optional[torch.Tensor] = None
        model.eval()

        for inputs in tqdm(dataloader, desc=description):
            has_labels = any(
                inputs.get(k) is not None
                for k in ["labels", "lm_labels", "masked_lm_labels"])

            for k, v in inputs.items():
                inputs[k] = v.to(self.args.device)

            with torch.no_grad():
                outputs = model(**inputs)
                if has_labels:
                    step_eval_loss, logits = outputs[:2]
                    eval_losses += [step_eval_loss.mean().item()]
                else:
                    logits = outputs[0]

            if not prediction_loss_only:
                if preds is None:
                    preds = logits.detach()
                else:
                    preds = torch.cat((preds, logits.detach()), dim=0)
                if inputs.get("labels") is not None:
                    if label_ids is None:
                        label_ids = inputs["labels"].detach()
                    else:
                        label_ids = torch.cat(
                            (label_ids, inputs["labels"].detach()), dim=0)

        if self.args.local_rank != -1:
            # In distributed mode, concatenate all results from all nodes:
            if preds is not None:
                preds = self.distributed_concat(
                    preds, num_total_examples=self.num_examples(dataloader))
            if label_ids is not None:
                label_ids = self.distributed_concat(
                    label_ids,
                    num_total_examples=self.num_examples(dataloader))
        elif is_tpu_available():
            # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
            if preds is not None:
                preds = xm.mesh_reduce("eval_preds", preds, torch.cat)
            if label_ids is not None:
                label_ids = xm.mesh_reduce("eval_label_ids", label_ids,
                                           torch.cat)

        # Finally, turn the aggregated tensors into numpy arrays.
        if preds is not None:
            preds = preds.cpu().numpy()
        if label_ids is not None:
            label_ids = label_ids.cpu().numpy()

        if self.compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = self.compute_metrics(
                EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics = {}

        if len(eval_losses) > 0:
            m = np.mean(eval_losses)
            metrics["eval_loss"] = m
            metrics["eval_perplexity"] = torch.exp(torch.tensor(m)).item()
            print("Perplexity: {}".format(metrics["eval_perplexity"]))

        # Prefix all keys with eval_
        for key in list(metrics.keys()):
            if not key.startswith("eval_"):
                metrics[f"eval_{key}"] = metrics.pop(key)

        return PredictionOutput(predictions=preds,
                                label_ids=label_ids,
                                metrics=metrics)
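distributed_concat, used in the distributed branch above, is also not shown. A minimal sketch, assuming the per-rank tensors were produced with SequentialDistributedSampler so the trailing padding can be trimmed to num_total_examples:

    def distributed_concat(self, tensor: torch.Tensor,
                           num_total_examples: int) -> torch.Tensor:
        # Sketch: all-gather the per-rank tensors, concatenate in rank order,
        # and drop the samples added as padding by the sequential sampler.
        assert self.args.local_rank != -1
        output_tensors = [tensor.clone()
                          for _ in range(torch.distributed.get_world_size())]
        torch.distributed.all_gather(output_tensors, tensor)
        concat = torch.cat(output_tensors, dim=0)
        return concat[:num_total_examples]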
Example #20
    def _prediction_loop(
            self,
            dataloader: DataLoader,
            description: str,
            prediction_loss_only: Optional[bool] = None) -> PredictionOutput:
        """
                Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

                Works both with or without labels.
                """

        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

        model = self.model
        # multi-gpu eval
        if self.args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        else:
            model = self.model
        # Note: in torch.distributed mode, there's no point in wrapping the model
        # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", self.num_examples(dataloader))
        logger.info("  Batch size = %d", batch_size)
        logger.info("  Decode mode = %s", self.args.decode_mode)
        eval_losses: List[float] = []
        model.eval()

        metric = ParsingMetric()

        if is_tpu_available():
            dataloader = pl.ParallelLoader(
                dataloader,
                [self.args.device]).per_device_loader(self.args.device)

        for inputs in tqdm(dataloader, desc=description):

            for k, v in inputs.items():
                inputs[k] = v.to(self.args.device)

            with torch.no_grad():
                step_eval_loss, rel_preds, arc_preds = model(
                    **inputs, adapter_names=self.adapter_names)

                eval_losses += [step_eval_loss.mean().item()]

            mask = inputs["labels_arcs"].ne(self.model.config.pad_token_id)
            predictions_arcs = torch.argmax(arc_preds, dim=-1)[mask]

            labels_arcs = inputs["labels_arcs"][mask]

            predictions_rels, labels_rels = rel_preds[mask], inputs[
                "labels_rels"][mask]
            predictions_rels = predictions_rels[torch.arange(len(labels_arcs)),
                                                labels_arcs]
            predictions_rels = torch.argmax(predictions_rels, dim=-1)

            metric.add(labels_arcs, labels_rels, predictions_arcs,
                       predictions_rels)

        results = metric.get_metric()
        results[f"{description}_loss"] = np.mean(eval_losses)

        # Add predictions_rels to output, even though we are only interested in the metrics
        return PredictionOutput(predictions=predictions_rels,
                                label_ids=None,
                                metrics=results)
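ParsingMetric is not defined in these snippets. As an illustration only (the attribute names and exact scoring are assumptions, not the original), a dependency-parsing metric that accumulates unlabeled and labeled attachment scores (UAS/LAS) could be sketched as:

class ParsingMetric:
    # Illustrative sketch: running UAS/LAS over masked arc/relation predictions.
    def __init__(self):
        self.total = 0
        self.uas_correct = 0
        self.las_correct = 0

    def add(self, gold_arcs, gold_rels, pred_arcs, pred_rels):
        arc_match = pred_arcs.eq(gold_arcs)
        self.uas_correct += arc_match.sum().item()
        self.las_correct += (arc_match & pred_rels.eq(gold_rels)).sum().item()
        self.total += gold_arcs.numel()

    def get_metric(self):
        return {
            "uas": 100.0 * self.uas_correct / max(self.total, 1),
            "las": 100.0 * self.las_correct / max(self.total, 1),
        }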