Example #1
    def __init__(self,
                 model,
                 optimizer,
                 data_silo,
                 epochs,
                 n_gpu,
                 device,
                 lr_schedule=None,
                 evaluate_every=100,
                 eval_report=True,
                 use_amp=None,
                 grad_acc_steps=1,
                 local_rank=-1,
                 early_stopping=None,
                 log_learning_rate=False,
                 log_loss_every=10,
                 checkpoint_on_sigterm=False,
                 checkpoint_every=None,
                 checkpoint_root_dir=None,
                 checkpoints_to_keep=3,
                 from_epoch=0,
                 from_step=0,
                 global_step=0,
                 evaluator_test=True,
                 disable_tqdm=False,
                 max_grad_norm=1.0):
        """
        :param optimizer: An optimizer object that determines the learning strategy to be used during training
        :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders
        :type data_silo: DataSilo
        :param epochs: How many times the training procedure will loop through the train dataset
        :type epochs: int
        :param n_gpu: The number of gpus available for training and evaluation.
        :type n_gpu: int
        :param device: The device on which the train, dev and test tensors should be hosted. Choose from "cpu" and "cuda".
        :param lr_schedule: An optional scheduler object that can regulate the learning rate of the optimizer
        :param evaluate_every: Perform dev set evaluation after this many steps of training.
        :type evaluate_every: int
        :param eval_report: If evaluate_every is not 0, specifies if an eval report should be generated when evaluating
        :type eval_report: bool
        :param use_amp: Whether to use automatic mixed precision with Apex. One of the optimization levels must be chosen.
                        "O1" is recommended in almost all cases.
        :type use_amp: str
        :param grad_acc_steps: Number of training steps for which the gradients should be accumulated.
                               Useful to achieve larger effective batch sizes that would not fit in GPU memory.
        :type grad_acc_steps: int
        :param local_rank: Local rank of process when distributed training via DDP is used.
        :type local_rank: int
        :param early_stopping: an initialized EarlyStopping object to control early stopping and saving of best models.
        :type early_stopping: EarlyStopping
        :param log_learning_rate: Whether to log learning rate to Mlflow
        :type log_learning_rate: bool
        :param log_loss_every: Log current train loss after this many train steps.
        :type log_loss_every: int
        :param checkpoint_on_sigterm: save a checkpoint for the Trainer when a SIGTERM signal is sent. The checkpoint
               can be used to resume training. This is useful in frameworks like AWS SageMaker with Spot instances,
               where a SIGTERM notifies the process to save the training state before the instance is terminated.
        :type checkpoint_on_sigterm: bool
        :param checkpoint_every: save a train checkpoint after this many steps of training.
        :type checkpoint_every: int
        :param checkpoint_root_dir: the Path of directory where all train checkpoints are saved. For each individual
               checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created.
        :type checkpoint_root_dir: Path
        :param checkpoints_to_keep: maximum number of train checkpoints to save.
        :type checkpoints_to_keep: int
        :param from_epoch: the epoch number to start the training from. In the case when training resumes from a saved
               checkpoint, it is used to fast-forward training to the last epoch in the checkpoint.
        :type from_epoch: int
        :param from_step: the step number to start the training from. In the case when training resumes from a saved
               checkpoint, it is used to fast-forward training to the last step in the checkpoint.
        :type from_step: int
        :param global_step: the global step number across the training epochs.
        :type global_step: int
        :param evaluator_test: whether to perform evaluation on the test set
        :type evaluator_test: bool
        :param disable_tqdm: Disable tqdm progress bar (helps to reduce verbosity in some environments)
        :type disable_tqdm: bool
        :param max_grad_norm: Max gradient norm for clipping, default 1.0, set to None to disable
        :type max_grad_norm: float
        """

        self.model = model
        self.data_silo = data_silo
        self.epochs = int(epochs)
        self.optimizer = optimizer
        self.evaluate_every = evaluate_every
        self.eval_report = eval_report
        self.evaluator_test = evaluator_test
        self.n_gpu = n_gpu
        self.grad_acc_steps = grad_acc_steps
        self.use_amp = use_amp
        self.lr_schedule = lr_schedule
        self.device = device
        self.local_rank = local_rank
        self.log_params()
        self.early_stopping = early_stopping
        self.log_learning_rate = log_learning_rate
        self.log_loss_every = log_loss_every
        self.disable_tqdm = disable_tqdm
        self.max_grad_norm = max_grad_norm
        self.test_result = None

        if use_amp and not AMP_AVAILABLE:
            raise ImportError(
                f'Got use_amp = {use_amp}, but cannot find apex. '
                'Please install Apex if you want to make use of automatic mixed precision. '
                'https://github.com/NVIDIA/apex')
        self.checkpoint_on_sigterm = checkpoint_on_sigterm
        if checkpoint_on_sigterm:
            self.sigterm_handler = GracefulKiller()
        else:
            self.sigterm_handler = None
        self.checkpoint_root_dir = checkpoint_root_dir
        self.checkpoints_to_keep = checkpoints_to_keep
        self.checkpoint_every = checkpoint_every
        if self.checkpoint_every and not checkpoint_root_dir:
            raise Exception(
                "checkpoint_root_dir needs to be supplied when using checkpoint_every."
            )
        if checkpoint_on_sigterm and not checkpoint_root_dir:
            raise Exception(
                "checkpoint_root_dir needs to be supplied when using checkpoint_on_sigterm."
            )

        self.from_epoch = from_epoch
        self.from_step = from_step
        self.global_step = global_step
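
A minimal usage sketch for this constructor is given below. It assumes the constructor belongs to a FARM-style Trainer class and that model, optimizer, lr_schedule and data_silo have already been built (for example via FARM's DataSilo and optimizer helpers); those surrounding names and the train() call are assumptions, not taken from the listing above.

# Hypothetical usage sketch; Trainer and the prepared model / optimizer /
# data_silo / lr_schedule objects are assumed to exist and are not shown above.
from pathlib import Path

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=3,
    n_gpu=1,
    device="cuda",
    lr_schedule=lr_schedule,
    evaluate_every=100,
    grad_acc_steps=2,                        # effective batch size = loader batch size * 2
    checkpoint_every=500,                    # requires checkpoint_root_dir (see the check above)
    checkpoint_root_dir=Path("checkpoints"),
)
model = trainer.train()                      # train() is assumed to run the loop and return the model
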
Example #2
    def __init__(
        self,
        model,
        optimizer,
        data_silo,
        epochs,
        n_gpu,
        device,
        lr_schedule=None,
        evaluate_every=100,
        evaluator_dev=None,
        evaluator_test=None,
        use_amp=None,
        grad_acc_steps=1,
        local_rank=-1,
        early_stopping=None,
        log_learning_rate=False,
        checkpoint_on_sigterm=False,
        checkpoint_every=None,
        checkpoint_root_dir=None,
        checkpoints_to_keep=3,
        from_epoch=0,
        from_step=0,
    ):
        """
        :param optimizer: An optimizer object that determines the learning strategy to be used during training
        :param data_silo: A DataSilo object that will contain the train, dev and test datasets as PyTorch DataLoaders
        :type data_silo: DataSilo
        :param epochs: How many times the training procedure will loop through the train dataset
        :type epochs: int
        :param n_gpu: The number of gpus available for training and evaluation.
        :type n_gpu: int
        :param device: The device on which the train, dev and test tensors should be hosted. Choose from "cpu" and "cuda".
        :param lr_schedule: An optional scheduler object that can regulate the learning rate of the optimizer
        :param evaluate_every: Perform dev set evaluation after this many steps of training.
        :type evaluate_every: int
        :param evaluator_dev: Evaluator for dev set. Options:
                              `None` (Default) => will init a new evaluator, if there's a dev set in the DataSilo
                              `Evaluator Object` => use the manually supplied evaluator
                              `False` => Don't use any evaluator
        :type evaluator_dev: Evaluator, None or False
        :param evaluator_test: Evaluator for test set. Options:
                              `None` (Default) => will init a new evaluator, if there's a test set in the DataSilo
                              `Evaluator Object` => use the manually supplied evaluator
                              `False` => Don't use any evaluator
        :type evaluator_test: Evaluator, None or False
        :param use_amp: Whether to use automatic mixed precision with Apex. One of the optimization levels must be chosen.
                        "O1" is recommended in almost all cases.
        :type use_amp: str
        :param grad_acc_steps: Number of training steps for which the gradients should be accumulated.
                               Useful to achieve larger effective batch sizes that would not fit in GPU memory.
        :type grad_acc_steps: int
        :param local_rank: Local rank of process when distributed training via DDP is used.
        :type local_rank: int
        :param early_stopping: an initialized EarlyStopping object to control early stopping and saving of best models.
        :type early_stopping: EarlyStopping
        :param log_learning_rate: Whether to log learning rate to Mlflow
        :type log_learning_rate: bool
        :param checkpoint_on_sigterm: save a checkpoint for the Trainer when a SIGTERM signal is sent. The checkpoint
               can be used to resume training. This is useful in frameworks like AWS SageMaker with Spot instances,
               where a SIGTERM notifies the process to save the training state before the instance is terminated.
        :type checkpoint_on_sigterm: bool
        :param checkpoint_every: save a train checkpoint after this many steps of training.
        :type checkpoint_every: int
        :param checkpoint_root_dir: the Path of directory where all train checkpoints are saved. For each individual
               checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created.
        :type checkpoint_root_dir: Path
        :param checkpoints_to_keep: maximum number of train checkpoints to save.
        :type checkpoints_to_keep: int
        :param from_epoch: the epoch number to start the training from. In the case when training resumes from a saved
               checkpoint, it is used to fast-forward training to the last epoch in the checkpoint.
        :type from_epoch: int
        :param from_step: the step number to start the training from. In the case when training resumes from a saved
               checkpoint, it is used to fast-forward training to the last step in the checkpoint.
        :type from_step: int
        """

        self.model = model
        self.data_silo = data_silo
        self.epochs = int(epochs)
        self.optimizer = optimizer
        self.evaluate_every = evaluate_every
        self.n_gpu = n_gpu
        self.grad_acc_steps = grad_acc_steps
        self.use_amp = use_amp
        self.lr_schedule = lr_schedule
        self.data_loader_train = data_silo.get_data_loader("train")
        self.device = device
        self.local_rank = local_rank
        self.log_params()
        self.early_stopping = early_stopping
        self.log_learning_rate = log_learning_rate

        if use_amp and not AMP_AVAILABLE:
            raise ImportError(f'Got use_amp = {use_amp}, but cannot find apex. '
                              'Please install Apex if you want to make use of automatic mixed precision. '
                              'https://github.com/NVIDIA/apex')
        self.checkpoint_on_sigterm = checkpoint_on_sigterm
        if checkpoint_on_sigterm:
            self.sigterm_handler = GracefulKiller()
        else:
            self.sigterm_handler = None
        self.checkpoint_root_dir = checkpoint_root_dir
        self.checkpoints_to_keep = checkpoints_to_keep
        self.checkpoint_every = checkpoint_every
        if self.checkpoint_every and not checkpoint_root_dir:
            raise Exception("checkpoint_path needs to be supplied when using checkpoint_every.")
        if checkpoint_on_sigterm and not checkpoint_root_dir:
            raise Exception("checkpoint_path needs to be supplied when using checkpoint_on_sigterm.")

        self.from_epoch = from_epoch
        self.from_step = from_step
        self.global_step = (from_epoch * from_step) - 1

        # evaluator on dev set
        if evaluator_dev is None and self.data_silo.get_data_loader("dev"):
            evaluator_dev = Evaluator(
                data_loader=self.data_silo.get_data_loader("dev"),
                tasks=self.data_silo.processor.tasks,
                device=device,
            )
        self.evaluator_dev = evaluator_dev

        # evaluator on test set
        if evaluator_test is None and self.data_silo.get_data_loader("test"):
            evaluator_test = Evaluator(
                data_loader=self.data_silo.get_data_loader("test"),
                tasks=self.data_silo.processor.tasks,
                device=device
            )
        self.evaluator_test = evaluator_test
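
Unlike Example #1, this older variant configures dev and test evaluation through the evaluator_dev and evaluator_test arguments. A short hedged sketch of the three documented options follows; Trainer is an assumed class name and model / optimizer / data_silo are assumed to exist, while the Evaluator construction mirrors the code above.

# Hypothetical sketch of the three evaluator_dev options documented above.

# 1) Default (None): the constructor builds an Evaluator from the DataSilo's dev set, if present.
trainer = Trainer(model, optimizer, data_silo, epochs=2, n_gpu=1, device="cuda")

# 2) Manually supplied Evaluator, mirroring the construction in the listing above.
dev_evaluator = Evaluator(
    data_loader=data_silo.get_data_loader("dev"),
    tasks=data_silo.processor.tasks,
    device="cuda",
)
trainer = Trainer(model, optimizer, data_silo, epochs=2, n_gpu=1, device="cuda",
                  evaluator_dev=dev_evaluator)

# 3) False: disable dev-set evaluation entirely.
trainer = Trainer(model, optimizer, data_silo, epochs=2, n_gpu=1, device="cuda",
                  evaluator_dev=False)
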