    def lr_step(self, val_loss=None, epoch=None):
        """Adjust the learning rate depending on the validation loss."""
        lr = Future.gen_list([
            self.call_async(rank,
                            '_async_lr_step',
                            val_loss=val_loss,
                            epoch=epoch) for rank in range(self.num_replicas)
        ])
        return lr[0]
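Every method in this listing fans work out through Future.gen_list or Future.gen_tuple_list, neither of which is shown. A minimal sketch of the contract these snippets assume: call_async returns a future-like object whose gen() blocks until the remote result is ready.

# Minimal sketch of the Future helper these snippets assume; the real class
# lives alongside the trainer, this only illustrates the contract.
class Future(object):

    @staticmethod
    def gen_list(gens):
        # block on each per-replica future and collect results in rank order
        return [g.gen() for g in gens]

    @staticmethod
    def gen_tuple_list(gens):
        # like gen_list, but each result is a tuple: transpose so the caller
        # can unpack one sequence per tuple field
        return tuple(zip(*Future.gen_list(gens)))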
Example #2
    def _scatter_samples(self,
                         samples,
                         volatile=False,
                         replace_empty_samples=False):
        """Split and distribute a sample across GPUs."""
        if not replace_empty_samples:
            # pad with None until its size is equal to the number of replicas
            samples = samples + [None] * (self.num_replicas - len(samples))
        else:
            # pad by cycling through the given samples
            samples = list(islice(cycle(samples), self.num_replicas))

        Future.gen_list([
            self.call_async(rank,
                            '_async_prepare_sample',
                            sample=samples[rank],
                            volatile=volatile)
            for rank in range(self.num_replicas)
        ])
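The two padding branches are easy to confuse; note also that the method needs islice and cycle from itertools in scope. A quick standalone illustration of what each branch produces for num_replicas=4:

from itertools import cycle, islice

samples = ['a', 'b']
num_replicas = 4

# replace_empty_samples=False: idle replicas receive None
print(samples + [None] * (num_replicas - len(samples)))  # ['a', 'b', None, None]

# replace_empty_samples=True: idle replicas recycle the given samples
print(list(islice(cycle(samples), num_replicas)))  # ['a', 'b', 'a', 'b']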
Example #3
    def __init__(self, args, model, criterion, device_ids=None,
                 multiprocessing_method='spawn'):
        if device_ids is None:
            device_ids = tuple(range(torch.cuda.device_count()))
        super().__init__(device_ids, multiprocessing_method)

        if not torch.cuda.is_available():
            raise NotImplementedError('Training on CPU is not supported')
        # share the model's parameters across processes and create one NCCL
        # unique id to be handed to every replica
        model = model.share_memory()
        nccl_uid = nccl.get_unique_id()
        self.criterion = criterion

        Future.gen_list([
            self.call_async(rank, '_async_init', args=args, model=model,
                            criterion=criterion, nccl_uid=nccl_uid)
            for rank in range(self.num_replicas)
        ])

        self._grads_initialized = False
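The constructor generates a single NCCL unique id in the parent and passes the same id to every replica, which is what lets the per-GPU workers join one collective. The remote _async_init is not part of this listing; a rough, assumption-laden sketch of the per-rank setup it implies (the exact nccl entry point and optimizer handling are guesses):

    def _async_init(self, rank, device_id, args, model, criterion, nccl_uid):
        # pin this worker to its GPU before touching any CUDA state
        torch.cuda.set_device(device_id)

        # join the collective; every rank passes the uid created in the parent
        # (the exact nccl call depends on the surrounding codebase)
        nccl.initialize(self.num_replicas, nccl_uid, device_id)

        # each replica keeps its own CUDA copy of the shared-memory model
        self.args = args
        self.model = model.cuda()
        self.criterion = criterion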
Example #4
    def valid_step(self, samples, criterion):
        """Do forward pass in parallel."""
        # scatter sample across GPUs
        self._scatter_samples(samples, volatile=True)
        criterion.prepare(samples)

        # forward pass
        losses = [
            self.call_async(rank, '_async_valid_step', criterion=criterion)
            for rank in range(self.num_replicas)
        ]

        # aggregate losses
        loss = criterion.aggregate(Future.gen_list(losses))

        return loss
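This variant pushes the criterion itself through the async calls, so the criterion must know how to inspect the whole batch before scattering (prepare) and how to merge the per-GPU results (aggregate). Neither method appears here; a hypothetical minimal criterion showing the contract valid_step relies on (the 'ntokens' key is an assumption):

class MinimalCriterion(object):
    """Hypothetical criterion illustrating the prepare/aggregate contract."""

    def prepare(self, samples):
        # see the full batch before it is scattered, e.g. to record the
        # total token count used to normalize the loss
        self.denom = sum(s['ntokens'] for s in samples if s is not None)

    def aggregate(self, losses):
        # merge the partial per-GPU losses into a single scalar
        return sum(l for l in losses if l is not None) / self.denom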
Example #5
    def valid_step(self, samples):
        """Do forward pass in parallel."""
        # scatter sample across GPUs
        self._scatter_samples(samples, volatile=True)

        # forward pass
        _sample_sizes, logging_outputs, ooms_fwd = Future.gen_tuple_list([
            self.call_async(rank, '_async_forward', eval=True)
            for rank in range(self.num_replicas)
        ])
        assert sum(ooms_fwd) == 0

        # aggregate logging output
        logging_output = self.criterion.__class__.aggregate_logging_outputs(logging_outputs)

        return logging_output
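Unlike the criterion-passing variants, this version aggregates dictionaries of logging stats, and it does so via the criterion class rather than an instance. A plausible sketch of such a reducer; the keys are whatever the criterion logs, none are taken from this listing:

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        # sum each stat across replicas; assumes every logged value is
        # numeric and that missing keys count as zero
        keys = {k for log in logging_outputs for k in log}
        return {k: sum(log.get(k, 0) for log in logging_outputs) for k in keys}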
Example #6
    def train_step(self, samples, criterion):
        """Do forward, backward and gradient step in parallel."""
        assert isinstance(criterion, FairseqCriterion)

        # scatter sample across GPUs
        self._scatter_samples(samples)
        criterion.prepare(samples)

        # forward pass, backward pass and gradient step
        losses = [
            self.call_async(rank, '_async_train_step', criterion=criterion)
            for rank in range(self.num_replicas)
        ]

        # aggregate losses and gradient norms
        losses, grad_norms = Future.gen_tuple_list(losses)
        loss = criterion.aggregate(losses)

        return loss, grad_norms[0]
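Only grad_norms[0] is returned because, once the workers have all-reduced their gradients, every replica computes the same norm, so any single entry is representative. A rough sketch of the per-worker step this implies; the all-reduce helper and the criterion call signature are assumptions:

    def _async_train_step(self, rank, device_id, criterion):
        self.optimizer.zero_grad()

        # forward/backward on this replica's shard of the batch
        # (assumed signature: criterion(model, sample) -> scalar loss)
        loss = criterion(self.model, self.sample)
        loss.backward()

        # assumed helper: all-reduce so every replica holds the summed gradient
        self._all_reduce_gradients()

        # after the reduction the clipped norm is identical on every rank
        grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.args.clip_norm)
        self.optimizer.step()
        return loss.item(), grad_norm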
Example #7
    def valid_step(self, samples, criterion):
        """Do forward pass in parallel."""
        # scatter sample across GPUs
        self._scatter_samples(samples, volatile=True)
        criterion.prepare(samples)

        # forward pass
        res = [
            self.call_async(rank, '_async_valid_step', criterion=criterion)
            for rank in range(self.num_replicas)
        ]

        # aggregate losses
        losses, mean_rouge_greedy, mean_rouge_sampled = Future.gen_tuple_list(
            res)
        loss = criterion.aggregate(losses)
        mean_rouge_greedy = utils.sum_if_not_none(mean_rouge_greedy)
        mean_rouge_sampled = utils.sum_if_not_none(mean_rouge_sampled)

        return loss, mean_rouge_greedy, mean_rouge_sampled
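utils.sum_if_not_none is not shown in this listing. Given how it is used (the ROUGE statistics may be absent, in which case every replica returns None), a plausible implementation:

def sum_if_not_none(values):
    # sum the per-replica values, propagating None when no replica
    # produced a value (e.g. the sampling-based objective was disabled)
    present = [v for v in values if v is not None]
    return sum(present) if present else None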
Example #8
    def valid_step(self, samples, criterion):
        """Do forward pass in parallel."""
        # scatter sample across GPUs
        samples, data_events = self._scatter_samples(samples, volatile=True)
        criterion.prepare(samples)

        # forward pass
        losses = [
            self.call_async(rank,
                            '_async_valid_step',
                            sample=samples[rank],
                            criterion=criterion,
                            data_event=event)
            for rank, event in enumerate(data_events)
        ]

        # aggregate losses
        loss = criterion.aggregate(Future.gen_list(losses))

        return loss
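This last valid_step variant comes from a version whose _scatter_samples returns both the split samples and per-rank data events, letting each worker wait until the asynchronous copy of its sample has finished before running the forward pass. A hypothetical sketch of the worker side; the event API and criterion signature are assumptions:

    def _async_valid_step(self, rank, device_id, sample, criterion, data_event):
        # block until the parent signals that this rank's sample transfer
        # is complete (the exact call depends on the event type used)
        data_event.wait()

        # forward only; no gradients are needed for validation
        with torch.no_grad():
            return criterion(self.model, sample)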
Example #9
    def set_seed(self, seed):
        """Seed the RNG on every replica for reproducible runs."""
        Future.gen_list([
            self.call_async(rank, '_async_set_seed', seed=seed)
            for rank in range(self.num_replicas)
        ])
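The remote half of set_seed is not shown; presumably it just seeds the local RNGs, along these lines:

    def _async_set_seed(self, rank, device_id, seed):
        # seed the CPU generator and this worker's GPU generator
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)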
Example #10
    def train_step(self, samples, criterion):
        """Do forward, backward and gradient step in parallel."""
        assert isinstance(criterion, FairseqCriterion)

        # scatter sample across GPUs
        self._scatter_samples(samples)
        criterion.prepare(samples)

        # forward pass, backward pass and gradient step
        # each async call resolves to a namedtuple of per-replica results
        res = [
            self.call_async(rank, '_async_train_step', criterion=criterion)
            for rank in range(self.num_replicas)
        ]
        # aggregate losses and gradient norms
        (losses, grad_norms, ml_losses, rl_losses, mean_rouge_greedy,
         mean_rouge_sampled, mean_sum_log_probs) = Future.gen_tuple_list(res)
        loss = criterion.aggregate(losses)
        ml_loss = criterion.aggregate(ml_losses)

        rl_loss = utils.sum_if_not_none(rl_losses)
        mean_rouge_greedy = utils.sum_if_not_none(mean_rouge_greedy)
        mean_rouge_sampled = utils.sum_if_not_none(mean_rouge_sampled)
        mean_sum_log_prob = utils.sum_if_not_none(mean_sum_log_probs)

        aggregate_res = Results(loss, grad_norms[0], ml_loss, rl_loss,
                                mean_rouge_greedy, mean_rouge_sampled,
                                mean_sum_log_prob)

        return aggregate_res
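Results is a namedtuple defined elsewhere; its field order can be read off the constructor call above, though the exact field names are a guess:

from collections import namedtuple

# field names inferred from the local variables in train_step above;
# the real definition may differ
Results = namedtuple('Results', [
    'loss', 'grad_norm', 'ml_loss', 'rl_loss',
    'mean_rouge_greedy', 'mean_rouge_sampled', 'mean_sum_log_prob',
])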