Example #1
    def run_extensions(self,
                       *,
                       completed: bool = True,
                       only_iterations: bool = True) -> None:
        if completed:
            # Check if the model is available for the iteration just
            # completed, i.e., the iteration number is already incremented.
            self._model_available = self.needs_model_state(self.iteration)
        else:
            self._model_available = False

        to_run = []
        for name, entry in self.extensions:
            # When iterations are deferred, we only launch the extensions
            # that don't need the training status to advance; those are
            # extensions set to execute at a given interval of executions.
            is_async = (hasattr(entry.extension, 'is_async')
                        and entry.extension.is_async)
            if ((not completed and not is_async)
                    or (completed and is_async and only_iterations)):
                continue
            manager: _manager_protocol.ExtensionsManagerProtocol = self
            if is_async:
                manager = self._get_proxy_for_trigger(entry.trigger)
            if entry.trigger(manager):
                # Execution of snapshot extensions is deferred until all the
                # triggers are evaluated.
                # If we don't do this, when two (or more) snapshot extensions
                # are registered and their triggers are stateful, the first
                # snapshot extension will save the state of the second trigger
                # before invoking it, although it will be executed later in
                # this iteration, making them fire again just after resuming
                # from the snapshot saved by the first snapshot extension.
                # Non-snapshot extensions are executed right away (note that
                # the order is already sorted by priority) as they will
                # report values that might be needed by other triggers, e.g.,
                # a trigger based on an evaluator-reported value.
                if entry.priority == extension_module.PRIORITY_SNAPSHOT:
                    to_run.append((name, entry.extension))
                else:
                    with record(
                            f'pytorch_pfn_extras.training.ExtensionsManager'
                            f'.run_extensions:{name}',
                            enable=self._enable_profile,
                    ):
                        entry.extension(self)
        for name, extension in to_run:
            with record(
                    f'pytorch_pfn_extras.training.ExtensionsManager'
                    f'.run_extensions:{name}',
                    enable=self._enable_profile,
            ):
                extension(self)
        self._model_available = True
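The two-pass structure above can be read in isolation: every trigger is evaluated and non-snapshot extensions run immediately, while snapshot-priority extensions are queued and run only after all triggers have been evaluated, so a snapshot never captures another stateful trigger mid-decision. The sketch below restates that pattern with hypothetical stand-in names (PRIORITY_SNAPSHOT, entries, manager); it is not the pytorch_pfn_extras API itself.

# Minimal sketch of the deferral pattern; all names here are hypothetical.
PRIORITY_SNAPSHOT = -100

def run_extensions_sketch(entries, manager):
    deferred = []
    for name, entry in entries:
        # Evaluate every trigger first. Triggers may be stateful, so running
        # a snapshot extension mid-loop could save another trigger's state
        # before that trigger fires, making it fire again after resume.
        if entry.trigger(manager):
            if entry.priority == PRIORITY_SNAPSHOT:
                deferred.append((name, entry.extension))
            else:
                # Non-snapshot extensions run right away so that values they
                # report are visible to triggers evaluated later in the loop.
                entry.extension(manager)
    # Snapshot extensions run only after all triggers have been evaluated.
    for name, extension in deferred:
        extension(manager)
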
Example #2
def _reduce(
    values: Sequence[torch.Tensor],
    group: Optional[dist.ProcessGroup],
) -> None:
    size = sum([v.numel() for v in values])

    # flatten values to improve the runtime performance of all-reduce
    coalesced = torch.empty(size,
                            device=values[0].device,
                            dtype=values[0].dtype)
    coalesced_views = get_foreach_wrapper(
    ).unflatten(  # type: ignore[no-untyped-call]
        coalesced, values)
    get_foreach_wrapper().multi_tensor_scale(values, coalesced_views, 1.0)

    with record("torch.distributed.all_reduce",
                use_cuda=torch.cuda.is_available()):
        dist.all_reduce(coalesced,
                        group=group)  # type: ignore[no-untyped-call]

    # unflatten values
    get_foreach_wrapper().multi_tensor_scale(
        coalesced_views,
        values,
        1.0 / dist.get_world_size(group)  # type: ignore[no-untyped-call]
    )
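A minimal, self-contained version of the same coalesce / all-reduce / average pattern for gradient tensors, using only public torch.distributed calls instead of the foreach wrapper. allreduce_mean is a hypothetical helper; it assumes an initialized process group and tensors that share one device and dtype.

import torch
import torch.distributed as dist

def allreduce_mean(tensors, group=None):
    # Coalesce the tensors into a single buffer so one all_reduce replaces
    # many small collective calls.
    flat = torch.cat([t.reshape(-1) for t in tensors])
    dist.all_reduce(flat, group=group)
    flat /= dist.get_world_size(group)
    # Copy the averaged values back into the original tensors.
    offset = 0
    for t in tensors:
        n = t.numel()
        t.copy_(flat[offset:offset + n].view_as(t))
        offset += n
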
Example #3
def _broadcast(values: Sequence[torch.Tensor],
               group: Optional[dist.ProcessGroup]) -> None:
    with torch.no_grad():  # type: ignore[no-untyped-call]
        coalesced = get_foreach_wrapper().flatten(values)
        with record("torch.distributed.broadcast",
                    use_cuda=torch.cuda.is_available()):
            dist.broadcast(coalesced, 0,
                           group=group)  # type: ignore[no-untyped-call]
        src = get_foreach_wrapper().unflatten(coalesced, values)
        get_foreach_wrapper().multi_tensor_scale(src, values, 1.0)
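The broadcast variant is typically used once at startup so that every rank starts from rank 0's weights. Below is a hypothetical helper in the same spirit, again restricted to public PyTorch calls; it assumes an initialized process group and parameters that share one device and dtype.

import torch
import torch.distributed as dist

@torch.no_grad()
def broadcast_params_from_rank0(module, group=None):
    params = list(module.parameters())
    # One coalesced broadcast instead of one call per parameter.
    flat = torch.cat([p.reshape(-1) for p in params])
    dist.broadcast(flat, src=0, group=group)
    offset = 0
    for p in params:
        n = p.numel()
        p.copy_(flat[offset:offset + n].view_as(p))
        offset += n
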
Example #4
 def run_extensions(self) -> None:
     self._model_available = self.needs_model_state(self.iteration)
     to_run = []
     for name, entry in self.extensions:
         # When iterations are deferred, we only launch the extensions
         # that don't need the training status to advance; those are
         # extensions set to execute at a given interval of executions.
         if entry.trigger(self):
             # Execution of snapshot extensions is deferred until all the
             # triggers are evaluated.
             # If we don't do this, when two (or more) snapshot extensions
             # are registered and their triggers are stateful, the first
             # snapshot extension will save the state of the second trigger
             # before invoking it, although it will be executed later in
             # this iteration, making them fire again just after resuming
             # from the snapshot saved by the first snapshot extension.
             # Non-snapshot extensions are executed right away (note that
             # the order is already sorted by priority) as they will
             # report values that might be needed by other triggers, e.g.,
             # a trigger based on an evaluator-reported value.
             if entry.priority == extension_module.PRIORITY_SNAPSHOT:
                 to_run.append((name, entry.extension))
             else:
                 with record(
                         f'pytorch_pfn_extras.training.ExtensionsManager'
                         f'.run_extensions:{name}',
                         enable=self._enable_profile,
                 ):
                     entry.extension(self)
     for name, extension in to_run:
         with record(
                 f'pytorch_pfn_extras.training.ExtensionsManager'
                 f'.run_extensions:{name}',
                 enable=self._enable_profile,
         ):
             extension(self)
     self._model_available = True
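For context, run_extensions() is normally driven indirectly: user code registers extensions on an ExtensionsManager and wraps each training step in run_iteration(), as the Trainer code further below does, and registered extensions fire as the iteration completes. The sketch below follows the documented public pytorch_pfn_extras usage; model, optimizer, loader, and train_step are placeholders assumed to exist.

import pytorch_pfn_extras as ppe
from pytorch_pfn_extras.training import extensions

# `model`, `optimizer`, `loader`, and `train_step` are assumed to exist.
manager = ppe.training.ExtensionsManager(
    model, optimizer, max_epochs=10, iters_per_epoch=len(loader))
manager.extend(extensions.LogReport(), trigger=(1, 'epoch'))
manager.extend(extensions.snapshot(), trigger=(1, 'epoch'))

while not manager.stop_trigger:
    for batch in loader:
        # Extensions whose triggers fire run as the iteration completes.
        with manager.run_iteration():
            train_step(model, optimizer, batch)
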
Example #5
    def run(self,
            train_loader: torch.utils.data.DataLoader,
            val_loader: Optional[torch.utils.data.DataLoader] = None,
            *,
            train_len: Optional[int] = None,
            eval_len: Optional[int] = None):
        """Executes the training loop.

        Args:
            train_loader (torch.utils.data.DataLoader):
                A data loader for training.
            val_loader (torch.utils.data.DataLoader, optional):
                A data loader passed to ``Evaluator.run()``.
            train_len (int, optional):
                The number of iterations in one training epoch. The default
                value is inferred from the size of the training data loader.
            eval_len (int, optional):
                The number of iterations in one evaluation epoch, passed
                to ``Evaluator.run()``.

        .. seealso::
            - :class:`pytorch_pfn_extras.training._evaluator._Evaluator`
        """
        if train_len is None:
            train_len = len(train_loader)

        self._val_loader = val_loader
        self._eval_len = eval_len

        class _EvaluatorExt:
            def __init__(self, trainer):
                self.name = 'Evaluator'
                self.needs_model_state = True
                self._trainer = trainer

            def __call__(self, manager):
                self._trainer._run_evaluator()

        if self._manager is None:
            self._setup_manager(train_len)
            if self.evaluator is not None:
                # Register the evaluator as an extension to the manager
                # To be triggered with the correct timing
                self._manager.extend(
                    _EvaluatorExt(self),
                    trigger=self.evaluator_trigger,
                    priority=extension.PRIORITY_WRITER,
                )
            self.handler.train_setup(self, train_loader)
            if self.evaluator is not None:
                self.evaluator.handler.eval_setup(self.evaluator, val_loader)

        while not self.manager.stop_trigger:
            self.handler.train_epoch_begin(self, train_loader)

            # The queues below hold per-iteration state for when iterations
            # are completed in the callback; this avoids constantly passing
            # parameters around.
            self._idxs = queue.Queue()
            self._inputs = queue.Queue()
            self._times = queue.Queue()
            self._observed = queue.Queue()
            # Iterator must be created after `train_epoch_begin` as it may be
            #  using a DistributedSampler.
            loader_iter = iter(train_loader)
            self._profile_records = queue.Queue()
            for idx in range(train_len):
                with record(
                    "pytorch_pfn_extras.training.Trainer:iteration",
                    use_cuda=torch.cuda.is_available()
                ) as ntf0:
                    try:
                        with record(
                            "pytorch_pfn_extras.training.Trainer:get_data"
                        ):
                            x = next(loader_iter)
                    except StopIteration:
                        loader_iter = iter(train_loader)
                        with record(
                            "pytorch_pfn_extras.training.Trainer:get_data"
                        ):
                            x = next(loader_iter)
                    begin = time.time()
                    self._idxs.put(idx)
                    self._inputs.put(x)
                    self._times.put(begin)
                    self._deferred = True
                    with record(
                        "pytorch_pfn_extras.training.Trainer:run_iteration",
                        use_cuda=torch.cuda.is_available()
                    ) as ntf1, \
                            self.manager.run_iteration() as iter_notifier:
                        self._observed.put(self.manager.observation)
                        with record(
                            "pytorch_pfn_extras.training.Trainer:train_step",
                            use_cuda=torch.cuda.is_available(),
                        ) as ntf2:
                            self._profile_records.put([ntf0, ntf1, ntf2])
                            self.handler.train_step(
                                self, idx, x, complete_fn=self._complete_step)
                            # Check if the callback was called
                            if self._deferred:
                                # The iteration will be completed later
                                ntf0.defer()
                                ntf1.defer()
                                ntf2.defer()
                                iter_notifier.defer()
                    # In some cases, DataLoaders are continuous and will keep
                    # yielding results even if the epoch is completed, so we
                    # forcefully exit at the end of every epoch.
                    if (
                        self.is_epoch_last_iter(idx)
                        or self.manager.stop_trigger
                    ):
                        break
            # In handlers that support a completely async model,
            # train_epoch_end will take care of completing pending work.
            self.handler.train_epoch_end(self)
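One detail worth isolating from the loop above is how the data loader is consumed: the epoch length is bounded by train_len, not by the loader, so the iterator is simply re-created whenever it is exhausted. A standalone sketch of that pattern, with a hypothetical infinite_batches helper and assuming a non-empty loader:

def infinite_batches(loader):
    # Re-create the iterator on exhaustion so the caller can draw exactly
    # `train_len` batches per epoch regardless of the loader's length.
    it = iter(loader)
    while True:
        try:
            yield next(it)
        except StopIteration:
            it = iter(loader)
            yield next(it)

# for idx, x in zip(range(train_len), infinite_batches(train_loader)):
#     ...  # run one iteration with x
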
Example #6
    def run(self,
            train_loader: Iterable[Any],
            val_loader: Optional[Iterable[Any]] = None,
            *,
            train_len: Optional[int] = None,
            eval_len: Optional[int] = None) -> None:
        """Executes the training loop.

        Args:
            train_loader (torch.utils.data.DataLoader):
                A data loader for training.
            val_loader (torch.utils.data.DataLoader, optional):
                A data loader passed to ``Evaluator.run()``.
            train_len (int, optional):
                The number of iterations in one training epoch. The default
                value is inferred from the size of the training data loader.
            eval_len (int, optional):
                The number of iterations in one evaluation epoch, passed
                to ``Evaluator.run()``.

        .. seealso::
            - :class:`pytorch_pfn_extras.training._evaluator.Evaluator`
        """
        if train_len is None:
            train_len = len(train_loader)  # type: ignore[arg-type]
        if eval_len is None and val_loader is not None:
            eval_len = len(val_loader)  # type: ignore[arg-type]

        self._train_len = train_len
        self._eval_len = eval_len

        class _EvaluatorExt:
            def __init__(
                    self,
                    trainer: 'Trainer',
                    evaluator: 'Evaluator',
                    val_loader: Optional[Iterable[Any]],
                    eval_len: Optional[int],
            ) -> None:
                self.needs_model_state = True
                self._trainer = trainer
                self._evaluator = evaluator
                self._val_loader = val_loader
                self._eval_len = eval_len

            def __call__(self, manager: ExtensionsManagerProtocol) -> None:
                evaluator = self._evaluator
                if self._val_loader is None:
                    raise ValueError('"val_loader" is not given.')
                evaluator.handler.train_validation_begin(self._trainer, evaluator)
                evaluator.run(self._val_loader, eval_len=self._eval_len)
                evaluator.handler.train_validation_end(self._trainer, evaluator)

        if self._manager is None:
            self._manager = self._setup_manager(train_len)
            for name, (evaluator, trigger) in self._evaluators.items():
                # Register the evaluator as an extension to the manager
                # To be triggered with the correct timing
                self._manager.extend(
                    _EvaluatorExt(self, evaluator, val_loader, eval_len),
                    name=name,
                    trigger=trigger_module.get_trigger(trigger),
                    priority=extension.PRIORITY_WRITER,
                )
            self.handler.train_setup(self, train_loader)
            if len(self._evaluators) == 0:
                if val_loader is not None:
                    warnings.warn(
                        '`val_loader` is given whereas the evaluator is missing.',
                        UserWarning)
            else:
                if val_loader is None:
                    raise ValueError('`val_loader` is required')
                for _, (evaluator, _) in self._evaluators.items():
                    evaluator.handler.eval_setup(evaluator, val_loader)

        with self._profile or _nullcontext() as prof:
            while not self.manager.stop_trigger:
                self.handler.train_epoch_begin(self, train_loader)

                # The queues below hold per-iteration state for when
                # iterations are completed in the callback; this avoids
                # constantly passing parameters around.
                self._idxs: 'queue.Queue[int]' = queue.Queue()
                self._inputs: 'queue.Queue[Any]' = queue.Queue()
                self._times: 'queue.Queue[float]' = queue.Queue()
                self._observed: 'queue.Queue[reporting.Observation]' = queue.Queue()
                # Iterator must be created after `train_epoch_begin` as it may be
                #  using a DistributedSampler.
                loader_iter = iter(train_loader)
                self._profile_records: 'queue.Queue[List[_ReportNotification]]' \
                    = queue.Queue()
                for idx in range(train_len):
                    with record(
                        "pytorch_pfn_extras.training.Trainer:iteration",
                        use_cuda=torch.cuda.is_available(),
                        enable=self._enable_profile
                    ) as ntf0:
                        try:
                            with record(
                                "pytorch_pfn_extras.training.Trainer:get_data",
                                enable=self._enable_profile
                            ):
                                x = next(loader_iter)
                        except StopIteration:
                            loader_iter = iter(train_loader)
                            with record(
                                "pytorch_pfn_extras.training.Trainer:get_data",
                                enable=self._enable_profile
                            ):
                                x = next(loader_iter)
                        begin = time.time()
                        self._idxs.put(idx)
                        self._inputs.put(x)
                        self._times.put(begin)
                        with record(
                            "pytorch_pfn_extras.training.Trainer:run_iteration",
                            use_cuda=torch.cuda.is_available(),
                            enable=self._enable_profile
                        ) as ntf1, \
                                self.manager.run_iteration():
                            self._observed.put(self.manager.observation)
                            with record(
                                "pytorch_pfn_extras.training.Trainer:train_step",
                                use_cuda=torch.cuda.is_available(),
                                enable=self._enable_profile
                            ) as ntf2:
                                self._profile_records.put([ntf0, ntf1, ntf2])
                                self.handler.train_step(
                                    self, idx, x, complete_fn=self._complete_step)
                                # Check if the callback was called
                    if prof is not None:
                        prof.step()  # type: ignore[no-untyped-call]
                    # In some cases, DataLoaders are continuous and will keep
                    # yielding results even if the epoch is completed, so we
                    # forcefully exit at the end of every epoch.
                    if self.is_epoch_last_iter(idx) or self.manager.stop_trigger:
                        break
                # In handlers that support a completely async model,
                # train_epoch_end will take care of completing pending work.
                self.handler.train_epoch_end(self)
            if prof is not None:
                prof.on_trace_ready = None
        self.handler.train_cleanup(self)
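This second version also steps an optional profiler once per iteration (prof.step()) and clears its on_trace_ready handler when the run finishes, so the object passed in is expected to behave like torch.profiler.profile. A sketch of such a profiler and the per-step pattern, with illustrative schedule values and a placeholder train_one_iteration:

import torch

prof = torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./prof'),
)

with prof:
    for step in range(10):
        train_one_iteration()  # placeholder for one training iteration
        prof.step()            # advance the wait/warmup/active schedule
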
Example #7
        def _synchronize() -> None:
            if not self._require_sync:
                return

            for hook in self._comm_hooks.values():
                hook(self)

            with record_function(
                    "ppe.nn.parallel.DistributedDataParallel.synchronize"):
                params = dict(self.named_parameters())
                if self._negotiate_grads:
                    # find parameters that have gradients
                    has_grads = torch.tensor([
                        params[name].grad is not None
                        for name in self._sorted_param_keys
                    ],
                                             device=self._device)

                    # cast to long because bool may not be used in all_reduce
                    has_grads = has_grads.long()
                    with record(
                            "pytorch_pfn_extras.nn.parallel."
                            "DistributedDataParallel:coordinate",
                            use_cuda=torch.cuda.is_available(),
                    ):
                        dist.all_reduce(  # type: ignore[no-untyped-call]
                            has_grads,
                            op=dist.ReduceOp.MAX)

                    for name, has_grad in zip(self._sorted_param_keys,
                                              has_grads.bool().cpu()):
                        # Create a zero tensor as the gradient if a parameter
                        # does not have one locally but another process
                        # requires this parameter to be synchronized.
                        if has_grad and params[name].grad is None:
                            params[name].grad = \
                                torch.zeros_like(params[name].data)

                grads = [
                    params[name].grad for name in self._sorted_param_keys
                    if params[name].grad is not None
                ]
                groups = _group_by_type(grads)
                with record(
                        "pytorch_pfn_extras.nn.parallel."
                        "DistributedDataParallel:reduce_gradient",
                        use_cuda=torch.cuda.is_available(),
                ):
                    for group in groups:
                        self._reduce_function(group, self._process_group)

                if self._broadcast_buffers:
                    buffers = dict(self.named_buffers())
                    bufs = [buffers[name] for name in self._sorted_buffer_keys]
                    groups = _group_by_type(bufs)
                    with record(
                            "pytorch_pfn_extras.nn.parallel."
                            "DistributedDataParallel:broadcast_buffer",
                            use_cuda=torch.cuda.is_available(),
                    ):
                        for group in groups:
                            self._broadcast_function(group,
                                                     self._process_group)
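The gradient-negotiation step can also be read on its own: ranks agree, via an all-reduce with MAX, on which parameters have gradients, and any rank missing a gradient for such a parameter materializes a zero tensor so the later reduction stays aligned. Below is a hypothetical standalone helper in that spirit, assuming an initialized process group and a name-to-parameter dict such as dict(model.named_parameters()):

import torch
import torch.distributed as dist

def negotiate_grads(params, device, group=None):
    names = sorted(params.keys())
    # Use a long tensor because bool may not be supported by all_reduce.
    has_grad = torch.tensor(
        [params[n].grad is not None for n in names],
        device=device).long()
    dist.all_reduce(has_grad, op=dist.ReduceOp.MAX, group=group)
    for name, flag in zip(names, has_grad.bool().cpu()):
        # Another rank has a gradient for this parameter, so create a zero
        # gradient locally to keep the reduction aligned across ranks.
        if flag and params[name].grad is None:
            params[name].grad = torch.zeros_like(params[name])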