Example #1
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        det.TrialContext.__init__(self, *args, **kwargs)
        pytorch._PyTorchReducerContext.__init__(self, self.distributed._zmq_allgather)

        self._init_device()

        # Track which types we have issued warnings for in to_device().
        self._to_device_warned_types = set()  # type: Set[Type]

        # The following attributes are initialized during the lifetime of
        # a PyTorchTrialContext.
        self.models = []  # type: List[nn.Module]
        self.optimizers = []  # type: List[torch.optim.Optimizer]
        self.lr_schedulers = []  # type: List[pytorch.LRScheduler]
        self._epoch_len = None  # type: Optional[int]

        # Use a single "main" model to contain all of the wrapped models. When horovod
        # broadcasts model states, we want to avoid name conflicts between those
        # states, so every wrapped model is attached to the main model as a uniquely
        # named sub-module via __setattr__, and the main model's state_dict() is used
        # for broadcasting. Note that broadcast_parameters only accepts a state_dict(),
        # even though its documentation says it also accepts named_parameters().
        self._main_model = nn.Module()
        self._scaler = None
        self._use_apex = False
        self._loss_ids = {}  # type: Dict[torch.Tensor, int]
        self._last_backward_batch_idx = None  # type: Optional[int]
        self._current_batch_idx = None  # type: Optional[int]

        self.experimental = pytorch.PyTorchExperimentalContext(self)
        self._reducers = pytorch._PyTorchReducerContext()
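The reducer context initialized above is what backs the trial's wrap_reducer API. Below is a minimal, hedged sketch of how a custom metric reducer could be registered against such a context; the name context and the exact shape of the values handed to the reducer callable are illustrative assumptions, not taken from the example above.

import numpy as np

def my_mean_reducer(values):
    # Assumed shape: a flat list of per-batch metric values gathered from all
    # slots via the context's allgather.
    return float(np.mean(values))

# Registration would typically happen in the Trial's __init__, e.g.:
#     my_avg = context.wrap_reducer(my_mean_reducer, name="my_avg")
# and per-batch values would then be fed in with my_avg.update(batch_value).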
Example #2
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        det.TrialContext.__init__(self, *args, **kwargs)
        pytorch._PyTorchReducerContext.__init__(self,
                                                self.distributed.allgather)
        self._per_slot_batch_size, self._global_batch_size = util.calculate_batch_sizes(
            self.get_hparams(),
            self.env.experiment_config.slots_per_trial(),
            "PyTorchTrial",
        )

        self._distributed_backend = det._DistributedBackend()

        self.device = self._init_device()

        # Track which types we have issued warnings for in to_device().
        self._to_device_warned_types = set()  # type: Set[Type]

        # The following attributes are initialized during the lifetime of
        # a PyTorchTrialContext.
        self.models = []  # type: List[nn.Module]
        self.optimizers = []  # type: List[torch.optim.Optimizer]
        self.profiler = None  # type: Any
        self.lr_schedulers = []  # type: List[pytorch.LRScheduler]
        self._epoch_len = None  # type: Optional[int]

        # Keep a map of wrapped models to their original input forms, which is needed
        # by torch DDP and apex to initialize in the correct order
        self._wrapped_models = {}  # type: Dict[nn.Module, nn.Module]

        # Use a single "main" model to contain all of the wrapped models. When horovod
        # broadcasts model states, we want to avoid name conflicts between those
        # states, so every wrapped model is attached to the main model as a uniquely
        # named sub-module via __setattr__, and the main model's state_dict() is used
        # for broadcasting. Note that broadcast_parameters only accepts a state_dict(),
        # even though its documentation says it also accepts named_parameters().
        self._main_model = nn.Module()
        self._scaler = None
        self._use_apex = False
        self._loss_ids = {}  # type: Dict[torch.Tensor, int]
        self._last_backward_batch_idx = None  # type: Optional[int]
        self._current_batch_idx = None  # type: Optional[int]

        self.experimental = pytorch.PyTorchExperimentalContext(self)
        self._reducers = pytorch._PyTorchReducerContext()
        self._determined_profiler = None  # type: Optional[profiler.ProfilerAgent]

        optimizations_config = self.env.experiment_config.get_optimizations_config()
        self._aggregation_frequency = cast(
            int, optimizations_config.get("aggregation_frequency"))
        self._fp16_compression = cast(
            bool, optimizations_config.get("gradient_compression"))
        self._average_aggregated_gradients = cast(
            bool, optimizations_config.get("average_aggregated_gradients"))
        self._average_training_metrics = cast(
            bool, optimizations_config.get("average_training_metrics"))
Example #3
def make_reducer_context(
    rank: int, cross_rank: int, local_rank: int
) -> DummyDistributedReducerContext:
    # Note: local_size and cross_size are expected to be defined in the
    # enclosing scope of this snippet.
    distributed_context = core.DistributedContext(
        rank=cross_rank * local_size + local_rank,
        size=cross_size * local_size,
        local_rank=local_rank,
        local_size=local_size,
        cross_rank=cross_rank,
        cross_size=cross_size,
        chief_ip="localhost",
        force_tcp=False,
    )
    reducer_context = _PyTorchReducerContext(distributed_context.allgather)
    # reducer_context.wrap_reducer(lambda x: x, "dummy")
    wrapped_reducer = reducer_context.wrap_reducer(dummy_reducer)
    return DummyDistributedReducerContext(
        distributed_context, reducer_context, wrapped_reducer
    )
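A hedged driver for the helper above; dummy_reducer, local_size, and cross_size are defined here only to make the snippet self-contained, matching the names the function expects from its enclosing scope. The values are arbitrary examples.

local_size = 2
cross_size = 1

def dummy_reducer(values):
    # Placeholder reducer: combine whatever the allgather delivers.
    return sum(values)

# One context per slot; the call is left commented out because constructing a
# real core.DistributedContext establishes inter-process communication.
# contexts = [
#     make_reducer_context(rank=cross_rank * local_size + local_rank,
#                          cross_rank=cross_rank,
#                          local_rank=local_rank)
#     for cross_rank in range(cross_size)
#     for local_rank in range(local_size)
# ]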