def __init__(self, *args: Any, **kwargs: Any) -> None:
    det.TrialContext.__init__(self, *args, **kwargs)
    pytorch._PyTorchReducerContext.__init__(self, self.distributed._zmq_allgather)

    self._init_device()

    # Track which types we have issued warnings for in to_device().
    self._to_device_warned_types = set()  # type: Set[Type]

    # The following attributes are initialized during the lifetime of
    # a PyTorchTrialContext.
    self.models = []  # type: List[nn.Module]
    self.optimizers = []  # type: List[torch.optim.Optimizer]
    self.lr_schedulers = []  # type: List[pytorch.LRScheduler]
    self._epoch_len = None  # type: Optional[int]

    # Use a main model to contain all of the models because, when using horovod
    # to broadcast the states of models, we want to avoid name conflicts for these
    # states. So we set all the models to be sub-modules of the main model with
    # different names using __setattr__ and use the state_dict of the main model
    # for broadcasting. Note that broadcast_parameters only accepts state_dict(),
    # although its doc says it also accepts named_parameters().
    self._main_model = nn.Module()
    self._scaler = None
    self._use_apex = False
    self._loss_ids = {}  # type: Dict[torch.Tensor, int]
    self._last_backward_batch_idx = None  # type: Optional[int]
    self._current_batch_idx = None  # type: Optional[int]

    self.experimental = pytorch.PyTorchExperimentalContext(self)
    self._reducers = pytorch._PyTorchReducerContext()
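# Sketch (not from the source) illustrating the "main model" trick described in
# the comment above: registering each model as a differently named sub-module of
# one nn.Module means a single state_dict() covers all of them, with keys
# prefixed by the attribute name, so broadcasting that one state_dict cannot hit
# name collisions between models. The attribute names below are placeholders.
import torch.nn as nn

main_model = nn.Module()
main_model.__setattr__("model_0", nn.Linear(4, 4))
main_model.__setattr__("model_1", nn.Linear(4, 4))

# Keys are namespaced per sub-module:
# ['model_0.weight', 'model_0.bias', 'model_1.weight', 'model_1.bias']
print(list(main_model.state_dict().keys()))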
def __init__(self, *args: Any, **kwargs: Any) -> None:
    det.TrialContext.__init__(self, *args, **kwargs)
    pytorch._PyTorchReducerContext.__init__(self, self.distributed.allgather)

    self._per_slot_batch_size, self._global_batch_size = util.calculate_batch_sizes(
        self.get_hparams(),
        self.env.experiment_config.slots_per_trial(),
        "PyTorchTrial",
    )
    self._distributed_backend = det._DistributedBackend()
    self.device = self._init_device()

    # Track which types we have issued warnings for in to_device().
    self._to_device_warned_types = set()  # type: Set[Type]

    # The following attributes are initialized during the lifetime of
    # a PyTorchTrialContext.
    self.models = []  # type: List[nn.Module]
    self.optimizers = []  # type: List[torch.optim.Optimizer]
    self.profiler = None  # type: Any
    self.lr_schedulers = []  # type: List[pytorch.LRScheduler]
    self._epoch_len = None  # type: Optional[int]

    # Keep a map of wrapped models to their original input forms, which is needed
    # by torch DDP and apex to initialize in the correct order.
    self._wrapped_models = {}  # type: Dict[nn.Module, nn.Module]

    # Use a main model to contain all of the models because, when using horovod
    # to broadcast the states of models, we want to avoid name conflicts for these
    # states. So we set all the models to be sub-modules of the main model with
    # different names using __setattr__ and use the state_dict of the main model
    # for broadcasting. Note that broadcast_parameters only accepts state_dict(),
    # although its doc says it also accepts named_parameters().
    self._main_model = nn.Module()
    self._scaler = None
    self._use_apex = False
    self._loss_ids = {}  # type: Dict[torch.Tensor, int]
    self._last_backward_batch_idx = None  # type: Optional[int]
    self._current_batch_idx = None  # type: Optional[int]

    self.experimental = pytorch.PyTorchExperimentalContext(self)
    self._reducers = pytorch._PyTorchReducerContext()
    self._determined_profiler = None  # type: Optional[profiler.ProfilerAgent]

    optimizations_config = self.env.experiment_config.get_optimizations_config()
    self._aggregation_frequency = cast(
        int, optimizations_config.get("aggregation_frequency")
    )
    self._fp16_compression = cast(
        bool, optimizations_config.get("gradient_compression")
    )
    self._average_aggregated_gradients = cast(
        bool, optimizations_config.get("average_aggregated_gradients")
    )
    self._average_training_metrics = cast(
        bool, optimizations_config.get("average_training_metrics")
    )
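# A minimal sketch (not part of the source) of how the attributes initialized
# above are typically populated from user code. It assumes the public
# determined.pytorch Trial API (PyTorchTrial, PyTorchTrialContext, wrap_model,
# wrap_optimizer); the model, optimizer, and dimensions are placeholders.
import torch
import torch.nn as nn

from determined import pytorch as det_pytorch


class MyTrial(det_pytorch.PyTorchTrial):
    # Data loader and train/evaluate methods required by PyTorchTrial are
    # omitted for brevity.
    def __init__(self, context: det_pytorch.PyTorchTrialContext) -> None:
        self.context = context
        # wrap_model registers the module in context.models and moves it to the
        # device selected by _init_device().
        self.model = self.context.wrap_model(nn.Linear(10, 1))
        # wrap_optimizer registers the optimizer in context.optimizers so that
        # settings read from optimizations_config (aggregation_frequency,
        # gradient_compression, ...) can be applied during training.
        self.optimizer = self.context.wrap_optimizer(
            torch.optim.SGD(self.model.parameters(), lr=0.01)
        )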
def make_reducer_context(
    cross_rank: int, cross_size: int, local_rank: int, local_size: int
) -> DummyDistributedReducerContext:
    distributed_context = core.DistributedContext(
        rank=cross_rank * local_size + local_rank,
        size=cross_size * local_size,
        local_rank=local_rank,
        local_size=local_size,
        cross_rank=cross_rank,
        cross_size=cross_size,
        chief_ip="localhost",
        force_tcp=False,
    )
    reducer_context = _PyTorchReducerContext(distributed_context.allgather)
    # reducer_context.wrap_reducer(lambda x: x, "dummy")
    wrapped_reducer = reducer_context.wrap_reducer(dummy_reducer)
    return DummyDistributedReducerContext(distributed_context, reducer_context, wrapped_reducer)
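# The function above references two helpers that are not shown in this excerpt.
# The definitions below are a hypothetical sketch, purely for illustration:
# dummy_reducer and DummyDistributedReducerContext are names taken from the
# excerpt, but their body and fields here are assumptions, not the project's
# actual implementations.
from typing import Any, List, NamedTuple

from determined import core, pytorch


def dummy_reducer(values: List[Any]) -> List[Any]:
    # A trivial cross-slot reduction: return the gathered per-slot values as-is.
    return values


class DummyDistributedReducerContext(NamedTuple):
    # Bundles the three objects returned by make_reducer_context above.
    distributed_context: core.DistributedContext
    reducer_context: pytorch._PyTorchReducerContext
    wrapped_reducer: Any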