def __init__(self, inputs, group=tnt.Group()):
    # TensorAllreducer handles either a single Allreduce operation,
    # when the input is a scalar/array/tensor,
    # or a list/dictionary of `TensorAllreducer`s, respectively
    self.group = group
    self.algorithm = "recursivedoubling"
    self.reduction_op = tnt.ReductionOp.SUM
    self.create_allreduces(inputs)
def __init__(self,
             keras_callback: tf.keras.callbacks.Callback,
             group: tnt.Group = tnt.Group()) -> None:
    super().__init__(keras_callback)
    self.group = group
    self.customize_callback(keras_callback)
def get_replica_group_for_rank(self, rank):
    if rank not in self.replica_mapping:
        raise ValueError(f"Rank {rank} not found in the mapping of replica IDs to ranks: "
                         f"{self.replica_mapping}")
    partition_id = self.get_partition_for_rank(rank)
    replica_group_ranks = [r for r in self.replica_mapping.keys()
                           if self.get_partition_for_rank(r) == partition_id]
    logger.debug(f"[RankMapper] Replica group = {replica_group_ranks}.")
    return tnt.Group(replica_group_ranks)
def get_pipelining_group_for_rank(self, rank):
    if rank not in self.partition_mapping:
        raise ValueError(f"Rank {rank} not found in the mapping of partition IDs to ranks: "
                         f"{self.partition_mapping}")
    replica_id = self.get_replica_for_rank(rank)
    pipeline_group_ranks = [r for r in self.partition_mapping.keys()
                            if self.get_replica_for_rank(r) == replica_id]
    logger.debug(f"[RankMapper] Pipeline group = {pipeline_group_ranks}.")
    return tnt.Group(pipeline_group_ranks)
def assert_on_all_ranks(results_array: Union[bool, List[bool]]):
    if not isinstance(results_array, list):
        results_array = [results_array]
    allreduce = tnt.Allreduce(tnt.Group(),
                              nelems=len(results_array),
                              dtype=bool,
                              op=tnt.ReductionOp.AND)
    allreduce.start(results_array)
    output_array = allreduce.wait_for_completion()
    assert np.all(output_array)
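# Usage sketch (not part of the original source): each rank passes its local check
# results, and the logical-AND Allreduce above makes the assertion fail on every
# rank if any single rank reports False. The booleans below are illustrative only,
# assuming the surrounding module imports `tarantella as tnt`.
local_results = [True, tnt.get_rank() < tnt.get_size()]
assert_on_all_ranks(local_results)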
def __init__(self, model, group=tnt.Group()):
    super().__init__(model=model, group=group)
    self.input_shapes = None
    self.done_broadcast = False
    self.compiled = False
    self.broadcaster = None
    self.barrier = tnt.Barrier(group=self.group)
    self.dist_optimizer = None
    self.default_shuffle_seed = 42
def __call__(cls,
             callback: tf.keras.callbacks.Callback,
             parallel_strategy: tnt.ParallelStrategy = tnt.ParallelStrategy.PIPELINING,
             group: tnt.Group = tnt.Group(),
             **kwargs: Any) -> tf.keras.callbacks.Callback:
    if hasattr(callback, "tnt_parallel_strategy"):
        keras_callback_type = type(callback.keras_callback)
    else:
        keras_callback_type = type(callback)
    return callbackFactory(callback, keras_callback_type, parallel_strategy, group, **kwargs)
def __init__(self,
             keras_callback: tf.keras.callbacks.Callback,
             aggregate_logs: bool = True,
             run_on_all_ranks: bool = True,
             group: tnt.Group = tnt.Group()) -> None:
    super().__init__(group=group)
    logger.debug(f"[DataParallelCallback] init with {keras_callback}")
    base_type.__init__(self, keras_callback)
    self.aggregate_logs = aggregate_logs
    self.run_on_all_ranks = run_on_all_ranks
    self.is_built = False
    self._distribute_callback = self._distribute_callback_default
    self.customize_callback(keras_callback)
def _create_tnt_model(cls,
                      model: tf.keras.Model,
                      parallel_strategy: tnt.ParallelStrategy = (tnt.ParallelStrategy.ALL
                                                                 if TF_DEFAULT_PIPELINING_FLAG
                                                                 else tnt.ParallelStrategy.DATA),
                      num_pipeline_stages: int = 1):
    replica_group = tnt.Group()

    if (tnt.ParallelStrategy.PIPELINING in parallel_strategy) and isinstance(model, tf.keras.Sequential):
        logger.warn("Cannot pipeline a `tf.keras.Sequential` model; disabling model parallelism.")
        parallel_strategy = parallel_strategy ^ tnt.ParallelStrategy.PIPELINING

    logger.info(f"Creating parallel model using {parallel_strategy}.")

    if tnt.ParallelStrategy.PIPELINING in parallel_strategy:
        rank = tnt.get_rank()

        partition_generator = pgen.GraphPartitionGenerator(model)
        rank_mapper = rmapper.RankMapper(
            num_ranks=tnt.get_size(),
            pipeline_graph=partition_generator.get_pipeline_graph())
        pipeline_group = rank_mapper.get_pipelining_group_for_rank(rank)

        logger.info(f"[Pipelining] Creating pipelined model with {pipeline_group.size} partitions.")
        # get my partition
        model = pm.PartitionedModel(model=model,
                                    group=pipeline_group,
                                    partition_generator=partition_generator,
                                    rank_mapper=rank_mapper,
                                    num_pipeline_stages=num_pipeline_stages)

        if tnt.ParallelStrategy.DATA in parallel_strategy:
            replica_group = rank_mapper.get_replica_group_for_rank(rank)
        else:
            if pipeline_group.size != tnt.get_size():
                raise ValueError(f"Provided model has only {pipeline_group.size} partitions; "
                                 f"use {pipeline_group.size} ranks or a different parallel strategy.")

    if tnt.ParallelStrategy.DATA in parallel_strategy:
        # replicate my partition across the data parallel group
        logger.info(f"[DataParallel] Replicating local model across ranks {replica_group.group}.")
        model = dpm.DataParallelModel(model=model, group=replica_group)

    return model
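# Usage sketch (not part of the original source): the factory above is normally
# reached through the library's Keras-style model wrapper. The wrapper name
# `tnt.Model`, its keyword arguments, and the example layer sizes are assumptions
# for illustration, not confirmed by the code shown here.
import tensorflow as tf
import tarantella as tnt

keras_model = tf.keras.Sequential([tf.keras.layers.Dense(10, activation="softmax",
                                                         input_shape=(784,))])
# A `tf.keras.Sequential` model cannot be pipelined (see the warning above), so
# only data parallelism remains in effect across all ranks.
parallel_model = tnt.Model(keras_model, parallel_strategy=tnt.ParallelStrategy.DATA)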
def __init__(self, inputs, root_rank=tnt.get_master_rank(), group=tnt.Group()):
    self.root_global_rank = group.to_global_rank(root_rank)
    self.shapes = list()
    self.broadcasts = list()
    self.algorithm = "linear"

    if utils.is_nonEmptyArray(inputs):
        inputs = [inputs]
    elif not utils.is_nonEmptyList(inputs):
        self._raise_input_error()

    for tensor in inputs:
        self.shapes.append(tensor.shape)
        self.broadcasts.append(tnt.Broadcast(group=group,
                                             nelems=int(np.prod(tensor.shape)),
                                             root=root_rank,
                                             algorithm=self.algorithm,
                                             dtype=tensor.dtype))
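# Sketch (assumption): a raw `tnt.Broadcast` is presumably driven the same way as
# the `tnt.Allreduce` used in `assert_on_all_ranks` above, i.e. via start() and
# wait_for_completion(); those method names are inferred by analogy and are not
# confirmed for Broadcast by the code shown here. The constructor arguments mirror
# the ones used in the loop above.
weights = np.ones(4, dtype=np.float32)
bcast = tnt.Broadcast(group=tnt.Group(),
                      nelems=int(np.prod(weights.shape)),
                      root=tnt.get_master_rank(),
                      algorithm="linear",
                      dtype=weights.dtype)
bcast.start(weights)                    # assumed API, mirroring Allreduce.start
received = bcast.wait_for_completion()  # every rank ends up with the root's values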
def __init__(self, model, group=tnt.Group()):
    super().__init__()
    self.rank = tnt.get_rank()
    self.group = group
    self.model = model
    atexit.register(self.close)
def __init__(self, inputs, group=tnt.Group()):
    # TensorAllgather performs a single Allgather operation
    # when the input is a scalar/array/tensor
    self.group = group
    self.algorithm = "ring"
    self.create_allgather(inputs)
def __init__(self, group: tnt.Group = tnt.Group()) -> None:
    self.group = group
    self.num_ranks = group.size
    self.allreducer = None
    atexit.register(self.close)