Example no. 1
 def __init__(self, inputs, group=tnt.Group()):
     # TensorAllreducer performs a single Allreduce operation
     # when the input is a scalar/array/tensor,
     # or builds a list/dictionary of nested `TensorAllreducer`s
     # when the input is a list/dictionary
     self.group = group
     self.algorithm = "recursivedoubling"
     self.reduction_op = tnt.ReductionOp.SUM
     self.create_allreduces(inputs)
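A construction-only sketch of the input kinds described in the comment above, assuming the `__init__` belongs to the `TensorAllreducer` class it names; the concrete inputs are illustrative:

import numpy as np

# a scalar, array, or tensor maps to a single underlying Allreduce
scalar_reducer = TensorAllreducer(3.14)
tensor_reducer = TensorAllreducer(np.zeros((4, 4)))

# a dictionary (or list) maps to one nested TensorAllreducer per entry
dict_reducer = TensorAllreducer({"loss": np.zeros(1), "accuracy": np.zeros(1)})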
Example no. 2
 def __init__(
     self,
     keras_callback: tf.keras.callbacks.Callback,
     group: tnt.Group = tnt.Group()
 ) -> None:
     super().__init__(keras_callback)
     self.group = group
     self.customize_callback(keras_callback)
Example no. 3
  def get_replica_group_for_rank(self, rank):
    if rank not in self.replica_mapping:
      raise ValueError(f"Rank {rank} not found in the mapping of replica IDs to ranks: {self.replica_mapping}")
    partition_id = self.get_partition_for_rank(rank)
    replica_group_ranks = [r for r in self.replica_mapping.keys() \
                           if self.get_partition_for_rank(r) == partition_id]

    logger.debug(f"[RankMapper] Replica group = {replica_group_ranks}.")
    return tnt.Group(replica_group_ranks)
Example no. 4
  def get_pipelining_group_for_rank(self, rank):
    if rank not in self.partition_mapping:
      raise ValueError(f"Rank {rank} not found in the mapping of partition IDs to ranks: {self.partition_mapping}")
    replica_id = self.get_replica_for_rank(rank)
    pipeline_group_ranks = [r for r in self.partition_mapping.keys() \
                            if self.get_replica_for_rank(r) == replica_id]

    logger.debug(f"[RankMapper] Pipeline group = {pipeline_group_ranks}.")
    return tnt.Group(pipeline_group_ranks)
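A sketch of how the two lookups in Examples no. 3 and no. 4 fit together, following the `RankMapper` construction shown in Example no. 9 below (where `partition_generator` is a `GraphPartitionGenerator`):

rank = tnt.get_rank()
rank_mapper = rmapper.RankMapper(
    num_ranks=tnt.get_size(),
    pipeline_graph=partition_generator.get_pipeline_graph())

# ranks that hold the same partition, i.e. the data-parallel replicas of one stage
replica_group = rank_mapper.get_replica_group_for_rank(rank)

# ranks that form one model replica, i.e. all partitions of one pipeline
pipeline_group = rank_mapper.get_pipelining_group_for_rank(rank)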
Example no. 5
def assert_on_all_ranks(results_array: Union[bool, List[bool]]):
    if not isinstance(results_array, list):
        results_array = [results_array]
    allreduce = tnt.Allreduce(tnt.Group(),
                              nelems=len(results_array),
                              dtype=bool,
                              op=tnt.ReductionOp.AND)
    allreduce.start(results_array)
    output_array = allreduce.wait_for_completion()
    assert np.all(output_array)
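A minimal usage sketch for the helper above; `local_check_passed` is a hypothetical per-rank condition, not part of the snippet:

local_check_passed = True  # hypothetical result of a per-rank validation

# a single boolean: the assertion holds only if it is True on every rank
assert_on_all_ranks(local_check_passed)

# a list of booleans is reduced element-wise with a logical AND across ranks
assert_on_all_ranks([local_check_passed, local_check_passed])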
Example no. 6
  def __init__(self, model, group = tnt.Group()):
    super().__init__(model = model, group = group)
    self.input_shapes = None
    self.done_broadcast = False
    self.compiled = False
    self.broadcaster = None
    self.barrier = tnt.Barrier(group = self.group)

    self.dist_optimizer = None
    self.default_shuffle_seed = 42
Example no. 7
 def __call__(cls,
              callback: tf.keras.callbacks.Callback,
              parallel_strategy: tnt.ParallelStrategy = tnt.ParallelStrategy.PIPELINING,
              group: tnt.Group = tnt.Group(),
              **kwargs: Any) -> tf.keras.callbacks.Callback:
     if hasattr(callback, "tnt_parallel_strategy"):
         keras_callback_type = type(callback.keras_callback)
     else:
         keras_callback_type = type(callback)
     return callbackFactory(callback, keras_callback_type,
                            parallel_strategy, group, **kwargs)
Example no. 8
 def __init__(self, keras_callback: tf.keras.callbacks.Callback,
              aggregate_logs: bool = True,
              run_on_all_ranks: bool = True,
              group: tnt.Group = tnt.Group()) -> None:
   super().__init__(group = group)
   logger.debug(f"[DataParallelCallback] init with {keras_callback}")
   base_type.__init__(self, keras_callback)
   self.aggregate_logs = aggregate_logs
   self.run_on_all_ranks = run_on_all_ranks
   self.is_built = False
   self._distribute_callback = self._distribute_callback_default
   self.customize_callback(keras_callback)
Example no. 9
    def _create_tnt_model(cls, model: tf.keras.Model,
                          parallel_strategy: tnt.ParallelStrategy = (tnt.ParallelStrategy.ALL if TF_DEFAULT_PIPELINING_FLAG
                                                                     else tnt.ParallelStrategy.DATA),
                          num_pipeline_stages: int = 1):
        replica_group = tnt.Group()

        if (tnt.ParallelStrategy.PIPELINING in parallel_strategy
                and isinstance(model, tf.keras.Sequential)):
            logger.warning(
                "Cannot pipeline a `tf.keras.Sequential` model; disabling model parallelism."
            )
            parallel_strategy = parallel_strategy ^ tnt.ParallelStrategy.PIPELINING

        logger.info(f"Creating parallel model using {parallel_strategy}.")
        if tnt.ParallelStrategy.PIPELINING in parallel_strategy:
            rank = tnt.get_rank()

            partition_generator = pgen.GraphPartitionGenerator(model)
            rank_mapper = rmapper.RankMapper(
                num_ranks=tnt.get_size(),
                pipeline_graph=partition_generator.get_pipeline_graph())
            pipeline_group = rank_mapper.get_pipelining_group_for_rank(rank)

            logger.info(
                f"[Pipelining] Creating pipelined model with {pipeline_group.size} partitions."
            )
            # get my partition
            model = pm.PartitionedModel(
                model=model,
                group=pipeline_group,
                partition_generator=partition_generator,
                rank_mapper=rank_mapper,
                num_pipeline_stages=num_pipeline_stages)
            if tnt.ParallelStrategy.DATA in parallel_strategy:
                replica_group = rank_mapper.get_replica_group_for_rank(rank)
            else:
                if pipeline_group.size != tnt.get_size():
                    raise ValueError(
                        f"Provided model has only {pipeline_group.size} partitions; use {pipeline_group.size} ranks or a different parallel strategy."
                    )

        if tnt.ParallelStrategy.DATA in parallel_strategy:
            # replicate my partition across the data parallel group
            logger.info(
                f"[DataParallel] Replicating local model across ranks {replica_group.group}."
            )
            model = dpm.DataParallelModel(model=model, group=replica_group)
        return model
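The membership test (`in`) and the XOR used above to disable pipelining match the semantics of a standard `enum.Flag`; a minimal stand-in, assuming `tnt.ParallelStrategy` behaves this way (its actual definition is not part of the snippet):

import enum

class ParallelStrategy(enum.Flag):
    DATA = 1
    PIPELINING = 2
    ALL = DATA | PIPELINING

strategy = ParallelStrategy.ALL
assert ParallelStrategy.PIPELINING in strategy     # membership test as above
strategy = strategy ^ ParallelStrategy.PIPELINING  # drop pipelining, keep the rest
assert strategy == ParallelStrategy.DATA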
Example no. 10
    def __init__(self,
                 inputs,
                 root_rank=tnt.get_master_rank(),
                 group=tnt.Group()):
        self.root_global_rank = group.to_global_rank(root_rank)
        self.shapes = list()
        self.broadcasts = list()
        self.algorithm = "linear"

        if utils.is_nonEmptyArray(inputs):
            inputs = [inputs]
        elif not utils.is_nonEmptyList(inputs):
            self._raise_input_error()
        for tensor in inputs:
            self.shapes.append(tensor.shape)
            self.broadcasts.append(
                tnt.Broadcast(group=group,
                              nelems=int(np.prod(tensor.shape)),
                              root=root_rank,
                              algorithm=self.algorithm,
                              dtype=tensor.dtype))
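Condensed, the loop above prepares one `tnt.Broadcast` per input tensor; a sketch using only the calls visible in the snippet, with illustrative inputs:

import numpy as np

inputs = [np.zeros((2, 3), dtype=np.float32), np.zeros(5, dtype=np.float32)]
group = tnt.Group()
root_rank = tnt.get_master_rank()

broadcasts = [tnt.Broadcast(group=group,
                            nelems=int(np.prod(tensor.shape)),
                            root=root_rank,
                            algorithm="linear",
                            dtype=tensor.dtype)
              for tensor in inputs]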
Example no. 11
 def __init__(self, model, group = tnt.Group()):
   super().__init__()
   self.rank = tnt.get_rank()
   self.group = group
   self.model = model
   atexit.register(self.close)
Example no. 12
 def __init__(self, inputs, group=tnt.Group()):
     # TensorAllgather performs a single Allgather operation
     # when the input is a scalar/array/tensor
     self.group = group
     self.algorithm = "ring"
     self.create_allgather(inputs)
Example no. 13
 def __init__(self, group: tnt.Group = tnt.Group()) -> None:
   self.group = group
   self.num_ranks = group.size
   self.allreducer = None
   atexit.register(self.close)