Example #1
0
  def get_connections_for_rank(self, rank):
    """Collect the pipeline connections in which `rank` takes part.

    Every edge of `self.pipeline_graph` whose source or target node equals the
    partition of `rank` contributes one entry, keyed by the edge's
    `connection_id`. The endpoints of an entry are the source/target ranks
    that are mapped to the same replica as `rank`.

    Args:
      rank: rank to build the table for (asserted to be < `self.num_ranks`).

    Returns:
      dict of the form `{conn_id: ConnectionInfo((source_rank, target_rank),
      size_in_bytes)}`.
    """
    assert rank < self.num_ranks
    own_partition = self.get_partition_for_rank(rank)
    replica_id = self.get_replica_for_rank(rank)

    def ranks_in_own_replica(candidates):
      # Keep only the ranks that share the replica ID of `rank`
      return [r for r in candidates if self.get_replica_for_rank(r) == replica_id]

    connection_table = {}
    for edge in self.pipeline_graph.get_edges():
      conn_id = edge.info_dict['connection_id']
      # `source` and `target` nodes of an edge are partition IDs
      if own_partition not in (edge.source_node, edge.target_node):
        continue

      size_in_bytes = edge.info_dict['number_elements'] * edge.info_dict['dtype'].size

      source_ranks = self._get_ranks_for_partition(edge.source_node)
      target_ranks = self._get_ranks_for_partition(edge.target_node)
      source_rank_for_replica = ranks_in_own_replica(source_ranks)
      target_rank_for_replica = ranks_in_own_replica(target_ranks)

      # Exactly one rank per partition is expected within a replica
      assert len(source_rank_for_replica) == 1, \
             f"[RankMapper] Incorrect set of ranks for replica = {replica_id}: {source_rank_for_replica}"
      assert len(target_rank_for_replica) == 1, \
             f"[RankMapper] Incorrect set of ranks for replica = {replica_id}: {target_rank_for_replica}"

      endpoints = (source_rank_for_replica[0], target_rank_for_replica[0])
      connection_table[conn_id] = cinfo.ConnectionInfo(endpoints, size_in_bytes)
    logger.debug(f"[RankMapper] Connection table for rank {rank}: {connection_table}")
    return connection_table
  def _get_model(self, model):
    """Create the Keras model of this partition and initialize its weights.

    The model configuration is derived from `self.partition_id` and
    `self.partition_graph`; `set_weights` is then called with the new model
    and `model`.

    Args:
      model: model handed to `set_weights` as the source of the weights.

    Returns:
      The newly created Keras model.
    """
    logger.debug(f"Creating model for partition {self.partition_id}")
    partition_config = self._to_model_config(self.partition_id, self.partition_graph)
    partition_model = tf.keras.Model().from_config(partition_config)
    set_weights(partition_model, model)
    return partition_model
Example #3
0
 def shuffle_with_seed(self, dataset, ds_kwargs):
   """Shuffle `dataset`, falling back to `self.shuffle_seed` if no seed is set.

   Args:
     dataset: object providing a `shuffle(**kwargs)` method.
     ds_kwargs: dict of keyword arguments forwarded to `dataset.shuffle`.
       When it has no `'seed'` entry (or the entry is `None`), the entry is
       set in place to `self.shuffle_seed`.

   Returns:
     The result of `dataset.shuffle(**ds_kwargs)`.
   """
   # A missing key and an explicit `None` are both treated as "no seed given"
   if ds_kwargs.get('seed') is None:
     # `warning` replaces the deprecated `warn` alias
     logger.warning("Shuffling with fixed shuffle seed {}.".format(self.shuffle_seed))
     ds_kwargs['seed'] = self.shuffle_seed
   else:
     logger.debug("Shuffling with shuffle seed {}.".format(ds_kwargs['seed']))
   return dataset.shuffle(**ds_kwargs)
Example #4
0
def load_model(filepath, compile=True, **kwargs):
    """Load a Keras model from `filepath` and wrap it into a `tnt.Model`.

    The model is always wrapped with the data parallel strategy. When
    `compile` is true, the optimizer of the loaded Keras model is wrapped into
    a `SynchDistributedOptimizer` on a best-effort basis: if that fails, the
    returned model is simply left uncompiled.

    Args:
        filepath: forwarded to `tf.keras.models.load_model`.
        compile: forwarded to `tf.keras.models.load_model`; also enables the
            optimizer setup described above.
        **kwargs: additional keyword arguments for
            `tf.keras.models.load_model`.

    Returns:
        The `tnt.Model` wrapping the loaded Keras model.
    """
    logger.debug("Load model from file: {}".format(filepath))
    keras_model = tf.keras.models.load_model(filepath,
                                             compile=compile,
                                             **kwargs)
    # FIXME load models with any type of parallelization strategy
    logger.warning("Loading model with the default `data parallel` strategy.")
    tnt_model = tnt.Model(keras_model,
                          parallel_strategy=tnt.ParallelStrategy.DATA)
    if compile:
        try:
            tnt_optimizer = tnt.distributed_optimizers.SynchDistributedOptimizer(
                keras_model.optimizer, group=tnt_model.group)
            tnt_model.dist_optimizer = tnt_optimizer
            tnt_model._set_internal_optimizer(tnt_model.dist_optimizer)
            tnt_model.compiled = True
            tnt_model.done_broadcast = True

            if version_utils.tf_version_below_equal('2.1'):
                tnt_model.model._experimental_run_tf_function = False
                logger.info("Set `experimental_run_tf_function` to False.")
        except Exception:
            # Best effort only; `Exception` (instead of a bare `except`) lets
            # KeyboardInterrupt/SystemExit propagate.
            logger.info("The loaded model was not pre-compiled.")
    tnt_model.barrier.execute()
    return tnt_model
Example #5
0
 def __init__(self,
              keras_callback: tf.keras.callbacks.Callback) -> None:
     """Initialize the generic TNT callback from an existing Keras callback.

     Args:
         keras_callback: the Keras callback to wrap; it is stored on the
             instance and passed to `_construct_from_keras_object`.
     """
     self.keras_callback = keras_callback
     logger.debug(
         f"Creating generic TNT callback of type={type(keras_callback)}"
     )
     _construct_from_keras_object(self, keras_callback)
     # NOTE(review): `parallel_strategy` is not a parameter of this method; it
     # is presumably provided by an enclosing scope not visible here — confirm.
     self.tnt_parallel_strategy = parallel_strategy
Example #6
0
 def __init__(self, model, params):
   """Store the compile properties found in `params`.

   Args:
     model: model handed to `_assign_named_attributes_to_outputs`.
     params: dict of compile arguments; the keys read here are 'optimizer',
       'loss', 'loss_weights' and 'metrics'.
   """
   # params = {'optimizer', 'loss', 'metrics', 'loss_weights',
   #           'sample_loss_weights', 'weighted_metrics'}
   assign_to_outputs = self._assign_named_attributes_to_outputs

   self._optimizer = params.get('optimizer')
   self._loss = assign_to_outputs(model, 'loss', params)
   self._loss_weights = assign_to_outputs(model, 'loss_weights', params)
   self._metrics = assign_to_outputs(model, 'metrics', params)
   logger.debug(f"Compile properties: losses = {self._loss}, loss_weights = {self._loss_weights}, metrics = {self._metrics}")
Example #7
0
def _to_parallel_callbacks(callbacks, group, parallel_strategy):
    """Wrap every entry of `callbacks` into a `tnt.keras.callbacks.Callback`.

    The list is modified in place, one slot at a time, and also returned.

    Args:
        callbacks: mutable sequence of callbacks to wrap.
        group: forwarded as the `group` keyword to the wrapper.
        parallel_strategy: forwarded positionally to the wrapper.

    Returns:
        The same `callbacks` object, now holding the wrapped callbacks.
    """
    for position, callback in enumerate(callbacks):
        logger.debug(
            f"[{parallel_strategy}] Preprocessing callback {callback} of type {type(callback)}"
        )
        wrapped = tnt.keras.callbacks.Callback(callback,
                                               parallel_strategy,
                                               group=group)
        callbacks[position] = wrapped
    return callbacks
Example #8
0
  def get_pipelining_group_for_rank(self, rank):
    """Return the group of ranks sharing the replica ID of `rank`.

    Args:
      rank: rank that must be a key of `self.partition_mapping`.

    Returns:
      A `tnt.Group` built from all ranks in `self.partition_mapping` whose
      replica ID equals that of `rank`.

    Raises:
      ValueError: if `rank` is not a key of `self.partition_mapping`.
    """
    if rank not in self.partition_mapping:
      raise ValueError(f"Rank {rank} not found in the mapping of partition IDs to ranks: {self.partition_mapping}")

    replica_id = self.get_replica_for_rank(rank)
    pipeline_group_ranks = [
        candidate for candidate in self.partition_mapping
        if self.get_replica_for_rank(candidate) == replica_id
    ]
    logger.debug(f"[RankMapper] Pipeline group = {pipeline_group_ranks}.")
    return tnt.Group(pipeline_group_ranks)
Example #9
0
  def get_replica_group_for_rank(self, rank):
    """Return the group of ranks sharing the partition ID of `rank`.

    Args:
      rank: rank that must be a key of `self.replica_mapping`.

    Returns:
      A `tnt.Group` built from all ranks in `self.replica_mapping` whose
      partition ID equals that of `rank`.

    Raises:
      ValueError: if `rank` is not a key of `self.replica_mapping`.
    """
    if rank not in self.replica_mapping:
      raise ValueError(f"Rank {rank} not found in the mapping of replica IDs to ranks: {self.replica_mapping}")

    partition_id = self.get_partition_for_rank(rank)
    replica_group_ranks = [
        candidate for candidate in self.replica_mapping
        if self.get_partition_for_rank(candidate) == partition_id
    ]
    logger.debug(f"[RankMapper] Replica group = {replica_group_ranks}.")
    return tnt.Group(replica_group_ranks)
Example #10
0
    def _(self, keras_callback: tf.keras.callbacks.EarlyStopping):
      """Customize an `EarlyStopping` callback.

      Non-master ranks of `self.group` are silenced, and `get_monitor_value`
      is overridden on this instance so that the parent implementation
      receives the result of `self.average_logs(logs)`.

      Args:
        keras_callback: the original Keras callback; its `verbose` setting is
          kept on the master rank.
      """
      import types  # local import: only needed to bind the override below

      logger.debug("[DataParallel] EarlyStopping callback")
      # only master rank should print messages
      self.verbose = keras_callback.verbose if tnt.is_group_master_rank(self.group) \
                                            else utilities.TF_verbose.SILENT.value

      def _get_monitor_value(self, logs):
        averaged_logs = self.average_logs(logs)
        return super().get_monitor_value(averaged_logs)
      # A plain function stored on an instance is NOT bound automatically:
      # `self.get_monitor_value(logs)` would pass `logs` as `self` and fail
      # with a TypeError. Bind it explicitly to this instance.
      self.get_monitor_value = types.MethodType(_get_monitor_value, self)
Example #11
0
  def _get_partition_compile_params(self):
    """Assemble the compile arguments for the partition model.

    Returns:
      dict with the keys 'optimizer', 'loss', 'loss_weights' and 'metrics';
      all but the optimizer are obtained from `self.microbatched_model_builder`.

    Raises:
      RuntimeError: if `self.compile_properties` is not set (falsy).
    """
    if not self.compile_properties:
      raise RuntimeError("[PipelinedModel] `model.fit` called before `model.compile`")

    logger.debug(f"[PartitionedModel] Compiled partitioned model with losses={self.compile_properties.loss}, "
                f"metrics = {self.compile_properties.metrics} {self.model.metrics}")

    user_properties = self.compile_properties
    builder = self.microbatched_model_builder
    # Entries are filled in the same order as they appear in the result
    compile_params = {}
    compile_params['optimizer'] = user_properties.optimizer
    compile_params['loss'] = builder.get_losses(user_properties.loss)
    compile_params['loss_weights'] = builder.get_loss_weights()
    compile_params['metrics'] = builder.get_metrics(user_properties.metrics)
    return compile_params
Example #12
0
  def get_microbatch_size(self, batch_size):
    """Return the per-rank share of `batch_size`.

    Args:
      batch_size: global batch size; must be a non-zero multiple of
        `self.num_ranks`.

    Returns:
      `batch_size // self.num_ranks` as an `int`.

    Raises:
      ValueError: if `batch_size` is `None` or 0, or if it is not a multiple
        of `self.num_ranks`.
    """
    if batch_size is None or batch_size == 0:
      raise ValueError("[DistributedDataset]Incorrectly defined batch size")

    if batch_size % self.num_ranks != 0:
      # Single format string: the previous two-part concatenation lacked a
      # space and produced "...is not a multipleof the number of ranks...".
      raise ValueError(
          "[DistributedDataset] Batch size ({}) is not a multiple "
          "of the number of ranks {}".format(batch_size, self.num_ranks))

    logger.debug("Batch size ({}) is a multiple of the number of ranks {}.".format(
                 batch_size, self.num_ranks))
    return int(batch_size // self.num_ranks)
Example #13
0
def model_from_yaml(yaml_string, **kwargs):
    """Create a data-parallel `tnt.Model` from a YAML model description.

    Args:
        yaml_string: forwarded to `tf.keras.models.model_from_yaml`.
        **kwargs: additional keyword arguments for
            `tf.keras.models.model_from_yaml`.

    Returns:
        A `tnt.Model` wrapping the loaded Keras model.

    Raises:
        RuntimeError: if loading or wrapping the model fails; the original
            exception is attached as `__cause__`.
    """
    logger.debug("Load model from yaml")
    try:
        keras_model = tf.keras.models.model_from_yaml(yaml_string, **kwargs)
        # FIXME load models with any type of parallelization strategy
        logger.warning(
            "Loading model with the default `data parallel` strategy.")
        return tnt.Model(keras_model,
                         parallel_strategy=tnt.ParallelStrategy.DATA)
    except Exception as err:
        # Keep the root cause visible and let KeyboardInterrupt/SystemExit
        # pass through instead of turning them into a RuntimeError.
        raise RuntimeError("[tnt.models.model_from_yaml] Cannot load model") from err
Example #14
0
 def __init__(self, keras_callback: tf.keras.callbacks.Callback,
              aggregate_logs: bool = True,
              run_on_all_ranks: bool = True,
              group: tnt.Group = tnt.Group()) -> None:
   """Initialize the data parallel wrapper around `keras_callback`.

   Args:
     keras_callback: Keras callback to wrap; passed to `base_type.__init__`
       and to `self.customize_callback`.
     aggregate_logs: stored as `self.aggregate_logs`.
     run_on_all_ranks: stored as `self.run_on_all_ranks`.
     group: group handed to the parent constructor. The default `tnt.Group()`
       is evaluated once, when the function is defined.
   """
   super().__init__(group = group)
   logger.debug(f"[DataParallelCallback] init with {keras_callback}")
   # NOTE(review): `base_type` is not defined in this method; presumably it
   # comes from an enclosing scope not visible here — confirm.
   base_type.__init__(self, keras_callback)
   self.aggregate_logs = aggregate_logs
   self.run_on_all_ranks = run_on_all_ranks
   self.is_built = False
   self._distribute_callback = self._distribute_callback_default
   # Runs last, so it may override the attributes assigned above
   self.customize_callback(keras_callback)
Example #15
0
    def _(self, keras_callback: tf.keras.callbacks.ModelCheckpoint):
      """Customize a `ModelCheckpoint` callback.

      The master rank of `self.group` keeps the verbosity of `keras_callback`;
      every other rank is silenced and has its checkpointing disabled.

      Args:
        keras_callback: the original Keras callback (source of `verbose`).
      """
      logger.debug("[DataParallel] ModelCheckpoint callback")
      # only master rank should save and thus print messages
      if tnt.is_group_master_rank(self.group):
        self.verbose = keras_callback.verbose
      else:
        self.verbose = utilities.TF_verbose.SILENT.value
      # only one checkpoint is needed (models are identical in a data parallel setting)
      self.run_on_all_ranks = False

      if tnt.is_group_master_rank(self.group):
        return

      # disable checkpointing for all ranks except the master rank
      self._supports_tf_logs = False
      self.save_freq = 1e20 # very large value to avoid triggering checkpointing
      self.epochs_since_last_save = 0
Example #16
0
def setup_gpus(rank, ngpus=None):
    """Check whether GPUs are available and assign one to the current rank.

    To make sure a specific GPU will be used by the current rank, TensorFlow
    is configured so that this particular GPU is the only one visible.
    A GPU is selected if its index within the list of available GPUs is equal
    to (rank % ngpus).
    This allocation assumes that all nodes are homogeneous and are configured
    with the same number of processes (< ngpus).

    The process exits via `sys.exit` if GPUs cannot be disabled or if fewer
    than `ngpus` GPUs are available.

    Args:
        rank: int, rank of the current process.
        ngpus: int value specifying the maximum number of GPUs per node that
            will be used (`None` stands for using all GPUs available; a value
            <= 0 disables all GPUs).

    Raises:
        RuntimeError: if TensorFlow refuses the GPU configuration; the
            original error is attached as `__cause__`.
    """
    if ngpus is not None and ngpus <= 0:
        # Disable all GPUs
        tf.config.experimental.set_visible_devices([], 'GPU')
        visible_gpus = tf.config.experimental.get_visible_devices('GPU')
        if visible_gpus:
            sys.exit(
                "ERROR: [rank {}] Could not disable GPUs: {} GPUs still visible"
                .format(rank, len(visible_gpus)))
    else:  # try to use `ngpus` per node
        phys_gpus = tf_config.get_available_gpus()
        if phys_gpus:
            if ngpus is None:  # use as many GPUs as possible
                ngpus = len(phys_gpus)

            target_gpu = rank % ngpus
            if len(phys_gpus) < ngpus:
                sys.exit(
                    "ERROR: rank {} cannot use GPU_id={} (only {} GPUs available)"
                    .format(rank, target_gpu, len(phys_gpus)))

            try:
                # memory growth has to be set only once on all available GPUs
                if target_gpu == 0:
                    for gpu in phys_gpus:
                        tf.config.experimental.set_memory_growth(gpu, True)
                # make sure only one GPU is visible per process
                tf.config.experimental.set_visible_devices(
                    phys_gpus[target_gpu], 'GPU')
            except RuntimeError as err:
                # Chain the cause so TensorFlow's own message is not lost
                raise RuntimeError(
                    "[Tarantella][init] Cannot configure GPUs") from err
    logger.debug("Using device: {}".format(
        tf.config.experimental.get_visible_devices()))
 def _(self, keras_callback: tf.keras.callbacks.TensorBoard):
     """Customize a `TensorBoard` callback.

     With `tensorboard_on_all_devices` enabled, a `/rank_<rank>` suffix is
     appended to `self.log_dir`. Otherwise all data logging is switched off on
     every rank that is not the master rank of `self.group`.

     Args:
         keras_callback: the original Keras callback (not read here).
     """
     logger.debug("[PipeliningParallel] TensorBoard callback")

     if tnt.global_tnt_config.tensorboard_on_all_devices:
         self.log_dir += f"/rank_{tnt.get_rank()}"
         return

     if tnt.is_group_master_rank(self.group):
         return

     # disable any data logging on the remaining (non-master) ranks
     self.histogram_freq = 0
     self.write_graph = False
     self.write_images = False
     self.write_steps_per_second = False
     self.update_freq = 0
     self.embeddings_freq = 0
     self.embeddings_metadata = None
     self.profile_batch = None
Example #18
0
  def _map_ranks_to_partition_and_replica_ids(self):
    """Assign a partition ID and a replica ID to ranks.

    For the node at position `i` of `self.pipeline_graph.get_nodes()` and the
    replica index `r`, the rank `r * self.num_partitions + i` is mapped to the
    node's name (partition ID) and to `r` (replica ID). The number of replicas
    is `self.num_ranks // self.num_partitions`.

    Returns:
      Tuple `(partition_mapping, replica_mapping)` of dicts keyed by rank.
    """
    replicas_per_partition = self.num_ranks // self.num_partitions

    partition_mapping = {}
    replica_mapping = {}
    for node_position, node in enumerate(self.pipeline_graph.get_nodes()):
      for replica_index in range(replicas_per_partition):
        # Ranks of one replica are contiguous: offset by the node position
        rank = replica_index * self.num_partitions + node_position
        partition_mapping[rank] = node.name
        replica_mapping[rank] = replica_index

    logger.debug(f"[RankMapper] Mapping of ranks to partition IDs: {partition_mapping}")
    logger.debug(f"[RankMapper] Mapping of ranks to replica IDs: {replica_mapping}")
    return partition_mapping, replica_mapping
Example #19
0
def _get_microbatch_size(rank, num_ranks, batch_size):
    if batch_size is None or batch_size == 0:
        raise ValueError("[DistributedDataset]Incorrectly defined batch size")

    microbatch_size = int(batch_size // num_ranks)
    remaining_samples = batch_size % num_ranks

    if remaining_samples != 0:
        logger.debug(
            f"[Rank {tnt.get_rank()}] Batch size ({batch_size}) is a not multiple of the number of ranks {num_ranks}."
        )
    if rank < remaining_samples:
        microbatch_size = microbatch_size + 1

    logger.debug(
        f"[Rank {tnt.get_rank()}] Micro batch size {microbatch_size}.")
    return microbatch_size
Example #20
0
def _pad_dataset_if_necessary(dataset, num_samples, batch_size,
                              min_last_batch_size):
    """Pad (or truncate) `dataset` so that its last batch can be processed.

    Depending on the size of the last, incomplete batch (as reported by
    `_get_last_incomplete_batch_size`):
      * 0: the dataset is returned unchanged;
      * TF version <= 2.1: the incomplete batch is dropped;
      * smaller than `min_last_batch_size`: padding samples are appended so
        that the last batch holds `min_last_batch_size` samples;
      * otherwise: the dataset is returned unchanged.

    Args:
        dataset: unbatched dataset providing `take`, `batch`, `padded_batch`,
            `unbatch`, `skip` and `concatenate`.
        num_samples: number of samples in `dataset`.
        batch_size: batch size used to split the dataset.
        min_last_batch_size: minimum number of samples required in the last
            batch.

    Returns:
        The original, truncated or padded dataset.
    """
    last_batch_size = _get_last_incomplete_batch_size(num_samples, batch_size)
    if last_batch_size == 0:
        logger.debug(f"No padding required: number of samples {num_samples} is a multiple " \
                     f"of the batch size {batch_size}.")
        return dataset

    logger.info(f"Incomplete last batch in the dataset: number of samples is " \
                f"{last_batch_size} ( != batch size {batch_size}).")

    if version_utils.tf_version_below_equal('2.1'):
        num_samples_multiple = num_samples - last_batch_size
        # `warning` replaces the deprecated `warn` alias
        logger.warning(f"Number of samples ({num_samples}) is not a multiple of batch size. " \
                       f"This use case is not supported in TF v{version_utils.current_version()}. " \
                       f"Dropping the last incomplete batch from the dataset, "\
                       f"and proceeding with {num_samples_multiple} samples.")
        return dataset.take(num_samples_multiple)

    if last_batch_size < min_last_batch_size:
        logger.debug(f"Padding required for the last batch: number of samples is " \
                     f"{last_batch_size} ( < min_batch_size {min_last_batch_size}).")

        # Create helper dataset that contains one full batch and one incomplete batch
        helper_dataset = dataset.take(min_last_batch_size + last_batch_size)
        helper_dataset = helper_dataset.batch(min_last_batch_size,
                                              drop_remainder=False)

        # If `padded_shape` is unspecified, all dimensions of all components
        # are padded to the maximum size in the batch.
        # The second batch in `helper_dataset` will now contain `min_last_batch_size - last_batch_size`
        # default-initialized samples.
        helper_dataset = helper_dataset.padded_batch(2)

        # Switch back to a list of samples instead of batches
        helper_dataset = helper_dataset.unbatch().unbatch()

        # Remaining samples in the dataset are those generated through padding
        padding_samples = helper_dataset.skip(min_last_batch_size +
                                              last_batch_size)
        dataset = dataset.concatenate(padding_samples)
        logger.info(f"[Rank {tnt.get_rank()}] Dataset padded with " \
                    f"{min_last_batch_size - last_batch_size} samples.")
    return dataset
Example #21
0
def _get_num_samples(dataset):
    """Return the number of elements in `dataset`.

    The cardinality reported by TensorFlow is used when it is known; an
    infinite dataset yields `tf.data.experimental.INFINITE_CARDINALITY`.
    When the cardinality is unknown, the dataset is iterated once and its
    elements are counted.

    Args:
        dataset: the dataset to inspect.

    Returns:
        The number of elements, or the infinite-cardinality constant.
    """
    infinite = tf.data.experimental.INFINITE_CARDINALITY
    unknown = tf.data.experimental.UNKNOWN_CARDINALITY
    cardinality = tf.data.experimental.cardinality(dataset)

    if cardinality == infinite:
        logger.debug("Infinite dataset detected.")
        return infinite

    if cardinality != unknown:
        known_size = cardinality.numpy()
        logger.debug("Dataset size is %d" % (known_size))
        return known_size

    # Fallback: a full pass over the data is the only way to get the size
    logger.debug("Unknown dataset size. Counting samples...")
    dataset_size = sum(1 for _ in dataset)
    logger.debug("Dataset size is %d" % (dataset_size))
    return dataset_size
Example #22
0
 def _(self, keras_callback: tf.keras.callbacks.LearningRateScheduler):
   """Customize a `LearningRateScheduler` callback.

   Unless output is enabled on all devices, `self.verbose` is set to 0 on
   every rank that is not the master rank of `self.group`.

   Args:
     keras_callback: the original Keras callback (not read here).
   """
   logger.debug("[DataParallel] LearningRateScheduler callback")
   # Short-circuit keeps the master-rank query off when output is global
   if tnt.global_tnt_config.output_on_all_devices or tnt.is_group_master_rank(self.group):
     return
   self.verbose = 0
Example #23
0
 def on_train_batch_begin(self, batch, logs=None):
   """Update the optimizer's scaling factor at the start of a batch.

   The factor for `batch` is looked up in `self._scaling_factor_table` and
   written to `self.model.optimizer.scaling_factor` only when it differs from
   the current value.

   Args:
     batch: index of the batch that is about to start.
     logs: unused.
   """
   scaling_factor = get_scaling_factor_by_iteration(batch, self._scaling_factor_table)
   needs_update = scaling_factor != self.model.optimizer.scaling_factor
   if needs_update:
     logger.debug(f"[Rank {tnt.get_rank()}] Setting scaling factor to {scaling_factor}")
     K.set_value(self.model.optimizer.scaling_factor, scaling_factor)
Example #24
0
 def _(self, keras_callback: tf.keras.callbacks.ProgbarLogger):
   """Customize a `ProgbarLogger` callback.

   Logs the callback type and delegates to `_customize_progbar_logger`.

   Args:
     keras_callback: the original Keras callback (not read here).
   """
   logger.debug("[DataParallel] ProgbarLogger callback")
   _customize_progbar_logger(self)
Example #25
0
 def _(self, keras_callback: tf.keras.callbacks.ReduceLROnPlateau):
   """Customize a `ReduceLROnPlateau` callback.

   The master rank of `self.group` keeps the verbosity of `keras_callback`;
   all other ranks are silenced.

   Args:
     keras_callback: the original Keras callback (source of `verbose`).
   """
   logger.debug("[DataParallel] ReduceLROnPlateau callback")
   # only master rank should print messages
   if tnt.is_group_master_rank(self.group):
     self.verbose = keras_callback.verbose
   else:
     self.verbose = utilities.TF_verbose.SILENT.value
Example #26
0
 def _(self, keras_callback: tf.keras.callbacks.RemoteMonitor):
   """Handle a `RemoteMonitor` callback: only a debug message is logged.

   Args:
     keras_callback: the original Keras callback (not read here).
   """
   logger.debug("[DataParallel] RemoteMonitor callback")
Example #27
0
 def _(self, keras_callback: tf.keras.callbacks.TerminateOnNaN):
   """Handle a `TerminateOnNaN` callback: only a debug message is logged.

   Args:
     keras_callback: the original Keras callback (not read here).
   """
   logger.debug("[DataParallel] TerminateOnNaN callback")
Example #28
0
 def customize_callback(self, keras_callback: tf.keras.callbacks.Callback) -> None:
   """Handle a generic callback: only a debug message is logged.

   NOTE(review): presumably the fallback for callback types without a more
   specific handler — the dispatch mechanism is not visible here; confirm.

   Args:
     keras_callback: the original Keras callback (not read here).
   """
   logger.debug("[DataParallel] Generic callback")
Example #29
0
 def _(self, keras_callback: tf.keras.callbacks.CSVLogger):
   """Handle a `CSVLogger` callback: only a debug message is logged.

   Args:
     keras_callback: the original Keras callback (not read here).
   """
   logger.debug("[DataParallel] CSVLogger callback")
Example #30
0
 def _(self, keras_callback: tf.keras.callbacks.History):
   """Handle a `History` callback: only a debug message is logged.

   Args:
     keras_callback: the original Keras callback (not read here).
   """
   logger.debug("[DataParallel] History callback")