Example #1
  def compile(self,
              optimizer='rmsprop',
              loss=None,
              metrics=None,
              loss_weights=None,
              sample_weight_mode=None,
              weighted_metrics=None,
              **kwargs):
    self.done_broadcast = False
    self.compiled = True
    logger.info("[DataParallelModel] compile.")
    if isinstance(optimizer, dict):
      optimizer = tf.keras.optimizers.deserialize(optimizer)
    elif isinstance(optimizer, str):
      config = {'class_name': optimizer, 'config': {}}
      optimizer = tf.keras.optimizers.deserialize(config)
    self.dist_optimizer = tnt.distributed_optimizers.SynchDistributedOptimizer(optimizer, group = self.group)

    kwargs = self._preprocess_compile_kwargs(kwargs)
    return self.model.compile(optimizer = self.dist_optimizer,
                              loss = loss,
                              metrics = metrics,
                              loss_weights = loss_weights,
                              sample_weight_mode = sample_weight_mode,
                              weighted_metrics = weighted_metrics,
                              **kwargs)
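
The `compile` override above is what a data-parallel `tnt.Model` runs under the hood: it deserializes the optimizer, wraps it in a `SynchDistributedOptimizer`, and forwards everything else to Keras. A minimal usage sketch, assuming the conventional `import tarantella as tnt` entry point and the `tnt.Model` wrapper shown in the later examples (layer sizes are arbitrary):

import tensorflow as tf
import tarantella as tnt

keras_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(32,)),
    tf.keras.layers.Dense(10)])

model = tnt.Model(keras_model, parallel_strategy=tnt.ParallelStrategy.DATA)
# a string optimizer is deserialized and wrapped into a
# SynchDistributedOptimizer before being handed to keras.Model.compile
model.compile(optimizer='sgd',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
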
Example #2
    def distributed_batch(self, dataset, batch_size, micro_batch_size,
                          apply_batch):
        if self.batching_info.drop_remainder:
            dataset = self.batching_info.apply(dataset,
                                               new_batch_size=batch_size)
            dataset = dataset.unbatch()
            self._dataset = dataset
        else:
            self._dataset = dataset
            # pad final incomplete batch to have at least `num_ranks` samples, such that
            # each rank will have the same number of iterations within one epoch
            dataset = ds_helpers._pad_dataset_if_necessary(
                dataset,
                self.num_samples,
                batch_size,
                min_last_batch_size=self.num_ranks)

        dataset = self._get_dataset_slice_per_rank(dataset, batch_size,
                                                   micro_batch_size)

        if apply_batch:
            dataset = self.batching_info.apply(dataset,
                                               new_batch_size=micro_batch_size)
            logger.info(
                f"Using batch size = {batch_size}, micro batch size = {micro_batch_size}."
            )
        return dataset
Example #3
  def fit(self,
          x = None,
          y = None,
          callbacks = None,
          validation_data = None,
          **kwargs):
    logger.info(f"[PartitionedModel] fit.")
    self._configure_rebuild(dataset = x)
    self._build_model_and_compile_if_necessary()
    processed_callbacks = utilities._preprocess_callbacks(callbacks, self.group,
                                                          parallel_strategy = tnt.ParallelStrategy.PIPELINING,
                                                          exec_type = 'fit',
                                                          verbose = kwargs.get('verbose', None))

    ds = self._get_microbatched_dataset(dataset = x, nano_batch_size = self.nano_batch_size,
                                        num_pipeline_stages = self.num_pipeline_stages)

    distributed_validation_data = None
    if validation_data:
      distributed_validation_data = self._get_microbatched_dataset(dataset = validation_data,
                                                                   nano_batch_size = self.nano_batch_size,
                                                                   num_pipeline_stages = self.num_pipeline_stages)

    return self.model.fit(x = ds, callbacks = processed_callbacks,
                          validation_data = distributed_validation_data,
                          **kwargs)
Example #4
def load_model(filepath, compile=True, **kwargs):
    logger.debug("Load model from file: {}".format(filepath))
    keras_model = tf.keras.models.load_model(filepath,
                                             compile=compile,
                                             **kwargs)
    # FIXME load models with any type of parallelization strategy
    logger.warning("Loading model with the default `data parallel` strategy.")
    tnt_model = tnt.Model(keras_model,
                          parallel_strategy=tnt.ParallelStrategy.DATA)
    if compile:
        try:
            tnt_optimizer = tnt.distributed_optimizers.SynchDistributedOptimizer(
                keras_model.optimizer, group=tnt_model.group)
            tnt_model.dist_optimizer = tnt_optimizer
            tnt_model._set_internal_optimizer(tnt_model.dist_optimizer)
            tnt_model.compiled = True
            tnt_model.done_broadcast = True

            if version_utils.tf_version_below_equal('2.1'):
                tnt_model.model._experimental_run_tf_function = False
                logger.info("Set `experimental_run_tf_function` to False.")
        except Exception:
            logger.info("The loaded model was not pre-compiled.")
    tnt_model.barrier.execute()
    return tnt_model
Example #5
  def evaluate(self,
               x = None,
               y = None,
               callbacks = None,
               tnt_micro_batch_size = None,
               tnt_distribute_dataset = True,
               **kwargs):
    self._setup_for_execution('evaluate', x, y, kwargs)
    processed_callbacks = utilities._preprocess_callbacks(callbacks, self.group,
                                                          parallel_strategy = tnt.ParallelStrategy.DATA,
                                                          exec_type = 'evaluate',
                                                          verbose = kwargs.get('verbose', None))

    if tnt_distribute_dataset:
      test_dataset = tnt.data.Dataset(dataset = x,
                                      num_ranks = self.group.size,
                                      rank = self.group.to_group_rank(self.rank),
                                      shuffle_seed = self.default_shuffle_seed)
      x = test_dataset.distribute_dataset_across_ranks(
              user_micro_batch_size = tnt_micro_batch_size,
              is_training = False)
      self._validate_micro_batch_size_for_batch_normalization(test_dataset.micro_batch_size)
    else:
      logger.info("Automatic dataset distribution is disabled.")

    return self.model.evaluate(x, callbacks = processed_callbacks, **kwargs)
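
A sketch of calling the distributed `evaluate` above with its `tnt_*` keyword arguments, reusing the compiled `model` from the sketch after Example #1; the data is a random stand-in and the global batch size is assumed to be a multiple of the number of ranks:

import numpy as np
import tensorflow as tf

features = np.random.rand(640, 32).astype('float32')
labels = np.random.randint(0, 10, size=(640,)).astype('int32')
# batch with the *global* batch size; each rank evaluates its own micro-batches
test_dataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(64)

results = model.evaluate(test_dataset,
                         tnt_micro_batch_size=None,    # derived from the number of ranks
                         tnt_distribute_dataset=True)  # shard the dataset automatically
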
Example #6
 def shuffle_with_seed(self, dataset, ds_kwargs):
     if 'seed' not in ds_kwargs or ds_kwargs['seed'] is None:
         logger.info(
             f"Shuffling with fixed shuffle seed {self.shuffle_seed}")
         ds_kwargs['seed'] = self.shuffle_seed
     else:
         logger.info(f"Shuffling with shuffle seed {ds_kwargs['seed']}")
     return dataset.shuffle(**ds_kwargs)
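
The helper above pins the shuffle seed so that every rank sees the dataset in the same order before it is sharded. A standalone `tf.data` sketch of the same idea (buffer size and seed are arbitrary):

import tensorflow as tf

dataset = tf.data.Dataset.range(10)
# with an explicit seed, every process executing this line produces the same
# order, so the per-rank shards taken afterwards stay disjoint and consistent
shuffled = dataset.shuffle(buffer_size=10, seed=42)
print(list(shuffled.as_numpy_iterator()))
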
Example #7
def save_model(model, filepath, **kwargs):
  if isinstance(model, tnt.Model):
    logger.info("save model from instance of tnt.Model")
  elif isinstance(model, tf.keras.Model):
    logger.info("save model from instance of tf.keras.Model")
  else:
    raise ValueError("[tnt.models.save_model] `model` needs to be either",
                     "a `tf.keras.Model`, or a `tnt.Model`")
  model.save(filepath, **kwargs)
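
A save/load round trip sketch combining the helper above with the `load_model` from Example #4; the log prefixes suggest both are exposed under `tnt.models`, which is assumed here, and the path is hypothetical:

import tarantella as tnt

# `model` is a compiled tnt.Model, as in the sketch after Example #1
tnt.models.save_model(model, "saved_tnt_model")

# reloading restores the underlying Keras model, re-wraps it with the default
# data-parallel strategy and, with compile=True, re-attaches a
# SynchDistributedOptimizer if the saved model had been compiled
restored = tnt.models.load_model("saved_tnt_model", compile=True)
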
Example #8
 def from_config(cls, config, **kwargs):
   try:
     keras_model = tf.keras.Model.from_config(config, **kwargs)
     logger.info("Loaded model from `keras.Model`.")
   except Exception:
     raise RuntimeError("[tnt.Model.from_config] Cannot load model; "
                        "provided configuration is neither a `keras.Model` "
                        "nor a `tnt.Model`.")
   return cls(keras_model)
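
A config round trip sketch for the classmethod above; `get_config` comes from the underlying Keras model, and a functional model is used because `tf.keras.Model.from_config` expects a functional-style config:

import tensorflow as tf
import tarantella as tnt

inputs = tf.keras.Input(shape=(16,))
outputs = tf.keras.layers.Dense(4)(inputs)
config = tf.keras.Model(inputs, outputs).get_config()

# rebuilds the Keras model from its config and wraps it as a tnt.Model
rebuilt = tnt.Model.from_config(config)
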
Example #9
def clone_model(model, **kwargs):
  if isinstance(model, tnt.Model):
    keras_model = tf.keras.models.clone_model(model.model, **kwargs)
    logger.info("clone model from instance of tnt.Model")
  elif isinstance(model, tf.keras.Model):
    keras_model = tf.keras.models.clone_model(model, **kwargs)
    logger.info("clone model from instance of tf.keras.Model")
  else:
    raise ValueError("[tnt.models.clone_model] `model` needs to be either",
                     "a `tf.keras.Model`, or a `tnt.Model`")
  return tnt.Model(keras_model)
Example #10
  def _build_model_and_compile_if_necessary(self):
    if self.built:
      logger.info(f"[PartitionedModel] Model already built with nano_batch_size={self.nano_batch_size}")
      return

    logger.info(f"[PartitionedModel] Building pipelined model with nano_batch_size={self.nano_batch_size}")
    self.microbatched_model_builder = self._get_microbatched_model_builder(self.nano_batch_size)
    self.model = self.microbatched_model_builder.get_model()

    compile_parameters = self._get_partition_compile_params()
    self.model.compile(**compile_parameters)
    self.built = True
Example #11
def clone_model(model, **kwargs):
    if isinstance(model, tnt.strategy.parallel_model.ParallelModel):
        keras_model = tf.keras.models.clone_model(model.model, **kwargs)
        logger.info("clone model from instance of tnt.Model")
    elif isinstance(model, tf.keras.Model):
        keras_model = tf.keras.models.clone_model(model, **kwargs)
        logger.info("clone model from instance of tf.keras.Model")
    else:
        raise ValueError("[tnt.models.clone_model] `model` needs to be either",
                         "a `tf.keras.Model`, or a `tnt.Model`")
    # FIXME load models with any type of parallelization strategy
    logger.warning("Loading model with the default `data parallel` strategy.")
    return tnt.Model(keras_model, parallel_strategy=tnt.ParallelStrategy.DATA)
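
A sketch of the clone helper in use, assuming it is exposed as `tnt.models.clone_model` as its error-message prefix suggests; the clone gets freshly initialized weights and must be compiled anew:

import tarantella as tnt

# `model` is the tnt.Model from the sketch after Example #1
cloned = tnt.models.clone_model(model)
cloned.compile(optimizer='sgd',
               loss='sparse_categorical_crossentropy',
               metrics=['accuracy'])
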
Example #12
 def from_config(cls, config, **kwargs):
     try:
         keras_model = tf.keras.Sequential.from_config(config, **kwargs)
         logger.info("Loaded model from `keras.Sequential`.")
     except Exception:
         raise RuntimeError(
             "[tnt.keras.Sequential.from_config] Cannot load model; "
             "provided configuration is not a `keras.Sequential` model.")
     # FIXME load models with any type of parallelization strategy
     logger.warning(
         "Loading model with the default `data parallel` strategy.")
     return tnt.Model(keras_model,
                      parallel_strategy=tnt.ParallelStrategy.DATA)
Example #13
    def fit(self,
            x=None,
            y=None,
            callbacks=None,
            validation_data=None,
            tnt_micro_batch_size=None,
            tnt_validation_micro_batch_size=None,
            tnt_distribute_dataset=True,
            tnt_distribute_validation_dataset=True,
            **kwargs):
        self._setup_for_execution('fit', x, y, callbacks, kwargs)

        if tnt_distribute_dataset:
            distributed_x = ds.DistributedDataset(
                dataset=x,
                num_ranks=self.comm_size,
                rank=self.rank,
                shuffle_seed=self.default_shuffle_seed)
            x = distributed_x.distribute_dataset_across_ranks(
                user_micro_batch_size=tnt_micro_batch_size, is_training=True)
        else:
            logger.info(
                "Automatic dataset distribution is disabled."
                "Make sure the dataset is sharded manually across ranks.")

        # Always switch off shuffling
        kwargs["shuffle"] = False

        if validation_data:
            if tnt_distribute_validation_dataset:
                distributed_validation_data = ds.DistributedDataset(
                    dataset=validation_data,
                    num_ranks=self.comm_size,
                    rank=self.rank,
                    shuffle_seed=self.default_shuffle_seed)
                validation_data = distributed_validation_data.distribute_dataset_across_ranks(
                    user_micro_batch_size=tnt_validation_micro_batch_size,
                    is_training=False)
            else:
                logger.info(
                    "Automatic distribution for the validation dataset is disabled."
                )

        return self.model.fit(x,
                              validation_data=validation_data,
                              callbacks=callbacks,
                              **kwargs)
Example #14
def _pad_dataset_if_necessary(dataset, num_samples, batch_size,
                              min_last_batch_size):
    last_batch_size = _get_last_incomplete_batch_size(num_samples, batch_size)
    if last_batch_size == 0:
        logger.debug(f"No padding required: number of samples {num_samples} is a multiple " \
                     f"of the batch size {batch_size}.")
        return dataset

    logger.info(f"Incomplete last batch in the dataset: number of samples is " \
                f"{last_batch_size} ( != batch size {batch_size}).")

    if version_utils.tf_version_below_equal('2.1'):
        num_samples_multiple = num_samples - last_batch_size
        logger.warn(f"Number of samples ({num_samples}) is not a multiple of batch size. " \
                    f"This use case is not supported in TF v{version_utils.current_version()}. " \
                    f"Dropping the last incomplete batch from the dataset, "\
                    f"and proceeding with {num_samples_multiple} samples.")
        return dataset.take(num_samples_multiple)

    if last_batch_size < min_last_batch_size:
        logger.debug(f"Padding required for the last batch: number of samples is " \
                     f"{last_batch_size} ( < min_batch_size {min_last_batch_size}).")

        # Create helper dataset that contains one full batch and one incomplete batch
        helper_dataset = dataset.take(min_last_batch_size + last_batch_size)
        helper_dataset = helper_dataset.batch(min_last_batch_size,
                                              drop_remainder=False)

        # If `padded_shape` is unspecified, all dimensions of all components
        # are padded to the maximum size in the batch.
        # The second batch in `helper_dataset` will now contain `min_last_batch_size - last_batch_size`
        # default-initialized samples.
        helper_dataset = helper_dataset.padded_batch(2)

        # Switch back to a list of samples instead of batches
        helper_dataset = helper_dataset.unbatch().unbatch()

        # Remaining samples in the dataset are those generated through padding
        padding_samples = helper_dataset.skip(min_last_batch_size +
                                              last_batch_size)
        dataset = dataset.concatenate(padding_samples)
        logger.info(f"[Rank {tnt.get_rank()}] Dataset padded with " \
                    f"{min_last_batch_size - last_batch_size} samples.")
    return dataset
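
The `batch` / `padded_batch` / `unbatch` sequence above is easiest to follow on a toy dataset. A standalone `tf.data` sketch (sizes are arbitrary; as in the code above, this path needs a TF version newer than 2.1, where `padded_batch` may be called without explicit padded shapes):

import tensorflow as tf

num_samples, batch_size, num_ranks = 10, 4, 4    # toy sizes
dataset = tf.data.Dataset.range(num_samples)

last_batch_size = num_samples % batch_size       # 2 leftover samples
# take one rank-sized batch plus the incomplete remainder ...
helper = dataset.take(num_ranks + last_batch_size)
helper = helper.batch(num_ranks, drop_remainder=False)
# ... pad the smaller of the two batches up to num_ranks samples ...
helper = helper.padded_batch(2)
# ... and keep only the zero-padded (fake) samples
padding = helper.unbatch().unbatch().skip(num_ranks + last_batch_size)

padded = dataset.concatenate(padding)
print(list(padded.as_numpy_iterator()))
# [0, 1, ..., 9, 0, 0] -> the last batch now holds num_ranks samples
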
Example #15
    def predict(self,
                x=None,
                callbacks=None,
                tnt_micro_batch_size=None,
                tnt_distribute_dataset=True,
                **kwargs):
        self._setup_for_execution('predict', x, None, callbacks, kwargs)

        if tnt_distribute_dataset:
            test_dataset = ds.DistributedDataset(
                dataset=x,
                num_ranks=self.comm_size,
                rank=self.rank,
                shuffle_seed=self.default_shuffle_seed)
            x = test_dataset.distribute_dataset_across_ranks(
                user_micro_batch_size=tnt_micro_batch_size, is_training=False)
        else:
            logger.info("Automatic dataset distribution is disabled.")
        return self.model.predict(x, callbacks=callbacks, **kwargs)
Example #16
 def compile(self,
             optimizer='rmsprop',
             loss=None,
             metrics=None,
             loss_weights=None,
             sample_weight_mode=None,
             weighted_metrics=None,
             **kwargs):
   self.built = True
   params = dict(locals())
   logger.info(f"[PartitionedModel] compile.")
   self.compile_properties = CompileProperties(self.model, params)
   return self.model.compile(optimizer,
                             loss,
                             metrics,
                             loss_weights,
                             sample_weight_mode,
                             weighted_metrics,
                             **kwargs)
Example #17
  def distributed_batch(self, dataset, batch_size, micro_batch_size):
    if self.batching_info.drop_remainder:
      dataset = self.batching_info.apply(dataset, new_batch_size = batch_size)
      dataset = dataset.unbatch()

    else: # no drop remainder
      num_samples = ds_helpers.get_num_samples(dataset)
      if num_samples == tf.data.experimental.INFINITE_CARDINALITY:
        raise ValueError("[DistributedDataset] Infinite dataset provided")

      # Total number of samples is not multiple of the batch size
      if num_samples % batch_size != 0:
        logger.warn("Number of samples ({}) is not a multiple of batch size.\
 Removing the last incomplete batch from the dataset.".format(num_samples))
        num_samples_multiple = (num_samples // batch_size) * batch_size
        dataset = dataset.take(num_samples_multiple)

    dataset = self.batching_info.apply(dataset, new_batch_size = micro_batch_size)
    dataset = dataset.shard(num_shards=self.num_ranks, index = self.rank)

    logger.info("Using batch size = {}, micro batch size = {}.".format(
                batch_size, micro_batch_size))
    return dataset
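
A plain `tf.data` sketch of what the micro-batching and sharding above boil down to for one rank (toy sizes; the incomplete-batch handling from the code above is omitted):

import tensorflow as tf

num_ranks, rank = 4, 0              # values for one of four processes
batch_size = 8
micro_batch_size = batch_size // num_ranks

dataset = tf.data.Dataset.range(32)
# batch at the per-rank micro-batch size, then let each rank keep every
# num_ranks-th micro-batch: together the ranks cover one global batch per step
dataset = dataset.batch(micro_batch_size)
dataset = dataset.shard(num_shards=num_ranks, index=rank)
print(list(dataset.as_numpy_iterator()))
# rank 0 sees the micro-batches [0 1], [8 9], [16 17], [24 25]
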
Example #18
    def _create_tnt_model(cls, model: tf.keras.Model,
                          parallel_strategy: tnt.ParallelStrategy = tnt.ParallelStrategy.ALL if TF_DEFAULT_PIPELINING_FLAG \
                                                                                             else tnt.ParallelStrategy.DATA,
                          num_pipeline_stages: int = 1):
        replica_group = tnt.Group()

        if (tnt.ParallelStrategy.PIPELINING
                in parallel_strategy) and isinstance(model,
                                                     tf.keras.Sequential):
            logger.warn(
                f"Cannot pipeline a `tf.keras.Sequential` model; disabling model parallelism."
            )
            parallel_strategy = parallel_strategy ^ tnt.ParallelStrategy.PIPELINING

        logger.info(f"Creating parallel model using {parallel_strategy}.")
        if tnt.ParallelStrategy.PIPELINING in parallel_strategy:
            rank = tnt.get_rank()

            partition_generator = pgen.GraphPartitionGenerator(model)
            rank_mapper = rmapper.RankMapper(
                num_ranks=tnt.get_size(),
                pipeline_graph=partition_generator.get_pipeline_graph())
            pipeline_group = rank_mapper.get_pipelining_group_for_rank(rank)

            logger.info(
                f"[Pipelining] Creating pipelined model with {pipeline_group.size} partitions."
            )
            # get my partition
            model = pm.PartitionedModel(
                model=model,
                group=pipeline_group,
                partition_generator=partition_generator,
                rank_mapper=rank_mapper,
                num_pipeline_stages=num_pipeline_stages)
            if tnt.ParallelStrategy.DATA in parallel_strategy:
                replica_group = rank_mapper.get_replica_group_for_rank(rank)
            else:
                if pipeline_group.size != tnt.get_size():
                    raise ValueError(
                        f"Provided model has only {pipeline_group.size} partitions; use {pipeline_group.size} ranks or a different parallel strategy."
                    )

        if tnt.ParallelStrategy.DATA in parallel_strategy:
            # replicate my partition across the data parallel group
            logger.info(
                f"[DataParallel] Replicating local model across ranks {replica_group.group}."
            )
            model = dpm.DataParallelModel(model=model, group=replica_group)
        return model
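
A sketch of requesting both strategies at once; it assumes `tnt.Model` forwards `parallel_strategy` and `num_pipeline_stages` to the factory above, and it uses a functional (non-`Sequential`) model, since the code warns that `Sequential` models fall back to pure data parallelism:

import tensorflow as tf
import tarantella as tnt

inputs = tf.keras.Input(shape=(32,))
hidden = tf.keras.layers.Dense(64, activation='relu')(inputs)
outputs = tf.keras.layers.Dense(10)(hidden)
functional_model = tf.keras.Model(inputs, outputs)

# ParallelStrategy behaves like a flag enum: ALL combines DATA and PIPELINING,
# and the factory strips PIPELINING again when it cannot partition the model
model = tnt.Model(functional_model,
                  parallel_strategy=tnt.ParallelStrategy.ALL,
                  num_pipeline_stages=2)
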
Example #19
  def fit(self,
          x = None,
          y = None,
          callbacks = None,
          validation_data = None,
          tnt_micro_batch_size = None,
          tnt_validation_micro_batch_size = None,
          tnt_distribute_dataset = True,
          tnt_distribute_validation_dataset = True,
          **kwargs):
    self._setup_for_execution('fit', x, y, kwargs)
    processed_callbacks = utilities._preprocess_callbacks(callbacks, self.group,
                                                          parallel_strategy = tnt.ParallelStrategy.DATA,
                                                          exec_type = 'fit',
                                                          verbose = kwargs.get('verbose', None))

    if tnt_distribute_dataset:
      # Distribute dataset into micro-batches among ranks by taking into account
      # all possible cases of splitting the dataset:
      #
      # 1. Batch size
      # a. `batch_size` is a multiple of the number of ranks
      #     => identical `micro_batch_size` for all ranks
      # b. `batch_size` is not a multiple of the number of ranks
      #     => different ranks have different `micro_batch_size`s and
      #        locally computed gradients need to be scaled by a factor to
      #        account for the differences
      # c. `batch_size` < number of ranks
      #     => raise Error
      #
      # 2. Last batch within epoch
      # a. the last batch in the dataset is incomplete, but dataset is batched
      #    with `drop_remainder = True`
      #     => the last batch is dropped
      # b. the last batch in the dataset is incomplete with `drop_remainder = False`
      #     - number of samples in the last batch is smaller than `num_ranks`,
      #         => pad the dataset with a number of zeroed samples to ensure that each rank
      #            has one sample, so that they all see the same number of iterations in an epoch;
      #            the fake samples will be filtered out from the final gradient computation by
      #            assigning them `micro_batch_size = 0`
      #     - number of samples in the last batch is >= `num_ranks`
      #         => last batch can be considered a new `batch_size`, which will be handled as above (in 1.),
      #            both for computing the `micro_batch_size` and the `scaling_factor`
      distributed_x = tnt.data.Dataset(dataset = x,
                                       num_ranks = self.group.size,
                                       rank = self.group.to_group_rank(self.rank),
                                       shuffle_seed = self.default_shuffle_seed)
      x = distributed_x.distribute_dataset_across_ranks(
            user_micro_batch_size = tnt_micro_batch_size,
            is_training = True)
      self._validate_micro_batch_size_for_batch_normalization(distributed_x.micro_batch_size)

      # if different ranks have different micro-batch sizes, the gradients need rescaling
      dataset_callback = distributed_x.get_gradient_scaling_callback()
      if dataset_callback:
        processed_callbacks.append(dataset_callback)

    else:
      logger.info("Automatic dataset distribution is disabled."
                  "Make sure the dataset is sharded manually across ranks.")

    # Always switch off shuffling
    kwargs["shuffle"] = False

    if validation_data:
      if tnt_distribute_validation_dataset:
        distributed_validation_data = tnt.data.Dataset(dataset = validation_data,
                                                       num_ranks = self.group.size,
                                                       rank = self.group.to_group_rank(self.rank),
                                                       shuffle_seed = self.default_shuffle_seed)
        validation_data = distributed_validation_data.distribute_dataset_across_ranks(
              user_micro_batch_size = tnt_validation_micro_batch_size,
              is_training = False)
        self._validate_micro_batch_size_for_batch_normalization(distributed_validation_data.micro_batch_size)
      else:
        logger.info("Automatic distribution for the validation dataset is disabled.")

    return self.model.fit(x = x,
                          validation_data = validation_data,
                          callbacks = processed_callbacks,
                          **kwargs)
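
A sketch of a data-parallel training call using the `tnt_*` keyword arguments defined above, reusing the compiled `model` from the sketch after Example #1; the data is a random stand-in and the global batch size is a multiple of the number of ranks (case 1.a in the comment):

import numpy as np
import tensorflow as tf

features = np.random.rand(512, 32).astype('float32')
labels = np.random.randint(0, 10, size=(512,)).astype('int32')
# batch with the *global* batch size; each rank trains on micro-batches
# of size batch_size // num_ranks
train_dataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(64)

model.fit(train_dataset,
          epochs=2,
          tnt_micro_batch_size=None,     # let the wrapper derive it from the ranks
          tnt_distribute_dataset=True)   # shard and micro-batch automatically
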
Example #20
 def _preprocess_compile_kwargs(self, kwargs):
   if version_utils.tf_version_below_equal('2.1'):
     kwargs['experimental_run_tf_function'] = False
     logger.info("Set `experimental_run_tf_function` to False.")
   return kwargs