Example #1
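# Assumed imports for this snippet; `logging` is taken to be absl's logging
# (it provides set_verbosity), while `is_third_party_allreduce`, `compat`, and
# `distribution_utils` are project-specific helpers defined elsewhere in the
# same codebase and are not shown here.
from absl import logging

import tensorflow as tf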
def handle_distribution_strategy(distribution_strategy):
    """ Create distribution strategy. """
    strategy = None
    if distribution_strategy:
        strategy = distribution_strategy
        if isinstance(distribution_strategy, dict):
            strategy = distribution_strategy.get("distribution_strategy", None)
        if isinstance(strategy, str):
            strategy = strategy.lower()
        if is_third_party_allreduce(strategy):
            if strategy == "horovod":
                import horovod.tensorflow.keras as hvd
            else:
                import byteps.tensorflow.keras as hvd
            logging.info("import {} as hvd backend.".format(strategy))
            hvd.init()
            # Horovod: pin GPU to be used to process local rank (one GPU per process)
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            if gpus:
                tf.config.experimental.set_visible_devices(
                    gpus[hvd.local_rank()], 'GPU')
            compat.register_distributed_worker_setting(hvd.rank(), hvd.size(),
                                                       strategy)
            if hvd.rank() != 0:
                logging.set_verbosity(logging.ERROR)
        else:
            if isinstance(distribution_strategy, str):
                strategy = distribution_utils.get_distribution_strategy(
                    distribution_strategy=distribution_strategy)
            elif isinstance(distribution_strategy, dict):
                strategy = distribution_utils.get_distribution_strategy(
                    **distribution_strategy)

    if strategy is None:
        logging.info("No distribution strategy was used.")
    else:
        try:
            logging.info(
                "Using distribution strategy: {} with num_replicas_in_sync={}".
                format(strategy, strategy.num_replicas_in_sync))
        except Exception:
            pass
    return strategy
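
A minimal usage sketch for the function above, assuming it lives in a module where the referenced helpers are importable; the config dict and the Keras model below are purely illustrative:

import contextlib

import tensorflow as tf

config = {"distribution_strategy": "mirrored"}  # illustrative config
strategy = handle_distribution_strategy(config)

# tf.distribute strategies expose scope(); the Horovod/BytePS path returns the
# backend name as a plain string, so fall back to a null context in that case.
scope = strategy.scope() if hasattr(strategy, "scope") else contextlib.nullcontext()
with scope:
    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    model.compile(optimizer="sgd", loss="mse")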
Example #2
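# Assumed imports for this snippet; `tpu_initialize`, `_collective_communication`,
# `_mirrored_cross_device_ops`, and `register_distributed_worker_setting` are
# helpers defined elsewhere in the same module and are not shown here.
import json
import os

import tensorflow as tf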
def get_distribution_strategy(distribution_strategy="mirrored",
                              num_gpus=0,
                              worker_hosts=None,
                              task_index=-1,
                              all_reduce_alg="nccl",
                              num_packs=1,
                              tpu_address=None):
    """Return a DistributionStrategy for running the model.

    Args:
      distribution_strategy: a string specifying which distribution strategy to
        use. Accepted values are 'none', 'default', 'mirrored',
        'multi_worker_mirrored', and 'tpu' -- case insensitive. 'none' (as well
        as None or an empty string) means not to use a Distribution Strategy;
        'default' behaves like 'mirrored'; 'tpu' means to use TPUStrategy
        connecting to `tpu_address`.
      num_gpus: Number of GPUs to run this model.
      worker_hosts: The worker hosts for 'multi_worker_mirrored'.
      task_index: The task index for 'multi_worker_mirrored'.
      all_reduce_alg: Optional. Specifies which algorithm to use when performing
        all-reduce. For `MirroredStrategy`, valid values are "nccl" and
        "hierarchical_copy". For `MultiWorkerMirroredStrategy`, valid values are
        "ring" and "nccl".  If None, DistributionStrategy will choose based on
        device topology.
      num_packs: Optional.  Sets the `num_packs` in `tf.distribute.NcclAllReduce`
        or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`.
      tpu_address: Optional. String that represents TPU to connect to. Must not
        be None if `distribution_strategy` is set to `tpu`.
    Returns:
      A tf.distribute.DistributionStrategy object, or None if no strategy is
        used.
    Raises:
      ValueError: if `num_gpus` is negative, if `task_index` is not specified
        when there is more than one worker, if `distribution_strategy` is not
        one of the accepted values, or if `distribution_strategy` is 'tpu' but
        `tpu_address` is not specified.
    """
    if num_gpus == 0:
        num_gpus = int(os.environ.get("WORKER_GPUS", '0'))
    if num_gpus < 0:
        raise ValueError("`num_gpus` can not be negative.")
    if (distribution_strategy is None
            or distribution_strategy.lower() == "none"
            or distribution_strategy == ""):
        return None
    distribution_strategy = distribution_strategy.lower()

    if distribution_strategy == "tpu":
        # When tpu_address is an empty string, we communicate with local TPUs.
        cluster_resolver = tpu_initialize(tpu_address)
        return tf.distribute.experimental.TPUStrategy(cluster_resolver)

    if distribution_strategy == "multi_worker_mirrored":
        if worker_hosts is None:
            worker_hosts = os.environ.get("WORKER_HOSTS", None)
            task_index = int(os.environ.get("TASK_ID", -1))
        assert worker_hosts, (
            "worker_hosts must be provided when using 'multi_worker_mirrored'."
        )

        workers = worker_hosts.split(',')
        if len(workers) > 1 and task_index < 0:
            raise ValueError(
                'Must specify task_index when number of workers > 1')
        task_index = 0 if len(workers) == 1 else task_index
        register_distributed_worker_setting(worker_id=task_index,
                                            num_workers=len(workers),
                                            strategy="multi_worker_mirrored")
        os.environ['TF_CONFIG'] = json.dumps({
            'cluster': {
                'worker': workers
            },
            'task': {
                'type': 'worker',
                'index': task_index
            }
        })
        strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
            communication=_collective_communication(all_reduce_alg))
        strategy.extended.experimental_enable_get_next_as_optional = True
        return strategy

    if distribution_strategy in ("mirrored", "default"):
        return tf.distribute.MirroredStrategy(
            cross_device_ops=_mirrored_cross_device_ops(
                all_reduce_alg, num_packs))

    raise ValueError("Unrecognized Distribution Strategy: %r" %
                     distribution_strategy)
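
A short usage sketch, assuming the helpers referenced above (`tpu_initialize`, `_collective_communication`, `_mirrored_cross_device_ops`, `register_distributed_worker_setting`) are defined in the same module and that GPUs are visible for NCCL; the argument values are illustrative:

import tensorflow as tf

# Build a local MirroredStrategy with NCCL all-reduce; requires visible GPUs.
strategy = get_distribution_strategy(distribution_strategy="mirrored",
                                     num_gpus=2,
                                     all_reduce_alg="nccl",
                                     num_packs=1)

with strategy.scope():
    # Variables created inside scope() are replicated across devices.
    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    model.compile(optimizer="sgd", loss="mse")

print("Replicas in sync:", strategy.num_replicas_in_sync)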