# Imports assumed by these snippets (TF 1.x contrib.distribute layout);
# CrossTowerOpsTestBase is defined earlier in the same test file.
from tensorflow.contrib.distribute.python import combinations
from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
from tensorflow.contrib.distribute.python import mirrored_strategy
from tensorflow.contrib.distribute.python import multi_worker_test_base


class MultiWorkerCrossTowerOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                   CrossTowerOpsTestBase):

    worker_devices = [
        "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
    ]
    multi_worker_allreduce_combinations = combinations.combine(
        cross_tower_ops=[
            combinations.NamedObject(
                "MultiWorkerAllReduce",
                cross_tower_ops_lib.MultiWorkerAllReduce(
                    worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
            combinations.NamedObject(
                "MultiWorkerAllReducePack",
                cross_tower_ops_lib.MultiWorkerAllReduce(
                    worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
            combinations.NamedObject(
                "MultiWorkerAllReduceAggregation",
                cross_tower_ops_lib.MultiWorkerAllReduce(
                    worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
            combinations.NamedObject(
                "MultiWorkerAllReduceMultipleSpecs",
                cross_tower_ops_lib.MultiWorkerAllReduce(
                    worker_devices, 2, [("pscpu/pscpu", 2, 100),
                                        ("xring", 2, -1)], 0, 0, 0)),
        ],
        distribution=[
            combinations.NamedDistribution(
                "MirroredCPU",
                lambda: mirrored_strategy.MirroredStrategy(num_gpus=0),
                required_gpus=0),
            combinations.NamedDistribution(
                "Mirrored1GPU",
                lambda: mirrored_strategy.MirroredStrategy(num_gpus=1),
                required_gpus=1),
            combinations.NamedDistribution(
                "Mirrored2GPUs",
                lambda: mirrored_strategy.MirroredStrategy(num_gpus=2),
                required_gpus=2),
        ],
        mode=["graph"])

    @combinations.generate(multi_worker_allreduce_combinations)
    def testReductionAndBroadcast(self, cross_tower_ops, distribution):
        distribution.configure(
            cluster_spec={
                "worker": [
                    "/job:worker/replica:0/task:0",
                    "/job:worker/replica:0/task:1"
                ]
            })
        with distribution.scope():
            self._testReductionAndBroadcast(cross_tower_ops, distribution)

    def configure(self,
                  session_config=None,
                  cluster_spec=None,
                  task_type=None,
                  task_id=None):
        del task_type, task_id

        if session_config:
            session_config.isolate_session_state = True

        if cluster_spec:
            self._initialize_multi_worker(self._num_gpus, cluster_spec)

        if self._cross_tower_ops is None:
            if self._cluster_spec:
                # It currently cannot detect the topology of remote workers,
                # so we hard-code the multi-worker all-reduce algorithm for
                # now.
                if len(self._workers) == 1:
                    # The default is "nccl".
                    self._cross_tower_ops = (
                        cross_tower_ops_lib.AllReduceCrossDeviceOps())
                else:
                    # The default is hierarchical reduce and broadcast.
                    self._cross_tower_ops = cross_tower_ops_lib.MultiWorkerAllReduce(
                        self._workers, self._num_gpus)
            else:
                self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
                    self._devices, session_config=session_config)
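
The MultiWorkerAllReduce instances in the test combinations above are built with bare positional tuples, which makes them hard to read. The sketch below spells out the "MultiWorkerAllReducePack" case with keyword arguments; the parameter names (all_reduce_spec, num_packs, agg_small_grads_max_bytes, agg_small_grads_max_group) assume the TF 1.x contrib.distribute signature and should be checked against your version.

# Sketch only: the "MultiWorkerAllReducePack" entry above, rewritten with
# keyword arguments. Parameter names assume the TF 1.x contrib.distribute API.
from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib

worker_devices = [
    "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
]
packed_allreduce = cross_tower_ops_lib.MultiWorkerAllReduce(
    worker_devices,
    num_gpus_per_worker=2,
    # Spec tuple is (algorithm, shard count, tensor limit); -1 means the spec
    # covers all remaining tensors.
    all_reduce_spec=("pscpu/pscpu", 2, -1),
    num_packs=1,                   # pack gradients into one bucket before reducing
    agg_small_grads_max_bytes=0,   # 0 disables small-gradient aggregation
    agg_small_grads_max_group=0)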
Example #3
  def configure(self,
                session_config=None,
                cluster_spec=None,
                task_type=None,
                task_id=None):
    del cluster_spec, task_type, task_id
    if self._cross_tower_ops is None:
      if self._cluster_spec:
        self._cross_tower_ops = cross_tower_ops_lib.MultiWorkerAllReduce(
            self._workers, self._num_gpus)
      else:
        self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
            self._devices, session_config=session_config)
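
Both configure variants shown so far boil down to the same decision: use a multi-worker all-reduce when a cluster spec with several workers is present, otherwise fall back to a single-worker all-reduce or let the library pick based on the local devices. The standalone helper below is a minimal sketch of the longer variant's logic; _pick_cross_tower_ops is a hypothetical name, and the cross_tower_ops_lib calls assume the TF 1.x contrib.distribute API used in the snippets above.

# Sketch only: the selection logic from the longer configure() variant,
# pulled out into a standalone helper (_pick_cross_tower_ops is hypothetical).
from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib

def _pick_cross_tower_ops(workers, devices, num_gpus, session_config=None):
  if workers and len(workers) > 1:
    # Remote worker topology cannot be detected, so hard-code the
    # multi-worker all-reduce algorithm.
    return cross_tower_ops_lib.MultiWorkerAllReduce(workers, num_gpus)
  if workers:
    # Single worker: plain all-reduce (the default algorithm is "nccl").
    return cross_tower_ops_lib.AllReduceCrossDeviceOps()
  # No cluster spec: let the library pick based on the local devices.
  return cross_tower_ops_lib.choose_the_best(
      devices, session_config=session_config)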
class MultiWorkerCrossTowerOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                   CrossTowerOpsTestBase):

  worker_devices = [
      "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
  ]
  multi_worker_allreduce_combinations = combinations.combine(
      cross_tower_ops=[
          combinations.NamedObject(
              "MultiWorkerAllReduce",
              cross_tower_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReducePack",
              cross_tower_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReduceAggregation",
              cross_tower_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
          combinations.NamedObject(
              "MultiWorkerAllReduceMultipleSpecs",
              cross_tower_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, [("pscpu/pscpu", 2, 100),
                                      ("xring", 2, -1)], 0, 0, 0)),
      ],
      distribution=[
          combinations.multi_worker_strategy_with_cpu,
          combinations.multi_worker_strategy_with_one_gpu,
          combinations.multi_worker_strategy_with_two_gpus
      ],
      mode=["graph"])

  @combinations.generate(multi_worker_allreduce_combinations)
  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
    with distribution.scope():
      self._testReductionAndBroadcast(cross_tower_ops, distribution)
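
To run either test class as a module, the file would end with the usual TensorFlow test entry point. A minimal sketch (standard TF test runner, not part of the original snippets):

# Standard TensorFlow test entry point (not part of the original snippets).
from tensorflow.python.platform import test

if __name__ == "__main__":
  test.main()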