def _train_model_fn(self, diter, x_placeholder, num_workers, num_features, global_num_samples, num_iterations,
                        schema_params: SchemaParams):
        """ The training objective function and the gradients. """
        value = tf1.constant(0.0, tf1.float64)
        # Add bias
        gradients = tf1.constant(np.zeros(num_features + 1))
        feature_bag_name = self.feature_bag_name
        label_column_name = schema_params.label_column_name
        sample_weight_column_name = schema_params.weight_column_name
        offset_column_name = self.offset_column_name
        is_regularize_bias = self.is_regularize_bias
        has_weight = self._has_feature(sample_weight_column_name)
        has_offset = self._has_feature(offset_column_name)
        i = 0

        def cond(i, value, gradients):
            return i < num_iterations

        def body(i, value, gradients):
            i += 1
            all_features, all_labels = diter.get_next()
            labels = all_labels[label_column_name]
            current_batch_size = tf1.shape(labels)[0]
            features = self._get_feature_bag_tensor(all_features, feature_bag_name, current_batch_size)
            weights = all_features[sample_weight_column_name] if has_weight else tf1.ones(current_batch_size,
                                                                                          tf1.float64)
            offsets = all_features[offset_column_name] if has_offset else tf1.zeros(current_batch_size, tf1.float64)

            w = x_placeholder[:-1]
            b = x_placeholder[-1]
            logits = tf1.sparse.sparse_dense_matmul(tf1.cast(features, tf1.float64),
                                                    tf1.cast(tf1.expand_dims(w, 1), tf1.float64)) \
                + tf1.expand_dims(tf1.ones(current_batch_size, tf1.float64) * tf1.cast(b, tf1.float64), 1) \
                + tf1.expand_dims(tf1.cast(offsets, tf1.float64), 1)

            loss = tf1.nn.sigmoid_cross_entropy_with_logits(labels=tf1.cast(labels, tf1.float64),
                                                            logits=tf1.reshape(tf1.cast(logits, tf1.float64), [-1]))
            weighted_loss = tf1.cast(weights, tf1.float64) * loss
            # regularizer has the option to include or exclude the bias
            regularizer = tf1.nn.l2_loss(x_placeholder) if is_regularize_bias else tf1.nn.l2_loss(w)
            batch_value = tf1.reduce_sum(weighted_loss) + regularizer * self.l2_reg_weight \
                * tf1.cast(current_batch_size, tf1.float64) / global_num_samples
            batch_gradients = tf1.gradients(batch_value, x_placeholder)[0]
            value += batch_value
            gradients += batch_gradients
            return i, value, gradients

        _, value, gradients = tf1.while_loop(cond, body, [i, value, gradients])

        if num_workers > 1:
            # sum-all-reduce across workers
            reduced_value = collective_ops.all_reduce(
                value, num_workers, FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY, 0,
                merge_op='Add', final_op='Id')
            reduced_gradients = collective_ops.all_reduce(
                gradients, num_workers, FixedEffectLRModelLBFGS.TF_ALL_REDUCE_GROUP_KEY, 1,
                merge_op='Add', final_op='Id')
            return reduced_value, reduced_gradients
        else:
            return value, gradients
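The reduction pattern above can be exercised in isolation. The following is a minimal sketch, not part of the original model code: two logical CPU devices in one process stand in for two workers, the loss value and gradient vector are placeholder constants, and the group/instance keys are arbitrary. It only illustrates how the per-worker value and gradients are summed with two all-reduce calls.

import tensorflow.compat.v1 as tf1
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import ops
from tensorflow.python.ops import collective_ops

group_size = 2  # two "workers", emulated as two CPU devices in one process
group_key = 1   # placeholder key; must match on every participant

with ops.Graph().as_default():
    reduced = []
    for i in range(group_size):
        with ops.device('/CPU:%d' % i):
            local_value = constant_op.constant(float(i + 1))        # per-worker objective value
            local_grads = constant_op.constant([1. * (i + 1)] * 3)  # per-worker gradient vector
            # Distinct instance keys for the value reduction and the gradient reduction.
            reduced.append(collective_ops.all_reduce(
                local_value, group_size, group_key, 0, 'Add', 'Id'))
            reduced.append(collective_ops.all_reduce(
                local_grads, group_size, group_key, 1, 'Add', 'Id'))

    run_options = config_pb2.RunOptions()
    run_options.experimental.collective_graph_key = 1
    config = config_pb2.ConfigProto(device_count={'CPU': group_size})
    with tf1.Session(config=config) as sess:
        # Every device sees the sum over both devices: 3.0 and [3., 3., 3.].
        print(sess.run(reduced, options=run_options))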
Example #2
    def worker_fn():
      cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
      enable_collective_ops(cluster_resolver)

      collective_ops.all_reduce(
          constant_op.constant(1.),
          group_size=2,
          group_key=100,
          instance_key=100,
          merge_op="Add",
          final_op="Id",
          communication_hint="ring")

      if cluster_resolver.task_type == "worker":
        # MultiProcessRunner will auto restart worker-0.
        os._exit(1)  # pylint: disable=protected-access
      else:
        # chief should eventually get FailedPreconditionError after worker-0
        # has restarted.
        while True:
          time.sleep(1)
          try:
            context.context().check_collective_ops_peer_health(
                "/job:worker/replica:0/task:0",)
          except errors.UnavailableError:
            pass
          except errors.FailedPreconditionError:
            break
    def testConstantWithScopedAllocator(self):
        group_size = 2
        group_key = 1
        instance_key1 = 1
        instance_key2 = 2

        graph_options = config_pb2.GraphOptions(
            optimizer_options=config_pb2.OptimizerOptions(
                do_constant_folding=True))
        cfg = config_pb2.ConfigProto(device_count={'CPU': group_size},
                                     graph_options=graph_options)
        rewrite_options = cfg.graph_options.rewrite_options
        rewrite_options.scoped_allocator_optimization = (
            rewriter_config_pb2.RewriterConfig.ON)
        del rewrite_options.scoped_allocator_opts.enable_op[:]
        rewrite_options.scoped_allocator_opts.enable_op.append(
            'CollectiveReduce')

        with self.session(config=cfg) as sess:
            run_ops = []
            for i in range(group_size):
                with ops.device('CPU:%d' % i):
                    constant = constant_op.constant(i + 1.)
                    input_tensor1 = array_ops.identity(constant)
                    input_tensor2 = array_ops.identity(constant)
                    reduced_tensor1 = collective_ops.all_reduce(
                        input_tensor1, group_size, group_key, instance_key1,
                        'Add', 'Id')
                    reduced_tensor2 = collective_ops.all_reduce(
                        input_tensor2, group_size, group_key, instance_key2,
                        'Add', 'Id')
                    run_ops.append(array_ops.identity(reduced_tensor1))
                    run_ops.append(array_ops.identity(reduced_tensor2))
            results = sess.run(run_ops)
            self.assertEqual(results, [3., 3., 3., 3.])
Example #4
    def testCollectiveDeviceMismatch(self):
        group_key = 10
        instance_key = 20
        t0 = [1, 2, 3, 4]
        t1 = [5, 6, 7, 8]

        with ops.Graph().as_default(), self.session(config=self._configure(
                set_config_proto_nccl=False)) as sess:
            if not test_util.is_gpu_available(cuda_only=True):
                self.skipTest('No GPU available')
            with ops.device('/CPU:0'):
                in0 = constant_op.constant(t0)
                c0 = collective_ops.all_reduce(in0, self._group_size,
                                               group_key, instance_key, 'Add',
                                               'Id')
            with ops.device('/GPU:0'):
                in1 = constant_op.constant(t1)
                c1 = collective_ops.all_reduce(in1, self._group_size,
                                               group_key, instance_key, 'Add',
                                               'Id')
            run_options = config_pb2.RunOptions()
            run_options.experimental.collective_graph_key = 100
            with self.assertRaisesRegex(errors.InternalError,
                                        'but that group has type'):
                sess.run([c0, c1], options=run_options)
Example #5
    def testWhileWithScopedAllocator(self):
        group_size = 2
        group_key = 1
        instance_key0 = 1
        instance_key1 = 2

        config = config_pb2.ConfigProto(device_count={'CPU': group_size})
        rewrite_options = config.graph_options.rewrite_options
        rewrite_options.scoped_allocator_optimization = (
            rewriter_config_pb2.RewriterConfig.ON)
        del rewrite_options.scoped_allocator_opts.enable_op[:]
        rewrite_options.scoped_allocator_opts.enable_op.append(
            'CollectiveReduce')

        # Tests that execute collectives need to be enclosed in a graph or a tf.function.
        with ops.Graph().as_default():
            with self.session(config=config) as sess:
                run_ops = []
                for i in range(group_size):
                    with ops.device('CPU:%d' % i):
                        constant = constant_op.constant(0.)
                        cond = lambda i: math_ops.less(i, 10.)
                        body = lambda i: math_ops.add(i, 1.)
                        input0 = control_flow_ops.while_loop(
                            cond, body, [constant])
                        input1 = math_ops.add(constant, 5)
                        colred0 = collective_ops.all_reduce(
                            input0, group_size, group_key, instance_key0,
                            'Add', 'Id')
                        colred1 = collective_ops.all_reduce(
                            input1, group_size, group_key, instance_key1,
                            'Add', 'Id')
                        run_ops.append(math_ops.add_n([colred0, colred1]))
                results = sess.run(run_ops)
            self.assertEqual(results, [30., 30.])
Example #6
 def _check_health(self, device, group_key, instance_key):
     first = True
     # We need to use a large enough value so that the all-reduce forms a
     # complete RING. In RING implementation, when value is too small, the
     # all-reduce may degrade into broadcasts. This means that some worker
     # failure may not be detected.
     value = array_ops.ones((32, 32), dtype=dtypes.float32)
     while True:
         if self._check_health_thread_should_stop.is_set():
             return
         timeout = None
         if first:
             # For the first health check we set a timeout since it may need to do
             # group resolution, which may hang if the cluster is never healthy.
             timeout = self._check_health_initial_timeout
             first = False
         try:
             # We use a dummy all-reduce as a way to check the health of a cluster.
             # For RING it should be able to detect failed workers in the cluster if
             # the values are large enough.
             #
             # We're not using CrossDeviceOps because we need to run it with
             # pre-allocated group and instance keys.
             #
             # TODO(b/151232436): Replace the reduce with a check health op once we
             # add that.
             with ops.device(device):
                 collective_ops.all_reduce(value,
                                           group_size=self._num_workers,
                                           group_key=group_key,
                                           instance_key=instance_key,
                                           merge_op="Add",
                                           final_op="Id",
                                           subdiv_offsets=[0],
                                           communication_hint="ring",
                                           timeout=timeout)
                 if context.is_async():
                     context.async_wait()
         except (errors.UnavailableError, errors.DeadlineExceededError,
                 errors.FailedPreconditionError,
                 errors.CancelledError) as e:
             # TODO(b/151232436): Always raise UnavailableError when a peer fails.
             # Now there could be many kinds of errors:
             # - Unavailable: when the peer is not reachable, e.g. it's down.
             # - FailedPrecondition: when the peer has restarted.
             # - DeadlineExceeded: when the first check health exceeds the deadline,
             #   e.g. the peers take too long to be ready.
             # - Cancelled: when failures in organic collectives abort first,
             #   outgoing RPCs may be aborted with Cancelled.
             logging.error(
                 "Cluster check alive failed, aborting collectives")
             context.context().abort_collective_ops(
                 errors.UNAVAILABLE, "cluster check alive failed: %s" % e)
         except Exception as e:  # pylint: disable=broad-except
             logging.exception("Unexpected exception in check alive.")
             context.context().abort_collective_ops(
                 errors.INTERNAL,
                 "unexecpted exception in check alive: %s" % e)
             return
         time.sleep(self._check_health_interval)
def build_collective_reduce(input_tensors,
                            num_workers,
                            collective_keys,
                            reduction_op='Add',
                            unary_op='Id',
                            communication_hint='auto',
                            control_inputs=None):
    """Build a subgraph that does one full all-reduce, using the collective Op.

  Args:
    input_tensors: tensors within a single worker graph that are to be reduced
      together; must be one per device.
    num_workers: total number of workers with identical independent graphs that
      will be doing this same reduction.  The reduction will actually include
      the corresponding tensors at all these workers.
    collective_keys: a CollectiveKeys object.
    reduction_op: string naming the reduction op.
    unary_op: string naming the unary final op.
    communication_hint: string providing hint to runtime for choosing collective
      implementation.
    control_inputs: if not None, add control edges between control_inputs and
      (index-wise) corresponding collective_reduce tensors

  Returns:
    An array of final tensors, one per device, computed by the full reduction.

  Raises:
    ValueError: There must be at least two tensors over all the workers.
  """
    group_size = len(input_tensors) * num_workers
    if group_size < 2:
        return input_tensors
    devices = [t.device for t in input_tensors]
    num_devices = len(devices)
    group_key = collective_keys.get_group_key(devices)
    instance_key = collective_keys.get_op_instance_key()
    subdiv_offsets = [0]  # TODO(tucker): maybe support non-default subdiv spec
    if control_inputs:
        assert len(control_inputs) == len(input_tensors)

    out_tensors = []
    for dev_idx in range(num_devices):
        with ops.device(devices[dev_idx]):
            if control_inputs:
                assert control_inputs[dev_idx].device == input_tensors[
                    dev_idx].device
                with ops.control_dependencies([control_inputs[dev_idx]]):
                    reduce_op = collective_ops.all_reduce(
                        input_tensors[dev_idx], group_size, group_key,
                        instance_key, reduction_op, unary_op, subdiv_offsets,
                        communication_hint)
            else:
                reduce_op = collective_ops.all_reduce(input_tensors[dev_idx],
                                                      group_size, group_key,
                                                      instance_key,
                                                      reduction_op, unary_op,
                                                      subdiv_offsets,
                                                      communication_hint)
            out_tensors.append(reduce_op)
    return out_tensors
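A hedged usage sketch for the helper above. The collective_keys argument is normally a real CollectiveKeys object, as the docstring says; here a tiny stand-in class that only provides the two methods this helper calls (get_group_key and get_op_instance_key) returns fixed placeholder keys, so the built subgraph can be run on two CPU devices in a single process. This is an illustration under those assumptions, not the canonical way to obtain keys.

from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import ops


class _PlaceholderCollectiveKeys(object):
    """Stand-in for a real CollectiveKeys object; returns fixed placeholder keys."""

    def get_group_key(self, devices):
        return 1

    def get_op_instance_key(self):
        return 1


with ops.Graph().as_default():
    input_tensors = []
    for i in range(2):
        with ops.device('/CPU:%d' % i):
            input_tensors.append(constant_op.constant([1. * (i + 1), 2.]))

    # One worker with two devices -> group_size == 2.
    reduced = build_collective_reduce(
        input_tensors, num_workers=1,
        collective_keys=_PlaceholderCollectiveKeys())

    run_options = config_pb2.RunOptions()
    run_options.experimental.collective_graph_key = 1
    config = config_pb2.ConfigProto(device_count={'CPU': 2})
    with session.Session(config=config) as sess:
        print(sess.run(reduced, options=run_options))  # [[3., 4.], [3., 4.]]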
Example #8
    def testAbortInstanceParamsResolution(self, device, communication):
        if communication == "NCCL":
            self.skipTest("b/171358086: cannot test multi worker NCCL")
        dev0 = "/device:%s:0" % device
        cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
        enable_collective_ops_with_barrier(cluster_resolver)
        group_size = 2
        group_key = 100
        instance_key = 100
        in_tensor = constant_op.constant([1.])

        # First perform a normal all-reduce to complete the group resolution.
        with ops.device(dev0):
            collective_ops.all_reduce(in_tensor, group_size, group_key,
                                      instance_key)

        # We use broadcast to test aborting instance resolution since only broadcast
        # waits for the group.

        if cluster_resolver.task_id == 1:

            def abort_fn():
                time.sleep(2)
                context.context().abort_collective_ops(errors.UNAVAILABLE,
                                                       "peer down")

            t = threading.Thread(target=abort_fn)
            t.start()

            # Use a different instance key to trigger another instance resolution.
            instance_key = 101
            with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
                # This hangs on params resolution since we're only launching one
                # collective for a group size of 2.
                with ops.device(dev0):
                    collective_ops.broadcast_send(in_tensor, (1, ),
                                                  dtypes.float32, group_size,
                                                  group_key, instance_key)

            # After abortion, subsequent collectives should fail immediately.
            with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
                with ops.device(dev0):
                    collective_ops.broadcast_send(in_tensor, (1, ),
                                                  dtypes.float32, group_size,
                                                  group_key, instance_key)

            t.join()

        # Enable collective ops again in order to reset the collective executor.
        enable_collective_ops_with_barrier(cluster_resolver)
        # Reassign instance_key so that it's the same on each worker.
        instance_key = 100
        with ops.device(dev0):
            if cluster_resolver.task_id == 0:
                collective_ops.broadcast_send(in_tensor, (1, ), dtypes.float32,
                                              group_size, group_key,
                                              instance_key)
            else:
                collective_ops.broadcast_recv(
                    (1, ), dtypes.float32, group_size, group_key, instance_key)
 def run_all_reduce(group_key, instance_key, merge_op):
     group_size = 2
     t0 = [1., 20., 3., 40., 5.]
     t1 = [10., 2., 30., 4., 50.]
     os.environ['NCCL_DEBUG'] = 'INFO'
     os.environ['NCCL_LAUNCH_MODE'] = 'PARALLEL'
     with ops.device('/GPU:0'):
         in0 = constant_op.constant(t0)
         c0 = collective_ops.all_reduce(in0,
                                        group_size,
                                        group_key,
                                        instance_key,
                                        merge_op,
                                        final_op='Id',
                                        communication_hint='nccl')
     with ops.device('/GPU:1'):
         in1 = constant_op.constant(t1)
         c1 = collective_ops.all_reduce(in1,
                                        group_size,
                                        group_key,
                                        instance_key,
                                        merge_op,
                                        final_op='Id',
                                        communication_hint='nccl')
     return c0, c1
  def testWhileWithScopedAllocator(self):
    group_size = 2
    group_key = 1
    instance_key0 = 1
    instance_key1 = 2

    config = config_pb2.ConfigProto(device_count={'CPU': group_size})
    rewrite_options = config.graph_options.rewrite_options
    rewrite_options.scoped_allocator_optimization = (
        rewriter_config_pb2.RewriterConfig.ON)
    del rewrite_options.scoped_allocator_opts.enable_op[:]
    rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')

    with self.session(config=config) as sess:
      run_ops = []
      for i in range(group_size):
        with ops.device('CPU:%d' % i):
          constant = constant_op.constant(0.)
          cond = lambda i: math_ops.less(i, 10.)
          body = lambda i: math_ops.add(i, 1.)
          input0 = control_flow_ops.while_loop(cond, body, [constant])
          input1 = math_ops.add(constant, 5)
          colred0 = collective_ops.all_reduce(input0, group_size, group_key,
                                              instance_key0, 'Add', 'Id')
          colred1 = collective_ops.all_reduce(input1, group_size, group_key,
                                              instance_key1, 'Add', 'Id')
          run_ops.append(math_ops.add_n([colred0, colred1]))
      results = sess.run(run_ops)
      self.assertEqual(results, [30., 30.])
Example #11
 def run_collective_device_mismatch():
     with ops.device('/CPU:0'):
         in0 = constant_op.constant(t0)
         collective_ops.all_reduce(in0, self._group_size, group_key,
                                   instance_key, 'Add', 'Id')
     with ops.device('/GPU:0'):
         in1 = constant_op.constant(t1)
         collective_ops.all_reduce(in1, self._group_size, group_key,
                                   instance_key, 'Add', 'Id')
  def testAbortNccl(self):
    self._setup_context(num_gpus=2)

    group_size = 2
    group_key = 100
    instance_key = 100
    in_tensor = constant_op.constant(1.)

    # First perform a normal collective to finish resolution.
    def collective_fn():
      for device in ['GPU:0', 'GPU:1']:
        with ops.device(device):
          collective_ops.all_reduce(
              in_tensor,
              group_size,
              group_key,
              instance_key,
              'Add',
              'Id',
              communication_hint='nccl')

    def_function.function(collective_fn)()

    # Launch a collective that hangs, and abort the collective executor after
    # the launch.
    def abort_fn():
      time.sleep(2)
      context.context().abort_collective_ops(errors.UNAVAILABLE, 'peer down')

    t = threading.Thread(target=abort_fn)
    t.start()

    with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
      collective_ops.all_reduce(
          in_tensor,
          group_size,
          group_key,
          instance_key,
          'Add',
          'Id',
          communication_hint='nccl')

    # After abortion, subsequent collectives should fail immediately.
    with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
      collective_ops.all_reduce(
          in_tensor,
          group_size,
          group_key,
          instance_key,
          'Add',
          'Id',
          communication_hint='nccl')

    t.join()
    # Reset the context in order to reset the collective executor.
    context._reset_context()  # pylint: disable=protected-access
    def_function.function(collective_fn)()
  def testNcclStress(self):
    self._setup_context(num_gpus=1)

    num_iters = 1000
    for _ in range(num_iters):
      with ops.device('/device:GPU:0'):
        collective_ops.all_reduce(
            [1.], group_size=1, group_key=0, instance_key=0, merge_op='Add',
            final_op='Id', communication_hint='NCCL')
Example #14
    def testAbortInstanceParamsResolution(self):
        cpus = config.list_physical_devices('CPU')
        config.set_logical_device_configuration(cpus[0], [
            context.LogicalDeviceConfiguration(),
            context.LogicalDeviceConfiguration()
        ])
        group_size = 2
        group_key = 100
        instance_key = 100
        in_tensor = constant_op.constant(1.)

        def collective_fn():
            for device in ['CPU:0', 'CPU:1']:
                with ops.device(device):
                    collective_ops.all_reduce(in_tensor,
                                              group_size,
                                              group_key,
                                              instance_key,
                                              'Add',
                                              'Id',
                                              communication_hint='ring')

        # First perform a normal all-reduce to complete the group resolution.
        def_function.function(collective_fn)()

        def abort_fn():
            time.sleep(2)
            context.context().abort_collective_ops(errors.UNAVAILABLE,
                                                   'peer down')

        t = threading.Thread(target=abort_fn)
        t.start()

        # Use a different instance key to trigger another instance resolution.
        instance_key = 101
        with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
            # This hangs on params resolution since we're only launching one
            # collective for a group size of 2.
            collective_ops.all_reduce(in_tensor, group_size, group_key,
                                      instance_key, 'Add', 'Id')

        # After abortion, subsequent collectives should fail immediately.
        with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
            collective_ops.all_reduce(in_tensor, group_size, group_key,
                                      instance_key, 'Add', 'Id')

        # Reset the context in order to reset the collective executor.
        context._reset_context()  # pylint: disable=protected-access
        t.join()

        # After reset non-NCCL collectives should work.
        cpus = config.list_physical_devices('CPU')
        config.set_logical_device_configuration(cpus[0], [
            context.LogicalDeviceConfiguration(),
            context.LogicalDeviceConfiguration()
        ])
        def_function.function(collective_fn)()
Example #15
 def collective_fn():
     for device in ['CPU:0', 'CPU:1']:
         with ops.device(device):
             collective_ops.all_reduce(in_tensor,
                                       group_size,
                                       group_key,
                                       instance_key,
                                       'Add',
                                       'Id',
                                       communication_hint='ring')
Example #16
 def run_all_reduce():
     for device in ['CPU:0', 'CPU:1']:
         with ops.device(device):
             collective_ops.all_reduce(input_data,
                                       group_size=2,
                                       group_key=group_key,
                                       instance_key=instance_key,
                                       merge_op='Add',
                                       final_op='Id',
                                       timeout=timeout)
Example #17
    def testExecutionAfterTimeoutV2(self):
        timeout = 1.5
        cpus = config.list_physical_devices('CPU')
        self.assertEqual(len(cpus), 1)
        config.set_logical_device_configuration(cpus[0], [
            context.LogicalDeviceConfiguration(),
            context.LogicalDeviceConfiguration()
        ])
        context.ensure_initialized()

        group_key = 20
        instance_key = 30
        input_data = constant_op.constant([1, 2, 3, 4])

        @def_function.function
        def run_all_reduce():
            for device in ['CPU:0', 'CPU:1']:
                with ops.device(device):
                    collective_ops.all_reduce(input_data,
                                              group_size=2,
                                              group_key=group_key,
                                              instance_key=instance_key,
                                              merge_op='Add',
                                              final_op='Id',
                                              timeout=timeout)

        # Run a normal all-reduce to complete param resolution.
        run_all_reduce()

        with self.assertRaisesRegex(
                errors.DeadlineExceededError,
                'Collective has timed out during execution'):
            with ops.device('CPU:0'):
                collective_ops.all_reduce(input_data,
                                          group_size=2,
                                          group_key=group_key,
                                          instance_key=instance_key,
                                          merge_op='Add',
                                          final_op='Id',
                                          timeout=timeout)

        # We launch the second device after the first device times out. This is to
        # simulate the situation when other workers are slow and the timeout is
        # short. It should error immediately.
        with self.assertRaisesRegex(
                errors.DeadlineExceededError,
                'Collective has timed out during execution'):
            with ops.device('CPU:1'):
                # No timeout.
                collective_ops.all_reduce(input_data,
                                          group_size=2,
                                          group_key=group_key,
                                          merge_op='Add',
                                          final_op='Id',
                                          instance_key=instance_key)
Example #18
    def testAbortRing(self):
        cpus = config.list_physical_devices('CPU')
        config.set_logical_device_configuration(cpus[0], [
            context.LogicalDeviceConfiguration(),
            context.LogicalDeviceConfiguration()
        ])
        group_size = 2
        group_key = 100
        instance_key = 100
        in_tensor = constant_op.constant(1.)

        # First perform a normal collective to finish resolution.
        def collective_fn():
            for device in ['CPU:0', 'CPU:1']:
                with ops.device(device):
                    collective_ops.all_reduce(in_tensor,
                                              group_size,
                                              group_key,
                                              instance_key,
                                              'Add',
                                              'Id',
                                              communication_hint='ring')

        def_function.function(collective_fn)()

        # Launch a collective that hangs, and abort the collective executor after
        # the launch.
        def abort_fn():
            time.sleep(2)
            context.context().abort_collective_ops(errors.UNAVAILABLE,
                                                   'peer down')

        t = threading.Thread(target=abort_fn)
        t.start()

        with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
            collective_ops.all_reduce(in_tensor, group_size, group_key,
                                      instance_key, 'Add', 'Id')

        # After abortion, subsequent collectives should fail immediately.
        with self.assertRaisesRegex(errors.UnavailableError, 'peer down'):
            collective_ops.all_reduce(in_tensor, group_size, group_key,
                                      instance_key, 'Add', 'Id')

        # Reset the context in order to reset the collective executor.
        t.join()
        context._reset_context()  # pylint: disable=protected-access
        # After reset non-NCCL collectives should work.
        cpus = config.list_physical_devices('CPU')
        config.set_logical_device_configuration(cpus[0], [
            context.LogicalDeviceConfiguration(),
            context.LogicalDeviceConfiguration()
        ])
        def_function.function(collective_fn)()
  def testAbortCommunication(self, device, communication):
    if communication == "NCCL":
      self.skipTest("b/171358086: cannot test multi worker NCCL")
    dev0 = "/device:%s:0" % device
    cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
    enable_collective_ops_with_barrier(cluster_resolver)
    group_size = 2
    group_key = 100
    instance_key = 100
    in_tensor = constant_op.constant([1.])

    # First perform a normal all-reduce to complete the group and instance
    # resolution.
    with ops.device(dev0):
      collective_ops.all_reduce(
          in_tensor,
          group_size,
          group_key,
          instance_key,
          communication_hint=communication)

    if cluster_resolver.task_id == 1:

      def abort_fn():
        time.sleep(2)
        context.context().abort_collective_ops(errors.UNAVAILABLE, "peer down")

      t = threading.Thread(target=abort_fn)
      t.start()

      with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
        with ops.device(dev0):
          collective_ops.all_reduce(
              in_tensor,
              group_size,
              group_key,
              instance_key,
              communication_hint=communication)

      # After abortion, subsequent collectives should fail immediately.
      with self.assertRaisesRegex(errors.UnavailableError, "peer down"):
        with ops.device(dev0):
          collective_ops.all_reduce(
              in_tensor,
              group_size,
              group_key,
              instance_key,
              communication_hint=communication)

      t.join()

    # Enable collective ops again in order to reset the collective executor.
    enable_collective_ops_with_barrier(cluster_resolver)
    with ops.device(dev0):
      collective_ops.all_reduce(
          in_tensor,
          group_size,
          group_key,
          instance_key,
          communication_hint=communication)
 def run_all_reduce(group_key, instance_key, merge_op):
   t0 = [1., 20., 3., 40., 5.]
   t1 = [10., 2., 30., 4., 50.]
   with ops.device('/GPU:0'):
     in0 = constant_op.constant(t0)
     c0 = collective_ops.all_reduce(
         in0, self._group_size, group_key, instance_key, merge_op,
         final_op='Id', communication_hint='nccl')
   with ops.device('/GPU:1'):
     in1 = constant_op.constant(t1)
     c1 = collective_ops.all_reduce(
         in1, self._group_size, group_key, instance_key, merge_op,
         final_op='Id', communication_hint='nccl')
   return c0, c1
Example #21
def test_dist():
    ts = []
    for task_id in (0, 1):
        with tf.device('/job:worker/task:{0}/device:GPU:0'.format(task_id)):
            t = tf.Variable([1.0, 3.0 * task_id],
                            dtype=tf.float32,
                            name='myvar')
            ts.append(t)

    with tf.device('/job:worker/task:0/device:GPU:0'):
        sum0 = collective_ops.all_reduce(ts[0], 2, 0, 1, 'Add', 'Id')
    with tf.device('/job:worker/task:1/device:GPU:0'):
        sum1 = collective_ops.all_reduce(ts[1], 2, 0, 1, 'Add', 'Id')
    dependency = [sum0, sum1]
    result = [sum0, sum1]
    for i in range(20):
        with tf.control_dependencies(dependency):
            with tf.device('/job:worker/task:0/device:GPU:0'):
                sumb0 = collective_ops.all_reduce(tf.identity(ts[0]), 2, 0,
                                                  i + 2, 'Add', 'Id')
            with tf.device('/job:worker/task:1/device:GPU:0'):
                sumb1 = collective_ops.all_reduce(tf.identity(ts[1]), 2, 0,
                                                  i + 2, 'Add', 'Id')
            result.extend([sumb0, sumb1])
            dependency = [sumb0, sumb1]
    resolver = TFConfigClusterResolver()
    cluster = resolver.cluster_spec()

    #dist = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    #   tf.distribute.experimental.CollectiveCommunication.NCCL)

    #sess_config = dist.update_config_proto(tf.ConfigProto())
    #sess_config.ClearField("device_filters")
    sess_config = tf.ConfigProto()
    with open("dist_config.pbtxt", "r") as f:
        txt = f.read()
    pbtf.Parse(txt, sess_config)

    server = tf.distribute.Server(cluster,
                                  job_name="worker",
                                  task_index=0,
                                  config=sess_config)

    sess = tf.compat.v1.Session(server.target, config=sess_config)
    sess.run(tf.compat.v1.global_variables_initializer())

    print('tensor value', sess.run(result))

    with open("graph_def", "w") as f:
        f.write(str(tf.get_default_graph().as_graph_def()))
Example #22
 def _broadcast_fallback(self):
     """Sum gradients across devices using TensorFlow collective ops (slow fallback path)."""
     from tensorflow.python.ops import collective_ops # pylint: disable=no-name-in-module
     global _collective_ops_warning_printed, _collective_ops_group_key, _collective_ops_instance_key
     if all(x.shape.num_elements() == 0 for device in self._devices.values() for x in device.grad_clean.values()):
         return
     if not _collective_ops_warning_printed:
         print("------------------------------------------------------------------------")
         print("WARNING: Using slow fallback implementation for inter-GPU communication.")
         print("Please use TensorFlow 1.14 on Linux for optimal training performance.")
         print("------------------------------------------------------------------------")
         _collective_ops_warning_printed = True
     for device in self._devices.values():
         with tf.device(device.name):
             combo = [tf.reshape(x, [x.shape.num_elements()]) for x in device.grad_clean.values()]
             combo = tf.concat(combo, axis=0)
             combo = collective_ops.all_reduce(combo, merge_op='Add', final_op='Id',
                 group_size=len(self._devices), group_key=_collective_ops_group_key,
                 instance_key=_collective_ops_instance_key)
             cur_ofs = 0
             for var, grad_old in device.grad_clean.items():
                 grad_new = tf.reshape(combo[cur_ofs : cur_ofs + grad_old.shape.num_elements()], grad_old.shape)
                 cur_ofs += grad_old.shape.num_elements()
                 device.grad_clean[var] = grad_new
     _collective_ops_instance_key += 1
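For reference, a self-contained sketch of the same flatten, concat, all-reduce, split pattern, using placeholder tensors and keys rather than the gradient state above. Both collective launches are placed inside one tf.function so they end up in a single graph and can complete as a two-member group on two logical CPU devices.

import tensorflow as tf
from tensorflow.python.eager import context
from tensorflow.python.framework import config
from tensorflow.python.ops import collective_ops

# Split one physical CPU into two logical devices so a two-member group can run
# in a single process (must happen before any op initializes the runtime).
cpus = config.list_physical_devices('CPU')
config.set_logical_device_configuration(
    cpus[0],
    [context.LogicalDeviceConfiguration(),
     context.LogicalDeviceConfiguration()])

group_size, group_key, instance_key = 2, 1, 1
grads_per_device = [
    [tf.constant([1., 2.]), tf.constant([[3.]])],  # gradients on device 0
    [tf.constant([4., 5.]), tf.constant([[6.]])],  # gradients on device 1
]


@tf.function
def summed_grads():
    out = []
    for dev_idx, grads in enumerate(grads_per_device):
        with tf.device('/CPU:%d' % dev_idx):
            # Flatten and concatenate all gradients into one buffer.
            flat = tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0)
            reduced = collective_ops.all_reduce(
                flat, group_size, group_key, instance_key,
                merge_op='Add', final_op='Id')
            # Slice the reduced buffer back into the original shapes.
            offset, restored = 0, []
            for g in grads:
                n = g.shape.num_elements()
                restored.append(tf.reshape(reduced[offset:offset + n], g.shape))
                offset += n
            out.append(restored)
    return out


print(summed_grads())  # both devices see [5., 7.] and [[9.]]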
Example #23
    def _profile(self, devices):
        from tensorflow.python.ops import collective_ops

        id = self.seed
        self.seed += 1

        result = []
        for size in (2**i for i in range(21)): # 1 KB to 1GB
            handles = []
            tf.reset_default_graph()
            for dev in devices:
                with tf.device(dev):
                    x = tf.random.uniform((size, 128), dtype=tf.dtypes.float64)
                    nccl = collective_ops.all_reduce(x, len(devices), id, id, 'Add', 'Id')
                    handles.append(tf.identity(nccl))
            run_meta = tf.compat.v1.RunMetadata()
            run_opt = tf.compat.v1.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            sess = tf.Session(self.target)
            sess.run(handles)
            sess.run(handles, options=run_opt, run_metadata=run_meta)

            time = min(node.all_end_rel_micros for d in run_meta.step_stats.dev_stats for node in d.node_stats if 'CollectiveReduce' in node.node_name)
            result.append((size, time))

        return result
    def _testCollectiveReduce(self,
                              inputs,
                              expected,
                              set_graph_key,
                              communication_hint='auto'):
        group_key = 1
        group_size = len(inputs)
        instance_key = 1
        device_type = 'CPU'
        config = config_pb2.ConfigProto(device_count={device_type: group_size})
        devices = ['/{}:{}'.format(device_type, i) for i in range(group_size)]

        with self.session(config=config) as sess:
            colred = []
            for i in range(group_size):
                with ops.device(devices[i]):
                    tensor = constant_op.constant(inputs[i])
                    colred.append(
                        collective_ops.all_reduce(
                            tensor,
                            group_size,
                            group_key,
                            instance_key,
                            'Add',
                            'Div',
                            communication_hint=communication_hint))
            run_options = config_pb2.RunOptions()
            if set_graph_key:
                run_options.experimental.collective_graph_key = 1
            results = sess.run(colred, options=run_options)
        for i in range(group_size):
            self.assertAllClose(results[i], expected, rtol=1e-5, atol=1e-5)
    def all_reduce(self,
                   input_tensor,
                   control_input=None,
                   communication_hint='AUTO',
                   timeout=0):
        """All-reduce a dense tensor.

    This can be called in eager mode if an async executor is supplied when
    creating the launcher.

    Args:
      input_tensor: a dense tensor. It must have the same shape on all replicas.
      control_input: if not None, add control edges between control_input and
        the all-reduce.
      communication_hint: string providing hint to runtime for choosing
        collective implementation.
      timeout: a float. The timeout in seconds.

    Returns:
      The reduced tensor.
    """
        instance_key = self._collective_keys.get_instance_key(
            self._group_key, self._device)
        with self._executor_scope(), \
             ops.device(self._device), \
             self._control_input(control_input):
            return collective_ops.all_reduce(
                input_tensor,
                self._group_size,
                self._group_key,
                instance_key,
                communication_hint=communication_hint,
                timeout=timeout)
 def run_all_reduce():
   group_key = 10
   instance_key = 20
   t0 = [1, 2, 3, 4]
   t1 = [5, 6, 7, 8]
   with ops.device('/CPU:0'):
     in0 = constant_op.constant(t0)
     c0 = collective_ops.all_reduce(
         in0, group_size=2, group_key=group_key, instance_key=instance_key,
         merge_op='Add', final_op='Id')
   with ops.device('/CPU:1'):
     in1 = constant_op.constant(t1)
     c1 = collective_ops.all_reduce(
         in1, group_size=3, group_key=group_key, instance_key=instance_key,
         merge_op='Add', final_op='Id')
   return c0, c1
  def _testCollectiveReduce(self, inputs, expected, set_graph_key,
                            communication_hint='auto', fp16=False,
                            instance_key=1, merge_op='Add', final_op='Div'):
    group_key = 1
    group_size = len(inputs)
    device_type = 'CPU'
    config = config_pb2.ConfigProto(device_count={device_type: group_size})
    devices = ['/{}:{}'.format(device_type, i) for i in range(group_size)]

    with self.session(config=config) as sess:
      colred = []
      for i in range(group_size):
        with ops.device(devices[i]):
          tensor = constant_op.constant(inputs[i], dtype=(
              dtypes.float16 if fp16 else dtypes.float32))
          colred.append(collective_ops.all_reduce(
              tensor, group_size, group_key, instance_key, merge_op, final_op,
              communication_hint=communication_hint))
      run_options = config_pb2.RunOptions()
      if set_graph_key:
        run_options.experimental.collective_graph_key = 1
      results = sess.run(colred, options=run_options)
    tolerance = 1e-3 if fp16 else 1e-5
    for i in range(group_size):
      logging.info('i {} result {} expected {}'.format(i, results[i], expected))
      self.assertAllClose(results[i], expected, rtol=tolerance, atol=tolerance)
Example #28
    def testNcclHintAllReduce(self):
        inputs = [[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
                  [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]]
        expected = [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2]
        group_size = len(inputs)
        group_key = 1
        instance_key = 1
        devices = ['/GPU:{}'.format(i) for i in range(group_size)]

        with self.session(config=self._configure(
                group_size, set_config_proto_nccl=False)) as sess:
            if not test_util.is_gpu_available(cuda_only=True):
                self.skipTest('No GPU available')
            collectives = []
            for i in range(group_size):
                with ops.device(devices[i]):
                    t = constant_op.constant(inputs[i])
                    collectives.append(
                        collective_ops.all_reduce(t,
                                                  group_size,
                                                  group_key,
                                                  instance_key,
                                                  'Add',
                                                  'Div',
                                                  communication_hint='nccl'))
            results = sess.run(collectives)
        for result in results:
            self.assertAllClose(result, expected, rtol=1e-5, atol=1e-5)
Example #29
    def testFp16Reduce(self):
        inputs = [[0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
                  [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3]]
        expected = [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2]
        group_key = 1
        instance_key = 100
        devices = ['/GPU:{}'.format(i) for i in range(self._group_size)]

        with ops.Graph().as_default(), self.session(
                config=self._configure()) as sess:
            if not test_util.is_gpu_available(cuda_only=True):
                self.skipTest('No GPU available')
            collectives = []
            for i in range(self._group_size):
                with ops.device(devices[i]):
                    t = constant_op.constant(inputs[i], dtype=dtypes.float16)
                    collectives.append(
                        collective_ops.all_reduce(t, self._group_size,
                                                  group_key, instance_key,
                                                  'Add', 'Div'))
            results = sess.run(collectives)
        for i, result in enumerate(results):
            logging.info('i {} result {} expected {}'.format(
                i, result, expected))
            self.assertAllClose(result, expected, rtol=1e-3, atol=1e-3)
 def run_basic_all_reduce():
   collectives = []
   for i in range(self._group_size):
     with ops.device(self._devices[i]):
       t = constant_op.constant(inputs[i])
       collectives.append(collective_ops.all_reduce(
           t, self._group_size, group_key, instance_key, 'Add', 'Div'))
   return collectives
    def fn(all_args):
      results = []
      # The inputs have no devices set. This is expected to be a trace-time
      # check only.
      self.assertEqual(all_args[0].device, '')
      self.assertEqual(all_args[1].device, '')

      with ops.device('/CPU:0'):
        results.append(
            collective_ops.all_reduce(all_args[0], group_size, group_key,
                                      instance_key, 'Add', 'Div'))
      with ops.device('/CPU:1'):
        results.append(
            collective_ops.all_reduce(all_args[1], group_size, group_key,
                                      instance_key, 'Add', 'Div'))

      return results
 def _testCollectiveReduce(self, t0, t1, expected):
   group_key = 1
   instance_key = 1
   with self.test_session(
       config=config_pb2.ConfigProto(device_count={'CPU': 2})) as sess:
     with ops.device('/CPU:0'):
       in0 = constant_op.constant(t0)
       colred0 = collective_ops.all_reduce(in0, 2, group_key, instance_key,
                                           'Add', 'Div', [0])
     with ops.device('/CPU:1'):
       in1 = constant_op.constant(t1)
       colred1 = collective_ops.all_reduce(in1, 2, group_key, instance_key,
                                           'Add', 'Div', [0])
     run_options = config_pb2.RunOptions()
     run_options.experimental.collective_graph_key = 1
     results = sess.run([colred0, colred1], options=run_options)
   self.assertAllClose(results[0], expected, rtol=1e-5, atol=1e-5)
   self.assertAllClose(results[1], expected, rtol=1e-5, atol=1e-5)
 def collective_all_reduce():
   """Call collective allreduce."""
   assert not context.executing_eagerly()
   out_tensors = []
   for d in range(num_devices):
     with ops.device(devices[d]):
       reduce_op = collective_ops.all_reduce(
           input_tensors[d], group_size, group_key, instance_key, reduction_op,
           unary_op, subdiv_offsets)
       out_tensors.append(reduce_op)
   return out_tensors
 def _testMultipleConcurrentCollectiveReduce(self, t0, t1, expected):
   group_key = 1
   group_size = 2
   num_instances = 2
   all_reduces = []
   config = config_pb2.ConfigProto(device_count={'CPU': group_size})
   config.experimental.collective_deterministic_sequential_execution = True
   with self.session(config=config) as sess:
     for cpu in range(group_size):
       with ops.device('/CPU:%d' % cpu):
         in_tensor = constant_op.constant(t0 if cpu == 0 else t1)
         for instance in range(num_instances):
           all_reduces.append(collective_ops.all_reduce(
               in_tensor, group_size, group_key, instance, 'Add', 'Div'))
     results = sess.run(all_reduces)
   for i in range(group_size * num_instances):
     self.assertAllClose(results[i], expected, rtol=1e-5, atol=1e-5)
def build_collective_reduce(input_tensors,
                            num_workers,
                            collective_keys,
                            reduction_op='Add',
                            unary_op='Id'):
  """Build a subgraph that does one full all-reduce, using the collective Op.

  Args:
    input_tensors: tensors within a single worker graph that are to be reduced
      together; must be one per device.
    num_workers: total number of workers with identical independent graphs that
      will be doing this same reduction.  The reduction will actually include
      the corresponding tensors at all these workers.
    collective_keys: a CollectiveKeys object.
    reduction_op: string naming the reduction op.
    unary_op: string naming the unary final op.

  Returns:
    An array of final tensors, one per device, computed by the full reduction.

  Raises:
    ValueError: There must be at least two tensors over all the workers.
  """
  group_size = len(input_tensors) * num_workers
  if group_size < 2:
    raise ValueError('num_workers * len(input_tensors) must be 2 or greater')
  devices = [t.device for t in input_tensors]
  num_devices = len(devices)
  group_key = collective_keys.get_group_key(devices)
  instance_key = collective_keys.get_instance_key()
  out_tensors = []
  subdiv_offsets = [0]  # TODO(tucker): maybe support non-default subdiv spec
  for d in range(num_devices):
    with ops.device(devices[d]):
      reduce_op = collective_ops.all_reduce(
          input_tensors[d], group_size, group_key, instance_key, reduction_op,
          unary_op, subdiv_offsets)
      out_tensors.append(reduce_op)
  return out_tensors